In [1]:
import pandas as pd

In [2]:
# Load the raw (unpatched) feedback export and preview its first row
df = pd.read_csv(filepath_or_buffer="feedbacks.csv")
df.head(n=1)

Unnamed: 0,language,id,country,user_type,organization,surname,feedback,status,company_size,first_name,reference_initiative,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level
0,en,2665651,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,,COM(2021)206,2021-08-06 23:57:37,anonymous,24212003,closed,,,


In [3]:
# Reclassification: Other -> academia
# Submission ids that get relabelled as academic/research institutions
academia = [2665640, 2665609, 2665430, 2662611]
academia_mask = df["id"].isin(academia)
df.loc[academia_mask, "user_type"] = "academic_research_institution"

In [4]:
# Reclassification: Other -> NGO
# Submission ids that get relabelled as NGOs
ngo = [
    2665628, 2665582, 2665562, 2665551, 2665524,
    2665502, 2665501, 2665296, 2663361, 2662109,
]  # fmt: skip
ngo_mask = df["id"].isin(ngo)
df.loc[ngo_mask, "user_type"] = "ngo"

In [5]:
# Reclassification: Other -> business association
# Submission ids that get relabelled as business associations
business = [2665599, 2665503, 2662473, 2662182]
business_mask = df["id"].isin(business)
df.loc[business_mask, "user_type"] = "business_association"

In [6]:
# Reclassification: Other -> company
# Single submission id that gets relabelled as a company
company = [2665534]
company_mask = df["id"].isin(company)
df.loc[company_mask, "user_type"] = "company"

In [7]:
# Reclassification: Other -> EU citizen
citizen = [2663295]
citizen_mask = df["id"].isin(citizen)
df.loc[citizen_mask, "user_type"] = "eu_citizen"

# Citizens represent themselves, so any company size recorded for them is
# treated as an entry mistake and blanked out (applies to every eu_citizen
# row, not only the one reclassified above)
is_eu_citizen = df["user_type"].eq("eu_citizen")
df.loc[is_eu_citizen, "company_size"] = None

In [8]:
# Reclassification: Other -> public authority
# Submission ids that get relabelled as public authorities
public = [2663289, 2662219, 2661501]
public_mask = df["id"].isin(public)
df.loc[public_mask, "user_type"] = "public_authority"

In [9]:
# Reclassification: Other -> standardizing body
# Use a dedicated name instead of rebinding `public`, which already holds the
# public-authority ids, so the two id lists stay distinguishable.
standardizing = [2665329, 2665289, 2661384, 2635987]
df.loc[df["id"].isin(standardizing), "user_type"] = "standardizing_body"

In [10]:
# Check that no standardizing bodies have other user types: list every
# submission whose organization name contains one of the fragments below so
# each row's user_type can be inspected by eye. This is a plain substring
# match, so unrelated names that merely contain a fragment are listed too.
# na=False makes rows with a missing organization evaluate to False directly,
# yielding a real boolean mask instead of an object Series patched by fillna.
standards_like = df["organization"].str.contains(
    "Standa|standa|Norm|norm|ETSI|CEN", regex=True, na=False
)
df.loc[standards_like]

Unnamed: 0,language,id,country,user_type,organization,surname,feedback,status,company_size,first_name,reference_initiative,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level
109,en,2665497,BEL,business_association,Small Business Standards (SBS),EDVARDSSON,SBS thanks the European Commission for the opp...,PUBLISHED,micro,Maria,COM(2021)206,2021-08-06 09:46:39,withinfo,24212003,closed,653009713663-08,,
121,en,2665478,GBR,company,Standard Chartered,Webber,Please see attached feedback from Standard Cha...,PUBLISHED,large,Marianne,COM(2021)206,2021-08-05 20:41:56,withinfo,24212003,closed,16595501800-80,,
170,nl,2665329,NLD,standardizing_body,Netherlands Normalisation Institute (NEN),Meuleman,The Medical Device / AI Expert Group (MD-AIG) ...,PUBLISHED,large,Lysette,COM(2021)206,2021-08-04 13:51:28,withinfo,24212003,closed,,,
180,en,2665289,FRA,standardizing_body,ETSI,DOR,ETSI welcomes the draft Regulation on AI prese...,PUBLISHED,medium,margot,COM(2021)206,2021-08-04 09:29:49,withinfo,24212003,closed,474710916419-15,,
268,es,2662182,ESP,business_association,CENTRO ESPAÑOL DE DERECHOS REPROGRÁFICOS EGDPI...,MORAN,\nEstas son las contribuciones del CENTRO ESPA...,PUBLISHED,small,MERCEDES,COM(2021)206,2021-07-13 11:57:51,withinfo,24212003,closed,021880022200-45,,
278,en,2661384,FRA,standardizing_body,ETSI,DOR,ETSI welcomes the draft Regulation on AI prese...,PUBLISHED,medium,margot,COM(2021)206,2021-07-06 18:10:48,withinfo,24212003,closed,474710916419-15,,
291,en,2635987,DEU,standardizing_body,DIN Deutsches Institut für Normung e.V.,GABLER,We welcome the European Commission's proposal ...,PUBLISHED,large,Sibylle,COM(2021)206,2021-06-15 15:50:26,withinfo,24212003,closed,989808524267-58,,


In [11]:
# Fix typo in user type: collapse the misspelled label into the correct one;
# every other value (missing ones included) passes through unchanged
df["user_type"] = df["user_type"].replace(
    {"academic_research_instittution": "academic_research_institution"}
)

In [12]:
# Organization names that occur on more than one submission
dup_orgs = (
    df["organization"]
    .value_counts()
    .loc[lambda counts: counts > 1]
    .index
)

# Author names ("first_name surname") that occur on more than one submission.
# The combined name is stored on df as a "full_name" column; it is missing
# whenever either part is missing.
df["full_name"] = df["first_name"] + " " + df["surname"]
dup_names = (
    df["full_name"]
    .value_counts()
    .loc[lambda counts: counts > 1]
    .index
)

# Show all candidates: organization matches first, then author matches,
# keeping only the first occurrence of each submission id
same_org = df.loc[df["organization"].isin(dup_orgs)].sort_values("organization")
same_author = df.loc[df["full_name"].isin(dup_names)].sort_values("full_name")
pd.concat([same_org, same_author]).drop_duplicates(subset="id")

Unnamed: 0,language,id,country,user_type,organization,surname,feedback,status,company_size,first_name,reference_initiative,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name
105,en,2665502,BEL,ngo,AI4Belgium,Ackerman,"Key feedback points\n1)\tOverall, the proposed...",PUBLISHED,large,Nathanael,COM(2021)206,2021-08-06 09:55:40,withinfo,24212003,closed,,,,Nathanael Ackerman
106,en,2665501,BEL,ngo,AI4Belgium,,"Key feedback points\n1)\tOverall, the proposed...",PUBLISHED,large,,COM(2021)206,2021-08-06 09:54:36,anonymous,24212003,closed,,,,
180,en,2665289,FRA,standardizing_body,ETSI,DOR,ETSI welcomes the draft Regulation on AI prese...,PUBLISHED,medium,margot,COM(2021)206,2021-08-04 09:29:49,withinfo,24212003,closed,474710916419-15,,,margot DOR
278,en,2661384,FRA,standardizing_body,ETSI,DOR,ETSI welcomes the draft Regulation on AI prese...,PUBLISHED,medium,margot,COM(2021)206,2021-07-06 18:10:48,withinfo,24212003,closed,474710916419-15,,,margot DOR
49,en,2665583,DEU,company,Infineon Technologies AG,SEBASTIAN,"Overall, Infineon Technologies AG welcomes the...",PUBLISHED,large,Ina,COM(2021)206,2021-08-06 16:27:53,withinfo,24212003,closed,10751968675-85,,,Ina SEBASTIAN
64,en,2665559,DEU,company,Infineon Technologies AG,,"Overall, Infineon Technologies AG welcomes the...",PUBLISHED,large,,COM(2021)206,2021-08-06 14:42:26,anonymous,24212003,closed,10751968675-85,,,
36,en,2665603,DEU,company,European AI Forum,Dickmann,Please find attached joint response to the Eur...,PUBLISHED,large,Alex,COM(2021)206,2021-08-06 17:52:24,withinfo,24212003,closed,,,,Alex Dickmann
37,en,2665602,DEU,business_association,KI Bundesverband e.V.,Dickmann,Please find attached KI Bundesverband's respon...,PUBLISHED,large,Alex,COM(2021)206,2021-08-06 17:48:40,withinfo,24212003,closed,,,,Alex Dickmann
26,en,2665616,FRA,trade_union,Federation of Craft Businesses in the automoti...,CASTELL,Federation of Craft Businesses in the automoti...,PUBLISHED,small,Brigitte,COM(2021)206,2021-08-06 19:26:07,withinfo,24212003,closed,705440625408-01,,,Brigitte CASTELL
79,en,2665537,FRA,trade_union,Federation of Craft Businesses in the automoti...,CASTELL,Federation of Craft Businesses in the automoti...,PUBLISHED,small,Brigitte,COM(2021)206,2021-08-06 13:19:08,withinfo,24212003,closed,705440625408-01,,,Brigitte CASTELL


In [13]:
# Remove duplicate submissions: drop the hand-picked ids, keep everything else
duplicates = [2665559, 2661384, 2665501, 2665537]
is_duplicate = df["id"].isin(duplicates)
df = df.loc[~is_duplicate]

In [14]:
# Simple statistic: number of submissions per user type, most frequent first
user_type_counts = df["user_type"].value_counts()
user_type_counts

company                          87
business_association             79
ngo                              64
academic_research_institution    24
trade_union                      14
eu_citizen                       13
public_authority                 11
consumer_organisation             5
standardizing_body                3
Name: user_type, dtype: int64

In [15]:
# Limit to actually published feedback (in case some got rejected)
is_published = df["status"] == "PUBLISHED"
df = df.loc[is_published]

In [16]:
# Persist the patched dataset; the DataFrame index is not written out
df.to_csv(path_or_buf="patched_feedbacks.csv", index=False)