In [1]:
import os
if not os.environ.get("PREAMBLE_RUN", False): 
    %run -i "../preamble.py" 2

In [2]:
import pandas as pd
import requests
import os

from src.constants import PATH_INTERMED_CHES_51_MIG_W_META, PATH_INTERMED_CHES_52_NAT_MEMBER

# set display options for the notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In order to link the national party memberships of the MEPs in our migration dataset with the corresponding ches party entries, we need to extract all the national parties our MEPs were part of during the legislative periods of 2014-2024 and then map those to their corresponding party ID of the CHES dataset (if possible.) Therefore, in this script we extract the memberships and link them to their corresponding CHES parties.

In [None]:
# first load enriched migration dataset
original_enriched_migration_df = pd.read_parquet(PATH_INTERMED_CHES_51_MIG_W_META, engine='fastparquet')
enriched_migration_df = original_enriched_migration_df.copy(deep=True)
display(enriched_migration_df.head(5))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,ep_identifier,ep_label,ep_citizenship,ep_placeOfBirth,ep_bday,ep_deathDate,ep_gender,ep_hasMembership
0,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282,197567,Karen MELCHIOR,DNK,Gentofte,1980-10-15,,FEMALE,"[{""id"":""membership/197567-f-161757"",""type"":""Me..."
1,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501,197772,Tineke STRIK,NLD,Appeltern,1961-09-28,,FEMALE,"[{""id"":""membership/197772-f-161044"",""type"":""Me..."
2,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925,254718,Anabela RODRIGUES,PRT,Lisboa,1976-10-18,,FEMALE,"[{""id"":""membership/254718-f-169354"",""type"":""Me..."
3,1054,Beata Kempa,Pani Przewodnicząca! Pani Komisarz! Potrzebuje...,2024-04-22,20. Amending Directive 2011/36/EU on preventin...,14,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam Commissioner! We need very specific, ver...",machine_gm,2024,(extreme)_right,ECR,0.256493,197519,Beata KEMPA,POL,Syców,1966-02-11,,FEMALE,"[{""id"":""membership/197519-f-164917"",""type"":""Me..."
4,1056,Dorien Rookmaker,"Voorzitter, dank u wel, meneer Engerer, want u...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,16,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, thank you, Mr Engerer, becaus...",machine_gm,2024,(extreme)_right,ECR,0.475315,204733,Dorien ROOKMAKER,NLD,Zijpe,1964-07-30,,FEMALE,"[{""id"":""membership/204733-f-169018"",""type"":""Me..."


In [4]:
# Now use the MEP's identifier to fetch the relevant metadata
metadata = []
identifiers = list(set(enriched_migration_df['ep_identifier']))
print("Amount of (unique) identifiers in enriched_migration_df:", len(identifiers))
batch_size = 128
idx = 0
while idx < len(identifiers):
    print(f"idx: {idx}")
    batch = identifiers[idx: idx + batch_size]
    response = requests.get(f"https://data.europarl.europa.eu/api/v2/meps/{','.join(batch)}",
                                params={"format": "application/ld+json"})
    
    if response.status_code == 200:
        metadata.extend(response.json()['data'])
    else:
        print(f"An error occurred: {response.status_code}")

    idx += batch_size
print("Finished without error.")
# create dataframe
original_mep_metadata = pd.DataFrame(metadata)

Amount of (unique) identifiers in enriched_migration_df: 889
idx: 0
idx: 128
idx: 256
idx: 384
idx: 512
idx: 640
idx: 768
Finished without error.


In [5]:
mep_metadata = original_mep_metadata.copy(deep=True)
# only keep relevant columns
mep_metadata = mep_metadata[["identifier", "label", "hasMembership"]]
display(mep_metadata.head(3))

Unnamed: 0,identifier,label,hasMembership
0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '..."
1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ..."
2,103746,Michal ŠIMEČKA,"[{'id': 'membership/103746-f-166291', 'type': ..."


Now let's take extract membership and party affiliations (i.e. the "hasMembership" column) of the fetched metadata

In [6]:
def extract_national_political_group(memberships):
    national_parties = [{"id": membership.get('organization', '')[len("org/"):],
                         "startDate": membership['memberDuring'].get('startDate'),
                         "endDate": membership['memberDuring'].get('endDate'),
                         "role": membership.get('role'),
                         "class": membership.get('membershipClassification', '')[len("def/ep-entities/"):]
                         } for membership in memberships if
                         membership.get('membershipClassification', "") == 'def/ep-entities/NATIONAL_POLITICAL_GROUP']
    return national_parties

# include this for sanity checks
def extract_eu_political_group(memberships):
    european_parties = [{"id": membership.get('organization', '')[len("org/"):],
                         "startDate": membership['memberDuring'].get('startDate'),
                         "endDate": membership['memberDuring'].get('endDate'),
                         "role": membership.get('role'),
                         "class": membership.get('membershipClassification', '')[len("def/ep-entities/"):]
                         } for membership in memberships if
                         membership.get('membershipClassification', "") == 'def/ep-entities/EU_POLITICAL_GROUP']
    return european_parties

In [7]:
mep_metadata['national_parties'] = mep_metadata['hasMembership'].apply(extract_national_political_group)
# do the same for eu parties (for sanity checks later on)
#mep_metadata['eu_parties'] = mep_metadata['hasMembership'].apply(extract_eu_political_group)
display(mep_metadata.head(3))

Unnamed: 0,identifier,label,hasMembership,national_parties
0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...","[{'id': '4117', 'startDate': '2015-09-01', 'en..."
1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...","[{'id': '4025', 'startDate': '2014-07-01', 'en..."
2,103746,Michal ŠIMEČKA,"[{'id': 'membership/103746-f-166291', 'type': ...","[{'id': '5587', 'startDate': '2019-07-02', 'en..."


Now fetch the relevant metadata from the EP API on all the national parties, which are occuring in our dataset, as well

In [8]:
# get all occurring party id's
national_party_ids = set(mep_metadata['national_parties'].explode().apply(lambda org: org['id']))

# fetch the data on national parties
national_parties = []
national_party_ids = list(national_party_ids)
print(f"We have {len(national_party_ids)} occurring national parties.")
batch_size = 128
idx = 0
while idx < len(national_party_ids):
    print(f"idx: {idx}")
    batch = national_party_ids[idx: idx + batch_size]
    response = requests.get(f"https://data.europarl.europa.eu/api/v2/corporate-bodies/{','.join(batch)}",
                                params={"format": "application/ld+json"})

    if response.status_code == 200:
        national_parties.extend(response.json()['data'])
    else:
        print(f"An error occurred: {response.status_code}")

    idx += batch_size
print("Finished without error.")
print(f"Loaded data on {len(national_parties)} national parties.")

We have 878 occurring national parties.
idx: 0
idx: 128
idx: 256
idx: 384
idx: 512
idx: 640
idx: 768
Finished without error.
Loaded data on 878 national parties.


In [13]:
original_national_parties_df = pd.DataFrame(national_parties)
national_parties_df = original_national_parties_df.copy(deep=True)
national_parties_df.head(3)

Unnamed: 0,id,type,represents,identifier,source,temporal,label,altLabel,notation_providerTemporalBodyId,notation_codictBodyId,prefLabel,classification,isVersionOf
0,org/1353,Organization,[http://publications.europa.eu/resource/author...,1353,EU_PARLIAMENT,"{'id': 'time-period/20040720-20090713', 'type'...",TS,"{'et': 'TS', 'nl': 'TS', 'ro': 'TS', 'da': 'TS...",1353,1353,"{'el': 'Tėvynės sąjunga', 'mt': 'Tėvynės sąjun...",def/ep-entities/NATIONAL_POLITICAL_GROUP,
1,org/1367,Organization,[http://publications.europa.eu/resource/author...,1367,EU_PARLIAMENT,"{'id': 'time-period/20040720-20090325', 'type'...",IL,"{'ga': 'IL', 'sl': 'IL', 'sk': 'IL', 'cs': 'IL...",1367,1367,{'pt': 'Erakond Isamaaliit (Pro Patria Union)'...,def/ep-entities/NATIONAL_POLITICAL_GROUP,
2,org/1384,Organization,[http://publications.europa.eu/resource/author...,1384,EU_PARLIAMENT,"{'id': 'time-period/20040720-20090713', 'type'...",A,"{'hu': 'A', 'fr': 'A', 'it': 'A', 'lv': 'A', '...",1384,1384,"{'nl': 'Socialdemokratiet', 'lt': 'Socialdemok...",def/ep-entities/NATIONAL_POLITICAL_GROUP,


In [None]:
# Now do the same for the european parties (keep for sanity checks later)
## get all occurring party id's
#european_party_ids = set(enriched_migration_df['eu_parties'].explode().apply(lambda org: org['id']))
#
## fetch the data on european parties
#european_parties = []
#european_party_ids = list(european_party_ids)
#print(f"We have {len(european_party_ids)} occurring european parties.")
#batch_size = 128
#idx = 0
#while idx < len(european_party_ids):
#    print(f"idx: {idx}")
#    batch = european_party_ids[idx: idx + batch_size]
#    response = requests.get(f"https://data.europarl.europa.eu/api/v2/corporate-bodies/{','.join(batch)}",
#                                params={"format": "application/ld+json"})
#
#    if response.status_code == 200:
#        european_parties.extend(response.json()['data'])
#    else:
#        print(f"An error occurred: {response.status_code}")
#
#    idx += batch_size
#print("Finished without error.")
#print(f"Loaded data on {len(european_parties)} european parties.")

In [None]:
#original_european_parties_df = pd.DataFrame(european_parties)
#european_parties_df = original_european_parties_df.copy(deep=True)
#european_parties_df.head(3)

Clean up the dataframe on national parties a bit

In [14]:
# replace URI with actual value for column "represents"
national_parties_df['represents'] = national_parties_df['represents'].apply(lambda rep: rep[0][len('http://publications.europa.eu/resource/authority/country/'):] if isinstance(rep, list) else rep)

national_parties_df['classification'] = national_parties_df['classification'].str.removeprefix("def/ep-entities/")

# only keep relevant columns
national_parties_df = national_parties_df[["represents", "identifier", "temporal", "label", "altLabel", "prefLabel", "classification"]]
# rename the columns to avoid ambiguity later on
national_parties_df = national_parties_df.rename(columns={
    "represents" : "party_represents",
    "identifier" : "party_id",
    "temporal" : "party_temporal",
    "label" : "party_label",
    "altLabel" : "party_altLabel",
    "prefLabel" : "party_prefLabel",
    "classification" : "party_classification",
})

# For the national parties, extract all possible labels and party names as lists each
national_parties_df["party_all_labels"] = national_parties_df["party_altLabel"].apply(lambda d: (list(d.values())))
# add the label from column "party_label" to the list (and eliminate duplicates)
national_parties_df["party_all_labels"] = national_parties_df.apply(
    lambda row: list(set(row["party_all_labels"] + [row["party_label"]])),
    axis=1
)
# also extract all possible (and unique) party names
national_parties_df["party_all_names"] = national_parties_df["party_prefLabel"].apply(lambda d: (list(set(d.values()))))

display(national_parties_df.head())

Unnamed: 0,party_represents,party_id,party_temporal,party_label,party_altLabel,party_prefLabel,party_classification,party_all_labels,party_all_names
0,LTU,1353,"{'id': 'time-period/20040720-20090713', 'type'...",TS,"{'et': 'TS', 'nl': 'TS', 'ro': 'TS', 'da': 'TS...","{'el': 'Tėvynės sąjunga', 'mt': 'Tėvynės sąjun...",NATIONAL_POLITICAL_GROUP,[TS],[Tėvynės sąjunga]
1,EST,1367,"{'id': 'time-period/20040720-20090325', 'type'...",IL,"{'ga': 'IL', 'sl': 'IL', 'sk': 'IL', 'cs': 'IL...",{'pt': 'Erakond Isamaaliit (Pro Patria Union)'...,NATIONAL_POLITICAL_GROUP,[IL],[Erakond Isamaaliit (Pro Patria Union)]
2,DNK,1384,"{'id': 'time-period/20040720-20090713', 'type'...",A,"{'hu': 'A', 'fr': 'A', 'it': 'A', 'lv': 'A', '...","{'nl': 'Socialdemokratiet', 'lt': 'Socialdemok...",NATIONAL_POLITICAL_GROUP,[A],[Socialdemokratiet]
3,SVN,1399,"{'id': 'time-period/20040720-20090713', 'type'...",NSi,"{'da': 'NSi', 'nl': 'NSi', 'ro': 'NSi', 'pl': ...","{'cs': 'Nova Slovenija', 'sk': 'Nova Slovenija...",NATIONAL_POLITICAL_GROUP,[NSi],[Nova Slovenija]
4,POL,1410,"{'id': 'time-period/20040720-20090713', 'type'...",SO,"{'nl': 'SO', 'ro': 'SO', 'da': 'SO', 'sk': 'SO...","{'pt': 'Samoobrona RP', 'bg': 'Samoobrona RP',...",NATIONAL_POLITICAL_GROUP,[SO],[Samoobrona RP]


Now lets merge the meta data of the parties (national and european) with the meta data of the Members of Parliament and create a "Membership" Dataframe

In [15]:
nat_memberships_df = mep_metadata.explode('national_parties').reset_index()
display(nat_memberships_df.sort_values(by=['identifier']).head(10))

Unnamed: 0,index,identifier,label,hasMembership,national_parties
1966,768,101580,Constance LE GRIP,"[{'id': 'membership/101580-f-137779', 'type': ...","{'id': '2733', 'startDate': '2010-02-10', 'end..."
1964,768,101580,Constance LE GRIP,"[{'id': 'membership/101580-f-137779', 'type': ...","{'id': '4907', 'startDate': '2015-05-30', 'end..."
1965,768,101580,Constance LE GRIP,"[{'id': 'membership/101580-f-137779', 'type': ...","{'id': '4041', 'startDate': '2014-07-01', 'end..."
1661,640,102886,Ildikó GÁLL-PELCZ,"[{'id': 'membership/102886-f-149915', 'type': ...","{'id': '2753', 'startDate': '2010-06-02', 'end..."
1660,640,102886,Ildikó GÁLL-PELCZ,"[{'id': 'membership/102886-f-149915', 'type': ...","{'id': '4068', 'startDate': '2014-07-01', 'end..."
0,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...","{'id': '4117', 'startDate': '2015-09-01', 'end..."
2,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...","{'id': '2941', 'startDate': '2011-09-13', 'end..."
1,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...","{'id': '6696', 'startDate': '2024-07-16', 'end..."
1353,512,103381,Terry REINTKE,"[{'id': 'membership/103381-f-156375', 'type': ...","{'id': '6773', 'startDate': '2024-07-16', 'end..."
1354,512,103381,Terry REINTKE,"[{'id': 'membership/103381-f-156375', 'type': ...","{'id': '4010', 'startDate': '2014-07-01', 'end..."


In [16]:
# extract the key-value pairs of the dictionaries in "national_parties" column as new columns
nat_memberships_unpacked_df = nat_memberships_df.join(pd.json_normalize(nat_memberships_df["national_parties"])).drop(columns="national_parties").reset_index()
# rename the columns for clarification
nat_memberships_unpacked_df = nat_memberships_unpacked_df.rename(columns={
    "identifier" : "person_id",
    "label" : "person_label",
    "id" : "party_id",
    "startDate" : "member_startDate",
    "endDate" : "member_endDate",
    "role" : "member_role",
    "class" : "party_class"
})

# clean up role column
nat_memberships_unpacked_df['member_role'] = nat_memberships_unpacked_df['member_role'].str.removeprefix("def/ep-roles/")

display(nat_memberships_unpacked_df.head())

Unnamed: 0,level_0,index,person_id,person_label,hasMembership,party_id,member_startDate,member_endDate,member_role,party_class
0,0,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",4117,2015-09-01,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP
1,1,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",6696,2024-07-16,,MEMBER,NATIONAL_POLITICAL_GROUP
2,2,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",2941,2011-09-13,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP
3,3,1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",4025,2014-07-01,2019-05-20,MEMBER,NATIONAL_POLITICAL_GROUP
4,4,1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",2714,2010-07-19,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP


Apply some sanity checks

In [17]:
# quick sanity check (all memberships should be unique (i.e. the combination of MEP's id, party's id, start and endDate)
duplicates = nat_memberships_unpacked_df.duplicated(subset=["person_id", "party_id", "member_startDate", "member_endDate"], keep=False)
print("Any duplicates in membership (combinations):", duplicates.any()) # should return False for combinations (person_id, party_id, member_startDate, member_endDate)
#display(nat_memberships_unpacked_df.loc[duplicates].head(15))

Any duplicates in membership (combinations): False


In [18]:
# quick sanity smell test if the time periods of membership do add up per member
def check_desc_start_order_after_desc_end(df):
    d = df.copy()
    d["startDate"] = pd.to_datetime(d["member_startDate"], format="%Y-%m-%d", errors="raise")
    d["endDate"]   = pd.to_datetime(d["member_endDate"],   format="%Y-%m-%d", errors="raise")

    # sort by person then endDate descending
    d = d.sort_values(["person_id", "endDate"], ascending=[True, True], kind="mergesort")

    # for each person, check whether startDate is descending in this order
    # startDate descending means startDate >= next_startDate for all adjacent rows
    d["next_start"] = d.groupby("person_id")["startDate"].shift(-1)
    bad = d["next_start"].notna() & (d["startDate"] > d["next_start"])  # violates descending

    # persons that fail the heuristic
    bad_persons = d.loc[bad, "person_id"].unique()

    return bad_persons, d[d["person_id"].isin(bad_persons)].drop(columns=["next_start"])

# usage:
bad_persons, details = check_desc_start_order_after_desc_end(nat_memberships_unpacked_df)
print(bad_persons) # should return []
display(details.head())

[]


Unnamed: 0,level_0,index,person_id,person_label,hasMembership,party_id,member_startDate,member_endDate,member_role,party_class,startDate,endDate


Wonderful, now let's merge with the metadata of the national parties

In [19]:
enriched_memberships = nat_memberships_unpacked_df.merge(
    national_parties_df,
    left_on="party_id",
    right_on="party_id",
    how="left",   # keep all memberships, even if some parties lack metadata
    validate="many_to_one"  # checks for duplicates of "identifier" in national_parties_df
)

# parties table should usually have unique party_id
assert national_parties_df["party_id"].is_unique

# see if any memberships failed to find metadata
missing_meta = enriched_memberships["party_label"].isna().sum()
print("Memberships without party metadata:", missing_meta) # should return 0

display(enriched_memberships.head(20))

Memberships without party metadata: 0


Unnamed: 0,level_0,index,person_id,person_label,hasMembership,party_id,member_startDate,member_endDate,member_role,party_class,party_represents,party_temporal,party_label,party_altLabel,party_prefLabel,party_classification,party_all_labels,party_all_names
0,0,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",4117,2015-09-01,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20140701-20190701', 'type'...",PVV,"{'ga': 'PVV', 'lt': 'PVV', 'nl': 'PVV', 'ro': ...","{'de': 'Partij voor de Vrijheid', 'lt': 'Parti...",NATIONAL_POLITICAL_GROUP,[PVV],[Partij voor de Vrijheid]
1,1,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",6696,2024-07-16,,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20240716', 'type': 'Period...",PVV,"{'sv': 'PVV', 'pt': 'PVV', 'bg': 'PVV', 'el': ...","{'nl': 'Partij voor de Vrijheid', 'ro': 'Parti...",NATIONAL_POLITICAL_GROUP,[PVV],[Partij voor de Vrijheid]
2,2,0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",2941,2011-09-13,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20090714-20140630', 'type'...",PVV,"{'lt': 'PVV', 'nl': 'PVV', 'ro': 'PVV', 'da': ...","{'nl': 'Partij voor de Vrijheid', 'ro': 'Parti...",NATIONAL_POLITICAL_GROUP,[PVV],[Partij voor de Vrijheid]
3,3,1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",4025,2014-07-01,2019-05-20,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20140701-20190701', 'type'...",PSOE,"{'lv': 'PSOE', 'it': 'PSOE', 'fr': 'PSOE', 'hu...","{'hr': 'Partido Socialista Obrero Español', 'e...",NATIONAL_POLITICAL_GROUP,[PSOE],[Partido Socialista Obrero Español]
4,4,1,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",2714,2010-07-19,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20090714-20140630', 'type'...",PSOE,"{'bg': 'PSOE', 'pt': 'PSOE', 'sv': 'PSOE', 'fi...","{'lt': 'Partido Socialista Obrero Español', 'd...",NATIONAL_POLITICAL_GROUP,[PSOE],[Partido Socialista Obrero Español]
5,5,2,103746,Michal ŠIMEČKA,"[{'id': 'membership/103746-f-166291', 'type': ...",5587,2019-07-02,2023-10-24,MEMBER,NATIONAL_POLITICAL_GROUP,SVK,"{'id': 'time-period/20190702-20240715', 'type'...",PS,"{'nl': 'Progresívne Slovensko', 'ro': 'Progres...","{'sl': 'Progresívne Slovensko', 'ga': 'Progres...",NATIONAL_POLITICAL_GROUP,"[Progresívne Slovensko, PS]",[Progresívne Slovensko]
6,6,3,106202,Julia REID,"[{'id': 'membership/106202-f-148838', 'type': ...",4951,2018-12-08,2019-02-11,MEMBER,NATIONAL_POLITICAL_GROUP,GBR,"{'id': 'time-period/20151001-20190701', 'type'...",Independent,"{'it': 'Independent', 'fr': 'Independent', 'hu...","{'ro': 'Independent', 'da': 'Independent', 'pl...",NATIONAL_POLITICAL_GROUP,[Independent],[Independent]
7,7,3,106202,Julia REID,"[{'id': 'membership/106202-f-148838', 'type': ...",5073,2019-02-12,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,GBR,"{'id': 'time-period/20190205-20190701', 'type'...",BREX,"{'nl': 'The Brexit Party', 'ro': 'The Brexit P...","{'es': 'The Brexit Party', 'et': 'The Brexit P...",NATIONAL_POLITICAL_GROUP,"[The Brexit Party, BREX]",[The Brexit Party]
8,8,3,106202,Julia REID,"[{'id': 'membership/106202-f-148838', 'type': ...",4051,2014-07-01,2018-12-07,MEMBER,NATIONAL_POLITICAL_GROUP,GBR,"{'id': 'time-period/20140701-20190701', 'type'...",UKIP,"{'pl': 'UKIP', 'ga': 'UKIP', 'sl': 'UKIP', 'sk...","{'es': 'United Kingdom Independence Party', 'b...",NATIONAL_POLITICAL_GROUP,[UKIP],[United Kingdom Independence Party]
9,9,4,110977,Florian PHILIPPOT,"[{'id': 'membership/110977-f-148001', 'type': ...",5027,2017-10-04,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,FRA,"{'id': 'time-period/20171004-20190701', 'type'...",Les Patriotes,"{'mt': 'Patri', 'lt': 'Patri', 'en': 'Patri', ...","{'nl': 'Les Patriotes', 'ro': 'Les Patriotes',...",NATIONAL_POLITICAL_GROUP,"[Patri, Les Patriotes]",[Les Patriotes]


NOTE: It seems that there is no 1-to-1 mapping between a party's label and a party's id, i.e. a party can have multiple ids (this is also reflected in the original national parties dataframe and due to the differences in the temporal description / column "party_temporal"), but for each separate period store in "party_temporal" we should still have a unique combination of party label and party id nonetheless.

In [20]:
# briefly investigate this on a copy
checker_copy = enriched_memberships.copy(deep=True)
def canonicalize_dict(d):
    if d is None:
        return None
    # tuple of (key, value) sorted by key
    return tuple(sorted(d.items()))

checker_copy["_party_temporal_key"] = checker_copy["party_temporal"].apply(canonicalize_dict)

s = checker_copy.groupby("party_id")["_party_temporal_key"].nunique(dropna=False)
bad_party_ids = s[s != 1]

if bad_party_ids.empty:
    print("Each party_id maps to exactly one party_temporal dict")
else:
    print("Inconsistent party_temporal for these party_id values:")
    print(bad_party_ids)

    conflicts = (
        checker_copy[checker_copy["party_id"].isin(bad_party_ids.index)]
        .loc[:, ["party_id", "party_temporal"]]
        .drop_duplicates()
    )
    display(conflicts)

Each party_id maps to exactly one party_temporal dict


Wonderful, now let's clean this up a bit and store it for later merging with CHES meta data

In [21]:
# remove unneeded columns
enriched_memberships = enriched_memberships.drop(columns=["level_0", "index", "party_classification", "party_altLabel", "party_prefLabel"])
display(enriched_memberships.head(5))

Unnamed: 0,person_id,person_label,hasMembership,party_id,member_startDate,member_endDate,member_role,party_class,party_represents,party_temporal,party_label,party_all_labels,party_all_names
0,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",4117,2015-09-01,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20140701-20190701', 'type'...",PVV,[PVV],[Partij voor de Vrijheid]
1,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",6696,2024-07-16,,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20240716', 'type': 'Period...",PVV,[PVV],[Partij voor de Vrijheid]
2,103246,Auke ZIJLSTRA,"[{'id': 'membership/103246-m-14834', 'type': '...",2941,2011-09-13,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP,NLD,"{'id': 'time-period/20090714-20140630', 'type'...",PVV,[PVV],[Partij voor de Vrijheid]
3,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",4025,2014-07-01,2019-05-20,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20140701-20190701', 'type'...",PSOE,[PSOE],[Partido Socialista Obrero Español]
4,103488,Sergio GUTIÉRREZ PRIETO,"[{'id': 'membership/103488-f-148559', 'type': ...",2714,2010-07-19,2014-06-30,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20090714-20140630', 'type'...",PSOE,[PSOE],[Partido Socialista Obrero Español]


In [None]:
# store as parquet file
# enriched_memberships.to_parquet(PATH_INTERMED_CHES_52_NAT_MEMBER, engine='fastparquet')