In [None]:
import os
# Run the shared preamble script unless the PREAMBLE_RUN environment variable
# is already set (presumably set by preamble.py itself — TODO confirm).
# NOTE(review): environment values are strings, so any non-empty value
# (even "0" or "False") counts as "already run".
if not os.environ.get("PREAMBLE_RUN", False): 
    # `%run -i` executes the script inside this notebook's namespace; the
    # trailing `2` is passed to preamble.py as a command-line argument
    # (its meaning is defined in preamble.py — not visible from here).
    %run -i "../preamble.py" 2

In [None]:
import pandas as pd
import requests
import numpy as np

from src.constants import PATH_INTERMED_CHES_54_MIG_W_CHES_META, PATH_ORIGINAL_CHES_RAW_CSV, PATH_MIGRATION_SPEECHES, PATH_MIGRATION_CHES, PATH_MIGRATION_CHES_FALLBACK

# Notebook display settings: lift pandas' row and column truncation limits
# so that wide/long frames are rendered in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

Finally, now that we have it all together, let's replace/enrich the CHES metadata of every row in our migration dataset with the actual CHES scores / survey data corresponding to the time of the speech. Since, for our migration dataset, we only have CHES datapoints for the years 2014, 2019 and 2024, we will do this in different ways/variants, namely:
- **exact match only**: we only map the rows (i.e. the combinations of speech/speaker/national parties) of 2014, 2019 and 2024 to the corresponding CHES scores of that national party
- **fallback match**: for each speech we match with the latest CHES score available at or before the year of the speech (i.e. if a CHES score for 2014 exists for the given speaker/party, their '2015 speeches' are also mapped to this 2014 score)
- **interpolated match**: for all speeches with a missing CHES score after the exact match, we check if there are CHES scores which "bracket" this speech, e.g. for a '2016 speech' the CHES scores of 2014 and 2019 would bracket it, and we then perform a year-wise linear interpolation to infer the score (this assumes there is a somewhat smooth transition between the CHES scores over the years, reflecting the position change of parties in the ideological space over time)
-> see more in the dedicated script

Load the dataset we want to merge

In [None]:
# Load the three input tables; all of them are read as parquet with the
# fastparquet engine, in the order listed below.
enriched_migration_df, raw_CHES_df, plain_migration_df = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in (
        # 1) the migration dataset (pls) enriched with meta information about
        #    the speaker (national party affiliation and the corresponding
        #    CHES party id)
        PATH_INTERMED_CHES_54_MIG_W_CHES_META,
        # 2) the raw CHES dataset whose scorings / survey items replace the
        #    CHES meta information
        #    NOTE(review): the constant name ends in "_CSV" but the file is
        #    read as parquet — confirm the path points to a parquet file.
        PATH_ORIGINAL_CHES_RAW_CSV,
        # 3) the vanilla/plain migration (pls) dataset, used to determine
        #    which of the original columns to keep in the output
        PATH_MIGRATION_SPEECHES,
    )
)

In [4]:
# Show a labelled three-row preview of every loaded table.
for _preview_label, _preview_frame in (
    ("enriched migration data:", enriched_migration_df),
    ("raw CHES data:", raw_CHES_df),
    ("vanilla/plain migration data:", plain_migration_df),
):
    print(_preview_label)
    display(_preview_frame.head(3))

enriched migration data:


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,ep_identifier,ep_label,ep_citizenship,ep_placeOfBirth,ep_bday,ep_deathDate,ep_gender,ep_hasMembership,person_id_memb,person_label_memb,hasMembership_memb,party_id_memb,member_startDate_memb,member_endDate_memb,member_role_memb,party_class_memb,country_code_memb,party_temporal_memb,party_label_memb,party_all_labels_memb,party_all_names_memb,party_id_ches,country_code_ches,party_abbrev_ches,party_name_ches,party_name_en_ches
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,Amjad BASHIR,GBR,Jhelum (PK),1952-09-17,,MALE,"[{'id': 'membership/124956-f-143535', 'type': ...",124956,Amjad BASHIR,"[{'id': 'membership/124956-f-143535', 'type': ...",4051,2014-07-01,2015-01-28,MEMBER,NATIONAL_POLITICAL_GROUP,GBR,"{'id': 'time-period/20140701-20190701', 'type'...",UKIP,[UKIP],[United Kingdom Independence Party],1108,GBR,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party]
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,Miguel ARIAS CAÑETE,ESP,Madrid,1950-02-24,,MALE,"[{'id': 'membership/1351-f-105362', 'type': 'M...",1351,Miguel ARIAS CAÑETE,"[{'id': 'membership/1351-f-105362', 'type': 'M...",4024,2014-07-01,2014-10-31,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20140701-20190701', 'type'...",PP,[PP],[Partido Popular],502,ESP,[PP],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]"
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,Monika SMOLKOVÁ,SVK,Janovík,1956-10-06,,FEMALE,"[{'id': 'membership/96655-m-15750', 'type': 'M...",96655,Monika SMOLKOVÁ,"[{'id': 'membership/96655-m-15750', 'type': 'M...",4159,2014-07-01,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,SVK,"{'id': 'time-period/20140701-20190701', 'type'...",SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],2803,SVK,"[Smer, Smer-SD]","[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]"


raw CHES data:


Unnamed: 0,year,country,eastwest,eumember,party_id,party,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion
0,1999,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.111111,2.625,,,,3.875,,,,6.666667,4.722222,1.388889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.333333,6.666667,,6.555555,,,6.625,5.888889,5.666667,6.555555,,,,,,,,,,,,,,2025.1
1,2002,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.35,2.5,,,,4.0,,,,6.09,4.533333,2.111111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,,4.1,6.55,,4.88,5.85,5.0,6.3,6.33,,,,,,,,,,,,,,2025.1
2,2006,1,1,1,102,PS,21322.0,13.0,16.700001,2003,13.5,5,1.0,3.5,3.17,,,,2.83,,,,5.71,4.3,3.43,,2.67,6.5,2.33,6.83,2.0,8.0,,3.67,6.5,2.33,5.67,,,,1.33,5.67,2.5,6.33,,4.2,,,3.17,4.33,3.17,6.33,4.4,3.0,,,,,,6.17,7.0,,,2.67,3.5,,6.5,,4.0,,,,6.67,,,5.83,6.0,,,,,,,,,,,,,2025.1


vanilla/plain migration data:


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob
1660703242768,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282
1661057279072,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501
1661377510144,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925


Prepare the datasets for merging

In [5]:
# Keep the loaded frames untouched: all further processing happens on
# independent copies (DataFrame.copy() is a deep copy by default).
minim_enriched_migration_df = enriched_migration_df.copy()
harmonized_CHES_df = raw_CHES_df.copy()

In [None]:
# Reduce the enriched migration data to the columns of the original migration
# dataset plus the identifier/label columns needed for the CHES merge.
output_cols_migration = plain_migration_df.columns.tolist()
output_cols_merger = [
    "ep_identifier", "party_id_memb", "party_id_ches",
    "party_label_memb", "party_all_labels_memb", "party_all_names_memb",
    "party_name_ches", "party_name_en_ches",
]
output_cols_minim_all = output_cols_migration + output_cols_merger

# Unify column names for the merge process (EP-sourced columns get an "_ep"
# suffix; the "*_ches" columns keep their names).
# The selection starts from `enriched_migration_df` rather than from
# `minim_enriched_migration_df` itself, so this cell can be re-run safely:
# selecting from the already-renamed frame would raise a KeyError because
# e.g. "ep_identifier" no longer exists there. Selecting with a list of
# columns returns a new frame, so the loaded data is not modified.
minim_enriched_migration_df = enriched_migration_df[output_cols_minim_all].rename(
    columns={
        "ep_identifier": "person_id_ep",
        "party_id_memb": "party_id_ep",
        "party_label_memb": "party_label_ep",
        "party_all_labels_memb": "party_all_labels_ep",
        "party_all_names_memb": "party_all_names_ep",
    }
)
display(minim_enriched_migration_df.head(3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party]
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]"
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]"


In [7]:
# Harmonize the merge keys of the CHES and the migration dataset: both sides
# must carry the party id and the year as whitespace-stripped pandas strings.
# `DataFrame.info()` prints its summary itself and returns None, so it is
# called directly (wrapping it in display() only renders a stray "None").

# dtypes before the conversion
harmonized_CHES_df.info()

# convert "year" and "party_id" col dtype to (pandas) string type
harmonized_CHES_df["party_id"] = harmonized_CHES_df["party_id"].astype("string").str.strip()
harmonized_CHES_df["year"] = harmonized_CHES_df["year"].astype("string").str.strip()
harmonized_CHES_df.info()

# do the same for the migration dataset ("date" is converted as well)
minim_enriched_migration_df["party_id_ches"] = minim_enriched_migration_df["party_id_ches"].astype("string").str.strip()
minim_enriched_migration_df["year"] = minim_enriched_migration_df["year"].astype("string").str.strip()
minim_enriched_migration_df["date"] = minim_enriched_migration_df["date"].astype("string").str.strip()
minim_enriched_migration_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1441 entries, 0 to 1440
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     1441 non-null   int64  
 1   country                  1441 non-null   int64  
 2   eastwest                 1441 non-null   int64  
 3   eumember                 1441 non-null   int64  
 4   party_id                 1441 non-null   int64  
 5   party                    1437 non-null   object 
 6   cmp_id                   1134 non-null   float64
 7   vote                     1377 non-null   float64
 8   seat                     1372 non-null   float64
 9   electionyear             1441 non-null   int64  
 10  epvote                   1221 non-null   float64
 11  family                   1441 non-null   int64  
 12  govt                     1196 non-null   float64
 13  lrgen                    1441 non-null   float64
 14  lrecon                   1441 non-n

None

<class 'pandas.DataFrame'>
RangeIndex: 1441 entries, 0 to 1440
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     1441 non-null   string 
 1   country                  1441 non-null   int64  
 2   eastwest                 1441 non-null   int64  
 3   eumember                 1441 non-null   int64  
 4   party_id                 1441 non-null   string 
 5   party                    1437 non-null   object 
 6   cmp_id                   1134 non-null   float64
 7   vote                     1377 non-null   float64
 8   seat                     1372 non-null   float64
 9   electionyear             1441 non-null   int64  
 10  epvote                   1221 non-null   float64
 11  family                   1441 non-null   int64  
 12  govt                     1196 non-null   float64
 13  lrgen                    1441 non-null   float64
 14  lrecon                   1441 non-n

None

<class 'pandas.DataFrame'>
RangeIndex: 9705 entries, 0 to 9704
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           9705 non-null   int64  
 1   speaker              9705 non-null   object 
 2   text                 9705 non-null   object 
 3   date                 9705 non-null   string 
 4   agenda               9705 non-null   object 
 5   speechnumber         9705 non-null   int64  
 6   procedure_ID         9705 non-null   object 
 7   partyfacts_ID        8529 non-null   float64
 8   period               9705 non-null   int64  
 9   chair                9705 non-null   bool   
 10  MEP                  9705 non-null   bool   
 11  commission           9705 non-null   bool   
 12  written              9705 non-null   bool   
 13  multispeaker         9705 non-null   bool   
 14  link                 9705 non-null   object 
 15  translatedText       9705 non-null   object 
 16 

None

Let's first make a plain/exact merge (i.e. only match rows where the party IDs overlap and the date of the speech lies in the year of the corresponding CHES score/survey for that party)

In [8]:
# Derive the speech year from the speech date, as a string so that it matches
# the string-typed "year" key of the CHES frame.
# The year goes through the nullable "Int64" dtype first: with
# errors="coerce" an unparseable date becomes NaT, which turns `.dt.year`
# into a float column; converting that directly to string would yield keys
# like "2014.0" that silently never match the CHES years. Via "Int64" valid
# years stay "2014" and unparseable dates become <NA>.
minim_enriched_migration_df["year"] = (
    pd.to_datetime(minim_enriched_migration_df["date"], errors="coerce")
    .dt.year
    .astype("Int64")
    .astype("string")
)

In [9]:
# Exact-match enrichment: left join that keeps every speech row and attaches
# the CHES row of the same party id and the same year (if there is one).
migration_with_CHES_plain = pd.merge(
    minim_enriched_migration_df,
    harmonized_CHES_df,
    how="left",
    left_on=["party_id_ches", "year"],
    right_on=["party_id", "year"],
    suffixes=("_mig", "_ches"),
    # many speeches may map to one CHES row, but never the other way round;
    # pandas raises if the CHES side has duplicate (party_id, year) keys
    validate="m:1",
)
display(migration_with_CHES_plain.head(3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,country,eastwest,eumember,party_id,party_ches,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],11.0,1.0,1.0,1108,UKIP,51951.0,3.1,0.0,2010.0,27.5,1.0,0.0,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,3.0,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,9.0,8.0,1.0,2025.1
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",5.0,1.0,1.0,502,PP,33610.0,41.9,53.099998,2011.0,26.1,2.0,1.0,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,1.0,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,13.0,5.0,12.0,2025.1
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",28.0,0.0,1.0,2803,Smer-SD,96423.0,44.4,55.299999,2012.0,24.1,5.0,1.0,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,1.0,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,14.0,17.0,13.0,2025.1


Now perform some sanity checks

In [10]:
# Match rate = share of speech rows that received a CHES row in the left
# join. It is probed on two CHES-only columns, which should agree.
for _ches_probe_col in ("chesversion", "eumember"):
    match_rate = migration_with_CHES_plain[_ches_probe_col].notna().mean() * 100
    print(f"Match rate: {match_rate:.1f}%")
# absolute number of matched rows
print(migration_with_CHES_plain["eumember"].notna().sum())

Match rate: 6.8%
Match rate: 6.8%
661


In [11]:
# The match rate is surprisingly low — cross-check it against the raw overlap
# between the migration data and the CHES survey years.
ches_years = harmonized_CHES_df["year"].unique()
print(ches_years)
ches_migration_pls_overlap_rate = 100 * minim_enriched_migration_df["year"].isin(ches_years).mean()
print(f"(Ches Year) Overlap rate: {ches_migration_pls_overlap_rate:.1f}%")

# Restrict to speeches whose party id also occurs somewhere in the CHES data.
ches_ids = harmonized_CHES_df["party_id"].unique()
only_ches_ids_overlap = minim_enriched_migration_df[
    minim_enriched_migration_df["year"].isin(ches_years)
    & minim_enriched_migration_df["party_id_ches"].isin(ches_ids)
]
print(len(only_ches_ids_overlap))
# Result: the numbers seem about right. This count is an upper bound for the
# exact merge, because year and party id are tested independently here and
# not as a (party id, year) pair that must exist in the CHES data.

<StringArray>
['1999', '2002', '2006', '2010', '2014', '2019', '2024']
Length: 7, dtype: string
(Ches Year) Overlap rate: 7.7%
689


In [12]:
# Keep the exact-match result under its final name and with tidied column
# names. `rename` ignores labels that are not present, so the "year_mig"
# entry has no effect when the merged frame carries a single "year" column.
migration_with_CHES_exact = (
    migration_with_CHES_plain
    .copy(deep=True)
    .rename(
        columns={
            "party_ches": "party_label_ches",
            "year_mig": "year",
            "party_mig": "party",
        }
    )
)
display(migration_with_CHES_exact.head(3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,country,eastwest,eumember,party_id,party_label_ches,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],11.0,1.0,1.0,1108,UKIP,51951.0,3.1,0.0,2010.0,27.5,1.0,0.0,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,3.0,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,9.0,8.0,1.0,2025.1
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",5.0,1.0,1.0,502,PP,33610.0,41.9,53.099998,2011.0,26.1,2.0,1.0,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,1.0,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,13.0,5.0,12.0,2025.1
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",28.0,0.0,1.0,2803,Smer-SD,96423.0,44.4,55.299999,2012.0,24.1,5.0,1.0,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,1.0,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,14.0,17.0,13.0,2025.1


In [None]:
# migration_with_CHES_exact.to_parquet(PATH_MIGRATION_CHES, engine='fastparquet')

**Intermed Result_01**

Apparently, only around 7% of our migration speeches fall exactly in a year for which a corresponding CHES score exists. This is less than expected.

**Consequence**

Let's try a fallback merge instead and see how that changes things (i.e. if no exactly matching CHES score exists for a given speech date, take the nearest one in the past)

In [14]:
# Goal of the following cells: give every speech/row of the migration dataset
# the nearest past CHES survey row of the speaker's national party.

# Sanity check first: each ("party_id", "year") combination must occur only
# once in the CHES data, otherwise the merge could hit ties.
_tie_key_cols = ["party_id", "year"]
_tie_mask = harmonized_CHES_df.duplicated(subset=_tie_key_cols, keep=False)
if _tie_mask.any():
    ties = harmonized_CHES_df[_tie_mask].sort_values(_tie_key_cols)
    raise ValueError(
        "Right dataframe has ties (multiple rows) for the same (party_id, year). "
        "Resolve before asof-merge.\n"
        f"Example ties:\n{ties.head(20)}"
    )

In [15]:
# Build the two inputs for the as-of merge as fresh frames.
# left:  migration rows with a positional "_row_id" (used afterwards to
#        restore the original row order) and "year" as nullable integer
left = (
    minim_enriched_migration_df
    .reset_index(drop=True)
    .assign(
        _row_id=lambda frame: frame.index,
        year=lambda frame: frame["year"].astype("Int64"),
    )
)

# right: CHES rows with "year" as nullable integer plus a duplicate "_year"
#        helper column that serves as the merge key
right = harmonized_CHES_df.assign(
    year=lambda frame: frame["year"].astype("Int64"),
    _year=lambda frame: frame["year"].astype("Int64"),
)

In [16]:
# The as-of merge keys must be complete: stop here if any year is missing.
assert not left["year"].isna().any()
assert not right["_year"].isna().any()

In [17]:
# Fallback merge: for every speech take the CHES row of the same party whose
# survey year is the closest one that is not after the speech year.
# merge_asof requires both inputs to be sorted by their "on" key, hence the
# sort by year first.
left_sorted  = left.sort_values(["year", "party_id_ches"])
right_sorted = right.sort_values(["year", "party_id"])

# backward does what we want: match the nearest key on the right that is <= the left key
migration_with_CHES_fallback = pd.merge_asof(
    left_sorted,
    right_sorted,
    left_on="year",
    right_on="_year",
    left_by="party_id_ches",   # only match rows of the same party ...
    right_by="party_id",       # ... identified by the CHES party id
    direction="backward",      # <= year of the speech (same year or earlier)
    allow_exact_matches=True,  # a survey year equal to the speech year is allowed; False if we want strictly before
    suffixes=("_mig", "_ches") # both sides carry a "year" column -> "year_mig" / "year_ches"
).drop(columns=["_year"]) # drop helper column again

# restore the original row order (the index itself is not reset to the
# original labels, only the ordering is restored)
migration_with_CHES_fallback = migration_with_CHES_fallback.sort_values("_row_id").drop(columns="_row_id")

In [18]:
# Flag rows whose CHES data stems from an earlier survey year than the speech
# year (True = fallback, False = exact year match). The nullable "boolean"
# dtype is used so that the flag can also hold missing values.
migration_with_CHES_fallback["fallback_used"] = (
    migration_with_CHES_fallback["year_mig"]
    .gt(migration_with_CHES_fallback["year_ches"])
    .astype("boolean")
)

In [19]:
# Preview the first five rows of the fallback-merged result.
display(migration_with_CHES_fallback.head(n=5))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year_mig,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,year_ches,country,eastwest,eumember,party_id,party_ches,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion,fallback_used
18,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],2014,11.0,1.0,1.0,1108,UKIP,51951.0,3.1,0.0,2010.0,27.5,1.0,0.0,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,3.0,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,9.0,8.0,1.0,2025.1,False
150,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",2014,5.0,1.0,1.0,502,PP,33610.0,41.9,53.099998,2011.0,26.1,2.0,1.0,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,1.0,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,13.0,5.0,12.0,2025.1,False
84,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",2014,28.0,0.0,1.0,2803,Smer-SD,96423.0,44.4,55.299999,2012.0,24.1,5.0,1.0,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,1.0,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,14.0,17.0,13.0,2025.1,False
57,291861,Kristina Winberg,Fru talman! Schengenavtalet ingicks i en anda ...,2014-07-14,13. One-minute speeches on matters of politica...,24,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,Fru talman! The Schengen Agreement was conclud...,machine_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.700463,124995,4253,1610,Sverigedemokraterna,[Sverigedemokraterna],[Sverigedemokraterna],[Sverigedemokraterna],[Sweden Democrats],2014,16.0,1.0,1.0,1610,SD,11710.0,12.9,14.0,2014.0,9.7,1.0,0.0,7.761905,5.4,3.476191,,,9.238095,8.380953,,,1.272727,6.136363,0.95,,4.705883,,4.529412,,4.588235,,4.470588,9.277778,,8.157895,,,,,5.625,,9.777778,,,9.842105,,,9.777778,,9.8125,,7.3125,,7.5,,,,,5.25,,6.352941,,,,3.0,1.833333,,1.277778,,1.0625,,1.785714,,,1.235294,1.333333,,,,,8.894737,2.944444,,,,9.0,12.0,11.0,2025.1,False
130,291519,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,"Αξιότιμε κύριε Juncker, στις προτεραιότητές σα...",2014-07-15,5. Statement by the candidate for President of...,25,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,"Dear Mr Juncker, in your priorities you refer ...",machine_pl,2014,christian_conservative,PPE,0.604105,125065,4055,402,N.D.,"[Ν.Δ., N.D.]","[Νέα Δημοκρατία, Nea Demokratia]",[Néa Dimokratía],[New Democracy],2014,4.0,1.0,1.0,402,ND,34511.0,29.7,43.0,2012.0,22.7,2.0,1.0,7.222222,7.111111,9.444445,,,7.0,6.333334,,,6.555555,7.444445,2.555556,,5.5,,6.333333,,6.555555,,6.0,7.111111,,8.111111,,,,,8.444445,,8.0,,,8.222222,,,7.444445,,8.222222,,6.0,,7.666666,,,,,5.714286,,4.428571,,,,1.0,6.428571,,6.555555,,5.555555,,6.777778,,,6.111111,3.714286,,,,,2.333333,4.555555,,,,13.0,8.0,14.0,2025.1,False


In [20]:
# Match rate: percentage of speech rows (left side of the merge) that received a
# CHES record (right side). A row counts as matched when a CHES-only column is filled.
match_rate_fallback = 100 * migration_with_CHES_fallback["chesversion"].notnull().mean()
print(f"Match rate: {match_rate_fallback:.1f}%")
# Cross-check on a second CHES-only column; the unrounded figure should agree with the rate above.
print(100 * migration_with_CHES_fallback["eumember"].notnull().mean())

Match rate: 89.4%
89.3560020607934


In [21]:
# Compute the fallback rate, i.e. how often a speech was mapped to a CHES wave
# via the fallback rule instead of an exact year match.
#
# Caveat on the denominator: `Series.mean()` skips missing values by default. In the
# recorded run the result (92.38%) is larger than the overall match rate (89.4%),
# which is only possible if `fallback_used` is missing for unmatched rows -- so
# `fallback_rate` is the share among *matched* rows, not among all speeches.
fallback_flags = migration_with_CHES_fallback["fallback_used"]
fallback_rate = fallback_flags.mean() * 100
print(f"Fallback rate among matched rows: {fallback_rate:.1f}%")

# Same numerator, but relative to every speech row, so the figure is directly
# comparable with the match rate. `sum()` also skips missing values and therefore
# counts only the rows flagged True.
n_rows_total = len(migration_with_CHES_fallback)
fallback_rate_all_rows = fallback_flags.sum() / n_rows_total * 100 if n_rows_total else float("nan")
print(f"Fallback rate among all rows: {fallback_rate_all_rows:.1f}%")

92.37776752767527


In [22]:
# Quick sanity checks on the fallback merge. Each expectation is printed (as before)
# AND asserted, so a violation stops a "Run All" instead of relying on someone
# reading the output.

# 1) The fallback may only reach back in time: the speech year (left) must always be
#    greater than or equal to the matched CHES year (right).
n_bad = (migration_with_CHES_fallback["year_mig"] < migration_with_CHES_fallback["year_ches"]).sum()
print(n_bad) # should be 0
assert n_bad == 0, f"{n_bad} rows were matched to a CHES year later than the speech year"


# Work on a copy so the helper column added further down does not end up in the merge result.
merged_smell_copy = migration_with_CHES_fallback.copy(deep=True)
# 2) Row count should match left_df, i.e. the merge must neither drop nor duplicate speeches.
#    NOTE(review): `left` is not defined in this cell -- presumably the left frame of the
#    merge from an earlier cell; confirm it is still in scope on a fresh kernel.
assert len(merged_smell_copy) == len(left), "Unexpected row count change"

# 3) How many left rows found no match? (informational only, no expectation enforced)
no_match_rate = merged_smell_copy["year_ches"].isna().mean()
no_match_count = merged_smell_copy["year_ches"].isna().sum()

print(no_match_rate)
print(no_match_count)

# 4) Repeat check 1 against the year of the actual speech date instead of year_mig:
#    each CHES year must be less than or equal to the speech date's year.
#    Values that cannot be parsed become missing (errors="coerce") and are left out of the check.
date_year = pd.to_datetime(merged_smell_copy["date"], errors="coerce").dt.year
year_val  = pd.to_numeric(merged_smell_copy["year_ches"], errors="coerce")

# Keep the comparison result only where both sides are present; everything else is set to NA.
merged_smell_copy["date_after_year"] = ((date_year >= year_val).where(date_year.notna() & year_val.notna(), pd.NA))
all_ok = merged_smell_copy["date_after_year"].dropna().all()
print(all_ok) # should be true
assert all_ok, "Some speeches are dated before the year of the CHES wave they were matched to"

0
0.10643997939206594
1033
True


Well, this is much better now. Let's briefly check the output and store it for later.

In [23]:
# Eyeball the first five merged rows (head() defaults to n=5) before finalising the output.
display(migration_with_CHES_fallback.head())

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year_mig,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,year_ches,country,eastwest,eumember,party_id,party_ches,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion,fallback_used
18,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],2014,11.0,1.0,1.0,1108,UKIP,51951.0,3.1,0.0,2010.0,27.5,1.0,0.0,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,3.0,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,9.0,8.0,1.0,2025.1,False
150,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",2014,5.0,1.0,1.0,502,PP,33610.0,41.9,53.099998,2011.0,26.1,2.0,1.0,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,1.0,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,13.0,5.0,12.0,2025.1,False
84,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",2014,28.0,0.0,1.0,2803,Smer-SD,96423.0,44.4,55.299999,2012.0,24.1,5.0,1.0,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,1.0,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,14.0,17.0,13.0,2025.1,False
57,291861,Kristina Winberg,Fru talman! Schengenavtalet ingicks i en anda ...,2014-07-14,13. One-minute speeches on matters of politica...,24,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,Fru talman! The Schengen Agreement was conclud...,machine_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.700463,124995,4253,1610,Sverigedemokraterna,[Sverigedemokraterna],[Sverigedemokraterna],[Sverigedemokraterna],[Sweden Democrats],2014,16.0,1.0,1.0,1610,SD,11710.0,12.9,14.0,2014.0,9.7,1.0,0.0,7.761905,5.4,3.476191,,,9.238095,8.380953,,,1.272727,6.136363,0.95,,4.705883,,4.529412,,4.588235,,4.470588,9.277778,,8.157895,,,,,5.625,,9.777778,,,9.842105,,,9.777778,,9.8125,,7.3125,,7.5,,,,,5.25,,6.352941,,,,3.0,1.833333,,1.277778,,1.0625,,1.785714,,,1.235294,1.333333,,,,,8.894737,2.944444,,,,9.0,12.0,11.0,2025.1,False
130,291519,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,"Αξιότιμε κύριε Juncker, στις προτεραιότητές σα...",2014-07-15,5. Statement by the candidate for President of...,25,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,"Dear Mr Juncker, in your priorities you refer ...",machine_pl,2014,christian_conservative,PPE,0.604105,125065,4055,402,N.D.,"[Ν.Δ., N.D.]","[Νέα Δημοκρατία, Nea Demokratia]",[Néa Dimokratía],[New Democracy],2014,4.0,1.0,1.0,402,ND,34511.0,29.7,43.0,2012.0,22.7,2.0,1.0,7.222222,7.111111,9.444445,,,7.0,6.333334,,,6.555555,7.444445,2.555556,,5.5,,6.333333,,6.555555,,6.0,7.111111,,8.111111,,,,,8.444445,,8.0,,,8.222222,,,7.444445,,8.222222,,6.0,,7.666666,,,,,5.714286,,4.428571,,,,1.0,6.428571,,6.555555,,5.555555,,6.777778,,,6.111111,3.714286,,,,,2.333333,4.555555,,,,13.0,8.0,14.0,2025.1,False


In [24]:
# Inspect the year columns on a fresh copy before finalising the output.
# Rows without a CHES match have a missing year_ches, which a plain numpy integer
# column cannot represent. In the recorded run both year columns are already of
# pandas' nullable Int64 dtype, so no explicit astype("Int64") cast is needed.
final_fallback_output = migration_with_CHES_fallback.copy()
print(
    final_fallback_output["year_ches"].isna().sum(),  # number of rows without a CHES match
    final_fallback_output["year_ches"].dtype,
    final_fallback_output["year_mig"].dtype,
    sep="\n",
)

1033
Int64
Int64


In [25]:
# Give the merge-suffixed columns their final names: the CHES party label becomes
# "party_label_ches", and the speech-side year/party columns lose their "_mig" suffix.
final_fallback_output = final_fallback_output.rename(
    columns=dict(party_ches="party_label_ches", year_mig="year", party_mig="party")
)
display(final_fallback_output.head(n=3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,year_ches,country,eastwest,eumember,party_id,party_label_ches,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion,fallback_used
18,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],2014,11.0,1.0,1.0,1108,UKIP,51951.0,3.1,0.0,2010.0,27.5,1.0,0.0,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,3.0,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,9.0,8.0,1.0,2025.1,False
150,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",2014,5.0,1.0,1.0,502,PP,33610.0,41.9,53.099998,2011.0,26.1,2.0,1.0,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,1.0,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,13.0,5.0,12.0,2025.1,False
84,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",2014,28.0,0.0,1.0,2803,Smer-SD,96423.0,44.4,55.299999,2012.0,24.1,5.0,1.0,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,1.0,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,14.0,17.0,13.0,2025.1,False


Wonderful, let's store this for our analysis.

In [None]:
# NOTE(review): the write below is commented out, so running this notebook does not
# (re)create the file at PATH_MIGRATION_CHES_FALLBACK -- presumably disabled to avoid
# overwriting an existing output; TODO confirm, and re-enable when the file should be regenerated.
# final_fallback_output.to_parquet(PATH_MIGRATION_CHES_FALLBACK, engine='fastparquet')