In [None]:
import os
if not os.environ.get("PREAMBLE_RUN", False): 
    %run -i "../preamble.py" 2

In [None]:
import pandas as pd
import requests
import numpy as np

from src.constants import PATH_INTERMED_CHES_54_MIG_W_CHES_META, PATH_ORIGINAL_CHES_RAW_CSV, PATH_MIGRATION_SPEECHES, PATH_MIGRATION_CHES_INTERPOLATED

# set display options for the notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Replace CHES meta with 'raw'/actual CHES scores and linearly interpolate scores for speeches not given in 2014/'19/'24:

Let's try to linearly interpolate the CHES scores for speech dates with missing (exact) year match. This was also done in the literature, e.g. by [Adams et al., 2014](https://onlinelibrary.wiley.com/doi/epdf/10.1111/ajps.12115?saml_referrer)

Load the dataset we want to merge first

In [None]:
# load the data
# 1) the migration dataset (pls) enriched with meta information about the speaker (i.e. national party affiliation and the corresponding [CHES] party id)
enriched_migration_df = pd.read_parquet(PATH_INTERMED_CHES_54_MIG_W_CHES_META, engine='fastparquet')
# (2) the raw CHES dataset to enrich the migration set with (i.e. replace the CHES meta information with the actual scorings / survey items)
raw_CHES_df = pd.read_parquet(PATH_ORIGINAL_CHES_RAW_CSV, engine='fastparquet')
# 3) the vanilla/plain migration (pls) dataset to get part of the desired output columns (i.e. drop the rest of cols)
plain_migration_df = pd.read_parquet(PATH_MIGRATION_SPEECHES, engine='fastparquet')

In [3]:
print("enriched migration data:")
print(enriched_migration_df.shape[0])
display(enriched_migration_df.head(3))
print("raw CHES data:")
print(raw_CHES_df.shape[0])
display(raw_CHES_df.head(3))
print("vanilla/plain migration data:")
print(plain_migration_df.shape[0])
display(plain_migration_df.head(3))

enriched migration data:
9705


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,ep_identifier,ep_label,ep_citizenship,ep_placeOfBirth,ep_bday,ep_deathDate,ep_gender,ep_hasMembership,person_id_memb,person_label_memb,hasMembership_memb,party_id_memb,member_startDate_memb,member_endDate_memb,member_role_memb,party_class_memb,country_code_memb,party_temporal_memb,party_label_memb,party_all_labels_memb,party_all_names_memb,party_id_ches,country_code_ches,party_abbrev_ches,party_name_ches,party_name_en_ches
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,Amjad BASHIR,GBR,Jhelum (PK),1952-09-17,,MALE,"[{'id': 'membership/124956-f-143535', 'type': ...",124956,Amjad BASHIR,"[{'id': 'membership/124956-f-143535', 'type': ...",4051,2014-07-01,2015-01-28,MEMBER,NATIONAL_POLITICAL_GROUP,GBR,"{'id': 'time-period/20140701-20190701', 'type'...",UKIP,[UKIP],[United Kingdom Independence Party],1108,GBR,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party]
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,Miguel ARIAS CAÑETE,ESP,Madrid,1950-02-24,,MALE,"[{'id': 'membership/1351-f-105362', 'type': 'M...",1351,Miguel ARIAS CAÑETE,"[{'id': 'membership/1351-f-105362', 'type': 'M...",4024,2014-07-01,2014-10-31,MEMBER,NATIONAL_POLITICAL_GROUP,ESP,"{'id': 'time-period/20140701-20190701', 'type'...",PP,[PP],[Partido Popular],502,ESP,[PP],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]"
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,Monika SMOLKOVÁ,SVK,Janovík,1956-10-06,,FEMALE,"[{'id': 'membership/96655-m-15750', 'type': 'M...",96655,Monika SMOLKOVÁ,"[{'id': 'membership/96655-m-15750', 'type': 'M...",4159,2014-07-01,2019-07-01,MEMBER,NATIONAL_POLITICAL_GROUP,SVK,"{'id': 'time-period/20140701-20190701', 'type'...",SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],2803,SVK,"[Smer, Smer-SD]","[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]"


raw CHES data:
1441


Unnamed: 0,year,country,eastwest,eumember,party_id,party,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion
0,1999,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.111111,2.625,,,,3.875,,,,6.666667,4.722222,1.388889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.333333,6.666667,,6.555555,,,6.625,5.888889,5.666667,6.555555,,,,,,,,,,,,,,2025.1
1,2002,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.35,2.5,,,,4.0,,,,6.09,4.533333,2.111111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,,4.1,6.55,,4.88,5.85,5.0,6.3,6.33,,,,,,,,,,,,,,2025.1
2,2006,1,1,1,102,PS,21322.0,13.0,16.700001,2003,13.5,5,1.0,3.5,3.17,,,,2.83,,,,5.71,4.3,3.43,,2.67,6.5,2.33,6.83,2.0,8.0,,3.67,6.5,2.33,5.67,,,,1.33,5.67,2.5,6.33,,4.2,,,3.17,4.33,3.17,6.33,4.4,3.0,,,,,,6.17,7.0,,,2.67,3.5,,6.5,,4.0,,,,6.67,,,5.83,6.0,,,,,,,,,,,,,2025.1


vanilla/plain migration data:
9705


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob
2054628059040,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282
2054601056272,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501
0,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925


Now prepare the datasets for merging

In [4]:
# work on copies
minim_enriched_migration_df = enriched_migration_df.copy(deep=True)
harmonized_CHES_df = raw_CHES_df.copy(deep=True)

In [5]:
# only keep the relevant columns we also have in the original migration dataset (+ the ones we need for the final merge)
output_cols_migration = plain_migration_df.columns.tolist()
output_cols_merger = ["ep_identifier", "party_id_memb", "party_id_ches", "party_label_memb", "party_all_labels_memb", "party_all_names_memb", "party_name_ches", "party_name_en_ches"]
output_cols_minim_all = output_cols_migration + output_cols_merger
# unify column names throughput the merge process
minim_enriched_migration_df = minim_enriched_migration_df[output_cols_minim_all].rename(columns={"ep_identifier" : "person_id_ep", "party_id_memb" : "party_id_ep", "party_label_memb" : "party_label_ep", "party_all_labels_memb" : "party_all_labels_ep", "party_all_names_memb" : "party_all_names_ep"})
display(minim_enriched_migration_df.head(3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,2014,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party]
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,2014,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]"
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,2014,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]"


In [6]:
# define all the ches columns we have
ches_cols_match = ["year", "country","eastwest", "eumember", "party_id", "party", "cmp_id"]
# apparently we do not have an "expert" column as stated in the CHES codebook :(
ches_cols_meta = ["vote", "seat", "electionyear", "epvote", "family","govt", "chesversion"]
ches_cols_ideo = ["lrgen", "lrecon", "lrecon_salience", "lrecon_dissent", "lrecon_blur", "galtan", "galtan_salience", "galtan_dissent", "galtan_blur"]
# NOTE: Scores in col "eu_position" is from 1-7 instead of 0-10 !!
ches_cols_eu_integ = ["eu_position", "eu_salience", "eu_dissent", "eu_blur"]
ches_cols_policy_dim = ["spendvtax", "spendvtax_salience", "deregulation", "dereg_salience", "redistribution", "redist_salience", "econ_interven", "civlib_laworder", "civlib_salience", "sociallifestyle", "social_salience", "womens_rights", "lgbtq_rights", "samesex_marriage", "religious_principles", "relig_salience", "immigrate_policy", "immigrate_salience", "immigrate_dissent", "multiculturalism", "multicult_salience", "multicult_dissent", "nationalism", "nationalism_salience", "ethnic_minorities", "ethnic_salience", "urban_rural", "urban_salience", "environment", "enviro_salience", "climate_change", "climate_change_salience", "protectionism", "regions", "region_salience", "international_security", "international_salience", "us", "us_salience"]
# NOTE: col "eu_benefit" is only 3-scale score (!) and apart from col "eu_russia", the rest of the columns are from 1-7 instead of 0-10 !!
ches_cols_eu_policy = ["eu_benefit", "eu_ep", "eu_fiscal", "eu_intmark", "eu_employ", "eu_budgets", "eu_agri", "eu_cohesion", "eu_environ", "eu_asylum", "eu_foreign", "eu_turkey", "eu_russia"]
ches_cols_salience = ["russian_interference", "anti_islam_rhetoric"]
ches_cols_party_char = ["people_vs_elite", "antielite_salience", "corrupt_salience", "members_vs_leadership", "executive_power", "judicial_independence"]
# NOTE: mip is categorical/string value
ches_cols_mip = ["mip_one", "mip_two", "mip_three"]
ches_cols_lists = [ches_cols_match, ches_cols_meta, ches_cols_ideo, ches_cols_eu_integ, ches_cols_policy_dim, ches_cols_eu_policy, ches_cols_salience, ches_cols_party_char, ches_cols_mip]
ches_cols_all = [col for col_list in ches_cols_lists for col in col_list]

# quick sanity check
#display(raw_CHES_df[ches_cols_all].head())
#print(raw_CHES_df.shape)
#print(raw_CHES_df[ches_cols_all].shape)

In [7]:
ordinal_cols = ["eu_benefit"]
# define the columns with scale 1-7 we need to re-calibrate (according to CHES codebook)
cols_in_1_7_range = ["eu_position"] + ches_cols_eu_policy
cols_in_1_7_range.remove("eu_benefit")
cols_in_1_7_range.remove("eu_russia")

In [8]:
# harmonize the relevant migration and ches dataset/columns (for merging)
harmonized_CHES_df = harmonized_CHES_df
display(harmonized_CHES_df.info())

# convert "year" and "party_id" col dtype to (pandas) string type
harmonized_CHES_df["party_id"] = harmonized_CHES_df["party_id"].astype("string").str.strip()
harmonized_CHES_df["year"] = harmonized_CHES_df["year"].astype("Int64")
display(harmonized_CHES_df.info())

# do same for migration dataset
#display(minim_enriched_migration_df.info())
minim_enriched_migration_df["party_id_ches"] = minim_enriched_migration_df["party_id_ches"].astype("string").str.strip()
minim_enriched_migration_df["year"] = minim_enriched_migration_df["year"].astype("Int64")
minim_enriched_migration_df["date"] = minim_enriched_migration_df["date"].astype("string").str.strip()
display(minim_enriched_migration_df.info())

<class 'pandas.DataFrame'>
RangeIndex: 1441 entries, 0 to 1440
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     1441 non-null   int64  
 1   country                  1441 non-null   int64  
 2   eastwest                 1441 non-null   int64  
 3   eumember                 1441 non-null   int64  
 4   party_id                 1441 non-null   int64  
 5   party                    1437 non-null   object 
 6   cmp_id                   1134 non-null   float64
 7   vote                     1377 non-null   float64
 8   seat                     1372 non-null   float64
 9   electionyear             1441 non-null   int64  
 10  epvote                   1221 non-null   float64
 11  family                   1441 non-null   int64  
 12  govt                     1196 non-null   float64
 13  lrgen                    1441 non-null   float64
 14  lrecon                   1441 non-n

None

<class 'pandas.DataFrame'>
RangeIndex: 1441 entries, 0 to 1440
Data columns (total 90 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     1441 non-null   Int64  
 1   country                  1441 non-null   int64  
 2   eastwest                 1441 non-null   int64  
 3   eumember                 1441 non-null   int64  
 4   party_id                 1441 non-null   string 
 5   party                    1437 non-null   object 
 6   cmp_id                   1134 non-null   float64
 7   vote                     1377 non-null   float64
 8   seat                     1372 non-null   float64
 9   electionyear             1441 non-null   int64  
 10  epvote                   1221 non-null   float64
 11  family                   1441 non-null   int64  
 12  govt                     1196 non-null   float64
 13  lrgen                    1441 non-null   float64
 14  lrecon                   1441 non-n

None

<class 'pandas.DataFrame'>
RangeIndex: 9705 entries, 0 to 9704
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           9705 non-null   int64  
 1   speaker              9705 non-null   object 
 2   text                 9705 non-null   object 
 3   date                 9705 non-null   string 
 4   agenda               9705 non-null   object 
 5   speechnumber         9705 non-null   int64  
 6   procedure_ID         9705 non-null   object 
 7   partyfacts_ID        8529 non-null   float64
 8   period               9705 non-null   int64  
 9   chair                9705 non-null   bool   
 10  MEP                  9705 non-null   bool   
 11  commission           9705 non-null   bool   
 12  written              9705 non-null   bool   
 13  multispeaker         9705 non-null   bool   
 14  link                 9705 non-null   object 
 15  translatedText       9705 non-null   object 
 16 

None

In [9]:
# now re-calibrate the column values with range 1-7 to common range 0-10
harmonized_CHES_df[cols_in_1_7_range] = (
    (harmonized_CHES_df[cols_in_1_7_range] - 1) / (7 - 1) * 10
)

Lets merge such that each speech/row of migration dataset gets the interpolated ches scores of past and future for the corresponding national party of the MEP giving the speech, (i.e. this assumes
expect party positions to move meaningfully between waves),
like so

**Linear interpolation** between adjacent waves (no extrapolation):

For ches surveys at year w1 and w2 and speech at year t with w1 < t < w2,

we do:


![alt text](image.png)


Pros:
- avoids discontinuous jumps at wave boundaries
- uses future information only to estimate the value between two observed measurements
conceptually matches “positions evolve gradually”

Cons:
- we must justify smoothness (and party shifts could be abrupt)
- CHES is expert perception; movement may partly reflect events near the later wave

BUT plain merging does not leave us with enough datapoints and interpolating is something already done in the literature (see above/report)
- if we do this, keep it as a secondary specification (?) unless our paper is explicitly about continuous party-position trajectories

In [10]:
# columns in right_df we want to interpolate (we chose all numeric columns but drop ordinal/categorical ones, i.e. remove "eu_benefit")
value_cols_lists = [ches_cols_ideo, ches_cols_eu_integ, ches_cols_policy_dim, ches_cols_eu_policy, ches_cols_salience, ches_cols_party_char]
value_cols = [col for col_list in value_cols_lists for col in col_list]
value_cols.remove("eu_benefit")

# work on copies and create stable row id so we can restore original order
left = minim_enriched_migration_df.copy(deep=True).reset_index(drop=True)
left["_row_id"] = left.index
right = raw_CHES_df.copy(deep=True)

In [11]:
# remove potential ambiguity in column names before merging
left.drop("year", inplace=True, axis=1)
left.rename(columns={"party" : "party_mig"}, inplace=True)
right.rename(columns={"year" : "year_ches"}, inplace=True)

display(left.head(2))
display(right.head(2))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,_row_id
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],0
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",1


Unnamed: 0,year_ches,country,eastwest,eumember,party_id,party,cmp_id,vote,seat,electionyear,epvote,family,govt,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_benefit,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,mip_one,mip_two,mip_three,chesversion
0,1999,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.111111,2.625,,,,3.875,,,,6.666667,4.722222,1.388889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.333333,6.666667,,6.555555,,,6.625,5.888889,5.666667,6.555555,,,,,,,,,,,,,,2025.1
1,2002,1,1,1,102,PS,21322.0,10.2,12.7,1999,9.59,5,1.0,3.35,2.5,,,,4.0,,,,6.09,4.533333,2.111111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,,4.1,6.55,,4.88,5.85,5.0,6.3,6.33,,,,,,,,,,,,,,2025.1


In [12]:
# ensure IDs same dtype (pandas string, handles NaN nicely for us)
left["party_id_ches"] = left["party_id_ches"].astype("string").str.strip()
right["party_id"] = right["party_id"].astype("string").str.strip()

In [13]:
# Year from left date
left["year_mig"] = pd.to_datetime(left["date"], errors="coerce").dt.year.astype("Int64")
# Right year as Int64
right["year_ches"] = pd.to_numeric(right["year_ches"], errors="coerce").astype("Int64")

Do the merge in a two step way, first make an exact match and with all unmatched left rows left, check if they have "bracketing" right rows (i.e. for left year 2017, there exist two right rows with year_1 < 2017 < year_2) perform merge with a pair of merge_asof calls (prev/next year within party)

In [14]:
# exact merge first
# exact merge: keep all left rows, add right columns where exact match exists
exact = left.merge(
    right[["party_id", "year_ches"] + value_cols],
    how="left",
    left_on=["party_id_ches", "year_mig"],
    right_on=["party_id", "year_ches"],
    validate="m:1",
)

# flag exact matches
exact["match_type"] = np.where(exact["party_id"].notna(), "exact", pd.NA)
exact["year_prev"] = pd.NA
exact["year_next"] = pd.NA

In [15]:
# now split
needs_interp = exact["match_type"].isna()
left_unmatched = exact.loc[needs_interp, ["_row_id", "party_id_ches", "year_mig"]].copy(deep=True)

In [16]:
# now interpolate only for unmatched rows (year-wise)
# i.e. we find, for each unmatched left row (using merge_asof)
# - prev: right year < year
# - next: right year > year

left_s  = left_unmatched.sort_values(["year_mig", "party_id_ches"]).copy(deep=True)
right_s = right.sort_values(["year_ches", "party_id"]).copy(deep=True)

prev = pd.merge_asof(
    left_s,
    right_s[["party_id", "year_ches"] + value_cols],
    left_on="year_mig",
    right_on="year_ches",
    left_by="party_id_ches",
    right_by="party_id",
    direction="backward",
    allow_exact_matches=False,   # strict < (since exact already handled)
)

nxt = pd.merge_asof(
    left_s,
    right_s[["party_id", "year_ches"] + value_cols],
    left_on="year_mig",
    right_on="year_ches",
    left_by="party_id_ches",
    right_by="party_id",
    direction="forward",
    allow_exact_matches=False,   # strict >
)


In [17]:
# Now compute interpolation (vectorized, no index alignment headaches with pandas since we use numpy arrays)

y  = left_s["year_mig"].to_numpy(dtype="float64")
y1 = prev["year_ches"].to_numpy(dtype="float64")
y2 = nxt["year_ches"].to_numpy(dtype="float64")

interp_ok = ~np.isnan(y1) & ~np.isnan(y2) & (y1 < y) & (y < y2)

interp_out = left_s[["_row_id", "party_id_ches", "year_mig"]].copy()
interp_out["match_type"] = pd.NA
interp_out.loc[interp_ok, "match_type"] = "interpolated"
interp_out["year_prev"] = pd.Series(prev["year_ches"].to_numpy()).where(interp_ok)
interp_out["year_next"] = pd.Series(nxt["year_ches"].to_numpy()).where(interp_ok)

for c in value_cols:
    v1 = prev[c].to_numpy(dtype="float64")
    v2 = nxt[c].to_numpy(dtype="float64")

    res = np.full(len(interp_out), np.nan, dtype="float64")
    ok = interp_ok & ~np.isnan(v1) & ~np.isnan(v2)
    res[ok] = v1[ok] + ((y[ok] - y1[ok]) / (y2[ok] - y1[ok])) * (v2[ok] - v1[ok])
    interp_out[c] = res

# rows without bracketing years remain NaN and match_type stays NA

Now combine first exact match and the second interpolated match

In [18]:
# we update only the rows that were unmatched after exact merge, using _row_id to align.
final = exact.copy()

# set interpolated values into the corresponding rows
final = final.merge(
    interp_out[["_row_id", "match_type", "year_prev", "year_next"] + value_cols],
    on="_row_id",
    how="left",
    suffixes=("", "_interp"),
)

# where exact match missing, fill from interpolation
mask = final["match_type"].isna() & final["match_type_interp"].notna()

final.loc[mask, "match_type"] = final.loc[mask, "match_type_interp"]
final.loc[mask, "year_prev"] = final.loc[mask, "year_prev_interp"]
final.loc[mask, "year_next"] = final.loc[mask, "year_next_interp"]

for c in value_cols:
    final.loc[mask, c] = final.loc[mask, f"{c}_interp"]

# cleanup
drop_cols = ["party_id"] + [f"{c}_interp" for c in value_cols] + ["match_type_interp", "year_prev_interp", "year_next_interp"]
final = final.drop(columns=[c for c in drop_cols if c in final.columns])

final = final.sort_values("_row_id").drop(columns="_row_id")

#At this point, we should have the following:
#   -> exact matches have match_type="exact", years prev/next are NA
#   -> interpolated matches have match_type="interpolated", prev/next filled
#   -> no match remains match_type NA and numeric cols NaN

In [19]:
display(final.head())

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,year_mig,year_ches,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,match_type,year_prev,year_next
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],2014,2014,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,exact,,
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",2014,2014,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,exact,,
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",2014,2014,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,exact,,
3,291861,Kristina Winberg,Fru talman! Schengenavtalet ingicks i en anda ...,2014-07-14,13. One-minute speeches on matters of politica...,24,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,Fru talman! The Schengen Agreement was conclud...,machine_pl,(extreme)_right,EDD/INDDEM/EFD,0.700463,124995,4253,1610,Sverigedemokraterna,[Sverigedemokraterna],[Sverigedemokraterna],[Sverigedemokraterna],[Sweden Democrats],2014,2014,7.761905,5.4,3.476191,,,9.238095,8.380953,,,1.272727,6.136363,0.95,,4.705883,,4.529412,,4.588235,,4.470588,9.277778,,8.157895,,,,,5.625,,9.777778,,,9.842105,,,9.777778,,9.8125,,7.3125,,7.5,,,,,5.25,,6.352941,,,,1.833333,,1.277778,,1.0625,,1.785714,,,1.235294,1.333333,,,,,8.894737,2.944444,,,,exact,,
4,291519,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,"Αξιότιμε κύριε Juncker, στις προτεραιότητές σα...",2014-07-15,5. Statement by the candidate for President of...,25,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,"Dear Mr Juncker, in your priorities you refer ...",machine_pl,christian_conservative,PPE,0.604105,125065,4055,402,N.D.,"[Ν.Δ., N.D.]","[Νέα Δημοκρατία, Nea Demokratia]",[Néa Dimokratía],[New Democracy],2014,2014,7.222222,7.111111,9.444445,,,7.0,6.333334,,,6.555555,7.444445,2.555556,,5.5,,6.333333,,6.555555,,6.0,7.111111,,8.111111,,,,,8.444445,,8.0,,,8.222222,,,7.444445,,8.222222,,6.0,,7.666666,,,,,5.714286,,4.428571,,,,6.428571,,6.555555,,5.555555,,6.777778,,,6.111111,3.714286,,,,,2.333333,4.555555,,,,exact,,


Perform some sanity checks on the final data

In [20]:
# distribution checks
# how many rows fell into each case?
print(final["match_type"].value_counts(dropna=False))

# coverage per numerical/interpolated column
print(final[value_cols].notna().mean().sort_values())

match_type
interpolated    7647
NaN             1397
exact            661
Name: count, dtype: int64
spendvtax_salience         0.000000
social_salience            0.000000
civlib_salience            0.000000
dereg_salience             0.000000
relig_salience             0.000000
eu_employ                  0.000000
eu_environ                 0.000000
eu_agri                    0.000000
international_salience     0.000000
us                         0.000000
us_salience                0.000000
eu_fiscal                  0.000000
urban_salience             0.000000
region_salience            0.000000
nationalism_salience       0.000000
ethnic_salience            0.000000
eu_russia                  0.008655
womens_rights              0.008655
climate_change_salience    0.008655
climate_change             0.008655
samesex_marriage           0.008655
lgbtq_rights               0.008655
judicial_independence      0.008655
executive_power            0.008655
eu_turkey                  0.025142


In [21]:
# row count preserved
assert len(final) == len(left)

# left keys unchanged
assert final["party_id_ches"].equals(left["party_id_ches"])
assert final["year_mig"].equals(left["year_mig"])

# exact match correctness, i.e. every exact match was really an exact match
exact = final["match_type"] == "exact"
# year_prev / year_next must be NaN
assert final.loc[exact, ["year_prev", "year_next"]].isna().all().all()

# exact match correspond to right data
exact_keys = final.loc[exact, ["party_id_ches", "year_mig"]]
right_keys = right.rename(columns={"party_id": "party_id_ches"})[["party_id_ches", "year_ches"]]

missing = exact_keys.merge(right_keys, how="left", indicator=True)
assert (missing["_merge"] == "both").all()

In [22]:
# apparently this assertion here fails, because missing rows that have no bracketing years
# still get assigned the match_type = interpolation (overwritten after we combine the results to final result)

# interpolated rows must have both bounds
#interp = final["match_type"] == "interpolated"
#assert final.loc[interp, "year_prev"].notna().all()
#assert final.loc[interp, "year_next"].notna().all()

# -> so lets instead ask it the other way:

bad = (final["match_type"] == "interpolated") & final["year_prev"].isna()

print(bad.sum())
display(final.loc[bad, ["party_id_ches", "year_mig", "year_prev", "year_next", "match_type", "year_ches"]].head(20))

bad_both_prev_next_missing = (final["match_type"] == "interpolated") & final["year_prev"].isna() & final["year_next"].isna()
print(bad_both_prev_next_missing.sum())

1596


Unnamed: 0,party_id_ches,year_mig,year_prev,year_next,match_type,year_ches
396,2203,2015,,,interpolated,
397,2701,2015,,,interpolated,
398,1206,2015,,,interpolated,
399,2601,2015,,,interpolated,
400,2302,2015,,,interpolated,
401,2603,2015,,,interpolated,
402,1205,2015,,,interpolated,
403,2605,2015,,,interpolated,
404,815,2015,,,interpolated,
405,815,2015,,,interpolated,


1596


In [23]:
# then check instead if all rows that are not exact match and not missing (i.e. year_ches not NaN) have bracketing years -> are interpolated
mask = (final["match_type"] == "interpolated") & final["year_ches"].notna()

assert final.loc[mask, "year_prev"].notna().all()
assert final.loc[mask, "year_next"].notna().all()
# if this assertion does not fail, it means our match_type column is wrongly labeling matches instead

In [24]:
# NOTE: Apparently the column "match_type" is not correct anymore (see above)
# FIX: in the end, check the match type and if in doubt, derive from actual data
y  = pd.to_numeric(final["year_mig"], errors="coerce").to_numpy(dtype="float64")
y_ches = pd.to_numeric(final["year_ches"], errors="coerce").to_numpy(dtype="float64")
y1 = pd.to_numeric(final["year_prev"], errors="coerce").to_numpy(dtype="float64")
y2 = pd.to_numeric(final["year_next"], errors="coerce").to_numpy(dtype="float64")

exact  = np.isnan(y1) & np.isnan(y2) & (y == y_ches)
interp = ~np.isnan(y1) & ~np.isnan(y2) & (y1 < y) & (y < y2)

final["match_type"] = "none"
final.loc[exact, "match_type"] = "exact"
final.loc[interp, "match_type"] = "interpolated"


interp = final["match_type"] == "interpolated"
assert final.loc[interp, "year_prev"].notna().all()
assert final.loc[interp, "year_next"].notna().all()

In [25]:
bad = (final["match_type"] == "interpolated") & final["year_prev"].isna()

print(bad.sum())
display(final.loc[bad, ["party_id_ches", "year_mig", "year_prev", "year_next", "match_type", "year_ches"]].head(20))

bad_both_prev_next_missing = (final["match_type"] == "interpolated") & final["year_prev"].isna() & final["year_next"].isna()
print(bad_both_prev_next_missing.sum())

0


Unnamed: 0,party_id_ches,year_mig,year_prev,year_next,match_type,year_ches


0


In [26]:
# bounds must actually bracket the left year
y  = pd.to_numeric(final["year_mig"],  errors="coerce").to_numpy(dtype="float64")
y1 = pd.to_numeric(final["year_prev"], errors="coerce").to_numpy(dtype="float64")
y2 = pd.to_numeric(final["year_next"], errors="coerce").to_numpy(dtype="float64")

interp = final["match_type"] == "interpolated"
brackets = (~np.isnan(y1)) & (~np.isnan(y2)) & (y1 < y) & (y < y2)

bad = interp & ~brackets

print("rows failing bracketing check:", bad.sum())
final.loc[bad, ["party_id_ches", "year_mig", "year_prev", "year_next"]].head(20)

assert bad.sum() == 0

rows failing bracketing check: 0


In [27]:
# distribution checks
# how many rows fell into each case?
display(final["match_type"].value_counts(dropna=False))

# coverage per numerical/interpolated column
display(final[value_cols].notna().mean().sort_values())

match_type
interpolated    5910
none            3134
exact            661
Name: count, dtype: int64

spendvtax_salience         0.000000
social_salience            0.000000
civlib_salience            0.000000
dereg_salience             0.000000
relig_salience             0.000000
eu_employ                  0.000000
eu_environ                 0.000000
eu_agri                    0.000000
international_salience     0.000000
us                         0.000000
us_salience                0.000000
eu_fiscal                  0.000000
urban_salience             0.000000
region_salience            0.000000
nationalism_salience       0.000000
ethnic_salience            0.000000
eu_russia                  0.008655
womens_rights              0.008655
climate_change_salience    0.008655
climate_change             0.008655
samesex_marriage           0.008655
lgbtq_rights               0.008655
judicial_independence      0.008655
executive_power            0.008655
eu_turkey                  0.025142
eu_ep                      0.025142
international_security     0.025142
multicult_dissent          0

Wonderful, now lets clean it up and store for later analysis

In [28]:
display(final.head())

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,block,party_mig,migration_prob,person_id_ep,party_id_ep,party_id_ches,party_label_ep,party_all_labels_ep,party_all_names_ep,party_name_ches,party_name_en_ches,year_mig,year_ches,lrgen,lrecon,lrecon_salience,lrecon_dissent,lrecon_blur,galtan,galtan_salience,galtan_dissent,galtan_blur,eu_position,eu_salience,eu_dissent,eu_blur,spendvtax,spendvtax_salience,deregulation,dereg_salience,redistribution,redist_salience,econ_interven,civlib_laworder,civlib_salience,sociallifestyle,social_salience,womens_rights,lgbtq_rights,samesex_marriage,religious_principles,relig_salience,immigrate_policy,immigrate_salience,immigrate_dissent,multiculturalism,multicult_salience,multicult_dissent,nationalism,nationalism_salience,ethnic_minorities,ethnic_salience,urban_rural,urban_salience,environment,enviro_salience,climate_change,climate_change_salience,protectionism,regions,region_salience,international_security,international_salience,us,us_salience,eu_ep,eu_fiscal,eu_intmark,eu_employ,eu_budgets,eu_agri,eu_cohesion,eu_environ,eu_asylum,eu_foreign,eu_turkey,eu_russia,russian_interference,anti_islam_rhetoric,people_vs_elite,antielite_salience,corrupt_salience,members_vs_leadership,executive_power,judicial_independence,match_type,year_prev,year_next
0,292003,Amjad Bashir,"Mr Arias, we have heard about the injustice do...",2014-07-02,12. Programme of activities of the Italian Pre...,41,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr Arias, we have heard about the injustice do...",original_pl,(extreme)_right,EDD/INDDEM/EFD,0.257913,124956,4051,1108,UKIP,[UKIP],[United Kingdom Independence Party],[United Kingdom Independence Party],[United Kingdom Independence Party],2014,2014,9.142858,8.571428,5.142857,,,9.285714,8.142858,,,1.142857,9.142858,0.714286,,8.833333,,8.333333,,7.5,,8.5,8.857142,,8.5,,,,,6.8,,10.0,,,9.8,,,9.833333,,8.428572,,5.666667,,9.0,,,,,5.75,,5.0,,,,1.166667,,2.571429,,1.142857,,1.285714,,,1.285714,1.0,,,,,9.285714,6.8,,,,exact,,
1,292004,Miguel Arias Cañete,"Sí, efectivamente, para luchar contra la inmig...",2014-07-02,12. Programme of activities of the Italian Pre...,42,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Yes, indeed, to fight against immigration we m...",machine_pl,christian_conservative,PPE,0.494101,1351,4024,502,PP,[PP],[Partido Popular],"[Partido Popular, Alianza-Partido Popular]","[People’s Party, People’s Alliance Party]",2014,2014,7.3,7.666667,8.777778,,,8.0,7.222222,,,6.8,6.9,0.6,,7.6,,7.9,,7.6,,7.3,8.3,,7.0,,,,,7.8,,8.1,,,8.25,,,7.2,,7.444445,,5.5,,7.375,,,,,7.5,,2.444444,,,,5.666667,,6.7,,5.555555,,6.75,,,6.75,4.555555,,,,,1.4,3.4,,,,exact,,
2,292025,Monika Smolková,– Taliansko predstavilo veľmi ambiciózny plán ...,2014-07-02,12. Programme of activities of the Italian Pre...,63,,6399.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,- Italy presented a very ambitious plan for it...,machine_pl,social_democratic,PSE/S&D,0.323411,96655,4159,2803,SMER-SD,[SMER-SD],[SMER-Sociálna demokracia],"[Strana Smer–Tretia Cesta, Smer–sociálna demok...","[Direction–Third Way, Direction–Social Democracy]",2014,2014,3.692308,2.571429,8.642858,,,6.928571,4.846154,,,6.142857,6.571429,1.454546,,2.714286,,2.214286,,2.642857,,1.714286,6.923077,,6.692307,,,,,4.857143,,6.461538,,,6.416666,,,6.785714,,7.307693,,5.857143,,7.153846,,,,,7.0,,5.0,,,,5.9,,5.846154,,5.071429,,6.833333,,,5.153846,4.6,,,,,3.714286,3.785714,,,,exact,,
3,291861,Kristina Winberg,Fru talman! Schengenavtalet ingicks i en anda ...,2014-07-14,13. One-minute speeches on matters of politica...,24,,6404.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,Fru talman! The Schengen Agreement was conclud...,machine_pl,(extreme)_right,EDD/INDDEM/EFD,0.700463,124995,4253,1610,Sverigedemokraterna,[Sverigedemokraterna],[Sverigedemokraterna],[Sverigedemokraterna],[Sweden Democrats],2014,2014,7.761905,5.4,3.476191,,,9.238095,8.380953,,,1.272727,6.136363,0.95,,4.705883,,4.529412,,4.588235,,4.470588,9.277778,,8.157895,,,,,5.625,,9.777778,,,9.842105,,,9.777778,,9.8125,,7.3125,,7.5,,,,,5.25,,6.352941,,,,1.833333,,1.277778,,1.0625,,1.785714,,,1.235294,1.333333,,,,,8.894737,2.944444,,,,exact,,
4,291519,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,"Αξιότιμε κύριε Juncker, στις προτεραιότητές σα...",2014-07-15,5. Statement by the candidate for President of...,25,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,"Dear Mr Juncker, in your priorities you refer ...",machine_pl,christian_conservative,PPE,0.604105,125065,4055,402,N.D.,"[Ν.Δ., N.D.]","[Νέα Δημοκρατία, Nea Demokratia]",[Néa Dimokratía],[New Democracy],2014,2014,7.222222,7.111111,9.444445,,,7.0,6.333334,,,6.555555,7.444445,2.555556,,5.5,,6.333333,,6.555555,,6.0,7.111111,,8.111111,,,,,8.444445,,8.0,,,8.222222,,,7.444445,,8.222222,,6.0,,7.666666,,,,,5.714286,,4.428571,,,,6.428571,,6.555555,,5.555555,,6.777778,,,6.111111,3.714286,,,,,2.333333,4.555555,,,,exact,,


In [29]:
# convert types into string where reasonable
# work on copy
final_output_interpolated = final.copy(deep=True)
print(final_output_interpolated["year_ches"].isna().sum()) # we have nan values since not all rows matched! -> we cannot have numpy int as dtype
print(final_output_interpolated["year_ches"].dtype)
print(final_output_interpolated["year_mig"].dtype)
print(final_output_interpolated["match_type"].dtype)
print(final_output_interpolated["year_prev"].dtype)
print(final_output_interpolated["year_next"].dtype)
#final_fallback_output["year_ches"] = final_fallback_output["year_ches"].astype("Int64") # convert to pandas int type

# since we have NaN values in columns match_type, year_prev and year_next, convert these to string
# ensure IDs same dtype (pandas string, handles NaN nicely for us)
final_output_interpolated["match_type"] = final_output_interpolated["match_type"].astype("string").str.strip()
final_output_interpolated["year_prev"] = final_output_interpolated["year_prev"].astype("string").str.strip()
final_output_interpolated["year_next"] = final_output_interpolated["year_next"].astype("string").str.strip()
print(final_output_interpolated["match_type"].dtype)
print(final_output_interpolated["year_prev"].dtype)
print(final_output_interpolated["year_next"].dtype)

9044
Int64
Int64
str
object
object
string
string
string


In [None]:
# now store for later analysis
# final_output_interpolated.to_parquet(PATH_MIGRATION_CHES_INTERPOLATED, engine='fastparquet')