In [2]:
import pandas as pd
import requests
import numpy as np
import preamble

# set display options for the notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Since the PLS dataset does not come with the European Parliament's person_id of each speaker, that we would need to enrich our dataset with each speaker's national party affiliation at the time of speech delivery, we fetch the metadata of all MEP's during the legislative periods of 2014-2024 from the API of the EP Open Data Portal, in particular their ep_ID to map those to the speakers in our (PLS) migration dataset.

TODO add description section about sanity checks with EU party

In [3]:
# define some helper functions
import unicodedata
import re
from typing import Optional

# 00 Normalize names
def normalize_name(x: object) -> str | None:
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).strip()
    if not s:
        return None

    # Unicode normalize (handles different codepoint compositions)
    s = unicodedata.normalize("NFKC", s)

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s)

    # Optional: make case-insensitive comparisons safer across languages
    # (keeps Greek/Latin letters as-is; just normalizes case)
    s = s.casefold()

    return s

# 01_Normalize the strings (lower-casing, removing accents/diacritics and stripping extra whitespace)
def normalize_str(s: str) -> str:
    if not isinstance(s, str):
        return s
    s = s.lower().strip()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s

#print(normalize_str("ÖVP"))
#print(normalize_str("Parti québécois"))
#print(normalize_str(" ÖVP "))

# helper function for lists (creates a normalized set)
#def normalize_list(lst):
#    return {normalize_str(x) for x in lst if isinstance(x, str)}
#
#print(normalize_list(["ÖVP", "Parti québécois", " ÖVP "]))

In [4]:
"""
Strict name normalization for matching.

Goal: reduce a personal name to a very basic form so that small differences
(whitespace, punctuation, case, accents/diacritics) disappear.

Example:
  "Jiří POSPÍŠIL" and "Jíri Pospísíl" -> "jiripospisil"
"""

_ALNUM_RE = re.compile(r"[^0-9a-z]+")  # after casefold+ascii translit we keep a-z0-9

def _strip_diacritics(s: str) -> str:
    # NFKD splits letters+diacritics; we drop combining marks
    return "".join(
        ch for ch in unicodedata.normalize("NFKD", s)
        if not unicodedata.combining(ch)
    )

def normalize_name_strict(name: object, *, transliterate: bool = True) -> Optional[str]:
    """
    Return a strict normalized key for a name, or None if empty.

    Steps:
      1) NFKC normalize
      2) casefold
      3) transliterate to ASCII Latin (optional, requires Unidecode)
      4) strip diacritics
      5) keep only [a-z0-9], remove everything else (spaces, punctuation, hyphens, etc.)
    """
    if name is None:
        return None

    s = str(name).strip()
    if not s:
        return None

    # 1) Unicode canonicalization
    s = unicodedata.normalize("NFKC", s)

    # 2) Case-insensitive across languages
    s = s.casefold()

    # 3) Transliterate Greek/Cyrillic/etc. to Latin if possible
    if transliterate:
        try:
            from unidecode import unidecode  # pip install Unidecode
            s = unidecode(s)
        except Exception:
            # If Unidecode isn't available, we continue without transliteration
            pass

    # 4) Remove accents/diacritics
    s = _strip_diacritics(s)

    # 5) Keep only letters/digits; this also removes whitespace between names
    s = _ALNUM_RE.sub("", s)

    return s or None



#tests = [
#    "Nótis Mariás",
#    "Jiří POSPÍŠIL",
#    "Jíri Pospísíl",
#    " Jean-Luc  Mélenchon ",
#    "O’Connor",
#    "Κυριάκος Μητσοτάκης",  # needs Unidecode to transliterate well
#    "Бойко Борисов",        # needs Unidecode to transliterate well
#]
#for t in tests:
#    print(f"{t!r} -> {normalize_name_strict(t)}")

First load the PLS data

In [5]:
# load the full pls dataset
original_pls_df = pd.read_parquet("data/final/full.parquet")
pls_df = original_pls_df.copy(deep=True)
display(pls_df.head())

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,migration_prob
0,1,Daniel Freund,"Frau Präsidentin, liebe Kolleginnen und Kolleg...",2024-04-25,2. Interinstitutional Body for Ethical Standar...,2,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, dear colleagues! Politics mus...",machine_gm,2024,green,Greens/EFA,0.000154,0.399109,0.000154,0.000154,0.000154,0.056084,0.000154,0.061098,0.173615,0.111376,0.000154,0.000154,0.000154,0.000154,0.000154,0.015829,0.000154,0.000154,0.068097,0.000154,0.055674,0.000154,0.000154,0.01407,0.000154,0.000154,0.000154,0.000154,0.04197,0.000154,0.000154
0,3,Sven Simon,"Madam President, colleagues, this last day of ...",2024-04-25,2. Interinstitutional Body for Ethical Standar...,4,,6398.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, colleagues, this last day of ...",original_gm,2024,christian_conservative,PPE,0.000397,0.137624,0.000397,0.000397,0.000397,0.104021,0.000397,0.000397,0.639629,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.108396,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397
0,4,Gabriele Bischoff,"Frau Präsidentin, werte Kommission, Kolleginne...",2024-04-25,2. Interinstitutional Body for Ethical Standar...,5,,6399.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, honourable Commission, collea...",machine_gm,2024,social_democratic,PSE/S&D,0.22439,0.371594,0.0002,0.0002,0.0002,0.251997,0.0002,0.017877,0.100508,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002,0.00607,0.0002,0.013827,0.0002,0.0002,0.0002,0.009344,0.0002,0.0002,0.0002,0.0002,0.0002,0.0002
0,5,Gilles Boyer,"Madame la Présidente, Monsieur le Commissaire,...",2024-04-25,2. Interinstitutional Body for Ethical Standar...,6,,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, Commissioner, it is always an...",machine_gm,2024,liberal,ELDR/ALDE/Renew,0.462239,0.000433,0.000433,0.000433,0.000433,0.21925,0.000433,0.000433,0.186607,0.000433,0.000433,0.000433,0.000433,0.000433,0.000433,0.023902,0.000433,0.025769,0.000433,0.000433,0.000433,0.000433,0.000433,0.000433,0.000433,0.000433,0.071834,0.000433,0.000433,0.000433,0.000433
0,6,Heidi Hautala,"Madam President, we really have to thank Danie...",2024-04-25,2. Interinstitutional Body for Ethical Standar...,7,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, we really have to thank Danie...",original_gm,2024,green,Greens/EFA,0.058282,0.246905,0.000472,0.000472,0.000472,0.253456,0.000472,0.000472,0.211278,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.048039,0.000472,0.000472,0.014229,0.000472,0.058577,0.000472,0.000472,0.098843,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472,0.000472


In [6]:
# load the migration dataset
original_migration_df = pd.read_parquet("data/final/migration.parquet")
migration_df = original_migration_df.copy(deep=True)
display(migration_df.head())

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob
2578264806592,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282
2577226584400,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501
0,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925
0,1054,Beata Kempa,Pani Przewodnicząca! Pani Komisarz! Potrzebuje...,2024-04-22,20. Amending Directive 2011/36/EU on preventin...,14,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam Commissioner! We need very specific, ver...",machine_gm,2024,(extreme)_right,ECR,0.256493
0,1056,Dorien Rookmaker,"Voorzitter, dank u wel, meneer Engerer, want u...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,16,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, thank you, Mr Engerer, becaus...",machine_gm,2024,(extreme)_right,ECR,0.475315


Now let's fetch the meta data of all the MEPs from the API of the EP Open Data Portal

In [7]:
# now fetch the (basic) data on the MEPs from the EU OpenData Portal
mep_data = {}
# we have 11 terms in total (but we start with the 5th term, i.e. 1999-2004 term)
for term in range(5,11):
    response = requests.get("https://data.europarl.europa.eu/api/v2/meps",
                            params={"parliamentary-term": term, "format": "application/ld+json"})
    if response.status_code == 200:
        mep_data[term] = response.json()['data']
    else:
        print(f"Couldn't find data on LT: {term}")
print("Finished without error.")
# merge everything into one dataframe
mep_df = pd.DataFrame([{**mep, "period": term} for term, data in mep_data.items() for mep in data])
#display(mep_df.head())

Finished without error.


In [8]:
# Now use the MEP's identifier to fetch the relevant metadata
metadata = []
identifiers = list(set(mep_df['identifier']))
print("Amount of (unique) identifiers in mep_df:", len(identifiers))
batch_size = 128
idx = 0
while idx < len(identifiers):
    print(f"idx: {idx}")
    batch = identifiers[idx: idx + batch_size]
    response = requests.get(f"https://data.europarl.europa.eu/api/v2/meps/{','.join(batch)}",
                                params={"format": "application/ld+json"})

    if response.status_code == 200:
        metadata.extend(response.json()['data'])
    else:
        print(f"An error occurred: {response.status_code}")

    idx += batch_size
print("Finished without error.")
# create dataframe
original_mep_metadata = pd.DataFrame(metadata)

Amount of (unique) identifiers in mep_df: 3294
idx: 0
idx: 128
idx: 256
idx: 384
idx: 512
idx: 640
idx: 768
idx: 896
idx: 1024
idx: 1152
idx: 1280
idx: 1408
idx: 1536
idx: 1664
idx: 1792
idx: 1920
idx: 2048
idx: 2176
idx: 2304
idx: 2432
idx: 2560
idx: 2688
idx: 2816
idx: 2944
idx: 3072
idx: 3200
Finished without error.


In [9]:
mep_metadata = original_mep_metadata.copy(deep=True)
display(mep_metadata.head())

Unnamed: 0,id,type,identifier,label,notation_codictPersonId,bday,hasGender,hasHonorificPrefix,hasMembership,citizenship,placeOfBirth,familyName,givenName,img,sortLabel,upperFamilyName,upperGivenName,deathDate,hasEmail,account,homepage,officialFamilyName,officialGivenName,upperOfficialFamilyName,upperOfficialGivenName
0,person/1103,Person,1103,Giorgio NAPOLITANO,1103,1925-06-29,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...,"[{'id': 'membership/1103-f-103657', 'type': 'M...",http://publications.europa.eu/resource/authori...,Napoli,Napolitano,Giorgio,https://www.europarl.europa.eu/mepphoto/1103.jpg,NAPOLITANO,NAPOLITANO,GIORGIO,,,,,,,,
1,person/111024,Person,111024,Jens NILSSON,111024,1948-09-25,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...,"[{'id': 'membership/111024-f-142216', 'type': ...",http://publications.europa.eu/resource/authori...,Västervik,Nilsson,Jens,https://www.europarl.europa.eu/mepphoto/111024...,NILSSON,NILSSON,JENS,2018-03-12,,,,,,,
2,person/117477,Person,117477,Mark DEMESMAEKER,117477,1958-09-12,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...,"[{'id': 'membership/117477-f-139130', 'type': ...",http://publications.europa.eu/resource/authori...,,Demesmaeker,Mark,https://www.europarl.europa.eu/mepphoto/117477...,DEMESMAEKER,DEMESMAEKER,MARK,,,,,,,,
3,person/1183,Person,1183,Dagmar ROTH-BEHRENDT,1183,1953-02-21,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...,"[{'id': 'membership/1183-f-99095', 'type': 'Me...",http://publications.europa.eu/resource/authori...,Frankfurt/Main,Roth-Behrendt,Dagmar,https://www.europarl.europa.eu/mepphoto/1183.jpg,ROTH-BEHRENDT,ROTH-BEHRENDT,DAGMAR,,,,,,,,
4,person/1186,Person,1186,Astrid LULLING,1186,1929-06-11,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...,"[{'id': 'membership/1186-f-99329', 'type': 'Me...",http://publications.europa.eu/resource/authori...,Schifflange,Lulling,Astrid,https://www.europarl.europa.eu/mepphoto/1186.jpg,LULLING,LULLING,ASTRID,,,,,,,,


Perform some cleaning/basic preprocessing of the metadata
- remove url-type values/columns
- pull all name-related columns together as set of (normalized) values
- additionally perform a "strict" normalization of the label column only
- drop unneeded columns

In [10]:
# do some cleaning of the metadata
# replace URI entries with actual data for gender, citizenship and prefix
mep_metadata['gender'] = mep_metadata['hasGender'].apply(lambda gstr: gstr[len("http://publications.europa.eu/resource/authority/human-sex/"):])

mep_metadata['citizenship'] = mep_metadata['citizenship'].apply(lambda gstr: gstr[len("http://publications.europa.eu/resource/authority/country/"):])

mep_metadata['honorific_prefix'] =  mep_metadata['hasHonorificPrefix'].apply(lambda prefix: prefix[len("http://publications.europa.eu/resource/authority/honorific/"):] if isinstance(prefix, str) else prefix)

# get rid of ambiguous types in column "placeOfBirth"
mep_metadata['placeOfBirth'] = mep_metadata['placeOfBirth'].apply(lambda x: str(x) if isinstance(x, list) else x)

# normalize the names and create set of possible (norm) names for each MEP
mep_metadata["name_set_norm"] = mep_metadata[["label", "familyName", "givenName", "sortLabel", "upperFamilyName", "upperGivenName",
                                "officialFamilyName", "officialGivenName", "upperOfficialFamilyName", "upperOfficialGivenName"
                                              ]].apply(
    lambda r: list(
        set(
            nx for nx in (normalize_name(x) for x in r)
            if nx  # drops None and "" (empty string)
        )
    ),
    axis=1
)

# also perform a "strict" normalization of the label column only
mep_metadata["label_strict_norm"] = mep_metadata["label"].apply(normalize_name_strict)
#display(mep_metadata.head())

In [11]:
# drop unneeded columns
meta_cleaned_df = mep_metadata[["identifier", "label", "label_strict_norm", "name_set_norm",
                                "citizenship", "placeOfBirth", "bday", "deathDate", "gender", "hasMembership"
                                ]]
display(meta_cleaned_df.head())

Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership
0,1103,Giorgio NAPOLITANO,giorgionapolitano,"[giorgio, giorgio napolitano, napolitano]",ITA,Napoli,1925-06-29,,MALE,"[{'id': 'membership/1103-f-103657', 'type': 'M..."
1,111024,Jens NILSSON,jensnilsson,"[nilsson, jens nilsson, jens]",SWE,Västervik,1948-09-25,2018-03-12,MALE,"[{'id': 'membership/111024-f-142216', 'type': ..."
2,117477,Mark DEMESMAEKER,markdemesmaeker,"[mark demesmaeker, demesmaeker, mark]",BEL,,1958-09-12,,MALE,"[{'id': 'membership/117477-f-139130', 'type': ..."
3,1183,Dagmar ROTH-BEHRENDT,dagmarrothbehrendt,"[dagmar, dagmar roth-behrendt, roth-behrendt]",DEU,Frankfurt/Main,1953-02-21,,FEMALE,"[{'id': 'membership/1183-f-99095', 'type': 'Me..."
4,1186,Astrid LULLING,astridlulling,"[astrid, lulling, astrid lulling]",LUX,Schifflange,1929-06-11,,FEMALE,"[{'id': 'membership/1186-f-99329', 'type': 'Me..."


Perform some initial/exploratory investigation, namely:
1. Investigate if combination of MEP's identifiers and (strict norm) names/labels is unique

In [12]:
# 00. do we have the same amount of unique identifiers and names/labels (strict norm)?
strict_norm_counts = meta_cleaned_df["label_strict_norm"].nunique()
id_counts = meta_cleaned_df["identifier"].nunique()
print("(strict norm) label counts: ", strict_norm_counts)
print("id counts: ", id_counts)
print("meta_df shape: ", meta_cleaned_df.shape[0])
print(f"We have {id_counts-strict_norm_counts} unique IDs more, than we have unique (strict norm.) labels/person names!")

# 01. who has more than one ID assigned?
mep_person_id_counts = meta_cleaned_df.groupby("label_strict_norm")["identifier"].nunique()
mep_id_person_counts = meta_cleaned_df.groupby("identifier")["label_strict_norm"].nunique()
#print(mep_person_id_counts)

mep_person_w_multiple_ids = mep_person_id_counts[mep_person_id_counts > 1]
mep_id_w_multiple_labels = mep_id_person_counts[mep_id_person_counts > 1]
print("Amount of Persons with multiple IDs:")
print(mep_person_w_multiple_ids)
print("Amount of Identifiers with multiple (strict norm) Names/Labels:")
print(mep_id_w_multiple_labels)

(strict norm) label counts:  3292
id counts:  3294
meta_df shape:  3294
We have 2 unique IDs more, than we have unique (strict norm.) labels/person names!
Amount of Persons with multiple IDs:
label_strict_norm
jiripospisil    2
pavelsvoboda    2
Name: identifier, dtype: int64
Amount of Identifiers with multiple (strict norm) Names/Labels:
Series([], Name: label_strict_norm, dtype: int64)


It seems that all combinations of person name and ID are unique except for two individuals. "Jiří POSPÍŠIL" and "Pavel SVOBODA" have each two IDs assigned. This needs further investigation!

In [13]:
display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "jiripospisil"])
corrupt_jiri = meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "jiripospisil"]
display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "pavelsvoboda"])
corrupt_pavel = meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "pavelsvoboda"]

Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership
444,23845,Jiří POSPÍŠIL,jiripospisil,"[pospíšil, pospisil, jiří pospíšil, jiří]",CZE,Brno,1949-05-09,,MALE,"[{'id': 'membership/23845-f-109156', 'type': '..."
1431,125706,Jiří POSPÍŠIL,jiripospisil,"[pospíšil, pospisil jiri, jiří pospíšil, jiří]",CZE,Chomutov,1975-11-24,,MALE,"[{'id': 'membership/125706-f-151047', 'type': ..."


Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership
2039,96272,Pavel SVOBODA,pavelsvoboda,"[pavel svoboda, svobodap, svoboda, pavel]",CZE,Praha,1962-04-09,,MALE,"[{'id': 'membership/96272-f-150543', 'type': '..."
2742,23708,Pavel SVOBODA,pavelsvoboda,"[pavel svoboda, svoboda, pavel]",CZE,Praha,1969-03-09,,MALE,"[{'id': 'membership/23708-f-109121', 'type': '..."


In [14]:
# check for with period / legislative term these MEPs served (between 5th and 10th term)
print("Entries in MEP data for speaker: Jiří POSPÍŠIL")
display(mep_df.loc[mep_df["label"].apply(normalize_name_strict) == "jiripospisil"])
print("Entries in MEP data for speaker: Pavel SVOBODA")
display(mep_df.loc[mep_df["label"].apply(normalize_name_strict) == "pavelsvoboda"])

Entries in MEP data for speaker: Jiří POSPÍŠIL


Unnamed: 0,id,type,identifier,label,familyName,givenName,sortLabel,period,officialFamilyName,officialGivenName
384,person/23845,Person,23845,Jiří POSPÍŠIL,Pospíšil,Jiří,POSPISIL,5,,
3271,person/125706,Person,125706,Jiří POSPÍŠIL,Pospíšil,Jiří,POSPISIL JIRI,8,,
4135,person/125706,Person,125706,Jiří POSPÍŠIL,Pospíšil,Jiří,POSPISIL JIRI,9,,


Entries in MEP data for speaker: Pavel SVOBODA


Unnamed: 0,id,type,identifier,label,familyName,givenName,sortLabel,period,officialFamilyName,officialGivenName
425,person/23708,Person,23708,Pavel SVOBODA,Svoboda,Pavel,SVOBODA,5,,
3095,person/96272,Person,96272,Pavel SVOBODA,Svoboda,Pavel,SVOBODAP,8,,


**Intermed Result:**
The politician Jiří POSPÍŠIL with ID 23845 only served for the 5th term (-> not relevant for us), the other with ID 125706 for the 8th and 9th term (-> potentially relevant for us).
The same goes for the politician Pavel SVOBODA, the one with ID 23708 only served in the 5th term (irrelevant), the other with ID 96272 served for the 8th term (-> potentially relevant for us). Since we ultimately are only interested in the 2014-2024 time period.

In [15]:
# check if these labels also appear in the migration dataset
print("Entries in migration dataset for speaker: Jiří POSPÍŠIL")
display(migration_df.loc[migration_df["speaker"].apply(normalize_name_strict) == "jiripospisil"].head(5))
print("Entries in migration dataset for speaker: Pavel SVOBODA")
display(migration_df.loc[migration_df["speaker"].apply(normalize_name_strict) == "pavelsvoboda"].head(5))

Entries in migration dataset for speaker: Jiří POSPÍŠIL


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob
8244227693845946468,9804,Jiří Pospíšil,"Pane předsedající, tady já budu velmi stručný....",2023-10-18,24.3. Schengen area: digitalisation of the vis...,1,,6398.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr. President, I will be very brief. I conside...",machine_gm,2023,christian_conservative,PPE,0.447782
2314885530818453536,13487,Jiří Pospíšil,"Pane předsedající, já jsem podpořil toto usnes...",2023-07-12,"22.4. Situation in Lebanon (RC-B9-0323/2023, B...",1,,6398.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, I supported this resolution, and...",machine_gm,2023,christian_conservative,PPE,0.380926
7163380306301427744,14656,Jiří Pospíšil,"Paní předsedající, vždy jsem velmi pozorně pos...",2023-06-14,"19.2. Humanitarian situation in Sudan, in part...",14,,6398.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, I have always listened very a...",machine_gm,2023,christian_conservative,PPE,0.269396
2314885530818453536,64453,Jiří Pospíšil,"Pane předsedající, já jsem se také chtěl zapoj...",2019-04-17,13. European Border and Coast Guard (debate)20...,38,bill_26275_ID,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr. President, I also wanted to join the debat...",machine_pl,2019,christian_conservative,PPE,0.343389
2314885866834519156,88454,Jiří Pospíšil,"Já chci opětovně upozornit na problém, na kter...",2017-11-29,24. One-minute speeches on matters of politica...,2,,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,I want to once again draw attention to a probl...,machine_pl,2017,christian_conservative,PPE,0.278574


Entries in migration dataset for speaker: Pavel SVOBODA


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob
243708,124440,Pavel Svoboda,Nepodpořil jsem přijetí zprávy o lidských práv...,2016-10-25,8.14. Human rights and migration in third coun...,168,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,I did not support the adoption of the report o...,machine_pl,2016,christian_conservative,PPE,0.440498
244171,132256,Pavel Svoboda,Nepodpořil jsem dnes zprávu na téma azylu a do...,2016-09-15,12.4. Asylum: provisional measures in favour o...,163,bill_2168_ID,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,I did not support the report today on the subj...,machine_pl,2016,christian_conservative,PPE,0.695423
244237,135255,Pavel Svoboda,"Pane předsedající, pane komisaři, podporuji ná...",2016-09-14,15. Travel document for the return of illegall...,22,bill_2234_ID bill_2234_ID,6398.0,8,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner, I support the prop...",machine_pl,2016,christian_conservative,PPE,0.415694
244671,140841,Pavel Svoboda,Podpořil jsem zřízení evropské pohraniční a po...,2016-07-06,7.5. European Border and Coast Guard (A8-0200/...,224,bill_2236_ID bill_2236_ID,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,I supported the establishment of a European bo...,machine_pl,2016,christian_conservative,PPE,0.657059
244969,143603,Pavel Svoboda,Při dnešním hlasování o zprávě nazvané „Uprchl...,2016-07-05,5.4. Refugees: social inclusion and integratio...,212,,6398.0,8,False,True,False,True,False,https://www.europarl.europa.eu/doceo/document/...,I abstained from today's vote on the report en...,machine_pl,2016,christian_conservative,PPE,0.334924


**RESULT**
The combination of identifier and person label is unique in metadata except for two labels, namely
- "Jiří POSPÍŠIL" (CZE), born 09.05.1949 in Chomutov (MEP from 1 May 2004 – 19 July 2004 -> 5th term only) [see wiki](https://en.wikipedia.org/wiki/Ji%C5%99%C3%AD_Posp%C3%AD%C5%A1il_(politician,_born_1949))
- "Jiří POSPÍŠIL" (CZE), born 24.11.1975 in Brno (MEP from 1 July 2014 – 15 July 2024) [see wiki](https://en.wikipedia.org/wiki/Ji%C5%99%C3%AD_Posp%C3%AD%C5%A1il)
-> there are indeed two EU politicians with that name

and
- "Pavel SVOBODA" (CZE), born 09.04.1962 in Praha [see wiki](https://en.wikipedia.org/wiki/Pavel_Svoboda) and [see europarl](https://www.europarl.europa.eu/meps/en/96272/PAVEL_SVOBODA/history/8)
- "Pavel SVOBODA" (CZE), born 09.03.1969 in Praha [see europarl](https://www.europarl.europa.eu/meps/en/23708/PAVEL_SVOBODA/history/5)

This is indeed no artefact but two existing MEP's each shared the same name with another MEP.
In both cases only one of the speakers, namely Jiří POSPÍŠIL (ID 125706) and Pavel SVOBODA (ID 96272) served in a term relevant for us and also contributed speeches to the migration dataset (-> relevant for us!).

**CONSEQUENCE** Omit the irrelevant speakers from metadata / exclude them from the mapping (in order to avoid disambiguity based on the (normalized) speaker name)

In [16]:
# remove the irrelevant politicians (see above) from the metadata / further processing & mapping
meta_cleaned_df = meta_cleaned_df[~meta_cleaned_df["identifier"].isin(["23845", "23708"])]
display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"].isin(["jiripospisil", "pavelsvoboda"])])
###
ambiguous = meta_cleaned_df.groupby("label_strict_norm")["identifier"].nunique()
ambiguous = ambiguous[ambiguous > 1]
if len(ambiguous) > 0:
    print("ERROR")
else:
    print("FINISHED Cleaning")

Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership
1431,125706,Jiří POSPÍŠIL,jiripospisil,"[pospíšil, pospisil jiri, jiří pospíšil, jiří]",CZE,Chomutov,1975-11-24,,MALE,"[{'id': 'membership/125706-f-151047', 'type': ..."
2039,96272,Pavel SVOBODA,pavelsvoboda,"[pavel svoboda, svobodap, svoboda, pavel]",CZE,Praha,1962-04-09,,MALE,"[{'id': 'membership/96272-f-150543', 'type': '..."


FINISHED Cleaning


Let's also briefly investigate the names in our pls/migration dataset to look for potential disambiguity.

In [17]:
# normalize the speaker names (strictly)
pls_df["speaker_loose_norm"] = pls_df["speaker"].apply(normalize_str)
migration_df["speaker_loose_norm"] = migration_df["speaker"].apply(normalize_str)
###
pls_df["speaker_strict_norm"] = pls_df["speaker"].apply(normalize_name_strict)
migration_df["speaker_strict_norm"] = migration_df["speaker"].apply(normalize_name_strict)

In [18]:
# check if the combinations of normalized speaker name, legislative period and EU party membership are coherent,
# i.e. group by speaker and period, then check for party
pls_party_counts_norm = pls_df.groupby(["speaker", "period"])["party"].nunique(dropna=False)
# groups where party changes (or multiple different values incl. NaN)
bad_pls_parties_norm = pls_party_counts_norm[pls_party_counts_norm > 1]
#print(bad_pls_parties_norm)

# do the same for migration
migration_party_counts_norm = migration_df.groupby(["speaker_strict_norm", "period"])["party"].nunique(dropna=False)
# groups where party changes (or multiple different values incl. NaN)
bad_migration_parties_norm = migration_party_counts_norm[migration_party_counts_norm > 1]
print(bad_migration_parties_norm)
print(len(bad_migration_parties_norm))

# now for migration but without the normalization of speaker name
# do the same for migration
migration_party_counts = migration_df.groupby(["speaker", "period"])["party"].nunique(dropna=False)
# groups where party changes (or multiple different values incl. NaN)
bad_migration_parties = migration_party_counts[migration_party_counts > 1]
print(bad_migration_parties)
print(len(bad_migration_parties))

speaker_strict_norm             period
amjadbashir                     8         2
antanasguoga                    8         2
beatrixvonstorch                8         2
bernardmonot                    8         2
helmutgeuking                   9         2
janiceatkinson                  8         2
joseinaciofaria                 8         2
kristinawinberg                 8         2
laurentiurebega                 8         2
marcuspretzell                  8         2
mireilledornano                 8         2
rozagrafinvonthunundhohenstein  9         2
sophiemontel                    8         2
stefanomaullu                   8         2
sylwiaspurek                    9         2
teuvohakkarainen                9         2
vincenzosofo                    9         2
Name: party, dtype: int64
17
speaker                              period
Amjad Bashir                         8         2
Antanas Guoga                        8         2
Beatrix von Storch                   8    

Apparently, it can happen and does happen that MEPs are part of different European parties within a legislative period (i.e. they switch their EP party/group within a term). Keep this in mind for later analysis.

In [19]:
# now investigate if this happens within days too? (should not otherwise normalization must have mixed up different MEPs, i.e. different name but same normalization)
migration_party_counts_norm = migration_df.groupby(["speaker_strict_norm", "date"])["party"].nunique(dropna=False)
# groups where party changes within a day (or multiple different values incl. NaN)
bad_migration_parties_norm = migration_party_counts_norm[migration_party_counts_norm > 1]
print(bad_migration_parties_norm)
print(len(bad_migration_parties_norm))
##
# do the same for loose-/non normalized
migration_party_counts_norm = migration_df.groupby(["speaker_loose_norm", "date"])["party"].nunique(dropna=False)
# groups where party changes within a day (or multiple different values incl. NaN)
bad_migration_parties_norm = migration_party_counts_norm[migration_party_counts_norm > 1]
print(bad_migration_parties_norm)
print(len(bad_migration_parties_norm))
##
migration_party_counts_norm = migration_df.groupby(["speaker", "date"])["party"].nunique(dropna=False)
# groups where party changes within a day (or multiple different values incl. NaN)
bad_migration_parties_norm = migration_party_counts_norm[migration_party_counts_norm > 1]
print(bad_migration_parties_norm)
print(len(bad_migration_parties_norm))
##
# do the same for full pls dataset
pls_party_counts_norm = pls_df.groupby(["speaker_strict_norm", "date"])["party"].nunique(dropna=False) # "speaker" / "speaker_loose_norm"
# groups where party changes (or multiple different values incl. NaN)
bad_pls_parties_norm = pls_party_counts_norm[pls_party_counts_norm > 1]
print(bad_pls_parties_norm)
print(len(bad_pls_parties_norm))

Series([], Name: party, dtype: int64)
0
Series([], Name: party, dtype: int64)
0
Series([], Name: party, dtype: int64)
0
speaker_strict_norm  date      
antanasguoga         2016-10-04    2
n                    2015-09-09    2
Name: party, dtype: int64
2


**Result:** The (strict) normalization does not appear to have led to the mix up of different MEPs based on this quick sanity check. Indeed, the fact that MEP "antanasguoga" was part of 
two different European parties within the same day is reasonable, since for the 04.10.2016/05.10.2016 his switch from the ALDE Group to the EPP Group is recorded (see [here](https://www.europarl.europa.eu/meps/en/124763/ANTANAS_GUOGA/history/8) for more on this)

Now let's now perform the mapping between pls/migration dataset and EU metadata.

In [20]:
# work on copies
pls_df = original_pls_df.copy(deep=True)
migration_df = migration_df.copy(deep=True)

# this we already cleaned (so use it further)
meta_cleaned_df = meta_cleaned_df

# prepare the merge for pls/migration dataset, i.e.
# normalize the speaker names (strictly)
pls_df["speaker_loose_norm"] = pls_df["speaker"].apply(normalize_str)
migration_df["speaker_loose_norm"] = migration_df["speaker"].apply(normalize_str)
###
pls_df["speaker_strict_norm"] = pls_df["speaker"].apply(normalize_name_strict)
migration_df["speaker_strict_norm"] = migration_df["speaker"].apply(normalize_name_strict)

# prepare the merge for meta_df, i.e. (update/perform) normalization for speaker names/labels column
meta_cleaned_df["label_loose_norm"] = meta_cleaned_df["label"].apply(normalize_str)

# merge the pls/migration dataset on the (loose/) strict normalized speaker name
migration_matching_result = migration_df.merge(
    meta_cleaned_df,
    left_on="speaker_strict_norm",
    right_on="label_strict_norm",
    #left_on="speaker_loose_norm",
    #right_on="label_loose_norm",
    how="left"
)

pls_matching_result = pls_df.merge(
    meta_cleaned_df,
    left_on="speaker_strict_norm",
    right_on="label_strict_norm",
    #left_on="speaker_loose_norm",
    #right_on="label_loose_norm",
    how="left"
)

In [21]:
# investigate the merge for pls
# how many speeches did not get an identifier?
missing_pls = pls_matching_result["identifier"].isna().sum()
print("PLS speeches without identifier:", missing_pls)

# which speakers failed to match?
unmatched_pls_speakers = (
    pls_matching_result.loc[pls_matching_result["identifier"].isna(), "speaker_strict_norm"] #"speaker_loose_norm"]
    .value_counts()
)
#display(unmatched_pls_speakers)

PLS speeches without identifier: 16451


In [22]:
# stichproben analyse for pls
#display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "taniagonzalezpenasylolasanchezcaldentey"])
#display(pls_df.loc[pls_df["speaker_strict_norm"] == "taniagonzalezpenasylolasanchezcaldentey"])
#display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "dennisdejongenannemariemineur"])
#display(pls_df.loc[pls_df["speaker_strict_norm"] == "dennisdejongenannemariemineur"])
#display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "notesmarias"])
#display(pls_df.loc[pls_df["speaker_strict_norm"] == "notesmarias"].head())

In [23]:
# investigate the merge for migration dataset as well
# how many speeches did not get an identifier?
missing_ids_migration = migration_matching_result["identifier"].isna().sum()
print("Migration speeches without identifier:", missing_ids_migration)

# which speakers failed to match?
unmatched_migration_speeches = (
    migration_matching_result.loc[migration_matching_result["identifier"].isna(), ["speaker", "speaker_strict_norm",
                                                                                   "date", "period", "party", "multispeaker",
                                                                                   "written"
                                                                                   ]] #"speaker_loose_norm"]

)
#display(unmatched_migration_speeches.head())
unmatched_migration_speakers = (unmatched_migration_speeches[["speaker", "speaker_strict_norm"]]
                                .drop_duplicates("speaker_strict_norm"))
display(unmatched_migration_speakers)
print(unmatched_migration_speakers.shape[0])

# how many speeches per misspelled MEP are affected?
unmatched_migration_speaker_count = (
    migration_matching_result.loc[migration_matching_result["identifier"].isna(), "speaker_strict_norm"] #"speaker_loose_norm"]
    .value_counts()
)
display(unmatched_migration_speaker_count)

Migration speeches without identifier: 883


Unnamed: 0,speaker,speaker_strict_norm
35,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,
47,Ana Miranda,anamiranda
298,Jarosław Duda,jarosawduda
563,Maite Pagazaurtundúa Ruiz,maitepagazaurtunduaruiz
653,Theresa Muigg,theresamuigg
737,Róża Gräfin von Thun und Hohenstein,rozagrafinvonthunundhohenstein
1226,Pernille Weiss,pernilleweiss
1800,Yana Toom,yanatoom
2017,Anders Primdahl Vistisen,andersprimdahlvistisen
2496,Inmaculada Rodríguez-Piñero Fernández,inmaculadarodriguezpinerofernandez


29


speaker_strict_norm
maitepagazaurtunduaruiz               30
monikaflasikovabenova                 25
andersprimdahlvistisen                15
annaelzbietafotyga                     9
francescgambus                         9
anamiranda                             8
agnieszkakozowskarajewicz              8
yanatoom                               7
charlestannock                         7
evapaunova                             6
krystynamariaybacka                    5
bodilceballos                          5
rozagrafinvonthunundhohenstein         4
valdemartomasevski                     4
martinaandersonandlynnboylan           4
theresamuigg                           3
pernilleweiss                          3
inmaculadarodriguezpinerofernandez     3
jillianevans                           2
janecollinsandmikehookem               2
jordisebastia                          2
jarosawduda                            1
jakopdalunde                           1
m                                    

In [24]:
# stichproben analyse for migration as well (look into the original dataframes we merged together)
# NOTE: one possible error reason: misspelled speaker name
display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "notesmarias"])
display(migration_df.loc[migration_df["speaker_strict_norm"] == "notesmarias"].head(5))
# NOTE: another possible error reason: multispeaker speech and therefore two names in column "speaker" (together with
# some separator like "and" or "und" or "y")
display(meta_cleaned_df.loc[meta_cleaned_df["label_strict_norm"] == "martinaandersonandlynnboylan"])
display(migration_df.loc[migration_df["speaker_strict_norm"] == "martinaandersonandlynnboylan"])

Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership,label_loose_norm


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,speaker_loose_norm,speaker_strict_norm


Unnamed: 0,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership,label_loose_norm


Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,speaker_loose_norm,speaker_strict_norm
243360,111392,Martina Anderson and Lynn Boylan,The file sought for flexibility instruments to...,2016-12-01,7.18. Mobilisation of the Flexibility Instrume...,2,bill_2068_ID,6402.0,8,False,True,False,True,True,https://www.europarl.europa.eu/doceo/document/...,The file sought for flexibility instruments to...,original_pl,2016,left,NGL/The Left,0.306839,martina anderson and lynn boylan,martinaandersonandlynnboylan
248017,208464,Martina Anderson and Lynn Boylan,We voted in favour of the resolution as it cal...,2015-11-26,12.4. Education for children in emergency situ...,8,,6402.0,8,False,True,False,True,True,https://www.europarl.europa.eu/doceo/document/...,We voted in favour of the resolution as it cal...,original_pl,2015,left,NGL/The Left,0.386243,martina anderson and lynn boylan,martinaandersonandlynnboylan
248032,208936,Martina Anderson and Lynn Boylan,We supported this report as it calls on the Me...,2015-11-25,10.1. Draft amending budget No 8/2015: Own res...,3,,6402.0,8,False,True,False,True,True,https://www.europarl.europa.eu/doceo/document/...,We supported this report as it calls on the Me...,original_pl,2015,left,NGL/The Left,0.256227,martina anderson and lynn boylan,martinaandersonandlynnboylan
248064,209084,Martina Anderson and Lynn Boylan,We supported this report as it was necessary t...,2015-11-25,10.2. Mobilisation of the Flexibility Instrume...,4,,6402.0,8,False,True,False,True,True,https://www.europarl.europa.eu/doceo/document/...,We supported this report as it was necessary t...,original_pl,2015,left,NGL/The Left,0.547178,martina anderson and lynn boylan,martinaandersonandlynnboylan


**Intermed Result:**
Apparently, in the case of written speeches, i.e. multispeaker speeches, we have all the names (?) / multiple names in the column "speaker" for migration/pls data set. We need to consider these cases separately! There are other cases where we have misspelling of the names (likely already before normalization, e.g. "notesmarias" -> Notes Marias, instead of "Notis Marias" [see wiki](https://en.wikipedia.org/wiki/Notis_Marias) )

In [25]:
# check for missing identifiers only for speeches that are not "multispeaker"
mask_missing_single_speaker = migration_matching_result["identifier"].isna() & (migration_matching_result["multispeaker"] == False)

missing_single_speaker = mask_missing_single_speaker.sum()
print("Single-Speaker speeches without identifier:", missing_single_speaker)


unmatched_migration_speakers_single_speaker = (
    migration_matching_result.loc[mask_missing_single_speaker, "speaker_strict_norm"]
          .value_counts()
)

unmatched_migration_speakers_single_speeches = (
    migration_matching_result.loc[mask_missing_single_speaker, ["speaker", "speaker_strict_norm",
                                                                                   "date", "period", "party", "multispeaker",
                                                                                   "written"
                                                                                   ]] #"speaker_loose_norm"]
)
#unmatched_migration_speakers_single_speaker.head(20)

# which speakers failed to match?
unmatched_migration_speakers_single_speakers = (unmatched_migration_speakers_single_speeches[["speaker", "speaker_strict_norm",
                                                                                              "period", "party"]]
                                .sort_values(by="period", ascending=False).drop_duplicates("speaker_strict_norm"))

display(unmatched_migration_speakers_single_speakers)
print(unmatched_migration_speakers_single_speakers.shape[0])
print(unmatched_migration_speakers_single_speakers.to_csv(index=False))

Single-Speaker speeches without identifier: 877


Unnamed: 0,speaker,speaker_strict_norm,period,party
35,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,,9,PPE
47,Ana Miranda,anamiranda,9,Greens/EFA
298,Jarosław Duda,jarosawduda,9,PPE
563,Maite Pagazaurtundúa Ruiz,maitepagazaurtunduaruiz,9,ELDR/ALDE/Renew
653,Theresa Muigg,theresamuigg,9,PSE/S&D
737,Róża Gräfin von Thun und Hohenstein,rozagrafinvonthunundhohenstein,9,ELDR/ALDE/Renew
1226,Pernille Weiss,pernilleweiss,9,PPE
1800,Yana Toom,yanatoom,9,ELDR/ALDE/Renew
2017,Anders Primdahl Vistisen,andersprimdahlvistisen,8,ECR
2496,Inmaculada Rodríguez-Piñero Fernández,inmaculadarodriguezpinerofernandez,8,PSE/S&D


27
speaker,speaker_strict_norm,period,party
Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,,9,PPE
Ana Miranda,anamiranda,9,Greens/EFA
Jarosław Duda,jarosawduda,9,PPE
Maite Pagazaurtundúa Ruiz,maitepagazaurtunduaruiz,9,ELDR/ALDE/Renew
Theresa Muigg,theresamuigg,9,PSE/S&D
Róża Gräfin von Thun und Hohenstein,rozagrafinvonthunundhohenstein,9,ELDR/ALDE/Renew
Pernille Weiss,pernilleweiss,9,PPE
Yana Toom,yanatoom,9,ELDR/ALDE/Renew
Anders Primdahl Vistisen,andersprimdahlvistisen,8,ECR
Inmaculada Rodríguez-Piñero Fernández,inmaculadarodriguezpinerofernandez,8,PSE/S&D
Valdemar Tomaševski,valdemartomasevski,8,ECR
Anna Elżbieta Fotyga,annaelzbietafotyga,8,ECR
Charles Tannock,charlestannock,8,ECR
Francesc Gambús,francescgambus,8,PPE
Agnieszka Kozłowska-Rajewicz,agnieszkakozowskarajewicz,8,PPE
Jillian Evans,jillianevans,8,Greens/EFA
Monika Flašíková Beňová,monikaflasikovabenova,8,PSE/S&D
Eva Paunova,evapaunova,8,PPE
Jakop Dalunde,jakopdalunde,8,Greens/EFA
Krystyna Maria Łybacka,krystynamariaybacka,8,PSE/S&D
Mαρία Σπ

In [26]:
#print("FOR FULL PLS")
## investigate how many migration speeches we have where multispeaker==True/False and/or written==True/False
#pls_multiT_writtenF = pls_df[(pls_df["multispeaker"] == True) & (pls_df["written"] == False)]
#print("Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == FALSE: ", pls_multiT_writtenF.shape[0])
#pls_multiF_writtenT = pls_df[(pls_df["multispeaker"] == False) & (pls_df["written"] == True)]
#print("Entries in Migration Dataset with 'multispeaker' == FALSE and 'written' == TRUE: ", pls_multiF_writtenT.shape[0])
#pls_multiT_writtenT = pls_df[(pls_df["multispeaker"] == True) & (pls_df["written"] == True)]
#print("Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == TRUE: ", pls_multiT_writtenT.shape[0])
#print("FOR MIGRATION")
# RESULTS FOR FULL PLS
# Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == FALSE:  0
# Entries in Migration Dataset with 'multispeaker' == FALSE and 'written' == TRUE:  130189
# Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == TRUE:  134

# investigate how many migration speeches we have where multispeaker==True/False and/or written==True/False
migration_multiT_writtenF = migration_df[(migration_df["multispeaker"] == True) & (migration_df["written"] == False)]
print("Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == FALSE: ", migration_multiT_writtenF.shape[0])
migration_multiF_writtenT = migration_df[(migration_df["multispeaker"] == False) & (migration_df["written"] == True)]
print("Entries in Migration Dataset with 'multispeaker' == FALSE and 'written' == TRUE: ", migration_multiF_writtenT.shape[0])
migration_multiT_writtenT = migration_df[(migration_df["multispeaker"] == True) & (migration_df["written"] == True)]
print("Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == TRUE: ", migration_multiT_writtenT.shape[0])

Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == FALSE:  0
Entries in Migration Dataset with 'multispeaker' == FALSE and 'written' == TRUE:  4246
Entries in Migration Dataset with 'multispeaker' == TRUE and 'written' == TRUE:  6


**CONSEQUENCE:**
We perform a semi-automatic revision using an external LLM (ChatGPT 5.2) to infer the correct speaker names using the misspelled names and also fetch their EP homepage to extract their corresponding EP-ID each.
- take care of the "multispeaker" speeches (i.e the 6 speeches with multispeaker == True and possibly multiple speaker names in column "speaker") by only considering the first of the two speakers (quick and dirty fix but neglectable since only 6 speeches affected)
- take care of the misspelling in column "speaker" of "migration_df" and manually/semi-auto assign the correct ID of meta_df, since these are now doable mis-alignments (in total ~ 70 speakers/mis-alignments left) by semi-automatic revision/data fetching using LLM

In [27]:
# first take care of the multispeaker speeches/rows
mask_missing_multi_speaker = migration_matching_result["identifier"].isna() & (migration_matching_result["multispeaker"] == True)

missing_multi_speaker = mask_missing_multi_speaker.sum()
print("Multi-Speaker speeches without identifier:", missing_multi_speaker)


unmatched_migration_speakers_multi_speaker = (
    migration_matching_result.loc[mask_missing_multi_speaker, "speaker_strict_norm"]
          .value_counts()
)
print("Value Counts for unmatched multispeaker speeches:")
print(unmatched_migration_speakers_multi_speaker)

unmatched_migration_speakers_multi_speeches = (
    migration_matching_result.loc[mask_missing_multi_speaker, ["speaker", "speaker_strict_norm",
                                                                                   "date", "period", "party", "multispeaker",
                                                                                   "written"
                                                                                   ]] #"speaker_loose_norm"]
)
display(unmatched_migration_speakers_multi_speeches.head(10))

# which speakers failed to match?
unmatched_migration_speakers_multi_speakers = (unmatched_migration_speakers_multi_speeches[["speaker", "speaker_strict_norm",
                                                                                              "period", "party"]]
                                .sort_values(by="period", ascending=False).drop_duplicates("speaker_strict_norm"))

display(unmatched_migration_speakers_multi_speakers)
print(unmatched_migration_speakers_multi_speakers.shape[0])
print(unmatched_migration_speakers_multi_speakers.to_csv(index=False))

Multi-Speaker speeches without identifier: 6
Value Counts for unmatched multispeaker speeches:
speaker_strict_norm
martinaandersonandlynnboylan    4
janecollinsandmikehookem        2
Name: count, dtype: int64


Unnamed: 0,speaker,speaker_strict_norm,date,period,party,multispeaker,written
3613,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,2016-12-01,8,NGL/The Left,True,True
7299,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,2015-11-26,8,NGL/The Left,True,True
7311,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,2015-11-25,8,NGL/The Left,True,True
7338,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,2015-11-25,8,NGL/The Left,True,True
7722,Jane Collins and Mike Hookem,janecollinsandmikehookem,2015-10-14,8,EDD/INDDEM/EFD,True,True
7777,Jane Collins and Mike Hookem,janecollinsandmikehookem,2015-10-14,8,EDD/INDDEM/EFD,True,True


Unnamed: 0,speaker,speaker_strict_norm,period,party
3613,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,8,NGL/The Left
7722,Jane Collins and Mike Hookem,janecollinsandmikehookem,8,EDD/INDDEM/EFD


2
speaker,speaker_strict_norm,period,party
Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,8,NGL/The Left
Jane Collins and Mike Hookem,janecollinsandmikehookem,8,EDD/INDDEM/EFD



**CONSEQUENCE:**
- We know that both irish politicians Martina Anderson [ep profile](https://www.europarl.europa.eu/meps/en/113959/MARTINA_ANDERSON/history/9) and Lynn Boylan [ep profile](https://www.europarl.europa.eu/meps/en/124984/LYNN_BOYLAN/home) were part of Sinn Féin (nationally) and member of the Left group in the European Parliament - GUE/NGL // Group of the European United Left - Nordic Green Left - Member -> therefore we treat them as one conceptually (i.e. consider these speeches as if given from "Martina Anderson").
- The same holds for british politicians Jane Collins [ep profile](https://www.europarl.europa.eu/meps/en/124955/JANE_COLLINS/history/8) and Mike Hookem [ep profile](https://www.europarl.europa.eu/meps/en/124957/MIKE_HOOKEM/history/8), we therefore treat these speeches as given by Jane Collins


In [28]:
missing_multi_speaker_mapping_df = pd.DataFrame([
    {
        "speaker": "Martina Anderson and Lynn Boylan",
        "speaker_strict_norm": "martinaandersonandlynnboylan",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Martina ANDERSON",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/113959/MARTINA_ANDERSON/history/9",
    },
    {
        "speaker": "Jane Collins and Mike Hookem",
        "speaker_strict_norm": "janecollinsandmikehookem",
        "period": 8,
        "party": "EDD/INDDEM/EFD",
        "canonical_name": "Jane COLLINS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124955/JANE_COLLINS/history/8",
    },

])

# infer the EP-IDs of the missing MEPs (from the EP-Profile weblink)
missing_multi_speaker_mapping_df["ep_id"] = (
    missing_multi_speaker_mapping_df["ep_profile_url"]
    .str.extract(r"/meps/en/(\d+)/")[0] # NOTE this only works for ep-website with "en" setting
    #.astype("Int64")
)

In [29]:
# quick sanity check: merge with EP metadata on ep_id
missing_multi_speaker_mapping_merged = missing_multi_speaker_mapping_df.merge(
    meta_cleaned_df,
    left_on="ep_id",
    right_on="identifier",
    how="left",
    suffixes=("_miss", "_meta")
)

display(missing_multi_speaker_mapping_merged[["speaker", "speaker_strict_norm", "canonical_name", "ep_id", "identifier", "label", "label_strict_norm", "name_set_norm"]])

Unnamed: 0,speaker,speaker_strict_norm,canonical_name,ep_id,identifier,label,label_strict_norm,name_set_norm
0,Martina Anderson and Lynn Boylan,martinaandersonandlynnboylan,Martina ANDERSON,113959,113959,Martina ANDERSON,martinaanderson,"[martina, martina anderson, anderson martina, ..."
1,Jane Collins and Mike Hookem,janecollinsandmikehookem,Jane COLLINS,124955,124955,Jane COLLINS,janecollins,"[jane collins, collins, jane]"


In [30]:
# now use LLM (chatGPT) to infer the correct names and EP ids of the misspelled/missing MEPs (and manually check the links)
mapping_df_01 = pd.DataFrame([
    {
        "speaker": "Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη",
        "speaker_strict_norm": "elissabetbozempergkbruonide",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Elissavet VOZEMBERG-VRIONIDI",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125065/ELISSAVET_VOZEMBERG-VRIONIDI/home",
    },
    {
        "speaker": "Ana Miranda",
        "speaker_strict_norm": "anamiranda",
        "period": 9,
        "party": "Greens/EFA",
        "canonical_name": "Ana MIRANDA PAZ",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/24942/ANA_MIRANDA/home",
    },
    {
        "speaker": "Андрей Ковачев",
        "speaker_strict_norm": "andreikovachev",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Andrey KOVATCHEV",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/97968/ANDREY_KOVATCHEV/home",
    },
    {
        "speaker": "Иво Христов",
        "speaker_strict_norm": "ivokhristov",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Ivo HRISTOV",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197846/IVO_HRISTOV/history/9",
    },
    {
        "speaker": "Δημήτριος Παπαδημούλης",
        "speaker_strict_norm": "demetriospapademoules",
        "period": 9,
        "party": "NGL/The Left",
        "canonical_name": "Dimitrios PAPADIMOULIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/28586/DIMITRIOS_PAPADIMOULIS/home",
    },
    {
        "speaker": "Γεώργιος Κύρτσος",
        "speaker_strict_norm": "georgioskurtsos",
        "period": 9,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Georgios KYRTSOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125063/GEORGIOS_KYRTSOS/home",
    },
    {
        "speaker": "Λουκάς Φουρλάς",
        "speaker_strict_norm": "loukasphourlas",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Loucas FOURLAS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197414/LOUCAS_FOURLAS/home",
    },
    {
        "speaker": "Κώστας Μαυρίδης",
        "speaker_strict_norm": "kostasmaurides",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Costas MAVRIDES",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124691/COSTAS_MAVRIDES/home",
    },
    {
        "speaker": "Νιαζί Κιζιλγιουρέκ",
        "speaker_strict_norm": "niazikizilgiourek",
        "period": 9,
        "party": "NGL/The Left",
        "canonical_name": "Niyazi KIZILYUREK",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197415/NIYAZI_KIZILYUREK/home",
    },
    {
        "speaker": "Jarosław Duda",
        "speaker_strict_norm": "jaroslawduda",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Jarosław DUDA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197510/JAROSLAW_DUDA/home",
    },
])

#display(mapping_df_01)

mapping_df_02 = pd.DataFrame([
    {
        "speaker": "Ελένη Σταύρου",
        "speaker_strict_norm": "elenestaurou",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Eleni STAVROU",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/239271/ELENI_STAVROU/home",
    },
    {
        "speaker": "Άννα-Μισέλ Ασημακοπούλου",
        "speaker_strict_norm": "annamiselasemakopoulou",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Anna-Michelle ASIMAKOPOULOU",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197695/ANNA-MICHELLE_ASIMAKOPOULOU/home",
    },
    {
        "speaker": "Κωνσταντίνος Αρβανίτης",
        "speaker_strict_norm": "konstantinosarbanites",
        "period": 9,
        "party": "NGL/The Left",
        "canonical_name": "Konstantinos ARVANITIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197701/KONSTANTINOS_ARVANITIS/home",
    },
    {
        "speaker": "Maite Pagazaurtundúa Ruiz",
        "speaker_strict_norm": "maitepagazaurtunduaruiz",
        "period": 9,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Maite PAGAZAURTUNDÚA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125038/MAITE_PAGAZAURTUNDUA/home",
    },
    {
        "speaker": "Στέλιος Κυμπουρόπουλος",
        "speaker_strict_norm": "stelioskumpouropoulos",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Stelios KYMPOUROPOULOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197692/STELIOS_KYMPOUROPOULOS/home",
    },
    {
        "speaker": "Theresa Muigg",
        "speaker_strict_norm": "theresamuigg",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Theresa BIELOWSKI",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/238674/THERESA_BIELOWSKI/home",
    },
    {
        "speaker": "Βαγγέλης Μεϊμαράκης",
        "speaker_strict_norm": "baggelesmeimarakes",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Vangelis MEIMARAKIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197662/EVANGELOS_MEIMARAKIS/home",
    },
    {
        "speaker": "Róża Gräfin von Thun und Hohenstein",
        "speaker_strict_norm": "rozagrafinvonthunundhohenstein",
        "period": 9,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Róża THUN UND HOHENSTEIN",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/96776/ROZA_THUN%2BUND%2BHOHENSTEIN/home",
    },
    {
        "speaker": "Елена Йончева",
        "speaker_strict_norm": "elenaioncheva",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Elena YONCHEVA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197842/ELENA_YONCHEVA/home",
    },
    {
        "speaker": "Илхан Кючюк",
        "speaker_strict_norm": "ilkhankiuchiuk",
        "period": 9,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Ilhan KYUCHYUK",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124866/ILHAN_KYUCHYUK/home",
    },
])

#display(mapping_df_02)

mapping_df_03 = pd.DataFrame([
    {
        "speaker": "Петър Витанов",
        "speaker_strict_norm": "petrvitanov",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Petar VITANOV",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197844/PETAR_VITANOV/history/9",
    },
    {
        "speaker": "Pernille Weiss",
        "speaker_strict_norm": "pernilleweiss",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Pernille WEISS-EHLER",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/197515/PERNILLE_WEISS/history/9",
    },
    {
        "speaker": "Εμμανουήλ Φράγκος",
        "speaker_strict_norm": "emmanouelphragkos",
        "period": 9,
        "party": "ECR",
        "canonical_name": "Emmanouil FRAGKOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/198490/EMMANOUIL_FRAGKOS/home",
    },
    {
        "speaker": "Ева Майдел",
        "speaker_strict_norm": "evamaidel",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Eva MAYDELL",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/98341/EVA_MAYDELL/home",
    },
    {
        "speaker": "Δημήτρης Παπαδάκης",
        "speaker_strict_norm": "demetrespapadakes",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Dimitrios PAPADAKIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124692/DEMETRIS_PAPADAKIS/history/9",
    },
    {
        "speaker": "Λευτέρης Χριστοφόρου",
        "speaker_strict_norm": "leutereskhristophorou",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Lefteris CHRISTOFOROU",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/26837/LEFTERIS_CHRISTOFOROU/history/9",
    },
    {
        "speaker": "Νίκος Ανδρουλάκης",
        "speaker_strict_norm": "nikosandroulakes",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Nikos ANDROULAKIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125110/NIKOS_ANDROULAKIS/history/9",
    },
    {
        "speaker": "Μαρία Σπυράκη",
        "speaker_strict_norm": "mariaspurake",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Maria SPYRAKI",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125064/MARIA_SPYRAKI/history/9",
    },
    {
        "speaker": "Εύα Καϊλή",
        "speaker_strict_norm": "euakaile",
        "period": 9,
        "party": "PSE/S&D",
        "canonical_name": "Eva KAILI",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125109/EVA_KAILI/history/9",
    },
    {
        "speaker": "Yana Toom",
        "speaker_strict_norm": "yanatoom",
        "period": 9,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Jana TOOM",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124700/YANA_TOOM/home",
    },
])

#display(mapping_df_03)

mapping_df_04 = pd.DataFrame([
    {
        "speaker": "Θεόδωρος Ζαγοράκης",
        "speaker_strict_norm": "theodoroszagorakes",
        "period": 9,
        "party": "PPE",
        "canonical_name": "Theodoros ZAGORAKIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125067/THEODOROS_ZAGORAKIS/history/9",
    },
    {
        "speaker": "Anders Primdahl Vistisen",
        "speaker_strict_norm": "andersprimdahlvistisen",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Anders VISTISEN",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124875/ANDERS_VISTISEN/home#detailedcardmep",
    },
    {
        "speaker": "Νότης Μαριάς",
        "speaker_strict_norm": "notesmarias",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Notis MARIAS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125069/NOTIS_MARIAS/history/8",
    },
    {
        "speaker": "Νικόλαος Χουντής",
        "speaker_strict_norm": "nikolaoskhountes",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Nikolaos CHOUNTIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/26851/NIKOLAOS_CHOUNTIS/history/8",
    },
    {
        "speaker": "Κώστας Χρυσόγονος",
        "speaker_strict_norm": "kostaskhrusogonos",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Kostas CHRYSOGONOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125061/KOSTAS_CHRYSOGONOS/history/8",
    },
    {
        "speaker": "Κωνσταντίνα Κούνεβα",
        "speaker_strict_norm": "konstantinakouneba",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Kostadinka KUNEVA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125092/KOSTADINKA_KUNEVA/history/8",
    },
    {
        "speaker": "Ελένη Θεοχάρους",
        "speaker_strict_norm": "elenetheokharous",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Eleni THEOCHAROUS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/25704/ELENI_THEOCHAROUS/history/8",
    },
    {
        "speaker": "Νεοκλής Συλικιώτης",
        "speaker_strict_norm": "neoklessulikiotes",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Neoklis SYLIKIOTIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124689/NEOKLIS_SYLIKIOTIS/history/8",
    },
    {
        "speaker": "Inmaculada Rodríguez-Piñero Fernández",
        "speaker_strict_norm": "inmaculadarodriguezpinerofernandez",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Inma RODRÍGUEZ-PIÑERO",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125043/INMA_RODRIGUEZ-PINERO/history/9",
    },
    {
        "speaker": "Филиз Хюсменова",
        "speaker_strict_norm": "filizkhiusmenova",
        "period": 8,
        "party": "ELDR/ALDE/Renew",
        "canonical_name": "Filiz HYUSMENOVA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/34249/FILIZ_HYUSMENOVA/history/8",
    },
])

#display(mapping_df_04)

mapping_df_05 = pd.DataFrame([
    {
        "speaker": "Σοφία Σακοράφα",
        "speaker_strict_norm": "sophiasakorapha",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Sofia SAKORAFA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125091/SOFIA_SAKORAFA/history/8",
    },
    {
        "speaker": "Τάκης Χατζηγεωργίου",
        "speaker_strict_norm": "takeskhatzegeorgiou",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Takis HADJIGEORGIOU",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/96907/TAKIS_HADJIGEORGIOU/history/8",
    },
    {
        "speaker": "Μιλτιάδης Κύρκος",
        "speaker_strict_norm": "miltiadeskurkos",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Miltiadis KYRKOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125113/MILTIADIS_KYRKOS/history/8",
    },
    {
        "speaker": "Valdemar Tomaševski",
        "speaker_strict_norm": "valdemartomasevski",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Waldemar TOMASZEWSKI",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/96697/VALDEMAR_TOMASEVSKI/home",
    },
    {
        "speaker": "Anna Elżbieta Fotyga",
        "speaker_strict_norm": "annaelzbietafotyga",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Anna FOTYGA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/28353/ANNA_FOTYGA/history/9",
    },
    {
        "speaker": "Charles Tannock",
        "speaker_strict_norm": "charlestannock",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Timothy Charles Ayrton TANNOCK",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/4521/TIMOTHY+CHARLES+AYRTON_TANNOCK/history/8",
    },
    {
        "speaker": "Мария Габриел",
        "speaker_strict_norm": "mariiagabriel",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Mariya GABRIEL",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/96848/MARIYA_GABRIEL/history/8",
    },
    {
        "speaker": "Francesc Gambús",
        "speaker_strict_norm": "francescgambus",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Francisco de Paula GAMBUS MILLET",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125006/FRANCISCO+DE+PAULA_GAMBUS+MILLET/history/8",
    },
    {
        "speaker": "Agnieszka Kozłowska-Rajewicz",
        "speaker_strict_norm": "agnieszkakozlowskarajewicz",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Agnieszka KOZŁOWSKA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124889/AGNIESZKA_KOZLOWSKA/history/8",
    },
    {
        "speaker": "Jillian Evans",
        "speaker_strict_norm": "jillianevans",
        "period": 8,
        "party": "Greens/EFA",
        "canonical_name": "Jill EVANS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/4550/JILL_EVANS/history/9",
    },
])

#display(mapping_df_05)

mapping_df_06 = pd.DataFrame([
    {
        "speaker": "Monika Flašíková Beňová",
        "speaker_strict_norm": "monikaflasikovabenova",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Monika BEŇOVÁ",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/23868/MONIKA_BENOVA/home",
    },
    {
        "speaker": "Светослав Христов Малинов",
        "speaker_strict_norm": "svetoslavkhristovmalinov",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Svetoslav Hristov MALINOV",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/111027/SVETOSLAV+HRISTOV_MALINOV/history/8",
    },
    {
        # is this MEP truly missing/non-existent?
        # NOTE: No, Eva Paunova is called Eva Maydell (born Paunova)
        "speaker": "Eva Paunova",
        "speaker_strict_norm": "evapaunova",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Eva MAYDELL",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/98341/EVA_PAUNOVA/home",
    },
    {
        "speaker": "Николай Бареков",
        "speaker_strict_norm": "nikolaibarekov",
        "period": 8,
        "party": "ECR",
        "canonical_name": "Nikolay BAREKOV",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124871/NIKOLAY_BAREKOV/history/8",
    },
    {
        "speaker": "Jakop Dalunde",
        "speaker_strict_norm": "jakopdalunde",
        "period": 8,
        "party": "Greens/EFA",
        "canonical_name": "Jakop G. DALUNDE",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/183338/JAKOP+G._DALUNDE/history/9",
    },
    {
        "speaker": "Krystyna Maria Łybacka",
        "speaker_strict_norm": "krystynamarialybacka",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Krystyna ŁYBACKA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124883/KRYSTYNA_LYBACKA/history/8",
    },
    {
        "speaker": "Илияна Йотова",
        "speaker_strict_norm": "iliianaiotova",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Iliana IOTOVA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/38605/ILIANA_IOTOVA/history/8",
    },
    {
        "speaker": "Μανώλης Κεφαλογιάννης",
        "speaker_strict_norm": "manoleskephalogiannes",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Emmanouil KEFALOGIANNIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125068/MANOLIS_KEFALOGIANNIS/home",
    },
    {
        "speaker": "Γιώργος Γραμματικάκης",
        "speaker_strict_norm": "giorgosgrammatikakes",
        "period": 8,
        "party": "PSE/S&D",
        "canonical_name": "Giorgos GRAMMATIKAKIS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125112/GIORGOS_GRAMMATIKAKIS/history/8",
    },
    {
        "speaker": "Etelios Kouloglou",
        "speaker_strict_norm": "etelioskouloglou",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Stelios KOULOGLOU",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/130833/STELIOS_KOULOGLOU/history/9",
    },
])

#display(mapping_df_06)

mapping_df_07 = pd.DataFrame([
    {
        "speaker": "Bodil Ceballos",
        "speaker_strict_norm": "bodilceballos",
        "period": 8,
        "party": "Greens/EFA",
        "canonical_name": "Bodil VALERO",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124993/BODIL_VALERO/history/8",
    },
    {
        "speaker": "Εμμανουήλ Γλέζος",
        "speaker_strict_norm": "emmanouelglezos",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Emmanouil GLEZOS",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/1654/EMMANOUIL_GLEZOS/history/8",
    },
    {
        "speaker": "Jordi Sebastià",
        "speaker_strict_norm": "jordisebastia",
        "period": 8,
        "party": "Greens/EFA",
        "canonical_name": "Jordi Vicent SEBASTIA TALAVERA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125053/JORDI+VICENT_SEBASTIA+TALAVERA/history/8",
    },
    {
        "speaker": "Comodini Cachia",
        "speaker_strict_norm": "comodinicachia",
        "period": 8,
        "party": "PPE",
        "canonical_name": "Therese COMODINI CACHIA",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/124968/THERESE_COMODINI+CACHIA/history/8",
    },
    {
        "speaker": "Iosu Juaristi Abaunz",
        "speaker_strict_norm": "iosujuaristiabaunz",
        "period": 8,
        "party": "NGL/The Left",
        "canonical_name": "Josu JUARISTI ABAUNZ",
        "ep_profile_url": "https://www.europarl.europa.eu/meps/en/125051/JOSU_JUARISTI+ABAUNZ/history/8",
    },
])

#display(mapping_df_07)

missing_mapping_df_all = pd.concat(
    [
        mapping_df_01,
        mapping_df_02,
        mapping_df_03,
        mapping_df_04,
        mapping_df_05,
        mapping_df_06,
        mapping_df_07,
    ],
    ignore_index=True,
)

# infer the EP-IDs of the missing MEPs (from the EP-Profile weblink)
missing_mapping_df_all["ep_id"] = (
    missing_mapping_df_all["ep_profile_url"]
    .str.extract(r"/meps/en/(\d+)/")[0] # NOTE this only works for ep-website with "en" setting
    #.astype("Int64")
)

# quick sanity check: merge with EP metadata on ep_id
missing_mapping_merged = missing_mapping_df_all.merge(
    meta_cleaned_df,
    left_on="ep_id",
    right_on="identifier",
    how="left",
    suffixes=("_miss", "_meta")
)

display(missing_mapping_merged[["speaker", "speaker_strict_norm", "canonical_name", "ep_id", "identifier", "label", "label_strict_norm", "name_set_norm"]])

Unnamed: 0,speaker,speaker_strict_norm,canonical_name,ep_id,identifier,label,label_strict_norm,name_set_norm
0,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,elissabetbozempergkbruonide,Elissavet VOZEMBERG-VRIONIDI,125065,125065,Elissavet VOZEMBERG-VRIONIDI,elissavetvozembergvrionidi,"[elissavet vozemberg-vrionidi, ελισσαβετ, voze..."
1,Ana Miranda,anamiranda,Ana MIRANDA PAZ,24942,24942,Ana MIRANDA PAZ,anamirandapaz,"[miranda paz, ana, miranda ana, ana miranda paz]"
2,Андрей Ковачев,andreikovachev,Andrey KOVATCHEV,97968,97968,Andrey KOVATCHEV,andreykovatchev,"[andrey, andrey kovatchev, андрей, ковачев, ko..."
3,Иво Христов,ivokhristov,Ivo HRISTOV,197846,197846,Ivo HRISTOV,ivohristov,"[ivo hristov, христов, hristov, иво, ivo]"
4,Δημήτριος Παπαδημούλης,demetriospapademoules,Dimitrios PAPADIMOULIS,28586,28586,Dimitrios PAPADIMOULIS,dimitriospapadimoulis,"[παπαδημούλησ, dimitrios, παπαδημουλησ, δημήτρ..."
5,Γεώργιος Κύρτσος,georgioskurtsos,Georgios KYRTSOS,125063,125063,Georgios KYRTSOS,georgioskyrtsos,"[γεωργιοσ, γεώργιοσ, kyrtsos, κύρτσοσ, georgio..."
6,Λουκάς Φουρλάς,loukasphourlas,Loucas FOURLAS,197414,197414,Loucas FOURLAS,loucasfourlas,"[λουκάσ, loucas fourlas, φουρλάσ, fourlas, λου..."
7,Κώστας Μαυρίδης,kostasmaurides,Costas MAVRIDES,124691,124691,Costas MAVRIDES,costasmavrides,"[mavrides, costas mavrides, costas, μαυρίδησ, ..."
8,Νιαζί Κιζιλγιουρέκ,niazikizilgiourek,Niyazi KIZILYUREK,197415,197415,Niyazi KIZILYÜREK,niyazikizilyurek,"[niyazi, κιζιλγιουρέκ, kizilyürek, niyazi kizi..."
9,Jarosław Duda,jaroslawduda,Jarosław DUDA,197510,197510,Jarosław DUDA-LATOSZEWSKI,jarosawdudalatoszewski,"[dudaj, jarosław duda-latoszewski, duda-latosz..."


**NOTE:**
Apparently Eva Paunova (initially no match found in EP metadata) is called Eva Maydell (born Paunova) (see [here](https://www.eppgroup.eu/who-we-are/our-members/eva-maydell) for more info).


In [31]:
# this seems to have worked, now work in these changes and re-try the mapping of pls/migration

# fetch data fresh
#pls_df = original_pls_df.copy(deep=True)
migration_df = migration_df.copy(deep=True)

# this we already cleaned (so use it further)
meta_cleaned_df = meta_cleaned_df

# prepare the merge for pls/migration dataset, i.e.
# normalize the speaker names (strictly)
#pls_df["speaker_strict_norm"] = pls_df["speaker"].apply(normalize_name_strict)
migration_df["speaker_strict_norm"] = migration_df["speaker"].apply(normalize_name_strict)

# extract mapping from old to corrected name_strict_norm for both multispeaker's and misspelled MEP's speeches
missing_maps_multi = missing_multi_speaker_mapping_merged[["speaker_strict_norm", "label_strict_norm"]]
missing_maps_misspelled = missing_mapping_merged[["speaker_strict_norm", "label_strict_norm"]]

# build mapping: old_strict_norm -> corrected_strict_norm
corr_map_multi = (
    missing_maps_multi
    .dropna(subset=["speaker_strict_norm", "label_strict_norm"])
    .drop_duplicates(subset=["speaker_strict_norm"])  # ensure 1-to-1
    .set_index("speaker_strict_norm")["label_strict_norm"]
)

# keep track of how many rows changed (1)
before = migration_df["speaker_strict_norm"].copy()

# replace only where a correction exists
migration_df["speaker_strict_norm"] = migration_df["speaker_strict_norm"].replace(corr_map_multi)

# how many rows changed?
n_changed_multi = (before != migration_df["speaker_strict_norm"]).sum()
print("Rows updated after correcting multispeaker speeches:", n_changed_multi)

# now do the same for misspelled MEP's speeches
# build mapping: old_strict_norm -> corrected_strict_norm
corr_map_misspelled = (
    missing_maps_misspelled
    .dropna(subset=["speaker_strict_norm", "label_strict_norm"])
    .drop_duplicates(subset=["speaker_strict_norm"])  # ensure 1-to-1
    .set_index("speaker_strict_norm")["label_strict_norm"]
)

# keep track of how many rows changed (2)
before = migration_df["speaker_strict_norm"].copy()

# replace only where a correction exists
migration_df["speaker_strict_norm"] = migration_df["speaker_strict_norm"].replace(corr_map_misspelled)

# how many rows changed?
n_changed_misspelled = (before != migration_df["speaker_strict_norm"]).sum()
print("Rows updated after correcting misspelled MEP's speeches:", n_changed_misspelled)

Rows updated after correcting multispeaker speeches: 721
Rows updated after correcting misspelled MEP's speeches: 861


Wonderful, now lets re-try the merge as before 

In [32]:
# merge the pls/migration dataset on the (loose/) strict normalized speaker name
migration_matching_result = migration_df.merge(
    meta_cleaned_df,
    left_on="speaker_strict_norm",
    right_on="label_strict_norm",
    how="left"
)

#pls_matching_result = pls_df.merge(
#    meta_cleaned_df,
#    left_on="speaker_strict_norm",
#    right_on="label_strict_norm",
#    how="left"
#)

And re-try the merge diagnostics (i.e. how many unmatched speeches)

In [33]:
# investigate the merge for migration dataset
# how many speeches did not get an identifier?
missing_ids_migration = migration_matching_result["identifier"].isna().sum()
print("Migration speeches without identifier:", missing_ids_migration)

# which speakers failed to match?
unmatched_migration_speeches = (
    migration_matching_result.loc[migration_matching_result["identifier"].isna(), ["speaker", "speaker_strict_norm",
                                                                                   "date", "period", "party", "multispeaker",
                                                                                   "written"
                                                                                   ]] #"speaker_loose_norm"]

)
#display(unmatched_migration_speeches.head())
unmatched_migration_speakers = (unmatched_migration_speeches[["speaker", "speaker_strict_norm"]]
                                .drop_duplicates("speaker_strict_norm"))
display(unmatched_migration_speakers)
print(unmatched_migration_speakers.shape[0])

# how many speeches per misspelled MEP are affected?
unmatched_migration_speaker_count = (
    migration_matching_result.loc[migration_matching_result["identifier"].isna(), "speaker_strict_norm"] #"speaker_loose_norm"]
    .value_counts()
)
display(unmatched_migration_speaker_count)

Migration speeches without identifier: 731


Unnamed: 0,speaker,speaker_strict_norm
35,Ελισσάβετ Βόζεμπεργκ-Βρυωνίδη,
298,Jarosław Duda,jarosawduda
3357,Agnieszka Kozłowska-Rajewicz,agnieszkakozowskarajewicz
5204,Krystyna Maria Łybacka,krystynamariaybacka
8326,Mαρία Σπυράκη,m
8555,Nότης Μαριάς,n


6


speaker_strict_norm
agnieszkakozowskarajewicz    8
krystynamariaybacka          5
jarosawduda                  1
m                            1
n                            1
Name: count, dtype: int64

Wonderful, now lets tidy up and store the enriched migration dataset for later usage

In [34]:
display(migration_matching_result.head(3))

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,speaker_loose_norm,speaker_strict_norm,identifier,label,label_strict_norm,name_set_norm,citizenship,placeOfBirth,bday,deathDate,gender,hasMembership,label_loose_norm
0,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282,karen melchior,karenmelchior,197567,Karen MELCHIOR,karenmelchior,"[melchior, karen, karen melchior, melchiork]",DNK,Gentofte,1980-10-15,,FEMALE,"[{'id': 'membership/197567-f-161757', 'type': ...",karen melchior
1,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501,tineke strik,tinekestrik,197772,Tineke STRIK,tinekestrik,"[strik, tineke strik, tineke]",NLD,Appeltern,1961-09-28,,FEMALE,"[{'id': 'membership/197772-f-161044', 'type': ...",tineke strik
2,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925,anabela rodrigues,anabelarodrigues,254718,Anabela RODRIGUES,anabelarodrigues,"[rodriguesa, rodrigues, anabela rodrigues, ana...",PRT,Lisboa,1976-10-18,,FEMALE,"[{'id': 'membership/254718-f-169354', 'type': ...",anabela rodrigues


In [35]:
# get rid of unneeded/temp columns again
migration_matching_result_final = migration_matching_result.drop(['speaker_loose_norm', 'speaker_strict_norm',
                                                                  'name_set_norm', 'label_strict_norm', 'label_loose_norm'], axis=1)

# rename the meta columns with prefix ep_
migration_matching_result_final = migration_matching_result_final.rename(columns={
    "identifier": "ep_identifier",
    "label": "ep_label",
    "citizenship" : "ep_citizenship",
    "placeOfBirth" : "ep_placeOfBirth",
    "bday" : "ep_bday",
    "deathDate" : "ep_deathDate",
    "gender" : "ep_gender",
    "hasMembership" : "ep_hasMembership",
})

display(migration_matching_result_final.head(5))

# save as parquet file
migration_matching_result_final.to_parquet(path='data/intermed/ches/fin_migration_enriched_epID.parquet')

Unnamed: 0.1,Unnamed: 0,speaker,text,date,agenda,speechnumber,procedure_ID,partyfacts_ID,period,chair,MEP,commission,written,multispeaker,link,translatedText,translationSource,year,block,party,migration_prob,ep_identifier,ep_label,ep_citizenship,ep_placeOfBirth,ep_bday,ep_deathDate,ep_gender,ep_hasMembership
0,492,Karen Melchior,"Mr President, thank you very much to the Commi...",2024-04-24,22. Advance passenger information: enhancing a...,8,bill_26075_ID bill_26076_ID bill_26075_ID bi...,6401.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, thank you very much to the Commi...",original_gm,2024,liberal,ELDR/ALDE/Renew,0.393282,197567,Karen MELCHIOR,DNK,Gentofte,1980-10-15,,FEMALE,"[{'id': 'membership/197567-f-161757', 'type': ..."
1,844,Tineke Strik,"Mr President, Commissioner. People who flee wa...",2024-04-23,17. EU-Egypt strategic and comprehensive partn...,9,,6403.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Mr President, Commissioner. People who flee wa...",original_gm,2024,green,Greens/EFA,0.423501,197772,Tineke STRIK,NLD,Appeltern,1961-09-28,,FEMALE,"[{'id': 'membership/197772-f-161044', 'type': ..."
2,1050,Anabela Rodrigues,"Senhora Presidente, em 2024, o tráfico de sere...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,10,bill_241_ID bill_241_ID,6402.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, in 2024, human trafficking co...",machine_gm,2024,left,NGL/The Left,0.375925,254718,Anabela RODRIGUES,PRT,Lisboa,1976-10-18,,FEMALE,"[{'id': 'membership/254718-f-169354', 'type': ..."
3,1054,Beata Kempa,Pani Przewodnicząca! Pani Komisarz! Potrzebuje...,2024-04-22,20. Amending Directive 2011/36/EU on preventin...,14,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam Commissioner! We need very specific, ver...",machine_gm,2024,(extreme)_right,ECR,0.256493,197519,Beata KEMPA,POL,Syców,1966-02-11,,FEMALE,"[{'id': 'membership/197519-f-164917', 'type': ..."
4,1056,Dorien Rookmaker,"Voorzitter, dank u wel, meneer Engerer, want u...",2024-04-22,20. Amending Directive 2011/36/EU on preventin...,16,bill_241_ID bill_241_ID,6400.0,9,False,True,False,False,False,https://www.europarl.europa.eu/doceo/document/...,"Madam President, thank you, Mr Engerer, becaus...",machine_gm,2024,(extreme)_right,ECR,0.475315,204733,Dorien ROOKMAKER,NLD,Zijpe,1964-07-30,,FEMALE,"[{'id': 'membership/204733-f-169018', 'type': ..."


In [36]:
# check for nans in column identifier
check_nan = migration_matching_result_final['ep_identifier'].isnull().values.any()

# printing the result
print(check_nan) # should be false

True


In [37]:
#
#print(migration_matching_result_final["ep_placeOfBirth"].apply(type).unique())
#migration_matching_result_final["ep_placeOfBirth"] = migration_matching_result_final["placeOfBirth"].apply(str)
#migration_matching_result_final["ep_placeOfBirth"].apply(lambda x: x.encode('utf-8'))