In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from IPython.display import clear_output
from lxml import html
from datetime import datetime
import time
from tqdm import tqdm
import numpy as np

In [None]:
# Retrieve the XML directory of MEPs from the EP website
id_url = "https://www.europarl.europa.eu/meps/en/directory/xml?letter=&leg="

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(id_url, timeout=30)
# Fail here on an HTTP error instead of saving an error page as mep_ids.xml,
# which would only surface later as a confusing XML parse error.
response.raise_for_status()

with open('mep_ids.xml', 'wb') as file:
    file.write(response.content)

In [None]:
# Parse the mep_ids.xml file written by the download cell and keep its root
# element; the next cell iterates over `root` to collect names and ids.
tree = ET.parse('mep_ids.xml')
root = tree.getroot()

In [None]:
# Collect every <fullName> and <id> text from the directory XML, in document order.
# Comprehensions keep the loop variable local, so the builtin `id` is no longer
# shadowed at notebook scope.
name_l = [node.text for node in root.iter('fullName')]
id_l = [node.text for node in root.iter('id')]

# Names and ids are paired purely by position; zip() would silently drop the
# surplus entries (and mis-pair the rest) if the counts ever differed.
assert len(name_l) == len(id_l), (
    f"directory XML has {len(name_l)} fullName elements but {len(id_l)} id elements"
)

# One [full_name, mep_id] pair per MEP
id_list = [list(pair) for pair in zip(name_l, id_l)]

In [None]:
# Crawl each MEP's history page on the EP website, driven by the XML directory
base_url = 'https://www.europarl.europa.eu/meps/en/'

# Fixed column list: every expected column exists in `df` (filled with NaN
# where a page yielded nothing), in a stable order.
columns = ['mep_id', 'full_name', 'political_group', 'national_party',
           'nationality', 'date_of_birth']

# Rows are collected as dicts and turned into a DataFrame once at the end;
# concatenating inside the loop would re-copy the whole frame on every MEP.
records = []

# A single session reuses the HTTP connection across the per-MEP requests.
with requests.Session() as session:
    for full_name, mep_id in tqdm(id_list):
        url = base_url + mep_id + '/a/history/'
        r = session.get(url, timeout=30)
        # Stop on HTTP errors rather than recording a row scraped from an error page.
        r.raise_for_status()
        page = html.fromstring(r.content)

        political_group = page.xpath('//*[@id="status"]/div/div[1]/ul/li[1]/text()')
        national_party = page.xpath('//*[@id="status"]/div/div[2]/ul/li[1]/text()')
        nationality_nodes = page.xpath("/html/body/main/div/div/div[2]/div/div/div[2]")
        date_of_birth = [x.strip() for x in page.xpath('//*[@class="sln-birth-date"]/text()')]

        # The node may be absent or have no direct text; treat both as "no nationality".
        nationality = ''
        if nationality_nodes:
            nationality = (nationality_nodes[0].text or '').strip()

        # Only fields that were actually found are set; the rest become NaN.
        # xpath() returns a list of text nodes, so the first one is stored as the scalar value.
        record = {'mep_id': mep_id, 'full_name': full_name}
        if political_group:
            record['political_group'] = political_group[0]
        if national_party:
            record['national_party'] = national_party[0]
        if nationality:
            record['nationality'] = nationality
        if date_of_birth:
            record['date_of_birth'] = date_of_birth[0]

        records.append(record)

df = pd.DataFrame(records, columns=columns)

df

In [None]:
# Sanity check on the scraped data: count rows per nationality, keeping
# missing values in the tally (dropna=False).
df["nationality"].value_counts(dropna=False)

In [None]:
# Persist the raw crawl result so the cleaning steps can be re-run without re-crawling.
df.to_csv("mep_raw.csv")

In [None]:
# Optional: uncomment to reload the saved crawl from mep_raw.csv instead of re-running it.
# df = pd.read_csv("mep_raw.csv", index_col=0)

In [None]:
# Data cleaning
def clean_political_group(text):
    """Extract the political group name from a raw status string.

    Keeps what follows the first ': ' and then drops everything from the
    first ' -' onwards.

    Raises:
        IndexError: if `text` contains no ': ' separator.
    """
    # NOTE(review): the cut is at the FIRST ' -', so a group name that itself
    # contains ' -' would be truncated — confirm against the scraped data.
    after_label = text.split(': ', 1)[1]
    return after_label.split(' -', 1)[0]

def get_nationality(text):
    """Return the text between the first '(' and the next ')' in `text`.

    If no ')' follows, everything after the first '(' is returned.

    Raises:
        IndexError: if `text` contains no '('.
    """
    # NOTE(review): only the first parenthesised part is used; a string with
    # several parenthesised parts yields the first one — confirm that is intended.
    after_open = text.split('(', 1)[1]
    return after_open.split(')', 1)[0]

def clean_national_party(text):
    """Extract the party name from a raw national-party string.

    Keeps what follows the first ': ' and then drops everything from the
    first ' (' onwards.

    Raises:
        IndexError: if `text` contains no ': ' separator.
    """
    after_label = text.split(': ', 1)[1]
    return after_label.split(' (', 1)[0]

In [None]:
# Keep only rows that have a political group and whose national-party text
# contains an opening parenthesis (the part get_nationality() reads from).
has_group = df["political_group"].notna()
# Plain-substring match replaces the "\(" regex, whose backslash is an invalid
# escape in a normal string literal. na=False treats a missing national_party
# as "no match" instead of producing a NaN mask that cannot be used for indexing.
has_parenthesis = df["national_party"].str.contains("(", regex=False, na=False)

# Copy AFTER filtering so the column assignments below write to an independent
# frame rather than to a filtered slice of `df`.
clean_data = df[has_group & has_parenthesis].copy()

clean_data['political_group'] = clean_data['political_group'].apply(clean_political_group)

# Fill in nationality from the national-party text only where the scrape found none.
# This must run before national_party is cleaned, because cleaning removes the
# parenthesised part.
mask = clean_data["nationality"].isnull()

clean_data.loc[mask, "nationality"] = clean_data.loc[mask, "national_party"].apply(get_nationality)

clean_data['national_party'] = clean_data['national_party'].apply(clean_national_party)

clean_data

In [None]:
# Re-check the nationality distribution after cleaning; dropna=False keeps any
# remaining missing values visible in the counts.
clean_data["nationality"].value_counts(dropna=False)

In [None]:
# Write the cleaned table to the dataset folder one level above the working directory.
# NOTE(review): assumes ../dataset already exists — confirm, or create it beforehand.
clean_data.to_csv("../dataset/mep_clean.csv")