In [1]:
import requests

# xml libraries
import xml.etree.ElementTree as ET
import elementpath as ep

import pandas as pd

from tqdm.auto import tqdm

In [36]:
def get_mep_party():
    """Download the full MEP list as XML and return the parsed root element.

    Returns:
        The root ``xml.etree.ElementTree.Element`` parsed from the response
        body when the server answers with HTTP 200; otherwise ``None`` (the
        status code is printed).
    """
    full_list_url = "https://www.europarl.europa.eu/meps/en/full-list/xml"
    reply = requests.get(full_list_url)

    # Guard clause: any status other than 200 is reported and yields no data.
    if reply.status_code != 200:
        print("Failed to fetch data. Status code:", reply.status_code)
        return None

    return ET.fromstring(reply.content)


def get_mep(term):
    """Request the list of members of parliament for one parliamentary term.

    Args:
        term: Value inserted into the ``parliamentary-term`` query parameter.

    Returns:
        The ``requests`` response object on HTTP 200, otherwise ``None``
        (the status code is printed).
    """
    # URL-encoded form of "application/ld+json"
    response_format = "application%2Fld%2Bjson"
    url = (
        "https://data.europarl.europa.eu/api/v2/"
        f"meps?parliamentary-term={term}&format={response_format}"
    )

    reply = requests.get(url)
    if reply.status_code != 200:
        print("Failed to fetch data. Status code:", reply.status_code)
        return None
    return reply


def get_documents(work_type, offset=0, limit=100, verbose=False):
    """Fetch one page of the document listing for a given work type.

    Args:
        work_type: Value for the ``work-type`` query parameter.
        offset: Value for the ``offset`` query parameter.
        limit: Value for the ``limit`` query parameter.
        verbose: When true, print the request URL before sending it.

    Returns:
        The ``requests`` response object on HTTP 200, otherwise ``None``
        (the status code is printed).
    """
    # URL-encoded form of "application/ld+json", i.e. a JSON response body
    response_format = "application%2Fld%2Bjson"
    url = (
        "https://data.europarl.europa.eu/api/v2/"
        f"documents?work-type={work_type}&offset={offset}&limit={limit}"
        f"&format={response_format}"
    )

    if verbose:
        print(url)

    reply = requests.get(url)
    if reply.status_code != 200:
        print("Failed to fetch data. Status code:", reply.status_code)
        return None
    return reply


def get_document_by_id(id, language="de", verbose=False):
    """Fetch a single document record from the open-data API.

    Args:
        id: Document identifier appended to the ``documents/`` endpoint.
        language: Value for the ``language`` query parameter.
        verbose: When true, print the request URL before sending it.

    Returns:
        The ``requests`` response object on HTTP 200, otherwise ``None``
        (the status code and URL are printed).
    """
    base_url = "https://data.europarl.europa.eu/api/v2/documents/"

    # URL-encoded form of "application/ld+json", i.e. a JSON response body
    response_format = "application%2Fld%2Bjson"

    url = f"{base_url}{id}?format={response_format}&language={language}"
    if verbose:
        print(url)

    response = requests.get(url)
    if response.status_code == 200:
        return response

    # Return a plain None, like the other fetch helpers in this notebook.
    # The previous ``(None, url)`` tuple was truthy and broke ``is None`` checks;
    # the URL is printed instead so it is still available for debugging.
    print("Failed to fetch data. Status code:", response.status_code)
    print("URL:", url)
    return None


def get_file_location(document):
    """Return the location of the XML file attached to a document record.

    If the record's ``type`` is ``"ComplexWork"``, its current version is
    downloaded first and used in place of the record itself.

    Args:
        document: Document record (dict) as found in the ``data`` list of a
            document response.

    Returns:
        The ``is_exemplified_by`` value of the first file whose format ends
        in ``XML``.

    Raises:
        KeyError: If the record lacks one of the expected keys
            (e.g. ``is_realized_by``).
        ValueError: If the current version cannot be downloaded, or if the
            record has no XML file.
    """
    # If document is "ComplexWork", find the current version and load it
    if document["type"] == "ComplexWork":
        current_version_id = document["hasCurrentVersion"].split("/")[-1]
        response = get_document_by_id(current_version_id)
        # A failed download does not yield a usable response object.
        if not hasattr(response, "json"):
            raise ValueError(
                f"Could not load current version {current_version_id!r}"
            )
        document = response.json()["data"][0]

    # Go through all available files and pick the first one in XML format
    files = document["is_realized_by"][0]["is_embodied_by"]
    for file in files:
        if file["format"].split("/")[-1] == "XML":
            return file["is_exemplified_by"]

    # Previously this fell through to an UnboundLocalError.
    raise ValueError("Document has no file in XML format")


def get_xml_doc(file_location, verbose=False):
    """Download an XML file from the open-data host and parse it.

    Args:
        file_location: Path appended to ``https://data.europarl.europa.eu/``.
        verbose: When true, print the request URL before sending it.

    Returns:
        The root ``xml.etree.ElementTree.Element`` on HTTP 200, otherwise
        ``None`` (the status code is printed).
    """
    url = "https://data.europarl.europa.eu/" + file_location

    if verbose:
        print(url)

    reply = requests.get(url)
    if reply.status_code != 200:
        print("Failed to fetch data. Status code:", reply.status_code)
        return None

    return ET.fromstring(reply.content)


def parse_speeches_from_xml(xml_doc):
    """Extract the German-language speeches from a plenary-session XML document.

    Args:
        xml_doc: Root element of the document. It must contain ``HEAD/META``
            (its text is used as the date) and ``DEBATS`` with ``CHAPTER``
            children.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``topic``, ``text``
        and ``mep_id``, one row per intervention whose ``ORATEUR`` element
        has ``LG="DE"``. ``topic`` is the text of the chapter's
        ``TL-CHAP[@VL='DE']`` element, or ``None`` if the chapter has none.
        ``mep_id`` is the speaker's ``MEPID`` attribute (a string, or ``None``
        if absent).

    Raises:
        ValueError: If ``xml_doc`` is ``None``.
        AttributeError: If ``HEAD``, ``META`` or ``DEBATS`` is missing.
    """
    if xml_doc is None:
        raise ValueError("xml_doc is None; there is nothing to parse")

    date = xml_doc.find("HEAD").find("META").text
    chapters = xml_doc.find("DEBATS").findall("CHAPTER")

    records = []
    for chapter in chapters:
        # Topic of the chapter in German. A chapter without a German title
        # no longer aborts parsing of the whole document.
        title = chapter.find("TL-CHAP[@VL='DE']")
        chapter_topic = title.text if title is not None else None

        for intervention in chapter.findall("INTERVENTION"):
            speaker = intervention.find("ORATEUR")
            # Skip interventions without a speaker element and speeches that
            # are not in German.
            if speaker is None or speaker.get("LG") != "DE":
                continue

            # Each paragraph's text fragments (including inline markup) are
            # joined seamlessly; paragraphs are separated by a blank line.
            # The previous code did the reverse: fragments were split by
            # blank lines and paragraphs were glued together.
            text = "\n\n".join(
                "".join(paragraph.itertext())
                for paragraph in intervention.findall("PARA")
            )

            records.append(
                {
                    "date": date,
                    "topic": chapter_topic,
                    "text": text,
                    # member of parliament id
                    "mep_id": speaker.get("MEPID"),
                }
            )

    # Explicit columns keep the schema stable when no speech was found.
    return pd.DataFrame(records, columns=["date", "topic", "text", "mep_id"])

In [37]:
# Get a list of all documents of the given work type by paging through the API.
# (An earlier value, "PLENARY_CRE_EP", was overwritten before use and is dropped.)
work_type = "CRE_PLENARY"  # renamed from `type`, which shadowed the builtin
page_size = 100

document_list = []
i = 0
while True:
    try:
        response = get_documents(work_type, offset=i * page_size, limit=page_size)
        # get_documents returns None for any status other than 200; the run
        # recorded with this cell ended that way on a 204. Stop paging then.
        if response is None:
            break
        doc_list = response.json()["data"]
    except (requests.RequestException, ValueError, KeyError) as error:
        # Still best effort: keep what was collected so far, but say why we
        # stopped, and let KeyboardInterrupt through (a bare `except:` did not).
        print(f"Stopped paging at offset {i * page_size}: {error!r}")
        break

    if len(doc_list) == 0:
        break
    document_list.extend(doc_list)
    i += 1

df_documents = pd.DataFrame(
    {"identifier": [doc["identifier"] for doc in document_list]}
)

Failed to fetch data. Status code: 204


In [38]:
# Download and parse every listed document, collecting the German speeches.
speech_frames = []  # one DataFrame per successfully parsed document
failed_documents = []  # positions in df_documents of the documents that failed

for i, doc_id in tqdm(enumerate(df_documents.identifier), total=len(df_documents)):
    try:
        # Get document
        document = get_document_by_id(doc_id).json()["data"][0]

        # Extract xml file location
        file_location = get_file_location(document)

        # Download xml file
        xml_doc = get_xml_doc(file_location)

        # Extract speeches from xml file
        speech_frames.append(parse_speeches_from_xml(xml_doc))
    except Exception as error:
        print(f"Step {i}, id: {doc_id}, Error:{error}")
        failed_documents.append(i)

# Concatenate once at the end: concatenating inside the loop copies all rows
# collected so far on every iteration.
df = pd.concat(speech_frames) if speech_frames else pd.DataFrame()

  0%|          | 0/1561 [00:00<?, ?it/s]

Step 0, id: CRE-6-2008-09-22, Error:'is_realized_by'
Step 1, id: CRE-6-2008-09-22-FNL, Error:'is_realized_by'
Step 658, id: CRE-8-2018-04-19-PRV, Error:'NoneType' object has no attribute 'text'
Step 885, id: CRE-9-2019-10-23, Error:'is_realized_by'
Step 886, id: CRE-9-2019-10-24, Error:'is_realized_by'
Step 1004, id: CRE-9-2020-09-16, Error:'is_realized_by'
Step 1495, id: CRE-9-2023-07-10, Error:'is_realized_by'
Step 1514, id: CRE-9-2023-09-14, Error:'is_realized_by'
Step 1515, id: CRE-9-2023-10-02, Error:'is_realized_by'
Step 1516, id: CRE-9-2023-10-03, Error:'is_realized_by'
Step 1526, id: CRE-9-2023-10-17, Error:'is_realized_by'
Step 1529, id: CRE-9-2023-10-19, Error:'is_realized_by'
Step 1530, id: CRE-9-2023-11-08, Error:'is_realized_by'
Step 1531, id: CRE-9-2023-11-09, Error:'is_realized_by'
Step 1532, id: CRE-9-2023-11-20, Error:'is_realized_by'
Step 1533, id: CRE-9-2023-11-21, Error:'is_realized_by'
Step 1534, id: CRE-9-2023-11-22, Error:'is_realized_by'
Step 1535, id: CRE-9-202

In [39]:
# TODO: investigate why these documents could not be processed.
n_failed = len(failed_documents)
n_total = len(document_list)

print(f"{n_failed} failed documents out of a total of {n_total} !!!")
print("Find out what is happening!!!")

# Positions (in df_documents) of the documents that failed
print(failed_documents)

39 failed documents out of a total of 1561 !!!
Find out what is happening!!!
[0, 1, 658, 885, 886, 1004, 1495, 1514, 1515, 1516, 1526, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1558, 1559, 1560]


In [40]:
# Drop speeches with identical text and reset the index
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

# Convert date (day-month-year strings) to datetime format
df["date"] = pd.to_datetime(df.date, format="%d-%m-%Y")

# Change mep_id type to nullable 32-bit int
df["mep_id"] = df["mep_id"].astype("Int32")

# Filter 9th election period (speeches from 2019-07-02 onwards).
# .copy() makes the result an independent frame, so assigning new columns to
# it later does not trigger pandas' chained-assignment warnings.
df = df[df["date"] >= "2019-07-02"].copy()

In [41]:
xml_doc = get_mep_party()
# get_mep_party returns None when the download fails; stop with a clear
# message instead of an AttributeError on the next line.
if xml_doc is None:
    raise RuntimeError("Could not download the MEP list")

keys = ["id", "fullName", "country", "politicalGroup", "nationalPoliticalGroup"]
# One list of values per key; a missing child element is stored as None
data_dict = {key: [] for key in keys}

for mep in xml_doc.findall("mep"):
    for key in keys:
        element = mep.find(key)
        data_dict[key].append(element.text if element is not None else None)

df_mep_party = pd.DataFrame(data_dict)

# Filter for German meps
df_mep_party = df_mep_party[df_mep_party["country"] == "Germany"]

# Remove repeated rows while `id` is still a column: drop_duplicates ignores
# the index, so running it after set_index would merge different MEPs whose
# remaining columns happen to be identical.
df_mep_party = df_mep_party.drop_duplicates()

df_mep_party = df_mep_party.set_index("id")
df_mep_party.index = df_mep_party.index.astype("Int32")
df_mep_party.head()

Unnamed: 0_level_0,fullName,country,politicalGroup,nationalPoliticalGroup
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
197475,Christine ANDERSON,Germany,Identity and Democracy Group,Alternative für Deutschland
197448,Rasmus ANDRESEN,Germany,Group of the Greens/European Free Alliance,Bündnis 90/Die Grünen
197433,Katarina BARLEY,Germany,Group of the Progressive Alliance of Socialist...,Sozialdemokratische Partei Deutschlands
132191,Gunnar BECK,Germany,Identity and Democracy Group,Alternative für Deutschland
197408,Hildegard BENTELE,Germany,Group of the European People's Party (Christia...,Christlich Demokratische Union Deutschlands


In [44]:
# Distinct values of the national party column
df_mep_party["nationalPoliticalGroup"].unique()

array(['Alternative für Deutschland', 'Bündnis 90/Die Grünen',
       'Sozialdemokratische Partei Deutschlands',
       'Christlich Demokratische Union Deutschlands',
       'Bündnis Deutschland', 'Volt', 'Piratenpartei Deutschland',
       'Independent', 'DIE LINKE.',
       'Christlich-Soziale Union in Bayern e.V.', 'Freie Wähler',
       'Familien-Partei Deutschlands', 'Freie Demokratische Partei',
       'Ökologisch-Demokratische Partei', 'Die PARTEI'], dtype=object)

In [45]:
# Join dataframes: attach the columns of df_mep_party to each speech by
# matching the speech's mep_id against the df_mep_party index. A single join
# replaces one row-wise `.loc` lookup per column. Speakers not present in
# df_mep_party get missing values in the new columns.
# Dropping any previously attached columns first keeps the cell re-runnable.
df = df.drop(columns=list(df_mep_party.columns), errors="ignore").join(
    df_mep_party, on="mep_id"
)

# Drop rows with missing values (speakers who are not current members of parliament)
df = df.dropna()

In [46]:
# Short labels for the main national parties (CSU and CDU share one label)
mapping_national_parties = {
    "Christlich-Soziale Union in Bayern e.V.": "cdu",
    "Christlich Demokratische Union Deutschlands": "cdu",
    "DIE LINKE.": "linke",
    "Alternative für Deutschland": "afd",
    "Bündnis 90/Die Grünen": "gruene",
    "Sozialdemokratische Partei Deutschlands": "spd",
    "Freie Demokratische Partei": "fdp",
    "Volt": "volt",
    "Freie Wähler": "fw",
    "Familien-Partei Deutschlands": "familie",
    "Ökologisch-Demokratische Partei": "oedp",
    "Piratenpartei Deutschland": "piraten",
}

# Every party that occurs in the data but has no label yet becomes "other"
for party_name in df["nationalPoliticalGroup"].unique():
    mapping_national_parties.setdefault(party_name, "other")

# Translate the full party names into the short labels
df["party"] = df["nationalPoliticalGroup"].map(mapping_national_parties)

# The long party name is no longer needed
df = df.drop(columns="nationalPoliticalGroup")

# Keep only speeches of the explicitly listed parties
df = df.loc[df["party"].ne("other")]

In [48]:
# Write the speeches and the MEP table to CSV files
for frame, csv_path in (
    (df, "../data/debates/europarl_speeches.csv"),
    (df_mep_party, "../data/debates/europarl_members.csv"),
):
    frame.to_csv(csv_path)