In [1]:
# standard library
import xml.etree.ElementTree as ET
from pathlib import Path

# third-party
import elementpath as ep
import pandas as pd
import requests
from tqdm.auto import tqdm

In [2]:
def get_mep(term):
    """Request the members of parliament for one parliamentary term.

    Parameters
    ----------
    term
        Value inserted as the ``parliamentary-term`` query parameter.

    Returns
    -------
    The ``requests`` response when the status code is 200; otherwise a
    message is printed and ``None`` is returned.
    """
    api_root = "https://data.europarl.europa.eu/api/v1/"
    media_type = "application%2Fld%2Bjson"  # percent-encoded "application/ld+json"
    endpoint = f"{api_root}meps?parliamentary-term={term}&format={media_type}"

    response = requests.get(endpoint)

    # Guard clause: anything but 200 is reported and signalled with None.
    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        return None
    return response


def get_documents(work_type, offset=0, limit=100, verbose=False):
    """Request one page of the document listing for a work type.

    Parameters
    ----------
    work_type
        Value inserted as the ``work-type`` query parameter.
    offset, limit
        Paging values inserted as the ``offset`` and ``limit`` query parameters.
    verbose : bool
        When true, the request URL is printed before the request is sent.

    Returns
    -------
    The ``requests`` response when the status code is 200; otherwise a
    message is printed and ``None`` is returned.
    """
    api_root = "https://data.europarl.europa.eu/api/v1/"
    media_type = "application%2Fld%2Bjson"  # ask for a JSON(-LD) response body
    endpoint = (
        f"{api_root}documents?work-type={work_type}"
        f"&offset={offset}&limit={limit}&format={media_type}"
    )

    if verbose:
        print(endpoint)

    response = requests.get(endpoint)

    # Guard clause: anything but 200 is reported and signalled with None.
    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        return None
    return response


def get_document_by_id(id, language="de", verbose=False):
    """Request a single document record by its identifier.

    Parameters
    ----------
    id
        Document identifier appended to the ``documents/`` endpoint.
    language : str
        Value inserted as the ``language`` query parameter.
    verbose : bool
        When true, the request URL is printed before the request is sent.

    Returns
    -------
    The ``requests`` response when the status code is 200, otherwise ``None``.
    The failure path used to return a ``(None, url)`` tuple, which did not
    match the success path or the other fetch helpers in this notebook; the
    URL is now reported in the printed message instead.
    """
    base_url = "https://data.europarl.europa.eu/api/v1/documents/"

    response_format = "application%2Fld%2Bjson"  # response content is json format

    query = f"{id}?format={response_format}&language={language}"

    url = base_url + query
    if verbose:
        print(url)

    response = requests.get(url)
    if response.status_code == 200:
        return response

    # Keep the URL visible for debugging now that it is no longer returned.
    print("Failed to fetch data. Status code:", response.status_code, "URL:", url)
    return None


def get_file_location(document):
    """Return the location of the XML file attached to a document record.

    Parameters
    ----------
    document : dict
        One entry of the ``data`` list returned by the documents endpoint.
        If its ``type`` is ``"ComplexWork"``, the record of its current
        version is downloaded with ``get_document_by_id`` and used instead.

    Returns
    -------
    The ``is_exemplified_by`` value of the first file whose ``format`` ends
    in ``/XML``. Only the first entry of ``is_realized_by`` is inspected.

    Raises
    ------
    KeyError
        If the record lacks one of the expected keys (e.g. ``is_realized_by``).
    ValueError
        If none of the listed files is an XML file. Previously this case
        failed with an ``UnboundLocalError`` on the return statement.
    """
    # If document is "ComplexWork", find the current version and load it
    if document["type"] == "ComplexWork":
        current_version_id = document["hasCurrentVersion"].split("/")[-1]
        document = get_document_by_id(current_version_id).json()["data"][0]

    # Go through all available files and return the first one in XML format
    files = document["is_realized_by"][0]["is_embodied_by"]
    for file in files:
        # The format is a URI-like string; its last path segment names the type.
        file_format = file["format"].split("/")[-1]
        if file_format == "XML":
            return file["is_exemplified_by"]

    raise ValueError(
        f"No XML file found for document {document.get('identifier', '<unknown>')}")


def get_xml_doc(file_location, verbose=False):
    """Download an XML file and parse it into an ElementTree element.

    Parameters
    ----------
    file_location : str
        Path appended to ``https://data.europarl.europa.eu/``.
    verbose : bool
        When true, the request URL is printed before the request is sent.

    Returns
    -------
    The root element parsed from the response body when the status code is
    200; otherwise a message is printed and ``None`` is returned.
    """
    url = "https://data.europarl.europa.eu/" + file_location

    if verbose:
        print(url)

    response = requests.get(url)

    # Guard clause: anything but 200 is reported and signalled with None.
    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        return None
    return ET.fromstring(response.content)


def parse_speeches_from_xml(xml_doc):
    """Extract the German-language speeches from a plenary session document.

    Parameters
    ----------
    xml_doc : xml.etree.ElementTree.Element
        Root element of the document. The date is read from ``HEAD/META``
        and the speeches from the ``INTERVENTION`` children of each
        ``CHAPTER`` under ``DEBATS``.

    Returns
    -------
    pandas.DataFrame
        One row per intervention whose ``ORATEUR`` element has ``LG="DE"``,
        with the columns ``date``, ``topic``, ``text`` and ``mep_id``.
        ``topic`` is ``None`` for chapters without a German title and
        ``mep_id`` is ``None`` when the speaker carries no ``MEPID``.

    Raises
    ------
    AttributeError
        If ``HEAD``, ``META`` or ``DEBATS`` is missing (or ``xml_doc`` is None).
    """
    columns = ["date", "topic", "text", "mep_id"]

    date = xml_doc.find("HEAD").find("META").text
    chapters = xml_doc.find("DEBATS").findall("CHAPTER")

    records = []
    for chapter in chapters:
        # Find topic of chapter in German. A chapter without a German title
        # must not abort the whole document, so fall back to None.
        title = chapter.find("TL-CHAP[@VL='DE']")
        chapter_topic = title.text if title is not None else None

        for intervention in chapter.findall("INTERVENTION"):
            speaker = intervention.find("ORATEUR")
            # Keep only speeches marked as German; interventions without a
            # speaker element or language attribute are skipped.
            if speaker is None or speaker.get("LG") != "DE":
                continue

            # Flatten each paragraph (including nested markup) to text, then
            # join the paragraphs with a space so that the last word of one
            # paragraph is not glued to the first word of the next.
            paragraphs = [" ".join(paragraph.itertext())
                          for paragraph in intervention.findall("PARA")]

            records.append({
                "date": date,
                "topic": chapter_topic,
                "text": " ".join(paragraphs),
                # member of parliament id
                "mep_id": speaker.get("MEPID"),
            })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [3]:
# Get a list of all documents of one work type by paging through the API
# until a request fails or a page comes back empty.
WORK_TYPE = "PLENARY_CRE_EP"  # named so the builtin `type` is not shadowed
PAGE_SIZE = 100

document_list = []
offset = 0
while True:
    try:
        response = get_documents(WORK_TYPE, offset=offset, limit=PAGE_SIZE)
        # get_documents prints the status code and returns None on a non-200 reply.
        doc_list = None if response is None else response.json()["data"]
    except Exception as error:
        # Stay best-effort like the original loop, but report the reason
        # instead of hiding it behind a bare `except:` (which also swallowed
        # KeyboardInterrupt).
        print(f"Paging stopped at offset {offset} with an error: {error!r}")
        break

    if response is None:
        print(f"Paging stopped at offset {offset}: request failed, "
              f"{len(document_list)} documents collected so far.")
        break
    if len(doc_list) == 0:
        break

    document_list.extend(doc_list)
    offset += PAGE_SIZE

df_documents = pd.DataFrame(
    {"identifier": [doc["identifier"] for doc in document_list]})

In [4]:
# Download and parse every listed document. Documents that raise at any step
# are reported and their position in df_documents is remembered.
speech_frames = []
failed_documents = []
for i, doc_id in tqdm(enumerate(df_documents.identifier), total=len(df_documents)):

    try:
        # Get document
        document = get_document_by_id(doc_id).json()["data"][0]

        # Extract xml file location
        file_location = get_file_location(document)

        # Download xml file
        xml_doc = get_xml_doc(file_location)

        # Extract speeches from xml file
        speech_frames.append(parse_speeches_from_xml(xml_doc))
    except Exception as error:
        print(f"Step {i}, id: {doc_id}, Error:{error}")
        failed_documents.append(i)

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# previously collected rows on every iteration.
df = pd.concat(speech_frames) if speech_frames else pd.DataFrame()

  0%|          | 0/1601 [00:00<?, ?it/s]

Step 0, id: CRE-6-2008-09-22, Error:'is_realized_by'
Step 1, id: CRE-6-2008-09-22-FNL, Error:'is_realized_by'
Step 658, id: CRE-8-2018-04-19-PRV, Error:'NoneType' object has no attribute 'text'


In [5]:
# TODO: investigate why these documents could not be processed.
print(
    f"{len(failed_documents)} failed documents out of a total of {len(document_list)} !!!",
    "Find out what is happening!!!",
    sep="\n",
)

# Positions (within df_documents) of the documents that failed.
print(failed_documents)

3 failed documents out of a total of 1601 !!!
Find out what is happening!!!
[0, 1, 658]


In [6]:
# Clean the speeches in one chain:
#   1. drop rows whose text is a duplicate and renumber the index,
#   2. parse the date strings (day-month-year) into datetimes,
#   3. store the MEP id as a nullable 32-bit integer.
df = (
    df.drop_duplicates(subset=["text"])
    .reset_index(drop=True)
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"], format="%d-%m-%Y"),
        mep_id=lambda frame: frame["mep_id"].astype("Int32"),
    )
)

In [7]:
# Collect the members of parliament of every requested term into one table
# indexed by their identifier.
mep_frames = []
for term in range(10):
    response = get_mep(term=term)
    if response is None:
        # get_mep already printed the status code; skip this term instead of
        # failing on `.json()` of None.
        print(f"Skipping parliamentary term {term}: no data received.")
        continue
    mep_frames.append(pd.DataFrame(response.json()["data"]))

# Concatenate once instead of growing the frame inside the loop.
df_mep = pd.concat(mep_frames).set_index("identifier")
df_mep.index = df_mep.index.astype("Int32")

# Keep one row per identifier. drop_duplicates() compares only the column
# values and ignores the index, so it does not guarantee unique identifiers;
# a label-based lookup such as df_mep.loc[mep_id] returns several rows when
# the index contains duplicates.
df_mep = df_mep[~df_mep.index.duplicated(keep="first")]

In [8]:
def _mep_attribute(mep_id, attribute):
    """Return `attribute` of the df_mep entry for `mep_id`, or None if the id is not in its index."""
    if mep_id not in df_mep.index:
        return None
    return getattr(df_mep.loc[mep_id], attribute)


# Attach the given and family name of the speaker to every speech.
for name_column in ("givenName", "familyName"):
    df[name_column] = df["mep_id"].apply(
        lambda mep_id, column=name_column: _mep_attribute(mep_id, column))

In [9]:
# Save the speeches and the MEP table as CSV files.
output_dir = Path("../data/debates")

# to_csv does not create missing directories; create the target first so the
# results of the long download loop are not lost to a missing-folder error.
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / "europarl_speaches.csv")
df_mep.to_csv(output_dir / "europarl_members.csv")