# Parliament Hackathon 2024 - Data Wrangling Jupyter Notebook
## Setup

In [1]:
import pandas
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime
import re

# Prefix -> URI map handed to every ElementTree find/findall call in this notebook.
namespace = {
    "tei": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "XInclude": "http://www.w3.org/2001/XInclude",
}

## Getting ParlaMint Speeches

In [21]:
def get_parlamint_xml_speech(input_path, country_id = ""):
    """
    Given an input path to an xml ParlaMint file, export a pandas dataframe with its speeches, speech id, and a person id, potentially modified per country.

    Every ``tei:u`` element found under ``tei:text/tei:body/tei:div`` becomes one row. Its ``tei:seg`` children are joined
    with single spaces into one speech string; a segment that does not already end in ".", "?" or "!" gets a trailing
    period so that later sentence splitting does not merge neighbouring segments.

    :param input_path: Input file path (anything accepted by ``ET.parse``).
    :param country_id: Country_id (i.e. "GB") to add as a ``"<country_id>-"`` prefix to the person_id in the dataframe.
    :return: A pandas dataframe with the columns speech_id, speech, and person_id.
    :raises KeyError: If an utterance has no ``xml:id`` attribute.
    """
    # Setup - Getting root and dict info
    root = ET.parse(input_path).getroot()
    xml_id_key = f"{{{namespace['xml']}}}id"  # ElementTree spells xml:id as "{namespace-uri}id"
    sentence_enders = (".", "?", "!")
    export_speech_dict = {"speech_id": [], "speech": [], "person_id": []}

    # Searching through all tei speeches
    for elem in root.findall("tei:text/tei:body/tei:div/tei:u", namespace):
        export_speech_dict["speech_id"].append(elem.attrib[xml_id_key])

        # An utterance may have no speaker; it then gets a blank person_id, with no country prefix.
        who = elem.get("who")
        if who is None:
            person_id = ""
        else:
            person_id = who[1:]  # drop the first character of the reference (presumably a leading "#")
            if country_id:
                person_id = f"{country_id}-{person_id}"
        export_speech_dict["person_id"].append(person_id)

        # Sorting through speech segments.
        segments = []
        for seg in elem.findall("tei:seg", namespace):
            # seg.text only covers the text before the first child element (and is None when there is none);
            # text following an inline child lives in that child's .tail. The children's own text is left out.
            pieces = [seg.text] + [child.tail for child in seg]
            seg_text = " ".join(piece.strip() for piece in pieces if piece and piece.strip())
            if not seg_text:
                continue  # An empty segment contributes nothing (and must not become a lone ".").
            # Not all segments end in a period. To ensure that splitting text by sentence goes well, a period is added.
            if not seg_text.endswith(sentence_enders):
                seg_text += "."
            segments.append(seg_text)
        export_speech_dict["speech"].append(" ".join(segments))  # Adding speech
    return pd.DataFrame(export_speech_dict)

def get_corpus_list_speeches(corpus_list, input_path, export_path):
    """
    Given ParlaMint corpuses, output a csv and parquet file containing all of their speeches.

    For each corpus the root file ``<input_path>/<corpus>/<corpus>.TEI/<corpus>.xml`` is parsed, every
    ``XInclude:include`` element directly under its root is resolved relative to that TEI folder, and each referenced
    file is converted with ``get_parlamint_xml_speech``. The combined rows are written to
    ``<export_path>/<corpus>-RawSpeechesCSV.csv`` and ``<export_path>/<corpus>-RawSpeechesParquet.gzip``.

    :param corpus_list: The corpus that are used to get ParlaMint data (i.e. "ParlaMint-GB", since all files begin with this).
    :param input_path: Where the files come from.
    :param export_path: Where the files should be outputted.
    """
    speech_columns = ["speech_id", "speech", "person_id"]
    export_dir = Path(export_path)

    # Begin by looking through the corpus list.
    for corpus in corpus_list:
        # Get all of the xml file paths from the corpus root file.
        tei_dir = Path(input_path) / corpus / f"{corpus}.TEI"
        root = ET.parse(tei_dir / f"{corpus}.xml").getroot()
        xml_file_paths = [
            tei_dir / elem.attrib["href"]
            for elem in root.findall("XInclude:include", namespace)
        ]

        # The last two characters of the corpus name are used as the person_id prefix.
        # NOTE(review): for regional corpora such as "ParlaMint-ES-CT" this yields "CT", not "ES" - confirm that is intended.
        country_id = corpus[-2:]

        # Parse every file first and concatenate once; concatenating inside the loop re-copies all earlier rows each time.
        print(f"Beginning {corpus} search with {len(xml_file_paths)} files.")
        frames = [get_parlamint_xml_speech(file_path, country_id) for file_path in xml_file_paths]
        if frames:
            parlamint_speeches = pd.concat(frames, ignore_index=True)
        else:
            # pd.concat refuses an empty list; still export an empty table with the expected columns.
            parlamint_speeches = pd.DataFrame(columns=speech_columns)

        # Export to csv and parquet.
        parlamint_speeches.to_csv(export_dir / f"{corpus}-RawSpeechesCSV.csv", index=False)
        parlamint_speeches.to_parquet(export_dir / f"{corpus}-RawSpeechesParquet.gzip", index=False, compression='gzip')


        
# Where the downloaded ParlaMint corpora live, and where the raw speech exports are written.
base_path = "D:\\ParlaMint Data\\Files"
output_path = "D:\\ParlaMint Data\\Raw Data"

# Corpora handled by the bulk extraction run.
all_corpus = [
    "ParlaMint-BA",
    "ParlaMint-BE",
    "ParlaMint-BG",
    "ParlaMint-CZ",
    "ParlaMint-DK",
    "ParlaMint-EE",
    "ParlaMint-ES",
    "ParlaMint-ES-CT",
    "ParlaMint-ES-GA",
    "ParlaMint-ES-PV",
    "ParlaMint-FI",
    "ParlaMint-FR",
    "ParlaMint-GR",
    "ParlaMint-HR",
    "ParlaMint-HU",
    "ParlaMint-IS",
    "ParlaMint-IT",
    "ParlaMint-LV",
    "ParlaMint-NL",
    "ParlaMint-NO",
    "ParlaMint-PL",
    "ParlaMint-PT",
    "ParlaMint-RS",
    "ParlaMint-SE",
    "ParlaMint-SI",
    "ParlaMint-TR",
    "ParlaMint-UA",
]

# Metadata files kept for reference (not used by the extraction):
# base_legislative_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-taxonomy-parla.legislature.xml"
# person_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listPerson.xml"
# org_path = "ParliamentHackathon2024Data/ParlaMint4.0-GB/ParlaMint-GB.TEI/ParlaMint-GB-listOrg.xml"

#corpus_list_test = ["ParlaMint-AT", "ParlaMint-GB"]

# Gets all of the corpus lists (currently off due to finishing before)
# get_corpus_list_speeches(all_corpus, base_path, output_path)


## ParlaMint Data Cleaning

In [31]:
#gb_data = new_dataframe.copy().sample(frac=0.05, replace=True, random_state=1)


def clean_data(corpus_list, input_path, export_path):
    """
    Split every raw speech into sentences and export one cleaned parquet file per corpus.

    For each corpus, ``<input_path>/<corpus>-RawSpeechesParquet.gzip`` is read, each speech is stripped of surrounding
    whitespace and split into sentences, and every sentence becomes its own row. The speech_id of each row is extended
    with ``-<row number>``, where the row number counts sentences across the whole corpus (not per speech). The result
    is written to ``<export_path>/<corpus>-CleanedSpeechesParquet.gzip`` and progress is printed along the way.

    :param corpus_list: Corpus names (i.e. "ParlaMint-GB") whose raw speech files should be cleaned.
    :param input_path: Folder holding the raw speech parquet files.
    :param export_path: Folder the cleaned parquet files are written to.
    """
    # A sentence ends at ".", "!" or "?" followed by whitespace. The lookbehind keeps a digit followed by a period
    # (e.g. a question number such as "1. What progress ...") attached to the text that follows it.
    # The terminator itself is consumed by the split, so only the last sentence of a speech keeps its punctuation.
    sentence_boundary = re.compile(r'(?<!\d)[.!?]\s+')
    source_dir = Path(input_path)
    target_dir = Path(export_path)

    total_sentences = 0
    for corpus in corpus_list:
        print(f"Part 1: Importing Data at {datetime.datetime.now()} for {corpus}.")
        speech_data = pd.read_parquet(source_dir / f"{corpus}-RawSpeechesParquet.gzip")
        print(f"Starting with {len(speech_data)} rows.")

        # Only whitespace trimming is active. Removing interpellations in parentheses, hyphens and underscores
        # was tried and is deliberately left switched off.
        print(f"Part 2: Cleaning Speech at {datetime.datetime.now()}")
        speech_data['speech'] = speech_data['speech'].apply(str.strip)

        print(f"Part 3: Exploding Speech at {datetime.datetime.now()}")
        speech_data = speech_data.assign(speech=speech_data['speech'].apply(sentence_boundary.split)).explode('speech')

        # print(f"Part 4: Removing Small Text at {datetime.datetime.now()}")
        # speech_data = speech_data[speech_data['speech'].str.len() > 30]

        # explode() repeats the original row label for every sentence; the first reset gives each sentence a fresh
        # 0..n-1 label and the second one exposes it as a temporary "index" column.
        print(f"Part 5: Resetting Index at {datetime.datetime.now()}")
        speech_data = speech_data.reset_index(drop=True).reset_index()

        print(f"Part 6: Making New Speech ID at {datetime.datetime.now()}")
        speech_data["speech_id"] = speech_data["speech_id"] + "-" + speech_data["index"].astype(str)

        # drop() returns a new frame; without the assignment the helper column ends up in the exported file.
        print(f"Part 7: Dropping Index Column at {datetime.datetime.now()}")
        speech_data = speech_data.drop(columns = ["index"])

        print(f"Final: Showing Info at {datetime.datetime.now()}")
        print(f"Total of {len(speech_data)} rows.")
        total_sentences += len(speech_data)

        speech_data.to_parquet(target_dir / f"{corpus}-CleanedSpeechesParquet.gzip", index=False, compression='gzip')

    print(f"Total number of sentences: {total_sentences}")
 
# Raw speech files are read from the extraction output folder.
input_path = "D:\\ParlaMint Data\\Raw Data"
# NOTE(review): results go to "Cleaner Data", while later cells read from "Cleaned Data" - confirm which folder is meant.
output_path = "D:\\ParlaMint Data\\Cleaner Data"

# Corpora to clean, in processing order.
all_corpus = [
    "ParlaMint-AT",
    "ParlaMint-BA",
    "ParlaMint-BE",
    "ParlaMint-BG",
    "ParlaMint-CZ",
    "ParlaMint-DK",
    "ParlaMint-EE",
    "ParlaMint-ES",
    "ParlaMint-ES-CT",
    "ParlaMint-ES-GA",
    "ParlaMint-ES-PV",
    "ParlaMint-FI",
    "ParlaMint-FR",
    "ParlaMint-GR",
    "ParlaMint-HR",
    "ParlaMint-HU",
    "ParlaMint-IS",
    "ParlaMint-IT",
    "ParlaMint-LV",
    "ParlaMint-NL",
    "ParlaMint-NO",
    "ParlaMint-PL",
    "ParlaMint-PT",
    "ParlaMint-RS",
    "ParlaMint-SE",
    "ParlaMint-SI",
    "ParlaMint-TR",
    "ParlaMint-UA",
    "ParlaMint-GB",
]

clean_data(all_corpus, input_path, output_path)

Part 1: Importing Data at 2024-05-21 12:36:22.207527 for ParlaMint-AT.
Starting with 231759 columns.
Part 2: Cleaning Speech at 2024-05-21 12:36:27.220669
Part 3: Exploding Speech at 2024-05-21 12:36:27.314419
Part 5: Resetting Index at 2024-05-21 12:36:38.053831
Part 6: Making New Speech ID at 2024-05-21 12:36:38.438483
Part 7: Dropping Index Column at 2024-05-21 12:36:39.255077
Final: Showing Info at 2024-05-21 12:36:39.427317
Total of 1985049 columns.
Part 1: Importing Data at 2024-05-21 12:36:55.321170 for ParlaMint-BA.
Starting with 126252 columns.
Part 2: Cleaning Speech at 2024-05-21 12:36:56.517286
Part 3: Exploding Speech at 2024-05-21 12:36:56.548541
Part 5: Resetting Index at 2024-05-21 12:37:00.020636
Part 6: Making New Speech ID at 2024-05-21 12:37:00.208161
Part 7: Dropping Index Column at 2024-05-21 12:37:00.500279
Final: Showing Info at 2024-05-21 12:37:00.579939
Total of 962670 columns.
Part 1: Importing Data at 2024-05-21 12:37:09.768342 for ParlaMint-BE.
Starting wit

In [2]:
def combine_parlamint_parquets(corpus_list, input_path, export_path):
    """
    Merge the cleaned parquet files of several corpora into one parquet file.

    Reads ``<input_path>/<corpus>-CleanedSpeechesParquet.gzip`` for every corpus, keeps the speech_id, speech and
    person_id columns, stacks all rows into one dataframe (one row per sentence) and writes it to
    ``<export_path>/ParlaMint-AllCleaned.gzip``.

    The output name is fixed. It used to be built from the loop variable, so the combined file was named after
    whichever corpus happened to be last in the list, and an empty list raised a NameError.

    The frames are concatenated directly instead of being converted to Python lists, which avoids holding a second,
    larger copy of every column. The combined data still has to fit in memory.

    :param corpus_list: Corpus names (i.e. "ParlaMint-GB") whose cleaned files should be merged.
    :param input_path: Folder holding the cleaned parquet files.
    :param export_path: Folder the combined parquet file is written to.
    """
    speech_columns = ["speech_id", "speech", "person_id"]
    source_dir = Path(input_path)

    frames = []
    for corpus in corpus_list:
        print(f"Loading corpus {corpus} at {datetime.datetime.now()}")
        # Only the needed columns are read, so a stray helper column in a cleaned file is not carried over.
        frames.append(pd.read_parquet(source_dir / f"{corpus}-CleanedSpeechesParquet.gzip", columns=speech_columns))

    print(f"Setting dataframe at {datetime.datetime.now()}")
    if frames:
        parlamint_speeches = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat refuses an empty list; still export an empty table with the expected columns.
        parlamint_speeches = pd.DataFrame(columns=speech_columns)
    del frames  # Release the per-corpus frames before the export allocates its own buffers.

    print(f"Exporting at {datetime.datetime.now()}")
    parlamint_speeches.to_parquet(Path(export_path) / "ParlaMint-AllCleaned.gzip", index=False, compression='gzip')
    print(f"Finished exporting at {datetime.datetime.now()}")

# Cleaned per-corpus files are read from here; the combined file is written one level up.
input_path = "D:\\ParlaMint Data\\Cleaned Data"
output_path = "D:\\ParlaMint Data"

# Corpora to merge, in loading order.
all_corpus = [
    "ParlaMint-GB",
    "ParlaMint-AT",
    "ParlaMint-BA",
    "ParlaMint-BE",
    "ParlaMint-BG",
    "ParlaMint-CZ",
    "ParlaMint-DK",
    "ParlaMint-EE",
    "ParlaMint-ES",
    "ParlaMint-ES-CT",
    "ParlaMint-ES-GA",
    "ParlaMint-ES-PV",
    "ParlaMint-FI",
    "ParlaMint-FR",
    "ParlaMint-GR",
    "ParlaMint-HR",
    "ParlaMint-HU",
    "ParlaMint-IS",
    "ParlaMint-IT",
    "ParlaMint-LV",
    "ParlaMint-NL",
    "ParlaMint-NO",
    "ParlaMint-PL",
    "ParlaMint-PT",
    "ParlaMint-RS",
    "ParlaMint-SE",
    "ParlaMint-SI",
    "ParlaMint-TR",
    "ParlaMint-UA",
]

combine_parlamint_parquets(all_corpus, input_path, output_path)

Loading corpus ParlaMint-GB at 2024-05-21 14:25:03.327544
Loading corpus ParlaMint-AT at 2024-05-21 14:25:23.319955
Loading corpus ParlaMint-BA at 2024-05-21 14:25:26.900552
Loading corpus ParlaMint-BE at 2024-05-21 14:25:29.959595
Loading corpus ParlaMint-BG at 2024-05-21 14:25:33.392673
Loading corpus ParlaMint-CZ at 2024-05-21 14:25:37.214672
Loading corpus ParlaMint-DK at 2024-05-21 14:25:43.082630
Loading corpus ParlaMint-EE at 2024-05-21 14:25:50.352345
Loading corpus ParlaMint-ES at 2024-05-21 14:25:53.037479
Loading corpus ParlaMint-ES-CT at 2024-05-21 14:25:54.364065
Loading corpus ParlaMint-ES-GA at 2024-05-21 14:25:55.610067
Loading corpus ParlaMint-ES-PV at 2024-05-21 14:25:57.354567
Loading corpus ParlaMint-FI at 2024-05-21 14:25:59.059191
Loading corpus ParlaMint-FR at 2024-05-21 14:26:02.323144
Loading corpus ParlaMint-GR at 2024-05-21 14:26:16.454161
Loading corpus ParlaMint-HR at 2024-05-21 14:30:01.559920
Loading corpus ParlaMint-HU at 2024-05-21 14:30:17.915744
Loadi

ArrowMemoryError: realloc of size 2147483648 failed

In [2]:
# Spot check: load the cleaned GB sentences and let the notebook render a preview of the frame.
input_path = "D:\\ParlaMint Data\\Cleaned Data"
speech_data = pd.read_parquet(f"{input_path}\\ParlaMint-GB-CleanedSpeechesParquet.gzip")
# The bare expression on the last line of the cell is what makes the notebook display the dataframe.
speech_data


Unnamed: 0,index,speech_id,speech,person_id
0,0,ParlaMint-GB_2015-01-05-commons.u1-0,1. What progress her Department has made on im...,GB-JenniferWillott
1,1,ParlaMint-GB_2015-01-05-commons.u2-1,The Government are on track to deliver their c...,GB-TheresaMay
2,2,ParlaMint-GB_2015-01-05-commons.u3-2,"It is clear that exit checks, which were scrap...",GB-JenniferWillott
3,3,ParlaMint-GB_2015-01-05-commons.u3-3,"I know that progress has been made, but how su...",GB-JenniferWillott
4,4,ParlaMint-GB_2015-01-05-commons.u4-4,"As I indicated in my original answer, we are o...",GB-TheresaMay
...,...,...,...,...
5552346,5552346,ParlaMint-GB_2022-07-21-lords.u212-5552346,"Therefore, it becomes very difficult to see an...",GB-PremSikka
5552347,5552347,ParlaMint-GB_2022-07-21-lords.u212-5552347,The Minister also referred to the audit Bill,GB-PremSikka
5552348,5552348,ParlaMint-GB_2022-07-21-lords.u212-5552348,"From what I have seen, I do not have any faith...",GB-PremSikka
5552349,5552349,ParlaMint-GB_2022-07-21-lords.u212-5552349,I thank all noble Lords for staying behind and...,GB-PremSikka


MargaretProsser
JonathanMendelsohn
AlanMak
RuthHunt
JoannaCherry
WilliamJordan
MayBlood
TheresaVilliers
GeorgeGalloway
PaulWhite
JohnSpellar
BeebanKidron
AnnaTurley
GeraintDavies
TimothyYeo
PeterHeatonJones
GrevilleHoward
NicholasHoltam
DavidLaws
JohnGardiner
EleanorReeves
RobertMay
EmmaLewellBuck
GillianKeegan
MatthewWestern
MargotAline
SebastianCoe
BrendaDean
SueEllenBraverman
KatyClark
RobertFlello
GavinShuker
StewartJackson
MarkEastwood
RogerGale
MelanieOnn
AngelaHarris
DonaldMackay
CharlesDugdale
JulianSmith
JamesGray
ThomasBrake
ReginaldEmpey
GarryHart
TaniaMathias
LisaForbes
HilaryBenn
LaurenceRobertson
JeanBarker
ThomasCoke
DianaEccles
JaneHunt
MarkHunter
RobertSyms
TobiasEllwood
LeoDocherty
SallyAnnHart
TomGreatrex
AndrewGriffiths
ErnestOxburgh
BrooksNewmark
AlanBeith
DavidRichards
VictoriaAtkins
AnnMcKechin
RobertMaclennan
CharlesHendry
BrettElphicke
MiriamCates
TracyBrabin
MichaelMorris
PaulCondon
SallyOppenheimBarnes
PaulMurphy
ThangamDebbonaire
GavinBarwell
EwenCameron
Pat

In [7]:
# NOTE(review): find_party is not defined in any cell visible here - presumably it comes from a cell that was removed or not exported; confirm before re-running.
find_party()

{http://www.tei-c.org/ns/1.0}desc {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.geo-political'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.organization'}
{http://www.tei-c.org/ns/1.0}category {'{http://www.w3.org/XML/1998/namespace}id': 'parla.term'}
