In [1]:
# This notebook combines the parsed CSV files into a single dataframe with one row per speech, so that language processing can be applied to it

In [2]:
import glob
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm
import warnings

In [3]:
# Suppress FutureWarning messages so they do not clutter the notebook output.
# NOTE(review): the filter applies to the whole kernel session, so it also hides
# deprecation notices that may point at code needing an update.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
def parse():
    """Load every ``parsed-*.csv`` file into a single dataframe.

    Each file found under ``../output/csvs/`` is read, its four columns are
    renamed to ``speech``, ``speaker_id``, ``subject`` and ``speaker_span``,
    and three columns derived from the file path are added: ``fname`` (the
    path), ``date`` (the ``YYYY-MM-DD`` string found in the path) and ``term``
    (the digits right after ``parsed-``, kept as a string).

    Returns
    -------
    pandas.DataFrame
        All files stacked with a fresh 0..n-1 index, or an empty dataframe
        when no file was read.

    Raises
    ------
    ValueError
        If a file path does not contain a date or a term.
    """
    # Get the files generated at the previous step. glob returns them in a
    # filesystem-dependent order, so sort to make the row order reproducible.
    parsed_files = sorted(glob.glob("../output/csvs/parsed-*.csv"))

    # Collect one dataframe per file and concatenate once at the end.
    # (DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.)
    frames = []

    # An explicit loop (rather than a one-shot concat over a generator) so the
    # progress bar shows which file was being read when an error happens.
    for file in tqdm(parsed_files):

        # This is a badly formatted date in 2011. Compare on the file name
        # only, so the check does not depend on the path separator.
        if file.endswith('parsed-7-2011-09-15.csv'):
            continue

        # Gets date and term with regex. The term pattern is anchored on
        # "parsed-" and accepts several digits, so a two-digit term is not
        # truncated to its last digit.
        date_match = re.search(r'\d{4}-\d{2}-\d{2}', file)
        term_match = re.search(r'parsed-(\d+)-', file)
        if date_match is None or term_match is None:
            raise ValueError(f"Cannot extract term and date from file name: {file}")
        date = date_match.group()
        term = term_match.group(1)

        # Reads the file that was written by the previous step
        df = pd.read_csv(file, encoding='UTF-32')
        # Renames columns (the file is expected to have exactly four)
        df.columns = ['speech', 'speaker_id', 'subject', 'speaker_span']

        # Adds extra info
        df['fname'] = file
        df['date'] = date
        df['term'] = term

        frames.append(df)

    # pd.concat raises on an empty list; mirror the original behaviour of
    # returning an empty dataframe when there is nothing to read.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [4]:
def main(min_speech_length=280, legislature_start='2019-06-02'):
    """Build the per-speech dataframe, filter it and save it to disk.

    Steps: load the parsed files with ``parse()``, drop rows without a
    ``speaker_span``, add ``year`` and ``speech_length_in_characters``
    columns, plot the length distribution, keep only speeches of at least
    ``min_speech_length`` characters, and write the result as CSV and feather
    under ``../../output/processed/``.

    Parameters
    ----------
    min_speech_length : int, default 280
        Minimum number of characters for a row to be kept as a speech.
    legislature_start : str, default '2019-06-02'
        ISO date (``YYYY-MM-DD``) used only for the printed share of speeches
        given on or after that date.

    Returns
    -------
    pandas.DataFrame
        The filtered dataframe that was written to disk.
    """
    df = parse()

    # Get only the actual speeches, removing procedural notes that are in the document.
    df = df[~df.speaker_span.isna()].reset_index(drop=True)

    # Extract a year from the date
    df['year'] = pd.to_datetime(df.date).dt.year

    # Share (fraction, not count) of speeches given since `legislature_start`.
    # `date` holds ISO strings, so a plain string comparison orders correctly.
    n_speeches = df.shape[0]
    n_recent = df[df.date >= legislature_start].shape[0]
    share_recent = n_recent / n_speeches if n_speeches else float('nan')
    print(f"Speeches on the last legislature: {share_recent}")

    # Filtering down to the most recent legislature is currently disabled:
    # df = df[pd.to_datetime(df.date) >= pd.to_datetime('2019-06-02')]

    # As for what a speech IS, let's say that it's anything longer than a tweet.
    # This will aid classification later (too short a speech lacks context),
    # and it has the advantage of getting rid of procedural things such as a
    # President saying "the session is open".
    df['speech_length_in_characters'] = df.speech.str.len()

    # This is how the distribution looks – the tail is even longer, but the x axis is clipped
    ax = df['speech_length_in_characters'].hist(bins=1000)
    ax.set_xlim(-10, 5000)
    ax.set_xlabel('speech length (characters)')
    ax.set_ylabel('number of speeches')

    # There's a weird error with a single empty speech. Let's get rid of it.
    # NOTE(review): reset_index() without drop=True keeps the previous index as
    # an extra 'index' column in the output; kept as is because downstream
    # consumers may rely on it -- confirm whether it is wanted.
    df = df[~df.speech_length_in_characters.isna()].reset_index()

    # Filters to keep only what's long enough. Manual inspection showed that
    # most speeches with fewer characters than this were actually references
    # to internal rules, regulations and procedures.
    df = df[df['speech_length_in_characters'] >= min_speech_length]

    # Feather can only serialize a default 0..n-1 index, and the filter above
    # leaves gaps in it, so rebuild the index before saving.
    df = df.reset_index(drop=True)

    # Saves as CSV and feather.
    # NOTE(review): the input is read from ../output/ but the result is written
    # to ../../output/ -- confirm that the different depth is intended.
    df.to_csv("../../output/processed/all-speeches-df.csv", index=False)
    # to_feather has no `index` option (extra keywords are forwarded to
    # pyarrow and an unknown one raises TypeError), so none is passed.
    df.to_feather("../../output/processed/all-speeches-df.feather")

    # Hand the result back so that `df = main()` gives the caller the data.
    return df

    

In [5]:
# Entry point: run the whole pipeline when this code is executed directly
# (inside a notebook kernel __name__ is '__main__', so the cell runs it).
if __name__ == '__main__':
    # `df` receives whatever main() returns.
    df = main()