In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os


### Setting paths

In [None]:
# Step up from the notebook's folder to its parent directory.
# NOTE: this is relative to the current working directory, so re-running this cell climbs one level further each time.
os.chdir("..")
project_root = os.path.abspath(os.curdir)

# Input: the raw speeches table; output: the folder that receives the prepared corpora.
data_path = os.path.join(project_root, 'corpus', 'raw', 'speeches.feather')
result_path = os.path.join(project_root, 'corpus', 'prepared')

### Loading raw data

In [None]:
# Read the raw speeches table from the feather file that data_path points to.
raw_data = pd.read_feather(data_path)


### Data understanding

In [None]:
def get_info_on_dataframe(df):
    '''
    Print a quick overview of a dataframe: its shape, the dtype/non-null report
    and descriptive statistics of its object (string) columns.

    :param df: dataframe containing the raw plenary speeches
    :return: None; the summary statistics are printed
    '''
    print("shape of the dataframe: ", df.shape)
    # DataFrame.info() writes its report to stdout itself and returns None, so it must
    # not be passed to print(): that prints the report first and then "<label> None".
    print("datatypes and nan values per column: ")
    df.info()
    print("descriptive summary of object columns: ", df.describe(include=object))

# Print the overview for the raw speeches loaded above.
get_info_on_dataframe(raw_data)

In [None]:
def get_info_on_columns(df):
    '''
    Print statistics on the speech length (number of characters) and on how the
    rows are distributed over factions and positions.

    Side effect: a 'len' column holding the character count of 'speechContent'
    is added to ``df`` in place.

    :param df: dataframe containing the raw plenary speeches
    :return: None; the statistics are printed
    '''
    character_counts = df['speechContent'].apply(len)
    df['len'] = character_counts
    print('Statistics on the length of raw: ', df['len'].describe())

    # One value_counts() report per categorical column of interest.
    reports = (("rows per faction: ", 'factionId'), ("rows per position: ", 'positionShort'))
    for label, column in reports:
        print(label, df[column].value_counts())
# Besides printing, this call adds the 'len' column to raw_data that later cells read.
get_info_on_columns(raw_data)

In [None]:
def create_year_column(df):
    '''
    Convert the 'date' column to datetimes and derive a 'year' column from it.

    Both columns are written to ``df`` in place; the same dataframe object is returned.

    :param df: dataframe containing the raw plenary speeches
    :return: the input dataframe, now with a datetime 'date' column and a 'year' column
        holding the year a speech was held in (instead of the full date)
    '''
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df['year'] = parsed_dates.dt.year
    return df

# raw_data is updated in place; the returned frame is not assigned to a name here.
create_year_column(raw_data)

In [None]:
def plot_speeches_over_years(df):
    '''
    Draw a 50-bin histogram of the 'year' column and show it.

    :param df: dataframe containing the raw plenary speeches; needs a 'year' column
    :return: None; the plot showing the number of speeches held every year is displayed
    '''
    tick_font_size = 18
    years = df['year']
    years.hist(bins=50, figsize=(20, 6))
    # Enlarge the tick labels on both axes of the current figure.
    plt.xticks(fontsize=tick_font_size)
    plt.yticks(fontsize=tick_font_size)
    plt.show()

# Histogram of speeches per year; relies on the 'year' column created above.
plot_speeches_over_years(raw_data)

In [None]:
def plot_length_of_speeches(df):
    '''
    Draw a 50-bin histogram of the 'len' column (speech length in characters) and show it.

    The histogram covers all rows together; it is not split by position.

    :param df: dataframe containing the raw plenary speeches; needs a 'len' column
    :return: None; the plot showing the distribution of speech lengths is displayed
    '''
    tick_font_size = 18
    speech_lengths = df['len']
    speech_lengths.hist(bins=50, figsize=(20,6))
    # Enlarge the tick labels on both axes of the current figure.
    plt.xticks(fontsize=tick_font_size)
    plt.yticks(fontsize=tick_font_size)
    plt.show()

# Histogram of speech lengths; relies on the 'len' column added by get_info_on_columns.
plot_length_of_speeches(raw_data)

### Reducing the size of the dataset

In [None]:
def drop_irrelevant_speeches(df, min_contributions=30):
    '''
    Reduce the raw speeches in three steps and report after each step how many rows
    and characters are left relative to the input:

    1. keep only rows whose 'positionShort' is 'Member of Parliament',
    2. drop rows of unknown speakers ('politicianId' == -1),
    3. keep only speakers with more than ``min_contributions`` remaining rows.

    The input dataframe is left untouched, so re-running the cell reports against the
    same baseline every time.

    :param df: dataframe containing the raw plenary speeches; needs the columns
        'positionShort', 'politicianId' and 'len'
    :param min_contributions: a speaker is kept only with strictly more rows than this
    :return: new dataframe with plenary speeches from known Members of the Bundestag with
        more than ``min_contributions`` contributions
    '''
    def percentage(part, total):
        # An empty input would otherwise divide by zero.
        return (100 / total) * part if total else float('nan')

    start_len = len(df.index)
    print('rows before dropping values: ', start_len)
    start_char = df['len'].sum()
    print('total characters in speechContent before dropping values: ', start_char)

    # Boolean masks select rows positionally instead of dropping by index label,
    # which also stays correct if the index contains duplicate labels.
    members = df.loc[df['positionShort'] == 'Member of Parliament']
    print('rows after dropping all but MEP: ', len(members.index))
    print('percentage of rows kept after dropping all but MEP: ', percentage(len(members.index), start_len))
    print('percentage of characters in speechContent kept after dropping all but MEP: ', percentage(members['len'].sum(), start_char))

    known_speakers = members.loc[members['politicianId'] != -1]
    print('rows after dropping speeches of unknown speakers: ', len(known_speakers.index))
    print('percentage of raw kept after dropping unknown speakers: ', percentage(len(known_speakers.index), start_len))
    print('percentage of characters in speechContent kept after dropping unknown speaker: ', percentage(known_speakers['len'].sum(), start_char))

    # Number of remaining rows per speaker, aligned with the rows of known_speakers.
    contributions = known_speakers.groupby('politicianId')['politicianId'].transform('size')
    # copy() hands back an independent frame rather than a view-like slice.
    consolidated_speeches = known_speakers.loc[contributions > min_contributions].copy()
    print('rows after dropping speeches of rare speakers: ', len(consolidated_speeches.index))
    print('percentage of raw kept after dropping rare speakers: ', percentage(len(consolidated_speeches.index), start_len))
    print('percentage of characters in speechContent kept after dropping rare speaker: ', percentage(consolidated_speeches['len'].sum(), start_char))
    return consolidated_speeches


# Reduce the raw speeches to the subset that goes into the corpus.
consolidated_data=drop_irrelevant_speeches(raw_data)

In [None]:
def merge_fragmented_speeches(df):
    '''
    Join speech fragments that were stored as separate rows into one row per speech.

    Rows are grouped by session, electoral term, speaker, faction, document and date;
    within a group the contents are concatenated in ascending 'id' order, 'len' is
    summed and the remaining columns take the value of the lowest-id row. Note that
    the grouping does not look at row adjacency: all rows sharing these keys are joined.

    :param df: dataframe with plenary speeches from known Members of the Bundestag with more than 30 contributions
    :return: new dataframe with plenary speeches, where speeches that are wrongly saved as speech fragments are joined
    '''
    # sort_values returns a new frame; its result has to be used, otherwise the
    # fragments are joined in whatever order the input happens to be in.
    # kind='stable' keeps the input order for rows with equal ids.
    ordered = df.sort_values(by='id', kind='stable')
    group_keys = ['session', 'electoralTerm', 'politicianId', 'factionId', 'documentUrl', 'date', 'year']
    aggregations = {
        'id': 'first',
        'firstName': 'first',
        'lastName': 'first',
        'positionShort': 'first',
        'positionLong': 'first',
        'speechContent': ''.join,
        'len': 'sum',
    }
    # NOTE(review): groupby drops rows with a missing value in any key column by
    # default - confirm the key columns (e.g. factionId) never contain NaN.
    return ordered.groupby(group_keys).agg(aggregations).reset_index()

# One row per speaker, session and document: rows sharing these keys are joined.
corpus = merge_fragmented_speeches(consolidated_data)

In [None]:
def drop_after_merge(df, min_length=100):
    '''
    Remove speeches with little or no content and the columns that are no longer needed.

    The input dataframe is not modified; a new dataframe is returned.

    :param df: dataframe with plenary speeches from known Members of the Bundestag with more than 30 contributions and joined speeches
    :param min_length: rows whose 'len' is below this number of characters are removed
    :return: new dataframe that only contains plenary speeches from known Members of the Bundestag with
        at least ``min_length`` characters, without the columns 'session', 'firstName', 'lastName',
        'documentUrl', 'positionLong' and 'date'
    '''
    # Negating the "too short" test (rather than testing >=) keeps rows exactly as the
    # in-place drop of short rows did, including rows with a missing length.
    long_enough = ~(df['len'] < min_length)
    kept = df.loc[long_enough]
    print('rows after dropping speeches without or with little content: ', len(kept.index))
    # errors='ignore' lets the cell be re-run on a frame whose columns were already
    # removed (the caller rebinds its variable to this function's result).
    return kept.drop(
        columns=['session', 'firstName', 'lastName', 'documentUrl', 'positionLong', 'date'],
        errors='ignore',
    )

# Rebind corpus to the filtered, column-reduced result.
corpus = drop_after_merge(corpus)

### Saving the corpus to disk

In [None]:
def save_csv_to_disk(df, output_dir=None):
    '''
    Write the corpus and three smaller test corpora as CSV files:

    - corpus.csv: all rows of ``df``
    - test_corpus_3_terms.csv: rows whose 'electoralTerm' is 17, 18 or 19
    - test_corpus_1000.csv / test_corpus_10000.csv: random samples (random_state=1) of
      up to 1000 / 10000 rows

    :param df: dataframe holding the prepared corpus; needs an 'electoralTerm' column
    :param output_dir: target directory; defaults to the notebook-level ``result_path``.
        The directory is created if it does not exist.
    :return: None; the files are written to disk
    '''
    if output_dir is None:
        # Fall back to the notebook-wide output location.
        output_dir = result_path
    os.makedirs(output_dir, exist_ok=True)

    # Use the dataframe that was passed in rather than a notebook global.
    df.to_csv(os.path.join(output_dir, "corpus.csv"))

    test_corpus_3_terms = df.loc[df['electoralTerm'].isin([19, 18, 17])]
    test_corpus_3_terms.to_csv(os.path.join(output_dir, "test_corpus_3_terms.csv"))

    for sample_size in (1000, 10000):
        # sample() raises when asked for more rows than exist (without replacement),
        # so a corpus smaller than the sample size is written out in full instead.
        rows_to_draw = min(sample_size, len(df.index))
        test_corpus = df.sample(n=rows_to_draw, random_state=1)
        test_corpus.to_csv(os.path.join(output_dir, "test_corpus_{}.csv".format(sample_size)))


# Write the corpus and the smaller test corpora to disk.
save_csv_to_disk(corpus)