In [1]:
'''
This module takes the dataset and tokenizes the speech to prepare it for analysis.
'''

'\nThis module takes the dataset and tokenizes the speech to prepare it for analysis.\n'

In [2]:
from collections import defaultdict
import os
import pandas as pd

import json

# Regular Expression library
import re

from wordcloud import WordCloud

import matplotlib.pyplot as mplt

In [3]:
# Importing important packages

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords


import gensim.corpora as corpora

from pprint import pprint


import pickle 
import pyLDAvis
import pyLDAvis.gensim_models
from datetime import datetime

In [4]:
directory_save = os.path.join('D:', 'tese_data', 'save')
print(directory_save)

D:tese_data\save


In [5]:
# Open the file

file_path = os.path.join(directory_save, 'panda_dict.json')

speeches = pd.read_json(file_path)

In [6]:
# Some information on the dimension of the panda dataframe.

speeches.shape
print('There are {} columns and {} rows.'.format(speeches.shape[1], speeches.shape[0]))

There are 5 columns and 382899 rows.


In [7]:
def get_clean_parties(dtf, parameter_to_clean = 'Party', cutoff = 150):
    ''' Takes dataframe and defines a list with actual parties.
    '''
    
    clean_field = list()
    
    for field, data in dtf.groupby(parameter_to_clean):
        if dtf[dtf[parameter_to_clean] == field].agg(count = ('Intervention', 'count')).iat[0,0] > cutoff:
            clean_field.append(field)
    
        else:
            pass

    return clean_field
    
        
def get_clean_dataframe(dtf, field_to_cut = 'Intervention', length_cut_off = 40):
    ''' Takes raw dataframe and cleans it by dropping
    small interventions, keeping only true political parties,
    and removing punctuation.
    '''
    
    # Drop small interventions
    dataframe = dtf.loc[dtf[field_to_cut].str.len() > length_cut_off]
    
    # Keep only actual true parties
    dataframe = dataframe[dataframe['Party'].isin(clean_parties)]
    
    # Remove punctuation
    dataframe['Intervention_processed'] = dataframe['Intervention'].map(lambda x: re.sub('[,\\.!?:]', '', x))

    # Convert the titles to lowercase
    dataframe['Intervention_processed'] = dataframe['Intervention_processed'].map(lambda x: x.lower())
        
    return dataframe

In [8]:
# Implementing the functions specified above.

clean_parties = get_clean_parties(speeches)

clean_dataframe = get_clean_dataframe(speeches)

In [65]:
clean_dataframe.shape
print('There are {} columns and {} rows.'.format(clean_dataframe.shape[1], clean_dataframe.shape[0]))

There are 6 columns and 271049 rows.


In [41]:
# Here, I change the dataset to only include data until the end of our period of interest.

last_session = "2019-10-25"

clean_dataframe["Date"] = pd.to_datetime(clean_dataframe["Date"])
clean2 = clean_dataframe[clean_dataframe["Date"] <= pd.to_datetime(last_session)]

print("Last Session is: ", last_session)

print(clean2.head)

Last Session is:  2019-10-25
<bound method NDFrame.head of              Date            File  \
0      2005-03-11  darl10sl01n001   
1      2005-03-11  darl10sl01n001   
2      2005-03-11  darl10sl01n001   
3      2005-03-11  darl10sl01n001   
4      2005-03-11  darl10sl01n001   
...           ...             ...   
312665 2019-10-10  darl13sl04n110   
312667 2019-10-10  darl13sl04n110   
312669 2019-10-10  darl13sl04n110   
312670 2019-10-10  darl13sl04n110   
312671 2019-10-10  darl13sl04n110   

                                             Intervention   Party  Session  \
0       Sr. Presidente, o Bloco de Esquerda far-se-á r...      BE       10   
1       Sr. Presidente, Srs. Deputados: Conheço José S...      BE       10   
2       Sr. Presidente, Sr.as Deputadas e Srs. Deputad...      BE       10   
3       Sr. Presidente, é para indicar que integrará e...  CDS-PP       10   
4       Sr. Presidente, Sr.as e Srs. Deputados: No pas...  CDS-PP       10   
...                         

In [66]:
clean2.shape
print('There are {} columns and {} rows.'.format(clean2.shape[1], clean2.shape[0]))

There are 6 columns and 221784 rows.


In [43]:
clean_data_frame = clean2

In [103]:
# Here, I change the parties of Government to PS or PSD/CDS-PP respectively, and join PSD/CDS-PP.

clean3 = clean2


clean3.loc[
    (clean3["Date"] >= pd.to_datetime("2011-06-20")) &
    (clean3["Date"] <= pd.to_datetime("2015-11-26")) &
    (clean3["Party"] == "Government"),
    "Party"
] = "PSD/CDS-PP"


clean3.loc[
    (clean3["Date"] < pd.to_datetime("2011-06-20")) &
    (clean3["Party"] == "Government"),
    "Party"
] = "PS"

clean3.loc[
    (clean3["Date"] > pd.to_datetime("2015-11-26")) &
    (clean3["Party"] == "Government"),
    "Party"
] = "PS"


clean3.loc[
    (clean3["Date"] > pd.to_datetime("2015-11-26")) &
    (clean3["Party"] == "Government"),
    "Party"
] = "PS"

clean3.loc[
    (clean3["Party"] == "PSD") |
    (clean3["Party"] == "PSD/CDS-PP") |
    (clean3["Party"] == "CDS-PP"),
    "Party"
] = "PSD+CDS-PP"



In [105]:
clean_data_frame = clean3

clean_data_frame.shape
print('There are {} columns and {} rows.'.format(clean_data_frame.shape[1], clean_data_frame.shape[0]))

There are 6 columns and 221784 rows.


In [106]:

# Save the DataFrame as a JSON file
directory_save = os.path.join('D:', 'tese_data', 'save')
json_filename = "clean_dataframe.json"
file_path = os.path.join(directory_save, json_filename)


clean_data_frame.to_json(file_path, orient="records", lines=True)

print(f"DataFrame saved as {json_filename}")

DataFrame saved as clean_dataframe.json


In [7]:
# Finally, I take a provisional clean dataframe and remove interventions from PAN and "nao inscritos"


file_path = os.path.join(directory_save, 'clean_dataframe.json')

speeches = pd.read_json(file_path, orient="records", lines=True)

speeches.shape
print('There are {} columns and {} rows.'.format(speeches.shape[1], speeches.shape[0]))

speeches = speeches[~speeches['Party'].isin(['PAN', 'N insc.'])]

speeches.shape
print('There are {} columns and {} rows.'.format(speeches.shape[1], speeches.shape[0]))


There are 6 columns and 221784 rows.
There are 6 columns and 221036 rows.


In [8]:
clean_data_frame = speeches


# Save the DataFrame as a JSON file
directory_save = os.path.join('D:', 'tese_data', 'save')
json_filename = "clean_dataframe.json"
file_path = os.path.join(directory_save, json_filename)


clean_data_frame.to_json(file_path, orient="records", lines=True)

print(f"DataFrame saved as {json_filename}")

DataFrame saved as clean_dataframe.json


In [101]:
compiled = []

for lst in data_words:
    for i in lst:
        compiled.append(i)

print(len(compiled))



12410951




In [109]:
for party in clean_parties:
    
    data_words = tokenize(clean_data_frame, party)
    save_list(data_words, party)