In [2]:
'''
This module tokenizes text and divides the dataset into categories (political parties, legislative sessions) for further analysis.
'''

'\nThis module tokenizes text and divides the dataset into categories for further analysis.\n'

In [2]:
from collections import defaultdict
import os
import pandas as pd

import json

# Regular Expression library
import re

from wordcloud import WordCloud

import matplotlib.pyplot as mplt

In [3]:
# Importing important packages

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords


import gensim.corpora as corpora

from pprint import pprint


import pickle 
import pyLDAvis
import pyLDAvis.gensim_models

In [4]:
directory_save = os.path.join('D:', 'tese_data', 'save')
print(directory_save)

D:tese_data\save


In [5]:
# Open the file

file_path = os.path.join(directory_save, 'clean_dataframe.json')

speeches = pd.read_json(file_path, orient = "records", lines = True)

In [6]:
# Some information on the dimension of the panda dataframe.

speeches.shape
print('There are {} columns and {} rows.'.format(speeches.shape[1], speeches.shape[0]))

There are 6 columns and 221036 rows.


In [7]:
clean_dataframe = speeches
clean_parties = ['BE', 'PSD+CDS-PP', 'Os Verdes', 'PCP', 'PS', 'PSD']

In [8]:
# Here, I create the word lists per party. That is how the functions are structured, but easily adjustable. 

In [8]:
# We start by defining the relevant stopwords.

stop_words = stopwords.words('portuguese')

common_parl_words = ['sr', 'governo', 'lei', 'presidente', 'srs',
                     'regime', 'aplausos', 'deputados', 'deputado',
                     'ministro', 'estado', 'pais', 'srª', 'partido',
                     'portugal', 'sras', 'portugueses', 'proposta',
                     'propostas', 'medida', 'medidas', 'deputada',
                     'ministra', 'assembleia', 'republica', 'socialista',
                     'país', 'voto', 'número', 'parlamentar',
                     'républica', 'senhores', 'senhor',
                     'senhora', 'série', 'falar', 'chega',
                     'grupo', 'intervenção', 'votação', 'joão',
                     'pedro', 'declaração', 'questão', 'parlamento',
                     'antónio', 'projeto', 'josé', 'iniciativa',
                     'madeira', 'açores', 'resolução', 'partidos',
                     'secretário', 'palavra', 'mesa', 'caros',
                     'discussão', 'nº', 'bancada', 'tempo',
                     'dia', 'ponto', 'resposta', 'liberal', 'pergunta',
                     'colegas', 'caras', 'informar', 'questões', 'responder',
                     'politica', 'protestos', 'bloco', 'esquerda']

common_pt_words = ['nao', 'tambem', 'porque', 'sao', 'ha', 'ja',
                   'sobre', 'aqui', 'todos', 'hoje', 'fazer', 'anos',
                   'primeiro', 'ter', 'dizer', 'so', 'ainda', 'ate',
                   'serie', 'pode', 'questao', 'ano', 'vai', 'facto',
                   'materia', 'debate', 'quer', 'portanto', 'vez', 'assim',
                   'deste', 'desta', 'parte', 'sempre', 'nesta', 'qualquer',
                   'tudo', 'apenas', 'situacao', 'sentido', 'deve',
                   'todas', 'importante', 'momento', 'onde',
                   'dar', 'contra', 'quanto', 'têm',
                   'nada', 'alias', 'outros', 'ver', 'agora',
                   'bem', 'coisa', 'disse', 'realidade', 'vezes',
                   'dia', 'nunca', 'fez', 'meses', 'sabe', 'feito',
                   'preciso', 'tão', 'aliás', 'queria', 'caso',
                   'relativamente', 'vou', 'quero', 'cada', 'exemplo',
                   'sabemos', 'há', 'já', 'até', 'faz', 'novo',
                   'vão', 'nomeadamente', 'toda', 'verdade',
                   'tipo', 'podemos', 'mesmo', 'mesma', 'relação',
                   'acerca', 'saber', 'devemos', 'outro', 'gostaria',
                   'então', 'podem', 'nome', 'matéria', 'sim', 'não',
                   'estao', 'neste', 'forma', 'diz', 'outras']

parties = ['be', 'cds', 'pp', 'ch', 'il', 'l',
            'livre', 'verdes', 'pan', 'pcp', 'ps', 'psd']

stop_words.extend(common_parl_words)
stop_words.extend(common_pt_words)
stop_words.extend(parties)

In [9]:
# We define how to tokenize the text and remove stopwords.
    
def sent_to_words(sentences):
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence, language='portuguese')
        tokens = [token.lower() for token in tokens if token.isalpha()] # remove punctuation and numbers

        yield(tokens)

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

In [10]:
# This function simply consolidates the two above, and introduces criteria for party.

def tokenize(dataframe, party=None, year=None, session=None):
    '''Prepares and returns the data_words list based on party, year, and optional session.'''
    
    if party is None:
        party = "all"
    
    if party == "all":
        if year is not None:
            filtered_data = dataframe[dataframe['Date'].dt.year == year]
        else:
            filtered_data = dataframe
        
        if session is not None:
            filtered_data = filtered_data[filtered_data['Session'] == session]
        
        data = filtered_data.Intervention_processed.values.tolist()
    elif party in clean_parties:
        if year is not None:
            filtered_data = dataframe[(dataframe['Party'] == party) & (dataframe['Date'].dt.year == year)]
        else:
            filtered_data = dataframe[dataframe['Party'] == party]
        
        if session is not None:
            filtered_data = filtered_data[filtered_data['Session'] == session]
        
        data = filtered_data.Intervention_processed.values.tolist()
    else:
        raise ValueError("Invalid party specified")
    
    data_words = list(sent_to_words(data))
    
    # Remove stop words
    data_words = remove_stopwords(data_words)
    
    if year is not None and session is not None:
        save_list(data_words, party=party, year=year, session=session)
    elif year is not None:
        save_list(data_words, party=party, year=year)
    elif session is not None:
        save_list(data_words, party=party, session=session)
    
    return data_words

In [11]:
def save_list(lst, party=None, year=None, session=None):
    '''Saves list of words for future use.'''
    
    directory_save_panda = os.path.join('D:', 'tese_data', 'save', 'panda')
    
    if party is None:
        party = "all"
    
    if party == "all":
        if year is not None:
            if session is not None:
                st = f'data_words_{year}_{session}.json'
            else:
                st = f'data_words_{year}.json'
        elif session is not None:
            st = f'data_words_{session}.json'
        else:
            st = 'data_words.json'
    elif party in clean_parties:
        if year is not None:
            if session is not None:
                st = f'data_words_{party}_{year}_{session}.json'
            else:
                st = f'data_words_{party}_{year}.json'
        elif session is not None:
            st = f'data_words_{party}_{session}.json'
        else:
            st = f'data_words_{party}.json'
    else:
        raise ValueError("Invalid party specified")
    
    file_name = os.path.join(directory_save_panda, st)

    with open(file_name, 'w') as outfile:
        json.dump(lst, outfile)

In [None]:
data_words = tokenize(clean_dataframe)
save_list(data_words)

# Total count of words:
compiled = []

for lst in data_words:
    for i in lst:
        compiled.append(i)

print(len(compiled))

print("Total count of words is: ", compiled)

In [14]:
for party in clean_parties:
    
    data_words = tokenize(clean_dataframe, party=party)
    save_list(data_words, party=party)

In [21]:
for year in range(2005, 2020):
    
    data_words = tokenize(clean_dataframe, year=year)
    save_list(data_words, year=year)

In [22]:
for session in range(10, 14):
    
    data_words = tokenize(clean_dataframe, session=session)
    save_list(data_words, session=session)

In [12]:
for party in clean_parties:
    for session in range(10,14):
        
        data_words = tokenize(clean_dataframe, party = party, session = session)
        save_list(data_words, party=party, session = session)