# Script para limpar os discursos da base:
- remover stopwords
- remover pontuação
- remover caracteres especiais
- remover palavras/termos muito comuns, por exemplo, "Sr. Presidente"

In [None]:
import os
import pandas as pd
import re

import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from unicodedata import normalize


In [51]:

def remove_stop_words(data, list_stop_words = None):
    """
    Remove stop words from every sentence in ``data``.

    Each sentence is split on whitespace; a word is dropped when its
    lower-cased form is found in ``list_stop_words``. The surviving words
    are re-joined with single spaces (so runs of whitespace collapse).

    Parameters
    ----------
    data : iterable of str
        Sentences to filter, typically a pandas Series.
    list_stop_words : collection of str, optional
        Lower-case words to drop. When ``None`` nothing is filtered and
        ``data`` is returned unchanged.

    Returns
    -------
    pandas.Series
        The filtered sentences. When ``data`` is a Series its index is
        preserved, so the result can be assigned back to the originating
        DataFrame without misaligning rows. When ``list_stop_words`` is
        ``None`` the original ``data`` object is returned as is.
    """
    print('  Remove Stop Words')

    if list_stop_words is None:
        return data

    # Membership is tested once per word; make sure the lookup is O(1).
    if not isinstance(list_stop_words, (set, frozenset)):
        list_stop_words = set(list_stop_words)

    output = [
        ' '.join(
            word for word in sentence.split()
            if word.lower() not in list_stop_words
        )
        for sentence in data
    ]

    # Keep the caller's row labels; a fresh 0..n-1 index would silently
    # misalign on assignment if the input index had gaps.
    index = data.index if isinstance(data, pd.Series) else None
    return pd.Series(output, index=index)

def remove_punct(data):
    """
    Replace every ASCII punctuation character with a single space.

    ``data`` is expected to support ``replace(pattern, value, regex=True)``
    (e.g. a pandas Series); the replaced object is returned.
    """
    print('  Remove Punction')

    # Backslash-escape each mark from string.punctuation and OR them
    # together into one alternation pattern.
    escaped_marks = ('\\' + mark for mark in string.punctuation)
    punct_regex = '|'.join(escaped_marks)
    return data.replace(punct_regex, ' ', regex=True)

def remove_special_caract(array_data):
    """
    Fold each string in ``array_data`` down to plain ASCII.

    Every item is NFKD-normalized and then encoded to ASCII with
    non-encodable code points ignored, which strips accents and drops any
    character that has no ASCII form. Returns a new list of str.
    """
    print('  Remove Special Caract')

    def _to_ascii(text):
        decomposed = normalize('NFKD', text)
        return decomposed.encode('ASCII', 'ignore').decode('ASCII')

    return list(map(_to_ascii, array_data))

# Compiled once at import time; the function is applied to every row.
# Raw strings keep the regex escapes intact, and implicit concatenation
# (instead of a backslash line continuation) keeps source indentation out
# of the pattern.
_PATTERN_TERMS_RE = re.compile(
    # Everything from the start of the text up to the first "- Sr".
    r"^(.+?)-\sSr"
    # "O SR." / "A SRA." optionally followed by a title. The title is
    # matched after zero or more extra spaces because the honorific has
    # already consumed one whitespace character.
    r"|(((O?\s?SR\.?\s+?)|(A?\s?SRA\.?\s+?))"
    r"(\s*DEPUTADO|\s*DEPUTADA|\s*PRESIDENTE|\s*PRESIDENTA)?)"
    # Bare honorifics.
    r"|(SR\.?\s+?)|(SRA\.?\s+?)|srs|sras"
    # Bare titles. NOTE(review): "(s)" makes the plural mandatory, so the
    # singular "deputado"/"deputada" on its own is kept - confirm this is
    # intended.
    r"|(presidente|deputado(s)|deputada(s))",
    flags=re.IGNORECASE,
)


def remove_patter_terms(data):
    """
    Strip forms of address that recur in the speeches from one text.

    Removes, case-insensitively: the leading span up to the first "- Sr",
    honorifics such as "O SR." / "A SRA." together with a following
    DEPUTADO / DEPUTADA / PRESIDENTE / PRESIDENTA, the tokens "srs" and
    "sras", and the words "presidente", "deputados" and "deputadas".

    Parameters
    ----------
    data : str
        A single speech text.

    Returns
    -------
    str
        The text with every match deleted. Surrounding whitespace and
        punctuation are left in place.
    """
    return _PATTERN_TERMS_RE.sub('', data)

def clean_text(data):
    """
    Run the text pre-processing pipeline on a Series of speeches.

    Steps, in order:
      1. delete recurring forms of address (``remove_patter_terms``);
      2. replace punctuation with spaces (``remove_punct``);
      3. lower-case the text;
      4. drop Portuguese stop words (``remove_stop_words``).

    Parameters
    ----------
    data : pandas.Series of str
        Texts to clean. Must not contain missing values: step 1 applies a
        regex substitution to every element.

    Returns
    -------
    pandas.Series
        The cleaned texts, as returned by ``remove_stop_words``.
    """
    print('Cleaning text...')
    data = data.apply(remove_patter_terms)

    data = remove_punct(data)
    data = data.str.lower()

    # The NLTK stop words are folded to ASCII before matching.
    # NOTE(review): this assumes the incoming text is already accent-free,
    # since the text itself is not folded here - confirm against the
    # source of CLEAN_CONTENT.
    list_stop_words = remove_special_caract(stopwords.words('portuguese'))
    # Extra high-frequency words, in the same accent-free form.
    list_stop_words.extend(['ser','ja','ha','exa','ainda','ate','todo','todos','toda','todas','devem'])
    list_stop_words = set(list_stop_words)
    data = remove_stop_words(data, list_stop_words)

    return data

In [60]:
# Folder that holds the CSV files used by this notebook.
directory = "../data/csvs/"
# Load the speeches table.
discursos = pd.read_csv(os.path.join(directory, 'discursos_plen.csv'))

In [61]:
# Count missing values per column of the freshly loaded table.
discursos.isnull().sum()

FILE                4
ORIGINAL_CONTENT    4
CLEAN_CONTENT       4
SESSION_TYPE        0
SESSION_DATE        0
MONTH               0
YEAR                0
SPEAKER_COUNT       0
dtype: int64

In [62]:
# Drop every row that has a missing value, then renumber the index 0..n-1
# so that positionally built Series line up with the frame on assignment.
discursos = discursos.dropna().reset_index(drop=True)

In [63]:
# Re-count missing values per column to confirm the rows were dropped.
discursos.isnull().sum()

FILE                0
ORIGINAL_CONTENT    0
CLEAN_CONTENT       0
SESSION_TYPE        0
SESSION_DATE        0
MONTH               0
YEAR                0
SPEAKER_COUNT       0
dtype: int64

In [64]:
# Show the number of rows, then the per-column dtype / non-null summary.
n_registros = len(discursos)
print(n_registros)
discursos.info()

113154
<class 'pandas.core.frame.DataFrame'>
Int64Index: 113154 entries, 0 to 113153
Data columns (total 8 columns):
FILE                113154 non-null object
ORIGINAL_CONTENT    113154 non-null object
CLEAN_CONTENT       113154 non-null object
SESSION_TYPE        113154 non-null object
SESSION_DATE        113154 non-null object
MONTH               113154 non-null int64
YEAR                113154 non-null int64
SPEAKER_COUNT       113154 non-null int64
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [65]:
#discursos = discursos.reindex([i for i in range(0,len(discursos))])#.isnull().sum()

In [66]:
#discursos.isnull().sum()

In [67]:
#discursos.head()


In [68]:
# Strip special characters from the stop words, because the text has none.
list_stop_words = remove_special_caract(stopwords.words('portuguese'))

# Append extra words that should also be removed.
list_stop_words += ['ser','ja','ha','exa','ainda','ate','todo','todos','toda','todas','devem']

  Remove Special Caract


In [69]:
# Overwrite CLEAN_CONTENT with its cleaned version. The assignment aligns
# on index labels, so the Series returned by clean_text must carry the
# same labels as discursos.
discursos.loc[:,'CLEAN_CONTENT'] = clean_text(discursos.loc[:,'CLEAN_CONTENT'])
#discursos.iloc[57646:57647,:]

Cleaning text...
  Remove Punction
  Remove Special Caract
  Remove Stop Words


In [10]:
# Check that cleaning did not introduce missing values in any column.
print(discursos.isna().sum())

FILE                0
ORIGINAL_CONTENT    0
CLEAN_CONTENT       0
SESSION_TYPE        0
SESSION_DATE        0
MONTH               0
YEAR                0
SPEAKER_COUNT       0
dtype: int64


In [70]:
# Number of records per YEAR value, followed by the overall row count.
registros_por_ano = discursos.YEAR.value_counts()
total_registros = len(discursos)
print('Qtd de Registros por ANO:\n{}\n\nQtd Total de registros:{} '.format(registros_por_ano, total_registros))

Qtd de Registros por ANO:
2015    7971
2017    7923
2013    7923
2007    7264
2009    7110
2003    6726
2016    6624
2011    6464
2008    5945
2019    5777
2012    5666
2005    5620
2001    5445
2004    5442
2014    4909
2018    4701
2006    4554
2010    4247
2002    2843
Name: YEAR, dtype: int64

Qtd Total de registros:113154 


In [71]:
# Print the per-column dtype / non-null summary of the cleaned table.
discursos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113154 entries, 0 to 113153
Data columns (total 8 columns):
FILE                113154 non-null object
ORIGINAL_CONTENT    113154 non-null object
CLEAN_CONTENT       113154 non-null object
SESSION_TYPE        113154 non-null object
SESSION_DATE        113154 non-null object
MONTH               113154 non-null int64
YEAR                113154 non-null int64
SPEAKER_COUNT       113154 non-null int64
dtypes: int64(3), object(5)
memory usage: 7.8+ MB


In [72]:
## Save the cleaned data to a new CSV file (row index not written).
discursos.to_csv(os.path.join(directory, 'discursos_plen_limpo.csv'), index = False)