# Imports

In [4]:
import pandas as pd
import re
import unidecode
import nltk
from nltk.corpus import stopwords  
import time
import numpy as np
import pathlib
import datetime

tweets_df = pd.read_csv(str(pathlib.Path().absolute()) + '\\datasets\\allTweets.csv')
tweets_df

Unnamed: 0,Username,Created_at,Text
0,DelPatricia19,2020-11-16 01:59:55,"Fizemos história em uma campanha llisa, limpa,..."
1,DelPatricia19,2020-11-15 20:16:33,"@mprandrade Agradeço por seu apoio, vamos juntos!"
2,DelPatricia19,2020-11-15 14:53:51,@Dinilton_a @jandira_feghali Agradecemos o seu...
3,DelPatricia19,2020-11-15 14:53:35,@Dinilton_a @jandira_feghali Agradecemos o seu...
4,DelPatricia19,2020-11-15 11:37:11,@BolsoMito380 Obrigada pelo apoio 🖐️💚
...,...,...,...
2288,mendoncafilho,2020-09-28 15:33:48,"A verdade, para mim, significa confiança. É al..."
2289,mendoncafilho,2020-09-27 19:01:08,Eu queria saber de você: quais problemas na ci...
2290,mendoncafilho,2020-09-27 15:20:17,Mendonça consolida-se como ÚNICO da oposição q...
2291,mendoncafilho,2020-09-27 11:31:16,É vergonhosa a situação em que os Mercados Púb...


# Pre-processing data

In [5]:
nltk.download('stopwords')

stop_words = set(stopwords.words('portuguese')) 

def removeStopwords(text):
    split_text = text.split()
    noStopwordsList = [item for item in split_text if item not in stop_words]
    return " ".join(noStopwordsList)
    
    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joaog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# To lower
tweets_df['Text'] = tweets_df.Text.apply(lambda t: t.lower())

# Unicode tweet (remove accents etc)
tweets_df['Text'] = tweets_df.Text.apply(lambda t: unidecode.unidecode(t))  

#Remove usernames 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: re.sub('(@)[^\s]+','',t))

#Remove urls 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: re.sub('(http)\S+', '', t, flags=re.MULTILINE))

#Remove new line 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: t.replace('\n',''))

#Remove hashtags 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: re.sub('(#)\w+','',t))

#Remove Numbers 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: re.sub('(\d)+','',t))

#Remove special characteres 
tweets_df['Text'] = tweets_df.Text.apply(lambda t: re.sub(r"[^a-zA-Z0-9]+", ' ',t))

#Remove stopwords
tweets_df['Text'] = tweets_df['Text'].apply(lambda t: removeStopwords(t))

#Remove rows without text
tweets_df = tweets_df[tweets_df['Text'].map(lambda l: l != '')]
tweets_df = tweets_df[tweets_df['Text'].map(lambda l: l != ' ')]
  
tweets_df.head()


Unnamed: 0,Username,Created_at,Text
0,DelPatricia19,2020-11-16 01:59:55,fizemos historia campanha llisa limpa so possi...
1,DelPatricia19,2020-11-15 20:16:33,agradeco apoio vamos juntos
2,DelPatricia19,2020-11-15 14:53:51,agradecemos apoio contamos voce caminharmos ju...
3,DelPatricia19,2020-11-15 14:53:35,agradecemos apoio contamos voce caminharmos ju...
4,DelPatricia19,2020-11-15 11:37:11,obrigada apoio


# Generate CSV - First Round

In [8]:
tweets_df['Created_at'] = tweets_df['Created_at'].apply(lambda d: datetime.datetime.strptime(d, '%Y-%m-%d %H:%M:%S'))
until_date = datetime.datetime(2020, 11, 13 ,23, 59, 59)

#Filter by date
firstRound_df = tweets_df[tweets_df.Created_at <= until_date]

firstRound_df.to_csv(str(pathlib.Path().absolute()) + '\\datasets\\firstRound.csv', index=False)

# Generate CSV - Second Round

In [10]:
since_date = datetime.datetime(2020, 11, 16, 0, 0, 0)
until_date = datetime.datetime(2020,11,27 ,23, 59, 59)

# Filter by Date
secondRound_df = tweets_df[tweets_df.Created_at >= since_date]
secondRound_df = secondRound_df[secondRound_df.Created_at <= until_date]

# Filter by Candidates
secondRound_df = secondRound_df[(secondRound_df.Username == 'JoaoCampos') | (secondRound_df.Username == 'MariliaArraes')]

secondRound_df.to_csv(str(pathlib.Path().absolute()) + '\\datasets\\secondRound.csv', index=False)