<a href="https://colab.research.google.com/github/flaviocrispin/twitter_analisys/blob/main/2_tratamento_e_limpeza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tratamento e limpeza dos dados
- Carregamento dos dados
- Remoção de caracteres especiais (menções com @, URLs, hashtags etc.)
- Stopwords
- Conversão de emojis e emoticons em sentimentos
- Stemming
- Tokenização
- Salvar o novo DataFrame no Google Drive


## 1. Exploração dos dados

In [1]:
#@title Connect google colab with google drive

# Mount Google Drive at /content/drive; every PATH_* constant used later in
# this notebook points inside that mount.
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


In [2]:
#@title Libraries imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session. Note that this
# also hides pandas warnings (e.g. SettingWithCopyWarning).
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

### 1.1 - Transform and load data

In [7]:
#@title Data transform

# Project folders on the mounted Google Drive. Each value carries a Colab
# `#@param` form annotation. PATH_FOLDER has no trailing slash, while the
# three sub-folder paths end with '/' and are used by plain concatenation.
PATH_FOLDER = '/content/drive/MyDrive/Colab Notebooks/project_politics' #@param {type: 'string'}
PATH_IMAGES = '/content/drive/MyDrive/Colab Notebooks/project_politics/img/' #@param {type: 'string'}
PATH_MODELS = '/content/drive/MyDrive/Colab Notebooks/project_politics/models/' #@param {type: 'string'}
PATH_DATA = '/content/drive/MyDrive/Colab Notebooks/project_politics/data/' #@param {type: 'string'}

# Columns read from each raw CSV extract (passed to read_csv as `usecols`).
colunas = ['date','username', 'place','tweet','language','replies_count',
           'retweets_count','likes_count','hashtags']


def open_data (candidato):
  """Load the raw tweet extract of one candidate, keeping Portuguese tweets.

  Reads ``<PATH_DATA>raw/<candidato>_tweets_extract.csv`` (only the columns
  listed in ``colunas``), keeps the rows whose ``language`` is ``'pt'``, adds
  a constant ``qtd_tweets`` column equal to 1, parses ``date`` as datetime and
  renames the engagement counters.

  Args:
    candidato: name fragment used to build the CSV file name
      (e.g. ``'lula'``).

  Returns:
    A new DataFrame with the columns of ``colunas`` where ``replies_count``,
    ``retweets_count`` and ``likes_count`` are renamed to ``respostas``,
    ``compartilhamento`` and ``likes``, plus ``qtd_tweets``.
  """
  dataframe = pd.read_csv(PATH_DATA +
                          'raw/{}_tweets_extract.csv'.format(candidato),
                          usecols = colunas)

  # The new columns are created with .assign() on the filtered frame, which
  # returns a fresh DataFrame. Assigning columns directly on the result of
  # .loc[...] is chained assignment on a slice (SettingWithCopyWarning).
  df_data = (
      dataframe.loc[dataframe['language'] == 'pt']
      .assign(qtd_tweets=1,
              date=lambda frame: pd.to_datetime(frame['date']))
      #.loc[lambda frame: frame['date'].between('2022-03-23', '2022-03-30')]
      .rename(columns={'replies_count': 'respostas',
                       'retweets_count': 'compartilhamento',
                       'likes_count': 'likes'})
  )
  return df_data


def group_data_by_time (candidato):
  """Return the summed ``qtd_tweets`` per distinct ``date`` for one candidate.

  The candidate's data is loaded with ``open_data``; the result has one row
  per ``date`` value with the columns ``date`` and ``qtd_tweets``.
  """
  tweets = open_data (candidato)
  totals_by_date = tweets.groupby(by='date')['qtd_tweets'].sum()
  return totals_by_date.reset_index()



def data_joint (data1, data2, data3, data4):
  """Aggregate four candidates' tweets per date and stack them in one frame.

  Each input frame is grouped by ``date`` (summing ``qtd_tweets``), tagged
  with a ``candidato`` column and concatenated under an outer index key with
  the same candidate name.

  The frames passed as arguments are the ones aggregated. The function does
  not read the notebook globals ``df1``..``df4``.

  Args:
    data1: Bolsonaro frame; must contain ``date`` and ``qtd_tweets``.
    data2: Lula frame, same columns.
    data3: Ciro frame, same columns.
    data4: Moro frame, same columns.

  Returns:
    DataFrame with columns ``date``, ``qtd_tweets`` and ``candidato`` and a
    two-level index whose first level is the candidate name.
  """
  candidatos = ['Bolsonaro', 'Lula', 'Ciro', 'Moro']

  agrupados = []
  for nome, dados in zip(candidatos, (data1, data2, data3, data4)):
    por_data = dados.groupby(by='date')['qtd_tweets'].sum().reset_index()
    por_data['candidato'] = nome
    agrupados.append(por_data)

  data = pd.concat(agrupados, keys=candidatos)

  return data

def data_joint_tweet_types (data1, data2, data3, data4):
  """Concatenate four candidate frames under candidate-name index keys.

  Args:
    data1: frame stored under the key ``'Bolsonaro'``.
    data2: frame stored under the key ``'Lula'``.
    data3: frame stored under the key ``'Ciro'``.
    data4: frame stored under the key ``'Moro'``.

  Returns:
    The concatenated DataFrame; the first index level is the candidate name.
  """
  data = pd.concat([data1, data2, data3, data4],
                   keys=['Bolsonaro', 'Lula', 'Ciro', 'Moro'])

  # Return the concatenated frame. The name `plot` was never defined, so
  # returning it raised NameError on every call.
  return data


In [10]:
#@title Load CSV files

# One filtered dataframe per candidate: df1 = Bolsonaro, df2 = Lula, df3 = Ciro.
df1 = open_data(candidato='bolsonaro')
df2 = open_data(candidato='lula')
df3 = open_data(candidato='ciro')

### 1.2 Data Cleaning

In [13]:
#@title NLTK and EMOT install 
# Import NLTK and download the 'stopwords' and 'rslp' resources.
import nltk
nltk.download('stopwords')
nltk.download('rslp')


# Install the emot library (pip output is redirected to /dev/null).
!pip install emot  &> /dev/null

# Install unidecode and autocorrect (pip output is redirected to /dev/null).
!pip install unidecode &> /dev/null
!pip install autocorrect &> /dev/null

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [12]:
#@title Libraries import

import datetime as dt
import numpy as np
import string
import unidecode

# Module-level NLTK RSLP stemmer instance.
stemmer = nltk.stem.RSLPStemmer()

# Text-cleaning helpers.
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords


# Library used to translate emojis and emoticons into text.
import emot
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS


dictionary for this language not found, downloading...
__________________________________________________
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

In [14]:
#@title Transform EMOJIS in sentiment

# criar função para armazenar lista de emojis e emoticons

# emojis = []
# emotis = []
# emot_obj = emot.core.emot()
# lista_comentarios = df3['tweet'].tolist()

# def localiza_emoji_emoti(text):  
#     emoji = emot_obj.emoji(text)
#     emoti = emot_obj.emoticons(text)
    
#     if emoji['flag'] == True:
#         emojis.append(emoji['value'])
        
#     try:
#         if emoti['flag'] == True:
#             emotis.append(emoti['value'])
#     except:
#         emotis.append('nada')

# # aplicando a função na lista de comentários
# for txt in lista_comentarios:
#     localiza_emoji_emoti(txt)

# # removendo duplicidade dos emojis e emoticos
# lista_emojis = []
# lista_emotis = []

# for linha in emojis:
#     for emoji in linha:
#         if emoji not in lista_emojis:
#             lista_emojis.append(emoji)
            
# for linha in emotis:
#     for emoti in linha:
#         if emoti != 'nada' and emoti not in lista_emotis:
#             lista_emotis.append(emoti)
            
# len(lista_emojis), len(lista_emotis)

# Two dictionaries with the sentiment interpretation: one for emojis, one for
# emoticons.
# dict_emojis maps an emoji name to a sentiment word: 'ótimo' (great) or
# 'ruim' (bad). pre_processing looks tokens up in this dict after replacing
# emojis through UNICODE_EMOJI.
# NOTE(review): keys presumably are UNICODE_EMOJI names without the
# surrounding ':' — confirm against the installed emot version.
dict_emojis = {
    'exclamation_question_mark': 'ruim',
    'person_pouting': 'ruim',
    'kiss_mark': 'ótimo',
    'upside-down_face': 'ótimo',
    'smiling_face_with_open_mouth_&_smiling_eyes': 'ótimo',
    'love_letter': 'ótimo',
    'rose': 'ótimo',
    'angry_face_with_horns': 'ruim',
    'yellow_heart': 'ótimo',
    'blue_heart': 'ótimo',
    'green_heart': 'ótimo',
    'relieved_face': 'ótimo',
    'trophy': 'ótimo',
    'expressionless_face': 'ruim',
    'slightly_smiling_face': 'ótimo',
    'nauseated_face': 'ruim',
    'face_with_stuck-out_tongue_&_winking_eye': 'ótimo',
    'OK_hand': 'ótimo',
    'neutral_face': 'ruim',
    'person_shrugging': 'ruim',
    'weary_face': 'ruim',
    'heart_with_arrow': 'ótimo',
    'grimacing_face': 'ruim',
    'sleepy_face': 'ruim',
    'pig_face': 'ruim',
    'thinking_face': 'ruim',
    'loudly_crying_face': 'ruim',
    'blossom': 'ótimo',
    'face_with_cold_sweat': 'ruim',
    'crying_cat_face': 'ruim',
    'unamused_face': 'ruim',
    'disappointed_but_relieved_face': 'ruim',
    'smiling_face': 'ótimo',
    'face_screaming_in_fear': 'ruim',
    'face_with_steam_from_nose': 'ruim',
    'broken_heart': 'ruim',
    'see-no-evil_monkey': 'ruim',
    'two_hearts': 'ótimo',
    'growing_heart': 'ótimo',
    'slightly_frowning_face': 'ruim',
    'crying_face': 'ruim',
    'dizzy': 'ruim',
    'smiling_face_with_open_mouth_&_closed_eyes': 'ótimo',
    'victory_hand': 'ótimo',
    'face_with_rolling_eyes': 'ruim',
    'revolving_hearts': 'ótimo',
    'smiling_face_with_open_mouth': 'ótimo',
    'rolling_on_the_floor_laughing': 'ótimo',
    'pensive_face': 'ruim',
    'dizzy_face': 'ruim',
    'angry_face': 'ruim',
    'confused_face': 'ruim',
    'smiling_face_with_open_mouth_&_cold_sweat': 'ótimo',
    'smirking_face': 'ótimo',
    'smiling_face_with_sunglasses': 'ótimo',
    'face_with_tears_of_joy': 'ótimo',
    'white_medium_star': 'ótimo',
    'thumbs_down': 'ruim',
    'red_heart': 'ótimo',
    'clapping_hands': 'ótimo',
    'smiling_face_with_halo': 'ótimo',
    'purple_heart': 'ótimo',
    'smiling_face_with_heart-eyes': 'ótimo',
    'heart_suit': 'ótimo',
    'hugging_face': 'ótimo',
    'glowing_star': 'ótimo',
    'smiling_face_with_smiling_eyes': 'ótimo',
    'grinning_face_with_smiling_eyes': 'ótimo',
    'thumbs_up': 'ótimo',
    'face_blowing_a_kiss': 'ótimo',
    'winking_face': 'ótimo',
    'smiling_face_with_hearts': 'ótimo',
    'clown_face': 'ruim'
}

# dict_emotis maps an emoticon description to a sentiment word: 'ótimo'
# (great) or 'ruim' (bad).
# NOTE(review): the keys look like the description strings of emot's
# EMOTICONS_EMO (including the "andry" spelling) — confirm before editing them.
dict_emotis = {
    'Wink or smirk': 'ótimo',
    'Happy face or smiley': 'ótimo',
    'Tongue sticking out, cheeky, playful or blowing a raspberry': 'ótimo',
    'Frown, sad, andry or pouting': 'ruim',
    'Skeptical, annoyed, undecided, uneasy or hesitant': 'ruim'
}

# Number of entries in each dictionary.
len(dict_emojis), len(dict_emotis)
# Of the 105 emojis found, 73 were translated (dict_emojis has 73 entries).
# All emoticons were translated.

(73, 5)

In [16]:
#@title Pre processing

# Clean the tweets

# Noise removed from the raw tweet before any other processing.
_NOISE_PATTERN = re.compile(r'|'.join((
    r'@[^ ]+',                    # @mentions
    r'https?://[A-Za-z0-9./]+',   # URLs
    r'\'s',                       # "'s" suffix
    r'\#\w+',                     # hashtags
    r'&amp ',                     # HTML-escaped ampersand followed by a space
    r'(.)\1{3,}',                 # same character repeated four or more times
)))

# Informal words removed in addition to NLTK's Portuguese stop-word list.
_EXTRA_STOPWORDS = ['q','vc', 'pq', 'rs', 's', 'fud', 'pra', 'ei',
                    'tá', 'vai', 'pa', 've', 'ta', 'é', 'to', 'tb',
                    'ir', 'n', 'p', 'ai', 'vei']

_STOPWORDS_CACHE = None  # filled on first use by _get_stopwords()


def _get_stopwords():
  """Return the stop-word set, building it only on the first call."""
  global _STOPWORDS_CACHE
  if _STOPWORDS_CACHE is None:
    words = nltk.corpus.stopwords.words('portuguese') + _EXTRA_STOPWORDS
    # Tweet words are accent-stripped before the comparison, so the
    # accent-stripped form of every stop word is stored as well; otherwise
    # entries such as 'não', 'é' or 'tá' could never match.
    _STOPWORDS_CACHE = (frozenset(words) |
                        frozenset(unidecode.unidecode(w) for w in words))
  return _STOPWORDS_CACHE


def pre_processing(text):
  """Clean one tweet and turn emojis/emoticons into sentiment words.

  Steps, in order:
    1. Remove mentions, URLs, "'s", hashtags, '&amp ' and runs of four or
       more identical characters.
    2. Replace every emoji listed in ``UNICODE_EMOJI`` by its name (colons
       dropped). This runs before accent/punctuation stripping, because
       stripping first deletes the emoji characters and the punctuation
       inside the emoji names, so nothing would ever be translated.
    3. For each whitespace-separated token:
         - an emoji name becomes ``dict_emojis[name]`` when mapped, else the
           name is kept as is;
         - an emoticon whose ``EMOTICONS_EMO`` description is a key of
           ``dict_emotis`` becomes that sentiment word;
         - anything else is lower-cased, transliterated to ASCII with
           ``unidecode`` and stripped of ``string.punctuation``.
    4. Drop stop words (NLTK Portuguese list plus the informal extras,
       compared with and without accents).

  Args:
    text: raw tweet text. Any non-string value (e.g. NaN) yields ``''``.

  Returns:
    The cleaned tweet as a single space-separated string.
  """
  if not isinstance(text, str):
    return ''

  text = _NOISE_PATTERN.sub("", text)

  # Translate emojis to " name " and remember which names were inserted, so
  # only real emojis (not ordinary words) are mapped to a sentiment below.
  emoji_names = set()
  for emoji_char, emoji_code in UNICODE_EMOJI.items():
    if emoji_char in text:
      name = emoji_code.replace(':', '')
      emoji_names.add(name)
      text = text.replace(emoji_char, ' ' + name + ' ')

  palavras = []
  for token in text.split():
    if token in emoji_names:
      palavras.append(dict_emojis.get(token, token))
      continue

    # Emoticons are matched as whole tokens, before lower-casing, because
    # several of them are case-sensitive (':P', ':D', ...).
    sentimento = dict_emotis.get(EMOTICONS_EMO.get(token))
    if sentimento is not None:
      palavras.append(sentimento)
      continue

    limpo = unidecode.unidecode(token.lower())
    limpo = "".join(ch for ch in limpo if ch not in string.punctuation)
    # unidecode may introduce spaces; split() also discards empty results.
    palavras.extend(limpo.split())

  stop_words = _get_stopwords()
  return " ".join(p for p in palavras if p not in stop_words)


# Apply stemming to the cleaned text.
_RSLP_STEMMER = None  # shared stemmer, created on first use


def _get_rslp_stemmer():
    """Return the shared RSLPStemmer, creating it only on the first call."""
    global _RSLP_STEMMER
    if _RSLP_STEMMER is None:
        _RSLP_STEMMER = nltk.stem.RSLPStemmer()
    return _RSLP_STEMMER


def Stemming(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's RSLP stemmer.

    A single stemmer instance is reused across calls instead of building a
    new one for every tweet.

    Args:
        text: cleaned tweet text.

    Returns:
        The stemmed words joined by single spaces ('' for empty input).
    """
    stemmer = _get_rslp_stemmer()
    return " ".join(stemmer.stem(w) for w in text.split())

In [17]:
#@title pre processing at bolsonaro DATASET
'''
pre processing at Bolsonaro dataset

'''
# Clean every raw tweet, then stem the cleaned text.
df1["cleaned_tweet"] = df1["tweet"].apply(pre_processing)
df1['steemer_twitter'] = df1['cleaned_tweet'].apply(Stemming)

In [18]:
#@title pre processing at Lula DATASET
'''
pre processing in Lula dataset
time estimate ~20 minutes
'''
# Clean every raw tweet, then stem the cleaned text.
df2["cleaned_tweet"] = df2["tweet"].apply(pre_processing)
df2['steemer_twitter'] = df2['cleaned_tweet'].apply(Stemming)

In [19]:
#@title pre processing at Ciro DATASET
'''
pre processing in Ciro Gomes dataset
time estimate 1 minute and 24 seconds
'''
# Clean every raw tweet, then stem the cleaned text.
df3["cleaned_tweet"] = df3["tweet"].apply(pre_processing)
df3['steemer_twitter'] = df3['cleaned_tweet'].apply(Stemming)

In [20]:
#@title Save the datasets cleaned in google drive
# Write each cleaned dataframe under <PATH_DATA>interim as a ';'-separated
# CSV without the index column.
for frame, file_name in ((df1, '/bolsonaro_tratado.csv'),
                         (df2, '/lula_tratado.csv'),
                         (df3, '/ciro_tratado.csv')):
  frame.to_csv(PATH_DATA + 'interim' + file_name, sep=';', index=False)