# Desafio de análise de sentimento do Twitter - Kaggle:
link: https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview

In [12]:
import pandas as pd
import numpy as np
import unidecode
import re

In [13]:
# Load the training CSV; the path is relative to the notebook's working directory.
df_treino = pd.read_csv('dataset/train.csv')
# Preview the first three rows.
df_treino.head(3)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative


In [14]:
# Size of the training dataset:
df_treino.shape

(27481, 4)

In [15]:
# Show each column's dtype and non-null count.
df_treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [16]:
# Move textID into the index so it is no longer one of the columns that the
# per-column string normalization loops over.
# NOTE: re-running this cell alone raises KeyError, because textID is no longer a column.
df_treino = df_treino.set_index('textID')

In [17]:
# Normalize every column: trim leading/trailing whitespace, then lowercase.
for coluna in df_treino.columns:
    df_treino[coluna] = df_treino[coluna].str.strip().str.lower()
df_treino.head(3)

Unnamed: 0_level_0,text,selected_text,sentiment
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral
549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
088c60f138,my boss is bullying me...,bullying me,negative


In [18]:
# Count the distinct texts per sentiment and each sentiment's share of that total.
qtd_por_sentimento = df_treino.groupby('sentiment')['text'].nunique()
df_treino_qtd_sentment = qtd_por_sentimento.to_frame(name='Qtd de mensagens')
df_treino_qtd_sentment['% do total'] = (
    100 * (qtd_por_sentimento / qtd_por_sentimento.sum())
).round(2)
df_treino_qtd_sentment

Unnamed: 0_level_0,Qtd de mensagens,% do total
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,7774,28.36
neutral,11101,40.49
positive,8539,31.15


In [19]:
# Are there any rows with NAs?
df_treino.isna().sum()

text             1
selected_text    1
sentiment        0
dtype: int64

In [20]:
# Only one row contains NAs, so it is dropped:
df_treino = df_treino.dropna()
df_treino.head(3)

Unnamed: 0_level_0,text,selected_text,sentiment
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral
549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
088c60f138,my boss is bullying me...,bullying me,negative


In [21]:
# Remove URLs and HTML code:
# Matches HTML comments and tag-shaped tokens only: "<", an optional "/", "!" or
# "?", then a letter. A bare "<.*?>" would also swallow ordinary tweet text that
# happens to sit between a "<" and a later ">", e.g. "i <3 you ... >_<".
_HTML_TAG_RE = re.compile(r'<!--.*?-->|<[/!?]?[a-zA-Z][^<>]*>', re.DOTALL)


def remover_html(text):
    """Return ``text`` with HTML tags and HTML comments removed.

    Parameters
    ----------
    text : str
        Raw string that may contain HTML markup.

    Returns
    -------
    str
        ``text`` with every tag-like token (``<b>``, ``</a>``, ``<br/>``,
        ``<!doctype ...>``) and every ``<!-- ... -->`` comment deleted.
        A ``<`` that does not start a tag (such as the heart ``<3``) is kept.
    """
    return _HTML_TAG_RE.sub('', text)

# Link pattern (scheme://host[.host...][/path...]), compiled once and shared by
# both columns instead of being repeated inline.
PADRAO_LINK = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')


def limpar_texto(texto):
    """Clean a single string.

    Steps, in order: pass the string through ``unidecode.unidecode``, remove
    links, remove HTML via ``remover_html``, delete every ``-`` character and
    trim surrounding whitespace.

    Parameters
    ----------
    texto : str
        One value from the ``text`` or ``selected_text`` column.

    Returns
    -------
    str
        The cleaned string.
    """
    texto = unidecode.unidecode(texto)
    texto = PADRAO_LINK.sub('', texto)  # strip links
    texto = remover_html(texto)  # strip HTML code
    return texto.replace('-', '').strip()


# Apply exactly the same cleaning to the full tweet and to the selected span.
for coluna in ('text', 'selected_text'):
    df_treino[coluna] = df_treino[coluna].apply(limpar_texto)
df_treino.head()

Unnamed: 0_level_0,text,selected_text,sentiment
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral
549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
088c60f138,my boss is bullying me...,bullying me,negative
9642c003ef,what interview! leave me alone,leave me alone,negative
358bd9e861,"sons of ****, why couldn`t they put them on th...","sons of ****,",negative


## Fazendo a vetorização das palavras:

In [22]:
# Collect the tweet texts into a plain Python list and preview the first ten.
words = df_treino['text'].tolist()
words[:10]

['i`d have responded, if i were going',
 'sooo sad i will miss you here in san diego!!!',
 'my boss is bullying me...',
 'what interview! leave me alone',
 'sons of ****, why couldn`t they put them on the releases we already bought',
 'some shameless plugging for the best rangers forum on earth',
 '2am feedings for the baby are fun when he is all smiles and coos',
 'soooo high',
 'both of you',
 'journey!? wow... u just became cooler.  hehe... (is that possible!?)']

### Próximos passos:
vetorização;

calcular a similaridade de cosseno entre os vetores das palavras;

fazer a comparação das distâncias;

analisar melhor os dados antes de executar o machine learning usando H2O.