# Análise de Sentimentos com LSTM

## Importações

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [27]:
Tweets = pd.read_csv('Tweets.csv')
Tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [28]:
Tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [29]:
Tweets = Tweets[Tweets['airline_sentiment_confidence'] > 0.8]

In [30]:
Tweets.shape

(10459, 15)

## Pré Processamento dos Dados

In [31]:
token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values) # Criando modelo

In [32]:
X = token.texts_to_sequences(Tweets['text'].values) # tokens gerados
X = pad_sequences(X, padding="post", maxlen=100) # completar os tokens com 0 a direita

In [33]:
print(X)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [34]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [35]:
y_categorical = to_categorical(y, num_classes=3)
print(y_categorical)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


- Separação das variáveis

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [37]:
X_test

array([[13, 95, 35, ...,  0,  0,  0],
       [13,  1,  0, ...,  0,  0,  0],
       [ 8, 69,  9, ...,  0,  0,  0],
       ...,
       [73, 10, 54, ...,  0,  0,  0],
       [16, 32,  7, ...,  0,  0,  0],
       [12, 51, 87, ...,  0,  0,  0]], dtype=int32)

In [38]:
y_train = to_categorical(y_train, num_classes=3)  
y_test = to_categorical(y_test, num_classes=3)

## Criação do Modelo

Treinamento será com Embbeding. 

PS:
- Embedding possui três parâmetros principais, sendo:
  - tamanho do vocabulário
  - comprimento do vetor das palavras
  - tamanho máximo da sequência

Dropout: elimina conexões aleatoriamente para reduzir overfitting (segunda camada)

In [39]:
modelo = Sequential()
modelo.add(Embedding(input_dim= len(token.word_index), output_dim=128, input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh',
                recurrent_activation='sigmoid', unroll=False, use_bias=True))
modelo.add(Dense(units=3,activation="softmax"))



In [44]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(modelo.summary())

None


In [41]:
modelo.fit(X_train, y_train, epochs=10, batch_size=30,verbose=True,validation_data=(X_test, y_test))

Epoch 1/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 71ms/step - accuracy: 0.7107 - loss: 0.8164 - val_accuracy: 0.6934 - val_loss: 0.8313
Epoch 2/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 86ms/step - accuracy: 0.7158 - loss: 0.7995 - val_accuracy: 0.6934 - val_loss: 0.8324
Epoch 3/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 87ms/step - accuracy: 0.7101 - loss: 0.8072 - val_accuracy: 0.6934 - val_loss: 0.8414
Epoch 4/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 89ms/step - accuracy: 0.7037 - loss: 0.8174 - val_accuracy: 0.6934 - val_loss: 0.8320
Epoch 5/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 91ms/step - accuracy: 0.7091 - loss: 0.8063 - val_accuracy: 0.6934 - val_loss: 0.8333
Epoch 6/10
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 87ms/step - accuracy: 0.7157 - loss: 0.7963 - val_accuracy: 0.6934 - val_loss: 0.8313
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x1c9d4e0e780>

In [42]:
loss, accuracy = modelo.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.6950 - loss: 0.8314
Loss:  0.834134042263031
Accuracy:  0.6934353113174438


In [43]:
prev = modelo.predict(X_test)
print(prev)

[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
[[0.7384677  0.14057027 0.12096205]
 [0.7384677  0.14057027 0.12096205]
 [0.7384677  0.14057027 0.12096208]
 ...
 [0.7384677  0.14057027 0.12096208]
 [0.7384677  0.1405703  0.12096209]
 [0.7384677  0.1405703  0.12096209]]
