# Regras VS Supervisionado

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Emanuelle\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
Tweets = pd.read_csv("Tweets2.csv")
Tweets.shape

(74682, 4)

In [3]:
Tweets.head()

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
Tweets.groupby(['sentiment']).size()

sentiment
Negative    22542
Neutral     31308
Positive    20832
dtype: int64

## Pré Processamento

In [5]:
Tweets.loc[Tweets['sentiment']=='Irrelevant','sentiment'] = 'Neutral'

In [7]:
Tweets = Tweets.dropna(subset=['text'])
Tweets.reset_index(drop=True, inplace=True)

# Supervisionado

In [8]:
token = Tokenizer(num_words=100)
token.fit_on_texts(Tweets['text'].values)

In [9]:
X = token.texts_to_sequences(Tweets['text'].values)
X = pad_sequences(X, padding="post", maxlen=100)

- LabelEncoder e OneHotEncoder

In [10]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['sentiment'])
print(y)

[2 2 2 ... 2 2 2]


In [12]:
y_categorical = to_categorical(y, num_classes=3)
print(y_categorical)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


- Divisão em treino e teste

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4)
X_test

array([[12, 49, 21, ...,  0,  0,  0],
       [ 3, 79,  0, ...,  0,  0,  0],
       [65,  9, 13, ...,  0,  0,  0],
       ...,
       [57,  2, 76, ...,  0,  0,  0],
       [47, 19, 41, ...,  0,  0,  0],
       [44,  6,  1, ...,  0,  0,  0]], dtype=int32)

In [14]:
y_train = to_categorical(y_train, num_classes=3)  
y_test = to_categorical(y_test, num_classes=3)

- Criação e treinamento do modelo

In [15]:
modelo = Sequential()
modelo.add(Embedding(input_dim= len(token.word_index), output_dim=128, input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh',
                recurrent_activation='sigmoid', unroll=False, use_bias=True))
modelo.add(Dense(units=3,activation="softmax"))



In [19]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(modelo.summary())

None


In [18]:
modelo.fit(X_train, y_train, epochs=5, batch_size=500,verbose=True)

Epoch 1/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 369ms/step - accuracy: 0.4092 - loss: 1.0856
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 347ms/step - accuracy: 0.4167 - loss: 1.0837
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 350ms/step - accuracy: 0.4189 - loss: 1.0827
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 354ms/step - accuracy: 0.4183 - loss: 1.0830
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 354ms/step - accuracy: 0.4188 - loss: 1.0829


<keras.src.callbacks.history.History at 0x1caf227ac30>

In [20]:
_, accuracy = modelo.evaluate(X_test,y_test)
print("Accuracy: ", accuracy)

[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - accuracy: 0.4168 - loss: 1.0832
Accuracy:  0.4204871654510498


# Vader

In [21]:
mas = SentimentIntensityAnalyzer()
Tweets['vander_sentiment'] = ''

for y in range(len(Tweets.index)):
  x = mas.polarity_scores(Tweets['text'].iloc[y]) # armazena o maior score na coluna criada
  del x['compound']
  maior = max(x,key=x.get) #neg pos neu
  Tweets.loc[y,'vander_sentiment'] = maior

In [22]:
Tweets.groupby(['vander_sentiment']).size()

vander_sentiment
neg     3660
neu    65581
pos     4755
dtype: int64

In [23]:
# Alterando os nomes
Tweets.loc[Tweets['vander_sentiment']== 'neu' , 'vander_sentiment'] = 'Neutral'
Tweets.loc[Tweets['vander_sentiment']== 'neg' , 'vander_sentiment'] = 'Negative'
Tweets.loc[Tweets['vander_sentiment']== 'pos' , 'vander_sentiment'] = 'Positive'

In [24]:
Tweets.groupby(['vander_sentiment']).size()

vander_sentiment
Negative     3660
Neutral     65581
Positive     4755
dtype: int64

In [25]:
y_pred = Tweets['vander_sentiment']
y_test = Tweets['sentiment']
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 2004 19902   452]
 [ 1122 28384  1477]
 [  534 17295  2826]]


In [26]:
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.44886210065408944
