In [291]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, CuDNNLSTM, Dense
import pandas as pd
import re
import numpy as np
import unicodedata
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import SGD


nlp = spacy.load("pt_core_news_sm")

In [292]:
df = pd.read_csv('Tweets_Mg.csv', header=None)
df.columns = ['texto', 'sentimento']
df.head()

Unnamed: 0,texto,sentimento
0,���⛪ @ Catedral de Santo Antônio - Governador ...,Neutro
1,"� @ Governador Valadares, Minas Gerais https:/...",Neutro
2,"�� @ Governador Valadares, Minas Gerais https:...",Neutro
3,��� https://t.co/BnDsO34qK0,Neutro
4,��� PSOL vai questionar aumento de vereadores ...,Negativo


In [342]:
def preprocess(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in STOP_WORDS])
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
    text = re.sub(r'[?!.,@#:]', '', text)
    special_char_pattern = re.compile(r'[\}\}\\\(\)\./!-]')
    text = special_char_pattern.sub(" ", text)  
    
    return text.strip()

In [343]:
df['texto'] = df['texto'].apply(preprocess)
df['sentimento'] = pd.factorize(df.sentimento)[0]
df.head()

Unnamed: 0,texto,sentimento
0,catedral santo antonio governador valadares mg...,0
1,governador valadares minas gerais https tco b3...,0
2,governador valadares minas gerais https tco dp...,0
3,https tco bndso34qk0,0
4,psol questionar aumento vereadores e prefeito ...,1


In [344]:
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(df['texto'])
X = tokenizer.texts_to_sequences(df['texto'])
X = pad_sequences(X)

In [407]:
embed_size = 150
lstm_out = 400
batch_size = 64

model = Sequential()
model .add(Embedding(4000, embed_size, input_length = X.shape[1]))
model .add(LSTM(lstm_out))
model .add(Dense(3, activation='softmax'))

modelCUDA = Sequential()
modelCUDA .add(Embedding(4000, embed_size, input_length = X.shape[1]))
modelCUDA .add(CuDNNLSTM(lstm_out))
modelCUDA .add(Dense(3, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-4, momentum=0.9, nesterov=True)
model.compile(loss = 'categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_60 (Embedding)     (None, 25, 150)           600000    
_________________________________________________________________
cu_dnnlstm_56 (CuDNNLSTM)    (None, 400)               883200    
_________________________________________________________________
dense_106 (Dense)            (None, 3)                 1203      
Total params: 1,484,403
Trainable params: 1,484,403
Non-trainable params: 0
_________________________________________________________________


In [408]:
X_train, X_test, y_train, y_test = train_test_split(X, pd.get_dummies(df['sentimento']), test_size=0.3,random_state=42)

In [409]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=50)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7ff81e7d1f28>

In [410]:
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print('score: {0:.2}'.format(score))
print('acc: {0:.2}'.format(acc))

score: 0.18
acc: 0.96


In [411]:
def deprocess(text):
    text_test = preprocess(text)
    text_test = tokenizer.texts_to_sequences([text])
    text_test = pad_sequences(text_test)
    text_to_array = np.zeros(X.shape[1])
    text_to_array[(text_to_array.shape[0] - text_test.shape[1]):] = text_test
    result = np.argmax(model.predict(text_to_array.reshape(1,X.shape[1])))

    if result == 0:
        print('Neutro')
    elif result == 1:
        print('Negativo')
    else:
        print('Positvo')

In [412]:
deprocess('PM apreende maconha e celular no presidio')

Positvo
