In [0]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences # para deixar todos os tokens com o mesmo tamanho
from keras.utils import np_utils

In [0]:
# Leitura do arquivo
Tweets = pd.read_csv("/dbfs/FileStore/shared_uploads/cristiane.gea@qcx.com.br/Tweets.csv")
Tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [0]:
# Identificação da quantidade de elementos associados a cada sentimento
Tweets.groupby(['airline_sentiment']).size()

Out[7]: airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [0]:
# Filtragem do dataframe (somente os registros com airline_sentiment_confidence acima de 0.8)
Tweets = Tweets[Tweets['airline_sentiment_confidence'] > 0.8]
Tweets

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)
9,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...",,2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14631,569588464896876545,negative,1.0,Bad Flight,1.0000,American,,MDDavis7,,0,@AmericanAir thx for nothing on getting us out...,,2015-02-22 12:04:07 -0800,US,Eastern Time (US & Canada)
14633,569587705937600512,negative,1.0,Cancelled Flight,1.0000,American,,RussellsWriting,,0,@AmericanAir my flight was Cancelled Flightled...,,2015-02-22 12:01:06 -0800,Los Angeles,Arizona
14636,569587371693355008,negative,1.0,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",


In [0]:
# Tratamento do texto (padding do texto)
token = Tokenizer(num_words = 100)                     # Definição do nº de tokens que será criado
token.fit_on_texts(Tweets['text'].values)              # Criação do modelo
x = token.texts_to_sequences(Tweets['text'].values)    # Geração dos tokens
x = pad_sequences(x, padding='post', maxlen=100)       # Sequenciamento dos paddings
print(x)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [0]:
# Tratamento da classe
labelencoder = LabelEncoder()        # instanciamento do objeto
y = labelencoder.fit_transform(Tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [0]:
# Aplicação do One-Hot-Encoder à variável y
y = np_utils.to_categorical(y)
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [0]:
# Divisão dos dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [0]:
# Montagem das camadas da rede neural
modelo = Sequential()                                                                                  # Instaciamento de um modelo sequencial
modelo.add(Embedding(input_dim= len(token.word_index), output_dim=128, input_length=x.shape[1]))       # Criação da camada de embedding
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units = 196, dropout = 0.2, recurrent_dropout=0, activation='tanh',                    # Criação da camada LSTM
           recurrent_activation='sigmoid', unroll=False, use_bias=True))
                                                                                                       # units = 196 :: nº de neurônios da camada
modelo.add(Dense(units = 3, activation = 'softmax'))                                                   # Criação da camada de saída
                                                                                                       # units = 3 :: nº de neurônios da camada de saída (nº de classes que eu quero prever)

In [0]:
# Compilação do modelo
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(modelo.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 1,894,047
Trainable params: 1,894,047
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# Treinamento da rede neural
modelo.fit(X_train, y_train, epochs=10, batch_size=30,verbose=True,validation_data=(X_test, y_test))

Epoch 1/10


INFO:tensorflow:Assets written to: /tmp/tmpsetuvfks/model/data/model/assets
INFO:tensorflow:Assets written to: /tmp/tmpsetuvfks/model/data/model/assets
Out[15]: <keras.callbacks.History at 0x7f3457068b50>

In [0]:
# Avaliação do modelo
loss, accuracy = modelo.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.8021494150161743
Accuracy:  0.7125557661056519


In [0]:
# Previsão de novos dados
prev = modelo.predict(X_test)
print(prev)

[[0.67939717 0.16802321 0.1525796 ]
 [0.67939717 0.16802321 0.15257959]
 [0.67939717 0.16802321 0.1525796 ]
 ...
 [0.67939717 0.16802321 0.15257959]
 [0.67939717 0.16802321 0.1525796 ]
 [0.67939717 0.16802321 0.15257959]]
