In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import np_utils
from google.colab import files 
import numpy as np


In [None]:
files.upload()

In [None]:
tweets = pd.read_csv("Tweets.csv")
tweets.head()

In [None]:
tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [None]:
tweets = tweets[tweets['airline_sentiment_confidence'] > 0.8]


In [None]:
token = Tokenizer(num_words=100)
token.fit_on_texts(tweets['text'].values)


In [None]:
from numpy.lib.arraypad import pad
X = token.texts_to_sequences(tweets['text'].values)
X = pad_sequences(X, padding='post', maxlen=100)

In [None]:
print(X)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [None]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [None]:
y = np_utils.to_categorical(y)
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_test

array([[13,  6, 20, ...,  0,  0,  0],
       [16, 11,  0, ...,  0,  0,  0],
       [13, 47, 47, ...,  0,  0,  0],
       ...,
       [ 8, 11,  8, ...,  0,  0,  0],
       [18, 78, 47, ...,  0,  0,  0],
       [16, 81,  4, ...,  0,  0,  0]], dtype=int32)

In [None]:
from keras.backend import dropout
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index), output_dim=128, input_length=X.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, activation='tanh', recurrent_activation='sigmoid', unroll=False, use_bias=True))
modelo.add(Dense(units=3, activation="softmax"))

In [None]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam', metrics= ['accuracy'])
print(modelo.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 1,894,047
Trainable params: 1,894,047
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
modelo.fit(X_train, y_train, epochs=10, batch_size=30, verbose=True, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f290a29b8d0>

In [None]:
loss, accuracy = modelo.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.7915359735488892
Accuracy:  0.7192479372024536


In [None]:
prev = modelo.predict(X_test)
print(prev)

[[0.6832385  0.16197313 0.1547884 ]
 [0.6832385  0.1619731  0.1547884 ]
 [0.6832385  0.1619731  0.15478839]
 ...
 [0.6832385  0.16197313 0.1547884 ]
 [0.6832385  0.16197313 0.1547884 ]
 [0.6832385  0.16197313 0.1547884 ]]
