### Sentiment with LSTM

In [1]:
# Dependancies
import pandas as pd
import os

In [2]:
# Read preprocessed data into dataframe
file = os.path.join('..', 'Output', 'tweets1.csv' )
tweet_df = pd.read_csv(file)
print(tweet_df.count())

# Drop Null values from data frame
tweet_df = tweet_df.dropna(subset=['Tokenized'])
print(tweet_df.count())

tweet_df.head()

ItemID       99989
Sentiment    99989
Tokenized    97028
dtype: int64
ItemID       97028
Sentiment    97028
Tokenized    97028
dtype: int64


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [3]:
# Add sequence vectors 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 2000

print(f'Sentiment 1 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 1].size))
print(f'Sentiment 0 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 0].size))

docs = tweet_df.Tokenized
tokenizer = Tokenizer(num_words=max_fatures, split=" ")
tokenizer.fit_on_texts(docs)

#making sequences:
X = tokenizer.texts_to_sequences(docs.values)
X = pad_sequences(X)
tweet_df.head()

Using TensorFlow backend.


Sentiment 1 count: 163857
Sentiment 0 count: 127227


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [4]:
# Build model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 35, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# Split data into test, train sets
from sklearn.model_selection import train_test_split

y = tweet_df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_train),
(len(X_train[y_train == 0]) / (len(X_train)*1.))*100,
(len(X_train[y_train == 1]) / (len(X_train)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_test),
(len(X_test[y_test == 0]) / (len(X_test)*1.))*100,
(len(X_test[y_test == 1]) / (len(X_test)*1.))*100))

#X_train, X_test, y_train, y_test = train_test_split( tweet_df['SequenceVector'], tweet_df['Sentiment'], test_size = 0.33, random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Train set has total 65008 entries with 43.78% negative, 56.22% positive
Test set has total 32020 entries with 43.56% negative, 56.44% positive
(65008, 35) (65008,)
(32020, 35) (32020,)


In [13]:
# Train model
batch_size = 64
model.fit(X_train, y_train, epochs = 6, batch_size=batch_size, verbose = 2)

# Save model
model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
model.save_weights(model_file)

# Load trained model from file
#model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
#model.load_weights(model_file)

Epoch 1/8
 - 94s - loss: 0.4166 - accuracy: 0.8006
Epoch 2/8
 - 97s - loss: 0.4054 - accuracy: 0.8096
Epoch 3/8
 - 98s - loss: 0.3961 - accuracy: 0.8130
Epoch 4/8
 - 96s - loss: 0.3865 - accuracy: 0.8189
Epoch 5/8
 - 97s - loss: 0.3779 - accuracy: 0.8230
Epoch 6/8
 - 97s - loss: 0.3701 - accuracy: 0.8277
Epoch 7/8
 - 96s - loss: 0.3630 - accuracy: 0.8316
Epoch 8/8
 - 97s - loss: 0.3587 - accuracy: 0.8348


In [17]:
# Validation
validation_size = 5000

aX_validate = X_test[-validation_size:]
aY_validate = y_test[-validation_size:]
aX_test = X_test[:-validation_size]
aY_test = y_test[:-validation_size]
score,acc = model.evaluate(aX_test, aY_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.67
acc: 0.71


In [18]:
# Results summary
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
import numpy as np

y_prediction = []

fck = list(y_test)

pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for i in range(len(X_test)):
        
    result = model.predict(X_test[i].reshape(1,X_test[i].shape[0]),batch_size=1,verbose = 2)[0]

    y_prediction.append(np.argmax(result))
    
    if np.argmax(result) == fck[i]:
        if fck[i] == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if fck[i] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1


print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 76.77198030210812 %
neg_acc 63.84885638488564 %


In [20]:
# Result
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_prediction))
print(classification_report(y_test, y_prediction))
print(accuracy_score(y_test, y_prediction))

[[ 8905  5042]
 [ 4198 13875]]
              precision    recall  f1-score   support

           0       0.68      0.64      0.66     13947
           1       0.73      0.77      0.75     18073

   micro avg       0.71      0.71      0.71     32020
   macro avg       0.71      0.70      0.70     32020
weighted avg       0.71      0.71      0.71     32020

0.7114303560274828
