### Sentiment with LSTM

In [144]:
# Dependancies
import pandas as pd
import os

In [145]:
# Read preprocessed data into dataframe
file = os.path.join('..', 'Output', 'tweets1.csv' )
tweet_df = pd.read_csv(file)
print(tweet_df.count())

# Drop Null values from data frame
tweet_df = tweet_df.dropna(subset=['Tokenized'])
print(tweet_df.count())

tweet_df.head()

ItemID       99989
Sentiment    99989
Tokenized    97042
dtype: int64
ItemID       97042
Sentiment    97042
Tokenized    97042
dtype: int64


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put min
4,5,0,think mi cheating


In [146]:
# Add sequence vectors 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 2000

print(f'Sentiment 1 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 1].size))
print(f'Sentiment 0 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 0].size))

docs = tweet_df.Tokenized
tokenizer = Tokenizer(num_words=max_fatures, split=" ")
tokenizer.fit_on_texts(docs)

#making sequences:
X = tokenizer.texts_to_sequences(docs.values)
X = pad_sequences(X)
tweet_df.head()

Sentiment 1 count: 163878
Sentiment 0 count: 127248


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put min
4,5,0,think mi cheating


In [147]:
# Build model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 35, 128)           256000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 35, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [148]:
# Split data into test, train sets
from sklearn.model_selection import train_test_split

y = tweet_df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_train),
(len(X_train[y_train == 0]) / (len(X_train)*1.))*100,
(len(X_train[y_train == 1]) / (len(X_train)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_test),
(len(X_test[y_test == 0]) / (len(X_test)*1.))*100,
(len(X_test[y_test == 1]) / (len(X_test)*1.))*100))

#X_train, X_test, y_train, y_test = train_test_split( tweet_df['SequenceVector'], tweet_df['Sentiment'], test_size = 0.33, random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Train set has total 65018 entries with 43.73% negative, 56.27% positive
Test set has total 32024 entries with 43.66% negative, 56.34% positive
(65018, 35) (65018,)
(32024, 35) (32024,)


In [149]:
# Train model
#batch_size = 32
#model.fit(X_train, y_train, epochs = 7, batch_size=batch_size, verbose = 2)

# Load trained model from file
model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
model.load_weights(model_file)

In [150]:
# Save model
#model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
#model.save_weights(model_file)

In [151]:
# Validation
validation_size = 5000

X_validate = X_test[-validation_size:]
Y_validate = y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

In [189]:
# Results summary
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
import numpy as np

y_prediction = []

pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for i in range(len(X_test)):
        
    result = model.predict(X_test[i].reshape(1,X_test[i].shape[0]),batch_size=1,verbose = 2)[0]

    y_prediction.append(np.argmax(result))
    
    if np.argmax(result) == fck[i]:
        if fck[i] == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if fck[i] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1


print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 82.47325536278477 %
neg_acc 60.981191446756775 %


In [190]:
# Result
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_prediction))
print(classification_report(y_test, y_prediction))
print(accuracy_score(y_test, y_prediction))

[[ 8527  5456]
 [ 3162 14879]]
              precision    recall  f1-score   support

           0       0.73      0.61      0.66     13983
           1       0.73      0.82      0.78     18041

   micro avg       0.73      0.73      0.73     32024
   macro avg       0.73      0.72      0.72     32024
weighted avg       0.73      0.73      0.73     32024

0.7308893330002498
