### Sentiment with LSTM

In [28]:
# Dependancies
import pandas as pd
import os

In [29]:
# Read preprocessed data into dataframe
file = os.path.join('..', 'Output', 'tweets1.csv' )
tweet_df = pd.read_csv(file)
print(tweet_df.count())

# Drop Null values from data frame
tweet_df = tweet_df.dropna(subset=['Tokenized'])
print(tweet_df.count())

tweet_df.head()

ItemID       99989
Sentiment    99989
Tokenized    97028
dtype: int64
ItemID       97028
Sentiment    97028
Tokenized    97028
dtype: int64


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [30]:
# Add sequence vectors 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 2000

print(f'Sentiment 1 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 1].size))
print(f'Sentiment 0 count: ' + str(tweet_df[ tweet_df['Sentiment'] == 0].size))

docs = tweet_df.Tokenized
tokenizer = Tokenizer(num_words=max_fatures, split=" ")
tokenizer.fit_on_texts(docs)

#making sequences:
X = tokenizer.texts_to_sequences(docs.values)
X = pad_sequences(X)
tweet_df.head()

Sentiment 1 count: 163857
Sentiment 0 count: 127227


Unnamed: 0,ItemID,Sentiment,Tokenized
0,1,0,sad friend
1,2,0,miss new moon trailer
2,3,1,already
3,4,0,cry dentist since get crown put
4,5,0,think mi cheating


In [36]:
# Build model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
#model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
#model.add(Dense(1,activation='softmax'))
#model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 35, 128)           256000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 35, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
# Split data into test, train sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

y = tweet_df.Sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Scaling
#X_minmax = MinMaxScaler().fit(X_train)
#X_train_minmax = X_minmax.transform(X_train)
#X_test_minmax = X_minmax.transform(X_test)

#print(X_train_minmax)

print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_train),
(len(X_train[y_train == 0]) / (len(X_train)*1.))*100,
(len(X_train[y_train == 1]) / (len(X_train)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_test),
(len(X_test[y_test == 0]) / (len(X_test)*1.))*100,
(len(X_test[y_test == 1]) / (len(X_test)*1.))*100))

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

Train set has total 72771 entries with 43.80% negative, 56.20% positive
Test set has total 24257 entries with 43.42% negative, 56.58% positive
(72771, 35) (72771,)
(24257, 35) (24257,)


In [38]:
# Train model
batch_size = 32
#model.fit(X_train, y_train, epochs = 5, batch_size=batch_size, verbose = 2)

# Save model
#model_file = os.path.join('..', 'Output', 'tweets1_model.h5')
#model.save_weights(model_file)

# Load trained model from file
model_file = os.path.join('..', 'Output', 'tweets1_model_p1.h5')
model.load_weights(model_file)

In [39]:
# split into train and test
#n_train = 500
#trainX, testX = X[:n_train, :], X[n_train:, :]
#trainy, testy = y[:n_train], y[n_train:]

# Validation
#validation_size = 5000

#aX_validate = X_test[-validation_size:]
#aY_validate = y_test[-validation_size:]
#aX_test = X_test[:-validation_size]
#aY_test = y_test[:-validation_size]
#score,acc = model.evaluate(aX_test, aY_test, verbose = 2, batch_size = batch_size)
score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print(f"Test Samples Count : {len(X_test)}")
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

Test Samples Count : 24257
score: 0.54
acc: 0.73


In [49]:
print(X_test[0].reshape(1,X_test[i].shape[0]).shape)
print(X_test[0].shape)

(1, 35)
(35,)


In [52]:
# Results summary
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
import numpy as np

y_prediction = []

fck = list(y_test)

pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for i in range(len(X_test)):
        
    result = model.predict(X_test[i].reshape(1,X_test[i].shape[0]),batch_size=1,verbose = 2)[0]
    result_value = np.argmax(result)
    y_prediction.append(result_value)
    
    if result_value == fck[i]:
        if fck[i] == 0:
            neg_correct += 1
        else:
            pos_correct += 1
#    else:
#        print(f"{i}:{X_test[i]}")
        
    if fck[i] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1


print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 84.96903460837886 %
neg_acc 57.44398025066464 %


In [53]:
# Result
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_prediction))
print(classification_report(y_test, y_prediction))
print(accuracy_score(y_test, y_prediction))

[[ 6050  4482]
 [ 2063 11662]]
              precision    recall  f1-score   support

           0       0.75      0.57      0.65     10532
           1       0.72      0.85      0.78     13725

   micro avg       0.73      0.73      0.73     24257
   macro avg       0.73      0.71      0.71     24257
weighted avg       0.73      0.73      0.72     24257

0.7301809786865647
