In [3]:
pip install Keras-Preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Charles Wallis
# Sentiment Analysis using RNN and Naive Bayes
# https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

import os
import numpy as np
import pandas as pd
from google.colab import drive
import zipfile
import warnings
warnings.filterwarnings('ignore')

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import re

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from nltk.tokenize import RegexpTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn import metrics

## Data Exploration

In [None]:
drive.mount('/content/drive')

path_to_zip_file = r'/content/drive/MyDrive/UTD/.UTD 2023 Spring/CS 4395/sentiment labelled sentences.zip'
directory_to_extract_to = "sentiment labelled sentences"
!mkdir "sentiment labelled sentences"
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

fileDir = r"sentiment labelled sentences/sentiment labelled sentences"
fileList = ['imdb_labelled.txt','amazon_cells_labelled.txt', 'yelp_labelled.txt']

Mounted at /content/drive


In [None]:
dataset = pd.DataFrame()
for file in fileList:
    dataset = dataset.append(pd.read_csv(os.path.join(fileDir, file), sep='\t', header=None), ignore_index=True)
dataset.columns = ['Phrase','Sentiment']

In [None]:
dataset.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [None]:
dataset

Unnamed: 0,Phrase,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
2743,I think food should have flavor and texture an...,0
2744,Appetite instantly gone.,0
2745,Overall I was not impressed and would not go b...,0
2746,"The whole experience was underwhelming, and I ...",0


In [None]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])

In [None]:
#Split to test and train
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'], test_size=0.2, random_state=1234)

## Sentiment prediction using LSTM 

In [None]:
#set LSTM hyperparams
embed_dim = 128 
lstm_out = 196
max_features = 2000
tokenizerLSTM = Tokenizer(num_words=max_features, split=' ')
tokenizerLSTM.fit_on_texts(dataset['Phrase'].values)
X = tokenizerLSTM.texts_to_sequences(dataset['Phrase'].values)
X = pad_sequences(X)

LSTM_model = Sequential()
LSTM_model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
LSTM_model.add(SpatialDropout1D(0.4))
LSTM_model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
LSTM_model.add(Dense(2,activation='softmax'))
LSTM_model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(LSTM_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1267, 128)         256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 1267, 128)        0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
Y = pd.get_dummies(dataset['Sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 1234)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
batch_size = 32
LSTM_model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

(1841, 1267) (1841, 2)
(907, 1267) (907, 2)
Epoch 1/7
58/58 - 471s - loss: 0.6757 - accuracy: 0.5731 - 471s/epoch - 8s/step
Epoch 2/7
58/58 - 419s - loss: 0.5850 - accuracy: 0.7192 - 419s/epoch - 7s/step
Epoch 3/7
58/58 - 414s - loss: 0.4562 - accuracy: 0.8213 - 414s/epoch - 7s/step
Epoch 4/7
58/58 - 410s - loss: 0.3129 - accuracy: 0.8821 - 410s/epoch - 7s/step
Epoch 5/7
58/58 - 433s - loss: 0.2072 - accuracy: 0.9240 - 433s/epoch - 7s/step
Epoch 6/7
58/58 - 521s - loss: 0.1438 - accuracy: 0.9538 - 521s/epoch - 9s/step
Epoch 7/7
58/58 - 443s - loss: 0.1012 - accuracy: 0.9685 - 443s/epoch - 8s/step


<keras.callbacks.History at 0x7f989e10e100>

In [None]:
test_size = 500

X_validate = X_test[-test_size:]
Y_validate = Y_test[-test_size:]
X_test = X_test[:-test_size]
Y_test = Y_test[:-test_size]
score,acc = LSTM_model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

13/13 - 12s - loss: 0.5279 - accuracy: 0.8059 - 12s/epoch - 958ms/step
score: 0.53
acc: 0.81


In [None]:
total_positives, total_negatives, correct_positives, correct_negatives = 0, 0, 0, 0
for x in range(len(X_validate)):
    result = LSTM_model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 0)[0]
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            correct_negatives += 1
        else:
            correct_positives += 1
    if np.argmax(Y_validate[x]) == 0:
        total_negatives += 1
    else:
        total_positives += 1

In [None]:
print("positive sentiment accuracy", correct_positives/total_positives*100, "%")
print("negative sentiment accuracy", correct_negatives/total_negatives*100, "%")

positive sentiment accuracy 81.27490039840637 %
negative sentiment accuracy 76.70682730923694 %


## Sentiment prediction using Naive Bayes

In [None]:
#Multinomial Naive
cv = CountVectorizer(stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'], test_size=0.2, random_state=1234)

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print("MNB Accuracy: ", accuracy_score)

MNB Accuracy:  0.8


In [None]:
#MNB But with 2,2 ngram
cv = CountVectorizer(stop_words='english', ngram_range = (2,2), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'],test_size=0.2, random_state=1234)

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print("MNB Accuracy: ", accuracy_score)

MNB Accuracy:  0.62


In [None]:
#MNB But with 3,3 ngram
cv = CountVectorizer(stop_words='english', ngram_range = (3,3), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['Phrase'])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['Sentiment'],test_size=0.2, random_state=1234)

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

accuracy_score = metrics.accuracy_score(MNB.predict(X_test), Y_test)
print("MNB Accuracy: ", accuracy_score)

MNB Accuracy:  0.5309090909090909


In [None]:
#Gaussian Naive Bayes
GNB = GaussianNB()
GNB.fit(X_train.todense(), Y_train)

GaussianNB()

In [None]:
#Bernoulli Naive Bayes
BNB = BernoulliNB()
BNB.fit(X_train, Y_train)

BernoulliNB()

In [None]:
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(dataset['Phrase'])
x_train, x_test, y_train, y_test = train_test_split(text_count_2, dataset['Sentiment'],test_size=0.2,random_state=1234)

MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)
print("MNB Accuracy: ", accuracy_score_mnb)

BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print("BNB Accuracy: ", accuracy_score_bnb)

GNB.fit(x_train.todense(), y_train)
accuracy_score_gnb = metrics.accuracy_score(GNB.predict(x_test.todense()), y_test)
print("MNB Accuracy: ", accuracy_score_gnb)

MNB Accuracy:  0.8418181818181818
BNB Accuracy:  0.8036363636363636
MNB Accuracy:  0.7127272727272728


In [None]:
#saving the models to my drive as an h5 file (so I don't have to re-train it when I open colab again)
import pickle
# r"/content/drive/MyDrive/UTD/.UTD 2022 Fall/CS 4372/'
with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/gnb_model.h5", 'wb') as f:
  pickle.dump(GNB, f)
with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/bnb_model.h5", 'wb') as f:
  pickle.dump(BNB, f)
with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/cnb_model.h5", 'wb') as f:
  pickle.dump(MNB, f)
with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/lstm_model.h5", 'wb') as f:
  pickle.dump(LSTM_model, f)
if(False): #set to True to load the models
  with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/gnb_model.h5", 'rb') as f:
      GNB = pickle.load(f)
  with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/bnb_model.h5", 'rb') as f:
      BNB = pickle.load(f)
  with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/cnb_model.h5", 'rb') as f:
      MNB = pickle.load(f)
  with open(r"/content/drive/MyDrive/UTD/.UTD 2023 Spring/4395/lstm_model.h5", 'rb') as f:
      LSTM_model = pickle.load(f)

## Results

In [None]:
def miscalulationsNB(model, transformer, dataset, NMiss = 10):
  acc = 0
  FalsePositive = []
  PositiveFalse = []
  cnt = len(dataset['Phrase'].values)
  for i in range(len(dataset['Phrase'].values)):
    phrase = dataset['Phrase'].values[i]  
    ans = dataset['Sentiment'].values[i]
    try:
      pred = model.predict(transformer.transform([phrase]))[0]
    except:
      pred = model.predict(transformer.transform([phrase]).toarray())[0]
    acc += 1 if ans == pred else 0
    #print(phrase, ans, pred)
    if ans != pred:
      #print(phrase, ans, pred)
      if( ans == 0):
        PositiveFalse.append([phrase, ans, pred])
      else:
        FalsePositive.append([phrase, ans, pred])
    #if(len(PositiveFalse)>=NMiss and len(FalsePositive)>=NMiss):
      #break

  print( f"Accuracy: {acc/cnt*100}%")  
  print(f"{NMiss} Missclassifications of FalsePositives: ")
  print(FalsePositive[0:NMiss])
  print(f"{NMiss} Missclassifications of PositiveFalses: ")
  print(PositiveFalse[0:NMiss])

print("=========== MNB Missclassification Examples ========") 
miscalulationsNB(MNB, tfidf, dataset)
print("=========== BNB Missclassification Examples ========") 
miscalulationsNB(BNB, tfidf, dataset)  
print("=========== GNB Missclassification Examples ========") 
miscalulationsNB(GNB, tfidf, dataset)  

Accuracy: 92.97671033478893%
10 Missclassifications of FalsePositives: 
[["This if the first movie I've given a 10 to in years.  ", 1, 0], ['If there was ever a movie that needed word-of-mouth to promote, this is it.  ', 1, 0], ['Give this one a look.  ', 1, 0], ['It actually turned out to be pretty decent as far as B-list horror/suspense films go.  ', 1, 0], ["I don't think you will be disappointed.  ", 1, 0], ['Some applause should be given to the "prelude" however.  ', 1, 0], ['The movie had you on the edge of your seat and made you somewhat afraid to go to your car at the end of the night.  ', 1, 0], ['I liked this movie way too much.  ', 1, 0], ["Still, I do like this movie for it's empowerment of women; there's not enough movies out there like this one.  ", 1, 0], ['You\'ll love it!  \t1\nThis movie is BAD.  \t0\nSo bad.  \t0\nThe film is way too long.  \t0\nThis is definitely one of the bad ones.  \t0\nThe movie I received was a great quality film for it\'s age.  \t1\nJohn Wayne

# Summary

Model |Parameters| Accuracy
---------|----------| --------
1| LSTM | 80.59%, Positive Sentiment 81.27%, Negative Sentiment 76.71%
2| Multinomial NB, 1-gram            |80.00%
3| Multinomial NB, 2-gram            |62.00%
4| Multinomial NB, 3-gram            |53.09%
5| Multinomial NB                    |92.98%
6| Bernoulli NB                      |90.28%
7| Gaussian NB                       |89.08%


##Understanding RNN, LSTM

RNN is a type of architecture that deals with sequences of data. CNN signals can be one-dimensional, two-dimensional, and three-dimensional depending on the domain. A domain is defined by "where we mapped it from" and "where we mapped it to" and since a domain is just a temporary input to X, dealing with sequence data is basically the same as dealing with one-dimensional data. Nevertheless, it is possible to handle two-dimensional data with two directions using RNN.

The LSTM is explicitly designed to avoid problems with long dependency periods. Remembering long periods of information should be the basic behavior of a model, so that the model doesn't have to spend time to re-learn it every time.

The core aspect of LSTM is the cell state, and LSTM has the ability to add or subtract something onto the cell state, which is carefully controlled by the gate composed of sigmoid layers and pointwise multiplication.

LSTM First decides what information needs to be thrown out, which is decided by the sigmoid layer. Then, it decides what information needs to be saved. the input gate layer (a sigmoid layer) decides that as well. Then the tanh layer creates a vector to add onto the cell state. Like this, LSTM created a vector to update the state. 

## Naive Bayes

Multinomial Naive Bayes classification is used when the characteristics of the data are expressed by the number of appearances. For example, is a dice rolls 1 once, 2 twice, 3 thrice, the data can be represented as (1,2,3,0,0,0) with each index being the face of the dice, and the value being the number of times they appeared. 

Bernoulli Naive Bayes is used when the data is expressed as 0s and 1s. Which is what we are using for positive and negative sentiment data. Surprisingly, our results show that Bernouli NB algorithm was in comparison not the best NB model that fits our case. 

Unlike the two above, Gaussian NB is used when the data is not discrete. is the data is continuous, using GNB can give a better accuracy. This classification is used under the assumption that the values of the features are normally distributed

## Comparison and which one I prefer

In sentiment analysis, Naive Bayes models were much faster in training and fitting, compared to LSTM. NB models also showed a higher accuracy, and with the understanding of what each classifiers do, we can apply these types of NB models in a lot of scenarios. LSTM is certainly a great way to utilize RNN and use the model to give itself feedback (hence being recurrent). LSTM is a great solution to the vanishing gradient problem, which was originally a problem that LSTM had which older information would have less impact on new weight updates. 