# Problem 1 - Sentiment Analysis using recurrent models

## 1.1

In [1]:
import numpy as np
import pandas as pd

# Importing the training data
df = pd.read_csv('/content/IMDB Dataset.csv')
# Summary of the dataset
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [2]:
# Sentiment count - dataset is balanced
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [3]:
df = pd.read_csv("/content/IMDB Dataset.csv", usecols=["review", "sentiment"], encoding='latin-1')
# 1 - positive, 0 - negative
df.sentiment = (df.sentiment == "positive").astype("int")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
val_size = int(df.shape[0] * 0.15)
test_size = int(df.shape[0] * 0.15)


def train_val_test_split(df=None, train_percent=0.7, test_percent=0.15, val_percent=0.15):
  df = df.sample(frac=1)
  train_df = df[: int(len(df)*train_percent)]
  test_df = df[int(len(df)*train_percent)+1 : int(len(df)*(train_percent+test_percent))]
  val_df = df[int(len(df)*(train_percent + test_percent))+1 : ]
  return train_df, test_df, val_df

train_df, test_df, val_df = train_val_test_split(df, 0.7, 0.15, 0.15)
train_labels, train_texts = train_df.values[:,1], train_df.values[:,0]
val_labels, val_texts = val_df.values[:,1], val_df.values[:,0]
test_labels, test_texts = test_df.values[:,1], test_df.values[:,0]
print(len(train_df), len(test_df), len(val_df))
print(len(train_texts), len(train_labels), len(val_df))

35000 7499 7499
35000 35000 7499


In [5]:
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

def process_tokens(text):

    # Function to process tokens, replace any unwanted chars and patterns

    # Lowercase and remove specific punctuation
    preprocessed_text = text.lower()
    preprocessed_text = re.sub(r"[,.:\-\(\)]", "", preprocessed_text)

    # Remove HTML-like tags (e.g., <br/>)
    preprocessed_text = re.sub(r"<[^>]+>", " ", preprocessed_text)

    # Remove digits
    preprocessed_text = ''.join([i for i in preprocessed_text if not i.isdigit()])

    return preprocessed_text

def preprocessing(data):

    # Preprocessing data to list of tokens

    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    preprocessed_data = []
    for sentence in data:
        sentence = process_tokens(sentence)
        tokens = tokenizer(sentence)
        tlist = []
        for token in tokens:
            tlist.append(str(token))
        preprocessed_data.append(tlist)
    return preprocessed_data

# Use the following functions to preprocess your train, validation, and test data
train_data = preprocessing(train_texts)
val_data = preprocessing(val_texts)
test_data = preprocessing(test_texts)

In [6]:
print(train_data[0])

['i', 'loved', 'this', 'movie', 'my', 'daughter', 'is', ' ', '/', 'and', 'a', 'country', 'girl', 'at', 'heart', 'there', 'are', 'not', 'any', 'movies', 'for', 'young', 'children', 'i', 'loved', 'this', 'one', 'because', 'the', 'worst', 'thing', 'in', 'it', 'was', 'when', 'one', 'of', 'the', 'boys', 'said', '"stupid"', 'i', 'applaud', 'them', 'for', 'stepping', 'out', 'and', 'making', 'a', 'true', 'family', 'movie', 'i', 'rented', 'it', 'the', 'first', 'time', 'we', 'saw', 'it', 'and', 'know', 'looking', 'to', 'buy', 'to', 'add', 'to', 'our', 'collection', 'my', 'daughter', 'can', 'not', 'stop', 'talking', 'about', 'it', 'it', 'goes', 'along', 'with', 'our', 'lifestyle', 'we', 'live', 'in', 'east', 'texas', 'i', 'hope', 'to', 'see', 'more', 'family', 'films', 'like', 'this', 'one', 'she', 'even', 'named', 'one', 'of', 'our', 'calves', '"hokey', 'pokey', 'keen"!!!', 'i', 'can', 'not', 'say', 'enough', 'about', 'this', 'movie', 'i', 'look', 'forward', 'to', 'many', 'more', 'films', 'like'

In [7]:
import numpy as np
import itertools

# Creating a vectorizer to vectorize text and create matrix of features
# Bag of words technique
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        word_dict = {}
        for sentence in dataset:
            for token in sentence:
                if token not in word_dict:
                    word_dict[token] = 1
                else:
                    word_dict[token] += 1
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
        end_to_slice = min(len(word_dict), self.max_features)
        word_dict = dict(itertools.islice(word_dict.items(), end_to_slice))
        self.vocab_list = list(word_dict.keys())
        self.token_to_index = {}
        counter = 0
        for token in self.vocab_list:
            self.token_to_index[token] = counter
            counter += 1


    def transform(self, dataset):
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i, sentence in enumerate(dataset):
            for token in sentence:
                if token in self.token_to_index:
                    data_matrix[i, self.token_to_index[token]] += 1
        return data_matrix

# Max features - top k words to consider only
max_features = 2000

vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)

# Checking if the len of vocab = k
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [8]:
# Each sequence of token is a vector of
# Token indices (with the count of those words)
X_train[:5]

array([[ 3.,  3.,  2., ...,  0.,  0.,  0.],
       [10.,  3., 10., ...,  0.,  0.,  0.],
       [14.,  2.,  5., ...,  0.,  0.,  0.],
       [ 0.,  1.,  4., ...,  0.,  0.,  0.],
       [21., 19., 16., ...,  0.,  0.,  0.]])

In [9]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


## 1.2

In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam

rnn_model = None
rnn_model = Sequential()
rnn_model.add(SimpleRNN(256, input_shape=(1, max_features)))
rnn_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
rnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(rnn_model.summary())
rnn_model_history = rnn_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(rnn_model_history.history.keys())

rnn_score, rnn_accuracy = rnn_model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', rnn_score)
print('Test accuracy:', rnn_accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 256)               577792    
                                                                 
 dense (Dense)               (None, 2)                 514       
                                                                 
Total params: 578306 (2.21 MB)
Trainable params: 578306 (2.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.537799060344696
Test accuracy: 0.8741165399551392


## 1.3

In [11]:
from tensorflow.keras.layers import LSTM

lstm_model = None
lstm_model = Sequential()
lstm_model.add(LSTM(256, input_shape=(1, max_features)))
lstm_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
lstm_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(lstm_model.summary())
lstm_model_history = lstm_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(lstm_model_history.history.keys())

lstm_score, lstm_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', lstm_score)
print('Test accuracy:', lstm_accuracy)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 256)               2311168   
                                                                 
 dense_1 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2311682 (8.82 MB)
Trainable params: 2311682 (8.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.517749547958374
Test accuracy: 0.8719829320907593


## 1.4

In [12]:
from tensorflow.keras.layers import GRU

gru_model = None
gru_model = Sequential()
gru_model.add(GRU(256, input_shape=(1, max_features)))
gru_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
gru_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(gru_model.summary())
history_gru_model = gru_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history_gru_model.history.keys())

gru_score, gru_accuracy= gru_model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', gru_score)
print('Test accuracy:', gru_accuracy)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 256)               1734144   
                                                                 
 dense_2 (Dense)             (None, 2)                 514       
                                                                 
Total params: 1734658 (6.62 MB)
Trainable params: 1734658 (6.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.5579535365104675
Test accuracy: 0.8714495301246643


## 1.5

In [13]:
from tensorflow.keras.layers import Bidirectional

bilstm_model = None
bilstm_model = Sequential()
bilstm_model.add(Bidirectional(LSTM(256), input_shape=(1, max_features)))
bilstm_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
bilstm_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(bilstm_model.summary())
history_bilstm_model = bilstm_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history_bilstm_model.history.keys())

bilstm_score, bilstm_accuracy = bilstm_model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', bilstm_score)
print('Test accuracy:', bilstm_accuracy)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 512)               4622336   
 al)                                                             
                                                                 
 dense_3 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 4623362 (17.64 MB)
Trainable params: 4623362 (17.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.5763213038444519
Test accuracy: 0.8727830648422241


## 1.6

In [14]:
# Print accuracies
print(f'RNN Accuracy: {rnn_accuracy}')
print(f'RNN Loss: {rnn_score}')
print(f'\nLSTM Accuracy: {lstm_accuracy}')
print(f'LSTM Loss: {lstm_score}')
print(f'\nGRU Accuracy: {gru_accuracy}')
print(f'GRU Score: {gru_score}')
print(f'\nBiLSTM Accuracy: {bilstm_accuracy}')
print(f'BiLSTM Score: {bilstm_score}')


RNN Accuracy: 0.8741165399551392
RNN Loss: 0.537799060344696

LSTM Accuracy: 0.8719829320907593
LSTM Loss: 0.517749547958374

GRU Accuracy: 0.8714495301246643
GRU Score: 0.5579535365104675

BiLSTM Accuracy: 0.8727830648422241
BiLSTM Score: 0.5763213038444519


The RNN model, despite its simplicity and known limitations with long-term dependencies, achieved the highest accuracy. This suggests that for the given dataset and task, the temporal dynamics and sequence processing capability of RNNs were sufficient.

On the other hand, the LSTM, GRU, and BiLSTM models, with more advanced mechanisms to handle long-term dependencies, did not significantly outperform the basic RNN in terms of accuracy. Therefore, this could indicate that the sentiment analysis task for this particular dataset might not require complex handling of long-term dependencies.

In regard to loss scores, the LSTM model has the lowest loss, which can be attributed to its efficient handling of long-term dependencies and its ability to forget irrelevant information through its gating mechanism. However, the slightly higher loss/score in GRU and BiLSTM might indicate a slightly less efficient learning process in this specific context, possibly due to overfitting or the complexity of the models in relation to the dataset.

Overall, the RNN model performed the best in terms of accuracy, which is the primary metric for performance in classification tasks like sentiment analysis. However, the marginal differences in accuracy across all models suggest that all of them were effective to a similar extent for this specific task. The choice of model might therefore depend on factors like training time, complexity, and the specific characteristics of the dataset, such as the length of the input sequences and the importance of capturing long-term dependencies.