### Question 1

In [32]:
import zipfile
import pandas as pd
import numpy as np

# unzipping the file
with zipfile.ZipFile('/content/IMDB Dataset.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

# now loading the csv file after extraction
df = pd.read_csv('/content/IMDB Dataset.csv')

# checking if the data was loaded correctly
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [33]:
# checking to see if sentiment count is balanced in the dataset
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [34]:
# re-loading dataset while only selecting only the 'review' and 'sentiment' columns, with proper encoding
df = pd.read_csv("/content/IMDB Dataset.csv", usecols=["review", "sentiment"], encoding='latin-1')

# converting the sentiment labels to binary values - 1 for positive and 0 for negative
df['sentiment'] = (df['sentiment'] == "positive").astype(int)

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [35]:
val_size = int(df.shape[0] * 0.15)
test_size = int(df.shape[0] * 0.15)


def train_val_test_split(df=None, train_percent=0.7, test_percent=0.15, val_percent=0.15):
  df = df.sample(frac=1)
  train_df = df[: int(len(df)*train_percent)]
  test_df = df[int(len(df)*train_percent)+1 : int(len(df)*(train_percent+test_percent))]
  val_df = df[int(len(df)*(train_percent + test_percent))+1 : ]
  return train_df, test_df, val_df

train_df, test_df, val_df = train_val_test_split(df, 0.7, 0.15, 0.15)
train_labels, train_texts = train_df.values[:,1], train_df.values[:,0]
val_labels, val_texts = val_df.values[:,1], val_df.values[:,0]
test_labels, test_texts = test_df.values[:,1], test_df.values[:,0]
print(len(train_df), len(test_df), len(val_df))
print(len(train_texts), len(train_labels), len(val_df))

35000 7499 7499
35000 35000 7499


In [38]:
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

def clean_and_process_text(text):

    # convert text to lowercase and clean unwanted punctuation
    cleaned_text = text.lower()
    cleaned_text = re.sub(r"[,.:\-]", "", cleaned_text)

    # remove html tags (if any)
    cleaned_text = re.sub(r"<[^>]+>", " ", cleaned_text)

    # remove all numeric digits from the text
    cleaned_text = ''.join([char for char in cleaned_text if not char.isdigit()])

    return cleaned_text

def tokenize_data(data):

    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    tokenized_data = []

    # process each sentence in the data
    for sentence in data:
        # clean and preprocess the sentence
        cleaned_sentence = clean_and_process_text(sentence)

        # tokenize the cleaned sentence
        tokens = tokenizer(cleaned_sentence)
        token_list = [str(token) for token in tokens]

        tokenized_data.append(token_list)

    return tokenized_data

# apply the tokenization to training, validation, and test data
train_tokens = tokenize_data(train_texts)
val_tokens = tokenize_data(val_texts)
test_tokens = tokenize_data(test_texts)


In [39]:
print(train_tokens[0])

['although', "i'm", 'not', 'a', 'golf', 'fan', 'i', 'attended', 'a', 'sneak', 'preview', 'of', 'this', 'movie', 'and', 'absolutely', 'loved', 'it', 'the', 'historical', 'settings', 'the', 'blatant', 'class', 'distinctions', 'and', 'seeing', 'the', 'good', 'and', 'the', 'bad', 'on', 'both', 'sides', 'of', 'the', 'dividing', 'line', 'held', 'my', 'attention', 'throughout', 'the', 'actors', 'and', 'their', 'characterizations', 'were', 'all', 'mesmerizing', 'and', 'i', 'was', 'on', 'the', 'edge', 'of', 'my', 'seat', 'during', 'the', 'golf', 'segments', 'which', 'were', 'not', 'only', 'dramatic', 'and', 'exciting', 'but', 'easy', 'to', 'follow', 'toward', 'the', 'end', 'of', 'this', 'movie', '"seabiscuit"', 'came', 'strongly', 'to', 'mind', 'although', '"the', 'greatest', 'game', 'ever', 'played"', 'is', 'far', 'less', 'complex', 'a', 'story', 'than', 'that', 'film', 'in', 'both', 'cases', 'the', 'fact', 'that', 'the', 'events', 'really', 'happened', 'deepened', 'my', 'interest']


In [51]:
import itertools

# class to implement the Bag of Words vectorization
class TextVectorizer:
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocabulary = None
        self.token_to_index_map = None

    def fit(self, data):
        word_count = {}
        for sentence in data:
            for word in sentence:
                if word not in word_count:
                    word_count[word] = 1
                else:
                    word_count[word] += 1

        # sort words by frequency in descending order
        word_count = dict(sorted(word_count.items(), key=lambda item: item[1], reverse=True))

        # select top N words based on the max_features parameter
        top_words = min(len(word_count), self.max_features)
        word_count = dict(itertools.islice(word_count.items(), top_words))

        self.vocabulary = list(word_count.keys())

        # map each word to a unique index
        self.token_to_index_map = {word: index for index, word in enumerate(self.vocabulary)}

    def transform(self, data):
        """
        Transform the input data into a matrix based on the vocabulary built during fitting.
        """
        matrix = np.zeros((len(data), len(self.vocabulary)))

        for i, sentence in enumerate(data):
            for word in sentence:
                if word in self.token_to_index_map:
                    matrix[i, self.token_to_index_map[word]] += 1
        return matrix

# define maximum number of features to include in the vocabulary (top k words)
max_features = 2000

vectorizer = TextVectorizer(max_features=max_features)

# fit the vectorizer to the training data
vectorizer.fit(train_tokens)

X_train = vectorizer.transform(train_tokens)
X_val = vectorizer.transform(val_tokens)
X_test = vectorizer.transform(test_tokens)

# convert labels into numpy arrays
y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

# accessing the vocabulary list
vocabulary = vectorizer.vocabulary

In [52]:
# display the first 5 rows of the training data matrix
# each row represents a sentence, where the columns correspond to word frequencies from the vocabulary
X_train[:5]

array([[11.,  3.,  6., ...,  0.,  0.,  0.],
       [ 9.,  2.,  4., ...,  0.,  0.,  0.],
       [16.,  8.,  9., ...,  0.,  0.,  0.],
       [10.,  5.,  6., ...,  1.,  0.,  0.],
       [ 3.,  2.,  1., ...,  0.,  0.,  0.]])

In [53]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


### Question 2

In [54]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam

In [55]:
rnn_model = None
rnn_model = Sequential()
rnn_model.add(SimpleRNN(256, input_shape=(1, max_features)))
rnn_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
rnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(rnn_model.summary())
rnn_model_history = rnn_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(rnn_model_history.history.keys())

rnn_score, rnn_accuracy = rnn_model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', rnn_score)
print('Test Accuracy:', rnn_accuracy)

  super().__init__(**kwargs)


None
Epoch 1/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.7551 - loss: 0.8071 - val_accuracy: 0.8759 - val_loss: 0.2913
Epoch 2/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8936 - loss: 0.2602 - val_accuracy: 0.8763 - val_loss: 0.2964
Epoch 3/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9122 - loss: 0.2125 - val_accuracy: 0.8713 - val_loss: 0.3132
Epoch 4/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9383 - loss: 0.1617 - val_accuracy: 0.8712 - val_loss: 0.3249
Epoch 5/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9643 - loss: 0.1032 - val_accuracy: 0.8738 - val_loss: 0.3764
Epoch 6/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9836 - loss: 0.0599 - val_accuracy: 0.8716 - val_loss: 0.4200
Epoch 7/10
[1m137/137

### Question 3

In [56]:
from tensorflow.keras.layers import LSTM

lstm_model = Sequential()
lstm_model.add(LSTM(256, input_shape=(1, max_features)))
lstm_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)

lstm_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
print(lstm_model.summary())

lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=256, validation_data=(X_val, y_val), epochs=10)
print(lstm_model_history.history.keys())

lstm_score, lstm_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', lstm_score)
print('Test Accuracy:', lstm_accuracy)

None
Epoch 1/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.7411 - loss: 0.5307 - val_accuracy: 0.8764 - val_loss: 0.2936
Epoch 2/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8955 - loss: 0.2526 - val_accuracy: 0.8760 - val_loss: 0.2937
Epoch 3/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9190 - loss: 0.2093 - val_accuracy: 0.8737 - val_loss: 0.2937
Epoch 4/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9418 - loss: 0.1608 - val_accuracy: 0.8709 - val_loss: 0.3222
Epoch 5/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9658 - loss: 0.1047 - val_accuracy: 0.8668 - val_loss: 0.3789
Epoch 6/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9787 - loss: 0.0711 - val_accuracy: 0.8698 - val_loss: 0.4167
Epoch 7/10
[1m137/13

### Question 4

In [57]:
from tensorflow.keras.layers import GRU

gru_model = None
gru_model = Sequential()
gru_model.add(GRU(256, input_shape=(1, max_features)))
gru_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
gru_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(gru_model.summary())

history_gru_model = gru_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history_gru_model.history.keys())

gru_score, gru_accuracy= gru_model.evaluate(X_test, y_test, verbose=0)

print('Test Loss:', gru_score)
print('Test Accuracy:', gru_accuracy)

None
Epoch 1/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.7699 - loss: 0.5203 - val_accuracy: 0.8775 - val_loss: 0.2868
Epoch 2/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8984 - loss: 0.2486 - val_accuracy: 0.8736 - val_loss: 0.2920
Epoch 3/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9215 - loss: 0.1963 - val_accuracy: 0.8738 - val_loss: 0.3119
Epoch 4/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9495 - loss: 0.1352 - val_accuracy: 0.8726 - val_loss: 0.3434
Epoch 5/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9692 - loss: 0.0891 - val_accuracy: 0.8676 - val_loss: 0.4022
Epoch 6/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9836 - loss: 0.0518 - val_accuracy: 0.8673 - val_loss: 0.4658
Epoch 7/10
[1m137/13

### Question 5

In [58]:
from tensorflow.keras.layers import Bidirectional

bilstm_model = None
bilstm_model = Sequential()
bilstm_model.add(Bidirectional(LSTM(256), input_shape=(1, max_features)))
bilstm_model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate = 0.01)
bilstm_model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(bilstm_model.summary())

history_bilstm_model = bilstm_model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history_bilstm_model.history.keys())

bilstm_score, bilstm_accuracy = bilstm_model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', bilstm_score)
print('Test Accuracy:', bilstm_accuracy)

  super().__init__(**kwargs)


None
Epoch 1/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.7638 - loss: 0.5671 - val_accuracy: 0.8604 - val_loss: 0.3169
Epoch 2/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8976 - loss: 0.2492 - val_accuracy: 0.8763 - val_loss: 0.2935
Epoch 3/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9158 - loss: 0.2103 - val_accuracy: 0.8726 - val_loss: 0.3046
Epoch 4/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9361 - loss: 0.1679 - val_accuracy: 0.8705 - val_loss: 0.3190
Epoch 5/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9621 - loss: 0.1129 - val_accuracy: 0.8646 - val_loss: 0.3948
Epoch 6/10
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9805 - loss: 0.0668 - val_accuracy: 0.8624 - val_loss: 0.4108
Epoch 7/10
[1m13

### Question 6

In [59]:
# comparing models to see which one has the best accuracy
print(f'RNN Accuracy: {rnn_accuracy}')
print(f'RNN Loss: {rnn_score}')
print(f'\nLSTM Accuracy: {lstm_accuracy}')
print(f'LSTM Loss: {lstm_score}')
print(f'\nGRU Accuracy: {gru_accuracy}')
print(f'GRU Score: {gru_score}')
print(f'\nBiLSTM Accuracy: {bilstm_accuracy}')
print(f'BiLSTM Score: {bilstm_score}')

RNN Accuracy: 0.869449257850647
RNN Loss: 0.5746417045593262

LSTM Accuracy: 0.8738498687744141
LSTM Loss: 0.5549727082252502

GRU Accuracy: 0.8742498755455017
GRU Score: 0.6527361273765564

BiLSTM Accuracy: 0.874916672706604
BiLSTM Score: 0.5540788769721985


Based on my results above, the BiLSTM model achieved the highest accuracy of approximately 87.49%, followed closely by the GRU model with an accuracy of approximately 87.42%. The LSTM model performed slightly worse with an accuracy of 87.38%, and the RNN model had the lowest accuracy at 86.94%.

Therefore the BiLSTM model is the best-performing model in terms of accuracy, which makes sense since its bidirectional nature, allows it to capture context from both the past and future within the sequences, making it better suited for tasks like sentiment analysis.