# **Sentiment Analysis on Movie Reviews**

*By David Fit*

---

In [None]:
# Environment Setup
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# TensorFlow and Keras Libraries
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
import numpy as np

# Hugging Face Datasets
from datasets import load_dataset

# Statistical and Evaluation Metrics
from scipy.stats import mode
from sklearn.metrics import accuracy_score

In [None]:
# Load IMDB dataset using the Hugging Face datasets library
imdb_dataset = load_dataset("imdb")['train']

# Extract text data
train_data = [item['text'] for item in imdb_dataset]
train_labels = [item['label'] for item in imdb_dataset]

# Tokenize text data (limited to top 10,000 most frequent words)
tokenizer = Tokenizer(num_words=10000, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Convert the text data into sequences of integers, convert the sequences into a binary matrix
sequences = tokenizer.texts_to_sequences(train_data)
features = tokenizer.sequences_to_matrix(sequences, mode='binary')

X = pad_sequences(sequences, maxlen=500)

In [None]:
# Train-test split below
X_train, X_val, y_train, y_val = train_test_split(X, train_labels, train_size=0.9, random_state=123)

y_train = np.array(y_train)
y_val = np.array(y_val)

# Model 1: MLP with Pre-trained GloVe Embeddings

In [None]:
# Set up word embeddings using pre-trained GloVe embeddings (ensure 'glove.6B.300d.txt' is downloaded)
embedding_dim = 300
embedding_file = 'glove.6B.300d.txt'
embedding_index = {}

In [None]:
# Load GloVe embeddings into dictionary 'embedding_index'
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

# Create an embedding matrix for the tokenizer's vocabulary
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, idx in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

# Convert labels to numpy arrays
y_train = np.array(y_train)
y_val = np.array(y_val)

In [None]:
# MLP with pre-trained GloVe embeddings
model_1 = keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, using 5 epochs
model_1.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.5911 - loss: 0.6618 - val_accuracy: 0.7796 - val_loss: 0.4995
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7715 - loss: 0.4949 - val_accuracy: 0.8008 - val_loss: 0.4432
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7973 - loss: 0.4558 - val_accuracy: 0.7692 - val_loss: 0.4776
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8122 - loss: 0.4333 - val_accuracy: 0.8056 - val_loss: 0.4173
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8104 - loss: 0.4289 - val_accuracy: 0.8104 - val_loss: 0.4143


<keras.src.callbacks.history.History at 0x7bd1c04c5c50>

# Model 2: LSTM

In [None]:
# LSTM model with randomly initialized embeddings
model_2 = Sequential([
    layers.Embedding(input_dim=10000, output_dim=100),
    layers.LSTM(128, activation='tanh', return_sequences=False),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using 5 epochs
model_2.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 30ms/step - accuracy: 0.6444 - loss: 0.6041 - val_accuracy: 0.8548 - val_loss: 0.3634
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 25ms/step - accuracy: 0.8634 - loss: 0.3541 - val_accuracy: 0.8480 - val_loss: 0.3730
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.9159 - loss: 0.2289 - val_accuracy: 0.8704 - val_loss: 0.3286
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9433 - loss: 0.1669 - val_accuracy: 0.8664 - val_loss: 0.4133
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 29ms/step - accuracy: 0.9634 - loss: 0.1175 - val_accuracy: 0.8704 - val_loss: 0.4261


<keras.src.callbacks.history.History at 0x7bd1c04d1d90>

# Model 3: GRU

In [None]:
# Include an additional GRU layer for sequence processing
model_3 = Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    layers.GRU(128, activation='tanh', return_sequences=False),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using 5 epochs
model_3.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 40ms/step - accuracy: 0.6986 - loss: 0.5594 - val_accuracy: 0.8520 - val_loss: 0.3436
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - accuracy: 0.9127 - loss: 0.2251 - val_accuracy: 0.8888 - val_loss: 0.2892
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 38ms/step - accuracy: 0.9594 - loss: 0.1152 - val_accuracy: 0.8652 - val_loss: 0.3420
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 37ms/step - accuracy: 0.9831 - loss: 0.0521 - val_accuracy: 0.8692 - val_loss: 0.4183
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 38ms/step - accuracy: 0.9918 - loss: 0.0275 - val_accuracy: 0.8688 - val_loss: 0.6003


<keras.src.callbacks.history.History at 0x7bd25413c050>

In [None]:
# We already have X_train, X_val, and y_train, y_val
# Now split the remaining data into test set (X_test, y_test)

X_train, X_temp, y_train, y_temp = train_test_split(X, train_labels, train_size=0.8, random_state=123)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5, random_state=123)

# Now we have:
# X_train, y_train: Training data and labels
# X_val, y_val: Validation data and labels
# X_test, y_test: Test data and labels

In [None]:
accuracy_1 = 0.81
accuracy_2 = 0.96
accuracy_3 = 0.99

# Normalize the accuracies to get the weights of each model
total_accuracy = accuracy_1 + accuracy_2 + accuracy_3

weight1 = accuracy_1 / total_accuracy
weight2 = accuracy_2 / total_accuracy
weight3 = accuracy_3 / total_accuracy

print(f"Weight for Model 1: {weight1:.2f}")
print(f"Weight for Model 2: {weight2:.2f}")
print(f"Weight for Model 3: {weight3:.2f}")

Weight for Model 1: 0.29
Weight for Model 2: 0.35
Weight for Model 3: 0.36


In [None]:
# Get predictions (probabilities) for the test set
pred1 = model_1.predict(X_test)
pred2 = model_2.predict(X_test)
pred3 = model_3.predict(X_test)

# Assign the specified weights from above
weight1 = 0.29
weight2 = 0.35
weight3 = 0.36

# Weighted average of the predictions
weighted_pred = (weight1 * pred1 + weight2 * pred2 + weight3 * pred3) / (weight1 + weight2 + weight3)

# Convert weighted prediction to binary (0 or 1)
ensemble_pred = (weighted_pred > 0.5).astype(int)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print(f"Ensemble Model Accuracy: {ensemble_accuracy}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Ensemble Model Accuracy: 0.938
