In [1]:
!pip install datasets keras-tuner contractions pyspellchecker langdetect deep-translator

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl

In [2]:
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Embedding, Bidirectional, GRU, Layer, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import re
import matplotlib.pyplot as plt
import tensorflow as tf 
from tensorflow import keras
import keras_tuner as kt
from deep_translator import MyMemoryTranslator
from langdetect import detect
import spacy
import contractions
import nltk
from spellchecker import SpellChecker
import random

# Set a seed value
seed_value = 0

random.seed(seed_value)
# For NumPy
np.random.seed(seed_value)
# For TensorFlow
tf.random.set_seed(seed_value)
# For PyTorch
torch.manual_seed(seed_value)

if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Dataset Preparation

In [3]:
# Load the dataset
dataset = load_dataset("rotten_tomatoes")

# Split the dataset into training, validation, and test sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

# Generate Word Embeddings

In [4]:
# Download glove and unzip it in Notebook.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

  pid, fd = os.forkpty()


--2024-11-09 01:35:47--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-09 01:35:47--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-09 01:35:47--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: 'glove

# Data Exploration & Preprocessing

In [5]:
train_text = train_dataset['text']

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Number of unique words in the train dataset
vocab_size = len(tokenizer.word_index)
print("Vocabulary size of train data:", vocab_size)

Vocabulary size of train data: 17451


In [6]:
def load_glove_vocab(filepath):
    glove_vocab = set()
    
    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            glove_vocab.add(word)

    return glove_vocab

# Load Glove words into a set
glove_vocab = load_glove_vocab('glove.6B.300d.txt')

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []

for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Size of Glove Vocabulary:", len(glove_vocab))
print("Out-Of-Vocabulary Words:", oov_words)
print("Number of OOV words:", len(oov_words))

Size of Glove Vocabulary: 400000
Out-Of-Vocabulary Words: ["it's", "doesn't", "there's", "that's", "isn't", "don't", "can't", "film's", "you're", "you'll", "he's", "movie's", "won't", "what's", "you've", "i'm", "didn't", "they're", "year's", '\x96', "you'd", "aren't", "i've", "we've", "couldn't", "she's", "man's", "we're", "wasn't", "i'd", '\x97', "who's", "director's", "haven't", "here's", "story's", "characters'", "wouldn't", "i'll", "'the", "woman's", "hasn't", "world's", "filmmaker's", "children's", "moore's", "one's", "soderbergh's", "america's", "disney's", "shouldn't", "ain't", "character's", "cinema's", "women's", "cho's", "hoffman's", "kids'", "today's", "amy's", "wilde's", "life's", "they'll", "emperor's", "sandler's", "scorsese's", "allen's", "it'll", "filmmakers'", 'cletis', "carvey's", "let's", "stevenson's", 'waydowntown', "polanski's", "he'd", "weren't", "lee's", "everyone's", "parker's", "woo's", "'i", "greene's", "2002's", "actor's", "bullock's", "writer's", "shakespea

# RNN with raw unprocessed data and frozen embedding layer

In [7]:
test_text = test_dataset['text']
val_text = validation_dataset['text']

raw_train_text = train_text
raw_val_text = val_text
raw_test_text = test_text

raw_train_sequences = tokenizer.texts_to_sequences(raw_train_text)
raw_val_sequences = tokenizer.texts_to_sequences(raw_val_text)
raw_test_sequences = tokenizer.texts_to_sequences(raw_test_text)

tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(raw_train_text)

max_seq_len = 100
tokenizer.word_index['<PAD>'] = 0

# Pad sequences
raw_train_padded = pad_sequences(raw_train_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])
raw_val_padded = pad_sequences(raw_val_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])
raw_test_padded = pad_sequences(raw_test_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])

# Labels
train_labels = np.array(train_dataset['label'])
val_labels = np.array(validation_dataset['label'])
test_labels = np.array(test_dataset['label'])

In [8]:
def create_vector_matrix(filepath, word_index, embedding_dim):
    # Calculate the size of the vocabulary (including UNK token)
    vocab_size = len(word_index)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Load GloVe vectors from the file
    glove_embeddings = {}

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            glove_embeddings[word] = np.array(vector, dtype=np.float32)

    # Create the embedding matrix
    for word, idx in word_index.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            # Set the embedding to zero if word is OOV (including <UNK>)
            embedding_matrix[idx] = np.zeros(embedding_dim)

    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Example usage with GloVe 300-dimensional embeddings
embedding_dim = 300
embedding_matrix = create_vector_matrix('glove.6B.300d.txt', tokenizer.word_index, embedding_dim)

# No. of rows should be equal to vocab size, no. of columns should be equal to vector dimension
print("Embedding Matrix Shape=> ", embedding_matrix.shape)

# Print the dense vector for the UNK token
unk_idx = tokenizer.word_index["<UNK>"]
print("Dense vector for UNK token is => ", embedding_matrix[unk_idx])

Embedding Matrix Shape=>  torch.Size([17453, 300])
Dense vector for UNK token is =>  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [9]:
hidden_dim = 64
embedding_dim = 300
vocab_size = len(tokenizer.word_index)
output_dim = 1
batch_size = 64

no_epochs = 50
lr = 0.001
patience = 3

# Add early stopping
early_stopper = EarlyStopping(monitor='val_accuracy', mode='max', patience=patience, restore_best_weights=True)

In [10]:
# Simple RNN Model
simple_rnn = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_shape=(max_seq_len,),
              weights=[embedding_matrix],
              mask_zero=True,
              trainable=False),
    SimpleRNN(units=hidden_dim), # RNN Layer
    Dense(output_dim, activation='sigmoid') # 1 dense (FC) layer for binary classification
])

simple_rnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

simple_rnn.summary()

  super().__init__(**kwargs)


In [11]:
history = simple_rnn.fit(raw_train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=batch_size,
                         validation_data=(raw_val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50


I0000 00:00:1731116376.359184      95 service.cc:145] XLA service 0x7ef5e4015a10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731116376.359233      95 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 18/134[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 10ms/step - accuracy: 0.5014 - loss: 0.7497

I0000 00:00:1731116377.374956      95 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.5255 - loss: 0.7161 - val_accuracy: 0.5469 - val_loss: 0.6885
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6134 - loss: 0.6535 - val_accuracy: 0.5844 - val_loss: 0.6750
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6538 - loss: 0.6198 - val_accuracy: 0.5535 - val_loss: 0.7051
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6702 - loss: 0.6040 - val_accuracy: 0.5779 - val_loss: 0.7045
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6915 - loss: 0.5771 - val_accuracy: 0.5872 - val_loss: 0.7034
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7143 - loss: 0.5490 - val_accuracy: 0.5741 - val_loss: 0.7391
Epoch 7/50
[1m134/134[0m [32m━

In [12]:
test_loss, test_accuracy = simple_rnn.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7401 - loss: 0.5741
Test Loss: 0.7293897867202759
Test Accuracy: 58.07%


In [13]:
# https://www.tensorflow.org/tutorials/keras/keras_tuner
# Hyperparameter tuning
def build_model(hp):
    model = Sequential()

    # Embedding layer (fixed)
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        input_length=max_seq_len,
                        weights=[embedding_matrix],
                        mask_zero=True,
                        trainable=False))

    # RNN layer (Tune between 64 and 256 neurons)
    hidden_dim = hp.Choice('hidden_dim', [64, 128, 256]) # Tune between 64 and 256 neurons
    model.add(SimpleRNN(units=hidden_dim, return_sequences=False))

    # Tuning the number of FC (dense) layers
    for i in range(hp.Int('num_layers', 0, 3)):  # Tune between 0 and 3 FC layers
        units = hp.Choice(f'units_{i}', [32, 64, 128, 256]) # Tune between 32 and 256 neurons
        model.add(Dense(units, activation='relu'))
        
    # Output layer
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    # Tune the learning rate for the optimizer
    learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])  # Tune learning rate
    optimizer = hp.Choice('optimizer', ['adam', 'SGD'])
    
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer == 'SGD':
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
        
    # Compile the model, we decided to use Adam optimizer
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Define the BayesianOptimization tuner
tuner = kt.BayesianOptimization(
    build_model,  # The model-building function
    objective='val_accuracy',  # Optimize for validation accuracy
    max_trials=20,
    num_initial_points=5,
    directory='/content/',
    project_name='300d_test')

early_stopper = EarlyStopping(monitor='val_accuracy', mode='max', patience=patience, restore_best_weights=True)

# Perform the search for best hyperparameters
tuner.search(raw_train_padded, train_labels,
             epochs=20,
             batch_size=tuner.oracle.hyperparameters.Choice('batch_size', [32, 64, 128]),  # Tune batch size
             validation_data=(raw_val_padded, val_labels),
             callbacks=[early_stopper])

Trial 20 Complete [00h 00m 18s]
val_accuracy: 0.5112570524215698

Best val_accuracy So Far: 0.6153846383094788
Total elapsed time: 00h 12m 06s


In [15]:
# Get best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

for key in best_hps.values:
  print(key, ":", best_hps.get(key))

hidden_dim : 256
num_layers : 2
learning_rate : 0.0001
optimizer : adam
batch_size : 64
units_0 : 32
units_1 : 32


In [16]:
# Build model with best hyperparameters found and train it on data for 50 epochs
model = tuner.hypermodel.build(best_hps)

early_stopper = EarlyStopping(monitor='val_accuracy', mode='max', patience=patience, restore_best_weights=True)

history = model.fit(raw_train_padded, train_labels,
               epochs=no_epochs,
               batch_size=best_hps.get('batch_size'),
               validation_data=(raw_val_padded, val_labels),
               callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.4987 - loss: 0.7031 - val_accuracy: 0.5291 - val_loss: 0.6904
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5850 - loss: 0.6739 - val_accuracy: 0.5441 - val_loss: 0.6838
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6344 - loss: 0.6467 - val_accuracy: 0.5610 - val_loss: 0.6844
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6707 - loss: 0.6171 - val_accuracy: 0.5675 - val_loss: 0.6903
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6981 - loss: 0.5897 - val_accuracy: 0.5694 - val_loss: 0.6964
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7210 - loss: 0.5623 - val_accuracy: 0.5685 - val_loss: 0.7124
Epoch 7/50
[1m134/134

In [17]:
# Evaluate best model
test_loss, test_accuracy = model.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6275 - loss: 0.7654
Test Loss: 0.9134982824325562
Test Accuracy: 58.35%


## With Average Pooling

In [18]:
mean_pool_optimizer = None
if best_hps.get('optimizer') == 'adam':
    mean_pool_optimizer = tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate'))
elif best_hps.get('optimizer') == 'SGD':
    mean_pool_optimizer = tf.keras.optimizers.SGD(learning_rate=best_hps.get('learning_rate'))

In [19]:
# Average pooling
# Average hidden states of all words in the sentence to represent sentence

# Create model with optimal hyperparameters
rnn_mean_pooling = Sequential()
rnn_mean_pooling.add(Embedding(input_dim=vocab_size,
          output_dim=embedding_dim,
          input_shape=(max_seq_len,),
          weights=[embedding_matrix],
          mask_zero=True,
          trainable=False))

rnn_mean_pooling.add(SimpleRNN(units=best_hps.get('hidden_dim'), return_sequences=True))

rnn_mean_pooling.add(GlobalAveragePooling1D())  # Average pooling over the time dimension

for i in range(best_hps.get('num_layers')):
    rnn_mean_pooling.add(Dense(best_hps.get(f'units_{i}'), activation='relu'))

rnn_mean_pooling.add(Dense(output_dim, activation='sigmoid')) # 1 dense (FC) layer for binary classification
    
rnn_mean_pooling.compile(loss="binary_crossentropy", optimizer=mean_pool_optimizer, metrics=["accuracy"])
rnn_mean_pooling.summary()

  super().__init__(**kwargs)


In [20]:
early_stopper = EarlyStopping(monitor='val_accuracy', patience=patience, restore_best_weights=True)

mean_pooling_history = rnn_mean_pooling.fit(
                         raw_train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=best_hps.get('batch_size'),
                         validation_data=(raw_val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.5333 - loss: 0.6888 - val_accuracy: 0.5544 - val_loss: nan
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5929 - loss: 0.6669 - val_accuracy: 0.5797 - val_loss: nan
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6212 - loss: 0.6477 - val_accuracy: 0.5807 - val_loss: nan
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6409 - loss: 0.6335 - val_accuracy: 0.5788 - val_loss: nan
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6582 - loss: 0.6211 - val_accuracy: 0.5769 - val_loss: nan
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6704 - loss: 0.6095 - val_accuracy: 0.5957 - val_loss: nan
Epoch 7/50
[1m134/134[0m [32m━━━━━━━━

In [21]:
test_loss, test_accuracy = rnn_mean_pooling.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6000 - loss: 0.6709
Test Loss: 0.6774390339851379
Test Accuracy: 58.82%


## With Max Pooling

In [22]:
max_pool_optimizer = None
if best_hps.get('optimizer') == 'adam':
    max_pool_optimizer = tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate'))
elif best_hps.get('optimizer') == 'SGD':
    max_pool_optimizer = tf.keras.optimizers.SGD(learning_rate=best_hps.get('learning_rate'))

In [23]:
# Max pooling
# Take Max of hidden states of all words in the sentence to represent sentence

# Create model with optimal hyperparameters
rnn_max_pooling = Sequential()
rnn_max_pooling.add(Embedding(input_dim=vocab_size,
          output_dim=embedding_dim,
          input_shape=(max_seq_len,),
          weights=[embedding_matrix],
          mask_zero=True,
          trainable=False))

rnn_max_pooling.add(SimpleRNN(units=best_hps.get('hidden_dim'), return_sequences=True))

rnn_max_pooling.add(GlobalMaxPooling1D())  # Max pooling over the time dimension

for i in range(best_hps.get('num_layers')):
    rnn_max_pooling.add(Dense(best_hps.get(f'units_{i}'), activation='relu'))

rnn_max_pooling.add(Dense(output_dim, activation='sigmoid')) # 1 dense (FC) layer for binary classification

rnn_max_pooling.compile(loss="binary_crossentropy", optimizer=max_pool_optimizer, metrics=["accuracy"])
rnn_max_pooling.summary()



In [24]:
early_stopper = EarlyStopping(monitor='val_accuracy', patience=patience, restore_best_weights=True)

max_pooling_history = rnn_max_pooling.fit(
                         raw_train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=best_hps.get('batch_size'),
                         validation_data=(raw_val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.5211 - loss: 0.6930 - val_accuracy: 0.5338 - val_loss: 0.6885
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6321 - loss: 0.6645 - val_accuracy: 0.5544 - val_loss: 0.6832
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6700 - loss: 0.6381 - val_accuracy: 0.5750 - val_loss: 0.6770
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6966 - loss: 0.6038 - val_accuracy: 0.5910 - val_loss: 0.6751
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7343 - loss: 0.5627 - val_accuracy: 0.5976 - val_loss: 0.6774
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7776 - loss: 0.5171 - val_accuracy: 0.6023 - val_loss: 0.6809
Epoch 7/50
[1m134/134

In [25]:
test_loss, test_accuracy = rnn_max_pooling.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.6587 - loss: 0.6229
Test Loss: 0.6627724766731262
Test Accuracy: 61.07%


## With Attention

In [26]:
# Attention

class Attention_2(Layer):
    def init(self):
        super(Attention_2, self).init()

    def build(self, input_shape):
        # Define trainable weight parameters for attention
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1],), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1],), initializer="zeros")
        super(Attention_2, self).build(input_shape)

    def call(self, x):
        # Compute attention scores
        e = tf.keras.activations.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        a = tf.keras.activations.softmax(e, axis=1)

        # Multiply each time step by its attention score
        output = x * tf.expand_dims(a, -1)
        return output

In [27]:
attention_optimizer = None
if best_hps.get('optimizer') == 'adam':
    attention_optimizer = tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate'))
elif best_hps.get('optimizer') == 'SGD':
    attention_optimizer = tf.keras.optimizers.SGD(learning_rate=best_hps.get('learning_rate'))

In [28]:
rnn_attention_pooling = Sequential()
rnn_attention_pooling.add(Embedding(input_dim=vocab_size,
          output_dim=embedding_dim,
          input_shape=(max_seq_len,),
          weights=[embedding_matrix],
          mask_zero=True,
          trainable=False))

rnn_attention_pooling.add(SimpleRNN(units=best_hps.get('hidden_dim'), return_sequences=True))

rnn_attention_pooling.add(Attention_2())

rnn_attention_pooling.add(GlobalMaxPooling1D())

for i in range(best_hps.get('num_layers')):
    rnn_attention_pooling.add(Dense(best_hps.get(f'units_{i}'), activation='relu'))
    
rnn_attention_pooling.add(Dense(output_dim, activation='sigmoid')) # 1 dense (FC) layer for binary classification

# Compile the model
rnn_attention_pooling.compile(loss="binary_crossentropy",
                      optimizer=attention_optimizer,
                      metrics=["accuracy"])

rnn_attention_pooling.summary()



In [29]:
early_stopper = EarlyStopping(monitor='val_accuracy', patience=patience, restore_best_weights=True)

attention_history = rnn_attention_pooling.fit(
                         raw_train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=best_hps.get('batch_size'),
                         validation_data=(raw_val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.5122 - loss: 0.6931 - val_accuracy: 0.5394 - val_loss: 0.6930
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5323 - loss: 0.6927 - val_accuracy: 0.5741 - val_loss: 0.6918
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5705 - loss: 0.6897 - val_accuracy: 0.5769 - val_loss: 0.6856
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5946 - loss: 0.6800 - val_accuracy: 0.5657 - val_loss: 0.6832
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6116 - loss: 0.6676 - val_accuracy: 0.5713 - val_loss: 0.6827
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6254 - loss: 0.6519 - val_accuracy: 0.5769 - val_loss: 0.6814


In [30]:
test_loss, test_accuracy = rnn_attention_pooling.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5832 - loss: 0.6960
Test Loss: 0.6869640350341797
Test Accuracy: 57.13%


# RNN with raw unprocessed data and unfrozen embedding layer

In [31]:
optimizer = None
if best_hps.get('optimizer') == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate'))
elif best_hps.get('optimizer') == 'SGD':
    optimizer = tf.keras.optimizers.SGD(learning_rate=best_hps.get('learning_rate'))

In [32]:
best_model_bef = Sequential()
best_model_bef.add(Embedding(input_dim=vocab_size,
          output_dim=embedding_dim,
          input_shape=(max_seq_len,),
          weights=[embedding_matrix],
          mask_zero=True,
          trainable=True))

best_model_bef.add(SimpleRNN(units=best_hps.get('hidden_dim'), return_sequences=True))

best_model_bef.add(GlobalMaxPooling1D())  # Max pooling over the time dimension

for i in range(best_hps.get('num_layers')):
    best_model_bef.add(Dense(best_hps.get(f'units_{i}'), activation='relu'))

best_model_bef.add(Dense(output_dim, activation='sigmoid')) # 1 dense (FC) layer for binary classification

best_model_bef.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
best_model_bef.summary()



In [33]:
early_stopper = EarlyStopping(monitor='val_accuracy', patience=patience, restore_best_weights=True)

best_model_bef_history = best_model_bef.fit(raw_train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=best_hps.get('batch_size'),
                         validation_data=(raw_val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.5044 - loss: 0.6938 - val_accuracy: 0.5600 - val_loss: 0.6885
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.6208 - loss: 0.6727 - val_accuracy: 0.5835 - val_loss: 0.6781
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6589 - loss: 0.6389 - val_accuracy: 0.6051 - val_loss: 0.6669
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7089 - loss: 0.5866 - val_accuracy: 0.6313 - val_loss: 0.6585
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7649 - loss: 0.5156 - val_accuracy: 0.6266 - val_loss: 0.6768
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8155 - loss: 0.4389 - val_accuracy: 0.6473 - val_loss: 0.6936
Epoch 7/50
[1m134/134

In [34]:
test_loss, test_accuracy = best_model_bef.evaluate(raw_test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7298 - loss: 0.8833
Test Loss: 0.9292657971382141
Test Accuracy: 71.11%


# Cleaning Training Text

In [35]:
def expand_contractions(text):
    return contractions.fix(text)

In [36]:
def re_manip(text):
    #remove apostrophe and punctuations
    text = re.sub(r"'\S*", "", text)
    text = re.sub(r'[^\w\s]', ' ', text)

    #remove numbers
    text = re.sub(r'\b\d+(st|nd|rd|th)?\b', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [37]:
### INTERMEDIARY STEP TO NOT OVERUSE TRANSLATION API
cleaned_train_text = [re_manip(expand_contractions(text)) for text in train_text]

tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

In [38]:
language_locale_map = {
    "es": "es-ES",
    "fr": "fr-FR",
    "de": "de-DE",
    "it": "it-IT",
    "pt": "pt-PT",
    "ru": "ru-RU",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
    "ar": "ar-SA"
}

In [39]:
spell = SpellChecker(distance=3)

def correct_and_translate(word):
    spell.distance = 3
    corrected_word = spell.correction(word) or word
    try:
        lang = detect(corrected_word)
        if lang in language_locale_map and lang != 'en':
            source_locale = language_locale_map[lang]
            translator = MyMemoryTranslator(source=source_locale, target='en-US')
            corrected_word = translator.translate(corrected_word)

    except Exception as e:
        print(f"Error translating word '{corrected_word}': {e}")

    return corrected_word

In [40]:
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker(distance=3)

def lemmatize_with_handling(text, oov_words):
    doc = nlp(text)
    lemmatized_words = []

    for token in doc:
        if token.text in oov_words:
            lemmatized_words.append(correct_and_translate(token.text))
        else:
            spell.distance = 2
            lemma = token.lemma_
            if token.lemma_ not in glove_vocab:
                lemma = spell.correction(lemma) or lemma

            lemmatized_words.append(lemma)

    return " ".join(lemmatized_words)

cleaned_train_text = [lemmatize_with_handling(text, oov_words) for text in cleaned_train_text]

print("Before:", train_text[0])
print("After:", cleaned_train_text[0])

Before: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
After: the rock be destine to be the century new conan and that he be go to make a splash even great than arnold schwarzenegger jean claud van damme or steven segal


In [41]:
### FUNCTION PURELY FOR TEST/VAL DATA
def clean_words(texts):
    cleaned_text = [re_manip(expand_contractions(text)) for text in texts]
    tokenizer.fit_on_texts(cleaned_text)  

    # Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
    oov_word = []

    for word in tokenizer.word_index:
      if word not in glove_vocab:
        oov_word.append(word)

    finalised_text = [lemmatize_with_handling(text, oov_word) for text in cleaned_text]

    return finalised_text

In [42]:
# Clean test/validation sets
test_text = test_dataset['text']
val_text = validation_dataset['text']
cleaned_test_text = clean_words(test_text)
cleaned_val_text = clean_words(val_text)

Assigning 'UNK' Label on Cleaned Training Text

In [43]:
# Initialise the tokeniser
tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []

for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Out-of-vocabulary words:", oov_words)
print("Number of OOV words:", len(oov_words))

Out-of-vocabulary words: ['<UNK>', 'exhilarate', 'engross', 'moviemake', 'stylize', 'unclinch', 'windtalker', 'clockstopper', 'enthral', 'waydowntown', 'ozpetek', 'everlaste', 'spellbind', 'kosashvili', 'tambac', 'swinge', 'feardotcom', 'overstuff', 'deprave', 'watstein', 'appal', 'breathtake', 'throe', 'rollick', 'tatter', 'runteldat', 'misconceive', 'cheerly', 'janklowicz', 'frissons', 'infatuate', 'roteirista', 'stupefy', 'groundbreake', 'cliffsnote', 'wertmull', 'wisegirls', 'enrapturing', 'originalidad', 'overpraise', 'syncopate', 'obviation', 'gorefests', 'makmalbaf', 'shapelessly', 'mullinski', 'narcotizing', 'sparklingly', 'dreyfu', 'nonthreatene', 'dominatrixes', 'denlopp', 'sappier', 'sorprenderá', 'sarcastic\xa0', 'powaqqatsi', 'kaputschnik', 'monkeyfun', 'bierbichler', 'unindicte', 'datedness', 'inhospitality', 'hastier', 'existência', 'inquestionável', 'hotdogging', 'sogginess', 'stuffiest', 'limewater', 'muckrake', 'premiss', 'mergulha', 'culminant', 'desfecho', 'lentamen

# RNN with processed data and unfrozen embedding layer

In [44]:
# Convert texts to sequences (words to the word index in tokenizer)
train_sequences = tokenizer.texts_to_sequences(cleaned_train_text)

# Pad the sequences (ensure they are all the same length)
max_seq_len = 100
tokenizer.word_index['<PAD>'] = 0
train_padded = pad_sequences(train_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])

print("Before:", train_sequences[0])
print("After:", list(train_padded[0]))

Before: [2, 579, 3, 2602, 7, 3, 2, 748, 98, 4251, 5, 10, 61, 3, 71, 7, 25, 4, 2291, 60, 113, 34, 1349, 1854, 1855, 7481, 1439, 5371, 41, 834, 7482]
After: [2, 579, 3, 2602, 7, 3, 2, 748, 98, 4251, 5, 10, 61, 3, 71, 7, 25, 4, 2291, 60, 113, 34, 1349, 1854, 1855, 7481, 1439, 5371, 41, 834, 7482, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [45]:
# Prepare inputs for model
# Convert validation and test texts to sequences
val_sequences = tokenizer.texts_to_sequences(cleaned_val_text)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_text)

# Pad sequences
val_padded = pad_sequences(val_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])
test_padded = pad_sequences(test_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])

# Labels
train_labels = np.array(train_dataset['label'])
val_labels = np.array(validation_dataset['label'])
test_labels = np.array(test_dataset['label'])

In [46]:
optimizer = None
if best_hps.get('optimizer') == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate'))
elif best_hps.get('optimizer') == 'SGD':
    optimizer = tf.keras.optimizers.SGD(learning_rate=best_hps.get('learning_rate'))

In [47]:
# Build model with best hyperparameters found and train it on data for 50 epochs
# Create model with optimal hyperparameters
best_model = Sequential()
best_model.add(Embedding(input_dim=vocab_size,
          output_dim=embedding_dim,
          input_shape=(max_seq_len,),
          weights=[embedding_matrix],
          mask_zero=True,
          trainable=True))

best_model.add(SimpleRNN(units=best_hps.get('hidden_dim'), return_sequences=True))

best_model.add(GlobalMaxPooling1D())  # Max pooling over the time dimension

for i in range(best_hps.get('num_layers')):
    best_model.add(Dense(best_hps.get(f'units_{i}'), activation='relu'))

best_model.add(Dense(output_dim, activation='sigmoid')) # 1 dense (FC) layer for binary classification

best_model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
best_model.summary()



In [48]:
early_stopper = EarlyStopping(monitor='val_accuracy', patience=patience, restore_best_weights=True)

best_model_history = best_model.fit(train_padded, train_labels,
                         epochs=no_epochs,
                         batch_size=best_hps.get('batch_size'),
                         validation_data=(val_padded, val_labels),
                         callbacks=[early_stopper])

Epoch 1/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.5104 - loss: 0.7177 - val_accuracy: 0.5441 - val_loss: 0.6869
Epoch 2/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6168 - loss: 0.6728 - val_accuracy: 0.5844 - val_loss: 0.6762
Epoch 3/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6679 - loss: 0.6448 - val_accuracy: 0.6116 - val_loss: 0.6599
Epoch 4/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7123 - loss: 0.6030 - val_accuracy: 0.6276 - val_loss: 0.6441
Epoch 5/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7547 - loss: 0.5492 - val_accuracy: 0.6238 - val_loss: 0.6463
Epoch 6/50
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7997 - loss: 0.4857 - val_accuracy: 0.6379 - val_loss: 0.6328
Epoch 7/50
[1m134/134

In [49]:
test_loss, test_accuracy = best_model.evaluate(test_padded, test_labels)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7632 - loss: 0.5368
Test Loss: 0.6985761523246765
Test Accuracy: 71.20%
