In [2]:
!pip install datasets keras-tuner contractions pyspellchecker langdetect deep-translator gensim
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, GRU, Layer, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import re
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

from deep_translator import MyMemoryTranslator
from langdetect import detect
import spacy
import contractions
import nltk
from spellchecker import SpellChecker
import random

# Set a seed value
seed_value = 42

random.seed(seed_value)
# For NumPy
np.random.seed(seed_value)

# For TensorFlow
tf.random.set_seed(seed_value)

# For PyTorch
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datas

# Dataset Preparation


In [3]:
# Load the dataset
dataset = load_dataset("rotten_tomatoes")

# Split the dataset into training, validation, and test sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

#Retreiving Word Embeddings

In [4]:
# Download glove and unzip it in Notebook.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2024-11-07 09:19:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-07 09:19:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-07 09:19:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

#Data Exploration & Preprocessing

In [5]:
train_text = train_dataset['text']

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Number of unique words in the train dataset
vocab_size = len(tokenizer.word_index)
print("Vocabulary size of train data:", vocab_size)

Vocabulary size of train data: 17451


In [6]:
def load_glove_vocab(filepath):
    glove_vocab = set()
    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            glove_vocab.add(word)
    return glove_vocab

# Load Glove words into a set
glove_vocab = load_glove_vocab('glove.6B.300d.txt')

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Size of Glove Vocabulary:", len(glove_vocab))
print("Out-Of-Vocabulary Words:", oov_words)
print("Number of OOV words:", len(oov_words))

Size of Glove Vocabulary: 400000
Out-Of-Vocabulary Words: ["it's", "doesn't", "there's", "that's", "isn't", "don't", "can't", "film's", "you're", "you'll", "he's", "movie's", "won't", "what's", "you've", "i'm", "didn't", "they're", "year's", '\x96', "you'd", "aren't", "i've", "we've", "couldn't", "she's", "man's", "we're", "wasn't", "i'd", '\x97', "who's", "director's", "haven't", "here's", "story's", "characters'", "wouldn't", "i'll", "'the", "woman's", "hasn't", "world's", "filmmaker's", "children's", "moore's", "one's", "soderbergh's", "america's", "disney's", "shouldn't", "ain't", "character's", "cinema's", "women's", "cho's", "hoffman's", "kids'", "today's", "amy's", "wilde's", "life's", "they'll", "emperor's", "sandler's", "scorsese's", "allen's", "it'll", "filmmakers'", 'cletis', "carvey's", "let's", "stevenson's", 'waydowntown', "polanski's", "he'd", "weren't", "lee's", "everyone's", "parker's", "woo's", "'i", "greene's", "2002's", "actor's", "bullock's", "writer's", "shakespea

#Cleaning Training Text

In [7]:
def expand_contractions(text):
    return contractions.fix(text)

In [8]:
def re_manip(text):
    #remove apostrophe and punctuations
    text = re.sub(r"'\S*", "", text)
    text = re.sub(r'[^\w\s]', ' ', text)

    #remove numbers
    text = re.sub(r'\b\d+(st|nd|rd|th)?\b', '', text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [9]:
### INTERMEDIARY STEP TO NOT OVERUSE TRANSLATION API
cleaned_train_text = [re_manip(expand_contractions(text)) for text in train_text]

tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

In [10]:
language_locale_map = {
    "es": "es-ES",
    "fr": "fr-FR",
    "de": "de-DE",
    "it": "it-IT",
    "pt": "pt-PT",
    "ru": "ru-RU",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
    "ar": "ar-SA"
}

In [11]:
spell = SpellChecker(distance=3)
def correct_and_translate(word):
    spell.distance = 3
    corrected_word = spell.correction(word) or word
    try:
        lang = detect(corrected_word)
        if lang in language_locale_map and lang != 'en':
            source_locale = language_locale_map[lang]
            translator = MyMemoryTranslator(source=source_locale, target='en-US')
            corrected_word = translator.translate(corrected_word)
    except Exception as e:
        print(f"Error translating word '{corrected_word}': {e}")

    return corrected_word

In [12]:
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker(distance=3)
def lemmatize_with_handling(text, oov_words):
    doc = nlp(text)
    lemmatized_words = []
    for token in doc:
        if token.text in oov_words:
            lemmatized_words.append(correct_and_translate(token.text))
        else:
            spell.distance = 2
            lemma = token.lemma_
            if token.lemma_ not in glove_vocab:
                lemma = spell.correction(lemma) or lemma
            lemmatized_words.append(lemma)
    return " ".join(lemmatized_words)

cleaned_train_text = [lemmatize_with_handling(text, oov_words) for text in cleaned_train_text]

print("Before:", train_text[0])
print("After:", cleaned_train_text[0])

Before: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
After: the rock be destine to be the century new conan and that he be go to make a splash even great than arnold schwarzenegger jean claud van damme or steven segal


In [13]:
### FUNCTION PURELY FOR TEST/VAL DATA
def clean_words(texts):
    cleaned_text = [re_manip(expand_contractions(text)) for text in texts]
    tokenizer.fit_on_texts(cleaned_text)
    # Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
    oov_word = []
    for word in tokenizer.word_index:
      if word not in glove_vocab:
        oov_word.append(word)
    finalised_text = [lemmatize_with_handling(text, oov_word) for text in cleaned_text]
    return finalised_text

In [14]:
#Clean test/validation sets
test_text = test_dataset['text']
val_text = validation_dataset['text']
cleaned_test_text = clean_words(test_text)
cleaned_val_text = clean_words(val_text)

#Assigning "UNK" Label

In [15]:
# Initialise the tokeniser
tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Out-of-vocabulary words:", oov_words)
print("Number of OOV words:", len(oov_words))

Out-of-vocabulary words: ['<UNK>', 'exhilarate', 'engross', 'moviemake', 'stylize', 'unclinch', 'windtalker', 'clockstopper', 'enthral', 'waydowntown', 'ozpetek', 'everlaste', 'spellbind', 'kosashvili', 'tambac', 'swinge', 'feardotcom', 'overstuff', 'deprave', 'watstein', 'appal', 'breathtake', 'throe', 'rollick', 'tatter', 'runteldat', 'misconceive', 'cheerly', 'janklowicz', 'frissons', 'infatuate', 'roteirista', 'stupefy', 'groundbreake', 'cliffsnote', 'wertmull', 'wisegirls', 'enrapturing', 'originalidad', 'overpraise', 'syncopate', 'obviation', 'gorefests', 'makmalbaf', 'shapelessly', 'mullinski', 'narcotizing', 'sparklingly', 'dreyfu', 'nonthreatene', 'dominatrixes', 'denlopp', 'sappier', 'visualmente', 'sorprenderá', 'sarcastic\xa0', 'powaqqatsi', 'kaputschnik', 'monkeyfun', 'bierbichler', 'unindicte', 'datedness', 'inhospitality', 'hastier', 'existência', 'inquestionável', 'hotdogging', 'sogginess', 'stuffiest', 'limewater', 'muckrake', 'premiss', 'mergulha', 'culminant', 'desfe

In [16]:
# Convert texts to sequences (words to the word index in tokenizer)
train_sequences = tokenizer.texts_to_sequences(cleaned_train_text)

# Pad the sequences (ensure they are all the same length)
max_seq_len = 100
tokenizer.word_index['<PAD>'] = 0
train_padded = pad_sequences(train_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])

print("Before:", train_sequences[0])
print("After:", list(train_padded[0]))

Before: [2, 579, 3, 2602, 7, 3, 2, 748, 98, 4252, 5, 10, 61, 3, 71, 7, 25, 4, 2291, 60, 113, 34, 1349, 1854, 1855, 7481, 1439, 5371, 41, 834, 7482]
After: [2, 579, 3, 2602, 7, 3, 2, 748, 98, 4252, 5, 10, 61, 3, 71, 7, 25, 4, 2291, 60, 113, 34, 1349, 1854, 1855, 7481, 1439, 5371, 41, 834, 7482, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
# Prepare inputs for model
# Convert validation and test texts to sequences
val_sequences = tokenizer.texts_to_sequences(cleaned_val_text)
test_sequences = tokenizer.texts_to_sequences(cleaned_test_text)

# Pad sequences
val_padded = pad_sequences(val_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])
test_padded = pad_sequences(test_sequences, maxlen=max_seq_len, padding='post', value=tokenizer.word_index['<PAD>'])

# Labels
train_labels = np.array(train_dataset['label'])
val_labels = np.array(validation_dataset['label'])
test_labels = np.array(test_dataset['label'])

In [18]:
# word_index refers to the vocab in training data?
print(tokenizer.word_index)
print(len(tokenizer.word_index))

13258


#Creating input embeddings

In [19]:
def create_vector_matrix(filepath, word_index, embedding_dim):
    # Calculate the size of the vocabulary (including UNK token)
    vocab_size = len(word_index)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Load GloVe vectors from the file
    glove_embeddings = {}
    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            glove_embeddings[word] = np.array(vector, dtype=np.float32)

    # Create the embedding matrix
    for word, idx in word_index.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            # Set the embedding to zero if word is OOV (including <UNK>)
            embedding_matrix[idx] = np.zeros(embedding_dim)


    return torch.tensor(embedding_matrix, dtype=torch.float32)


# Example usage with GloVe 100-dimensional embeddings
embedding_dim = 300
embedding_matrix = create_vector_matrix('glove.6B.300d.txt', tokenizer.word_index, embedding_dim)


# No. of rows should be equal to vocab size, no. of columns should be equal to vector dimension
print("Embedding Matrix Shape=> ", embedding_matrix.shape)

# Print the dense vector for the UNK token
unk_idx = tokenizer.word_index["<UNK>"]
print("Dense vector for UNK token is => ", embedding_matrix[unk_idx])

Embedding Matrix Shape=>  torch.Size([13258, 300])
Dense vector for UNK token is =>  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

#Setting up CNN model

In [23]:
from tensorflow.keras import regularizers
#Setting up CNN model

vocab_size = len(tokenizer.word_index)

modelCNN = Sequential()
modelCNN.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    input_shape=(max_seq_len,),
                    weights=[embedding_matrix],
                    trainable=True,
                    embeddings_regularizer = regularizers.l2(0.0005)))

modelCNN.add(tf.keras.layers.Conv1D(128,3, activation='relu',\
                                 kernel_regularizer = regularizers.l2(0.0005),\
                                 bias_regularizer = regularizers.l2(0.0005))) #Using 128 filters and kernel size of 3 for a trigram model

modelCNN.add(tf.keras.layers.GlobalMaxPooling1D())

modelCNN.add(tf.keras.layers.Dropout(0.5))

modelCNN.add(tf.keras.layers.Dense(1, activation='sigmoid'))

modelCNN.summary()

In [24]:
#early_stopper = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
modelCNN.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"]
)
history=modelCNN.fit(train_padded, train_labels, validation_data=(val_padded, val_labels),
          epochs=30, batch_size=32)

lossCNN, accuracyCNN = modelCNN.evaluate(test_padded, test_labels)
print(f"Test Loss: {lossCNN}, Test Accuracy: {accuracyCNN}")

Epoch 1/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.6176 - loss: 202.8838 - val_accuracy: 0.7533 - val_loss: 88.8823
Epoch 2/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7563 - loss: 70.6059 - val_accuracy: 0.7636 - val_loss: 32.9813
Epoch 3/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7907 - loss: 26.5953 - val_accuracy: 0.7495 - val_loss: 13.3085
Epoch 4/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8066 - loss: 10.9098 - val_accuracy: 0.7514 - val_loss: 6.0110
Epoch 5/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8339 - loss: 4.9961 - val_accuracy: 0.7458 - val_loss: 3.1498
Epoch 6/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8532 - loss: 2.6087 - val_accuracy: 0.7411 - val_loss: 1.9588
Epoch 7/30
[1m267/