In [2]:
import numpy as np
import torch
import torch.nn as nn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, GRU, Layer, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import re
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

from deep_translator import MyMemoryTranslator
from langdetect import detect
import spacy
import contractions
import nltk
from spellchecker import SpellChecker
import random

# Set a seed value
seed_value = 42

random.seed(seed_value)
# For NumPy
np.random.seed(seed_value)

# For TensorFlow
tf.random.set_seed(seed_value)

# For PyTorch
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
nltk.download('words')
from nltk.corpus import words

[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Dataset Preparation

In [4]:
# Load the dataset
dataset = load_dataset("rotten_tomatoes")

# Split the dataset into training, validation, and test sets
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

# Generate Word Embeddings

In [5]:
# Download glove and unzip it in Notebook.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2024-11-05 02:23:15--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-05 02:23:15--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-05 02:23:15--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: 'glove

# Data Exploration & Preprocessing

In [6]:
train_text = train_dataset['text']

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Number of unique words in the train dataset
vocab_size = len(tokenizer.word_index)
print("Vocabulary size of train data:", vocab_size)

Vocabulary size of train data: 17451


In [7]:
def load_glove_vocab(filepath):
    glove_vocab = set()
    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            glove_vocab.add(word)
    return glove_vocab

# Load Glove words into a set
glove_vocab = load_glove_vocab('glove.6B.300d.txt')

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Size of Glove Vocabulary:", len(glove_vocab))
print("Out-Of-Vocabulary Words:", oov_words)
print("Number of OOV words:", len(oov_words))

Size of Glove Vocabulary: 400000
Out-Of-Vocabulary Words: ["it's", "doesn't", "there's", "that's", "isn't", "don't", "can't", "film's", "you're", "you'll", "he's", "movie's", "won't", "what's", "you've", "i'm", "didn't", "they're", "year's", '\x96', "you'd", "aren't", "i've", "we've", "couldn't", "she's", "man's", "we're", "wasn't", "i'd", '\x97', "who's", "director's", "haven't", "here's", "story's", "characters'", "wouldn't", "i'll", "'the", "woman's", "hasn't", "world's", "filmmaker's", "children's", "moore's", "one's", "soderbergh's", "america's", "disney's", "shouldn't", "ain't", "character's", "cinema's", "women's", "cho's", "hoffman's", "kids'", "today's", "amy's", "wilde's", "life's", "they'll", "emperor's", "sandler's", "scorsese's", "allen's", "it'll", "filmmakers'", 'cletis', "carvey's", "let's", "stevenson's", 'waydowntown', "polanski's", "he'd", "weren't", "lee's", "everyone's", "parker's", "woo's", "'i", "greene's", "2002's", "actor's", "bullock's", "writer's", "shakespea

Cleaning Training Text

In [8]:
valid_words = set(words.words())

In [9]:
def expand_contractions(text):
    return contractions.fix(text)

cleaned_train_text = [expand_contractions(text) for text in train_text]

print("Before:", train_text[0])
print("After:", cleaned_train_text[0])

Before: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
After: the rock is destined to be the 21st century's new " conan " and that he is going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .


In [10]:
# Initialise the tokeniser
tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Out-of-vocabulary words:", oov_words)
print("Number of OOV words:", len(oov_words))

Out-of-vocabulary words: ['<UNK>', "film's", "movie's", "year's", '\x96', "man's", '\x97', "director's", "story's", "characters'", "'the", "woman's", "world's", "filmmaker's", "children's", "moore's", "one's", "soderbergh's", "america's", "disney's", "character's", "cinema's", "women's", "cho's", "hoffman's", "kids'", "today's", "amy's", "wilde's", "life's", "emperor's", "sandler's", "scorsese's", "allen's", "filmmakers'", 'cletis', "'what", "carvey's", "stevenson's", 'waydowntown', "polanski's", "lee's", "parker's", "woo's", "'i", "greene's", "2002's", "actor's", "bullock's", "'it", "writer's", "shakespeare's", "script's", "audience's", 'seldahl', "kaufman's", "dickens'", "mcgrath's", "zhang's", "plot's", "joan's", "chan's", "son's", "grant's", "carpenter's", "girls'", "solondz's", "artist's", 'ozpetek', "'a", "howard's", "rohmer's", "barry's", "child's", "summer's", "nelson's", "spears'", "seagal's", "benigni's", "week's", "jackson's", "'who", "vincent's", "boys'", "earnhart's", "fes

In [11]:
def re_manip(text):
    #remove apostrophe and punctuations
    text = re.sub(r"'\S*", "", text)
    text = re.sub(r'[^\w\s]', ' ', text)
    
    #remove numbers
    text = re.sub(r'\b\d+(st|nd|rd|th)?\b', '', text)
    
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    
cleaned_train_text = [re_manip(expand_contractions(text)) for text in train_text]

print("Before:", train_text[0])
print("After:", cleaned_train_text[0])

Before: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
After: the rock is destined to be the century new conan and that he is going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal


In [12]:
# Initialise the tokeniser
tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(cleaned_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Out-of-vocabulary words:", oov_words)
print("Number of OOV words:", len(oov_words))

Out-of-vocabulary words: ['<UNK>', 'birot', 'cletis', 'waydowntown', 'seldahl', 'ozpetek', 'wollter', 'divertida', 'stultifyingly', 'kosashvili', 'também', 'feardotcom', 'nohe', 'watstein', 'fato', 'montias', 'consegue', 'auteil', 'animé', 'idemoto', 'personagens', 'profundamente', 'runteldat', 'líquido', 'elemento', 'achronological', 'sheerly', 'direção', 'actuada', 'janklowicz', 'frissons', 'roteirista', 'kidlets', 'russos', 'wisegirls', 'enrapturing', 'intelectualmente', 'retadora', 'orquídeas', 'originalidad', 'suspenser', 'obviation', 'gorefests', 'makmalbaf', 'exhilarate', 'nuttgens', 'petin', 'provocatuers', 'jirí', 'hubac', 'shapelessly', 'addessi', 'mullinski', 'narcotizing', 'precollegiate', 'sparklingly', 'superlarge', 'destinees', 'margolo', 'dominatrixes', 'scuzbag', 'idoosyncratic', 'flatula', 'denlopp', 'updatings', 'sappier', 'condensada', 'visualmente', 'entretenida', 'sorprenderá', 'exporing', 'capturou', 'sarcástica', 'demencial', 'predecesora', 'complejos', 'cadness

In [13]:
language_locale_map = {
    "es": "es-ES",
    "fr": "fr-FR",
    "de": "de-DE",
    "it": "it-IT",
    "pt": "pt-PT",
    "ru": "ru-RU",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
    "ar": "ar-SA"
}

In [14]:
spell = SpellChecker(distance=3)
def correct_and_translate(word):
    spell.distance = 3
    corrected_word = spell.correction(word) or word
    try:
        lang = detect(corrected_word)
        if lang in language_locale_map and lang != 'en':
            source_locale = language_locale_map[lang]
            translator = MyMemoryTranslator(source=source_locale, target='en-US')
            corrected_word = translator.translate(corrected_word)
    except Exception as e:
        print(f"Error translating word '{corrected_word}': {e}")
    
    return corrected_word

In [15]:
nlp = spacy.load("en_core_web_sm")
def lemmatize_with_handling(text):
    doc = nlp(text)
    lemmatized_words = []
    for token in doc:
        if token.text in oov_words:
            lemmatized_words.append(correct_and_translate(token.text))
        else:
            spell.distance = 2
            lemma = token.lemma_
            if token.lemma_ not in glove_vocab:
                lemma = spell.correction(lemma) or lemma
            lemmatized_words.append(lemma)
    return " ".join(lemmatized_words)

new_train_text = [lemmatize_with_handling(text) for text in cleaned_train_text]


In [16]:
# Initialise the tokeniser
tokenizer = Tokenizer(oov_token='<UNK>')  # <UNK> for unseen words
tokenizer.fit_on_texts(new_train_text)

# Identify OOV words (Words appeared in training data but not in the GloVe dictionary)
oov_words = []
for word in tokenizer.word_index:
  if word not in glove_vocab:
    oov_words.append(word)

# Print OOV words
print("Out-of-vocabulary words:", oov_words)
print("Number of OOV words:", len(oov_words))

Out-of-vocabulary words: ['<UNK>', 'engross', 'exhilarate', 'stylize', 'unclinch', 'windtalker', 'schwarzenegg', 'clockstopper', 'enthral', 'goodfella', 'waydowntown', 'ozpetek', 'everlaste', 'breathtake', 'spellbind', 'moviemake', 'swinge', 'kosashvili', 'tambac', 'feardotcom', 'watstein', 'appal', 'throe', 'rollick', 'tatter', 'runteldat', 'misconceive', 'cheerly', 'janklowicz', 'frissons', 'infatuate', 'roteirista', 'stupefy', 'cliffsnote', 'deprave', 'wisegirls', 'enrapturing', 'originalidad', 'overpraise', 'syncopate', 'obviation', 'gorefests', 'makmalbaf', 'shapelessly', 'mullinski', 'narcotizing', 'sparklingly', 'nonthreatene', 'dominatrixes', 'denlopp', 'sappier', 'sorprenderá', 'sarcastic\xa0', 'powaqqatsi', 'kaputschnik', 'monkeyfun', 'bierbichler', 'datedness', 'inhospitality', 'hastier', 'existência', 'inquestionável', 'hotdogging', 'sogginess', 'stuffiest', 'limewater', 'premiss', 'mergulha', 'culminant', 'desfecho', 'lentamente', 'soaringly', 'pulpiness', 'haphazardness',