### SUBMITTED BY:  PRANJUL MISHRA,  SAURABH SINGH ,   MRITUNJAY SINGH

### Importing Datasets

Importing Libraries 

In [9]:
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

In [10]:
# Load the IMDB dataset 
ds = load_dataset("stanfordnlp/imdb")

In [11]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [12]:
# Split the dataset into train and test
train_data = ds['train']
test_data = ds['test']

train_texts, train_labels = train_data['text'], train_data['label']
test_texts, test_labels = test_data['text'], test_data['label']

In [13]:
# Helper function to evaluate models
def evaluate_model(y_true, y_pred):
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

results = []

In [14]:
# 1. Word2Vec Embeddings
print("Training with Word2Vec Embeddings...")
word2vec = api.load("word2vec-google-news-300")

def get_average_word2vec(text, model):
    words = text.split()
    vectors = [model[word] for word in words if word in model]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X_train_w2v = np.array([get_average_word2vec(text, word2vec) for text in train_texts])
X_test_w2v = np.array([get_average_word2vec(text, word2vec) for text in test_texts])

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w2v, train_labels)
y_pred_w2v = clf.predict(X_test_w2v)
results.append({'model': 'Word2Vec + Logistic Regression', **evaluate_model(test_labels, y_pred_w2v)})

Training with Word2Vec Embeddings...


In [15]:
# 2. FastText Embeddings
print("Training with FastText Embeddings...")
fasttext = api.load("fasttext-wiki-news-subwords-300")

X_train_ft = np.array([get_average_word2vec(text, fasttext) for text in train_texts])
X_test_ft = np.array([get_average_word2vec(text, fasttext) for text in test_texts])

clf.fit(X_train_ft, train_labels)
y_pred_ft = clf.predict(X_test_ft)
results.append({'model': 'FastText + Logistic Regression', **evaluate_model(test_labels, y_pred_ft)})

Training with FastText Embeddings...


In [16]:
# 3. GloVe Embeddings
print("Training with GloVe Embeddings...")
glove = api.load("glove-wiki-gigaword-300")

X_train_glove = np.array([get_average_word2vec(text, glove) for text in train_texts])
X_test_glove = np.array([get_average_word2vec(text, glove) for text in test_texts])

clf.fit(X_train_glove, train_labels)
y_pred_glove = clf.predict(X_test_glove)
results.append({'model': 'GloVe + Logistic Regression', **evaluate_model(test_labels, y_pred_glove)})

Training with GloVe Embeddings...


In [17]:
# 4. Transformer-based Model (e.g., BERT)
print("Training with BERT...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Training with BERT...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_encodings = tokenize_function(train_data)
test_encodings = tokenize_function(test_data)

In [19]:
# Convert to torch Dataset
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
preds_output = trainer.predict(test_dataset)
y_pred_bert = np.argmax(preds_output.predictions, axis=1)
results.append({'model': 'BERT (DistilBERT) + Fine-Tuning', **evaluate_model(test_labels, y_pred_bert)})

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 16%|█▌        | 500/3125 [1:37:57<7:55:53, 10.88s/it]

{'loss': 0.3783, 'grad_norm': 3.9874985218048096, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.16}


 32%|███▏      | 1000/3125 [3:08:30<6:10:41, 10.47s/it]

{'loss': 0.3024, 'grad_norm': 17.18511199951172, 'learning_rate': 1.3600000000000002e-05, 'epoch': 0.32}


 48%|████▊     | 1500/3125 [4:42:09<4:53:41, 10.84s/it]

{'loss': 0.2866, 'grad_norm': 20.45327377319336, 'learning_rate': 1.04e-05, 'epoch': 0.48}


 64%|██████▍   | 2000/3125 [6:13:20<3:20:28, 10.69s/it]

{'loss': 0.2749, 'grad_norm': 3.9374825954437256, 'learning_rate': 7.2000000000000005e-06, 'epoch': 0.64}


 80%|████████  | 2500/3125 [7:42:19<1:51:28, 10.70s/it]

{'loss': 0.2581, 'grad_norm': 34.679935455322266, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


 96%|█████████▌| 3000/3125 [9:13:11<22:38, 10.87s/it]  

{'loss': 0.2405, 'grad_norm': 16.450645446777344, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.96}


                                                     
100%|██████████| 3125/3125 [13:06:59<00:00, 15.11s/it]


{'eval_loss': 0.2334533929824829, 'eval_runtime': 12699.0245, 'eval_samples_per_second': 1.969, 'eval_steps_per_second': 0.246, 'epoch': 1.0}
{'train_runtime': 47219.8973, 'train_samples_per_second': 0.529, 'train_steps_per_second': 0.066, 'train_loss': 0.2888260266113281, 'epoch': 1.0}


100%|██████████| 3125/3125 [3:17:58<00:00,  3.80s/it]  


In [20]:
# Display Results
import pandas as pd
results_df = pd.DataFrame(results)
print(results_df)

                             model  accuracy  precision   recall        f1
0   Word2Vec + Logistic Regression   0.82768   0.833008  0.81968  0.826290
1   FastText + Logistic Regression   0.77892   0.786413  0.76584  0.775990
2      GloVe + Logistic Regression   0.82340   0.827885  0.81656  0.822184
3  BERT (DistilBERT) + Fine-Tuning   0.92664   0.927941  0.92512  0.926528
