In [1]:
!pip install datasets spacy wordninja contractions
!pip install torch



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import ast
from tqdm import tqdm
from google.colab import drive
from datasets import load_dataset
import os
import re
import requests
import tarfile
import random
import spacy
import wordninja
import contractions
from collections import defaultdict, Counter
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score, v_measure_score, adjusted_rand_score, classification_report
from sklearn.datasets import load_files
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

In [3]:
def build_vocab(df, text_column):
  """
  This function takes in a dataframe and the column where the text is stored.
  It returns a unique set of key-value pairs, where the key is the word, 
  and the value is its unique corresponding numerical value.
  """
  unique_words = {word for text in df[text_column] for word in text.split()}
  return {word: idx for idx, word in enumerate(sorted(unique_words))}

def create_context_target_pairs(df, text_column, window_size = 5):
  """
  This function intends to formulate the context-target pairs across the 
  entire text column of a dataframe, with the window size having a default
  value of 5. It returns a list of tuples, where each tuple is represented 
  by a list of context words, followed by the corresponding target word.
  """
  pairs = []
  for text in df[text_column]:
    tokens = text.split()
    for i, target in enumerate(tokens):
      start = max(0, i - window_size)
      end = min(len(tokens), i + window_size + 1)
      context_words = [tokens[j] for j in range(start, end) if j != i]
      pairs.append((context_words, target))
  return pairs

def encode_pairs(pairs, vocab):
  """
  This function takes in context-target pairs that is produced using 
  create_context_target_pairs, and encodes all the words into its unique
  integer representation, which will be used as input for our model later on.
  """
  encoded_pairs = []
  for context, target in pairs:
    encoded_context = [vocab[word] for word in context]
    encoded_target = vocab[target]
    encoded_pairs.append((encoded_context, encoded_target))
  return encoded_pairs

class CBOWDataset(Dataset):
  def __init__(self, pairs, vocab_size, window_size):
    self.pairs = pairs
    self.vocab_size = vocab_size
    self.window_size = window_size

  def __len__(self):
    return len(self.pairs)

  def __getitem__(self, idx):
    context, target = self.pairs[idx]
    padded = context + [0] * (2 * self.window_size - len(context)) #padding is necessary to ensure that all input tensors have the same dimensions
    return torch.tensor(padded, dtype = torch.long), torch.tensor(target, dtype = torch.long)

class CBOWModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(CBOWModel, self).__init__()
    self.embedding_dim = embedding_dim
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.softmax = nn.Softmax(dim = 1) #use softmax to convert logits into probabilities
    self.fc = nn.Linear(embedding_dim, vocab_size)

  def forward(self, context):
    embedded = self.embeddings(context)  # (batch_size, context_size, embedding_dim)
    avg_embedding = embedded.mean(dim=1)  # (batch_size, embedding_dim)
    out = self.fc(avg_embedding)  # (batch_size, vocab_size)
    return self.softmax(out)

In [4]:
df_quarter = pd.read_csv('/kaggle/input/we-final-quarter/WE_book_corpus_final_dataset_processed_quarter.csv')
df_quarter.head()

Unnamed: 0,text,processed_text
0,"we were all young once , he said .",we be all young once he say
1,it was as she washing her hands in the sink af...,it be as she wash her hand in the sink after u...
2,"for the illiterate soldiers , it is a trip bac...",for the illiterate soldier it be a trip back t...
3,"names , characters , places , brands , media ,...",name character place brand medium and incident...
4,it was all a masquerade .,it be all a masquerade


In [5]:
vocab_quarter = build_vocab(df_quarter, 'processed_text')
pairs_quarter = create_context_target_pairs(df_quarter, 'processed_text', 5)
encoded_pairs_quarter = encode_pairs(pairs_quarter, vocab_quarter)

In [7]:
#use gpu if possible to accelerate the speed of model training and evaluation

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
def calculate_metrics(output, target, top_ks = [3, 5, 10]):
  """
  This function takes in the entire output vector and the true target word,
  and returns accuracies for top-k accuracies where k = 3, 5 and 10 
  respectively. Eg. if the true target word is in the top 10 highest
  probabilities for k = 10, this will count as a true positive towards 
  the computation of accuracy.
  """
  top_k_accuracies = {}
  for k in top_ks:
    _, top_k_preds = output.topk(k, dim = 1)
    top_k_acc = (top_k_preds == target.view(-1, 1)).any(dim = 1).float().mean().item()
    top_k_accuracies[f"top_{k}_accuracy"] = top_k_acc
  return top_k_accuracies

def tune_hyperparams_quarter(param_grid, train_dataset, val_dataset):
  """
  This function takes in the parameter grid that we want to train our model 
  on, along with the respective train and validation datasets. The train and 
  validation loop is nested in this function.
  """     
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

    model = CBOWModel(len(vocab_quarter), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy: {avg_top_10_accuracy:.4f}")

      if abs(best_val_loss - avg_val_loss) < 1e-3:
        print("Early stopping triggered") #early stopping is triggered to prevent misinformation and to improve computational time
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_model_quarter.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

In [14]:
train_pairs_quarter, val_test_pairs_quarter = train_test_split(encoded_pairs_quarter, test_size = 0.70, random_state = 42)
val_pairs_quarter, test_pairs_quarter = train_test_split(val_test_pairs_quarter, test_size = 0.5, random_state = 42)

train_dataset_quarter = CBOWDataset(train_pairs_quarter, len(vocab_quarter), 5)
val_dataset_quarter = CBOWDataset(val_pairs_quarter, len(vocab_quarter), 5)
test_dataset_quarter = CBOWDataset(test_pairs_quarter, len(vocab_quarter), 5)

param_grid = {
    'batch_size': [64, 128],
    'embedding_dim': [50, 100],
    'learning_rate': [0.001, 0.01],
    'num_epochs': [10, 20]
}

param_combs = list(itertools.product(
    param_grid['batch_size'],
    param_grid['embedding_dim'],
    param_grid['learning_rate'],
    param_grid['num_epochs']
))

param_grid_dicts = [
    {'batch_size': bs, 'embedding_dim': ed, 'learning_rate': lr, 'num_epochs': ne}
    for bs, ed, lr, ne in param_combs
]

results_quarter = tune_hyperparams_quarter(param_grid_dicts, train_dataset_quarter, val_dataset_quarter)

Epoch 1:
Training Loss: 10.2977
Epoch 1:
Validation Loss: 10.2839
Validation Accuracy: 0.0801
Top-3 Accuracy: 0.1309
Top-5 Accuracy: 0.1706
Top-10 Accuracy: 0.2570
Epoch 2:
Training Loss: 10.2790
Epoch 2:
Validation Loss: 10.2781
Validation Accuracy: 0.0850
Top-3 Accuracy: 0.1322
Top-5 Accuracy: 0.1699
Top-10 Accuracy: 0.2614
Epoch 3:
Training Loss: 10.2735
Epoch 3:
Validation Loss: 10.2758
Validation Accuracy: 0.0867
Top-3 Accuracy: 0.1325
Top-5 Accuracy: 0.1701
Top-10 Accuracy: 0.2587
Epoch 4:
Training Loss: 10.2700
Epoch 4:
Validation Loss: 10.2744
Validation Accuracy: 0.0879
Top-3 Accuracy: 0.1339
Top-5 Accuracy: 0.1704
Top-10 Accuracy: 0.2585
Epoch 5:
Training Loss: 10.2673
Epoch 5:
Validation Loss: 10.2729
Validation Accuracy: 0.0894
Top-3 Accuracy: 0.1451
Top-5 Accuracy: 0.1709
Top-10 Accuracy: 0.2582
Epoch 6:
Training Loss: 10.2638
Epoch 6:
Validation Loss: 10.2702
Validation Accuracy: 0.0924
Top-3 Accuracy: 0.1465
Top-5 Accuracy: 0.1716
Top-10 Accuracy: 0.2572
Epoch 7:
Trainin

In [15]:
"""
The exported dataframe will have the following metrics: accuracy, 
top-3 accuracy, top-5 accuracy, top-10 accuracy, as well as the total time 
needed to finish the training loop. This is necessary as a tiebreaker, as
computational time is a finite resource that we need to consider.
"""

results_quarter_df = pd.DataFrame(results_quarter)
results_quarter_df.to_csv('model_quarter_hyperparam_results.csv', index = False)
print(results_quarter_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.095047   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.096010   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.099198   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.088328   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.088123   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy  total time  
0            0.147940            0.173606             0.254494  462.096224  
1            0.148532            0.173694             0.246759  402.416546  
2            0.166956            0.197802             0.222861  346.485157  
3            0.140491            0.166743             0.231708  232.955465  
4            0.134215            0.156366             0.257379  384.411699  


In [8]:
"repeat what we did with df_quarter, with the df_full and df_half"

df_full = pd.read_csv('/kaggle/input/we-final/WE_book_corpus_final_dataset_processed.csv')
df_half = pd.read_csv('/kaggle/input/we-final-half/WE_book_corpus_final_dataset_processed_half.csv')

In [10]:
vocab_full = build_vocab(df_full, 'processed_text')
pairs_full = create_context_target_pairs(df_full, 'processed_text', 5)
encoded_pairs_full = encode_pairs(pairs_full, vocab_full)

vocab_half = build_vocab(df_half, 'processed_text')
pairs_half = create_context_target_pairs(df_half, 'processed_text', 5)
encoded_pairs_half = encode_pairs(pairs_half, vocab_half)

In [11]:
def tune_hyperparams_full(param_grid, train_dataset, val_dataset):
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

    model = CBOWModel(len(vocab_full), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy; {avg_top_10_accuracy:.4f}")

      if abs(avg_val_loss - best_val_loss) < 1e-3:
        print("Early Stopping Triggered")
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_model_full.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

In [21]:
train_pairs_full, val_test_pairs_full = train_test_split(encoded_pairs_full, test_size = 0.70, random_state = 42)
val_pairs_full, test_pairs_full = train_test_split(val_test_pairs_full, test_size = 0.5, random_state = 42)

train_dataset_full = CBOWDataset(train_pairs_full, len(vocab_full), window_size = 5)
val_dataset_full = CBOWDataset(val_pairs_full, len(vocab_full), window_size = 5)
test_dataset_full = CBOWDataset(test_pairs_full, len(vocab_full), window_size = 5)

param_grid = {
    'batch_size': [64, 128],
    'embedding_dim': [50, 100],
    'learning_rate': [0.001, 0.01],
    'num_epochs': [10, 20]
}

param_combs = list(itertools.product(
    param_grid['batch_size'],
    param_grid['embedding_dim'],
    param_grid['learning_rate'],
    param_grid['num_epochs']
))

param_grid_dicts = [
    {'batch_size': bs, 'embedding_dim': ed, 'learning_rate': lr, 'num_epochs': ne}
    for bs, ed, lr, ne in param_combs
]
results_full = tune_hyperparams_full(param_grid_dicts, train_dataset_full, val_dataset_full)

KeyboardInterrupt: 

In [25]:
results_full_df = pd.DataFrame(results_full)
results_full_df.to_csv('model_full_hyperparam_results.csv', index = False)
print(results_full_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.091945   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.099652   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.095635   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.104104   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.091414   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy   total time  
0            0.134056            0.163004             0.261592  1131.510653  
1            0.149471            0.172890             0.244863  1125.440634  
2            0.153999            0.188280             0.214185  1713.419736  
3            0.169909            0.193780             0.228524  1416.539697  
4            0.136206            0.158654             0.259320  1256.018537  


In [12]:
def tune_hyperparams_half(param_grid, train_dataset, val_dataset):
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

    model = CBOWModel(len(vocab_half), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy; {avg_top_10_accuracy:.4f}")

      if abs(avg_val_loss - best_val_loss) < 1e-3:
        print("Early Stopping Triggered")
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_model_half.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

In [27]:
train_pairs_half, val_test_pairs_half = train_test_split(encoded_pairs_half, test_size = 0.70, random_state = 42)
val_pairs_half, test_pairs_half = train_test_split(val_test_pairs_half, test_size = 0.5, random_state = 42)

train_dataset_half = CBOWDataset(train_pairs_half, len(vocab_half), window_size = 5)
val_dataset_half = CBOWDataset(val_pairs_half, len(vocab_half), window_size = 5)
test_dataset_half = CBOWDataset(test_pairs_half, len(vocab_half), window_size = 5)

param_grid = {
    'batch_size': [64, 128],
    'embedding_dim': [50, 100],
    'learning_rate': [0.001, 0.01],
    'num_epochs': [10, 20]
}

param_combs = list(itertools.product(
    param_grid['batch_size'],
    param_grid['embedding_dim'],
    param_grid['learning_rate'],
    param_grid['num_epochs']
))

param_grid_dicts = [
    {'batch_size': bs, 'embedding_dim': ed, 'learning_rate': lr, 'num_epochs': ne}
    for bs, ed, lr, ne in param_combs
]
results_half = tune_hyperparams_half(param_grid_dicts, train_dataset_half, val_dataset_half)

Epoch 1:
Training Loss: 10.4920
Epoch 1:
Validation Loss: 10.4824
Validation Accuracy: 0.0753
Top-3 Accuracy: 0.1162
Top-5 Accuracy: 0.1533
Top-10 Accuracy; 0.2594
Epoch 2:
Training Loss: 10.4787
Epoch 2:
Validation Loss: 10.4782
Validation Accuracy: 0.0788
Top-3 Accuracy: 0.1174
Top-5 Accuracy: 0.1535
Top-10 Accuracy; 0.2611
Epoch 3:
Training Loss: 10.4742
Epoch 3:
Validation Loss: 10.4764
Validation Accuracy: 0.0803
Top-3 Accuracy: 0.1180
Top-5 Accuracy: 0.1540
Top-10 Accuracy; 0.2519
Epoch 4:
Training Loss: 10.4714
Epoch 4:
Validation Loss: 10.4755
Validation Accuracy: 0.0810
Top-3 Accuracy: 0.1185
Top-5 Accuracy: 0.1547
Top-10 Accuracy; 0.2476
Early Stopping Triggered
Epoch 1:
Training Loss: 10.4843
Epoch 1:
Validation Loss: 10.4719
Validation Accuracy: 0.0867
Top-3 Accuracy: 0.1448
Top-5 Accuracy: 0.1763
Top-10 Accuracy; 0.2570
Epoch 2:
Training Loss: 10.4648
Epoch 2:
Validation Loss: 10.4625
Validation Accuracy: 0.0959
Top-3 Accuracy: 0.1533
Top-5 Accuracy: 0.1781
Top-10 Accuracy

In [28]:
results_half_df = pd.DataFrame(results_half)
results_half_df.to_csv('model_half_hyperparam_results.csv', index = False)
print(results_half_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.081014   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.101805   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.098515   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.099565   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.102039   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy  total time  
0            0.118510            0.154749             0.247598  526.791273  
1            0.157203            0.178764             0.251544  655.878617  
2            0.156505            0.190732             0.253524  393.286634  
3            0.163915            0.193381             0.235921  653.833550  
4            0.157298            0.179036             0.251280  587.242343  


After completing the training and evaluation loops with the standard model, 
we repeat the process with the weighted model, where the lambda layer takes
a weighted average instead of the standard average that was employed in the
previous models.

In [11]:
class CBOWModelWeighted(nn.Module):
  def __init__(self, vocab_size, embedding_dim, window_size):
        super(CBOWModelWeighted, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.window_size = window_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.weights = self.compute_weights(window_size)

  def compute_weights(self, window_size):
        """
        Assign higher weights to context words closer to the target word.
        Weights decrease linearly with distance from the target word.
        """
        positions = torch.arange(-window_size, window_size + 1, dtype=torch.float32)
        weights = 1.0 / (1.0 + torch.abs(positions))  # Larger weights for closer words
        weights[window_size] = 0  # Exclude target position itself (distance=0)
        return weights[:-1].softmax(0)  # Normalize weights to sum to 1

  def forward(self, context):
    embedded = self.embeddings(context)  # (batch_size, context_size, embedding_dim)

    # Weighted averaging of embeddings
    weights = self.weights.to(context.device)  # Ensure weights are on the same device
    weighted_embeds = embedded * weights.view(1, -1, 1) # (batch_size, context_size, embedding_dim)
    weighted_avg_embedding = weighted_embeds.sum(dim = 1) # (batch_size, embedding_dim)

    # Fully connected layer
    output = self.fc(weighted_avg_embedding) # (batch_size, vocab_size)
    return output

In [14]:
def tune_hyperparams_half_weighted(param_grid, train_dataset, val_dataset):
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
    model = CBOWModelWeighted(len(vocab_half), embedding_dim, 5).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy: {avg_top_10_accuracy:.4f}")

      if abs(avg_val_loss - best_val_loss) < 1e-3:
        print("Early Stopping Triggered")
        break
      elif avg_val_loss > best_val_loss:
        print("Early Stopping Triggered")
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_weighted_model_half.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

train_pairs_half, val_test_pairs_half = train_test_split(encoded_pairs_half, test_size = 0.70, random_state = 42)
val_pairs_half, test_pairs_half = train_test_split(val_test_pairs_half, test_size = 0.5, random_state = 42)

train_dataset_half = CBOWDataset(train_pairs_half, len(vocab_half), window_size = 5)
val_dataset_half = CBOWDataset(val_pairs_half, len(vocab_half), window_size = 5)
test_dataset_half = CBOWDataset(test_pairs_half, len(vocab_half), window_size = 5)

param_grid = {
    'batch_size': [64, 128],
    'embedding_dim': [50, 100],
    'learning_rate': [0.001, 0.01],
    'num_epochs': [10, 20]
}

param_combs = list(itertools.product(
    param_grid['batch_size'],
    param_grid['embedding_dim'],
    param_grid['learning_rate'],
    param_grid['num_epochs']
))

param_grid_dicts = [
    {'batch_size': bs, 'embedding_dim': ed, 'learning_rate': lr, 'num_epochs': ne}
    for bs, ed, lr, ne in param_combs
]

results_half_weighted = tune_hyperparams_half_weighted(param_grid_dicts, train_dataset_half, val_dataset_half)

KeyboardInterrupt: 

In [16]:
results_half_weighted_df = pd.DataFrame(results_half_weighted)
results_half_weighted_df.to_csv('weighted_model_half_hyperparam_results.csv', index = False)
print(results_half_weighted_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.122419   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.122408   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.115742   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.114636   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.125488   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy   total time  
0            0.227599            0.288879             0.380460  1132.265152  
1            0.227408            0.288740             0.380246  1163.120735  
2            0.217964            0.277909             0.368647   258.736813  
3            0.216128            0.276872             0.368416   259.857931  
4            0.232994            0.295295             0.388821  1146.043770  


In [18]:
def tune_hyperparams_full_weighted(param_grid, train_dataset, val_dataset):
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

    model = CBOWModelWeighted(len(vocab_full), embedding_dim, 5).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy; {avg_top_10_accuracy:.4f}")

      if abs(avg_val_loss - best_val_loss) < 1e-3:
        print("Early Stopping Triggered")
        break
      elif avg_val_loss > best_val_loss:
        print("Early Stopping Triggered")
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_weighted_model_full.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

train_pairs_full, val_test_pairs_full = train_test_split(encoded_pairs_full, test_size = 0.70, random_state = 42)
val_pairs_full, test_pairs_full = train_test_split(val_test_pairs_full, test_size = 0.5, random_state = 42)

train_dataset_full= CBOWDataset(train_pairs_full, len(vocab_full), window_size = 5)
val_dataset_full = CBOWDataset(val_pairs_full, len(vocab_full), window_size = 5)
test_dataset_full = CBOWDataset(test_pairs_full, len(vocab_full), window_size = 5)

results_full_weighted = tune_hyperparams_full_weighted(param_grid_dicts, train_dataset_full, val_dataset_full)

Epoch 1:
Training Loss: 6.1885
Epoch 1:
Validation Loss: 6.0146
Validation Accuracy: 0.1079
Top-3 Accuracy: 0.2056
Top-5 Accuracy: 0.2640
Top-10 Accuracy; 0.3533
Epoch 2:
Training Loss: 5.9073
Epoch 2:
Validation Loss: 5.9238
Validation Accuracy: 0.1163
Top-3 Accuracy: 0.2184
Top-5 Accuracy: 0.2785
Top-10 Accuracy; 0.3699
Epoch 3:
Training Loss: 5.8111
Epoch 3:
Validation Loss: 5.8759
Validation Accuracy: 0.1211
Top-3 Accuracy: 0.2259
Top-5 Accuracy: 0.2868
Top-10 Accuracy; 0.3785
Epoch 4:
Training Loss: 5.7480
Epoch 4:
Validation Loss: 5.8541
Validation Accuracy: 0.1232
Top-3 Accuracy: 0.2295
Top-5 Accuracy: 0.2910
Top-10 Accuracy; 0.3835
Epoch 5:
Training Loss: 5.6989
Epoch 5:
Validation Loss: 5.8549
Validation Accuracy: 0.1252
Top-3 Accuracy: 0.2321
Top-5 Accuracy: 0.2937
Top-10 Accuracy; 0.3861
Early Stopping Triggered
Epoch 1:
Training Loss: 6.1907
Epoch 1:
Validation Loss: 6.0188
Validation Accuracy: 0.1082
Top-3 Accuracy: 0.2058
Top-5 Accuracy: 0.2639
Top-10 Accuracy; 0.3528
Epo

In [19]:
results_full_weighted_df = pd.DataFrame(results_full_weighted)
results_full_weighted_df.to_csv('weighted_model_full_hyperparam_results.csv', index = False)
print(results_full_weighted_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.125187   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.124740   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.117284   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.115852   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.128640   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy   total time  
0            0.232135            0.293705             0.386132  1363.852431  
1            0.231576            0.293288             0.385661  1382.852483  
2            0.220186            0.280531             0.371106   559.355721  
3            0.219265            0.280504             0.371204   552.773860  
4            0.238421            0.302015             0.396428  1526.194981  


In [20]:
def tune_hyperparams_quarter_weighted(param_grid, train_dataset, val_dataset):
  results = []
  for params in param_grid:
    start_time = time.time()
    batch_size = params['batch_size']
    embedding_dim = params['embedding_dim']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)

    model = CBOWModelWeighted(len(vocab_quarter), embedding_dim, 5).to(device)
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      for context, target in train_loader:
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

      avg_train_loss = total_loss / len(train_loader)
      train_losses.append(avg_train_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Training Loss: {avg_train_loss:.4f}")

      model.eval()
      val_loss = 0.0
      val_accuracy = 0.0
      val_top_3_accuracy = 0.0
      val_top_5_accuracy = 0.0
      val_top_10_accuracy = 0.0
      with torch.no_grad():
        for context, target in val_loader:
          context, target = context.to(device), target.to(device)
          output = model(context)
          loss = criterion(output, target)
          val_loss += loss.item()

          predictions = torch.argmax(output, dim = 1)
          val_accuracy += accuracy_score(target.cpu(), predictions.cpu())

          top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
          val_top_3_accuracy += top_k_accuracies['top_3_accuracy']
          val_top_5_accuracy += top_k_accuracies['top_5_accuracy']
          val_top_10_accuracy += top_k_accuracies['top_10_accuracy']

      avg_val_loss = val_loss / len(val_loader)
      avg_val_accuracy = val_accuracy / len(val_loader)
      avg_top_3_accuracy = val_top_3_accuracy / len(val_loader)
      avg_top_5_accuracy = val_top_5_accuracy / len(val_loader)
      avg_top_10_accuracy = val_top_10_accuracy / len(val_loader)
      val_losses.append(avg_val_loss)

      print(f"Epoch {epoch+1}:")
      print(f"Validation Loss: {avg_val_loss:.4f}")
      print(f"Validation Accuracy: {avg_val_accuracy:.4f}")
      print(f"Top-3 Accuracy: {avg_top_3_accuracy:.4f}")
      print(f"Top-5 Accuracy: {avg_top_5_accuracy:.4f}")
      print(f"Top-10 Accuracy: {avg_top_10_accuracy:.4f}")

      if abs(avg_val_loss - best_val_loss) < 1e-3:
        print("Early Stopping Triggered")
        break
      elif avg_val_loss > best_val_loss:
        print("Early Stopping Triggered")
        break
      else:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_cbow_weighted_model_quarter.pth")

    end_time = time.time()
    total_time = end_time - start_time
    results.append({
        'params': params,
        'val_accuracy': avg_val_accuracy,
        'val_top_3_accuracy': avg_top_3_accuracy,
        'val_top_5_accuracy': avg_top_5_accuracy,
        'val_top_10_accuracy': avg_top_10_accuracy,
        'total time': total_time
    })
  return results

train_pairs_quarter, val_test_pairs_quarter = train_test_split(encoded_pairs_quarter, test_size = 0.70, random_state = 42)
val_pairs_quarter, test_pairs_quarter = train_test_split(val_test_pairs_quarter, test_size = 0.5, random_state = 42)

train_dataset_quarter = CBOWDataset(train_pairs_quarter, len(vocab_quarter), window_size = 5)
val_dataset_quarter = CBOWDataset(val_pairs_quarter, len(vocab_quarter), window_size = 5)
test_dataset_quarter = CBOWDataset(test_pairs_quarter, len(vocab_quarter), window_size = 5)

results_quarter_weighted = tune_hyperparams_quarter_weighted(param_grid_dicts, train_dataset_quarter, val_dataset_quarter)

Epoch 1:
Training Loss: 6.4614
Epoch 1:
Validation Loss: 6.2059
Validation Accuracy: 0.0883
Top-3 Accuracy: 0.1756
Top-5 Accuracy: 0.2301
Top-10 Accuracy: 0.3187
Epoch 2:
Training Loss: 5.9935
Epoch 2:
Validation Loss: 6.1234
Validation Accuracy: 0.0989
Top-3 Accuracy: 0.1913
Top-5 Accuracy: 0.2479
Top-10 Accuracy: 0.3370
Epoch 3:
Training Loss: 5.8432
Epoch 3:
Validation Loss: 6.0787
Validation Accuracy: 0.1044
Top-3 Accuracy: 0.1997
Top-5 Accuracy: 0.2576
Top-10 Accuracy: 0.3467
Epoch 4:
Training Loss: 5.7439
Epoch 4:
Validation Loss: 6.0486
Validation Accuracy: 0.1083
Top-3 Accuracy: 0.2059
Top-5 Accuracy: 0.2640
Top-10 Accuracy: 0.3532
Epoch 5:
Training Loss: 5.6696
Epoch 5:
Validation Loss: 6.0306
Validation Accuracy: 0.1110
Top-3 Accuracy: 0.2097
Top-5 Accuracy: 0.2684
Top-10 Accuracy: 0.3581
Epoch 6:
Training Loss: 5.6100
Epoch 6:
Validation Loss: 6.0188
Validation Accuracy: 0.1126
Top-3 Accuracy: 0.2123
Top-5 Accuracy: 0.2716
Top-10 Accuracy: 0.3613
Epoch 7:
Training Loss: 5.56

In [21]:
results_quarter_weighted_df = pd.DataFrame(results_quarter_weighted)
results_quarter_weighted_df.to_csv('weighted_model_quarter_hyperparam_results.csv', index = False)
print(results_quarter_weighted_df.head())

                                              params  val_accuracy  \
0  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.116672   
1  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.117533   
2  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.112865   
3  {'batch_size': 64, 'embedding_dim': 50, 'learn...      0.112762   
4  {'batch_size': 64, 'embedding_dim': 100, 'lear...      0.118743   

   val_top_3_accuracy  val_top_5_accuracy  val_top_10_accuracy  total time  
0            0.217880            0.277880             0.368394  638.042042  
1            0.219373            0.279266             0.369360  764.510987  
2            0.213017            0.272950             0.363215  125.983661  
3            0.212697            0.272405             0.362845  125.847549  
4            0.220188            0.280671             0.372426  401.873167  


Now that we have completed the hyperparameter tuning across all 6 variations
of the model, we can compare the results of each parameter combination for each model, and pick the best combination for each model, taking into account all accuracy metrics as well as the total time taken.

In [13]:
batch_size_full = 128
embedding_dim_full = 100
learning_rate_full = 0.01
num_epochs_full = 20

batch_size_half = 128
embedding_dim_half = 100
learning_rate_half = 0.01
num_epochs_half = 20

batch_size_quarter = 128
embedding_dim_quarter = 100
learning_rate_quarter = 0.01
num_epochs_quarter = 10

batch_size_weighted_full = 128
embedding_dim_weighted_full = 100
learning_rate_weighted_full = 0.001
num_epochs_weighted_full = 10

batch_size_weighted_half = 64
embedding_dim_weighted_half = 100
learning_rate_weighted_half = 0.001
num_epochs_weighted_half = 10

batch_size_weighted_quarter = 64
embedding_dim_weighted_quarter = 100
learning_rate_weighted_quarter = 0.001
num_epochs_weighted_quarter = 20

vocab_size_full = len(vocab_full)
vocab_size_half = len(vocab_half)
vocab_size_quarter = len(vocab_quarter)

train_pairs_full, val_test_pairs_full = train_test_split(encoded_pairs_full, test_size = 0.7, random_state = 42)
val_pairs_full, test_pairs_full = train_test_split(val_test_pairs_full, test_size = 0.5, random_state = 42)

train_pairs_half, val_test_pairs_half = train_test_split(encoded_pairs_half, test_size = 0.7, random_state = 42)
val_pairs_half, test_pairs_half = train_test_split(val_test_pairs_half, test_size = 0.5, random_state = 42)

train_pairs_quarter, val_test_pairs_quarter = train_test_split(encoded_pairs_quarter, test_size = 0.7, random_state = 42)
val_pairs_quarter, test_pairs_quarter = train_test_split(val_test_pairs_quarter, test_size = 0.5, random_state = 42)

train_dataset_full = CBOWDataset(train_pairs_full, vocab_size_full, 5)
val_dataset_full = CBOWDataset(val_pairs_full, vocab_size_full, 5)
test_dataset_full = CBOWDataset(test_pairs_full, vocab_size_full, 5)

train_dataset_half = CBOWDataset(train_pairs_half, vocab_size_half, 5)
val_dataset_half = CBOWDataset(val_pairs_half, vocab_size_half, 5)
test_dataset_half = CBOWDataset(test_pairs_half, vocab_size_half, 5)

train_dataset_quarter = CBOWDataset(train_pairs_quarter, vocab_size_quarter, 5)
val_dataset_quarter = CBOWDataset(val_pairs_quarter, vocab_size_quarter, 5)
test_dataset_quarter = CBOWDataset(test_pairs_quarter, vocab_size_quarter, 5)

train_loader_full = DataLoader(train_dataset_full, batch_size = batch_size_full, shuffle = True)
val_loader_full = DataLoader(val_dataset_full, batch_size = batch_size_full, shuffle = False)
test_loader_full = DataLoader(test_dataset_full, batch_size = batch_size_full, shuffle = False)

train_loader_half = DataLoader(train_dataset_half, batch_size = batch_size_half, shuffle = True)
val_loader_half = DataLoader(val_dataset_half, batch_size = batch_size_half, shuffle = False)
test_loader_half = DataLoader(test_dataset_half, batch_size = batch_size_half, shuffle = False)

train_loader_quarter = DataLoader(train_dataset_quarter, batch_size = batch_size_quarter, shuffle = True)
val_loader_quarter = DataLoader(val_dataset_quarter, batch_size = batch_size_quarter, shuffle = False)
test_loader_quarter = DataLoader(test_dataset_quarter, batch_size = batch_size_quarter, shuffle = False)

train_loader_weighted_full = DataLoader(train_dataset_full, batch_size = batch_size_weighted_full, shuffle = True)
val_loader_weighted_full = DataLoader(val_dataset_full, batch_size = batch_size_weighted_full, shuffle = False)
test_loader_weighted_full = DataLoader(test_dataset_full, batch_size = batch_size_weighted_full, shuffle = False)

train_loader_weighted_half = DataLoader(train_dataset_half, batch_size = batch_size_weighted_half, shuffle = True)
val_loader_weighted_half = DataLoader(val_dataset_half, batch_size = batch_size_weighted_half, shuffle = False)
test_loader_weighted_half = DataLoader(test_dataset_half, batch_size = batch_size_weighted_half, shuffle = False)

train_loader_weighted_quarter = DataLoader(train_dataset_quarter, batch_size = batch_size_weighted_quarter, shuffle = True)
val_loader_weighted_quarter = DataLoader(val_dataset_quarter, batch_size = batch_size_weighted_quarter, shuffle = False)
test_loader_weighted_quarter = DataLoader(test_dataset_quarter, batch_size = batch_size_weighted_quarter, shuffle = False)

Next, we evaluate all 6 of our models on the test data that was not involved in the training and validation loops. This can be achieved by using the same random_state in our definition of train_test_split.

In [23]:
model_full = CBOWModel(vocab_size_full, embedding_dim_full).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_full.parameters(), lr = learning_rate_full)

best_val_loss = float('inf')
train_losses_full, val_losses_full = [], []

start_time = time.time()
for epoch in range(num_epochs_full):
  model_full.train()
  total_loss = 0.0
  for context, target in train_loader_full:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = model_full(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_full)
  train_losses_full.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_full}: Training Loss =  {avg_train_loss:.4f}")

  model_full.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_full:
      context, target = context.to(device), target.to(device)
      output = model_full(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_full)
  val_losses_full.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_full}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(model_full.state_dict(), "best_cbow_model_full.pth") #important to save this model path so we can reuse it for future evaluations

model_full.load_state_dict(torch.load('best_cbow_model_full.pth'))
model_full.eval()

test_loss_full = 0.0
test_accuracy_full = 0.0
top_3_accuracy_full, top_5_accuracy_full, top_10_accuracy_full = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_full:
    context, target = context.to(device), target.to(device)
    output = model_full(context)
    loss = criterion(output, target)
    test_loss_full += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_full += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_full += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_full += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_full += top_k_accuracies['top_10_accuracy']

avg_test_loss_full = test_loss_full / len(test_loader_full)
avg_test_accuracy_full = test_accuracy_full / len(test_loader_full)
avg_top_3_accuracy_full = top_3_accuracy_full / len(test_loader_full)
avg_top_5_accuracy_full = top_5_accuracy_full / len(test_loader_full)
avg_top_10_accuracy_full = top_10_accuracy_full / len(test_loader_full)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_full:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_full:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_full:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_full:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_full:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 20: Training Loss =  10.6476
Epoch 1 / 20: Validation Loss = 10.6352
Epoch 2 / 20: Training Loss =  10.6305
Epoch 2 / 20: Validation Loss = 10.6312
Epoch 3 / 20: Training Loss =  10.6259
Epoch 3 / 20: Validation Loss = 10.6277
Epoch 4 / 20: Training Loss =  10.6230
Epoch 4 / 20: Validation Loss = 10.6272
Early stopping triggered


  model_full.load_state_dict(torch.load('best_cbow_model_full.pth'))


Test Loss: 10.6274
Test Accuracy: 0.0990
Top 3 Test Accuracy: 0.1574
Top 5 Test Accuracy: 0.1834
Top 10 Test Accuracy: 0.2398
Total time taken to train and test: 776.29


In [25]:
model_half = CBOWModel(vocab_size_half, embedding_dim_half).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_half.parameters(), lr = learning_rate_half)

best_val_loss = float('inf')
train_losses_half, val_losses_half = [], []

start_time = time.time()
for epoch in range(num_epochs_half):
  model_half.train()
  total_loss = 0.0
  for context, target in train_loader_half:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = model_half(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_half)
  train_losses_half.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_half}: Training Loss =  {avg_train_loss:.4f}")

  model_half.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_half:
      context, target = context.to(device), target.to(device)
      output = model_half(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_half)
  val_losses_half.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_half}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(model_half.state_dict(), "best_cbow_model_half.pth")

model_half.load_state_dict(torch.load('best_cbow_model_half.pth'))
model_half.eval()

test_loss_half = 0.0
test_accuracy_half = 0.0
top_3_accuracy_half, top_5_accuracy_half, top_10_accuracy_half = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_half:
    context, target = context.to(device), target.to(device)
    output = model_half(context)
    loss = criterion(output, target)
    test_loss_half += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_half += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_half += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_half += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_half += top_k_accuracies['top_10_accuracy']

avg_test_loss_half = test_loss_half / len(test_loader_half)
avg_test_accuracy_half = test_accuracy_half / len(test_loader_half)
avg_top_3_accuracy_half = top_3_accuracy_half / len(test_loader_half)
avg_top_5_accuracy_half = top_5_accuracy_half / len(test_loader_half)
avg_top_10_accuracy_half = top_10_accuracy_half / len(test_loader_half)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_half:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_half:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_half:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_half:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_half:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 20: Training Loss =  10.4651
Epoch 1 / 20: Validation Loss = 10.4600
Epoch 2 / 20: Training Loss =  10.4526
Epoch 2 / 20: Validation Loss = 10.4579
Epoch 3 / 20: Training Loss =  10.4472
Epoch 3 / 20: Validation Loss = 10.4545
Epoch 4 / 20: Training Loss =  10.4437
Epoch 4 / 20: Validation Loss = 10.4533
Epoch 5 / 20: Training Loss =  10.4411
Epoch 5 / 20: Validation Loss = 10.4532
Early stopping triggered


  model_half.load_state_dict(torch.load('best_cbow_model_half.pth'))


Test Loss: 10.4528
Test Accuracy: 0.1030
Top 3 Test Accuracy: 0.1720
Top 5 Test Accuracy: 0.2034
Top 10 Test Accuracy: 0.2502
Total time taken to train and test: 436.35


In [26]:
model_quarter = CBOWModel(vocab_size_quarter, embedding_dim_quarter).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_quarter.parameters(), lr = learning_rate_quarter)

best_val_loss = float('inf')
train_losses_quarter, val_losses_quarter = [], []

start_time = time.time()
for epoch in range(num_epochs_quarter):
  model_quarter.train()
  total_loss = 0.0
  for context, target in train_loader_quarter:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = model_quarter(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_quarter)
  train_losses_quarter.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_quarter}: Training Loss =  {avg_train_loss:.4f}")

  model_quarter.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_quarter:
      context, target = context.to(device), target.to(device)
      output = model_quarter(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_quarter)
  val_losses_quarter.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_quarter}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(model_quarter.state_dict(), "best_cbow_model_quarter.pth")

model_quarter.load_state_dict(torch.load('best_cbow_model_quarter.pth'))
model_quarter.eval()

test_loss_quarter = 0.0
test_accuracy_quarter = 0.0
top_3_accuracy_quarter, top_5_accuracy_quarter, top_10_accuracy_quarter = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_quarter:
    context, target = context.to(device), target.to(device)
    output = model_quarter(context)
    loss = criterion(output, target)
    test_loss_quarter += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_quarter += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_quarter += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_quarter += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_quarter += top_k_accuracies['top_10_accuracy']

avg_test_loss_quarter = test_loss_quarter / len(test_loader_quarter)
avg_test_accuracy_quarter = test_accuracy_quarter / len(test_loader_quarter)
avg_top_3_accuracy_quarter = top_3_accuracy_quarter / len(test_loader_quarter)
avg_top_5_accuracy_quarter = top_5_accuracy_quarter / len(test_loader_quarter)
avg_top_10_accuracy_quarter = top_10_accuracy_quarter / len(test_loader_quarter)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_quarter:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_quarter:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_quarter:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_quarter:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_quarter:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 10: Training Loss =  10.2820
Epoch 1 / 10: Validation Loss = 10.2710
Epoch 2 / 10: Training Loss =  10.2622
Epoch 2 / 10: Validation Loss = 10.2663
Epoch 3 / 10: Training Loss =  10.2528
Epoch 3 / 10: Validation Loss = 10.2620
Epoch 4 / 10: Training Loss =  10.2468
Epoch 4 / 10: Validation Loss = 10.2606
Epoch 5 / 10: Training Loss =  10.2420
Epoch 5 / 10: Validation Loss = 10.2577
Epoch 6 / 10: Training Loss =  10.2362
Epoch 6 / 10: Validation Loss = 10.2557
Epoch 7 / 10: Training Loss =  10.2323
Epoch 7 / 10: Validation Loss = 10.2551
Early stopping triggered


  model_quarter.load_state_dict(torch.load('best_cbow_model_quarter.pth'))


Test Loss: 10.2560
Test Accuracy: 0.1050
Top 3 Test Accuracy: 0.1780
Top 5 Test Accuracy: 0.2123
Top 10 Test Accuracy: 0.2514
Total time taken to train and test: 271.45


In [28]:
weighted_model_full = CBOWModelWeighted(vocab_size_full, embedding_dim_weighted_full, 5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(weighted_model_full.parameters(), lr = learning_rate_weighted_full)

best_val_loss = float('inf')
train_losses_full, val_losses_full = [], []

start_time = time.time()
for epoch in range(num_epochs_weighted_full):
  weighted_model_full.train()
  total_loss = 0.0
  for context, target in train_loader_weighted_full:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = weighted_model_full(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_weighted_full)
  train_losses_full.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_full}: Training Loss =  {avg_train_loss:.4f}")

  weighted_model_full.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_weighted_full:
      context, target = context.to(device), target.to(device)
      output = weighted_model_full(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_weighted_full)
  val_losses_full.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_full}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(weighted_model_full.state_dict(), "best_cbow_model_weighted_full.pth")

weighted_model_full.load_state_dict(torch.load('best_cbow_model_weighted_full.pth'))
weighted_model_full.eval()

test_loss_full = 0.0
test_accuracy_full = 0.0
top_3_accuracy_full, top_5_accuracy_full, top_10_accuracy_full = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_weighted_full:
    context, target = context.to(device), target.to(device)
    output = weighted_model_full(context)
    loss = criterion(output, target)
    test_loss_full += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_full += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_full += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_full += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_full += top_k_accuracies['top_10_accuracy']

avg_test_loss_full = test_loss_full / len(test_loader_weighted_full)
avg_test_accuracy_full = test_accuracy_full / len(test_loader_weighted_full)
avg_top_3_accuracy_full = top_3_accuracy_full / len(test_loader_weighted_full)
avg_top_5_accuracy_full = top_5_accuracy_full / len(test_loader_weighted_full)
avg_top_10_accuracy_full = top_10_accuracy_full / len(test_loader_weighted_full)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_full:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_full:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_full:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_full:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_full:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 10: Training Loss =  6.1107
Epoch 1 / 10: Validation Loss = 5.8991
Epoch 2 / 10: Training Loss =  5.7425
Epoch 2 / 10: Validation Loss = 5.8051
Epoch 3 / 10: Training Loss =  5.6193
Epoch 3 / 10: Validation Loss = 5.7630
Epoch 4 / 10: Training Loss =  5.5403
Epoch 4 / 10: Validation Loss = 5.7396
Epoch 5 / 10: Training Loss =  5.4800
Epoch 5 / 10: Validation Loss = 5.7236
Epoch 6 / 10: Training Loss =  5.4302
Epoch 6 / 10: Validation Loss = 5.7140
Epoch 7 / 10: Training Loss =  5.3862
Epoch 7 / 10: Validation Loss = 5.7079
Epoch 8 / 10: Training Loss =  5.3461
Epoch 8 / 10: Validation Loss = 5.7040
Epoch 9 / 10: Training Loss =  5.3095
Epoch 9 / 10: Validation Loss = 5.7036
Early stopping triggered


  weighted_model_full.load_state_dict(torch.load('best_cbow_model_weighted_full.pth'))


Test Loss: 5.6974
Test Accuracy: 0.1297
Top 3 Test Accuracy: 0.2403
Top 5 Test Accuracy: 0.3041
Top 10 Test Accuracy: 0.3989
Total time taken to train and test: 1458.17


In [31]:
weighted_model_half = CBOWModelWeighted(vocab_size_half, embedding_dim_weighted_half, 5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(weighted_model_half.parameters(), lr = learning_rate_weighted_half)

best_val_loss = float('inf')
train_losses_half, val_losses_half = [], []

start_time = time.time()
for epoch in range(num_epochs_weighted_half):
  weighted_model_half.train()
  total_loss = 0.0
  for context, target in train_loader_weighted_half:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = weighted_model_half(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_weighted_half)
  train_losses_half.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_half}: Training Loss =  {avg_train_loss:.4f}")

  weighted_model_half.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_weighted_half:
      context, target = context.to(device), target.to(device)
      output = weighted_model_half(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_weighted_half)
  val_losses_half.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_half}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(weighted_model_half.state_dict(), "best_cbow_model_weighted_half.pth")

weighted_model_half.load_state_dict(torch.load('best_cbow_model_weighted_half.pth'))
weighted_model_half.eval()

test_loss_half = 0.0
test_accuracy_half = 0.0
top_3_accuracy_half, top_5_accuracy_half, top_10_accuracy_half = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_weighted_half:
    context, target = context.to(device), target.to(device)
    output = weighted_model_half(context)
    loss = criterion(output, target)
    test_loss_half += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_half += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_half += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_half += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_half += top_k_accuracies['top_10_accuracy']

avg_test_loss_half = test_loss_half / len(test_loader_weighted_half)
avg_test_accuracy_half = test_accuracy_half / len(test_loader_weighted_half)
avg_top_3_accuracy_half = top_3_accuracy_half / len(test_loader_weighted_half)
avg_top_5_accuracy_half = top_5_accuracy_half / len(test_loader_weighted_half)
avg_top_10_accuracy_half = top_10_accuracy_half / len(test_loader_weighted_half)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_half:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_half:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_half:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_half:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_half:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 10: Training Loss =  6.2224
Epoch 1 / 10: Validation Loss = 6.0271
Epoch 2 / 10: Training Loss =  5.8303
Epoch 2 / 10: Validation Loss = 5.9431
Epoch 3 / 10: Training Loss =  5.6908
Epoch 3 / 10: Validation Loss = 5.9052
Epoch 4 / 10: Training Loss =  5.5937
Epoch 4 / 10: Validation Loss = 5.8815
Epoch 5 / 10: Training Loss =  5.5166
Epoch 5 / 10: Validation Loss = 5.8685
Epoch 6 / 10: Training Loss =  5.4488
Epoch 6 / 10: Validation Loss = 5.8639
Epoch 7 / 10: Training Loss =  5.3877
Epoch 7 / 10: Validation Loss = 5.8586
Epoch 8 / 10: Training Loss =  5.3299
Epoch 8 / 10: Validation Loss = 5.8633
Early stopping triggered


  weighted_model_half.load_state_dict(torch.load('best_cbow_model_weighted_half.pth'))


Test Loss: 5.8531
Test Accuracy: 0.1253
Top 3 Test Accuracy: 0.2326
Top 5 Test Accuracy: 0.2953
Top 10 Test Accuracy: 0.3882
Total time taken to train and test: 833.07


In [33]:
weighted_model_quarter = CBOWModelWeighted(vocab_size_quarter, embedding_dim_weighted_quarter, 5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(weighted_model_quarter.parameters(), lr = learning_rate_weighted_quarter)

best_val_loss = float('inf')
train_losses_quarter, val_losses_quarter = [], []

start_time = time.time()
for epoch in range(num_epochs_weighted_quarter):
  weighted_model_quarter.train()
  total_loss = 0.0
  for context, target in train_loader_weighted_quarter:
    context, target = context.to(device), target.to(device)
    optimizer.zero_grad()
    output = weighted_model_quarter(context)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_loader_weighted_quarter)
  train_losses_quarter.append(avg_train_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_quarter}: Training Loss =  {avg_train_loss:.4f}")

  weighted_model_quarter.eval()
  val_loss = 0.0
  with torch.no_grad():
    for context, target in val_loader_weighted_quarter:
      context, target = context.to(device), target.to(device)
      output = weighted_model_quarter(context)
      loss = criterion(output, target)
      val_loss += loss.item()

  avg_val_loss = val_loss / len(val_loader_weighted_quarter)
  val_losses_quarter.append(avg_val_loss)
  print(f"Epoch {epoch + 1} / {num_epochs_weighted_quarter}: Validation Loss = {avg_val_loss:.4f}")

  if abs(best_val_loss - avg_val_loss) < 1e-3:
    print("Early stopping triggered")
    break
  elif avg_val_loss > best_val_loss:
    print("Early stopping triggered")
    break
  else:
    best_val_loss = avg_val_loss
    torch.save(weighted_model_quarter.state_dict(), "best_cbow_model_weighted_quarter.pth")

weighted_model_quarter.load_state_dict(torch.load('best_cbow_model_weighted_quarter.pth'))
weighted_model_quarter.eval()

test_loss_quarter = 0.0
test_accuracy_quarter = 0.0
top_3_accuracy_quarter, top_5_accuracy_quarter, top_10_accuracy_quarter = 0.0, 0.0, 0.0

with torch.no_grad():
  for context, target in test_loader_weighted_quarter:
    context, target = context.to(device), target.to(device)
    output = weighted_model_quarter(context)
    loss = criterion(output, target)
    test_loss_quarter += loss.item()

    predictions = torch.argmax(output, dim = 1)
    test_accuracy_quarter += accuracy_score(target.cpu(), predictions.cpu())

    top_k_accuracies = calculate_metrics(output, target, top_ks = [3, 5, 10])
    top_3_accuracy_quarter += top_k_accuracies['top_3_accuracy']
    top_5_accuracy_quarter += top_k_accuracies['top_5_accuracy']
    top_10_accuracy_quarter += top_k_accuracies['top_10_accuracy']

avg_test_loss_quarter = test_loss_quarter / len(test_loader_weighted_quarter)
avg_test_accuracy_quarter = test_accuracy_quarter / len(test_loader_weighted_quarter)
avg_top_3_accuracy_quarter = top_3_accuracy_quarter / len(test_loader_weighted_quarter)
avg_top_5_accuracy_quarter = top_5_accuracy_quarter / len(test_loader_weighted_quarter)
avg_top_10_accuracy_quarter = top_10_accuracy_quarter / len(test_loader_weighted_quarter)

end_time = time.time()
total_time = end_time - start_time

print(f"Test Loss: {avg_test_loss_quarter:.4f}")
print(f"Test Accuracy: {avg_test_accuracy_quarter:.4f}")
print(f"Top 3 Test Accuracy: {avg_top_3_accuracy_quarter:.4f}")
print(f"Top 5 Test Accuracy: {avg_top_5_accuracy_quarter:.4f}")
print(f"Top 10 Test Accuracy: {avg_top_10_accuracy_quarter:.4f}")
print(f"Total time taken to train and test: {total_time:.2f}")

Epoch 1 / 20: Training Loss =  6.3681
Epoch 1 / 20: Validation Loss = 6.1191
Epoch 2 / 20: Training Loss =  5.8572
Epoch 2 / 20: Validation Loss = 6.0426
Epoch 3 / 20: Training Loss =  5.6668
Epoch 3 / 20: Validation Loss = 6.0033
Epoch 4 / 20: Training Loss =  5.5332
Epoch 4 / 20: Validation Loss = 5.9856
Epoch 5 / 20: Training Loss =  5.4279
Epoch 5 / 20: Validation Loss = 5.9758
Epoch 6 / 20: Training Loss =  5.3400
Epoch 6 / 20: Validation Loss = 5.9757
Early stopping triggered


  weighted_model_quarter.load_state_dict(torch.load('best_cbow_model_weighted_quarter.pth'))


Test Loss: 5.9860
Test Accuracy: 0.1166
Top 3 Test Accuracy: 0.2191
Top 5 Test Accuracy: 0.2788
Top 10 Test Accuracy: 0.3691
Total time taken to train and test: 298.31


Now we can move on to testing some intrinsic and extrinsic evaluators on our various models, and compare their respective performances against each of the evaluators.

# Intrinsic Evaluation #

In [14]:
model_full = CBOWModel(vocab_size_full, embedding_dim_full).to(device)
model_half = CBOWModel(vocab_size_half, embedding_dim_half).to(device)
model_quarter = CBOWModel(vocab_size_quarter, embedding_dim_quarter).to(device)
weighted_model_full = CBOWModelWeighted(vocab_size_full, embedding_dim_weighted_full, 5).to(device)
weighted_model_half = CBOWModelWeighted(vocab_size_half, embedding_dim_weighted_half, 5).to(device)
weighted_model_quarter = CBOWModelWeighted(vocab_size_quarter, embedding_dim_weighted_quarter, 5).to(device)

model_full.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_full.pth'))
model_half.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_half.pth'))
model_quarter.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_quarter.pth'))
weighted_model_full.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_full.pth'))
weighted_model_half.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_half.pth'))
weighted_model_quarter.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_quarter.pth'))

  model_full.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_full.pth'))
  model_half.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_half.pth'))
  model_quarter.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_quarter.pth'))
  weighted_model_full.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_full.pth'))
  weighted_model_half.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_half.pth'))
  weighted_model_quarter.load_state_dict(torch.load('/kaggle/input/word-embedding-models/other/default/1/best_cbow_model_weighted_quarter.pth'))


<All keys matched successfully>

# Word Similarity #

First, we compare how well can our models recognise the similarity between words. Specific words are chosen based on how similar or dissimilar they are, and the similarities as predicted by the model will be compared with that from a human, to evaluate the performance of a model.

In [16]:
"""
common_vocab is defined so that words can be chosen for our first intrinsic 
evaluator. We do not want to end up choosing a word that does not exist in 
a model's vocabulary, as it will not be able to produce a suitable embedding.
"""

common_vocab = set(vocab_full.keys()) & set(vocab_half.keys()) & set(vocab_quarter.keys())
print(list(common_vocab)[1000:1500])

['velociraptor', 'compressor', 'kneeling', 'rab', 'unspecified', 'dublin', 'prank', 'botched', 'perceptiveness', 'shoe', 'adamantly', 'wayland', 'despairing', 'pelvic', 'schuster', 'warmer', 'fourteenth', 'boise', 'schmidt', 'customize', 'faithful', 'furnish', 'greta', 'rolfe', 'colossus', 'slope', 'ap', 'webster', 'homeopathic', 'masse', 'pte', 'frightening', 'corel', 'fink', 'reconnected', 'stave', 'earthen', 'renege', 'breezy', 'aeroplane', 'gstad', 'masseuse', 'demand', 'helplessly', 'kappa', 'submission', 'frightens', 'tem', 'turns', 'violet', 'whoever', 'network', 'macey', 'habitation', 'chihuahua', 'battleship', 'torturing', 'greenhorn', 'argon', 'mongoose', 'mauro', 'heritage', 'pit', 'ood', 'ranulf', 'disconcert', 'staircase', 'presentable', 'pleasantly', 'portico', 'slocum', 'geopolitical', 'darkest', 'holl', 'riel', 'clich', 'cuesta', 'contrast', 'exterior', 'lsu', 'unsettling', 'vigilant', 'rommel', 'external', 'ceni', 'homey', 'fanfare', 'oppress', 'furze', 'unannounced', 

In [25]:
custom_dataset = {
    'word1': ['hot', 'fast', 'european', 'fedora', 'warrior', 'legitimate',
             'alchemy', 'attendant', 'alligator', 'blogger'],
    'word2': ['cold', 'swift', 'romanian', 'indefinitely', 'heroic', 
              'stench', 'treachery', 'foreman', 'viper', 'recurring'],
    'similarity': [3.0, 9.0, 7.5, 0.5, 8.5, 1.0, 1.5, 5.0, 6.0, 1.0]
}
custom_df = pd.DataFrame(custom_dataset)

def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

model_full_sims, model_half_sims, model_quarter_sims = [], [], []
weighted_model_full_sims, weighted_model_half_sims, weighted_model_quarter_sims = [], [], []
human_similarities = custom_df['similarity'].tolist()

for _, row in custom_df.iterrows():
    word1, word2 = row['word1'], row['word2']

    vec1_full = model_full.embeddings.weight.data[vocab_full[word1]].detach().cpu().numpy()
    vec2_full = model_full.embeddings.weight.data[vocab_full[word2]].detach().cpu().numpy()

    vec1_half = model_half.embeddings.weight.data[vocab_half[word1]].detach().cpu().numpy()
    vec2_half = model_half.embeddings.weight.data[vocab_half[word2]].detach().cpu().numpy()

    vec1_quarter = model_quarter.embeddings.weight.data[vocab_quarter[word1]].detach().cpu().numpy()
    vec2_quarter = model_quarter.embeddings.weight.data[vocab_quarter[word2]].detach().cpu().numpy()

    vec1_weighted_full = weighted_model_full.embeddings.weight.data[vocab_full[word1]].detach().cpu().numpy()
    vec2_weighted_full = weighted_model_full.embeddings.weight.data[vocab_full[word2]].detach().cpu().numpy()

    vec1_weighted_half = weighted_model_half.embeddings.weight.data[vocab_half[word1]].detach().cpu().numpy()
    vec2_weighted_half = weighted_model_half.embeddings.weight.data[vocab_half[word2]].detach().cpu().numpy()

    vec1_weighted_quarter = weighted_model_quarter.embeddings.weight.data[vocab_quarter[word1]].detach().cpu().numpy()
    vec2_weighted_quarter = weighted_model_quarter.embeddings.weight.data[vocab_quarter[word2]].detach().cpu().numpy()

    model_full_sims.append(cosine_similarity(vec1_full, vec2_full))
    model_half_sims.append(cosine_similarity(vec1_half, vec2_half))
    model_quarter_sims.append(cosine_similarity(vec1_quarter, vec2_quarter))
    weighted_model_full_sims.append(cosine_similarity(vec1_weighted_full, vec2_weighted_full))
    weighted_model_half_sims.append(cosine_similarity(vec1_weighted_half, vec2_weighted_half))
    weighted_model_quarter_sims.append(cosine_similarity(vec1_weighted_quarter, vec2_weighted_quarter))

cor_full, _ = spearmanr(human_similarities, model_full_sims)
cor_half, _ = spearmanr(human_similarities, model_half_sims)
cor_quarter, _ = spearmanr(human_similarities, model_quarter_sims)
cor_weighted_full, _ = spearmanr(human_similarities, weighted_model_full_sims)
cor_weighted_half, _ = spearmanr(human_similarities, weighted_model_half_sims)
cor_weighted_quarter, _ = spearmanr(human_similarities, weighted_model_quarter_sims)

print(f"Full Model Similarities: {model_full_sims}")
print(f"Full Model Correlation: {cor_full}")
print(f"Half Model Similarities: {model_half_sims}")
print(f"Half Model Correlation: {cor_half}")
print(f"Quarter Model Similarities: {model_quarter_sims}")
print(f"Quarter Model Correlation: {cor_quarter}")
print(f"Weighted Full Model Similarities: {weighted_model_full_sims}")
print(f"Weighted Full Model Correlation: {cor_weighted_full}")
print(f"Weighted Half Model Similarities: {weighted_model_half_sims}")
print(f"Weighted Half Model Correlation: {cor_weighted_half}")
print(f"Weighted Quarter Model Similarities: {weighted_model_quarter_sims}")
print(f"Weighted Quarter Model Correlation: {cor_weighted_quarter}")

Full Model Similarities: [0.3399430909931195, 0.28245043586529617, 0.023994129743122206, 0.03516878845311133, 0.4297824273014339, -0.0038491602192494145, 0.11120815355511826, -0.24181547755315913, -0.006461446180797914, -0.06458320150433217]
Full Model Correlation: 0.34650615989763683
Half Model Similarities: [0.34770953474812694, 0.28559212393206357, 0.04068693640485588, 0.07360373043302926, -0.07592316930435494, -0.11798983177013533, -0.10614448265349918, -0.005776699458077905, 0.15430099283416976, -0.08609664769202974]
Half Model Correlation: 0.395138603392042
Quarter Model Similarities: [0.2846130109633893, 0.08480810775543524, -0.009338049132698645, -0.08692384261580832, 0.07326200578619846, -0.2855609230546936, -0.19492230520074538, 0.22572628631766056, 0.11301451113667849, -0.003532530839936321]
Quarter Model Correlation: 0.480245379507251
Weighted Full Model Similarities: [0.32550454944511964, 0.03939329796905766, 0.15609808905173783, -0.07950767559493999, -0.06753427974964987,

# Concept Categorisation #

We compare how well our models can categorise similar words into clusters, evaluating based on the homogeneity and completeness of clusters formed, using two metrics, V Measure score and Adjusted Rand Index (ARI).

In [29]:
words = ['alligator', 'viper', 'foreman', 'attendant', 'hallway', 'defibrillator',
        'earthworm', 'lavatory', 'appraiser', 'nightstand']
categories = ['animal', 'animal', 'occupation', 'occupation', 'location',
             'item', 'animal', 'location', 'occupation', 'item']

embeddings_full = [model_full.embeddings.weight.data[vocab_full[word]].detach().cpu().numpy() for word in words]
embeddings_half = [model_half.embeddings.weight.data[vocab_half[word]].detach().cpu().numpy() for word in words]
embeddings_quarter = [model_quarter.embeddings.weight.data[vocab_quarter[word]].detach().cpu().numpy() for word in words]
weighted_embeddings_full = [weighted_model_full.embeddings.weight.data[vocab_full[word]].detach().cpu().numpy() for word in words]
weighted_embeddings_half = [weighted_model_half.embeddings.weight.data[vocab_half[word]].detach().cpu().numpy() for word in words]
weighted_embeddings_quarter = [weighted_model_quarter.embeddings.weight.data[vocab_quarter[word]].detach().cpu().numpy() for word in words]

n_clusters = len(set(categories))
kmeans = KMeans(n_clusters = n_clusters, random_state = 42)

predicted_labels_full = kmeans.fit_predict(embeddings_full)
predicted_labels_half = kmeans.fit_predict(embeddings_half)
predicted_labels_quarter = kmeans.fit_predict(embeddings_quarter)
predicted_labels_weighted_full = kmeans.fit_predict(weighted_embeddings_full)
predicted_labels_weighted_half = kmeans.fit_predict(weighted_embeddings_half)
predicted_labels_weighted_quarter = kmeans.fit_predict(weighted_embeddings_quarter)

vmeasure_full = v_measure_score(categories, predicted_labels_full)
ari_full = adjusted_rand_score(categories, predicted_labels_full)
print(f"V Measure Full: {vmeasure_full}")
print(f"ARI Full: {ari_full}")

vmeasure_half = v_measure_score(categories, predicted_labels_half)
ari_half = adjusted_rand_score(categories, predicted_labels_half)
print(f"V Measure Half: {vmeasure_half}")
print(f"ARI Half: {ari_half}")

vmeasure_quarter = v_measure_score(categories, predicted_labels_quarter)
ari_quarter = adjusted_rand_score(categories, predicted_labels_quarter)
print(f"V Measure Quarter: {vmeasure_quarter}")
print(f"ARI Quarter: {ari_quarter}")

vmeasure_weighted_full = v_measure_score(categories, predicted_labels_weighted_full)
ari_weighted_full = adjusted_rand_score(categories, predicted_labels_weighted_full)
print(f"V Measure Weighted Full: {vmeasure_weighted_full}")
print(f"ARI Weighted Full: {ari_weighted_full}")

vmeasure_weighted_half = v_measure_score(categories, predicted_labels_weighted_half)
ari_weighted_half = adjusted_rand_score(categories, predicted_labels_weighted_half)
print(f"V Measure Weighted Half: {vmeasure_weighted_half}")
print(f"ARI Weighted Half: {ari_weighted_half}")

vmeasure_weighted_quarter = v_measure_score(categories, predicted_labels_weighted_quarter)
ari_weighted_quarter = adjusted_rand_score(categories, predicted_labels_weighted_quarter)
print(f"V Measure Weighted Quarter: {vmeasure_weighted_quarter}")
print(f"ARI Weighted Quarter: {ari_weighted_quarter}")

V Measure Full: 0.35007927152981044
ARI Full: -0.09223300970873786
V Measure Half: 0.40946281657246997
ARI Half: 0.02476780185758514
V Measure Quarter: 0.40946281657246997
ARI Quarter: 0.02476780185758514
V Measure Weighted Full: 0.7031321488843297
ARI Weighted Full: 0.45047489823609227
V Measure Weighted Half: 0.4160292481856192
ARI Weighted Half: -0.06418918918918919
V Measure Weighted Quarter: 0.5739323787756515
ARI Weighted Quarter: 0.16923076923076924




# Extrinsic Evaluation # 

# Sentiment Analysis #

First, we conduct some simple sentiment analysis, with a pre-trained dataset, the popular IMDB reviews dataset. This dataset come with pre-defined classes of positive and negative sentiment, and our models are tested on the accuracy at which they can predict the sentiment of the text.

In [17]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
response = requests.get(url, stream=True)

output_file = "/kaggle/working/aclImdb_v1.tar.gz"
with open(output_file, "wb") as f:
    f.write(response.content)

extract_to = "/kaggle/working/aclImdb"
with tarfile.open(output_file, "r:gz") as tar:
    tar.extractall(path=extract_to)

print("Dataset downloaded and extracted!")

Dataset downloaded and extracted!


In [18]:
data = load_files('/kaggle/working/aclImdb/aclImdb/train', categories = ['pos', 'neg'],
                 encoding = 'utf-8', decode_error = 'ignore')

pos_revs = [data.data[i] for i in range(len(data.data)) if data.target[i] == 1]
neg_revs = [data.data[i] for i in range(len(data.data)) if data.target[i] == 0]

samp_pos_revs = random.sample(pos_revs, 10000)
samp_neg_revs = random.sample(neg_revs, 10000)

df_sentiment = pd.DataFrame({
    'review': samp_pos_revs + samp_neg_revs,
    'label': [1] * 10000 + [0] * 10000
})
df_sentiment = df_sentiment.sample(frac = 1).reset_index(drop = True)

print(df_sentiment.shape)
print(df_sentiment.head())

(20000, 2)
                                              review  label
0  Four teenage girlfriends drive to Fort Laurdal...      1
1  I found 'Time At The Top' an entertaining and ...      1
2  Got to be one of the best political satires I ...      1
3  Another too bad the lowest they can go here is...      0
4  American Pie has gone a long distance from the...      0


In [19]:
X = df_sentiment['review']
y = df_sentiment['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

def encode_text(text, model, vocab):
  tokens = text.split()
  vectors = [
      model.embeddings.weight.data[vocab[word]].detach().cpu().numpy() for word in tokens if word in vocab
  ]
  if len(vectors) == 0:
    return np.zeros(100)
  return np.mean(vectors, axis = 0)

X_train_encoded_full = np.array([encode_text(review, model_full, vocab_full) for review in X_train])
X_test_encoded_full = np.array([encode_text(review, model_full, vocab_full) for review in X_test])
X_train_encoded_half = np.array([encode_text(review, model_half, vocab_half) for review in X_train])
X_test_encoded_half = np.array([encode_text(review, model_half, vocab_half) for review in X_test])
X_train_encoded_quarter = np.array([encode_text(review, model_quarter, vocab_quarter) for review in X_train])
X_test_encoded_quarter = np.array([encode_text(review, model_quarter, vocab_quarter) for review in X_test])
X_train_encoded_weighted_full = np.array([encode_text(review, weighted_model_full, vocab_full) for review in X_train])
X_test_encoded_weighted_full = np.array([encode_text(review, weighted_model_full, vocab_full) for review in X_test])
X_train_encoded_weighted_half = np.array([encode_text(review, weighted_model_half, vocab_half) for review in X_train])
X_test_encoded_weighted_half = np.array([encode_text(review, weighted_model_half, vocab_half) for review in X_test])
X_train_encoded_weighted_quarter = np.array([encode_text(review, weighted_model_quarter, vocab_quarter) for review in X_train])
X_test_encoded_weighted_quarter = np.array([encode_text(review, weighted_model_quarter, vocab_quarter) for review in X_test])

classifier_full = LogisticRegression(random_state = 42)
classifier_full.fit(X_train_encoded_full, y_train)
y_pred_full = classifier_full.predict(X_test_encoded_full)
print(f"Full Model Accuracy: {accuracy_score(y_test, y_pred_full):.4f}")
print(classification_report(y_test, y_pred_full))

classifier_half = LogisticRegression(random_state = 42)
classifier_half.fit(X_train_encoded_half, y_train)
y_pred_half = classifier_full.predict(X_test_encoded_half)
print(f"Half Model Accuracy: {accuracy_score(y_test, y_pred_half):.4f}")
print(classification_report(y_test, y_pred_half))

classifier_quarter = LogisticRegression(random_state = 42)
classifier_quarter.fit(X_train_encoded_quarter, y_train)
y_pred_quarter = classifier_quarter.predict(X_test_encoded_quarter)
print(f"Quarter Model Accuracy: {accuracy_score(y_test, y_pred_quarter):.4f}")
print(classification_report(y_test, y_pred_quarter))

classifier_weighted_full = LogisticRegression(random_state = 42)
classifier_weighted_full.fit(X_train_encoded_weighted_full, y_train)
y_pred_weighted_full = classifier_weighted_full.predict(X_test_encoded_weighted_full)
print(f"Weighted Full Model Accuracy: {accuracy_score(y_test, y_pred_weighted_full):.4f}")
print(classification_report(y_test, y_pred_weighted_full))

classifier_weighted_half = LogisticRegression(random_state = 42)
classifier_weighted_half.fit(X_train_encoded_weighted_half, y_train)
y_pred_weighted_half = classifier_weighted_half.predict(X_test_encoded_weighted_half)
print(f"Weighted Half Model Accuracy: {accuracy_score(y_test, y_pred_weighted_half):.4f}")
print(classification_report(y_test, y_pred_weighted_half))

classifier_weighted_quarter = LogisticRegression(random_state = 42)
classifier_weighted_quarter.fit(X_train_encoded_weighted_quarter, y_train)
y_pred_weighted_quarter = classifier_weighted_quarter.predict(X_test_encoded_weighted_quarter)
print(f"Weighted Quarter Model Accuracy: {accuracy_score(y_test, y_pred_weighted_quarter):.4f}")
print(classification_report(y_test, y_pred_weighted_quarter))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Full Model Accuracy: 0.7045
              precision    recall  f1-score   support

           0       0.70      0.70      0.70      2000
           1       0.70      0.70      0.70      2000

    accuracy                           0.70      4000
   macro avg       0.70      0.70      0.70      4000
weighted avg       0.70      0.70      0.70      4000

Half Model Accuracy: 0.5065
              precision    recall  f1-score   support

           0       0.50      0.95      0.66      2000
           1       0.56      0.06      0.11      2000

    accuracy                           0.51      4000
   macro avg       0.53      0.51      0.38      4000
weighted avg       0.53      0.51      0.38      4000

Quarter Model Accuracy: 0.6893
              precision    recall  f1-score   support

           0       0.69      0.69      0.69      2000
           1       0.69      0.69      0.69      2000

    accuracy                           0.69      4000
   macro avg       0.69      0.69      0.

# Named Entity Recognition #

We do some simple checks on the performance of our models on NER tasks, using a pre-trained dataset, CoNLL-2003, another very popular dataset. Our models are then tested on how well they can classify information units.

In [15]:
ner_dataset = load_dataset('conll2003')

train_data = ner_dataset['train']
test_data = ner_dataset['test']

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [24]:
def encode_sentence(tokens, model, vocab):
    """
    Encodes each token in a sentence using a pre-trained model.
    Returns individual token embeddings.
    """
    embeddings = []
    for token in tokens:
        if token in vocab:
            embedding = model.embeddings.weight.data[vocab[token]].detach().cpu().numpy()
            embeddings.append(embedding)
        else:
            embeddings.append(np.zeros(model.embeddings.embedding_dim))  # Unknown tokens

    return embeddings

X_train_full, y_train_full = [], []
X_train_half, y_train_half = [], []
X_train_quarter, y_train_quarter = [], []
X_train_weighted_full, y_train_weighted_full = [], []
X_train_weighted_half, y_train_weighted_half = [], []
X_train_weighted_quarter, y_train_weighted_quarter = [], []

for example in train_data:
    tokens = example['tokens']
    tags = example['ner_tags']
    embeddings_full = encode_sentence(tokens, model_full, vocab_full)
    embeddings_half = encode_sentence(tokens, model_half, vocab_half)
    embeddings_quarter = encode_sentence(tokens, model_quarter, vocab_quarter)
    embeddings_weighted_full = encode_sentence(tokens, weighted_model_full, vocab_full)
    embeddings_weighted_half = encode_sentence(tokens, weighted_model_half, vocab_half)
    embeddings_weighted_quarter = encode_sentence(tokens, weighted_model_quarter, vocab_quarter)
    
    X_train_full.extend(embeddings_full)
    X_train_half.extend(embeddings_half)
    X_train_quarter.extend(embeddings_quarter)
    X_train_weighted_full.extend(embeddings_weighted_full)
    X_train_weighted_half.extend(embeddings_weighted_half)
    X_train_weighted_quarter.extend(embeddings_weighted_quarter)

    y_train_full.extend(tags)
    y_train_half.extend(tags)
    y_train_quarter.extend(tags)
    y_train_weighted_full.extend(tags)
    y_train_weighted_half.extend(tags)
    y_train_weighted_quarter.extend(tags)

X_test_full, y_test_full = [], []
X_test_half, y_test_half = [], []
X_test_quarter, y_test_quarter = [], []
X_test_weighted_full, y_test_weighted_full = [], []
X_test_weighted_half, y_test_weighted_half = [], []
X_test_weighted_quarter, y_test_weighted_quarter = [], []

for example in test_data:
    tokens = example['tokens']
    tags = example['ner_tags']
    embeddings_full = encode_sentence(tokens, model_full, vocab_full)
    embeddings_half = encode_sentence(tokens, model_half, vocab_half)
    embeddings_quarter = encode_sentence(tokens, model_quarter, vocab_quarter)
    embeddings_weighted_full = encode_sentence(tokens, weighted_model_full, vocab_full)
    embeddings_weighted_half = encode_sentence(tokens, weighted_model_half, vocab_half)
    embeddings_weighted_quarter = encode_sentence(tokens, weighted_model_quarter, vocab_quarter)
    
    X_test_full.extend(embeddings_full)
    X_test_half.extend(embeddings_half)
    X_test_quarter.extend(embeddings_quarter)
    X_test_weighted_full.extend(embeddings_weighted_full)
    X_test_weighted_half.extend(embeddings_weighted_half)
    X_test_weighted_quarter.extend(embeddings_weighted_quarter)

    y_test_full.extend(tags)
    y_test_half.extend(tags)
    y_test_quarter.extend(tags)
    y_test_weighted_full.extend(tags)
    y_test_weighted_half.extend(tags)
    y_test_weighted_quarter.extend(tags)

In [25]:
classifier_full = LogisticRegression(random_state = 42)
classifier_full.fit(X_train_full, y_train_full)
y_pred_full = classifier_full.predict(X_test_full)
print(f"Full Model Accuracy: {accuracy_score(y_test_full, y_pred_full):.4f}")
print(classification_report(y_test_full, y_pred_full, zero_division = 0))

classifier_half = LogisticRegression(random_state = 42)
classifier_half.fit(X_train_half, y_train_half)
y_pred_half = classifier_half.predict(X_test_half)
print(f"Half Model Accuracy: {accuracy_score(y_test_half, y_pred_half):.4f}")
print(classification_report(y_test_half, y_pred_half, zero_division = 0))

classifier_quarter = LogisticRegression(random_state = 42)
classifier_quarter.fit(X_train_quarter, y_train_quarter)
y_pred_quarter = classifier_quarter.predict(X_test_quarter)
print(f"Quarter Model Accuracy: {accuracy_score(y_test_quarter, y_pred_quarter):.4f}")
print(classification_report(y_test_quarter, y_pred_quarter, zero_division = 0))

classifier_weighted_full = LogisticRegression(random_state = 42)
classifier_weighted_full.fit(X_train_weighted_full, y_train_weighted_full)
y_pred_weighted_full = classifier_weighted_full.predict(X_test_weighted_full)
print(f"Weighted Full Model Accuracy: {accuracy_score(y_test_weighted_full, y_pred_weighted_full):.4f}")
print(classification_report(y_test_weighted_full, y_pred_weighted_full, zero_division = 0))

classifier_weighted_half = LogisticRegression(random_state = 42)
classifier_weighted_half.fit(X_train_weighted_half, y_train_weighted_half)
y_pred_weighted_half = classifier_weighted_half.predict(X_test_weighted_half)
print(f"Weighted Half Model Accuracy: {accuracy_score(y_test_weighted_half, y_pred_weighted_half):.4f}")
print(classification_report(y_test_weighted_half, y_pred_weighted_half, zero_division = 0))

classifier_weighted_quarter = LogisticRegression(random_state = 42)
classifier_weighted_quarter.fit(X_train_weighted_quarter, y_train_weighted_quarter)
y_pred_weighted_quarter = classifier_weighted_quarter.predict(X_test_weighted_quarter)
print(f"Weighted Quarter Model Accuracy: {accuracy_score(y_test_weighted_quarter, y_pred_weighted_quarter):.4f}")
print(classification_report(y_test_weighted_quarter, y_pred_weighted_quarter, zero_division = 0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Full Model Accuracy: 0.8247
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     38323
           1       0.00      0.00      0.00      1617
           2       0.00      0.00      0.00      1156
           3       0.00      0.00      0.00      1661
           4       0.00      0.00      0.00       835
           5       0.00      0.00      0.00      1668
           6       0.00      0.00      0.00       257
           7       0.00      0.00      0.00       702
           8       0.00      0.00      0.00       216

    accuracy                           0.82     46435
   macro avg       0.09      0.11      0.10     46435
weighted avg       0.68      0.82      0.75     46435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Half Model Accuracy: 0.8247
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     38323
           1       0.00      0.00      0.00      1617
           2       0.00      0.00      0.00      1156
           3       0.00      0.00      0.00      1661
           4       0.00      0.00      0.00       835
           5       0.00      0.00      0.00      1668
           6       0.00      0.00      0.00       257
           7       0.00      0.00      0.00       702
           8       0.00      0.00      0.00       216

    accuracy                           0.82     46435
   macro avg       0.09      0.11      0.10     46435
weighted avg       0.68      0.82      0.75     46435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Quarter Model Accuracy: 0.8246
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     38323
           1       0.00      0.00      0.00      1617
           2       0.67      0.00      0.00      1156
           3       0.00      0.00      0.00      1661
           4       0.00      0.00      0.00       835
           5       0.00      0.00      0.00      1668
           6       0.00      0.00      0.00       257
           7       0.00      0.00      0.00       702
           8       0.00      0.00      0.00       216

    accuracy                           0.82     46435
   macro avg       0.17      0.11      0.10     46435
weighted avg       0.70      0.82      0.75     46435

Weighted Full Model Accuracy: 0.8213
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     38323
           1       0.00      0.00      0.00      1617
           2       0.00      0.00      0.00      1156
          