In [3]:
from collections import Counter
import re


wiki_text_file = "../data/wiki_text_data.txt"


with open(wiki_text_file, "r") as file:
   text = file.read()


def tokenizer(text):
  
   # remove punctuation and non alphabetic characters
   remove_punctuation = re.sub(r'[^\w\s]', '', text)
   lower_case_words = remove_punctuation.lower()
   words = lower_case_words.split(' ')


   # print count of words in split_words_by_whitespace
   print(f"Number of words before filtering: {len(words)}")


   # get word counts


   top_k = 30000
   word_counts = Counter(words)
   top_words = dict(word_counts.most_common(top_k))
   word_to_id = {word: i for i, word in enumerate(top_words.keys())}
   id_to_word = {i: word for i, word in enumerate(top_words.keys())}


   # Sum their counts
   total_count = sum(count for word, count in top_words.items())


   print(f"Total count of top {top_k} words: {total_count}")
   # Optional: Show what percentage of all words this represents
   total_words = sum(word_counts.values())
   percentage = (total_count / total_words) * 100
   print(f"This represents {percentage:.2f}% of all words in the corpus")


   # filter corpus to only include words in the tok k words
   corpus = [word for word in words if word in top_words]
   print("corpus length:", len(corpus))


   return word_to_id, id_to_word, corpus

In [4]:
# usage
word_to_id, id_to_word, corpus = tokenizer(text)

Number of words before filtering: 17005208
Total count of top 30000 words: 16315126
This represents 95.94% of all words in the corpus
corpus length: 16315126


In [5]:



#print(corpus[:100])

# Generate the training data from the corpus
# The training data looks like a list of tuples,
# where each tuple contains a list of context words and the target word (not the IDs)


def generate_training_data(corpus):
   data = []


   # start from index 2 and end 2 positions before the last word
   # this ensures we always have 2 words before and after the target word
   # for a 5-len sliding window


   for i in range(2, len(corpus) - 2):
       # Get the context words
       # 'i' is the index of the target word
       # [i-2:i] gets the two words before the target word
       # [i+1:i+3] gets the two words after the target word
       context_words = corpus[i-2:i] + corpus[i+1:i+3]
      
       # Get the target word
       target_word = corpus[i]


       # Append the tuple to the data list
       data.append((context_words, target_word))


   return data


In [6]:
# usage
training_data = generate_training_data(corpus)
print("CBOW training data generated")

CBOW training data generated


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import time
import os
import numpy as np
import wandb




In [2]:
# Initialize wandb with your configuration
wandb.init(
   project="cbow-wiki",
   config={
       # Model parameters
       "embedding_dim": 200,
       "vocab_size": 30000,
      
       # Training parameters
       "batch_size": 128,
       "learning_rate": 0.001,
       "num_epochs": 5,
       "train_split": 0.7,
      
       # Optimizer parameters
       "weight_decay": 1e-5,
      
       # DataLoader parameters
       "num_workers": 4,
      
       # Architecture details
       "model_type": "CBOW",
       "context_size": 4  # 2 words before + 2 words after
   }
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mevelyntants[0m ([33mbryars-bryars[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Then use the config values throughout your code
EMBEDDING_DIM = wandb.config.embedding_dim
BATCH_SIZE = wandb.config.batch_size
LEARNING_RATE = wandb.config.learning_rate
NUM_EPOCHS = wandb.config.num_epochs
TRAIN_SPLIT = wandb.config.train_split


In [8]:
class CBOWDataset(Dataset):
   def __init__(self, data, word_to_id):
       self.data = data
       self.word_to_id = word_to_id


   # overriding the __len__ method to tell PyTorch how many samples you have
   def __len__(self):
       return len(self.data)


   # overriding the __getitem__ method
   # to tell PyTorch how to retrieve a specific sample and convert it to the format your model expects
   def __getitem__(self, idx):
       context, target = self.data[idx]
       context_ids = torch.tensor([self.word_to_id[word] for word in context], dtype=torch.long)
       target_id = torch.tensor(self.word_to_id[target], dtype=torch.long)
       return context_ids, target_id

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")


if torch.cuda.is_available():
   print(f"GPU: {torch.cuda.get_device_name(0)}")
   print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
   # Enable cuDNN auto-tuner
   torch.backends.cudnn.benchmark = True

dataset = CBOWDataset(training_data, word_to_id)
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


# Create data loaders with GPU pinning
train_loader = DataLoader(
   train_dataset,
   batch_size=wandb.config.batch_size,
   shuffle=True,
   pin_memory=True,  # Enable pinning for faster GPU transfer
   num_workers=wandb.config.num_workers     # Use multiple workers for data loading
)


test_loader = DataLoader(
   test_dataset,
   batch_size=wandb.config.batch_size,
   shuffle=False,
   pin_memory=True,
   num_workers=wandb.config.num_workers
)



Using device: cuda
GPU: NVIDIA GeForce RTX 3080
GPU Memory: 10.39 GB


In [9]:
class CBOW(torch.nn.Module):
   def __init__(self, vocab_size, embedding_dim):
       super(CBOW, self).__init__()
       self.embedding = nn.Embedding(vocab_size, embedding_dim)
       self.linear = nn.Linear(embedding_dim, vocab_size)


   def forward(self, inputs):
       embed = self.embedding(inputs)
       embed = embed.mean(dim=1)
       out = self.linear(embed)
       probs = F.log_softmax(out, dim=1)
       return probs
      


In [10]:
model = CBOW(
   vocab_size=wandb.config.vocab_size,
   embedding_dim=wandb.config.embedding_dim)


model = model.to(device)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                            lr=wandb.config.learning_rate,
                            weight_decay=wandb.config.weight_decay)




#Add evaluation function
def evaluate(model, test_loader, criterion, device):
   model.eval()
   total_loss = 0
   with torch.no_grad():
      for context, target in test_loader:
           context, target = context.to(device), target.to(device)
           output = model(context)
           loss = criterion(output, target)
           total_loss += loss.item()
   return total_loss / len(test_loader)





In [11]:
# Initialize best_test_loss with infinity before the training loop
best_test_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement
epochs_no_improve = 0  # Counter for epochs with no improvement

# Modified training loop with progress bar and test loss
for epoch in range(wandb.config.num_epochs):
   model.train()
   total_loss = 0
  
   # Create progress bar for each epoch
   progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{wandb.config.num_epochs}')
  
   for batch_idx, (context, target) in enumerate(progress_bar):
       context, target = context.to(device), target.to(device)
      
       optimizer.zero_grad()
       output = model(context)
       loss = criterion(output, target)
       loss.backward()
       optimizer.step()
      
       total_loss += loss.item()
       current_loss = total_loss / (batch_idx + 1)
      
       # Progress bar for both losses
       progress_bar.set_postfix({
           'train_loss': f'{current_loss:.4f}'
       })
      
       # Log batch metrics
       wandb.log({
           "batch_loss": loss.item(),
           "batch": batch_idx + epoch * len(train_loader)
       })
  
   # Calculate average training loss
   train_loss = total_loss / len(train_loader)
  
   # Calculate test loss
   test_loss = evaluate(model, test_loader, criterion, device)
  
   # Print epoch summary
   print(f'\nEpoch {epoch+1}/{wandb.config.num_epochs}:')
   print(f'Train Loss: {train_loss:.4f}')
   print(f'Test Loss: {test_loss:.4f}')
  
   # Log epoch metrics
   wandb.log({
       "epoch": epoch + 1,
       "train_loss": train_loss,
       "test_loss": test_loss,
       "learning_rate": optimizer.param_groups[0]['lr'],
       "embedding_dim": wandb.config.embedding_dim,
       "vocab_size": wandb.config.vocab_size,
       "batch_size": wandb.config.batch_size,
       "weight_decay": wandb.config.weight_decay,
       "num_workers": wandb.config.num_workers
   })


   # Save checkpoint if test loss improved
   if test_loss < best_test_loss:
       best_test_loss = test_loss
       print(f"New best test loss: {test_loss:.4f}. Saving checkpoint...")
      
       # Save model checkpoint
       checkpoint_path = f"cbow_best_model_epoch{epoch+1}.pt"
       torch.save({
           'epoch': epoch,
           'model_state_dict': model.state_dict(),
           'optimizer_state_dict': optimizer.state_dict(),
           'train_loss': train_loss,
           'test_loss': test_loss,
           'config': wandb.config,
       }, checkpoint_path)
      
       # Extract the embedding matrix from the model
       embedding_weights = model.embedding.weight.data.cpu().numpy()


       # Log embedding weights histogram to wandb
       wandb.log({
           "embedding_weights": wandb.Histogram(embedding_weights)
       })


       # save embeddings as .pt version
       torch.save(model.embedding.weight.data.cpu(), f"./model/embeddings_epoch{epoch+1}.pt")


       # Log to wandb
       embedding_path = f"./model/embeddings_epoch{epoch+1}.pt"
       wandb.save(embedding_path)
       print(f"Embeddings saved to {embedding_path}")   


       # Save to wandb
       wandb.save(checkpoint_path)
       print(f"Checkpoint saved to {checkpoint_path}")
       
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered. No improvement in test loss for {patience} epochs.")
            break




Epoch 1/5: 100%|██████████| 89224/89224 [04:14<00:00, 349.95it/s, train_loss=6.2445]



Epoch 1/5:
Train Loss: 6.2445
Test Loss: 6.1203


NameError: name 'best_test_loss' is not defined

In [None]:
# Save final model
final_checkpoint_path = "./model/cbow_final_model.pt"
torch.save({
   'epoch': wandb.config.num_epochs,
   'model_state_dict': model.state_dict(),
   'optimizer_state_dict': optimizer.state_dict(),
   'train_loss': train_loss,
   'test_loss': test_loss,
   'config': wandb.config,
}, final_checkpoint_path)
wandb.save(final_checkpoint_path)
print(f"\nTraining completed. Final model saved to {final_checkpoint_path}")

