### This notebook uses the `.pt` files (tensors) of the corpus, training data, word-to-id, and id-to-word mappings to train a CBOW model

We will follow the steps below: 
- create a CBOWModel class that contains the embedding layer that creates and learns embeddings
- create a 'train cbow model' function that learns the embeddings during training and returns a loss over time
- create a 'get word embeddings' function that stores the learned embeddings from the cbow model as self.embeddings.weight

**To optimise data handling, we will use PyTorch's `Dataset` and `DataLoader` classes.**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import time
import os
import numpy as np

In [2]:
# Select the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"\nUsing device: {device}")

if cuda_available:
    # Describe GPU 0 (name and total memory in GB).
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Turn on the cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True


Using device: cuda
GPU: NVIDIA RTX A4000
GPU Memory: 16.78 GB


In [6]:
# Function to load data with memory mapping
def load_data_with_mmap(file_path, map_location=None):
    if map_location is None:
        map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    
    print(f"Loading {file_path} with memory mapping...")
    return torch.load(file_path, map_location=map_location)

In [7]:
# Load the preprocessed artefacts through the helper defined above, so that a
# missing file fails fast with a clear FileNotFoundError. Everything is kept on
# the CPU: the Dataset/DataLoader cell reads these objects on the host side
# (worker processes, pinned memory).
DATA_DIR = "../data"

training_data = load_data_with_mmap(f"{DATA_DIR}/eve_training_data.pt", map_location="cpu")
word_to_id = load_data_with_mmap(f"{DATA_DIR}/eve_word_to_id.pt", map_location="cpu")
id_to_word = load_data_with_mmap(f"{DATA_DIR}/eve_id_to_word.pt", map_location="cpu")
corpus = load_data_with_mmap(f"{DATA_DIR}/eve_corpus.pt", map_location="cpu")


KeyboardInterrupt: 

In [7]:


class CBOWDataset(Dataset):
    """Dataset of (context, target) pairs that yields vocabulary ids as tensors.

    Args:
        data: Indexable collection whose items unpack into ``(context, target)``,
            where ``context`` is an iterable of words and ``target`` is one word.
        word_to_id: Mapping from a word to its integer id.
    """

    def __init__(self, data, word_to_id):
        self.word_to_id = word_to_id
        self.data = data

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(context_ids, target_id)`` long tensors."""
        context_words, target_word = self.data[idx]
        vocab = self.word_to_id

        # Translate each context word into its id, preserving order.
        id_list = []
        for word in context_words:
            id_list.append(vocab[word])

        context_ids = torch.tensor(id_list, dtype=torch.long)
        target_id = torch.tensor(vocab[target_word], dtype=torch.long)
        return context_ids, target_id

In [8]:
# Split/batching settings gathered in one place so they are easy to tune.
TRAIN_FRACTION = 0.7
BATCH_SIZE = 128
NUM_WORKERS = 4
SPLIT_SEED = 42

dataset = CBOWDataset(training_data, word_to_id)
train_size = int(TRAIN_FRACTION * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# Seed the split so the same samples land in train/test on every run.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Pinned (page-locked) host memory only helps host-to-GPU copies, so request it
# only when a CUDA device is actually available.
use_pinned_memory = torch.cuda.is_available()

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=use_pinned_memory,
    num_workers=NUM_WORKERS,  # load batches in parallel worker processes
)

# Evaluation batches keep a fixed order.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    pin_memory=use_pinned_memory,
    num_workers=NUM_WORKERS,
)


NameError: name 'word_to_id' is not defined

In [None]:
# NOTE(review): unexecuted, unfinished draft -- `__init__` has no body, so this
# cell fails to parse if it is run. The complete CBOWModel is defined in the
# next cell; this stub looks safe to delete.
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):

In [5]:
# create the cbow model
class CBOWModel(nn.Module):
    """Continuous bag-of-words model: embed, average over dim 1, project to the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Lookup table holding one vector per vocabulary id.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Maps the averaged embedding back to one score per vocabulary entry.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each row of ``inputs``.

        ``inputs`` holds vocabulary ids; presumably shaped (batch, context_size),
        since the embeddings are averaged over dim 1 -- confirm against the caller.
        """
        context_vectors = self.embeddings(inputs)
        pooled = context_vectors.mean(dim=1)  # average the context embeddings
        logits = self.linear(pooled)
        return F.log_softmax(logits, dim=1)



In [3]:
# Report whether this kernel session can see a CUDA device, with GPU details if so.
print(f"CUDA available: {torch.cuda.is_available()}")
cuda_visible = torch.cuda.is_available()
if cuda_visible:
    # Name and total memory (in GB) of GPU 0.
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

CUDA available: False
