# Word2Vec

To simplify, we will train the following small corpus from NLTK.

In [1]:
from nltk.corpus import inaugural

raw_corpus = inaugural.raw("1789-Washington.txt")
print(f"words (including punctuation): {len(inaugural.words('1793-Washington.txt'))}")
raw_corpus

words (including punctuation): 147


'Fellow-Citizens of the Senate and of the House of Representatives:\n\nAmong the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could not

In [2]:
import torch
import torch.nn as nn
import lightning as L

from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch import optim

## Aside: understanding `nn.Embedding` in PyTorch

`nn.Embedding` generates a $n\times m$ matrix where n = number of words (ie. vocabulary) and m = dimension of each word embedding. The creation of a `nn.Embedding` object will randomly initialize its entries (user can specify distribution to draw from) in model class relation.

A forward pass to a `nn.Embedding` object takes a torch tensor of `torch.int64` data type (ie. `torch.LongTensor` type) and returns the corresponding indicies row of embeddings to the integer entries of the input. (ex. [0] will return the 1st embedding vector, see examples in cells below). 


In [3]:
embedding = nn.Embedding(3, 3) # create a 3x3 matrix of embeddings
print(embedding.weight)
input = torch.tensor([0, 1], dtype=torch.int64) # returns the 1st and 2nd ROW of the matrix
print(input.dtype)
embedding(input)

Parameter containing:
tensor([[-1.5434, -2.1433, -0.5919],
        [ 0.3199,  0.3632,  1.5093],
        [-1.1995, -0.9635, -0.2396]], requires_grad=True)
torch.int64


tensor([[-1.5434, -2.1433, -0.5919],
        [ 0.3199,  0.3632,  1.5093]], grad_fn=<EmbeddingBackward0>)

In [4]:
embedding = nn.Embedding(5, 3) # create a 5x3 matrix of embeddings
print(embedding.weight)
input = torch.tensor([[0, 1], [3,4]], dtype=torch.long) 
# returns 3rd order tensor with first entry in 1st dimension = 1st and 2nd row embedding vectors and second entry = 4th and 5th row embedding vectors
embedding(input)

Parameter containing:
tensor([[-0.2629,  1.2739,  1.8622],
        [ 1.0560,  0.5023,  0.9591],
        [-1.5599, -0.6480,  0.1509],
        [-1.6602, -0.4377,  0.8469],
        [-0.3645,  1.2964,  0.8005]], requires_grad=True)


tensor([[[-0.2629,  1.2739,  1.8622],
         [ 1.0560,  0.5023,  0.9591]],

        [[-1.6602, -0.4377,  0.8469],
         [-0.3645,  1.2964,  0.8005]]], grad_fn=<EmbeddingBackward0>)

## Skip-gram architecture

### Model inputs

Skip-gram word2vec takes in a pair of integers that represent a word pairing between a "center word" and an "outside word" the integers are usually the index of the word (or token) in the vocabulary (list object of words/token the model is to be trained on). The center word here is self-explanatory. What determines if the word should be outside is based on the `window` parameter of the model where it determines how many slots are to be considered (usually 2-4 slots). 

In [5]:
# create dataset class to handle corpus and feeding dataset for training + inference (ie. generate integer pairs from a corpus)

class SkipGramDataset(Dataset):
    def __init__(
        self, 
        corpus: str, 
        window_size: int = 2,
        min_count: int = 5 # required minimum times appeared by the word in corpus to be incorporated into vocabulary
    ):
        self.window_size = window_size

        self.tokens = self._tokenize(corpus)
        self.vocab, self.word2idx, self.idx2word = self._build_vocab(min_count)
        self.pairs = self._generate_pairs()
        
    def __len__(self) -> int:
        return len(self.pairs)
    
    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        target = self.pairs[index]["target"]
        context = self.pairs[index]["context"]
        return torch.tensor([target], dtype=torch.long), torch.tensor([context], dtype=torch.long)
    
    def _tokenize(self, corpus: str) -> list[str]:
        tokens = corpus.lower().split()
        return tokens
    
    def _build_vocab(self, min_count: int) -> tuple[list[str], dict[str, int], dict[int, str]]:
        """Build vocabulary from tokens where it takes only words that occur more than minimal count and indices them in order of appearance"""
        word_counts = Counter(self.tokens)
        vocab = [word for word, count in word_counts.items() if count >= min_count]
        word_2_index_dict = {word: idx for idx, word in enumerate(vocab)}
        index_2_word_dict = {idx: word for word, idx in word_2_index_dict.items()}
        return vocab, word_2_index_dict, index_2_word_dict
    
    def _generate_pairs(self) -> list[dict[str, int]]:
        """
            For each word i in the corpus which are in the vocabulary:
                create a pair where:
                    1. target: index of the word i
                    2. context: index of nth closest word (that is in the vocabulary) to i (where n is dictated by window_size)
                append pair to pair list
                repeat for same target with (n - 1)th closest applicable word 
                    and onwards until window size exhausted
        """
        pairs = []
        indexed_tokens = [self.word2idx[token] for token in self.tokens if token in self.vocab]

        for i, target_index in enumerate(indexed_tokens):
            #Get context window
            start = max(0, i - self.window_size)
            end = min(len(indexed_tokens), i + self.window_size + 1)

            context_indices = [indexed_tokens[j] for j in range(start, end) if j != i]

            # Skip-gram: predict context from target
            for context_index in context_indices:
                pairs.append({
                    "target": target_index, 
                    "context": context_index
                })

        return pairs



In [6]:
corpus_dataset = SkipGramDataset(raw_corpus)
corpus_dataset[0]

(tensor([0]), tensor([1]))

### Model

In [7]:
class SkipGram(L.LightningModule):
    def __init__(self, vocab_size: int, embedding_dim: int, learning_rate: float):
        super().__init__()
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim) 
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize with small random values
        self.v_embeddings.weight.data.uniform_(-0.5 / embedding_dim, 0.5 / embedding_dim)
        self.u_embeddings.weight.data.uniform_(-0.5 / embedding_dim, 0.5 / embedding_dim)

        # Hyperparamters
        self.learning_rate = learning_rate

    def forward(self, center_word: torch.Tensor, outside_word: torch.Tensor) -> torch.Tensor:
        v_c = self.v_embeddings(center_word) #the embeddings are essentially row vectors
        u_o = self.u_embeddings(outside_word) #the embeddings are essentially row vectors

        numerator = torch.exp((u_o @ v_c.transpose(1, 2)).squeeze(2)) 
        lower_product = torch.exp((self.u_embeddings.weight.data @ v_c.transpose(1, 2)).squeeze(2)) 
        denominator = torch.sum(lower_product)

        probability = numerator / denominator

        return probability
    
    def loss(self, center_word: torch.Tensor, outside_word: torch.Tensor) -> torch.Tensor:
        _probability = self.forward(center_word, outside_word)
        loss = -torch.mean(torch.log(_probability).unsqueeze(1))

        return loss
    
    def training_step(self, batch: tuple[torch.Tensor], batch_idx: int):
        _input_target, _input_context = batch
        training_loss = self.loss(_input_target, _input_context)
        return training_loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer



In [8]:
# test some outputs
model = SkipGram(len(corpus_dataset.vocab), 10, learning_rate=0.001)
output = model(torch.LongTensor([[0]]), torch.LongTensor([[2]]))
output

tensor([[0.0251]], grad_fn=<DivBackward0>)

In [9]:
loss = model.loss(torch.LongTensor([[0]]), torch.LongTensor([[2]]))
loss

tensor(3.6865, grad_fn=<NegBackward0>)

### Training

In [10]:
max_epochs = 200
batch_size = 10

model = SkipGram(len(corpus_dataset.vocab), 10, learning_rate=0.001)
trainer = L.Trainer(max_epochs=200)

dataloader = DataLoader(corpus_dataset, batch_size=batch_size)

trainer.fit(model, train_dataloaders=dataloader)


ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
/home/tony/anaconda3/envs/rec-sys/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name         | Type      | Params | Mode 
---------------------------------------------------
0 | v_embeddings | Embedding | 400    | train
1 | u_embeddings | Embedd

Epoch 199: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 279/279 [00:00<00:00, 629.27it/s, v_num=32]

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 279/279 [00:00<00:00, 625.73it/s, v_num=32]


### Get embeddings for given word

In [11]:
def get_skipgram_embedding(word: str, corpus: Dataset = corpus_dataset) -> torch.Tensor:
    try:
        idx = corpus.word2idx[word]
    except KeyError:
        raise Exception(f"{word} is not part of the vocabulary")
    else:
        _u = model.u_embeddings.weight.data[idx]
        _v = model.v_embeddings.weight.data[idx]

        embedding = (_u + _v) / 2 # the entries of the embedding vector for a given word is the mean of the u and v components

    return embedding

In [12]:
# get embedding for the word this:
this = get_skipgram_embedding("this")
this

tensor([ 6.0934,  0.6981,  1.0869,  5.4527,  4.2140, -3.8069,  4.3484,  3.7076,
         1.7668, -2.1651])

## Continous Bag of Words (CBoW) architecture

### Model inputs

In [13]:
from torch.utils.data import TensorDataset

class CBoWDataset:
    def __init__(self, corpus: str, window_size: int = 2, min_word_count: int = 2):
        self.corpus = corpus
        self.window_size = window_size
        self.min_freq = min_word_count
        self.data = self._create_training_data()

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        target = self.data[index]["target"]
        context = self.data[index]["context"]
        return torch.tensor([context], dtype=torch.long), torch.tensor([target], dtype=torch.long)
    
    def _tokenize(self, corpus: str) -> list[str]:
        tokens = corpus.lower().split()
        return tokens
    
    def __len__(self) -> int:
        return len(self.data)

    def _create_training_data(self):
        _tokenized_text = self._tokenize(self.corpus)
        word_counts = Counter(_tokenized_text)
        self.vocab = [word for word, count in word_counts.items() if count >= self.min_freq]
        self.word_2_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.index_2_word  = {idx: word for word, idx in self.word_2_index.items()}

        _indexed_tokens = [self.word_2_index[word] for word in _tokenized_text if word in self.vocab]

        data = []
        for i, target_index in enumerate(_indexed_tokens):
            #skip or end iterations where there are not enough context words (ie. out of corpus)
            if i < self.window_size:
                continue
            if i + self.window_size >= len(_indexed_tokens):
                break

            start = max(0, i - self.window_size)
            end = i + self.window_size + 1

            context_indices = [_indexed_tokens[j] for j in range(start, end) if j != i]
            data.append({
                "context": context_indices,
                "target": target_index
            })

        return data
    
    def get_pytorch_dataset(self) -> TensorDataset: #TODO: better refine this class into a subclass of Pytorch Dataset class.
        if self.data is None:
            raise AttributeError("Dataset is empty in object")
        
        context_tensor = torch.tensor([d["context"] for d in self.data])
        target_tensor = torch.tensor([d["target"] for d in self.data])

        return TensorDataset(context_tensor, target_tensor)

            


A record of training dataset for CBoW would be indices of the neighboring words (context) and the index of the target word (target). Below is an example of what a record in training data would look like:

In [14]:
cbow_dataset = CBoWDataset(raw_corpus)
cbow_dataset[0]

(tensor([[0, 1, 0, 1]]), tensor([2]))

### Model

In [15]:
class CBoW(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int = 100, learning_rate: int = 0.001):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.learning_rate = learning_rate
        
    def forward(self, context: torch.Tensor) -> torch.Tensor:
        embeddings = self.embeddings(context).mean(1).squeeze(1)
        scores = self.linear(embeddings)
        return scores
    

Below is an example of what an output of the model would look like.

In [16]:
m = CBoW(len(cbow_dataset.vocab), 10)
print(m(cbow_dataset[40][0]).shape)
m(cbow_dataset[40][0])

torch.Size([1, 134])


tensor([[-0.3612, -0.0851,  0.1617, -0.4738, -0.7177,  0.4703, -0.4646,  0.0479,
          0.2783,  0.5770, -0.0645,  0.3715,  0.3509, -0.1515,  0.0447,  0.1342,
          0.3677,  0.5179, -0.5440,  0.2223,  0.1782,  0.6553, -0.1591,  0.0101,
         -0.0176,  0.1313, -0.4974,  0.5783, -0.2171, -0.1124, -0.5484, -0.6421,
          0.2231, -0.2881, -0.4699, -0.0294, -0.2904,  0.0722, -0.2899,  0.4874,
          0.2892,  0.4108, -0.2286,  0.1201,  0.1087, -0.0586, -0.2182,  0.0404,
         -0.4564, -0.1814,  0.1302, -0.4615,  0.4367,  0.0845, -0.5777,  0.1769,
         -0.0140,  0.0427,  0.0379,  0.0253,  0.7221,  0.4631, -0.1402, -0.8026,
          0.4313,  0.2072, -0.6968,  0.1046,  0.0069,  0.3584, -0.1209,  0.0765,
         -0.4060,  0.2188, -0.4560, -0.1932, -0.3899,  0.0207,  0.1750, -0.6605,
          0.4231,  0.1385,  0.0905, -0.1333,  0.0224, -0.0477, -0.1873,  0.0180,
         -0.2859,  0.4312, -0.1609, -0.5253, -0.1304, -0.3109, -0.4464,  0.2957,
          0.4460, -0.3395, -

### Training

In [17]:
max_epochs = 500
batch_size = 30
learning_rate = 0.0001

cbow_dataset = CBoWDataset(raw_corpus)
model = CBoW(len(cbow_dataset.vocab), 100)
cbow_torch_dataset = cbow_dataset.get_pytorch_dataset()
dataloader = DataLoader(cbow_torch_dataset, batch_size=batch_size, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

for i in range(0, max_epochs+1):
    for batch_idx, data in enumerate(dataloader):
        _context, _target = data
        optimizer.zero_grad()
        pred = model(_context.squeeze(0))
        loss = loss_fn(pred, _target.squeeze(0))
        loss.backward()

        optimizer.step()

    if i % 50 == 0:
        print(f"epoch: {i}, loss: {loss.item()}")

epoch: 0, loss: 4.860497951507568
epoch: 50, loss: 3.501563549041748
epoch: 100, loss: 3.432368040084839
epoch: 150, loss: 3.2841267585754395
epoch: 200, loss: 2.3874926567077637
epoch: 250, loss: 2.4139373302459717
epoch: 300, loss: 2.013295888900757
epoch: 350, loss: 1.8555536270141602
epoch: 400, loss: 1.5092122554779053
epoch: 450, loss: 1.6561615467071533
epoch: 500, loss: 1.4356046915054321


### Retrieving embeddings

In [18]:
def get_cbow_embedding(word: str, corpus: CBoWDataset = cbow_dataset) -> torch.Tensor:
    try:
        idx = corpus.word_2_index[word]
    except KeyError:
        raise Exception(f"{word} is not part of the vocabulary")
    else:
        embedding = model.embeddings.weight.data[idx]

    return embedding

get_cbow_embedding("this")

tensor([ 8.7227e-01, -1.4642e+00,  3.3227e-01, -1.1976e+00,  9.9508e-01,
        -1.9185e+00,  1.6873e+00, -2.1981e-01, -1.3914e+00, -4.5306e-01,
        -8.8307e-01,  3.6806e-01, -1.0049e-01, -5.1561e-01,  5.3933e-01,
         1.7580e+00, -1.6260e+00,  9.0106e-02, -1.4081e+00,  5.6344e-01,
        -4.0096e-02, -1.0726e-01,  1.6529e+00,  1.9768e+00, -2.3835e+00,
        -1.7071e+00, -1.3035e-01,  6.0507e-01,  8.0239e-02,  7.6443e-01,
        -7.3667e-01,  1.1619e+00, -5.1986e-01,  8.4223e-01, -1.2132e+00,
         1.3615e+00, -5.3674e-01,  2.5407e-01, -2.1256e+00, -2.0824e+00,
        -1.2459e+00,  1.2917e-01,  7.6094e-01,  6.7789e-01, -5.5524e-01,
        -9.8033e-02, -2.5199e-01,  8.1439e-01, -1.3252e-04,  4.1599e-01,
        -8.0540e-01, -7.4667e-01,  1.0687e+00,  2.8368e-01,  7.0659e-01,
        -3.4302e-01,  2.9099e-01,  9.6474e-01,  1.6937e+00,  8.2804e-01,
         4.1342e-01, -6.6189e-01, -8.8142e-01, -1.4374e+00, -9.3793e-01,
         1.5646e+00, -1.1017e+00,  2.0329e+00,  1.1