# Exercise: computing word embeddings - Continuous Bag-of-Words

Reference (PyTorch word embeddings tutorial): https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#sphx-glr-beginner-nlp-word-embeddings-tutorial-py

In [1]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Enumerate the *sorted* vocabulary: iteration order of a set of strings
# changes between interpreter runs (hash randomization), which would give
# every word a different index on each run and make results irreproducible.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build (context, target) pairs: for each position that has CONTEXT_SIZE
# neighbours on both sides, the context is those neighbours (left ones first,
# then right ones) and the target is the word at the position itself.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (raw_text[i - CONTEXT_SIZE:i]
               + raw_text[i + 1:i + CONTEXT_SIZE + 1])
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

* nn.Embedding adds a trailing dimension of size embedding_dim: index shape (.., 2) -> output shape (.., 2, 5) when embedding_dim=5

In [3]:
class CBOW(nn.Module):
    """Predict a target word from the embeddings of its context words.

    The context embeddings are concatenated (not summed), passed through a
    128-unit ReLU hidden layer, and projected to one logit per vocabulary
    word.

    Args:
        vocab_size: Number of distinct words; size of the embedding table
            and of the output layer.
        embedding_dim: Length of each word vector.
        context_size: Total number of context words per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return unnormalised vocabulary scores for the given contexts.

        Args:
            inputs: Integer tensor of word indices, either a single context
                of shape ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Logits of shape ``(batch, vocab_size)``; a single context is
            treated as a batch of one, giving ``(1, vocab_size)``.
        """
        # Promote a lone context to a batch of one so both cases share the
        # same code path and the result always has a leading batch dimension.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        # Concatenate the context embeddings per example; keeping dim 0
        # intact is what allows several contexts per call.
        x = self.embeddings(inputs).flatten(start_dim=1)
        x = F.relu(self.linear1(x))
        return self.linear2(x)

In [4]:
from torch import optim

# The network sees CONTEXT_SIZE words on each side of the target, so the
# total number of context words per example is twice CONTEXT_SIZE.
model = CBOW(
    vocab_size,
    embedding_dim=3,
    context_size=2 * CONTEXT_SIZE,
)

# Cross-entropy between the vocabulary logits and the target word index.
loss_function = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over every model parameter.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# One summed training loss per epoch is collected here.
losses = list()

In [5]:
# Convert every (context, target) pair to index tensors once, up front.
# The vocabulary indices never change between epochs, so rebuilding the
# same tensors inside the epoch loop is pure repeated work.
training_pairs = [
    (
        torch.tensor([word_to_ix[w] for w in context], dtype=torch.long),
        torch.tensor([word_to_ix[target]], dtype=torch.long),
    )
    for context, target in data
]

for epoch in range(10):
    total_loss = 0
    for context_indices, target_index in training_pairs:
        # Gradients accumulate across backward() calls, so clear them
        # before processing each example.
        model.zero_grad()
        logits = model(context_indices)
        loss = loss_function(logits, target_index)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float, so the running total does not
        # keep the autograd graph alive.
        total_loss += loss.item()
    losses.append(total_loss)

# Bare expression: displayed as the cell's output in the notebook.
losses

[226.6508104801178,
 225.39487028121948,
 224.1544623374939,
 222.92901062965393,
 221.71839547157288,
 220.52208828926086,
 219.3393349647522,
 218.16984701156616,
 217.0133113861084,
 215.86875104904175]

### nn.EmbeddingBag

In [6]:
# Sum-pool ordinary embeddings by hand: look up a vector for every index,
# then add the vectors within each row. Both rows contain indices 1 and 2
# (in different order), so the two pooled vectors are identical.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
inp = torch.tensor([[1, 2], [2, 1]], dtype=torch.long)
embedding(inp).sum(dim=-2)

tensor([[-0.3886,  1.5494, -1.0720],
        [-0.3886,  1.5494, -1.0720]], grad_fn=<SumBackward1>)

In [7]:
# nn.EmbeddingBag in 'sum' mode performs the lookup and the per-row sum in a
# single step: each row of a 2-D index tensor is pooled into one vector.
embedding_sum = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='sum')
inp = torch.tensor([[1, 2], [2, 1]], dtype=torch.long)
embedding_sum(inp)

tensor([[ 1.4653,  0.7969, -1.0544],
        [ 1.4653,  0.7969, -1.0544]], grad_fn=<EmbeddingBagBackward>)