In [1]:
# Imports

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

torch.manual_seed(1)

<torch._C.Generator at 0x10ceac470>

### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py

In [9]:
import nltk
class Vocabulary():
    def __init__(self, filepath):
        super(Vocabulary, self).__init__()
        self.filepath = filepath
        self.tokens = self.nltk_tokenize()
        self.tok_to_ids, self.ids_to_tok = self.make_dicts()
        self.nr_unique_tokens = len(self.vocabulary_set())
    
    def readfile(self):
        """this function opens the file and returns the text in a string"""
        file = open(self.filepath)
        text = file.read()
        file.close()
        return text
    
    def nltk_tokenize(self):
        """this function tokenizes the text and returns a list of tokens as strings"""
        text = self.readfile()
        tokens = nltk.tokenize.word_tokenize(text)
        return tokens
    
    def vocabulary_set(self):
        """this function returns a list of unique tokens"""
        return(list(set(self.tokens)))
    
    def make_dicts(self):
        unique_tokens = self.vocabulary_set()
        tok_to_ix = {}
        ix_to_tok = {}
        for i in range(len(unique_tokens)):
            tok_to_ix.update({unique_tokens[i]: i})
            ix_to_tok.update({i: unique_tokens[i]})
        return tok_to_ix, ix_to_tok

    def __len__(self):
        return len(self.tok_to_ids)
        

In [3]:
class Vectorizer(object):
    def __init__(self, vocabulary):
        self.vocab = vocabulary
    
    def vectorize(self, context_words):
        context_ids = [self.vocab.tok_to_ids[w] for w in context_words]
        return torch.tensor(context_ids, dtype=torch.long)


In [10]:
filepath = 'test_corpus.txt'
test_vocab = Vocabulary(filepath)
vectorizer = Vectorizer(test_vocab)

# Size of the context windows, 2 and 5 are supposed to be used in ex02...
# range \in [2, 1/2 * document_length - 1]
CONTEXT_SIZE = 2

# let's stick with this notation for now ;)
CONTEXT_WINDOW_SIZE = CONTEXT_SIZE * 2


# Data creation - get context around the target word
data = []
tokens = test_vocab.tokens
for i in range(CONTEXT_SIZE, len(tokens) - CONTEXT_SIZE):
    # Context before w_i
    context_before_w = tokens[i - CONTEXT_SIZE: i]
    
    # Context after w_i
    context_after_w = tokens[i + 1: i + CONTEXT_SIZE + 1]
    
    # Put them together
    context_window = context_before_w + context_after_w
    
    # Target = w_i
    target = tokens[i]
    
    # Append in the correct format
    data.append((context_window, target))



In [5]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_window_size, nr_hidden_neurons=128):
        super(CBOW, self).__init__()
        self.context_window_size = context_window_size
        
        # Embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # note: this probably doesn't deal with 'UNK' words
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons)  
        
        # output layer
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

        
    def forward(self, inputs):
        # shape = (WINDOW_SIZE, EMBEDDING_DIM) -> (EMBEDDING_DIM)
        embeds = sum(self.embeddings(inputs))

        # shape = (1, EMBEDDING_DIM)
        # -1 param in view() ... "the actual value for this dimension will be inferred so that the number of elements in the view matches the original number of elements."
        embeds_2D = embeds.view(1, -1)
        
        # finally compute the hidden layer weighted sum (a.k.a. output before using the activation function)
        # ... and don't forget to divide by the number of input vectors
        h =  self.linear1(embeds_2D) / self.context_window_size
        
        # output of the hidden layer
        out =  F.relu(h) 
         
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [6]:
NUM_ITERATIONS = 100
EMBEDDING_DIM = 50

losses = []
loss_function = nn.NLLLoss() # negative log likelihood loss
model = CBOW(test_vocab.nr_unique_tokens, EMBEDDING_DIM, CONTEXT_WINDOW_SIZE)

optimizer = optim.SGD(model.parameters(), lr=0.1)

print(model.embeddings.weight)

for epoch in tqdm(range(NUM_ITERATIONS)):
    total_loss = 0
    for context, target in data:
        # Step1. Create input vector 
        context_vectors = vectorizer.vectorize(context)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_vectors)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        target = torch.tensor(vectorizer.vocab.tok_to_ids[target], dtype=torch.long).view(1)
        loss = loss_function(log_probs, target)
        
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    
print(losses)
print(model.embeddings.weight)

  0%|          | 0/100 [00:00<?, ?it/s]

Parameter containing:
tensor([[-1.5256, -0.7502, -0.6540,  ..., -0.6298, -0.9274,  0.5451],
        [ 0.0663, -0.4370,  0.7626,  ...,  1.1899,  0.8165, -0.9135],
        [ 1.3851, -0.8138, -0.9276,  ...,  0.6419,  0.4730, -0.4286],
        ...,
        [-2.6061,  1.6771,  0.6073,  ...,  0.2068,  1.5356,  2.0230],
        [-0.0422,  0.5282,  1.4127,  ...,  1.0123,  1.0741, -0.8674],
        [-1.0755, -1.6713,  1.0103,  ..., -0.1815, -0.9565,  0.5544]],
       requires_grad=True)


100%|██████████| 100/100 [00:27<00:00,  3.67it/s]

[2140.3685624599457, 1856.9114190340042, 1549.4629547819495, 1119.8741447571665, 600.4194824802689, 229.9869882969033, 100.09525637446313, 55.6612077549907, 26.30374109443801, 19.925122829203247, 11.694381759784847, 10.055085795338528, 9.004289690579753, 8.201048040014939, 7.58014596027715, 7.076458833071229, 6.65117995158289, 6.300688938697931, 5.998086726755446, 5.744755854947016, 5.513304552205227, 5.307536289304608, 5.128261977941293, 4.965124934028154, 4.819614729301975, 4.685921886021788, 4.565508244000284, 4.4598745606554075, 4.352509277080117, 4.267838650705471, 4.1762281610572245, 4.089543600316119, 4.025071966489577, 3.9442779442451865, 3.876551443583139, 3.8198289013398607, 3.755192917189106, 3.69926191669083, 3.6515934343624394, 3.6071754065135337, 3.5512068579139395, 3.504560904696291, 3.4671490395849105, 3.4223116729690446, 3.3828150984516014, 3.347106567221772, 3.3198804429687243, 3.280975121398342, 3.244982886036496, 3.212801146619313, 3.187191225942115, 3.1559934333295




# Part 2 - Test your embeddings

In [16]:
def get_closest_word(word, topn=5):
    word_distance = []
    emb = model.embeddings
    pdist = nn.PairwiseDistance()
    i = test_vocab.tok_to_ids[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long) 
    v_i = emb(lookup_tensor_i)
    for j in range(len(test_vocab)): 
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j) 
            word_distance.append((test_vocab.ids_to_tok[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1]) 
    return word_distance[:topn]

get_closest_word('that')

[('new', 7.565771102905273),
 ('were', 7.587931156158447),
 ('1', 7.670988082885742),
 ('Or', 7.82880163192749),
 ('uneared', 8.229613304138184)]