In [1]:
# Imports

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x11333b470>

### References

* https://iksinc.online/tag/continuous-bag-of-words-cbow/
* http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_II_The_Continuous_Bag-of-Words_Model.pdf
* https://stackoverflow.com/questions/48479915/what-is-the-preferred-ratio-between-the-vocabulary-size-and-embedding-dimension
* https://github.com/FraLotito/pytorch-continuous-bag-of-words/blob/master/cbow.py
* https://stackoverflow.com/questions/50792316/what-does-1-mean-in-pytorch-view
* https://www.tensorflow.org/tutorials/text/word_embeddings
* https://pytorch.org/docs/stable/nn.html
* https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
* https://github.com/ChristophAlt/embedding_vectorizer/blob/master/embedding_vectorizer.py

In [2]:
import nltk
class Vocabulary():
    """Tokenized view of a text file plus token <-> integer-id lookup tables.

    Attributes:
        filepath: path of the corpus file handed to the constructor.
        tokens: every token of the corpus in reading order (duplicates kept).
        tok_to_ids: dict mapping each unique token to its integer id.
        ids_to_tok: dict mapping each integer id back to its token.
        nr_unique_tokens: number of distinct tokens in the corpus.
    """

    def __init__(self, filepath):
        """Read and tokenize ``filepath`` and build both lookup tables."""
        super(Vocabulary, self).__init__()
        self.filepath = filepath
        self.tokens = self.nltk_tokenize()
        self.tok_to_ids, self.ids_to_tok = self.make_dicts()
        # One dict entry per unique token, so the set need not be rebuilt.
        self.nr_unique_tokens = len(self.tok_to_ids)

    def readfile(self):
        """Return the complete content of ``self.filepath`` as one string."""
        # The context manager closes the handle even when read() raises.
        with open(self.filepath) as file:
            return file.read()

    def nltk_tokenize(self):
        """Tokenize the file content with nltk and return the tokens as a list of strings."""
        return nltk.tokenize.word_tokenize(self.readfile())

    def vocabulary_set(self):
        """Return the unique tokens as a list, ordered by first appearance.

        ``list(set(...))`` would yield a different order (and therefore
        different token ids) on every interpreter start because string
        hashing is randomized; ``dict.fromkeys`` keeps insertion order and
        makes the ids reproducible.
        """
        return list(dict.fromkeys(self.tokens))

    def make_dicts(self):
        """Build the lookup tables.

        Returns:
            A tuple ``(tok_to_ix, ix_to_tok)``: token -> id and id -> token,
            where the id is the token's position in ``vocabulary_set()``.
        """
        unique_tokens = self.vocabulary_set()
        tok_to_ix = {token: ix for ix, token in enumerate(unique_tokens)}
        ix_to_tok = dict(enumerate(unique_tokens))
        return tok_to_ix, ix_to_tok
        

In [3]:
# Build the vocabulary of the Shakespeare corpus; the constructor reads and
# tokenizes the file, so this cell needs 'shakespeare-corpus.txt' to be present.
shakespeare = Vocabulary('shakespeare-corpus.txt')

In [4]:
class One_Hot_Vectorizer(object):
    """Maps every token of a vocabulary to a one-hot numpy vector.

    Args:
        vocabulary: object exposing ``tok_to_ids`` (token -> integer id).
            An optional ``size`` attribute sets the vector length; when it
            is absent the number of entries in ``tok_to_ids`` is used.

    Attributes:
        vocab: the vocabulary passed in.
        tok_to_ids: shortcut to ``vocabulary.tok_to_ids``.
        size: length of every one-hot vector.
        vectors: dict mapping each token to its one-hot ``numpy`` array.
    """

    def __init__(self, vocabulary):
        self.vocab = vocabulary
        self.tok_to_ids = self.vocab.tok_to_ids
        # Not every vocabulary object defines `size`; fall back to the number
        # of known tokens instead of failing with AttributeError.
        self.size = getattr(self.vocab, "size", len(self.tok_to_ids))
        self.vectors = self.vectorize()

    def vectorize(self):
        """Return a dict token -> array of zeros with a single 1 at the token's id."""
        dict_vect = {}
        for token, tok_id in self.tok_to_ids.items():
            vector = np.zeros(self.size)
            vector[tok_id] = 1
            dict_vect[token] = vector
        return dict_vect
    

In [23]:
class Vectorizer(object):
    """Holds one randomly initialised, trainable vector per vocabulary token.

    Args:
        vocabulary: object exposing ``tok_to_ids`` (token -> integer id).
        embedding_dim: length of every word vector (default 50).

    Attributes:
        vocab: the vocabulary passed in.
        embedding_dim: length of every word vector.
        vectors: dict mapping each token to a 1-D tensor created with
            ``requires_grad=True``.
    """

    def __init__(self, vocabulary, embedding_dim=50):
        self.vocab = vocabulary
        self.embedding_dim = embedding_dim
        self.vectors = self.vectorize()

    def vectorize(self):
        """Return a dict token -> uniform-random tensor of length ``embedding_dim``."""
        return {
            word: torch.rand(self.embedding_dim, requires_grad=True)
            for word in self.vocab.tok_to_ids
        }

    def parameters(self):
        """Return the word vectors as a list.

        The vectors are plain tensors, not part of any ``nn.Module``, so they
        must be handed to an optimizer explicitly in order to be trained.
        """
        return list(self.vectors.values())

    def make_context_vector(self, context):
        """Stack the vectors of the words in ``context`` into one tensor.

        Args:
            context: iterable of tokens; every token must be a key of
                ``self.vectors`` (a ``KeyError`` is raised otherwise).

        Returns:
            Tensor with one row per context word, in the order given.
        """
        context_vectors = [self.vectors[word] for word in context]
        return torch.stack(context_vectors)


In [24]:
# Build the vocabulary and the per-word vectors for the small test corpus.
mytext = 'test_corpus.txt'
test_vocab = Vocabulary(mytext)
vectorizer = Vectorizer(test_vocab)

# Number of context words taken on EACH side of the target word
# (2 and 5 are the values to be used in ex02).
# Intended range: 2 <= CONTEXT_SIZE <= document_length / 2 - 1
CONTEXT_SIZE = 2

# Total number of context words per target (words before + words after).
CONTEXT_WINDOW_SIZE = CONTEXT_SIZE * 2

# Number of passes over the training data (used as the epoch count).
NUM_ITERATIONS = 100

# Data creation - get context around the target word
def build_cbow_data(tokens, context_size):
    """Pair every target token with the tokens surrounding it.

    Args:
        tokens: list of tokens in reading order.
        context_size: number of context tokens taken on each side of the
            target.

    Returns:
        List of ``(context_window, target)`` tuples, where ``context_window``
        is the list of the ``context_size`` tokens before the target followed
        by the ``context_size`` tokens after it. Tokens closer than
        ``context_size`` to either end of ``tokens`` are not used as targets,
        so the result is empty when the text is shorter than
        ``2 * context_size + 1`` tokens.
    """
    pairs = []
    for i in range(context_size, len(tokens) - context_size):
        context_before_w = tokens[i - context_size: i]
        context_after_w = tokens[i + 1: i + context_size + 1]
        pairs.append((context_before_w + context_after_w, tokens[i]))
    return pairs


tokens = test_vocab.tokens
data = build_cbow_data(tokens, CONTEXT_SIZE)



In [27]:
class CBOW(nn.Module):
    """Continuous-bag-of-words classifier operating on ready-made context vectors.

    The model has no embedding layer of its own: ``forward`` receives the
    context word vectors directly, sums them, and feeds the result through
    one hidden layer and an output layer over the vocabulary.

    Args:
        vocab_size: number of output classes (one per vocabulary token).
        embedding_dim: length of each incoming context vector.
        context_window_size: number of context vectors per example; the
            hidden pre-activation is divided by this value.
        nr_hidden_neurons: width of the hidden layer (default 128).
        vectorizer: optional object providing the word vectors; it is only
            stored on the instance and not used by ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, context_window_size,
                 nr_hidden_neurons=128, vectorizer=None):
        super(CBOW, self).__init__()
        self.context_window_size = context_window_size

        # The original cell had a dangling `self.vectorizer =` (SyntaxError);
        # keep the attribute, filled from an optional argument.
        self.vectorizer = vectorizer

        # note: this probably doesn't deal with 'UNK' words
        self.linear1 = nn.Linear(embedding_dim, nr_hidden_neurons)

        # output layer
        self.linear2 = nn.Linear(nr_hidden_neurons, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for one context window.

        Args:
            inputs: tensor of shape (WINDOW_SIZE, EMBEDDING_DIM), or a
                sequence of EMBEDDING_DIM-long tensors.

        Returns:
            Tensor of shape (1, vocab_size) holding log-softmax scores.
        """
        # Accept a plain list/tuple of vectors as the builtin sum() did.
        if not torch.is_tensor(inputs):
            inputs = torch.stack(list(inputs))

        # Sum over all context vectors -> shape (EMBEDDING_DIM)
        summed_embeds = inputs.sum(dim=0)

        # shape (1, EMBEDDING_DIM); -1 lets view() infer that dimension
        embeds_2D = summed_embeds.view(1, -1)

        # Hidden pre-activation, divided by the number of context vectors.
        h = self.linear1(embeds_2D) / self.context_window_size

        # output of the hidden layer
        out = F.relu(h)

        out = self.linear2(out)
        return F.log_softmax(out, dim=1)

In [28]:
losses = []
loss_function = nn.NLLLoss() # negative log likelihood loss

# The word vectors live outside the model, so they have to be registered with
# the optimizer explicitly; otherwise they are never updated by training.
embedding_params = list(vectorizer.vectors.values())

# Take the input width from the vectors themselves instead of repeating a
# magic number that must match the Vectorizer.
embedding_dim = embedding_params[0].shape[0]
model = CBOW(test_vocab.nr_unique_tokens, embedding_dim, CONTEXT_WINDOW_SIZE)

optimizer = optim.SGD(list(model.parameters()) + embedding_params, lr=0.01)

print("embedding of 'that' before training:  ", vectorizer.vectors['that'])

for epoch in range(NUM_ITERATIONS):
    total_loss = 0
    for context, target in data:
        # Step 1. Create input vector
        context_vectors = vectorizer.make_context_vector(context)

        # Step 2. Torch *accumulates* gradients, so clear them before each
        # new instance. optimizer.zero_grad() covers the model weights AND
        # the word vectors; model.zero_grad() would miss the latter.
        optimizer.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over the
        # vocabulary
        log_probs = model(context_vectors)

        # Step 4. Compute the loss. NLLLoss wants the class index of the
        # target word as a 1-element long tensor. A separate name keeps the
        # loop variable `target` (the token string) intact.
        target_id = torch.tensor([vectorizer.vocab.tok_to_ids[target]], dtype=torch.long)
        loss = loss_function(log_probs, target_id)

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)

print('-' * 100)
print("embedding of 'that' after training:  ", vectorizer.vectors['that'])
print(losses)

embedding of 'that' before training:   tensor([0.9650, 0.7400, 0.3872, 0.1199, 0.6086, 0.6207, 0.2412, 0.3062, 0.1795,
        0.4194, 0.2258, 0.2382, 0.6560, 0.5999, 0.4702, 0.0732, 0.3296, 0.0682,
        0.8022, 0.7873, 0.1327, 0.0778, 0.9116, 0.1083, 0.5873, 0.8716, 0.2712,
        0.5834, 0.3283, 0.6758, 0.2405, 0.5274, 0.8777, 0.0872, 0.6084, 0.5113,
        0.1510, 0.8490, 0.3829, 0.7425, 0.2765, 0.9401, 0.5344, 0.7088, 0.1095,
        0.8702, 0.1212, 0.3343, 0.7172, 0.2539], requires_grad=True)
tensor([[0.4926, 0.2976, 0.3939, 0.2974, 0.5297, 0.8822, 0.8808, 0.6060, 0.5823,
         0.4073, 0.2642, 0.5397, 0.8877, 0.6874, 0.9971, 0.8707, 0.4805, 0.6828,
         0.9935, 0.7222, 0.7816, 0.5579, 0.9192, 0.6729, 0.8824, 0.4710, 0.9389,
         0.3137, 0.3883, 0.4914, 0.6406, 0.1147, 0.2619, 0.3827, 0.3787, 0.5994,
         0.5085, 0.0136, 0.8231, 0.3169, 0.3505, 0.5673, 0.4568, 0.9015, 0.9662,
         0.4348, 0.4837, 0.6173, 0.6467, 0.9712],
        [0.2934, 0.3514, 0.0341, 0.52

SystemExit: 

In [9]:
# torch.stack joins equally shaped tensors along a NEW leading dimension:
# two (2, 3) tensors become a single (2, 2, 3) tensor.
tensor_two = torch.arange(7, 13).reshape(2, 3)    # [[7, 8, 9], [10, 11, 12]]
tensor_tre = torch.arange(13, 19).reshape(2, 3)   # [[13, 14, 15], [16, 17, 18]]
tensor_list = [tensor_two, tensor_tre]
mytensor = torch.stack(tensor_list, dim=0)
mytensor

tensor([[[ 7,  8,  9],
         [10, 11, 12]],

        [[13, 14, 15],
         [16, 17, 18]]])