# Week 5: Skip-gram, dense embeddings, and semantic similarity

In [None]:
from nltk.corpus import brown
import re
from collections import Counter
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

First, let's get our tokenized dataset and the vocabulary for which we will learn embeddings.

In [None]:
# Lowercase every Brown corpus token and keep only those that begin with a
# word character, which drops tokens starting with punctuation.
tokenized_dataset = [
    word.lower()
    for word in brown.words()
    if re.match(r'\w', word) is not None
]
len(tokenized_dataset)

We will limit ourselves to tokens which appear at least 8 times in the dataset, for a total of 10 000 embeddings or so. All other tokens will be replaced by an unknown token, '\<UNK>', which we have made sure to add to our vocabulary at index 0.

In [None]:
# Frequency of every token in the corpus.
counts = dict(Counter(tokenized_dataset))
# '<UNK>' goes first so that it sits at index 0; it is followed by every
# token that occurs at least 8 times.
vocab = ['<UNK>']
vocab.extend(token for token, count in counts.items() if count >= 8)
len(vocab)

## Datasets and DataLoaders

Now let's introduce a dataset class which will be used to store our data, create word to index mappings and generate positive and negative samples for our Skipgram training algorithm. This new class inherits from torch.utils.data.Dataset, a useful class type which can be combined with dataloader objects to make batched data generation easier with pytorch.

In [None]:
class SkipgramDataset(Dataset):
    """Skip-gram (target, context) pairs with on-the-fly negative sampling.

    Every token that is not in ``vocab`` is mapped to the ``'<UNK>'`` index.
    Each known token is used as a target once per occurrence and is paired
    with up to ``context_size`` tokens on each side of it (fewer at the two
    ends of the corpus). Unknown tokens are never used as targets, but they
    may appear as context tokens.

    Args:
        tokenized_dataset: sequence of string tokens, in corpus order.
        vocab: list of known words; expected to contain ``'<UNK>'``.
        context_size: number of context tokens taken on each side of a target.
        k_negative_sample_size: number of negative pairs drawn per positive
            pair each time an item is fetched.
    """

    def __init__(self, tokenized_dataset, vocab, context_size, k_negative_sample_size):
        self.tokenized_dataset = tokenized_dataset
        self.vocab = vocab
        self.context_size = context_size
        self.k_negative_sample_size = k_negative_sample_size
        self.words_to_indices = {word: i for i, word in enumerate(vocab)}
        # Index used to skip unknown targets. Falls back to 0 (the position
        # '<UNK>' is given in this notebook) when the vocabulary has no '<UNK>'.
        self.unk_index = self.words_to_indices.get('<UNK>', 0)
        self.indexed_dataset = [
            self.words_to_indices[token] if token in self.words_to_indices
            else self.words_to_indices['<UNK>']
            for token in self.tokenized_dataset
        ]
        self.vocab_size = len(self.vocab)
        self.skipgram_items = []
        self.get_skipgram_items()

    def __getitem__(self, idx):
        """Return ``(positive_pair, negative_pairs)`` for item ``idx``.

        Negative pairs are re-drawn on every access, so the same ``idx``
        generally yields different negatives each time.
        """
        pos_samples = self.skipgram_items[idx]
        neg_samples = self.get_negative_samples(pos_samples)
        return pos_samples, neg_samples

    def __len__(self):
        """Return the number of positive (target, context) pairs."""
        return len(self.skipgram_items)

    def get_skipgram_items(self):
        """Append every positive (target, context) index pair to ``skipgram_items``.

        Every position of the corpus is considered as a target, including
        the first ``context_size`` tokens; windows are simply truncated at
        both ends of the corpus.
        """
        size = self.context_size
        for position, target in enumerate(self.indexed_dataset):
            if target == self.unk_index:
                # Do not learn an embedding update driven by an unknown target.
                continue
            left = self.indexed_dataset[max(0, position - size):position]
            right = self.indexed_dataset[position + 1:position + size + 1]
            for c_token in left + right:
                self.skipgram_items.append((target, c_token))

    def get_negative_samples(self, pos_samples):
        """Draw ``k_negative_sample_size`` negative pairs for one positive pair.

        Negative context indices are drawn uniformly from the whole
        vocabulary and are guaranteed to differ from the target index.

        Args:
            pos_samples: a ``(target, context)`` pair of vocabulary indices.

        Returns:
            A list of ``(target, negative_context)`` tuples.

        Raises:
            ValueError: if negatives are requested but the vocabulary has
                fewer than two entries, so no index other than the target
                could ever be drawn.
        """
        target, _ = pos_samples
        if self.k_negative_sample_size > 0 and self.vocab_size < 2:
            raise ValueError(
                "at least two vocabulary entries are needed to draw negative samples"
            )
        neg_samples = []
        while len(neg_samples) < self.k_negative_sample_size:
            # randrange returns a plain int, so the comparison with the
            # target index below is meaningful.
            neg_context = random.randrange(self.vocab_size)
            if neg_context != target:
                neg_samples.append((target, neg_context))
        return neg_samples
        
        

In [None]:
# Dataset and loader hyperparameters
window_size = 3          # context tokens taken on each side of a target
k_negative_samples = 5   # negative pairs drawn for every positive pair
batch_size = 200         # positive pairs per batch

# Build all positive pairs once, then serve them in shuffled batches.
data = SkipgramDataset(
    tokenized_dataset,
    vocab,
    context_size=window_size,
    k_negative_sample_size=k_negative_samples,
)
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

## Models and Modules

Here is a quick helper function to flatten the lists of k negative samples into a uniform batch that can be processed by our model the same way as positive samples.

In [None]:
def flatten_neg_samples(neg_samples):
    """Merge k batches of negative pairs into one batch of k * batch_size pairs.

    Args:
        neg_samples: iterable of ``(target, context)`` pairs, where each
            element is a 1-D tensor of indices (one pair of tensors per
            negative sample drawn).

    Returns:
        A two-element list ``[targets, contexts]`` in which the input
        tensors are concatenated in order. The result keeps the dtype of
        the inputs. For an empty input, two empty ``IntTensor``s are returned.
    """
    targets = []
    contexts = []
    for target, context in neg_samples:
        targets.append(target)
        contexts.append(context)
    if not targets:
        return [torch.IntTensor([]), torch.IntTensor([])]
    # Concatenate once per side instead of growing a tensor inside the loop.
    return [torch.cat(targets), torch.cat(contexts)]

Here is our SkipGram model which inherits from torch.nn.Module. You will notice that it simply contains the target and context embeddings, multiplies them element-wise and sums the result, which gives the dot product of each target/context pair.

In [None]:
class SkipGram_Model(nn.Module):
    """Two embedding tables scored against each other with a dot product.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: length of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One table for words used as targets, one for words used as context.
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, inputs):
        """Score (target, context) index pairs.

        Args:
            inputs: indexable pair whose element 0 holds target indices and
                element 1 holds context indices.

        Returns:
            The element-wise product of the looked-up target and context
            embeddings, summed over dimension 1 (one logit per pair when
            the indices are 1-D tensors).
        """
        target_ids, context_ids = inputs[0], inputs[1]
        target_vectors = self.target_embeddings(target_ids)
        context_vectors = self.context_embeddings(context_ids)
        return (target_vectors * context_vectors).sum(dim=1)


## Loss Functions

Next we have to define our learning objective, or loss function. In this implementation of skipgram we are using negative sampling to approximate cross entropy loss. We follow the formula given in the class slides and finally sum our results for both positive and negative samples across each batch. A perfect cross entropy loss is 0, so we want to minimize this return value and try to get it closer and closer to 0.

In [None]:
class SkipGram_NegativeSamplingLoss(nn.Module):
    """Negative-sampling loss summed over a batch.

    The loss is ``-(sum(log sigmoid(pos)) + sum(log sigmoid(-neg)))``, so it
    is smallest when positive logits are large and negative logits are
    very negative.
    """

    def __init__(self):
        super().__init__()
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, pos_logits, neg_logits):
        """Return the summed loss for one batch.

        Args:
            pos_logits: tensor of scores for positive pairs.
            neg_logits: tensor of scores for negative pairs.

        Returns:
            The negated sum, over dimension 0, of the log-sigmoid of the
            positive logits and of the negated negative logits.
        """
        positive_term = self.log_sigmoid(pos_logits).sum(dim=0)
        negative_term = self.log_sigmoid(neg_logits.neg()).sum(dim=0)
        return torch.neg(positive_term + negative_term)

## Training Procedures

Now for the training procedure. Training this skipgram model takes time, so I have run it in advance and cached the results for you. Nevertheless, let's walk through the steps involved in this training procedure. First we define the hyperparameters needed for the model and training loop: the learning rate and number of epochs, as well as our embedding size.

In [None]:
# Train and model hyperparameters
# Learning rate handed to the optimizer.
lr = 0.001
# Number of full passes over the dataloader during training.
epochs = 2
# Length of each embedding vector in the model.
emb_size = 300

Next we need to initialize our model, optimizer and loss function, often called "criterion" in scripts. You already know what the model and loss function do, but what about the optimizer? The optimizer will be in charge of updating the model weights during training. This is why it takes the model parameters as its input. There are different types of optimization functions; here we use Adam, a form of stochastic gradient descent, with a learning rate of 0.001. The learning rate is the constant by which we multiply our gradient updates in order to slow learning and avoid overfitting to specific batches.

In [None]:
# One embedding row per vocabulary entry, each of length emb_size.
model = SkipGram_Model(vocab_size=data.vocab_size, embedding_dim=emb_size)
# Adam updates every parameter of the model (both embedding tables).
optimizer = optim.Adam(params=model.parameters(), lr=lr)
# Loss applied to the positive and negative logits of each batch.
criterion = SkipGram_NegativeSamplingLoss()

In [None]:
# Set to True to run the training cell below and overwrite the saved
# embedding files; when False that cell does nothing.
train_scratch = False

In [None]:
if train_scratch:
    # (batches seen so far, mean loss over the last 100 batches) points,
    # recorded once for every full group of 100 batches.
    losses = []
    # Running sum of batch losses since the last recorded point.
    mean_loss = 0
    # Number of batches processed so far, across all epochs.
    i = 0

    # set the model to train mode
    model.train()

    for epoch in range(epochs):
        for pos_samples, neg_samples in dataloader:
            # reformat negative samples into a single k*batch_size batch
            neg_samples = flatten_neg_samples(neg_samples)
            # clear gradients left over from the previous batch
            optimizer.zero_grad()
            # get logits for both negative and positive samples
            pos_logits = model(pos_samples)
            neg_logits = model(neg_samples)
            # calculate the loss
            loss = criterion(pos_logits, neg_logits)
            # calculate the gradients
            loss.backward()
            # update the model parameters from the gradients
            optimizer.step()
            # keep track of mean loss so we can plot it after; a point is
            # only recorded once 100 batch losses have been accumulated, so
            # every recorded value is a true 100-batch average
            mean_loss += loss.item()
            i += 1
            if i % 100 == 0:
                losses.append((i, float(mean_loss / 100)))
                mean_loss = 0

    # Export the target table on its own and the element-wise sum of the
    # target and context tables.
    target_embeddings = model.target_embeddings.weight.detach().numpy()
    sum_embeddings = target_embeddings + model.context_embeddings.weight.detach().numpy()

    np.save('target_embeddings.npy', target_embeddings)
    np.save('sum_embeddings.npy', sum_embeddings)

## Similarity and embedding space

Now that we have trained our model, we can extract the embeddings from it. We have two options: we can either simply take the target embeddings, or we can sum the target and the context embeddings together.

In [None]:
# Load the two embedding matrices from the .npy files written by the
# training cell (target table alone, and target + context tables summed).
target_embeddings = np.load('target_embeddings.npy')
sum_embeddings = np.load('sum_embeddings.npy')

In [None]:
# Display the dimensions of the loaded target embedding matrix.
target_embeddings.shape

In [None]:
# Display the dimensions of the loaded summed embedding matrix.
sum_embeddings.shape

Here is a helper function to find the top n most similar words to a given word.

In [None]:
# Function to get similar words
def get_similar_words(word, words_to_indices, vocab, embeddings, top_n=10):
    """Return the ``top_n`` words whose embeddings are closest to ``word``.

    Closeness is cosine similarity, so a word with a large embedding norm
    does not outrank genuinely similar words. The query word itself is
    always excluded from the result.

    Args:
        word: the query word; must be a key of ``words_to_indices``.
        words_to_indices: mapping from word to its row in ``embeddings``.
        vocab: sequence mapping a row index back to its word.
        embeddings: 2-D array with one embedding per row.
        top_n: maximum number of neighbours to return.

    Returns:
        A list of at most ``top_n`` words, most similar first.

    Raises:
        KeyError: if ``word`` is not in ``words_to_indices``.
    """
    idx = words_to_indices[word]
    embeddings = np.asarray(embeddings)
    norms = np.linalg.norm(embeddings, axis=1)
    # All-zero rows keep a similarity of 0 instead of producing NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_embeddings = embeddings / safe_norms[:, None]
    similarities = unit_embeddings @ unit_embeddings[idx]
    ranked = np.argsort(-similarities, kind="stable")
    # Drop the query by index rather than assuming it is ranked first.
    closest_idxs = ranked[ranked != idx][:top_n]
    return [vocab[i] for i in closest_idxs]

Try playing around with it by searching for different words!

In [None]:
# Nearest neighbours of "queen" using the target embeddings only.
get_similar_words("queen", data.words_to_indices, data.vocab, target_embeddings)

In [None]:
# Nearest neighbours of "queen" using the summed target + context embeddings.
get_similar_words("queen", data.words_to_indices, data.vocab, sum_embeddings)

NLTK also provides a sample of pretrained Word2Vec embeddings, pruned from a model trained on the Google News dataset. Let's load them in and play around with them as well. [https://www.nltk.org/howto/gensim.html]

In [None]:
# NLTK downloads and locates the pretrained vector file; gensim loads it.
import nltk
from nltk.data import find
# NOTE(review): setup_module comes from NLTK's gensim test fixtures;
# presumably it only checks that gensim can be imported - confirm.
from nltk.test.gensim_fixt import setup_module
setup_module()
import gensim
# Fetch the 'word2vec_sample' resource through the NLTK downloader.
nltk.download('word2vec_sample')

In [None]:
# Resolve the path of the downloaded sample inside the NLTK data directory.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
# The file is in the text (non-binary) word2vec format, hence binary=False.
pretrained_embeddings = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

The following will do the same as our get_similar_words helper function but for these pretrained embeddings.

In [None]:
# Ten nearest neighbours of 'king' in the pretrained embedding space.
pretrained_embeddings.most_similar(positive=['king'], topn = 10)

This usage will apply the parallelogram rule we saw in class.

In [None]:
# Analogy query: 'woman' and 'king' are given as positive terms and 'man'
# as a negative term; only the single best match is returned.
pretrained_embeddings.most_similar(positive=['woman','king'], negative=['man'], topn = 1)