<a href="https://colab.research.google.com/github/chendingyan/NLP490H/blob/master/Solutions_Lecture2_labwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# http://pytorch.org/
# Colab bootstrap: work out the wheel platform tag and the CUDA flavour of this
# runtime, then install the matching PyTorch 0.4.1 wheel plus torchvision.
# NOTE(review): the cell relies on IPython `!` shell escapes, so it only runs
# inside a notebook kernel, not as plain Python.
from os.path import exists
# NOTE(review): `wheel.pep425tags` appears to have been dropped from newer
# `wheel` releases — confirm this import still resolves on the target runtime.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Join the three tag helpers into the '<impl><ver>-<abi>' part of the wheel file name.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# List the cached shared libraries, keep the cudart entries and let sed rewrite
# a trailing '.<major>.<minor>' into a 'cu<major><minor>' tag.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the first detected CUDA tag when /dev/nvidia0 exists, otherwise the CPU build.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision is not
# version-pinned — consider https and an explicit pin.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [0]:
#@title Loading packages

import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
from scipy.spatial.distance import euclidean
from tqdm import tqdm

In [0]:
#@title Sample corpora

# Toy corpus: one lower-case, space-separated sentence per entry.
corpus = []

# "<pronoun> is a <noun>" statements.
corpus += ['he is a king', 'she is a queen', 'he is a man', 'she is a woman']

# "<city> is the capital of <country>" statements.
corpus += [
    'warsaw is the capital of poland',
    'berlin is the capital of germany',
    'paris is the capital of france',
]

In [0]:
#@title Tokenization

# Q: What is a token? 

# Tokenize each sentence with the simplest possible rule: cut it at every
# single space. The result is a list of token lists, one per sentence.
tokenized_corpus = [sentence.split(' ') for sentence in corpus]

# print(tokenized_corpus)

In [0]:
#@title Vocabulary

# All distinct tokens (mostly words) of the corpus, in order of first
# appearance. dict.fromkeys() de-duplicates in a single pass while keeping
# insertion order, so we do not re-scan a growing list for every token
# (which is quadratic in the number of tokens).
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))


# print(vocabulary)



# Q. what is the size of the vocabulary?

# A. the number of distinct tokens collected above
vocabulary_size = len(vocabulary)

In [0]:
#@title Helper Functions

# We need a mapping from word to index and index to word. 

# idx2word: position in `vocabulary` -> token; word2idx is the reverse lookup.
idx2word = dict(enumerate(vocabulary))
word2idx = {word: position for position, word in idx2word.items()}

# print(word2idx)
# print(idx2word)



In [0]:
#@title A simple look-up table function

# Q. why do we need this? 

def look_up_table(word_idx, size=None):
    """Return the one-hot vector for a vocabulary index.

    Args:
        word_idx: Position of the word in the vocabulary. Negative values
            count from the end, as with ordinary tensor indexing.
        size: Length of the returned vector. When omitted, the notebook-level
            `vocabulary_size` is used, which is the original behaviour.

    Returns:
        A 1-D float32 tensor of length `size` that is 1.0 at `word_idx` and
        0.0 everywhere else.

    Raises:
        IndexError: If `word_idx` does not fit into a vector of that length
            (raised by the tensor indexing itself).
    """
    if size is None:
        # Fall back to the global only when the caller did not say otherwise,
        # so the helper can also be used (and tested) on its own.
        size = vocabulary_size
    x = torch.zeros(size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x
  
# This is a one hot representation

# Q. try printing it for word_idx = 1, or for a specific word as in the example below


# word_idx = word2idx['he']
# print(look_up_table(word_idx))

# Q. can we say anything about the word just by looking at this vector?



In [0]:
#@title Extracting contexts and the center word!


# Half-width of the context window: up to this many tokens on either side of
# the center word are treated as its context.
window_size = 2

# Every (center word index, context word index) pair found in the corpus.
idx_pairs = []

for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    last_pos = len(indices) - 1

    for center_word_pos, center_word_idx in enumerate(indices):
        # Clip the window at the sentence boundaries instead of generating
        # out-of-range positions and skipping them.
        first_pos = max(0, center_word_pos - window_size)
        final_pos = min(last_pos, center_word_pos + window_size)

        # Pairs are emitted left to right; the center word is never paired
        # with itself.
        idx_pairs.extend(
            (center_word_idx, indices[context_word_pos])
            for context_word_pos in range(first_pos, final_pos + 1)
            if context_word_pos != center_word_pos
        )

# print(idx_pairs)
idx_pairs = np.array(idx_pairs)  # it will be useful to have this as numpy array

In [0]:
#@title Parameters and Hyperparameters

# Hyperparameters:
embedding_dims = 5     # length of each word vector
num_epochs = 100       # number of full passes over idx_pairs
learning_rate = 0.001  # step size of the manual gradient-descent update

# Seed the weight initialisation so that Restart & Run All reproduces the same
# starting point (and therefore the same loss curve).
torch.manual_seed(0)

# The two weight matrices. torch.autograd.Variable is deprecated since
# PyTorch 0.4: a tensor created with requires_grad=True is tracked directly.
W1 = torch.randn(embedding_dims, vocabulary_size,
                 dtype=torch.float32, requires_grad=True)
# W1 is the `embedding matrix': multiplying it by a one-hot vector selects the
# column that holds the embedding of that word.

W2 = torch.randn(vocabulary_size, embedding_dims,
                 dtype=torch.float32, requires_grad=True)
# W2 is the `parameter matrix': it turns an embedding into one score per
# vocabulary word.



        
        

In [0]:
#@title Training the model in the standard way

for epoch in range(num_epochs):

    # Running sum of the per-pair losses seen in this epoch.
    loss_val = 0

    for data, target in idx_pairs:

        # One-hot encoding of the center word.
        x = look_up_table(data).float()

        # Q. what would y_true be?
        # y_true =

        # A. the index of the context word, as a one-element long tensor
        y_true = torch.from_numpy(np.array([target])).long()

        z1 = W1.matmul(x)
        # Q. what is z1?

        z2 = W2.matmul(z1)
        # Q. what is the above operation?

        # Let us obtain prediction over the vocabulary
        log_softmax = F.log_softmax(z2, dim=0)

        # Our loss is a negative log-likelihood loss
        # (what does this mean?)
        # nll_loss expects a batch dimension, hence the extra leading axis.
        loss = F.nll_loss(log_softmax.unsqueeze(0), y_true)

        loss_val += loss.item()

        # propagate the error
        loss.backward()

        # Gradient-descent step, then clear the accumulated gradients so the
        # next pair starts from zero. no_grad keeps the update out of autograd.
        with torch.no_grad():
            for weights in (W1, W2):
                weights -= learning_rate * weights.grad
                weights.grad.zero_()

    # Report the mean loss per training pair every tenth epoch.
    if epoch % 10 == 0:
        print(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}')

In [0]:
#@title Using embeddings

# Embed two vocabulary entries with the trained W1: multiplying W1 by a
# one-hot vector picks out the corresponding column. With the sample corpus
# of this notebook, indices 1 and 2 are the words 'is' and 'a'.
x, y = (look_up_table(word_idx).float() for word_idx in (1, 2))

# Detach from autograd before converting to NumPy for SciPy.
x_emb = W1.matmul(x).detach().numpy()
y_emb = W1.matmul(y).detach().numpy()


# let us print the euclidean distance
print(euclidean(x_emb, y_emb))

# Q. What would euclidean distance do? What are we measuring in this case? (HINT: vector algebra)


In [0]:
#@title Skip-gram with negative sampling

# Seed both random generators used in this cell (weight initialisation and
# negative sampling) so that the run is reproducible.
torch.manual_seed(0)
np.random.seed(0)

# The two weight matrices. NOTE: this re-initialises W1 and W2, replacing the
# weights trained by the softmax model above.
W1 = torch.randn(embedding_dims, vocabulary_size,
                 dtype=torch.float32, requires_grad=True)
# column i of W1: embedding of word i when it is the center word

W2 = torch.randn(embedding_dims, vocabulary_size,
                 dtype=torch.float32, requires_grad=True)
# column j of W2: embedding of word j when it is a context word


for epoch in range(num_epochs):

    # Running sum of the per-pair losses seen in this epoch.
    loss_val = 0

    for data, target in idx_pairs:

        x_var = look_up_table(data)
        y_pos_var = look_up_table(target)

        # Draw one negative (noise) word uniformly from the vocabulary.
        # NOTE(review): the draw may coincide with the true context word;
        # with a vocabulary this small that happens fairly often.
        neg_sample = int(np.random.choice(vocabulary_size))
        y_neg_var = look_up_table(neg_sample)

        x_emb = torch.matmul(W1, x_var)
        y_pos_emb = torch.matmul(W2, y_pos_var)
        y_neg_emb = torch.matmul(W2, y_neg_var)

        # Negative-sampling objective for one pair:
        #   log sigmoid(u . v_pos) + log sigmoid(-u . v_neg)
        # The sigmoid is applied to the dot product of the two embeddings,
        # not to each embedding dimension separately.
        pos_loss = F.logsigmoid(torch.dot(x_emb, y_pos_emb))
        neg_loss = F.logsigmoid(-torch.dot(x_emb, y_neg_emb))

        # Both terms must enter the loss; using the negative term twice would
        # leave the true context word without any training signal.
        loss = -(pos_loss + neg_loss)

        loss_val += loss.item()

        # propagate the error
        loss.backward()

        # gradient descent
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        # zero out gradient accumulation
        W1.grad.data.zero_()
        W2.grad.data.zero_()

    # Report the mean loss per training pair (not just the last pair's loss)
    # every tenth epoch.
    if epoch % 10 == 0:
        print(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}')

        


In [0]:
# Word-analogy queries to try with vector arithmetic:
# france : paris ::  rome : ?
# britain : london :: rome : ?
# athens : greece ::  ?  : iraq
# unclear: clear :: ? : certain
#
# NOTE(review): `wvecs`, `w2i`, `i2w` and `cosine_distances` are neither
# defined nor imported in the cells shown in this notebook, and the words used
# below are not in the toy corpus. Presumably pretrained vectors and
# scikit-learn's `cosine_distances` are loaded elsewhere — confirm, otherwise
# this cell fails on a fresh kernel.


# Query vector for the last analogy: vec('unclear') - vec('clear') + vec('certain').
avec = (wvecs[w2i['unclear']] - wvecs[w2i['clear']] + wvecs[w2i['certain']])

# Look up the word whose vector is closest to `avec`; the bare expression is the
# cell's displayed output.
# NOTE(review): assuming `wvecs` is a 2-D NumPy array, the distance matrix has a
# single row, argsort() orders that row by increasing distance, and `[::-1]`
# reverses the row axis — a no-op here. `[0, 0]` is therefore the nearest word,
# which may be one of the query words themselves. Confirm the intent.
i2w[(cosine_distances(avec.reshape(-1,1).T, wvecs).argsort()[::-1])[0,0]]