In [None]:
# explore glove word emebddings

In [1]:
import pandas as pd
import numpy as np
from torch import nn, optim
import torch

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# function to load glove vectors from file
def load_glove(file_path):
    file_object  = open(file_path, "r") 
    model = {} 
    for l in file_object:
        line = l.split()
        word = line[0]
        value = np.array([float(val) for val in line[1:]])
        model[word] = value
    return model

In [3]:
file_with_path = "../data/embeddings/glove.6B.50d.txt"
glove = load_glove(file_with_path)

In [4]:
glove['python'].reshape(1, -1)

array([[ 0.5897  , -0.55043 , -1.0106  ,  0.41226 ,  0.57348 ,  0.23464 ,
        -0.35773 , -1.78    ,  0.10745 ,  0.74913 ,  0.45013 ,  1.0351  ,
         0.48348 ,  0.47954 ,  0.51908 , -0.15053 ,  0.32474 ,  1.0789  ,
        -0.90894 ,  0.42943 , -0.56388 ,  0.69961 ,  0.13501 ,  0.16557 ,
        -0.063592,  0.35435 ,  0.42819 ,  0.1536  , -0.47018 , -1.0935  ,
         1.361   , -0.80821 , -0.674   ,  1.2606  ,  0.29554 ,  1.0835  ,
         0.2444  , -1.1877  , -0.60203 , -0.068315,  0.66256 ,  0.45336 ,
        -1.0178  ,  0.68267 , -0.20788 , -0.73393 ,  1.2597  ,  0.15425 ,
        -0.93256 , -0.15025 ]])

In [5]:
# calulclate cosine similarity (import from sklearn for example)
cosine_similarity(glove['fast'].reshape(1, -1), glove['speed'].reshape(1, -1))

array([[0.75120749]])

In [6]:
# embeddings operations -> classic queen - woman + man =  king 
predicted_king = glove['queen'] - glove['woman'] + glove['man']
actual_king = glove['king']

cosine_similarity(actual_king.reshape(1, -1), predicted_king.reshape(1, -1)) # quite close

array([[0.85888392]])

# calculate own embeddings with CBOW (alternative would be skipgrams)

In [7]:
text = """How that personage haunted my dreams, I need scarcely tell you. On
stormy nights, when the wind shook the four corners of the house and
the surf roared along the cove and up the cliffs, I would see him in a
thousand forms, and with a thousand diabolical expressions. Now the leg
would be cut off at the knee, now at the hip, now he was a monstrous
kind of a creature who had never had but the one leg, and that in the
middle of his body. To see him leap and run and pursue me over hedge and
ditch was the worst of nightmares. And altogether I paid pretty dear for
my monthly fourpenny piece, in the shape of these abominable fancies"""

text = text.replace(',','').replace('.','').lower().split()

In [8]:
corpus  = set(text)
corpus_length  = len(corpus)

In [9]:
word_dict = {}
inverse_word_dict = {}

for i, word in enumerate(corpus):
    word_dict[word] = i
    inverse_word_dict[i] = word
    

In [10]:
data = []

for i in range(2, len(text)-2):
    sentence = [text[i-2], text[i-1], text[i+1], text[i+2]]
    target = text[i]
    data.append((sentence, target))

In [11]:
text[0]

'how'

In [12]:
embedding_size = 20

class CBOW(nn.Module):
    def __init__(self, corpus_length, embedding_dim):
        super(CBOW, self).__init__()
        
        self.embeddings = nn.Embedding(corpus_length, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 64)
        self.linear2 = nn.Linear(64, corpus_length)
        
        self.activation_fun1 = nn.ReLU()
        self.activation_fun2 = nn.LogSoftmax(dim = -1)
        
    def forward(self, inputs):
        
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_fun1(out)
        out = self.linear2(out)
        out = self.activation_fun2(out)
        return out
        
    def get_word_embedding(self, word):
        word = torch.LongTensor([word_dict[word]])
        embd = self.embeddings(word)
        return embd

In [13]:
model = CBOW(corpus_length, embedding_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.01)


In [14]:
def make_sentence_vector(sentence, word_dict):
    idxs = [word_dict[w] for w in sentence]
    return torch.tensor(idxs, dtype=torch.long)


In [15]:
make_sentence_vector(data[0][0], word_dict)

tensor([39, 38, 72,  2])

In [16]:
word_dict['personage']

21

In [17]:
model

CBOW(
  (embeddings): Embedding(82, 20)
  (linear1): Linear(in_features=20, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=82, bias=True)
  (activation_fun1): ReLU()
  (activation_fun2): LogSoftmax(dim=-1)
)

In [18]:
epochs =100
for epoch in range(epochs):
    epoch_loss = 0
    for context_words, target_word in data:
        
        inputs = make_sentence_vector(context_words, word_dict)
        target = [word_dict[target_word]]
        model.zero_grad()
        log_probs = model(inputs)
        loss = loss_function(log_probs, torch.tensor(target, dtype=torch.long))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.data
    print(f"Epoch {epoch}/{epochs} loss: {epoch_loss.item()}")

Epoch 0/100 loss: 536.54443359375
Epoch 1/100 loss: 476.8662414550781
Epoch 2/100 loss: 434.998291015625
Epoch 3/100 loss: 396.72979736328125
Epoch 4/100 loss: 358.97467041015625
Epoch 5/100 loss: 320.5982360839844
Epoch 6/100 loss: 281.8330993652344
Epoch 7/100 loss: 243.61477661132812
Epoch 8/100 loss: 207.54013061523438
Epoch 9/100 loss: 174.24093627929688
Epoch 10/100 loss: 144.42483520507812
Epoch 11/100 loss: 118.19700622558594
Epoch 12/100 loss: 96.01138305664062
Epoch 13/100 loss: 77.28784942626953
Epoch 14/100 loss: 62.22464370727539
Epoch 15/100 loss: 50.232025146484375
Epoch 16/100 loss: 40.90771484375
Epoch 17/100 loss: 33.71605682373047
Epoch 18/100 loss: 28.241331100463867
Epoch 19/100 loss: 23.956239700317383
Epoch 20/100 loss: 20.62024688720703
Epoch 21/100 loss: 17.956451416015625
Epoch 22/100 loss: 15.83503246307373
Epoch 23/100 loss: 14.106616973876953
Epoch 24/100 loss: 12.667831420898438
Epoch 25/100 loss: 11.485947608947754
Epoch 26/100 loss: 10.471734046936035
Ep

In [21]:
def get_predicted_results(inputs, inverse_word_dict):
    index = np.argmax(inputs)
    return inverse_word_dict[index]

def predict_sentence(sentence):
    sentence_split =  sentence.replace(".", "").lower().split()
    sentence_vector =  make_sentence_vector(sentence_split, word_dict)
    prediction_array = model(sentence_vector).data.numpy()
    predicted_word = get_predicted_results(prediction_array[0], inverse_word_dict)
    print(f"Sentence: {sentence_split[:2]}---- {predicted_word} ----- {sentence_split[2:]}")
    

In [25]:
model.get_word_embedding("leap")

tensor([[ 0.6845, -1.1483,  1.2902,  0.1237,  1.0086,  1.0882,  0.2756, -0.5882,
          1.7091, -1.0418,  0.9958,  0.2379,  0.3687,  2.1622,  1.0250, -0.1954,
         -0.3950,  0.6367,  0.0842,  1.0751]], grad_fn=<EmbeddingBackward>)