In [None]:
# explore glove word emebddings

In [1]:
import pandas as pd
import numpy as np
from torch import nn, optim
import torch

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# function to load glove vectors from file
def load_glove(file_path):
    file_object  = open(file_path, "r") 
    model = {} 
    for l in file_object:
        line = l.split()
        word = line[0]
        value = np.array([float(val) for val in line[1:]])
        model[word] = value
    return model

In [None]:
file_with_path = "../data/embeddings/glove.6B.50d.txt"
glove = load_glove(file_with_path)

In [None]:
glove['python'].reshape(1, -1)

In [None]:
# calulclate cosine similarity (import from sklearn for example)
cosine_similarity(glove['fast'].reshape(1, -1), glove['speed'].reshape(1, -1))

In [None]:
# embeddings operations -> classic queen - woman + man =  king 
predicted_king = glove['queen'] - glove['woman'] + glove['man']
actual_king = glove['king']

cosine_similarity(actual_king.reshape(1, -1), predicted_king.reshape(1, -1)) # quite close

# calculate own embeddings with CBOW (alternative would be skipgrams)

In [2]:
text = """How that personage haunted my dreams, I need scarcely tell you. On
stormy nights, when the wind shook the four corners of the house and
the surf roared along the cove and up the cliffs, I would see him in a
thousand forms, and with a thousand diabolical expressions. Now the leg
would be cut off at the knee, now at the hip, now he was a monstrous
kind of a creature who had never had but the one leg, and that in the
middle of his body. To see him leap and run and pursue me over hedge and
ditch was the worst of nightmares. And altogether I paid pretty dear for
my monthly fourpenny piece, in the shape of these abominable fancies"""

text = text.replace(',','').replace('.','').lower().split()

In [3]:
corpus  = set(text)
corpus_length  = len(corpus)

In [14]:
word_dict = {}
inverse_word_dict = {}

for i, word in enumerate(corpus):
    word_dict[word] = i
    inverse_word_dict[i] = word
    

In [15]:
data = []

for i in range(2, len(text)-2):
    sentence = [text[i-2], text[i-1], text[i+1], text[i+2]]
    target = text[i]
    data.append((sentence, target))

In [16]:
text[0]

'how'

In [17]:
embedding_size = 20

class CBOW(nn.Module):
    def __init__(self, corpus_length, embedding_dim):
        super(CBOW, self).__init__()
        
        self.embeddings = nn.Embedding(corpus_length, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 64)
        self.linear2 = nn.Linear(64, corpus_length)
        
        self.activation_fun1 = nn.ReLU()
        self.activation_fun2 = nn.LogSoftmax(dim = -1)
        
    def forward(self, inputs):
        
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_fun1(out)
        out = self.linear2(out)
        out = self.activation_fun2(out)
        return out
        
    def get_word_embedding(self, word):
        word = torch.LongTensor([word_dict[word]])
        embd = self.embeddings(word)
        return embd

In [18]:
model = CBOW(corpus_length, embedding_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.01)


In [19]:
def make_sentence_vector(sentence, word_dict):
    idxs = [word_dict[w] for w in sentence]
    return torch.tensor(idxs, dtype=torch.long)


In [20]:
make_sentence_vector(data[0][0], word_dict)

tensor([63, 60, 16, 74])

In [21]:
word_dict['personage']

14

In [22]:
model

CBOW(
  (embeddings): Embedding(82, 20)
  (linear1): Linear(in_features=20, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=82, bias=True)
  (activation_fun1): ReLU()
  (activation_fun2): LogSoftmax(dim=-1)
)

In [24]:
epochs =100
for epoch in range(epochs):
    epoch_loss = 0
    for context_words, target_word in data:
        
        inputs = make_sentence_vector(context_words, word_dict)
        target = [word_dict[target_word]]
        model.zero_grad()
        log_probs = model(inputs)
        loss = loss_function(log_probs, torch.tensor(target, dtype=torch.long))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.data
    print(f"Epoch {epoch}/{epochs} loss: {epoch_loss.item()}")

Epoch 0/100 loss: 18.832426071166992
Epoch 1/100 loss: 16.646726608276367
Epoch 2/100 loss: 14.897096633911133
Epoch 3/100 loss: 13.437186241149902
Epoch 4/100 loss: 12.206006050109863
Epoch 5/100 loss: 11.161602020263672
Epoch 6/100 loss: 10.26247501373291
Epoch 7/100 loss: 9.483185768127441
Epoch 8/100 loss: 8.809536933898926
Epoch 9/100 loss: 8.208992004394531
Epoch 10/100 loss: 7.68635368347168
Epoch 11/100 loss: 7.216772556304932
Epoch 12/100 loss: 6.801068305969238
Epoch 13/100 loss: 6.423181056976318
Epoch 14/100 loss: 6.082520484924316
Epoch 15/100 loss: 5.777169704437256
Epoch 16/100 loss: 5.494428634643555
Epoch 17/100 loss: 5.238974094390869
Epoch 18/100 loss: 5.003624439239502
Epoch 19/100 loss: 4.787221431732178
Epoch 20/100 loss: 4.588069915771484
Epoch 21/100 loss: 4.402891159057617
Epoch 22/100 loss: 4.231438159942627
Epoch 23/100 loss: 4.072458744049072
Epoch 24/100 loss: 3.9232451915740967
Epoch 25/100 loss: 3.7837941646575928
Epoch 26/100 loss: 3.6546950340270996
Epo