In [47]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1994)

In [48]:
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.4


In [49]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(0)

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

False


In [50]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [51]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index("<UNK>"), seq))
    return Variable(LongTensor(idxs)) 

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

In [52]:
f=open("Harry Potter txt/Harry Potter 5 - Order of the Phoenix.txt",'r', encoding = 'cp1252')
raw=f.read()
tokens = nltk.sent_tokenize(raw)
corpus = [nltk.word_tokenize(tokens[i]) for i in range(len(tokens))]
corpus = [[word.lower() for word in sent] for sent in corpus]
index = [idx for idx, s in enumerate(corpus) if "chapter" in s]

In [53]:
corpus = corpus[index[5]:index[6]]

In [54]:
vocab = list(set(flatten(corpus)));len(vocab)

1746

In [55]:
word2index = {}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

index2word = {v: k for k, v in word2index.items()}

In [56]:
WINDOW_SIZE = 5
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

In [57]:
window_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        window_data.append((window[WINDOW_SIZE], window[i]))

In [58]:
def weighting(w_i, w_j):
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1
    
    x_max = 100
    alpha = 0.75
    
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha
    else:
        result = 1
        
    return result

In [59]:
X_i = Counter(flatten(corpus))

In [60]:
X_ik_window_5 = Counter(window_data)
X_ik = {}
weighting_dic = {}

In [61]:
from itertools import combinations_with_replacement

In [62]:
for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_window_5.get(bigram) is not None:
        co_occur = X_ik_window_5[bigram]
        X_ik[bigram] = co_occur + 1
        X_ik[(bigram[1], bigram[0])] = co_occur + 1
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1])
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0])

In [124]:
test = random.choice(window_data)
print(test)
try:
    print(X_ik[(test[0], test[1])] == X_ik[(test[1], test[0])])
except:
    1

('at', 'the')
True


In [128]:
u_p = []
v_p = []
co_p = []
weight_p = []

for pair in window_data:
    u_p.append(prepare_word(pair[0], word2index).view(1,-1))
    v_p.append(prepare_word(pair[1], word2index).view(1,-1))
    
    try:
        cooc = X_ik[pair]
    except:
        cooc = 1
    
    co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1,-1))
    weight_p.append(Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))
    
train_data = list(zip(u_p, v_p, co_p, weight_p))
del u_p
del v_p
del co_p
del weight_p
print(train_data[0])

(tensor([[1118]]), tensor([[149]]), tensor([[0.6931]]), tensor([[0.0532]]))


In [139]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_u = nn.Embedding(vocab_size, embedding_dim)
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
        initrange = (2.0 / (vocab_size + embedding_dim))**0.5
        self.embedding_v.weight.data.uniform_(-initrange, initrange)
        self.embedding_u.weight.data.uniform_(-initrange, initrange)
        self.v_bias.weight.data.uniform_(-initrange, initrange)
        self.u_bias.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, center_words, target_words, coocs, weights):
        center_embeds = self.embedding_v(center_words)
        target_embeds = self.embedding_u(target_words)
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_prod = target_embeds.bmm(center_embeds.transpose(1,2)).squeeze(2)
        
        loss = weights*torch.pow(inner_prod + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    def prediction(self, inputs):
        v_embeds = self.embedding_v(inputs) # B x 1 x D
        u_embeds = self.embedding_u(inputs) # B x 1 x D
                
        return v_embeds+u_embeds # final embed

In [148]:
EMBEDDING_SIZE = 100
BATCH_SIZE = 256
EPOCH = 50

In [149]:
losses = []
model = GloVe(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [150]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets, coocs, weights = zip(*batch)
        
        inputs = torch.cat(inputs)
        targets = torch.cat(targets)
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        
        model.zero_grad()

        loss = model(inputs, targets, coocs, weights)
        
        loss.backward()
        optimizer.step()
        losses.append(loss.data.tolist())
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))

Epoch : 0, mean_loss : 29.79
Epoch : 10, mean_loss : 4.30
Epoch : 20, mean_loss : 2.94
Epoch : 30, mean_loss : 2.25
Epoch : 40, mean_loss : 2.09


In [151]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:5] # sort by similarity

In [174]:
test = random.choice(list(vocab))
print(test)
word_similarity(test, vocab)

needs


[['kitchen', 0.5154996514320374],
 ['round', 0.3898712396621704],
 ['mundungus', 0.3614695370197296],
 ['full', 0.36109527945518494],
 ['scanning', 0.3384058475494385]]