In [1]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

In [2]:
embedding_dim = 100
learning_rate = 0.01
epochs = 10
batch_size = 10

In [3]:
with open('pos_data.p', 'rb') as f:
    pos_data = pickle.load(f)
    
with open('neg_data.p', 'rb') as f:
    neg_data = pickle.load(f)

with open('unigram_probs.p', 'rb') as f:
    unigram_probs = pickle.load(f)
    
vocab_size = len(unigram_probs)

central_words = []
contexts = []
neg_samples = []

for p in pos_data:
    central_words.append(p[0])
    contexts.append(p[1])
    
for n in neg_data:
    neg_samples.append(n[1])
    
dataset = [central_words, contexts, neg_samples]

In [4]:
print(vocab_size)

79


In [63]:
def create_batches(dataset, batch_size):
    
    batch_number = len(dataset[0]) // batch_size
    no_central_words = len(dataset[0])
    pos_words = []
    pos_contexts = []
    neg_contexts = []
    
    for bn in range(batch_number):
        indices = np.arange(0, no_central_words)
        
        #shuffle set
        np.random.shuffle(indices)
        
        indices = indices[0:batch_size]
        #shuffle dataset
        
        central = []
        contx = []
        negs = []
        
        for d in indices:
            
            central.append(dataset[0][d])
            contx.append(dataset[1][d])
            negs.append(dataset[2][d])
              
        pos_words.append(torch.from_numpy(np.asarray(central)))
        pos_contexts.append(torch.from_numpy(np.asarray(contx)))
        neg_contexts.append(torch.from_numpy(np.asarray(negs)))
    
    return  pos_words, pos_contexts, neg_contexts

pos_words, pos_contexts, neg_contexts = create_batches(dataset, batch_size)

batched_dataset = {'pos_w': pos_words, 'pos_c': pos_contexts, 'neg_c':neg_contexts}

with open('batched_dataset.p', 'wb') as f:
    pickle.dump(batched_dataset, f) 

In [64]:
with open('batched_dataset.p', 'rb') as f:
    batched_dataset = pickle.load(f)

In [65]:
batched_dataset['pos_w'][0]

tensor([  8,  67,  16,  56,  56,  71,  70,  42,  72,  78])

In [66]:
len(batched_dataset['pos_c'])

167

In [67]:
word_batches = batched_dataset['pos_w']
context_batches = batched_dataset['pos_c']
neg_context_batches = batched_dataset['neg_c']

no_batch = len(word_batches)

In [68]:
neg_context_batches[0]

tensor([[ 64,  67,  45,  32,  54],
        [ 73,  10,  34,  60,  10],
        [ 55,  27,  70,   4,  50],
        [ 78,  48,  64,  16,  75],
        [ 44,  36,  71,  32,  22],
        [ 29,  41,  48,  19,  28],
        [ 77,  44,  11,  66,  49],
        [  7,  66,   4,  73,  13],
        [ 19,  15,  53,  33,  46],
        [ 26,  76,  46,   5,  22]])

In [69]:
class SkipGram(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGram, self).__init__()
        
        #sparse embeddings for word and context vectors
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim) #, sparse = True
        self.lin1 = nn.Linear(embedding_dim, vocabulary_size, bias = False)
           
    def forward(self, pos_words):
        
        out = self.w_embeddings(pos_words)
        
        out = self.lin1(out)
        
        final_out = F.log_softmax(out, dim = 0)
        
        return final_out

In [70]:
model = SkipGram(vocab_size, embedding_dim)
loss_func = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

for e in range(epochs):
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        contexts = context_batches[b]

        optimizer.zero_grad()

        preds = model(words)
        
        loss = loss_func(preds, contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()
        
    if (e+1)%10 == 0:
        print(total_loss)
    


250.34693259000778


In [71]:
model.w_embeddings.weight[0]

tensor([-0.1757,  0.3037, -0.4713, -0.8476,  0.0191,  0.6583,  0.2654,
         0.0535,  0.1119, -0.4707, -1.0232, -0.6034, -0.6984, -0.0050,
        -0.8637, -0.8943,  0.5435,  0.4770,  0.6377, -1.2080,  1.5565,
         0.2134, -0.4355,  0.9534,  0.2121, -0.2937, -0.2291,  0.2925,
         0.6719,  0.3989,  0.9640, -1.1787,  0.4202, -0.8986, -1.8425,
        -0.3739, -0.6442, -0.2802,  0.2347,  0.0256, -0.3941, -1.3290,
        -1.0294,  0.1271,  1.5448, -0.6627, -0.2307,  0.7656,  0.8008,
        -0.4602,  0.8287, -1.0531, -0.2534,  0.9345,  0.4222, -1.0691,
         1.4121,  0.4038, -0.1973,  0.4030, -0.2278,  0.3137,  0.9210,
         1.0097,  0.6797, -1.5831, -0.2292, -0.2369, -0.8806, -0.1497,
         1.4229, -0.1659,  0.7073, -0.3759,  0.0603, -0.0784, -1.3290,
        -0.0956, -0.7328,  0.3626, -0.3000, -1.6893, -0.9691,  0.2167,
        -0.6114, -0.5323, -0.1796, -0.1267,  0.0232, -0.7965,  0.3290,
         2.7254, -0.0419,  1.2038, -1.8167,  0.7708, -0.4069, -1.2394,
      

In [102]:
class SkipGramNeg(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGramNeg, self).__init__()
        
        #sparse embeddings for word and context vectors
        
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        self.c_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        
#     initialization of embeds
#     https://adoni.github.io/2017/11/08/word2vec-pytorch/

#def init_emb(self):
#     initrange = 0.5 / self.embedding_dim
#     self.u_embeddings.weight.data.uniform_(-initrange, initrange)
#     self.v_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_words, pos_conts, neg_conts):
        
        #Loss calculation, Levy&Goldberg word2vec Explained
        #https://adoni.github.io/2017/11/08/word2vec-pytorch/
        
        w_out = self.w_embeddings(pos_words)
        
        pos_out = self.c_embeddings(pos_conts)
        neg_out = self.c_embeddings(neg_conts)
        
#         print(neg_conts)
#         print(pos_conts)
#         print(pos_words)
               
        pos_val = torch.mul(w_out, pos_out).squeeze()
        pos_val = torch.sum(pos_val, dim = 1)
        pos_loss = F.logsigmoid(pos_val)
        
        neg_val = torch.bmm(neg_out, w_out.unsqueeze(2)).squeeze()
        neg_val = torch.sum(neg_val, dim = 1)
        neg_loss = F.logsigmoid(-neg_val)
        
        final_out = pos_loss + neg_loss.sum()
        final_out = -final_out.sum()/len(pos_words) #neg and mean
         
        return final_out
    

In [105]:
epochs = 150
model = SkipGramNeg(vocab_size, embedding_dim)
optimizer = optim.SparseAdam(model.parameters(), lr = learning_rate)

losses = []
avg_losses = []

print('epoch, total loss, average loss, duration')
for e in range(epochs):
    
    then = datetime.now()
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        contexts = context_batches[b]
        neg_contexts = neg_context_batches[b]

        optimizer.zero_grad()

        loss = model(words, contexts, neg_contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()       
    
    now = datetime.now()
        
    losses.append(total_loss)
    
    avg_loss = np.mean(losses)/no_batch
    
    print(e, total_loss, avg_loss, now-then)
    
    avg_losses.append(avg_loss)
    

epoch, total loss, average loss, duration
0 6825.353890698403 40.870382579 0:00:00.108423
1 931.2951987367123 23.2235002678 0:00:00.102689
2 420.3461255710572 16.3213477345 0:00:00.110398
3 301.54100554808974 12.6924194918 0:00:00.103525
4 226.53614019602537 10.4252363602 0:00:00.099595
5 169.6015288978815 8.85695996971 0:00:00.094720
6 128.27353544719517 7.70140926013 0:00:00.109288
7 97.6550446646288 6.81182819593 0:00:00.101043
8 75.03320912178606 6.10488069121 0:00:00.108342
9 57.568934701383114 5.52886503807 0:00:00.170948
10 44.56126679619774 5.05049857397 0:00:00.148143
11 34.65767208207399 4.64691794035 0:00:00.164324
12 26.931503540836275 4.30186782865 0:00:00.138341
13 20.774830242153257 4.0034772824 0:00:00.110238
14 15.90551572991535 3.74292830418 0:00:00.106839
15 12.081460603745654 3.51351678989 0:00:00.099129
16 9.166912177111953 3.31006825458 0:00:00.117032
17 6.94044858077541 3.1284844389 0:00:00.163336
18 5.265653137117624 2.96548688196 0:00:00.135329
19 4.14521851157

In [106]:
def get_embeddings(model):
    
    return model.w_embeddings.weight.data, model.c_embeddings.weight.data

def save_embeddings(embeds, file_name):
    
    with open(file_name, 'wb') as file:
        pickle.dump(embeds.numpy(), file)

In [108]:
wm, cm = get_embeddings(model)

save_embeddings(wm, 'wordvecs_skipgram_word.pickle')

save_embeddings(cm, 'wordvecs_skipgram_context.pickle')

In [109]:
with open('wordvecs_skipgram.pickle', 'rb') as file:
    w_embeds = pickle.load(file)

In [111]:
w_embeds.shape

(79, 100)

In [None]:
# visualize tsne for embeds
#each epoch save embedding weight as pickles!!!
