In [1]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

In [2]:
embedding_dim = 100
learning_rate = 0.01
epochs = 10
batch_size = 10

In [3]:
with open('pos_data.p', 'rb') as f:
    pos_data = pickle.load(f)
    
with open('neg_data.p', 'rb') as f:
    neg_data = pickle.load(f)

with open('unigram_probs.p', 'rb') as f:
    unigram_probs = pickle.load(f)
    
vocab_size = len(unigram_probs)

central_words = []
contexts = []
neg_samples = []

for p in pos_data:
    central_words.append(p[0])
    contexts.append(p[1])
    
for n in neg_data:
    neg_samples.append(n[1])
    
dataset = [central_words, contexts, neg_samples]

In [4]:
print(vocab_size)

79


In [63]:
def create_batches(dataset, batch_size):
    
    batch_number = len(dataset[0]) // batch_size
    no_central_words = len(dataset[0])
    pos_words = []
    pos_contexts = []
    neg_contexts = []
    

    indices = np.arange(0, no_central_words)
    print(indices)
    #shuffle set
    np.random.shuffle(indices)
    
    for bn in range(batch_number):
        
        b_indices = indices[bn*batch_size:bn*batch_size + batch_size]
        
        central = []
        contx = []
        negs = []
        
        for d in b_indices:
            central.append(dataset[0][d])
            contx.append(dataset[1][d])
            negs.append(dataset[2][d])
              
        pos_words.append(torch.from_numpy(np.asarray(central)))
        pos_contexts.append(torch.from_numpy(np.asarray(contx)))
        neg_contexts.append(torch.from_numpy(np.asarray(negs)))
    
    return  pos_words, pos_contexts, neg_contexts

pos_words, pos_contexts, neg_contexts = create_batches(dataset, batch_size)

batched_dataset = {'pos_w': pos_words, 'pos_c': pos_contexts, 'neg_c':neg_contexts}

with open('batched_dataset.p', 'wb') as f:
    pickle.dump(batched_dataset, f) 

[   0    1    2 ..., 1675 1676 1677]


In [64]:
with open('batched_dataset.p', 'rb') as f:
    batched_dataset = pickle.load(f)

In [65]:
batched_dataset['pos_w'][0]

tensor([ 20,  32,  42,   5,  76,  71,  42,  55,  71,  59])

In [66]:
len(batched_dataset['pos_c'])

167

In [67]:
word_batches = batched_dataset['pos_w']
context_batches = batched_dataset['pos_c']
neg_context_batches = batched_dataset['neg_c']

no_batch = len(word_batches)

In [71]:
neg_context_batches[0]

tensor([[ 16,  71,  16,  71,   0],
        [  3,  32,  78,  34,  34],
        [ 13,  10,  26,  26,  66],
        [ 59,  51,  30,  10,  46],
        [ 53,  15,  46,  28,  15],
        [ 24,  19,   4,   3,  66],
        [ 21,  54,  13,  41,  66],
        [ 27,  31,   2,  69,  62],
        [  5,  59,  70,  65,  33],
        [ 62,  57,  76,  54,  26]])

In [69]:
class SkipGram(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGram, self).__init__()
        
        #sparse embeddings for word and context vectors
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim) #, sparse = True
        self.lin1 = nn.Linear(embedding_dim, vocabulary_size, bias = False)
           
    def forward(self, pos_words):
        
        out = self.w_embeddings(pos_words)
        
        out = self.lin1(out)
        
        final_out = F.log_softmax(out, dim = 0)
        
        return final_out

In [70]:
model = SkipGram(vocab_size, embedding_dim)
loss_func = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

for e in range(epochs):
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        
        contexts = context_batches[b]

        optimizer.zero_grad()

        preds = model(words)
        
        loss = loss_func(preds, contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()
        
    if (e+1)%10 == 0:
        print(total_loss)
    


tensor([ 20,  32,  42,   5,  76,  71,  42,  55,  71,  59])
tensor([ 15,  26,  32,  63,   6,  72,  58,  42,  38,  72])
tensor([ 42,  20,   9,  20,  42,  42,  16,  54,  52,  51])
tensor([ 15,  55,  48,  20,  42,  32,  57,  71,  71,  15])
tensor([ 61,  18,  23,  71,  15,  67,  15,  48,   9,  32])
tensor([ 57,  25,  18,   8,  71,  57,  33,  63,  37,  53])
tensor([ 14,  15,  30,   0,  18,  42,  31,  22,  47,  20])
tensor([ 73,   2,  11,  10,   4,   1,   7,  21,  14,  47])
tensor([  1,  15,   4,  39,  18,  63,  26,   8,  71,  26])
tensor([ 15,  39,  71,  38,  55,  47,  15,  42,   7,   6])
tensor([ 28,  55,  58,  42,  30,   8,  47,  42,  33,  14])
tensor([ 26,  28,  19,   0,  11,  33,  42,  18,  17,  47])
tensor([ 42,  21,  15,  53,  61,  15,  67,  14,  48,  18])
tensor([  1,   9,  59,   3,  18,  16,  15,  28,   9,  42])
tensor([  5,  16,   1,  47,  52,   1,  18,   1,  61,  16])
tensor([ 55,  30,  22,  35,  42,  71,  18,  15,  42,  16])
tensor([ 15,   1,  15,  59,  52,  71,  73,   9,  24,  42

tensor([ 12,  68,  30,  33,  45,  15,  55,   1,  33,  71])
tensor([ 15,  42,  27,   2,  30,  48,  55,  30,  60,  33])
tensor([ 71,  64,  12,  33,   1,  42,  78,  38,   0,  42])
tensor([ 65,  16,  66,  51,  63,  42,  15,  71,  42,   4])
tensor([ 27,  71,  72,  52,  58,  13,  54,  68,  16,  55])
tensor([ 42,  13,  14,  21,  12,  73,  16,  71,  48,  67])
tensor([ 11,   5,  42,  78,  15,  56,  16,  46,  72,  11])
tensor([ 52,  42,  65,  46,  55,  22,  48,  32,  33,  61])
tensor([ 48,  48,  55,  64,   5,   5,  69,  14,  27,  42])
tensor([ 69,  45,  69,  71,  78,  19,   3,  29,  70,  71])
tensor([  2,  53,  19,  18,  15,  43,  18,  55,  52,  21])
tensor([ 71,   1,  56,  33,  16,  42,  32,  14,  42,  77])
tensor([ 12,  20,  16,  15,  73,  53,  42,  72,  58,  47])
tensor([ 15,  35,  65,   7,   0,  26,  67,  30,   7,   9])
tensor([ 42,  36,  35,  47,  71,  42,  32,  16,  68,   3])
tensor([ 33,  23,  42,  42,  48,  63,  42,  42,  64,  15])
tensor([ 39,  46,  44,  15,  54,  14,   1,  48,  22,   2

tensor([ 32,  71,  15,  17,  42,  52,  18,  15,  17,  53])
tensor([ 78,  27,  16,  18,  21,  33,  69,  68,  32,   3])
tensor([ 29,  22,   8,   3,  48,   2,  53,  43,  43,   3])
tensor([ 34,   6,  15,   3,  42,  16,   5,  32,  47,  32])
tensor([ 25,  11,  63,   2,  26,  32,  50,  15,  12,  42])
tensor([ 42,  20,  27,  53,  32,  77,  70,  35,  66,  33])
tensor([ 73,  42,  40,  32,  24,  14,  15,  16,  32,  73])
tensor([ 70,  32,  33,  61,   4,  22,  42,  48,  20,  39])
tensor([  3,  12,   9,  15,  32,   5,  58,  33,   0,   5])
tensor([  6,  42,   8,  15,  42,   5,  42,  14,  72,   1])
tensor([ 33,  15,  61,  15,  55,  31,  66,  46,  33,  42])
tensor([ 15,  12,  47,  14,  33,  70,  11,  32,  18,  55])
tensor([  2,  12,   5,  48,  42,  12,  15,  68,  15,  72])
tensor([ 16,   0,   5,  14,  15,  19,  71,  14,  15,  61])
tensor([ 15,  12,  14,  32,   9,  39,  39,  72,  14,  42])
tensor([ 47,  42,  10,  15,  42,  53,   2,  18,  48,  16])
tensor([ 42,  16,  48,   0,  10,  42,  42,  29,   9,  57

tensor([ 69,  45,  69,  71,  78,  19,   3,  29,  70,  71])
tensor([  2,  53,  19,  18,  15,  43,  18,  55,  52,  21])
tensor([ 71,   1,  56,  33,  16,  42,  32,  14,  42,  77])
tensor([ 12,  20,  16,  15,  73,  53,  42,  72,  58,  47])
tensor([ 15,  35,  65,   7,   0,  26,  67,  30,   7,   9])
tensor([ 42,  36,  35,  47,  71,  42,  32,  16,  68,   3])
tensor([ 33,  23,  42,  42,  48,  63,  42,  42,  64,  15])
tensor([ 39,  46,  44,  15,  54,  14,   1,  48,  22,   2])
tensor([ 13,  12,  71,  28,  15,  16,  16,  67,  55,  20])
tensor([ 55,  45,  14,  16,  63,  22,  18,  32,   6,  48])
tensor([ 14,   7,  33,  77,   2,  68,  33,  16,  14,  63])
tensor([ 16,  50,  63,  15,  30,   4,  55,  42,   9,  74])
tensor([ 42,  74,   2,  67,  16,  53,  56,  31,  20,  24])
tensor([ 66,  30,  58,  48,  71,  61,  51,  45,  22,  53])
tensor([ 73,  38,  48,  48,  14,  24,  33,  36,  32,  13])
tensor([ 52,  71,  18,  13,  41,  52,  47,  15,  28,  58])
tensor([ 20,  32,  42,   5,  76,  71,  42,  55,  71,  59

tensor([ 15,  42,  27,   2,  30,  48,  55,  30,  60,  33])
tensor([ 71,  64,  12,  33,   1,  42,  78,  38,   0,  42])
tensor([ 65,  16,  66,  51,  63,  42,  15,  71,  42,   4])
tensor([ 27,  71,  72,  52,  58,  13,  54,  68,  16,  55])
tensor([ 42,  13,  14,  21,  12,  73,  16,  71,  48,  67])
tensor([ 11,   5,  42,  78,  15,  56,  16,  46,  72,  11])
tensor([ 52,  42,  65,  46,  55,  22,  48,  32,  33,  61])
tensor([ 48,  48,  55,  64,   5,   5,  69,  14,  27,  42])
tensor([ 69,  45,  69,  71,  78,  19,   3,  29,  70,  71])
tensor([  2,  53,  19,  18,  15,  43,  18,  55,  52,  21])
tensor([ 71,   1,  56,  33,  16,  42,  32,  14,  42,  77])
tensor([ 12,  20,  16,  15,  73,  53,  42,  72,  58,  47])
tensor([ 15,  35,  65,   7,   0,  26,  67,  30,   7,   9])
tensor([ 42,  36,  35,  47,  71,  42,  32,  16,  68,   3])
tensor([ 33,  23,  42,  42,  48,  63,  42,  42,  64,  15])
tensor([ 39,  46,  44,  15,  54,  14,   1,  48,  22,   2])
tensor([ 13,  12,  71,  28,  15,  16,  16,  67,  55,  20

tensor([ 70,  32,  33,  61,   4,  22,  42,  48,  20,  39])
tensor([  3,  12,   9,  15,  32,   5,  58,  33,   0,   5])
tensor([  6,  42,   8,  15,  42,   5,  42,  14,  72,   1])
tensor([ 33,  15,  61,  15,  55,  31,  66,  46,  33,  42])
tensor([ 15,  12,  47,  14,  33,  70,  11,  32,  18,  55])
tensor([  2,  12,   5,  48,  42,  12,  15,  68,  15,  72])
tensor([ 16,   0,   5,  14,  15,  19,  71,  14,  15,  61])
tensor([ 15,  12,  14,  32,   9,  39,  39,  72,  14,  42])
tensor([ 47,  42,  10,  15,  42,  53,   2,  18,  48,  16])
tensor([ 42,  16,  48,   0,  10,  42,  42,  29,   9,  57])
tensor([ 15,  48,  67,   5,  20,  76,  12,  33,  42,  16])
tensor([  2,  48,  20,  37,  14,  46,  33,   9,  47,  48])
tensor([ 46,  75,  42,  32,  10,  48,  15,  11,  31,  20])
tensor([ 15,  16,  11,  33,  19,  33,  15,   2,  11,  73])
tensor([ 42,   5,  69,  47,   0,  69,  45,  14,  15,  14])
tensor([ 12,  68,  30,  33,  45,  15,  55,   1,  33,  71])
tensor([ 15,  42,  27,   2,  30,  48,  55,  30,  60,  33

In [23]:
model.w_embeddings.weight[0]

tensor([-1.5256, -0.7502, -0.6540, -1.6095, -0.1002, -0.6092, -0.9798,
        -1.6091, -0.7121,  0.3037, -0.7773, -0.2515, -0.2223,  1.6871,
         0.2284,  0.4676, -0.6970, -1.1608,  0.6995,  0.1991,  0.8657,
         0.2444, -0.6629,  0.8073,  1.1017, -0.1759, -2.2456, -1.4465,
         0.0612, -0.6177, -0.7981, -0.1316,  1.8793, -0.0721,  0.1578,
        -0.7735,  0.1991,  0.0457,  0.1530, -0.4757, -0.1110,  0.2927,
        -0.1578, -0.0288,  2.3571, -1.0373,  1.5748, -0.6298, -0.9274,
         0.5451,  0.0663, -0.4370,  0.7626,  0.4415,  1.1651,  2.0154,
         0.1374,  0.9386, -0.1860, -0.6446,  1.5392, -0.8696, -3.3312,
        -0.7479, -0.0255, -1.0233, -0.5962, -1.0055, -0.2106, -0.0075,
         1.6734,  0.0103, -0.7040, -0.1853, -0.9962, -0.8313, -0.4610,
        -0.5601,  0.3956, -0.9823, -0.5065,  0.0998, -0.6540,  0.7317,
        -1.4344, -0.5008,  0.1716, -0.1600,  0.2546, -0.5020, -1.0412,
         0.7323, -1.0483, -0.4709,  0.2911,  1.9907,  0.6614,  1.1899,
      

In [24]:
class SkipGramNeg(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGramNeg, self).__init__()
        
        #sparse embeddings for word and context vectors
        
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        self.c_embeddings = nn.Embedding(vocabulary_size, embedding_dim, sparse = True)
        
#     initialization of embeds
#     https://adoni.github.io/2017/11/08/word2vec-pytorch/

#def init_emb(self):
#     initrange = 0.5 / self.embedding_dim
#     self.u_embeddings.weight.data.uniform_(-initrange, initrange)
#     self.v_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_words, pos_conts, neg_conts):
        
        #Loss calculation, Levy&Goldberg word2vec Explained
        #https://adoni.github.io/2017/11/08/word2vec-pytorch/
        
        w_out = self.w_embeddings(pos_words)
        
        pos_out = self.c_embeddings(pos_conts)
        neg_out = self.c_embeddings(neg_conts)
        
#         print(neg_conts)
#         print(pos_conts)
#         print(pos_words)
               
        pos_val = torch.mul(w_out, pos_out).squeeze()
        pos_val = torch.sum(pos_val, dim = 1)
        pos_loss = F.logsigmoid(pos_val)
        
        neg_val = torch.bmm(neg_out, w_out.unsqueeze(2)).squeeze()
        neg_val = torch.sum(neg_val, dim = 1)
        neg_loss = F.logsigmoid(-neg_val)
        
        final_out = pos_loss + neg_loss.sum()
        final_out = -final_out.sum()/len(pos_words) #neg and mean
         
        return final_out
    

In [105]:
epochs = 150
model = SkipGramNeg(vocab_size, embedding_dim)
optimizer = optim.SparseAdam(model.parameters(), lr = learning_rate)

losses = []
avg_losses = []

print('epoch, total loss, average loss, duration')
for e in range(epochs):
    
    then = datetime.now()
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        contexts = context_batches[b]
        neg_contexts = neg_context_batches[b]

        optimizer.zero_grad()

        loss = model(words, contexts, neg_contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()       
    
    now = datetime.now()
        
    losses.append(total_loss)
    
    avg_loss = np.mean(losses)/no_batch
    
    print(e, total_loss, avg_loss, now-then)
    
    avg_losses.append(avg_loss)
    

epoch, total loss, average loss, duration
0 6825.353890698403 40.870382579 0:00:00.108423
1 931.2951987367123 23.2235002678 0:00:00.102689
2 420.3461255710572 16.3213477345 0:00:00.110398
3 301.54100554808974 12.6924194918 0:00:00.103525
4 226.53614019602537 10.4252363602 0:00:00.099595
5 169.6015288978815 8.85695996971 0:00:00.094720
6 128.27353544719517 7.70140926013 0:00:00.109288
7 97.6550446646288 6.81182819593 0:00:00.101043
8 75.03320912178606 6.10488069121 0:00:00.108342
9 57.568934701383114 5.52886503807 0:00:00.170948
10 44.56126679619774 5.05049857397 0:00:00.148143
11 34.65767208207399 4.64691794035 0:00:00.164324
12 26.931503540836275 4.30186782865 0:00:00.138341
13 20.774830242153257 4.0034772824 0:00:00.110238
14 15.90551572991535 3.74292830418 0:00:00.106839
15 12.081460603745654 3.51351678989 0:00:00.099129
16 9.166912177111953 3.31006825458 0:00:00.117032
17 6.94044858077541 3.1284844389 0:00:00.163336
18 5.265653137117624 2.96548688196 0:00:00.135329
19 4.14521851157

In [106]:
def get_embeddings(model):
    
    return model.w_embeddings.weight.data, model.c_embeddings.weight.data

def save_embeddings(embeds, file_name):
    
    with open(file_name, 'wb') as file:
        pickle.dump(embeds.numpy(), file)

In [108]:
wm, cm = get_embeddings(model)

save_embeddings(wm, 'wordvecs_skipgram_word.pickle')

save_embeddings(cm, 'wordvecs_skipgram_context.pickle')

In [109]:
with open('wordvecs_skipgram.pickle', 'rb') as file:
    w_embeds = pickle.load(file)

In [111]:
w_embeds.shape

(79, 100)

In [None]:
# visualize tsne for embeds
#each epoch save embedding weight as pickles!!!
