In [9]:
import numpy as np

from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from scipy import stats

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

import time
from datetime import datetime

from collections import defaultdict
from collections import Counter

from random import randint
import pickle

import warnings
warnings.filterwarnings('error')

In [142]:
embedding_dim = 100
learning_rate = 0.01
epochs = 10
batch_size = 10

In [11]:
with open('pos_data.p', 'rb') as f:
    pos_data = pickle.load(f)
    
with open('neg_data.p', 'rb') as f:
    neg_data = pickle.load(f)

with open('unigram_probs.p', 'rb') as f:
    unigram_probs = pickle.load(f)
    
vocab_size = len(unigram_probs)

central_words = []
contexts = []
neg_samples = []

for p in pos_data:
    central_words.append(p[0])
    contexts.append(p[1])
    
for n in neg_data:
    neg_samples.append(n[1])
    
dataset = [central_words, contexts, neg_samples]

In [181]:
def create_batches(dataset, batch_size):
    
    batch_number = len(dataset[0]) // batch_size
    no_central_words = len(dataset[0])
    pos_words = []
    pos_contexts = []
    neg_contexts = []
    
    for bn in range(batch_number):
        indices = np.arange(0, no_central_words)
        
        #shuffle set
        np.random.shuffle(indices)
        
        indices = indices[0:batch_size]
        #shuffle dataset
        
        central = []
        contx = []
        negs = []
        
        for d in indices:
            
            central.append(dataset[0][d])
            contx.append(dataset[1][d])
            negs.append(dataset[2][d])
              
        pos_words.append(torch.from_numpy(np.asarray(central)))
        pos_contexts.append(torch.from_numpy(np.asarray(contx)))
        neg_contexts.extend(torch.from_numpy(np.asarray(negs)))
    
    return  pos_words, pos_contexts, neg_contexts

pos_words, pos_contexts, neg_contexts = create_batches(dataset, batch_size)

batched_dataset = {'pos_w': pos_words, 'pos_c': pos_contexts, 'neg_c':neg_contexts}

with open('batched_dataset.p', 'wb') as f:
    pickle.dump(batched_dataset, f) 

In [182]:
with open('batched_dataset.p', 'rb') as f:
    batched_dataset = pickle.load(f)

In [262]:
batched_dataset['pos_w'][0]

tensor([ 38,   4,  34,  20,  47,  51,  37,  13,  11,  34])

In [184]:
len(batched_dataset['pos_c'])

168

In [259]:
word_batches = batched_dataset['pos_w']
context_batches = batched_dataset['pos_c']
neg_context_batches = batched_dataset['neg_c']

no_batch = len(word_batches)

In [186]:
class SkipGram(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGram, self).__init__()
        
        #sparse embeddings for word and context vectors
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim) #, sparse = True
        self.lin1 = nn.Linear(embedding_dim, vocabulary_size, bias = False)
           
    def forward(self, pos_words):
        
        out = self.w_embeddings(pos_words)
        
        out = self.lin1(out)
        
        final_out = F.log_softmax(out, dim = 0)
        
        return final_out

In [187]:
model = SkipGram(vocab_size, embedding_dim)
loss_func = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

for e in range(epochs):
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        contexts = context_batches[b]

        optimizer.zero_grad()

        preds = model(words)
        
        loss = loss_func(preds, contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()
        
    if (e+1)%10 == 0:
        print(total_loss)
    


248.24717736244202


In [188]:
model.w_embeddings.weight[0]

tensor([-0.2402, -1.1282, -0.0895, -0.3965, -0.2605, -0.6403, -1.3293,
        -0.7503,  0.5070, -0.2929, -1.0950,  0.2650, -1.1940,  0.2802,
        -0.1430,  0.6097,  0.0903,  0.4231, -0.2599,  1.0438,  1.3403,
        -3.3277,  0.6644,  0.9449,  0.2858, -0.2634,  0.4547, -0.4700,
        -0.9312,  0.1417,  1.4448,  0.0696, -0.1068,  0.9950, -0.1015,
        -0.3644, -0.2624, -0.5105, -0.2705, -0.7756, -0.6153,  0.4367,
         0.2607, -0.3589,  0.7530,  0.5306,  0.0430, -0.5074, -0.0666,
        -0.2776,  0.1563, -0.3429, -0.6137, -0.4683,  0.0128,  0.8209,
        -0.6862, -0.7298, -0.5359, -0.0419,  0.9605, -1.0828,  0.7252,
        -0.4318,  0.3126,  0.2615, -0.4629, -0.6999, -0.3502,  1.0422,
        -0.4559,  0.5818,  0.9175, -0.0764, -0.0485, -0.2951, -1.7486,
        -0.1142,  0.0655,  0.1596,  0.1953, -0.2336,  0.2528, -0.3813,
         0.4473,  0.3742, -0.0122,  0.0698, -0.4119,  0.4317,  0.3617,
        -1.0024,  0.4201, -0.2866,  0.7597,  0.7087,  0.8933,  0.1399,
      

In [189]:
class SkipGramNeg(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(SkipGramNeg, self).__init__()
        
        #sparse embeddings for word and context vectors
        
        #initialization of embeds?
        
        self.w_embeddings = nn.Embedding(vocabulary_size, embedding_dim) #, sparse = True
        self.lin1 = nn.Linear(embedding_dim, vocabulary_size, bias = False)
        
    def forward(self, pos_words):
        
        out = self.w_embeddings(pos_words)
        
        out = self.lin1(out)
        
        final_out = F.log_softmax(out, dim = 0)
        
        return final_out

In [261]:
epochs = 10 
model = SkipGramNeg(vocab_size, embedding_dim)
loss_func = nn.NLLLoss()
#loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

losses = []
avg_losses = []

print('epoch, total loss, average loss, duration')
for e in range(epochs):
    
    then = datetime.now()
    
    total_loss = 0.0
    for b in range(no_batch):
        
        words = word_batches[b]
        contexts = context_batches[b]
        neg_contexts = neg_context_batches[b]

        optimizer.zero_grad()

        preds = model(words)
        
        loss = loss_func(preds, contexts)
        
        loss.backward()
        
        optimizer.step()

        total_loss += loss.item()
        
    
    now = datetime.now()
        
    losses.append(total_loss)
    
    avg_loss = np.mean(losses)/no_batch
    
    print(e, total_loss, avg_loss, now-then)
    
    avg_losses.append(avg_loss)
    

epoch, total loss, average loss, duration
0 388.3636381626129 2.3116883224 0:00:00.056499
1 298.03815948963165 2.04286249301 0:00:00.044088
2 279.8476963043213 1.91716169436 0:00:00.044176
3 269.78527426719666 1.83933745271 0:00:00.044625
4 264.7503014802933 1.7866488925 0:00:00.054584
5 258.9631435871124 1.74578195763 0:00:00.045191
6 255.74946749210358 1.71385857209 0:00:00.042508
7 252.07343870401382 1.68718089248 0:00:00.043858
8 250.33574923872948 1.66528232059 0:00:00.043178
9 246.82622891664505 1.64567446288 0:00:00.044878


In [254]:
print(len(preds[0]))

79


In [243]:
def get_embeddings(model):
    
    return model.w_embeddings.weight.data, model.lin1.weight.data

def save_embeddings(embeds, file_name):
    
    with open(file_name, 'wb') as file:
        pickle.dump(embeds.numpy(), file)

In [244]:
wm, cm = get_embeddings(model)

In [245]:
save_embeddings(wm, 'wordvecs_skipgram.pickle')

In [247]:
with open('wordvecs_skipgram.pickle', 'rb') as file:
    w_embeds = pickle.load(file)