In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [23]:
from collections import Counter
import itertools

import numpy as np

from src.utils.logger import logger

In [34]:
PARAMS = {'dim': 128,
          'window': 5,
          'min_count': 1,
          'negative_samp': 5,
          'epochs': 10,
          'seed': 42}

### Load data

In [9]:
sequences = np.load('../data/books_sequences_sample.npy')
sequences = sequences.tolist()

### Get pairs

In [15]:
def get_pairs(sequences, window=5):
    pairs = []
    window = PARAMS['window']

    for sequence in sequences:
        for center_idx, node in enumerate(sequence):
            for i in range(-window, window+1):
                context_idx = center_idx + i
                if context_idx > 0 and context_idx < len(sequence) and node != sequence[context_idx]:
                    pairs.append((node, sequence[context_idx]))
                    
    return pairs

In [18]:
pairs = get_pairs(sequences, PARAMS['window'])
logger.info('Len of pairs: {:,}'.format(len(pairs)))

2019-11-27 12:04:13,047 - Len of pairs: 63,888


### Negative sampling

In [181]:
def get_negative_samples(sequences, power=0.75):
    sample_table_size = 1e6
    
    # Flatten list
    seq_flat = list(itertools.chain.from_iterable(sequences))
    
    # Get word frequency
    word_freq = Counter(seq_flat)
    
    # Convert to array
    word_freq = np.array(list(word_freq.items()), dtype=np.float64)
    
    # Adjust by power 
    word_freq[:, 1] = word_freq[:, 1] ** power
    
    # Get probabilities
    word_freq_sum = word_freq[:, 1].sum()
    word_freq[:, 1] = word_freq[:, 1] / word_freq_sum
    
    # Multiply probabilities by sample table size
    word_freq[:, 1] = np.round(word_freq[:, 1] * sample_table_size)
    
    # Convert to int
    word_freq = word_freq.astype(int).tolist()
    
    # Create sample table
    sample_table = [[tup[0]] * tup[1] for tup in word_freq]
    sample_table = list(itertools.chain.from_iterable(sample_table))
    
    return sample_table

In [182]:
samp_table = get_negative_samples(sequences)

### Create skip-gram model

In [186]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [198]:
emb_size=10
emb_dim = 10
u_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True)
v_embeddings = nn.Embedding(emb_size, emb_dim, sparse=True)

initrange = 0.5/emb_size
u_embeddings.weight.data.uniform_(-initrange, initrange)
v_embeddings.weight.data.uniform_(-initrange, initrange)

tensor([[-0.0185, -0.0152, -0.0059,  0.0161, -0.0259, -0.0434, -0.0407,  0.0061,
          0.0147, -0.0363],
        [-0.0145,  0.0046, -0.0296, -0.0288, -0.0404,  0.0445,  0.0388, -0.0191,
         -0.0246,  0.0424],
        [-0.0435,  0.0442, -0.0046, -0.0249, -0.0112,  0.0221, -0.0455, -0.0396,
          0.0460,  0.0369],
        [-0.0415,  0.0203,  0.0182, -0.0139,  0.0436,  0.0256,  0.0357, -0.0126,
          0.0469,  0.0110],
        [-0.0050, -0.0477,  0.0295,  0.0130, -0.0281,  0.0094, -0.0473,  0.0017,
          0.0454, -0.0103],
        [ 0.0109, -0.0363,  0.0052, -0.0433,  0.0360,  0.0311,  0.0175, -0.0436,
         -0.0159,  0.0490],
        [ 0.0148,  0.0466,  0.0431,  0.0260, -0.0319, -0.0033, -0.0496, -0.0394,
         -0.0053, -0.0259],
        [ 0.0230, -0.0355, -0.0117, -0.0018,  0.0344, -0.0466,  0.0123, -0.0269,
         -0.0371,  0.0148],
        [-0.0123, -0.0059, -0.0144,  0.0006,  0.0430,  0.0382,  0.0173, -0.0130,
         -0.0403,  0.0419],
        [-0.0110,  

In [199]:
u_embeddings.weight

Parameter containing:
tensor([[-0.0114,  0.0368, -0.0147,  0.0464,  0.0323,  0.0079, -0.0283, -0.0217,
         -0.0451, -0.0026],
        [-0.0292,  0.0343,  0.0427,  0.0311, -0.0375,  0.0035, -0.0429, -0.0261,
         -0.0168, -0.0264],
        [ 0.0489,  0.0181, -0.0218, -0.0067, -0.0157, -0.0236, -0.0153, -0.0209,
          0.0285, -0.0310],
        [-0.0244,  0.0109, -0.0453,  0.0263, -0.0247,  0.0163,  0.0304,  0.0101,
         -0.0270, -0.0390],
        [-0.0226,  0.0453, -0.0295, -0.0336, -0.0328, -0.0453, -0.0494, -0.0278,
          0.0485,  0.0446],
        [-0.0307,  0.0128, -0.0433, -0.0313,  0.0390, -0.0343,  0.0464,  0.0178,
         -0.0122,  0.0433],
        [ 0.0499,  0.0269, -0.0063, -0.0223, -0.0093, -0.0076,  0.0266, -0.0310,
         -0.0078,  0.0239],
        [-0.0006,  0.0291, -0.0166,  0.0286, -0.0494,  0.0380, -0.0094,  0.0287,
          0.0103, -0.0302],
        [-0.0485, -0.0298,  0.0009,  0.0145,  0.0481,  0.0285, -0.0376,  0.0436,
         -0.0413,  0.0436

In [202]:
inpt = [1, 2, 3]
emb_u = u_embeddings(torch.LongTensor(inpt))
emb_v = v_embeddings(torch.LongTensor(inpt))

In [204]:
emb_u

tensor([[-0.0292,  0.0343,  0.0427,  0.0311, -0.0375,  0.0035, -0.0429, -0.0261,
         -0.0168, -0.0264],
        [ 0.0489,  0.0181, -0.0218, -0.0067, -0.0157, -0.0236, -0.0153, -0.0209,
          0.0285, -0.0310],
        [-0.0244,  0.0109, -0.0453,  0.0263, -0.0247,  0.0163,  0.0304,  0.0101,
         -0.0270, -0.0390]], grad_fn=<EmbeddingBackward>)

In [205]:
emb_v

tensor([[-0.0145,  0.0046, -0.0296, -0.0288, -0.0404,  0.0445,  0.0388, -0.0191,
         -0.0246,  0.0424],
        [-0.0435,  0.0442, -0.0046, -0.0249, -0.0112,  0.0221, -0.0455, -0.0396,
          0.0460,  0.0369],
        [-0.0415,  0.0203,  0.0182, -0.0139,  0.0436,  0.0256,  0.0357, -0.0126,
          0.0469,  0.0110]], grad_fn=<EmbeddingBackward>)

In [262]:
score = torch.mul(emb_u, emb_v).squeeze()
score

tensor([[ 4.2301e-04,  1.5763e-04, -1.2648e-03, -8.9402e-04,  1.5161e-03,
          1.5764e-04, -1.6655e-03,  4.9921e-04,  4.1257e-04, -1.1215e-03],
        [-2.1268e-03,  7.9833e-04,  9.9889e-05,  1.6749e-04,  1.7612e-04,
         -5.2174e-04,  6.9428e-04,  8.2802e-04,  1.3102e-03, -1.1436e-03],
        [ 1.0119e-03,  2.2096e-04, -8.2383e-04, -3.6643e-04, -1.0789e-03,
          4.1874e-04,  1.0859e-03, -1.2789e-04, -1.2679e-03, -4.2762e-04]],
       grad_fn=<SqueezeBackward0>)

In [263]:
score = torch.sum(score, dim=1)
score

tensor([-0.0018,  0.0003, -0.0014], grad_fn=<SumBackward1>)

In [264]:
F.logsigmoid(score)

tensor([-0.6940, -0.6930, -0.6938], grad_fn=<LogSigmoidBackward>)

In [265]:
torch.mm(emb_u, emb_v.t())

tensor([[-0.0018,  0.0036, -0.0016],
        [-0.0024,  0.0003, -0.0025],
        [ 0.0027, -0.0027, -0.0014]], grad_fn=<MmBackward>)

In [266]:
emb_v.t()

tensor([[-0.0145, -0.0435, -0.0415],
        [ 0.0046,  0.0442,  0.0203],
        [-0.0296, -0.0046,  0.0182],
        [-0.0288, -0.0249, -0.0139],
        [-0.0404, -0.0112,  0.0436],
        [ 0.0445,  0.0221,  0.0256],
        [ 0.0388, -0.0455,  0.0357],
        [-0.0191, -0.0396, -0.0126],
        [-0.0246,  0.0460,  0.0469],
        [ 0.0424,  0.0369,  0.0110]], grad_fn=<TBackward>)

In [267]:
torch.t(emb_v)

tensor([[-0.0145, -0.0435, -0.0415],
        [ 0.0046,  0.0442,  0.0203],
        [-0.0296, -0.0046,  0.0182],
        [-0.0288, -0.0249, -0.0139],
        [-0.0404, -0.0112,  0.0436],
        [ 0.0445,  0.0221,  0.0256],
        [ 0.0388, -0.0455,  0.0357],
        [-0.0191, -0.0396, -0.0126],
        [-0.0246,  0.0460,  0.0469],
        [ 0.0424,  0.0369,  0.0110]], grad_fn=<TBackward>)

In [276]:
neg_inpt = [[4, 5, 6, 7], [4, 5, 6, 7], [4, 5, 6, 7]]

In [300]:
neg_emb_v = v_embeddings(torch.LongTensor(neg_inpt))

In [307]:
neg_emb_v.shape

torch.Size([3, 4, 10])

In [310]:
emb_u.unsqueeze(2).shape

torch.Size([3, 10, 1])

In [313]:
neg_score = torch.bmm(neg_emb_v, emb_u.unsqueeze(2))
neg_score.shape

torch.Size([3, 4, 1])

In [302]:
sum(F.logsigmoid(-1*neg_score))

tensor([-2.0798, -2.0740, -2.0850, -2.0770], grad_fn=<AddBackward0>)

In [303]:
neg_score = torch.sum(neg_score, dim=1)
neg_score

tensor([ 0.0036,  0.0015, -0.0093], grad_fn=<SumBackward1>)

In [304]:
sum(F.logsigmoid(-1*neg_score))

tensor(-2.0774, grad_fn=<AddBackward0>)

In [283]:
losses = []

In [284]:
losses.append(sum(score))
losses.append(sum(neg_score))
-1 * sum(losses)

tensor(0.0070, grad_fn=<MulBackward0>)

In [286]:
-1 * (sum(score) + sum(neg_score))

tensor(0.0070, grad_fn=<MulBackward0>)