In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [3]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

from src.ml.data_loader_with_meta import Sequences, SequencesDataset
from src.ml.skipgram import SkipGram as SkipGramBase
from src.ml.skipgram_with_meta_weighted import SkipGram
from src.utils.logger import logger
from src.utils.io_utils import load_model

In [4]:
batchsize = 1
shuffle = False
num_workers = 4
emb_dim = 8
epochs = 1
initial_lr=0.025
MODEL_PATH = '../model'

In [5]:
dataset = 'electronics'

In [34]:
sequences = Sequences('../data/{}_sequences_samp.npy'.format(dataset), 
                      '../data/{}_edges_val_samp.csv'.format(dataset),
                      '../data/{}_meta.csv'.format(dataset))

2019-12-09 16:42:39,860 - Sequences loaded (length = 5,000)
2019-12-09 16:42:39,936 - Validation set loaded: (100000, 3)
2019-12-09 16:42:39,944 - Word frequency calculated
2019-12-09 16:42:39,981 - Adding val products to word2id, original size: 28695
2019-12-09 16:42:40,049 - Added val products to word2id, updated size: 133050
2019-12-09 16:42:40,053 - No. of unique tokens: 133050
2019-12-09 16:42:41,312 - Model saved to model/word2id
2019-12-09 16:42:42,532 - Model saved to model/id2word
2019-12-09 16:42:42,533 - Word2Id and Id2Word created and saved
2019-12-09 16:42:46,268 - No. of rows in meta before filter by word2id: 498196
2019-12-09 16:42:46,424 - No. of rows in meta after filter by word2id: 79566
2019-12-09 16:42:46,633 - Model saved to model/encoder
2019-12-09 16:42:47,923 - Embedding dimensions: OrderedDict([('product', 133050), ('category_lvl_3', 55)])
2019-12-09 16:42:48,606 - Model saved to model/meta_dict
2019-12-09 16:42:48,693 - Convert sequence and wordfreq to ID
2019

In [7]:
sequences_dset = SequencesDataset(sequences)

In [8]:
sequences_dload = DataLoader(sequences_dset, batch_size=batchsize, shuffle=shuffle, num_workers=num_workers, collate_fn=sequences_dset.collate)

In [9]:
device = 'cpu'

In [12]:
sequences.emb_sizes['product']

133050

In [45]:
skipgram = SkipGram(sequences.emb_sizes, emb_dim).to(device)

2019-12-09 16:46:29,886 - Model initialized: SkipGram(
  (center_embeddings): ModuleList(
    (0): Embedding(133050, 8, sparse=True)
    (1): Embedding(55, 8, sparse=True)
  )
  (context_embeddings): ModuleList(
    (0): Embedding(133050, 8, sparse=True)
    (1): Embedding(55, 8, sparse=True)
  )
  (emb_weights): Embedding(133050, 2, sparse=True)
  (emb_weights_softmax): Softmax(dim=1)
)


### Train code

In [27]:
optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)

for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(sequences_dload))
    
    running_loss = 0
    for i, batches in enumerate(sequences_dload):

        # logger.info('Batch shape: {}, {}, {}'.format(batches[0].shape, batches[1].shape, batches[2].shape))
        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        loss = skipgram.forward(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1

        if i % 1000 == 0:
            logger.info('Epoch: {:,}, Seq Count: {:,}/{}, Loss: {:.4f}, Lr: {:.6f}'.format(epoch, i, len(sequences_dload), running_loss,
                                                                                        optimizer.param_groups[0][
                                                                                            'lr']))
            running_loss = 0

    # skipgram.save_embeddings(file_name='{}/skipgram_epoch_{}.npy'.format(MODEL_PATH, epoch))

2019-12-09 16:40:01,724 - Epoch: 0, Seq Count: 0/5000, Loss: 0.4157, Lr: 0.025000
2019-12-09 16:40:10,821 - Epoch: 0, Seq Count: 1,000/5000, Loss: 4.0503, Lr: 0.022608
2019-12-09 16:40:20,148 - Epoch: 0, Seq Count: 2,000/5000, Loss: 4.1510, Lr: 0.016355
2019-12-09 16:40:29,328 - Epoch: 0, Seq Count: 3,000/5000, Loss: 4.1650, Lr: 0.008630
2019-12-09 16:40:38,706 - Epoch: 0, Seq Count: 4,000/5000, Loss: 4.1127, Lr: 0.002383


In [28]:
centers.shape

torch.Size([65, 2])

In [None]:
centers

In [None]:
emb_centers = []
for i in range(centers.shape[1]):
    logger.info('center i: {}, center: {}'.format(i, centers[:, i]))
    emb_centers.append(skipgram.center_embeddings[i](centers[:, i]))
emb_center = torch.mean(torch.stack(emb_centers), axis=0)

In [None]:
sequences.n_unique_tokens

In [None]:
sequences.emb_sizes

In [None]:
emb_weights = nn.Embedding(sequences.n_unique_tokens, len(sequences.emb_sizes))
emb_equal_weight = 1 / (len(sequences.emb_sizes) + 1)
emb_weights.weight.data.uniform_(emb_equal_weight, emb_equal_weight)

emb_weights_softmax = nn.Softmax(dim=1)

In [None]:
centers[:, 0]

In [None]:
emb_weightage = emb_weights(centers[:, 0])

In [None]:
emb_weightage[:5]

In [None]:
emb_weightage_norm = emb_weights_softmax(emb_weightage)

In [None]:
emb_weightage_norm[:5]

In [None]:
embs = torch.stack(emb_centers)

In [None]:
embs_weighted = embs * emb_weightage_norm.T.unsqueeze(2).expand_as(embs)

In [None]:
torch.sum(embs_weighted, axis=0)

In [29]:
centers[:, 0]

tensor([ 3447,  3447,  3447,  3447,  3447, 17085, 17085, 17085, 17085, 17085,
        13751, 13751, 13751, 13751, 13751, 13751, 28690, 28690, 28690, 28690,
        28690, 28690, 28690, 28691, 28691, 28691, 28691, 28691, 28691, 28691,
        28691, 28692, 28692, 28692, 28692, 28692, 28692, 28692, 28692,  9208,
         9208,  9208,  9208,  9208,  9208,  9208,  9208, 28693, 28693, 28693,
        28693, 28693, 28693, 28693, 11462, 11462, 11462, 11462, 11462, 11462,
        28694, 28694, 28694, 28694, 28694])

In [47]:
skipgram.emb_weights(centers[:, 0])[:5]

tensor([[0.8368, 0.8811],
        [0.8368, 0.8811],
        [0.8368, 0.8811],
        [0.5910, 0.6241],
        [0.5910, 0.6241]], grad_fn=<SliceBackward>)

In [50]:
skipgram.get_embedding(centers)[:5]

tensor([[-0.0489, -0.0327, -0.0250,  0.0094, -0.0116, -0.0125,  0.0161,  0.0340],
        [-0.0489, -0.0327, -0.0250,  0.0094, -0.0116, -0.0125,  0.0161,  0.0340],
        [-0.0489, -0.0327, -0.0250,  0.0094, -0.0116, -0.0125,  0.0161,  0.0340],
        [ 0.0306, -0.0034, -0.0086,  0.0134, -0.0040, -0.0083,  0.0137, -0.0009],
        [ 0.0306, -0.0034, -0.0086,  0.0134, -0.0040, -0.0083,  0.0137, -0.0009]],
       grad_fn=<SliceBackward>)

### Save torch params

In [None]:
torch.save(skipgram.state_dict(), '../model/skipgram_sample.pt')

In [None]:
model = SkipGram(sequences.n_unique_tokens, emb_dim).to(device)

In [None]:
model.load_state_dict(torch.load('../model/skipgram_sample.pt'))

In [None]:
model.eval()

### Check with validation

In [32]:
val_samp = pd.read_csv('../data/{}_edges_val_samp.csv'.format(dataset), dtype={'product1': 'object', 'product2': 'object'})

In [36]:
word2id = load_model('../model/word2id')

2019-12-09 16:44:05,420 - Model loaded from: ../model/word2id (Size: 16818322 bytes)


In [37]:
word2id_func =  np.vectorize(sequences.get_product_id)

In [38]:
val_samp['product1_id'] = word2id_func(val_samp['product1'].values)
val_samp['product2_id'] = word2id_func(val_samp['product2'].values)

In [39]:
def get_id_and_meta(product_id):
    return [product_id] + sequences.get_meta(product_id)

In [40]:
val_product1 = val_samp['product1_id'].apply(get_id_and_meta)
val_product2 = val_samp['product2_id'].apply(get_id_and_meta)

In [41]:
val_product1

0        [105831, 39]
1        [117491, 34]
2          [36325, 5]
3        [104235, 20]
4          [55705, 3]
             ...     
99995     [67609, 25]
99996     [107264, 5]
99997     [20998, 10]
99998      [17168, 5]
99999    [108845, 19]
Name: product1_id, Length: 100000, dtype: object

In [42]:
optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)

for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(sequences_dload))
    
    running_loss = 0
    for i, batches in enumerate(sequences_dload):

        # logger.info('Batch shape: {}, {}, {}'.format(batches[0].shape, batches[1].shape, batches[2].shape))
        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        loss = skipgram.forward(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1

        if i % 100 == 0:
            # Validation Check
            with torch.no_grad():
                product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1).to(device))
                product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2).to(device))
                cos_sim = F.cosine_similarity(product1_emb, product2_emb)
                score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy())

            logger.info("Epoch: {}, Seq: {:,}/{:,}, " \
                        "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(sequences_dload), running_loss,
                                                                           score, optimizer.param_groups[0]['lr']))
            running_loss = 0

2019-12-09 16:44:28,805 - Epoch: 0, Seq: 0/5,000, Loss: 0.4147, AUC-ROC: 0.5321, Lr: 0.025000


KeyboardInterrupt: 

In [43]:
product1_emb

tensor([[ 0.3912, -0.1188,  0.0619,  ...,  0.2390, -0.1749,  0.2658],
        [-0.1681, -0.0960,  0.5144,  ...,  0.3660, -0.2210,  0.1719],
        [ 0.0141,  0.0335, -0.2797,  ...,  0.5986, -0.2581,  0.2112],
        ...,
        [-0.1865,  0.3298, -0.3989,  ..., -0.0358,  0.1962,  0.2721],
        [-0.2255,  0.5091, -0.3923,  ...,  0.6163, -0.0902,  0.0494],
        [-0.4326,  0.4178, -0.1241,  ...,  0.2138,  0.5804,  0.0941]])

In [44]:
logger.info('{}'.format(skipgram))

2019-12-09 16:45:44,440 - SkipGram(
  (center_embeddings): ModuleList(
    (0): Embedding(133050, 8, sparse=True)
    (1): Embedding(55, 8, sparse=True)
  )
  (context_embeddings): ModuleList(
    (0): Embedding(133050, 8, sparse=True)
    (1): Embedding(55, 8, sparse=True)
  )
  (emb_weights): Embedding(133050, 2, sparse=True)
  (emb_weights_softmax): Softmax(dim=1)
)


In [None]:
centers = torch.LongTensor(val_product1[:5])

In [None]:
emb_centers = []
for row_idx, center in enumerate(centers):
    # logger.info('Row idx: {}, Center: {}'.format(row_idx, center))
    emb_center = []
    for col_idx, center_ in enumerate(center):
        logger.info('Row idx: {}, col idx: {}, center_: {}'.format(row_idx, col_idx, center_))
        emb_center.append(skipgram.center_embeddings[col_idx](center_))
        
    emb_centers.append(torch.mean(torch.stack(emb_center), axis=0))

In [None]:
torch.stack(emb_centers)

In [None]:
torch.mean(torch.stack(emb_center), axis=0)

In [None]:
skipgram.center_embeddings[1]

In [None]:
product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1[:5]))

In [None]:
product1_emb

In [None]:
product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2[:5]))

In [None]:
product1_emb

In [None]:
F.cosine_similarity(product1_emb, product2_emb)

In [None]:
product1_emb

In [None]:
torch.stack(product1_emb)

In [None]:
val_product2

In [None]:
val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)]

In [None]:
val_samp

In [None]:
product1_emb = model.get_center_emb(torch.LongTensor(product1_id))
product2_emb = model.get_center_emb(torch.LongTensor(product2_id))

In [None]:
product1_emb

In [None]:
cos_sim = F.cosine_similarity(product1_emb, product2_emb)
cos_sim

In [None]:
cos_sim.detach().numpy()

In [None]:
x = np.array([-0.2257,  0.2379, -0.2139,  0.2115,  0.2185, -0.2326,  0.2114, -0.2235])
y = np.array([-0.2150, -0.1220,  0.0284,  0.2917,  0.1297, -0.2589, -0.1423, -0.2585])

In [None]:
np.inner(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [None]:
product1_tensor

In [None]:
print(emb)

In [None]:
skipgram.state_dict()

### Scratch

In [None]:
meta_cols = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand']

In [None]:
cat = meta_cols.copy()

In [None]:
cat

In [None]:
cat.remove('asin')

In [None]:
cat

In [None]:
meta_cols