In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [8]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

from src.ml.data_loader_with_meta import Sequences, SequencesDataset
from src.ml.skipgram import SkipGram as SkipGramBase
from src.ml.skipgram_with_meta import SkipGram
from src.utils.logger import logger
from src.utils.io_utils import load_model

In [99]:
batchsize = 1
shuffle = False
num_workers = 4
emb_dim = 8
epochs = 1
initial_lr=0.025
MODEL_PATH = '../model'

In [5]:
dataset = 'electronics'

In [3]:
sequences = Sequences('../data/{}_sequences_samp.npy'.format(dataset), 
                      '../data/{}_edges_val_samp.csv'.format(dataset),
                      '../data/{}_meta.csv'.format(dataset))

NameError: name 'Sequences' is not defined

In [29]:
sequences_dset = SequencesDataset(sequences)

In [101]:
sequences_dload = DataLoader(sequences_dset, batch_size=batchsize, shuffle=shuffle, num_workers=num_workers, collate_fn=sequences_dset.collate)

In [31]:
device = 'cpu'

In [32]:
skipgram = SkipGram(sequences.emb_sizes, emb_dim).to(device)

In [9]:
meta = pd.read_csv('../data/{}_meta.csv'.format(dataset))

In [10]:
meta

Unnamed: 0,asin,description,categories,title,price,related,brand,category_lvl_1,category_lvl_2,category_lvl_3,category_lvl_4
0,0132793040,the kelby training dvd mastering blend modes i...,"['electronics', 'computers & accessories', 'ca...",kelby training dvd: mastering blend modes in a...,,0,,electronics,computers & accessories,cables & accessories,monitor accessories
1,0321732944,,"['electronics', 'computers & accessories', 'ca...",kelby training dvd: adobe photoshop cs5 crash ...,,0,,electronics,computers & accessories,cables & accessories,monitor accessories
2,0439886341,digital organizer and messenger,"['electronics', 'computers & accessories', 'pd...",digital organizer and messenger,8.15,1,,electronics,computers & accessories,"pdas, handhelds & accessories",pdas & handhelds
3,0511189877,the clikr-5 ur5u-8780l remote control is desig...,"['electronics', 'accessories & supplies', 'aud...",clikr-5 time warner cable remote control ur5u-...,23.36,1,,electronics,accessories & supplies,audio & video accessories,remote controls
4,0528881469,"like its award-winning predecessor, the intell...","['electronics', 'gps & navigation', 'vehicle g...",rand mcnally 528881469 7-inch intelliroute tnd...,299.99,1,,electronics,gps & navigation,vehicle gps,trucking gps
...,...,...,...,...,...,...,...,...,...,...,...
498191,bt008v9j9u,vehicle suction cup mount (replacement) notice...,"['electronics', 'gps & navigation', 'gps syste...",suction cup mount,21.99,1,garmin,electronics,gps & navigation,gps system accessories,vehicle mounts
498192,bt008sxq4c,quatech - 1 port pcmcia to db-25 parallel adap...,"['electronics', 'computers & accessories', 'ca...",parallel pcmcia card 1port epp,23.99,1,,electronics,computers & accessories,cables & accessories,cables & interconnects
498193,bt008g3w52,c2g - 5m ultma usb 2.0 a mini b cble,"['electronics', 'computers & accessories', 'ca...",c2g / cables to go 5m ultima usb 2.0 cable,18.91,1,c2g,electronics,computers & accessories,cables & accessories,cables & interconnects
498194,bt008uktmw,keyboard drawer.,"['electronics', 'computers & accessories', 'ca...",underdesk keyboard drawer,25.54,1,fellowes,electronics,computers & accessories,cables & accessories,keyboards


### Train code

In [102]:
optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)

for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(sequences_dload))
    
    running_loss = 0
    for i, batches in enumerate(sequences_dload):

        # logger.info('Batch shape: {}, {}, {}'.format(batches[0].shape, batches[1].shape, batches[2].shape))
        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        loss = skipgram.forward(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1

        if i % 1000 == 0:
            logger.info('Epoch: {:,}, Seq Count: {:,}/{}, Loss: {:.4f}, Lr: {:.6f}'.format(epoch, i, len(sequences_dload), running_loss,
                                                                                        optimizer.param_groups[0][
                                                                                            'lr']))
            running_loss = 0

    # skipgram.save_embeddings(file_name='{}/skipgram_epoch_{}.npy'.format(MODEL_PATH, epoch))

2019-12-09 15:14:21,207 - Epoch: 0, Seq Count: 0/5000, Loss: 0.0404, Lr: 0.025000
2019-12-09 15:14:22,184 - Epoch: 0, Seq Count: 100/5000, Loss: 0.9065, Lr: 0.024975
2019-12-09 15:14:23,144 - Epoch: 0, Seq Count: 200/5000, Loss: 0.8626, Lr: 0.024900
2019-12-09 15:14:24,094 - Epoch: 0, Seq Count: 300/5000, Loss: 0.5013, Lr: 0.024777
2019-12-09 15:14:25,045 - Epoch: 0, Seq Count: 400/5000, Loss: 0.7018, Lr: 0.024605
2019-12-09 15:14:25,986 - Epoch: 0, Seq Count: 500/5000, Loss: 0.8305, Lr: 0.024386
2019-12-09 15:14:26,932 - Epoch: 0, Seq Count: 600/5000, Loss: 0.8488, Lr: 0.024119
2019-12-09 15:14:27,907 - Epoch: 0, Seq Count: 700/5000, Loss: 0.8957, Lr: 0.023807
2019-12-09 15:14:28,862 - Epoch: 0, Seq Count: 800/5000, Loss: 0.9884, Lr: 0.023450
2019-12-09 15:14:29,851 - Epoch: 0, Seq Count: 900/5000, Loss: 1.0634, Lr: 0.023050
2019-12-09 15:14:30,839 - Epoch: 0, Seq Count: 1,000/5000, Loss: 0.6308, Lr: 0.022608
2019-12-09 15:14:31,755 - Epoch: 0, Seq Count: 1,100/5000, Loss: 0.9069, Lr:

KeyboardInterrupt: 

In [103]:
centers.shape

torch.Size([65, 2])

In [104]:
centers

tensor([[10662,     4],
        [10662,     4],
        [10662,     4],
        [10662,     4],
        [10662,     4],
        [10671,     0],
        [10671,     0],
        [10671,     0],
        [10671,     0],
        [10671,     0],
        [10679,     3],
        [10679,     3],
        [10679,     3],
        [10679,     3],
        [10679,     3],
        [10679,     3],
        [10699,     0],
        [10699,     0],
        [10699,     0],
        [10699,     0],
        [10699,     0],
        [10699,     0],
        [10699,     0],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [10700,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [ 5923,     3],
        [10701,     0],
        [10701,     0],
        [10701, 

In [105]:
emb_centers = []
for i in range(centers.shape[1]):
    logger.info('center i: {}, center: {}'.format(i, centers[:, i]))
    emb_centers.append(skipgram.center_embeddings[i](centers[:, i]))
emb_center = torch.mean(torch.stack(emb_centers), axis=0)

2019-12-09 15:14:54,195 - center i: 0, center: tensor([10662, 10662, 10662, 10662, 10662, 10671, 10671, 10671, 10671, 10671,
        10679, 10679, 10679, 10679, 10679, 10679, 10699, 10699, 10699, 10699,
        10699, 10699, 10699, 10700, 10700, 10700, 10700, 10700, 10700, 10700,
        10700,  5923,  5923,  5923,  5923,  5923,  5923,  5923,  5923, 10701,
        10701, 10701, 10701, 10701, 10701, 10701, 10701, 10702, 10702, 10702,
        10702, 10702, 10702, 10702, 10703, 10703, 10703, 10703, 10703, 10703,
         2287,  2287,  2287,  2287,  2287])
2019-12-09 15:14:54,197 - center i: 1, center: tensor([4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])


In [None]:
emb_weights = nn.Embedding(133050, len(sequences.meta_dict))

In [None]:
emb_

In [111]:
embs = torch.stack(emb_centers)

In [112]:
embs

tensor([[[ 0.0918,  0.1411,  0.2642,  ...,  0.0383,  0.1361, -0.0501],
         [ 0.0918,  0.1411,  0.2642,  ...,  0.0383,  0.1361, -0.0501],
         [ 0.0918,  0.1411,  0.2642,  ...,  0.0383,  0.1361, -0.0501],
         ...,
         [ 0.0904,  0.1429, -0.1467,  ...,  0.0428, -0.1430, -0.0881],
         [ 0.0904,  0.1429, -0.1467,  ...,  0.0428, -0.1430, -0.0881],
         [ 0.0904,  0.1429, -0.1467,  ...,  0.0428, -0.1430, -0.0881]],

        [[ 1.4873,  0.1554,  1.6058,  ...,  0.1069,  2.0883, -0.1213],
         [ 1.4873,  0.1554,  1.6058,  ...,  0.1069,  2.0883, -0.1213],
         [ 1.4873,  0.1554,  1.6058,  ...,  0.1069,  2.0883, -0.1213],
         ...,
         [-0.6694,  1.9951, -0.4591,  ..., -1.8206, -0.6763, -2.2768],
         [-0.6694,  1.9951, -0.4591,  ..., -1.8206, -0.6763, -2.2768],
         [-0.6694,  1.9951, -0.4591,  ..., -1.8206, -0.6763, -2.2768]]],
       grad_fn=<StackBackward>)

### Save torch params

In [None]:
torch.save(skipgram.state_dict(), '../model/skipgram_sample.pt')

In [None]:
model = SkipGram(sequences.n_unique_tokens, emb_dim).to(device)

In [None]:
model.load_state_dict(torch.load('../model/skipgram_sample.pt'))

In [None]:
model.eval()

### Check with validation

In [35]:
val_samp = pd.read_csv('../data/{}_edges_val_samp.csv'.format(dataset), dtype={'product1': 'object', 'product2': 'object'})

In [36]:
word2id = load_model('../model/word2id')

2019-12-09 14:23:09,692 - Model loaded from: ../model/word2id (Size: 16818322 bytes)


In [37]:
word2id_func =  np.vectorize(sequences.get_product_id)

In [38]:
val_samp['product1_id'] = word2id_func(val_samp['product1'].values)
val_samp['product2_id'] = word2id_func(val_samp['product2'].values)

In [40]:
def get_id_and_meta(product_id):
    return [product_id] + sequences.get_meta(product_id)

In [72]:
val_product1 = val_samp['product1_id'].apply(get_id_and_meta)
val_product2 = val_samp['product2_id'].apply(get_id_and_meta)

In [73]:
val_product1

0        [118483, 11]
1          [60942, 6]
2          [93414, 4]
3          [36156, 3]
4          [65513, 2]
             ...     
99995     [117348, 6]
99996      [63841, 4]
99997      [20998, 2]
99998      [17168, 4]
99999      [52331, 2]
Name: product1_id, Length: 100000, dtype: object

In [86]:
optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)

for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(sequences_dload))
    
    running_loss = 0
    for i, batches in enumerate(sequences_dload):

        # logger.info('Batch shape: {}, {}, {}'.format(batches[0].shape, batches[1].shape, batches[2].shape))
        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        loss = skipgram.forward(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1

        if i % 100 == 0:
            # Validation Check
            with torch.no_grad():
                product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1).to(device))
                product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2).to(device))
                cos_sim = F.cosine_similarity(product1_emb, product2_emb)
                score = roc_auc_score(val_samp['edge'], cos_sim.detach().cpu().numpy())

            logger.info("Epoch: {}, Seq: {:,}/{:,}, " \
                        "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(sequences_dload), running_loss,
                                                                           score, optimizer.param_groups[0]['lr']))
            running_loss = 0

Traceback (most recent call last):
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/eugeneyan/.pyenv/versions/3.7.2/lib/python3.7/multiprocessing/connection.py", line 200, in sen

KeyboardInterrupt: 

In [74]:
centers = torch.LongTensor(val_product1[:5])

In [69]:
emb_centers = []
for row_idx, center in enumerate(centers):
    # logger.info('Row idx: {}, Center: {}'.format(row_idx, center))
    emb_center = []
    for col_idx, center_ in enumerate(center):
        logger.info('Row idx: {}, col idx: {}, center_: {}'.format(row_idx, col_idx, center_))
        emb_center.append(skipgram.center_embeddings[col_idx](center_))
        
    emb_centers.append(torch.mean(torch.stack(emb_center), axis=0))

2019-12-09 14:29:49,446 - Row idx: 0, col idx: 0, center_: 118483
2019-12-09 14:29:49,447 - Row idx: 0, col idx: 1, center_: 11
2019-12-09 14:29:49,448 - Row idx: 1, col idx: 0, center_: 60942
2019-12-09 14:29:49,450 - Row idx: 1, col idx: 1, center_: 6
2019-12-09 14:29:49,452 - Row idx: 2, col idx: 0, center_: 93414
2019-12-09 14:29:49,453 - Row idx: 2, col idx: 1, center_: 4
2019-12-09 14:29:49,454 - Row idx: 3, col idx: 0, center_: 36156
2019-12-09 14:29:49,455 - Row idx: 3, col idx: 1, center_: 3
2019-12-09 14:29:49,456 - Row idx: 4, col idx: 0, center_: 65513
2019-12-09 14:29:49,458 - Row idx: 4, col idx: 1, center_: 2


In [71]:
torch.stack(emb_centers)

tensor([[-0.2340, -0.4264, -0.2047, -0.2885, -0.8206, -0.9066, -0.2055, -0.0785],
        [-0.8490,  0.6206,  0.7294,  0.3560,  0.3803, -1.1591,  0.3385,  0.0555],
        [ 0.7365,  0.0088,  0.8658,  1.5167, -0.4314, -0.0341,  1.1788,  0.2331],
        [-0.1527,  0.1430,  0.0037,  0.3374, -0.4008, -0.4302, -0.0222, -0.2551],
        [-0.5657, -0.4164,  0.6629, -0.5685, -0.5612,  0.5829,  0.3356, -0.8318]],
       grad_fn=<StackBackward>)

In [68]:
torch.mean(torch.stack(emb_center), axis=0)

tensor([-0.5657, -0.4164,  0.6629, -0.5685, -0.5612,  0.5829,  0.3356, -0.8318],
       grad_fn=<MeanBackward1>)

In [57]:
skipgram.center_embeddings[1]

Embedding(15, 8, sparse=True)

In [75]:
product1_emb = skipgram.get_center_emb(torch.LongTensor(val_product1[:5]))

In [76]:
product1_emb

tensor([[-0.2340, -0.4264, -0.2047, -0.2885, -0.8206, -0.9066, -0.2055, -0.0785],
        [-0.8490,  0.6206,  0.7294,  0.3560,  0.3803, -1.1591,  0.3385,  0.0555],
        [ 0.7365,  0.0088,  0.8658,  1.5167, -0.4314, -0.0341,  1.1788,  0.2331],
        [-0.1527,  0.1430,  0.0037,  0.3374, -0.4008, -0.4302, -0.0222, -0.2551],
        [-0.5657, -0.4164,  0.6629, -0.5685, -0.5612,  0.5829,  0.3356, -0.8318]],
       grad_fn=<StackBackward>)

In [78]:
product2_emb = skipgram.get_center_emb(torch.LongTensor(val_product2[:5]))

In [81]:
product1_emb

tensor([[-0.2340, -0.4264, -0.2047, -0.2885, -0.8206, -0.9066, -0.2055, -0.0785],
        [-0.8490,  0.6206,  0.7294,  0.3560,  0.3803, -1.1591,  0.3385,  0.0555],
        [ 0.7365,  0.0088,  0.8658,  1.5167, -0.4314, -0.0341,  1.1788,  0.2331],
        [-0.1527,  0.1430,  0.0037,  0.3374, -0.4008, -0.4302, -0.0222, -0.2551],
        [-0.5657, -0.4164,  0.6629, -0.5685, -0.5612,  0.5829,  0.3356, -0.8318]],
       grad_fn=<StackBackward>)

In [80]:
F.cosine_similarity(product1_emb, product2_emb)

tensor([ 0.1303, -0.0470,  0.0279,  0.2951,  0.9985], grad_fn=<DivBackward0>)

In [None]:
product1_emb

In [None]:
torch.stack(product1_emb)

In [None]:
val_product2

In [None]:
val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)]

In [None]:
val_samp

In [None]:
product1_emb = model.get_center_emb(torch.LongTensor(product1_id))
product2_emb = model.get_center_emb(torch.LongTensor(product2_id))

In [None]:
product1_emb

In [None]:
cos_sim = F.cosine_similarity(product1_emb, product2_emb)
cos_sim

In [None]:
cos_sim.detach().numpy()

In [None]:
x = np.array([-0.2257,  0.2379, -0.2139,  0.2115,  0.2185, -0.2326,  0.2114, -0.2235])
y = np.array([-0.2150, -0.1220,  0.0284,  0.2917,  0.1297, -0.2589, -0.1423, -0.2585])

In [None]:
np.inner(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [None]:
product1_tensor

In [None]:
print(emb)

In [None]:
skipgram.state_dict()

### Scratch

In [None]:
meta_cols = ['asin', 'price', 'category_lvl_2', 'category_lvl_3', 'category_lvl_4', 'brand']

In [None]:
cat = meta_cols.copy()

In [None]:
cat

In [None]:
cat.remove('asin')

In [None]:
cat

In [None]:
meta_cols