# Embedding Store - abstraction for downstream use case

# Set up

In [1]:
import torch
import json
import pandas as pd
import numpy as np
from loguru import logger
import sys
from pathlib import Path
from typing import Union, List

sys.path.insert(0, '..')  # Need this for the torch load to work

# Load model

In [2]:
# Name of the training run whose artifacts this notebook loads.
run_name = '003-increase-batch-size'

# The model checkpoint and the id mapping sit side by side in the run's
# data directory, so build that prefix once and reuse it.
run_dir = f'data/{run_name}'
model_path = f'{run_dir}/skipgram_model_full.pth'
id_mapping_path = f'{run_dir}/skipgram_id_mapping.json'

In [3]:
# Deserialize the whole pickled model object (not just a state_dict).
# - map_location="cpu" remaps storages while loading, so a checkpoint saved
#   from a GPU still loads on a machine without CUDA.
# - weights_only=False is spelled out because a full module cannot be loaded
#   in weights-only mode; being explicit keeps the load working regardless of
#   the installed torch version's default.
# NOTE(security): this unpickles arbitrary Python objects. Only load
# checkpoints produced by a trusted training run.
model = torch.load(model_path, map_location="cpu", weights_only=False)
# Harmless after map_location; kept so the model is on the CPU no matter how
# it was saved.
model = model.to("cpu")

# Mapping passed to the embedding store; JSON is read as UTF-8 explicitly so
# the result does not depend on the platform's default encoding.
with open(id_mapping_path, 'r', encoding='utf-8') as f:
    id_mapper = json.load(f)

  model = torch.load(model_path)


In [4]:
# Sanity check: query the model's embedding layer directly with the index
# tensor [1, 2, 3] and display the resulting rows.
model.embeddings(torch.tensor([1,2,3]))

tensor([[ 3.0697e-01, -1.9366e-01, -3.6119e-01, -2.3926e-01,  5.1450e-02,
         -2.1186e-01, -8.7054e-02,  1.0064e-01,  1.0387e-01, -1.6844e-01,
         -2.0294e-01,  2.2540e-01, -4.1290e-01, -2.7462e-01, -2.3910e-01,
          1.2672e-01,  8.9339e-02,  1.9948e-01,  1.0796e-01, -1.0444e-01,
         -1.4779e-01, -4.2136e-02, -1.2616e-01, -1.0927e-01, -7.8715e-02,
          1.0555e-01,  7.1937e-02, -4.2882e-02,  2.9020e-01,  2.2560e-01,
         -6.9849e-02, -1.5955e-01,  7.9546e-02, -1.2914e-01,  3.9749e-01,
         -1.7765e-01, -1.4966e-01,  2.4029e-01, -1.0989e-01, -1.1080e-01,
         -6.1592e-02,  2.0496e-01,  3.2974e-01,  1.8612e-01, -4.8572e-02,
          3.1325e-01, -3.6450e-01, -1.5718e-01,  9.6622e-02, -2.6860e-01,
         -8.3292e-02, -3.9046e-01, -8.8346e-02,  1.7577e-01, -3.0005e-02,
         -3.6283e-01,  5.7607e-01, -1.5172e-01,  4.1116e-01,  1.4046e-01,
          2.1911e-01, -1.2773e-01, -7.8384e-02, -3.2731e-02, -1.4895e-01,
         -4.2191e-02,  2.0459e-01,  4.

# Init Embedding Store

In [5]:
from src.embedding_store import TorchEmbeddingStore

In [6]:
# Build the store from the loaded id mapping and the model's embedding layer.
embs = TorchEmbeddingStore(id_mapper, model.embeddings)

In [7]:
# Display the class of the layer that was handed to the store.
type(model.embeddings)

torch.nn.modules.sparse.Embedding

In [8]:
# Fetch one embedding through the store. The displayed values agree (up to
# print precision) with the first row of the direct layer lookup for index 1.
# NOTE(review): presumably an int argument is treated as a raw row index
# rather than an item id; confirm against TorchEmbeddingStore.get_emb.
embs.get_emb(1)

tensor([ 0.3070, -0.1937, -0.3612, -0.2393,  0.0514, -0.2119, -0.0871,  0.1006,
         0.1039, -0.1684, -0.2029,  0.2254, -0.4129, -0.2746, -0.2391,  0.1267,
         0.0893,  0.1995,  0.1080, -0.1044, -0.1478, -0.0421, -0.1262, -0.1093,
        -0.0787,  0.1056,  0.0719, -0.0429,  0.2902,  0.2256, -0.0698, -0.1595,
         0.0795, -0.1291,  0.3975, -0.1777, -0.1497,  0.2403, -0.1099, -0.1108,
        -0.0616,  0.2050,  0.3297,  0.1861, -0.0486,  0.3133, -0.3645, -0.1572,
         0.0966, -0.2686, -0.0833, -0.3905, -0.0883,  0.1758, -0.0300, -0.3628,
         0.5761, -0.1517,  0.4112,  0.1405,  0.2191, -0.1277, -0.0784, -0.0327,
        -0.1490, -0.0422,  0.2046,  0.0467, -0.0588, -0.0991,  0.0456,  0.0160,
        -0.3468,  0.1730,  0.0922,  0.0191,  0.2467, -0.1767, -0.1152,  0.1358,
        -0.0116,  0.0501, -0.0816,  0.1474, -0.0017, -0.0805, -0.0441, -0.0526,
         0.0676, -0.2905,  0.1534,  0.1865, -0.1955,  0.1015, -0.2502, -0.0394,
        -0.0804, -0.4203,  0.5659, -0.36

# Load data

In [9]:
# Load the training feature table from parquet (path is relative to the
# notebook's working directory).
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")

In [10]:
# Distinct 'parent_asin' values present in the training data.
items = train_df['parent_asin'].unique()
# Batch lookup: the whole array of values is passed to the store in one call.
embeddings = embs.get_emb(items)

In [11]:
# The batch lookup must return exactly one row per requested item ...
assert len(embeddings) == len(items)
# ... and the second dimension must equal the store's reported embedding size.
assert embeddings.shape[1] == embs.embedding_dim

# Persist

In [12]:
# Destination file for the serialized embedding store.
persist_fp = 'model/item2vec_embeddings.pth'
# Make sure the target directory exists before saving. parents=True also
# creates any missing intermediate directories, so the cell keeps working if
# persist_fp is later pointed at a deeper path; exist_ok=True makes re-running
# the cell safe.
Path(persist_fp).parent.mkdir(parents=True, exist_ok=True)
embs.save(persist_fp)

In [13]:
# Round-trip check, step 1: rebuild a store from the file at persist_fp.
loaded_embs = TorchEmbeddingStore.load(persist_fp)

  checkpoint = torch.load(file_path)


In [14]:
# The reloaded store must report the same embedding size as the original ...
assert embs.embedding_dim == loaded_embs.embedding_dim
# ... and return an identical vector for the same lookup. torch.equal compares
# shape and every element in a single call, whereas builtin all() over an
# element-wise comparison loops in Python and would accept tensors whose
# shapes merely broadcast against each other.
assert torch.equal(embs.get_emb(0), loaded_embs.get_emb(0))