# Embedding Store - an abstraction for downstream use cases

# Setup

In [1]:
import sys
from pathlib import Path

import pandas as pd
import torch

sys.path.insert(0, "..")  # Need this for the torch load to work
from src.id_mapper import IDMapper

# Load model

In [2]:
# Name of the training run; it is interpolated into the checkpoint path below.
run_name = "003-increase-batch-size"
# Checkpoint file for that run (presumably the full pickled skip-gram model,
# judging by the file name). The path is relative to the working directory.
model_path = f"data/{run_name}/skipgram_model_full.pth"
# File the IDMapper is loaded from.
# NOTE(review): this lives under "../data" while model_path lives under
# "data" -- confirm that both locations are intended.
id_mapping_path = "../data/idm.json"

In [3]:
# Load the pickled model object straight onto the CPU.
# - map_location="cpu" remaps every stored tensor while unpickling, so a
#   checkpoint saved from a GPU run also loads on a machine without CUDA
#   (a separate .to("cpu") afterwards would be too late in that case).
# - weights_only=False is stated explicitly because the file holds a whole
#   model object rather than a state_dict; newer PyTorch releases warn about,
#   and then change, the implicit default.
# NOTE: this is a full unpickle and can execute arbitrary code -- only load
# checkpoints that come from a trusted source.
model = torch.load(model_path, map_location="cpu", weights_only=False)

idm = IDMapper().load(id_mapping_path)
# Two-way lookup handed to the embedding store: item id -> row index, and
# row index -> item id (built from the position of each item in
# idm.index_to_item).
id_mapper = {
    "id_to_idx": idm.item_to_index,
    "idx_to_id": dict(enumerate(idm.index_to_item)),
}

  model = torch.load(model_path)


In [4]:
# Sanity check: look up rows 1, 2 and 3 directly on the model's embedding
# layer and display the result.
model.embeddings(torch.tensor([1, 2, 3]))

tensor([[-3.3563e-01, -6.7494e-02, -1.2587e-01, -9.1505e-02,  8.3021e-02,
          1.6040e-01, -3.2645e-02, -8.6713e-02,  3.8964e-02,  6.9410e-02,
          9.3355e-02,  1.2719e-01,  3.2712e-01,  7.7326e-03, -1.7127e-01,
          7.5069e-02,  6.6423e-02,  8.3719e-02,  2.4073e-01, -2.1420e-01,
         -3.7354e-02, -6.9796e-02, -4.1970e-02, -2.1700e-02,  5.1872e-02,
          1.6061e-01, -1.9597e-01,  6.6261e-02, -7.2800e-02,  9.5779e-02,
         -1.3496e-01, -8.7845e-02,  1.6247e-01,  6.8214e-03, -1.6455e-03,
          1.3581e-01,  2.4187e-01, -7.2012e-02, -3.9196e-02,  1.4857e-01,
          1.2731e-01,  1.0663e-01, -1.5828e-01, -2.6537e-01, -1.1889e-01,
          8.2768e-02,  3.9514e-02,  1.6478e-03, -4.1616e-02,  2.4122e-02,
         -8.9388e-02,  9.4123e-02,  9.0517e-02, -8.1978e-02, -9.3368e-02,
         -1.4800e-01,  1.6849e-01,  4.3713e-02,  3.5719e-01, -1.5130e-03,
          4.9855e-02, -1.1850e-01,  1.4244e-02, -1.0346e-01,  8.2536e-02,
          1.8407e-01,  6.4569e-02,  9.

# Init Embedding Store

In [5]:
from src.embedding_store import TorchEmbeddingStore

In [6]:
# Wrap the ID mapping and the model's embedding layer in the store abstraction.
embs = TorchEmbeddingStore(id_mapper, model.embeddings)

In [7]:
# Show which layer class is being handed to the store.
type(model.embeddings)

torch.nn.modules.sparse.Embedding

In [8]:
# Fetch a single embedding through the store.
# NOTE(review): the printed values appear to match row 1 of the raw layer
# lookup shown earlier, which suggests get_emb is addressed by row index
# here -- confirm against TorchEmbeddingStore.
embs.get_emb(1)

tensor([-0.3356, -0.0675, -0.1259, -0.0915,  0.0830,  0.1604, -0.0326, -0.0867,
         0.0390,  0.0694,  0.0934,  0.1272,  0.3271,  0.0077, -0.1713,  0.0751,
         0.0664,  0.0837,  0.2407, -0.2142, -0.0374, -0.0698, -0.0420, -0.0217,
         0.0519,  0.1606, -0.1960,  0.0663, -0.0728,  0.0958, -0.1350, -0.0878,
         0.1625,  0.0068, -0.0016,  0.1358,  0.2419, -0.0720, -0.0392,  0.1486,
         0.1273,  0.1066, -0.1583, -0.2654, -0.1189,  0.0828,  0.0395,  0.0016,
        -0.0416,  0.0241, -0.0894,  0.0941,  0.0905, -0.0820, -0.0934, -0.1480,
         0.1685,  0.0437,  0.3572, -0.0015,  0.0499, -0.1185,  0.0142, -0.1035,
         0.0825,  0.1841,  0.0646,  0.0935,  0.1118, -0.0436,  0.1233, -0.3307,
         0.0170,  0.0298, -0.0646, -0.0564, -0.0227, -0.0528,  0.0205,  0.0723,
         0.1453, -0.1437,  0.0918,  0.2310, -0.0754, -0.1051,  0.0459, -0.5296,
        -0.2158, -0.0066,  0.0103,  0.0121, -0.4785, -0.3690, -0.1204,  0.1022,
         0.0430,  0.4053,  0.0951,  0.33

# Load data

In [9]:
# Read the training table from Parquet; its "parent_asin" column supplies the
# items looked up in the store below.
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")

In [10]:
# Distinct values of the "parent_asin" column, in order of first appearance.
asin_column = train_df["parent_asin"]
items = asin_column.unique()

# Resolve every distinct item through the store in a single call.
embeddings = embs.get_emb(items)

In [11]:
# One result row per requested item ...
assert len(items) == len(embeddings)
# ... and the second axis matches the dimensionality the store reports.
assert embs.embedding_dim == embeddings.shape[1]

# Persist

In [12]:
# Destination file for the serialized embedding store.
persist_fp = "model/item2vec_embeddings.pth"
# Make sure the target directory exists before saving. parents=True creates
# any missing intermediate directories if persist_fp is later pointed at a
# nested location; exist_ok=True keeps re-running the cell harmless.
Path(persist_fp).parent.mkdir(parents=True, exist_ok=True)
embs.save(persist_fp)

In [13]:
# Rebuild a store from the file written above, to verify the save/load round trip.
loaded_embs = TorchEmbeddingStore.load(persist_fp)

  checkpoint = torch.load(file_path)


In [14]:
# Round-trip check: the reloaded store must report the same dimensionality ...
assert embs.embedding_dim == loaded_embs.embedding_dim
# ... and return the identical vector for the same lookup. torch.equal
# compares shape and values in one call, instead of iterating over the
# elementwise result with the builtin all(), which would let broadcastable
# shape mismatches through and does not work for non-1-D tensors.
assert torch.equal(embs.get_emb(0), loaded_embs.get_emb(0))