# Embedding Store - abstraction for downstream use case

# Set up

In [1]:
import sys
from pathlib import Path

import pandas as pd
import torch

sys.path.insert(0, "..")  # Need this for the torch load to work
from src.id_mapper import IDMapper

# Load model

In [2]:
run_name = "003-increase-batch-size"
model_path = f"data/{run_name}/skipgram_model_full.pth"
id_mapping_path = "../data/idm.json"

In [3]:
model = torch.load(model_path)
model = model.to("cpu")

idm = IDMapper().load(id_mapping_path)
id_mapper = {
    "id_to_idx": idm.item_to_index,
    "idx_to_id": {i: v for i, v in enumerate(idm.index_to_item)},
}

  model = torch.load(model_path)


In [4]:
model.embeddings(torch.tensor([1, 2, 3]))

tensor([[-2.4360e-03,  4.3166e-02, -9.3488e-02,  1.9916e-02,  2.7102e-02,
          1.5594e-02, -1.1986e-02,  1.9114e-01,  1.2731e-02,  2.3147e-02,
          3.0469e-01, -3.9866e-02,  6.9936e-02, -2.3489e-01, -1.7257e-01,
          8.1035e-02,  3.6162e-03,  8.7359e-02,  3.4401e-02,  9.4056e-02,
         -1.1891e-01, -1.4028e-01,  6.4878e-02,  3.7294e-01, -9.1436e-02,
          9.2053e-03, -6.7754e-02, -4.1515e-02,  2.7679e-03,  5.4015e-02,
         -4.4041e-02, -2.9830e-02, -2.1544e-03, -8.5978e-02, -1.5885e-01,
         -3.3294e-01, -2.0594e-01, -1.8462e-02, -1.4763e-01,  3.9161e-01,
          1.1713e-01,  2.1464e-01,  5.8163e-02,  7.4750e-02,  4.0880e-02,
         -7.0310e-03, -2.6233e-01,  6.7790e-02,  3.6532e-02,  1.2249e-02,
          5.2275e-02,  1.1784e-01,  2.6386e-02,  2.9819e-01, -2.0051e-01,
         -7.4982e-02, -1.5800e-02,  2.9282e-02, -1.5568e-02, -2.0330e-01,
          1.0215e-01,  7.3099e-02, -9.7547e-02, -6.3989e-02,  5.7653e-02,
          5.5505e-02,  4.3651e-03, -1.

# Init Embedding Store

In [5]:
from src.embedding_store import TorchEmbeddingStore

In [6]:
embs = TorchEmbeddingStore(id_mapper, model.embeddings)

In [7]:
type(model.embeddings)

torch.nn.modules.sparse.Embedding

In [8]:
embs.get_emb(1)

tensor([-0.0024,  0.0432, -0.0935,  0.0199,  0.0271,  0.0156, -0.0120,  0.1911,
         0.0127,  0.0231,  0.3047, -0.0399,  0.0699, -0.2349, -0.1726,  0.0810,
         0.0036,  0.0874,  0.0344,  0.0941, -0.1189, -0.1403,  0.0649,  0.3729,
        -0.0914,  0.0092, -0.0678, -0.0415,  0.0028,  0.0540, -0.0440, -0.0298,
        -0.0022, -0.0860, -0.1589, -0.3329, -0.2059, -0.0185, -0.1476,  0.3916,
         0.1171,  0.2146,  0.0582,  0.0747,  0.0409, -0.0070, -0.2623,  0.0678,
         0.0365,  0.0122,  0.0523,  0.1178,  0.0264,  0.2982, -0.2005, -0.0750,
        -0.0158,  0.0293, -0.0156, -0.2033,  0.1022,  0.0731, -0.0975, -0.0640,
         0.0577,  0.0555,  0.0044, -0.1516, -0.0613, -0.0145, -0.0151, -0.0765,
        -0.1222,  0.0704, -0.0635, -0.0071,  0.1630,  0.2253, -0.0776,  0.2430,
        -0.0866, -0.0111,  0.0865, -0.0278, -0.0668,  0.1336,  0.2901, -0.1372,
         0.0430,  0.0281,  0.1156, -0.0299,  0.2180, -0.0168, -0.2976,  0.0378,
        -0.0263, -0.2304, -0.0647,  0.04

# Load data

In [9]:
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")

In [10]:
items = train_df["parent_asin"].unique()
embeddings = embs.get_emb(items)

In [11]:
assert len(embeddings) == len(items)
assert embeddings.shape[1] == embs.embedding_dim

# Persist

In [12]:
persist_fp = "model/item2vec_embeddings.pth"
Path(persist_fp).parents[0].mkdir(exist_ok=True)
embs.save(persist_fp)

In [13]:
loaded_embs = TorchEmbeddingStore.load(persist_fp)

  checkpoint = torch.load(file_path)


In [14]:
assert embs.embedding_dim == loaded_embs.embedding_dim
assert all(embs.get_emb(0) == loaded_embs.get_emb(0))