# Embedding Store - abstraction for downstream use case

# Set up

In [1]:
import sys
from pathlib import Path

import pandas as pd
import torch

sys.path.insert(0, "..")  # Need this for the torch load to work
from src.id_mapper import IDMapper

# Load model

In [2]:
run_name = "003-increase-batch-size"
model_path = f"data/{run_name}/skipgram_model_full.pth"
id_mapping_path = "../data/idm.json"

In [3]:
model = torch.load(model_path)
model = model.to("cpu")

idm = IDMapper().load(id_mapping_path)
id_mapper = {
    "id_to_idx": idm.item_to_index,
    "idx_to_id": {i: v for i, v in enumerate(idm.index_to_item)},
}

  model = torch.load(model_path)


In [4]:
model.embeddings(torch.tensor([1, 2, 3]))

tensor([[-0.1317, -0.0332, -0.0177, -0.0446,  0.0536, -0.1227, -0.1424,  0.1958,
          0.0657,  0.0272, -0.1571,  0.0502,  0.1533, -0.1205,  0.0916, -0.1281,
         -0.0331,  0.1181,  0.2133, -0.0450,  0.1496, -0.0096,  0.0081, -0.0941,
         -0.0172,  0.0733,  0.1375, -0.2049, -0.1056, -0.0068,  0.3991,  0.2537,
          0.1147,  0.1821, -0.2608, -0.0653,  0.0886,  0.0674, -0.0380,  0.0712,
          0.1384, -0.1220,  0.0896,  0.0078,  0.1579,  0.1571, -0.0678,  0.0752,
         -0.0978, -0.0072, -0.0760, -0.1158, -0.0801, -0.0125,  0.4491, -0.0100,
          0.2482,  0.0468, -0.0301, -0.1946, -0.0987, -0.0975,  0.0178, -0.0675,
          0.0204, -0.0738, -0.1384,  0.1261, -0.1532,  0.1945, -0.1171, -0.1872,
          0.0741, -0.0638, -0.1399,  0.1967,  0.0769,  0.1532,  0.0869,  0.0390,
         -0.0419,  0.2155, -0.0271,  0.0935, -0.0526, -0.0776,  0.0850, -0.1611,
          0.0043,  0.1649, -0.2891, -0.0173, -0.1837, -0.0094, -0.1548,  0.0213,
          0.0256,  0.2108,  

# Init Embedding Store

In [5]:
from src.embedding_store import TorchEmbeddingStore

In [6]:
embs = TorchEmbeddingStore(id_mapper, model.embeddings)

In [7]:
type(model.embeddings)

torch.nn.modules.sparse.Embedding

In [8]:
embs.get_emb(1)

tensor([-0.1317, -0.0332, -0.0177, -0.0446,  0.0536, -0.1227, -0.1424,  0.1958,
         0.0657,  0.0272, -0.1571,  0.0502,  0.1533, -0.1205,  0.0916, -0.1281,
        -0.0331,  0.1181,  0.2133, -0.0450,  0.1496, -0.0096,  0.0081, -0.0941,
        -0.0172,  0.0733,  0.1375, -0.2049, -0.1056, -0.0068,  0.3991,  0.2537,
         0.1147,  0.1821, -0.2608, -0.0653,  0.0886,  0.0674, -0.0380,  0.0712,
         0.1384, -0.1220,  0.0896,  0.0078,  0.1579,  0.1571, -0.0678,  0.0752,
        -0.0978, -0.0072, -0.0760, -0.1158, -0.0801, -0.0125,  0.4491, -0.0100,
         0.2482,  0.0468, -0.0301, -0.1946, -0.0987, -0.0975,  0.0178, -0.0675,
         0.0204, -0.0738, -0.1384,  0.1261, -0.1532,  0.1945, -0.1171, -0.1872,
         0.0741, -0.0638, -0.1399,  0.1967,  0.0769,  0.1532,  0.0869,  0.0390,
        -0.0419,  0.2155, -0.0271,  0.0935, -0.0526, -0.0776,  0.0850, -0.1611,
         0.0043,  0.1649, -0.2891, -0.0173, -0.1837, -0.0094, -0.1548,  0.0213,
         0.0256,  0.2108,  0.2230,  0.07

# Load data

In [9]:
train_df = pd.read_parquet("../data/train_features_neg_df.parquet")

In [10]:
items = train_df["parent_asin"].unique()
embeddings = embs.get_emb(items)

In [11]:
assert len(embeddings) == len(items)
assert embeddings.shape[1] == embs.embedding_dim

# Persist

In [12]:
persist_fp = "model/item2vec_embeddings.pth"
Path(persist_fp).parents[0].mkdir(exist_ok=True)
embs.save(persist_fp)

In [13]:
loaded_embs = TorchEmbeddingStore.load(persist_fp)

  checkpoint = torch.load(file_path)


In [14]:
assert embs.embedding_dim == loaded_embs.embedding_dim
assert all(embs.get_emb(0) == loaded_embs.get_emb(0))