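# Batch pre-computed recommendations

Load the champion item2vec model from MLflow, index its item embeddings with FAISS, compute the top-K nearest-neighbour recommendations for every item, and save them as JSON Lines.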

# Set up

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys
from typing import Optional

import mlflow
import torch
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from tqdm.auto import tqdm

load_dotenv()

sys.path.insert(0, "..")

from src.vector_search import FaissNN

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for this notebook.

    The path fields start out as ``None`` and are filled in by :meth:`init`,
    so they are declared ``Optional[str]`` rather than ``str``.
    """

    # Not read anywhere in the visible notebook.
    testing: bool = False
    # Names the per-run output folder (data/<run_name>).
    run_name: str = "005-refactor"
    # Absolute path of the per-run output folder; set by init().
    notebook_persist_dp: Optional[str] = None
    # Not read anywhere in the visible notebook.
    random_seed: int = 41
    # Not read anywhere in the visible notebook.
    device: Optional[str] = None

    # Number of recommendations kept per item in the batch job.
    top_K: int = 100
    # Number of neighbours fetched in the single-item sanity check.
    top_k: int = 10

    # Not read in the visible notebook; the ANN cell derives the dimension
    # from the loaded model instead.
    embedding_dim: int = 128

    # Registered model name used to build the MLflow model URI.
    mlf_model_name: str = "item2vec"

    # Output JSONL path for the batch recommendations; set by init().
    batch_recs_fp: Optional[str] = None

    def init(self) -> "Args":
        """Resolve the derived paths and create the output folder.

        The folder is ``data/<run_name>`` resolved against the current working
        directory; it is created if it does not exist yet.

        Returns:
            This same instance, so the call can be chained after construction.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)
        self.batch_recs_fp = f"{self.notebook_persist_dp}/batch_recs.jsonl"

        return self


# Build the config from its defaults and resolve the derived paths
# (this also creates the per-run output folder).
args = Args().init()

# Echo the resolved configuration as the cell output.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "005-refactor",
  "notebook_persist_dp": "/Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/005-refactor",
  "random_seed": 41,
  "device": null,
  "top_K": 100,
  "top_k": 10,
  "embedding_dim": 128,
  "mlf_model_name": "item2vec",
  "batch_recs_fp": "/Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/005-refactor/batch_recs.jsonl"
}


# Load model

In [4]:
# Tracking client, used further down to look up the run behind the loaded model.
mlf_client = mlflow.MlflowClient()

In [5]:
# Load the registered model version that currently carries the "champion" alias.
model = mlflow.pyfunc.load_model(model_uri=f"models:/{args.mlf_model_name}@champion")

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Resolve the artifact location of the run that produced the loaded model.
run_id = model.metadata.run_id
run_info = mlf_client.get_run(run_id).info
artifact_uri = run_info.artifact_uri

In [7]:
# Fetch the input example stored under the run's "inferrer" artifact folder.
sample_input = mlflow.artifacts.load_dict(f"{artifact_uri}/inferrer/input_example.json")
sample_input

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

{'item_1_ids': ['B00DNWHEVU'], 'item_2_ids': ['B00IUGYTRQ']}

In [8]:
# Smoke test: score the stored example pair through the pyfunc interface.
prediction = model.predict(sample_input)
prediction

{'item_1_ids': ['B00DNWHEVU'],
 'item_2_ids': ['B00IUGYTRQ'],
 'scores': [0.244841530919075]}

# Batch computing

## ANN Search

In [9]:
# Reach through the pyfunc wrapper to the underlying skip-gram network.
skipgram_model = model.unwrap_python_model().model
# Embed index 0 once so the vector width can be read from the model itself.
embedding_0 = skipgram_model.embeddings(torch.tensor(0))
embedding_dim = embedding_0.shape[0]
embedding_0

tensor([-0.0270,  0.3653,  0.0864, -0.1741, -0.1233, -0.1826,  0.2630,  0.0644,
        -0.0557,  0.0157, -0.2415, -0.3705, -0.4329, -0.0532, -0.2477, -0.1174,
        -0.4903, -0.1717, -0.2361, -0.0730, -0.0500, -0.2081, -0.1080, -0.1420,
        -0.3621,  0.0939, -0.2968,  0.5956, -0.3357,  0.0252,  0.1848, -0.2468,
         0.2319,  0.2708,  0.4960,  0.1100, -0.2494,  0.2932, -0.3072, -0.0803,
         0.0185, -0.1368,  0.1899,  0.1284,  0.1652, -0.2464, -0.0149,  0.0208,
        -0.0650,  0.3353, -0.0311,  0.1289,  0.1494, -0.2388, -0.2141,  0.0291,
         0.3164,  0.4499,  0.1775, -0.0693, -0.1029, -0.1723, -0.1086,  0.2336,
        -0.0344,  0.1623, -0.1224,  0.3976,  0.2152, -0.1003,  0.0021, -0.0272,
        -0.1455,  0.4159, -0.1531, -0.0593, -0.0185, -0.1940, -0.1258,  0.1931,
        -0.0683,  0.1867, -0.1152,  0.0925, -0.0906, -0.1974, -0.1905,  0.0008,
        -0.0126, -0.2987,  0.1758, -0.1993, -0.0110, -0.3191, -0.2075,  0.4081,
         0.1795,  0.2005,  0.0391,  0.04

In [10]:
# Id lookup tables shipped with the model; the values of "id_to_idx" are the
# indices that get fed to the embedding layer below.
id_mapping = model.unwrap_python_model().id_mapping
all_items = [*id_mapping["id_to_idx"].values()]
all_items[:5]

[0, 1, 2, 3, 4]

In [11]:
# One embedding lookup for all indices, detached from autograd and handed
# over as a NumPy array.
embeddings = (
    skipgram_model.embeddings(torch.tensor(all_items))
    .detach()
    .numpy()
)
embeddings

array([[-0.0270111 ,  0.36530462,  0.0864055 , ...,  0.2701319 ,
         0.28888178, -0.03838563],
       [-0.16257812,  0.17983401,  0.05818671, ..., -0.5151613 ,
         0.06293295, -0.22570048],
       [-0.14043814, -0.13348329,  0.08681496, ...,  0.07020528,
         0.11107803, -0.13295458],
       ...,
       [-0.01977942,  0.17683266,  0.08735713, ...,  0.19707859,
         0.14933012, -0.07864665],
       [ 0.05965793,  0.04048064, -0.03999281, ..., -0.17725047,
        -0.02599205,  0.24590547],
       [-0.11013471,  0.036272  , -0.12161028, ...,  0.04601436,
         0.01188595, -0.03651233]], dtype=float32)

In [12]:
# Nearest-neighbour index over the item embeddings, using the L2 metric.
nn_search = FaissNN(
    embedding_dim=embedding_dim,
    use_gpu=False,  # TODO: Install faiss-gpu on Mac to make use of GPU
    metric="L2",
)

# presumably row i of `embeddings` is stored under id i in the index — TODO confirm in FaissNN
nn_search.add_embeddings(embeddings)

In [15]:
# Sanity check on a single item: fetch its nearest neighbours, then score each
# (item, neighbour) pair with the skip-gram model.
item_indice = 0

query_embedding = embeddings[item_indice]
distances, indices = nn_search.search(query_embedding, k=args.top_k)

print("Nearest neighbors (indices):", indices)
print("Distances:", distances)

# Model scores for the query item paired with each returned neighbour.
skipgram_model(torch.tensor([item_indice] * args.top_k), torch.tensor(indices[0]))

Nearest neighbors (indices): [[   0 3750 2866 3432 4157 2526 1825 4212 1198  100]]
Distances: [[0.        5.2470655 5.648892  5.6605287 5.7360883 5.8752265 5.8926578
  5.8928266 5.8931956 5.8958883]]


tensor([0.9974, 0.9628, 0.8395, 0.8315, 0.8350, 0.8225, 0.9015, 0.7591, 0.7914,
        0.8147], grad_fn=<SigmoidBackward0>)

## Predict for all items

In [16]:
# Build the recommendation list for every item: retrieve candidates with the
# nearest-neighbour index, then re-rank them by the skip-gram model's own
# pairwise score (the two orderings are not guaranteed to agree).
# NOTE(review): `embeddings[indice]` and the ids returned by `nn_search` are
# treated as the same index space as `all_items`; that only holds if the model
# indices are the contiguous range 0..N-1 in this order — confirm.
idx_to_id = id_mapping["idx_to_id"]
recs = []
# Inference only: skip autograd bookkeeping for the scoring forward passes.
with torch.no_grad():
    for indice in tqdm(all_items):
        query_embedding = embeddings[indice]
        # Ask for one extra hit because the query item itself is normally returned.
        _, neighbors = nn_search.search(query_embedding, k=args.top_K + 1)
        # Drop the query item and any negative id (FAISS reports missing
        # neighbours as -1; assumes FaissNN passes ids through unchanged), then
        # cap at top_K in case the query item was not among the hits. Results
        # are taken to be nearest-first, so the cap drops the farthest hit.
        neighbors = [
            int(neighbor)
            for neighbor in neighbors[0]
            if neighbor != indice and neighbor >= 0
        ][: args.top_K]
        if neighbors:
            scores = (
                skipgram_model(
                    torch.tensor([indice] * len(neighbors)), torch.tensor(neighbors)
                )
                .reshape(-1)
                .tolist()
            )
            # Highest model score first; the sort is stable, so ties keep ANN order.
            ranked = sorted(
                zip(neighbors, scores), key=lambda pair: pair[1], reverse=True
            )
            neighbors, scores = (list(column) for column in zip(*ranked))
        else:
            # No usable candidates: emit an empty record instead of failing on unpacking.
            scores = []
        neighbor_ids = [idx_to_id[str(idx)] for idx in neighbors]
        id_ = idx_to_id[str(indice)]
        recs.append(
            {"target_item": id_, "rec_item_ids": neighbor_ids, "rec_scores": scores}
        )

  0%|          | 0/4630 [00:00<?, ?it/s]

In [17]:
# Spot-check the record built for the first item.
recs[0]

{'target_item': 'B00DNWHEVU',
 'rec_item_ids': ['B0055202HE',
  'B0087LXZRE',
  'B07P574T71',
  'B006MAT1NO',
  'B00DBRNBO6',
  'B008982OUO',
  'B00741A2J2',
  'B00DBKSN8M',
  'B002HP18IO',
  'B002CVUWEY',
  'B001EYUSA8',
  'B002I0J5FG',
  'B004J17XZ6',
  'B004CDGG4U',
  'B001EYUY8Y',
  'B00006IJIS',
  'B0045EPEBW',
  'B001ELJEGA',
  'B004R9OVEG',
  'B000HE7KZC',
  'B00007KE6C',
  'B000035Y63',
  'B003EGULF6',
  'B000R3BN4M',
  'B001EYUU1U',
  'B00GBCLG0E',
  'B002I0H738',
  'B001QCWSHO',
  'B004GGUAJU',
  'B003L20IQU',
  'B004519O9S',
  'B003S9VNWE',
  'B009AR2CAU',
  'B001KYV9AK',
  'B000NNFJQY',
  'B0053B7ICO',
  'B00CIO5ILW',
  'B008LQPI0M',
  'B0040Y3K9U',
  'B002JTX7SM',
  'B006G81HV6',
  'B004NBXRDY',
  'B00000I1BS',
  'B003VUO6LU',
  'B00HVWU0Q4',
  'B003ANMB6A',
  'B003FMVLSK',
  'B001ELJFN2',
  'B00CHYOP94',
  'B009DYE3JI',
  'B003QCJLQI',
  'B000ZKA0KA',
  'B001EYUUF6',
  'B000XGJH1O',
  'B00005ATSM',
  'B0045EPECG',
  'B003UKPZ4S',
  'B00IA4455Y',
  'B000X37732',
  'B002VFQ

# Persist

In [18]:
logger.info(f"Saving batch recs output to {args.batch_recs_fp}...")
# JSON Lines layout: one serialized recommendation record per line.
with open(args.batch_recs_fp, "w") as f:
    f.writelines(json.dumps(rec) + "\n" for rec in recs)

[32m2024-10-07 21:20:20.164[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSaving batch recs output to /Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/005-refactor/batch_recs.jsonl...[0m
