# Batch pre-computed recommendations

# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys

import mlflow
import torch
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from tqdm.auto import tqdm

load_dotenv()

sys.path.insert(0, "..")

from src.vector_search import FaissNN

# Controller

In [94]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "003-alias-champion"
    notebook_persist_dp: str = None
    random_seed: int = 41
    device: str = None

    top_K: int = 100
    top_k: int = 10

    embedding_dim: int = 128

    mlf_model_name: str = "item2vec"

    batch_recs_fp: str = None

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)
        self.batch_recs_fp = f"{self.notebook_persist_dp}/batch_recs.jsonl"

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "log_to_mlflow": true,
  "run_name": "003-alias-champion",
  "notebook_persist_dp": "/Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/003-alias-champion",
  "random_seed": 41,
  "device": null,
  "top_K": 100,
  "top_k": 10,
  "max_epochs": 1,
  "batch_size": 128,
  "num_negative_samples": 2,
  "window_size": 1,
  "embedding_dim": 128,
  "early_stopping_patience": 5,
  "learning_rate": 0.01,
  "l2_reg": 0.00001,
  "mlf_model_name": "item2vec",
  "min_roc_auc": 0.7,
  "batch_recs_fp": "/Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/003-alias-champion/batch_recs.jsonl"
}


# Load model

In [4]:
mlf_client = mlflow.MlflowClient()

In [5]:
model = mlflow.pyfunc.load_model(model_uri=f"models:/{args.mlf_model_name}@champion")

In [6]:
run_id = model.metadata.run_id
run_info = mlf_client.get_run(run_id).info
artifact_uri = run_info.artifact_uri

In [7]:
sample_input = model.metadata.load_input_example(f"{artifact_uri}/inferrer")
sample_input

{'item_1_ids': ['B00DNWHEVU'], 'item_2_ids': ['B00IUGYTRQ']}

In [8]:
prediction = model.predict(sample_input)
prediction

array([0.57443655], dtype=float32)

# Batch computing

## ANN Search

In [39]:
skipgram_model = model.unwrap_python_model().model
embedding_0 = skipgram_model.embeddings(torch.tensor(0))
embedding_dim = embedding_0.size()[0]
embedding_0

tensor([ 0.1063,  0.0637, -0.2781, -0.2211,  0.3249, -0.0590,  0.2670,  0.0403,
        -0.0407,  0.0406, -0.2914, -0.0803,  0.0965, -0.2045, -0.0182,  0.1387,
        -0.0026, -0.1545,  0.1664, -0.2552,  0.1066,  0.2608,  0.0934, -0.1265,
         0.0078, -0.0083,  0.2949,  0.2016,  0.3150,  0.2007,  0.0396, -0.6107,
         0.1848, -0.1913, -0.0131,  0.2446,  0.1231,  0.2834, -0.4713,  0.1334,
         0.0109, -0.1116,  0.3186, -0.0880,  0.3027,  0.0369,  0.0780,  0.2961,
         0.2274, -0.2656, -0.0009,  0.0592,  0.0843, -0.3508,  0.1197, -0.2257,
         0.0079,  0.0541,  0.2524,  0.0653,  0.0649,  0.2002, -0.1778, -0.1682,
        -0.2628,  0.1615,  0.0328,  0.1523,  0.4343, -0.2719, -0.1512, -0.1011,
        -0.1519,  0.3563, -0.1717, -0.2401,  0.0619,  0.1834, -0.1960,  0.1252,
        -0.1073, -0.1228,  0.4250,  0.1358,  0.2330, -0.2585, -0.0255,  0.0568,
         0.1532, -0.3840,  0.0411, -0.2388, -0.1980,  0.1239,  0.0210, -0.0583,
        -0.1420, -0.3904,  0.1271, -0.00

In [51]:
id_mapping = model.unwrap_python_model().id_mapping
all_items = list(id_mapping["id_to_idx"].values())
all_items[:5]

[0, 1, 2, 3, 4]

In [44]:
embeddings = skipgram_model.embeddings(torch.tensor(all_items)).detach().numpy()
embeddings

array([[ 0.10632942,  0.06370179, -0.27810845, ...,  0.48868826,
         0.14318596,  0.14822829],
       [-0.1683319 ,  0.21401414,  0.22829568, ...,  0.06086319,
         0.13857512,  0.10889213],
       [ 0.48648486, -0.10663053, -0.20740558, ...,  0.03420575,
        -0.16901188,  0.05877741],
       ...,
       [ 0.01539592, -0.13812037, -0.11617418, ..., -0.26600996,
        -0.10895204,  0.05188315],
       [-0.09828333,  0.21711013, -0.09971067, ...,  0.1983842 ,
        -0.08722479,  0.0689928 ],
       [ 0.03498199,  0.285267  ,  0.16006969, ...,  0.0856421 ,
        -0.00720352,  0.22712462]], dtype=float32)

In [None]:
nn_search = FaissNN(
    embedding_dim=embedding_dim,
    use_gpu=False,  # TODO: Install faiss-gpu on Mac to make use of GPU
    metric="L2",
)

nn_search.add_embeddings(embeddings)

In [None]:
item_indice = 0

query_embedding = embeddings[item_indice]
distances, indices = nn_search.search(query_embedding, k=args.top_k)

print("Nearest neighbors (indices):", indices)
print("Distances:", distances)

skipgram_model(torch.tensor([item_indice] * args.top_k), torch.tensor(indices[0]))

Nearest neighbors (indices): [[   0  204 3933 2045 3692 3540  847  733 3220 3342]]
Distances: [[0.        4.6188965 4.627966  4.911262  4.9127235 4.9812317 5.091267
  5.0969453 5.1657567 5.252323 ]]


## Predict for all items

In [90]:
recs = []
for indice in tqdm(all_items):
    query_embedding = embeddings[indice]
    _, neighbors = nn_search.search(query_embedding, k=args.top_K + 1)
    neighbors = neighbors[0]
    # Remove self-recommendation
    neighbors = [neighbor for neighbor in neighbors if neighbor != indice]
    scores = (
        skipgram_model(torch.tensor([indice] * len(neighbors)), torch.tensor(neighbors))
        .detach()
        .numpy()
        .astype(float)
    )
    neighbors, scores = zip(
        *sorted(zip(neighbors, scores), key=lambda x: x[1], reverse=True)
    )
    neighbor_ids = [id_mapping["idx_to_id"][str(idx)] for idx in neighbors]
    id_ = id_mapping["idx_to_id"][str(indice)]
    recs.append(
        {"target_item": id_, "rec_item_ids": neighbor_ids, "rec_scores": list(scores)}
    )

  0%|          | 0/4630 [00:00<?, ?it/s]

In [91]:
recs[0]

{'target_item': 'B00DNWHEVU',
 'rec_item_ids': ['B0087LXZRE',
  'B00420K18K',
  'B00ANZR6M2',
  'B0031SWWPO',
  'B006G81HV6',
  'B003URSUSE',
  'B001QCWSG0',
  'B0050SZ7D2',
  'B002HP18IO',
  'B00JBNHLFO',
  'B0050SZ49Y',
  'B000M5ZIVY',
  'B005EHQHSS',
  'B0030GBSUC',
  'B00007KE6C',
  'B07N1XKY1L',
  'B009S4JTTC',
  'B003LPTAL6',
  'B003S9VNWE',
  'B0050SVDQ2',
  'B004CDGFRI',
  'B001E8WRE4',
  'B001EYUY8Y',
  'B07XJYZTPQ',
  'B007VYW5Y2',
  'B009GWKP76',
  'B002824SNS',
  'B006WJ70P6',
  'B004D1Z3UO',
  'B010R2RHGU',
  'B0045EPEBW',
  'B001EYUPCE',
  'B00FA5PKDM',
  'B004SL3LLW',
  'B002JTX7OG',
  'B003UF24SI',
  'B003QX4F7C',
  'B0045EPECQ',
  'B001EYUQ7I',
  'B00DY5JNXI',
  'B001EYURCC',
  'B00MOQWBQ4',
  'B001E58B1K',
  'B00AIALGZK',
  'B00005RCQJ',
  'B000R3BN4M',
  'B009AR2CAU',
  'B008BG3B2Y',
  'B0023CC0M4',
  'B002I08RR8',
  'B002I0JNFI',
  'B00MVY10CU',
  'B07H3F94ZN',
  'B001EYUX1W',
  'B000035Y63',
  'B005HRZ29K',
  'B008J1372A',
  'B003Q9RG9K',
  'B00DQNF1SO',
  'B006BZ1

# Persist

In [95]:
logger.info(f"Saving batch recs output to {args.batch_recs_fp}...")
with open(args.batch_recs_fp, "w") as f:
    for rec in recs:
        f.write(json.dumps(rec) + "\n")

[32m2024-10-04 14:23:42.750[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSaving batch recs output to /Users/dvq/frostmourne/fsds/fsds-recsys/chapters/l7/notebooks/data/003-alias-champion/batch_recs.jsonl...[0m
