# Batch pre-computed recommendations

# Set up

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. the `src` package imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys

import mlflow
import torch
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from tqdm.auto import tqdm

load_dotenv()

sys.path.insert(0, "..")

from src.vector_search import FaissNN

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for this notebook.

    The two path fields default to ``None`` and are populated by ``init()``,
    which derives them from ``run_name``.
    """

    testing: bool = False
    run_name: str = "000-first-attempt"
    notebook_persist_dp: str = None  # filled in by init()
    random_seed: int = 41
    device: str = None

    # Neighbour-list sizes used by the retrieval cells further down.
    top_K: int = 100
    top_k: int = 10

    embedding_dim: int = 128

    # Name of the registered MLflow model to load.
    mlf_model_name: str = "item2vec"

    batch_recs_fp: str = None  # filled in by init()

    def init(self):
        """Resolve the run's output directory, create it, and set the output file path.

        The directory is ``data/<run_name>`` resolved against the current
        working directory.

        Returns:
            This same instance, so the call can be chained after construction.
        """
        persist_dir = os.path.abspath(f"data/{self.run_name}")
        self.notebook_persist_dp = persist_dir
        os.makedirs(persist_dir, exist_ok=True)
        self.batch_recs_fp = f"{persist_dir}/batch_recs.jsonl"

        return self


args = Args()
args.init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-first-attempt",
  "notebook_persist_dp": "/Users/quy.dinh/frostmourne/recsys-mvp/notebooks/data/000-first-attempt",
  "random_seed": 41,
  "device": null,
  "top_K": 100,
  "top_k": 10,
  "embedding_dim": 128,
  "mlf_model_name": "item2vec",
  "batch_recs_fp": "/Users/quy.dinh/frostmourne/recsys-mvp/notebooks/data/000-first-attempt/batch_recs.jsonl"
}


# Load model

In [4]:
# MLflow client, used below to look up the run behind the loaded model.
mlf_client = mlflow.MlflowClient()

In [5]:
# Load the registered model through its "champion" alias (name comes from args.mlf_model_name).
model = mlflow.pyfunc.load_model(model_uri=f"models:/{args.mlf_model_name}@champion")

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

 - numpy (current: 1.26.4, required: numpy==2.0.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [6]:
# Trace the loaded model back to the run that logged it, then read that run's
# artifact location so files stored next to the model can be fetched.
run_id = model.metadata.run_id
run_info = mlf_client.get_run(run_id).info
artifact_uri = run_info.artifact_uri

In [7]:
# Load the input example stored under the run's artifacts; it is fed to
# model.predict in the next cell.
sample_input = mlflow.artifacts.load_dict(f"{artifact_uri}/inferrer/input_example.json")
sample_input

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

{'item_1_ids': ['B01GHSPTKY'], 'item_2_ids': ['B01NCVU39G']}

In [8]:
# Smoke test: run the loaded model on the stored input example and display the result.
prediction = model.predict(sample_input)
prediction

{'item_1_ids': ['B01GHSPTKY'],
 'item_2_ids': ['B01NCVU39G'],
 'scores': [0.5856391787528992]}

# Batch computing

## ANN Search

In [9]:
# Reach through the pyfunc wrapper to the underlying model object.
python_model = model.unwrap_python_model()
skipgram_model = python_model.model

# Look up the embedding at index 0 and read the embedding width from its
# first dimension.
embedding_0 = skipgram_model.embeddings(torch.tensor(0))
embedding_dim = embedding_0.shape[0]
embedding_0

tensor([ 0.0724, -0.0767, -0.2191, -0.0854,  0.1054,  0.0132,  0.2014,  0.0446,
         0.2218,  0.1166, -0.0863, -0.0153,  0.1844,  0.1532, -0.1622, -0.0426,
        -0.1159,  0.0284, -0.0267, -0.0918, -0.0736, -0.2850,  0.0148,  0.0548,
        -0.1175,  0.0228, -0.0305,  0.1196,  0.0668,  0.2331,  0.0125,  0.2259,
        -0.1539, -0.1171,  0.0529,  0.0086,  0.1421,  0.0833, -0.0282,  0.0463,
         0.2397, -0.1958,  0.0900,  0.0121, -0.1647,  0.0359, -0.0999, -0.0860,
        -0.1510, -0.0784,  0.1169,  0.0210, -0.0281, -0.2030,  0.1205,  0.0319,
         0.0391,  0.0203, -0.2670,  0.1198,  0.0642, -0.1847,  0.0349, -0.0252,
        -0.0111, -0.0141, -0.2085, -0.0684,  0.0729,  0.2131,  0.0927,  0.0009,
        -0.2403,  0.0962, -0.1512, -0.1278, -0.1753, -0.2403, -0.0523,  0.0561,
        -0.0912, -0.3289,  0.0077,  0.3224,  0.0307,  0.2757,  0.1016,  0.0132,
         0.0538, -0.0318, -0.1471,  0.0210, -0.0053, -0.0749, -0.0878,  0.3513,
         0.0126,  0.0269,  0.0617, -0.02

In [10]:
# Lookup tables between item ids and embedding indices, stored on the wrapped model.
id_mapping = model.unwrap_python_model().id_mapping
# Every known item index, in the mapping's own iteration order.
all_items = [item_idx for item_idx in id_mapping["id_to_idx"].values()]
all_items[:5]

[0, 1, 2, 3, 4]

In [11]:
# Materialise the embedding of every item as one NumPy matrix; row i holds the
# embedding of all_items[i].
all_items_tensor = torch.tensor(all_items)
embeddings = skipgram_model.embeddings(all_items_tensor).detach().numpy()
embeddings

array([[ 0.07242306, -0.07665822, -0.2190716 , ...,  0.07018692,
        -0.05276525, -0.07587786],
       [ 0.00143831, -0.07414407,  0.12090597, ...,  0.20651591,
         0.09309984,  0.34579048],
       [-0.14465706, -0.0803016 , -0.10050222, ..., -0.3577661 ,
         0.05050609,  0.22170302],
       ...,
       [-0.09108083,  0.03409648,  0.14109258, ...,  0.11506427,
        -0.17595527, -0.00111264],
       [-0.09865314,  0.14300263,  0.00952864, ...,  0.19733563,
        -0.12019644,  0.13168813],
       [ 0.06380922,  0.22691512, -0.03605513, ..., -0.15583682,
         0.4130729 , -0.3798564 ]], dtype=float32)

In [12]:
# Nearest-neighbour index over the item embeddings: GPU disabled, metric "L2".
# FaissNN is a project wrapper (src.vector_search); its internals are not
# visible from this notebook.
nn_search = FaissNN(
    embedding_dim=embedding_dim,
    use_gpu=False,  # TODO: Install faiss-gpu on Mac to make use of GPU
    metric="L2",
)

# Register every row of the embedding matrix with the index.
nn_search.add_embeddings(embeddings)

In [13]:
# Sanity check on a single item: retrieve its top_k neighbours from the index,
# then score each (item, neighbour) pair with the model itself.
item_indice = 0

query_embedding = embeddings[item_indice]
distances, indices = nn_search.search(query_embedding, k=args.top_k)

print("Nearest neighbors (indices):", indices)
print("Distances:", distances)

# Repeat the query index once per neighbour so both inputs have equal length.
query_idx_tensor = torch.tensor([item_indice] * args.top_k)
neighbor_idx_tensor = torch.tensor(indices[0])
skipgram_model(query_idx_tensor, neighbor_idx_tensor)

Nearest neighbors (indices): [[   0 2581 1029 2051  215 1921 4526 2223  973 2023]]
Distances: [[0.        2.3501468 2.3908849 2.39777   2.459336  2.4745297 2.5172043
  2.5510547 2.572996  2.5938916]]


tensor([0.9105, 0.7186, 0.8135, 0.6648, 0.7467, 0.6255, 0.6438, 0.6674, 0.7374,
        0.7704], grad_fn=<SigmoidBackward0>)

## Predict for all items

In [14]:
# Build the recommendation list for every item:
#   1. retrieve the nearest neighbours of the item's embedding from the index,
#   2. drop the item itself (and any padding entries) and keep at most top_K,
#   3. re-rank the remaining candidates by the model's own pairwise score.
# Each record has the keys "target_item", "rec_item_ids" and "rec_scores".
recs = []
with torch.no_grad():  # inference only, so skip building an autograd graph per call
    # Row i of `embeddings` belongs to all_items[i]. The positions returned by
    # the index are assumed to be row positions of the matrix passed to
    # add_embeddings (TODO confirm against FaissNN), so they are translated
    # back to item indices explicitly instead of assuming position == index.
    for row, indice in enumerate(tqdm(all_items)):
        query_embedding = embeddings[row]
        # Ask for one extra neighbour: the query item is normally returned as
        # its own nearest neighbour and is removed below.
        _, neighbor_rows = nn_search.search(query_embedding, k=args.top_K + 1)
        neighbors = [
            all_items[int(neighbor_row)]
            for neighbor_row in neighbor_rows[0]
            # Negative positions are presumably FAISS's "no result" padding
            # (returned when the index holds fewer than k vectors) — verify.
            if neighbor_row >= 0 and int(neighbor_row) != row
        ]
        # If the query was not among its own neighbours, nothing was removed
        # above; cap the list so it never exceeds top_K entries.
        neighbors = neighbors[: args.top_K]

        if neighbors:
            scores = skipgram_model(
                torch.tensor([indice] * len(neighbors)), torch.tensor(neighbors)
            ).tolist()  # native Python floats, directly JSON-serialisable
            ranked = sorted(
                zip(neighbors, scores), key=lambda pair: pair[1], reverse=True
            )
        else:
            # No candidates (e.g. a single-item catalogue): emit an empty list
            # rather than failing when unpacking an empty zip.
            ranked = []

        recs.append(
            {
                "target_item": id_mapping["idx_to_id"][str(indice)],
                "rec_item_ids": [
                    id_mapping["idx_to_id"][str(idx)] for idx, _ in ranked
                ],
                "rec_scores": [score for _, score in ranked],
            }
        )

  0%|          | 0/4630 [00:00<?, ?it/s]

In [15]:
# Inspect the first record to check the structure of the generated output.
recs[0]

{'target_item': 'B01GHSPTKY',
 'rec_item_ids': ['B000GHLBUA',
  'B01JUEWS4W',
  'B00ANHBGZ8',
  'B00002SVFM',
  'B00ECOGM52',
  'B008FPMBNG',
  'B002JTX7FK',
  'B000PT18OS',
  'B0015PHMFU',
  'B0002ST1YQ',
  'B016HQZDCA',
  'B002I0EPBA',
  'B00BALK9CM',
  'B07D19QQHW',
  'B00HPU96FI',
  'B0083CJ2X8',
  'B005WMIR4C',
  'B002PXRXNW',
  'B0116BBW5S',
  'B00M37WR90',
  'B07GM5MP6M',
  'B0BJ6SBBSJ',
  'B07M8ZXQR6',
  'B01KUAMCWI',
  'B08V86DS42',
  'B07KF1FB1F',
  'B004ZX75AQ',
  'B000KQQUMQ',
  'B00M7ZIZGC',
  'B002BSC5CU',
  'B07TV96Q4Z',
  'B001262U54',
  'B086HMFYFD',
  'B001EYUWDG',
  'B00ZGDSFHG',
  'B013E2KD5M',
  'B07V3G6C1F',
  'B078YQH6DB',
  'B0C8M4WSYB',
  'B00004YC3N',
  'B00HS7LJ2G',
  'B00005B9ZG',
  'B003QCJLQI',
  'B01FZ884E0',
  'B074RNL1RX',
  'B00008KU9T',
  'B09VYJWTDN',
  'B0007WWYLY',
  'B0001XMA6I',
  'B0BN3JZ8PX',
  'B007VYW3US',
  'B077SN7QN3',
  'B017VLXJ7G',
  'B00005J53A',
  'B004WLRR4K',
  'B078YBBR3X',
  'B00004TN9O',
  'B009AFLXJS',
  'B00004TBGT',
  'B00DULU

# Persist

In [16]:
# Write the recommendations as JSON Lines: one JSON object per item, one per line.
logger.info(f"Saving batch recs output to {args.batch_recs_fp}...")
with open(args.batch_recs_fp, "w") as out_file:
    out_file.writelines(json.dumps(rec) + "\n" for rec in recs)

[32m2024-10-10 11:48:24.641[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mSaving batch recs output to /Users/quy.dinh/frostmourne/recsys-mvp/notebooks/data/000-first-attempt/batch_recs.jsonl...[0m
