In [1]:
import os
import json
import torch
import polars as pl
import numpy as np
import tqdm
import pandas as pd
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.trainer import Trainer
from recbole.evaluator import Evaluator
from recbole.model.general_recommender.lightgcn import LightGCN
from recbole.model.context_aware_recommender.deepfm import DeepFM
from recbole.utils import init_seed
import faiss
from sentence_transformers import SentenceTransformer
import openai


import warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [127]:
DATA_ROOT = "./data/"
INTERS_DATA_PATH = "data/ml-1m/ml-1m.inter"
MOVIES_PATH = "data/ml-1m/ml-1m.item" 
EMB_PATH  = "mv_user_embeddings_best.pt"
SEED_PATH  = "diverse_seed_indices.json"
BERT_MODEL  = "all-MiniLM-L6-v2"
RATIONALE_INDEX_PATH  = "llm_rationale/index.faiss"
RATIONALE_EMB_PATH = "llm_rationale/embeddings.npy"
RATIONALE_META_PATH = "llm_rationale/meta.json"
TOP_K_ITEMS = 10 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [83]:
with open(SEED_PATH, "r") as f:
    seed_idxs = np.array(json.load(f))

seed_idxs

array([ 483, 5831, 1203, 1489, 3885,  302, 1611,    0,  999, 5172, 4577,
       5522, 5359,   26, 1605, 1607, 3836, 6036, 1160,  889])

In [4]:
ratings = pd.read_csv(INTERS_DATA_PATH, sep="\t", names=["user_id", "movie_id", "rating", "ts"], header=None, low_memory=False).iloc[1:]
movies = pd.read_csv(MOVIES_PATH, sep="\t", names=["movie_id","title", "year", "genres"],header=None, low_memory=False).iloc[1:]

ratings_sorted = ratings.sort_values(
    ["user_id", "rating", "ts"], ascending=[True, False, False]
)
ratings_sorted = ratings_sorted.merge(movies[["movie_id","title", "genres"]], on="movie_id", how="inner")
user_top_titles = (
    ratings_sorted
      .groupby("user_id")
      .apply(
          lambda df: [
              f"{title} ({genre})"
              for title, genre in zip(df["title"], df["genres"])
          ][:TOP_K_ITEMS]
      )
)

In [5]:
user_top_titles.get(seed_idxs[0])

["Toy Story (Animation Children's Comedy)",
 'Erin Brockovich (Drama)',
 'Lost in Space (Action Sci-Fi Thriller)',
 "Breakfast at Tiffany's (Drama Romance)",
 'Star Trek: First Contact (Action Adventure Sci-Fi)',
 'Men in Black (Action Adventure Comedy Sci-Fi)',
 'Rocketeer, The (Action Adventure Sci-Fi)',
 'Starship Troopers (Action Adventure Sci-Fi War)',
 "Rescuers, The (Animation Children's)",
 'Star Trek VI: The Undiscovered Country (Action Adventure Sci-Fi)']

In [None]:
openai.api_key = os.getenv("OPENAI_API_KEY")

rationale_texts = []
rationale_meta  = []
for uid in tqdm.tqdm(seed_idxs, desc="Generating ML rationales"):
    titles = user_top_titles.get(uid, [])
    prompt = (
        f"The user rated these movies highly: {titles}. "
        "Write a brief rationale explaining the user's movie preferences and suggest 3 movies for this user."
    )
    resp = openai.ChatCompletion.create(
        model="o3",
        messages=[
            {"role":"system","content":"You are a movie recommender rationale generator."},
            {"role":"user","content":prompt}
        ],
        max_tokens=250,
    )
    text = resp.choices[0].message.content.strip()
    rid = len(rationale_texts)
    rationale_texts.append(text)
    rationale_meta.append({
        "seed_user_idx": uid,
        "rationale_idx": rid,
        "text": text
    })

In [64]:
bert = SentenceTransformer(BERT_MODEL, device=DEVICE)
rationale_embeddings = bert.encode(
    rationale_texts,
    convert_to_numpy=True,
    normalize_embeddings=True
)

dim = rationale_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(rationale_embeddings)
faiss.write_index(index, RATIONALE_INDEX_PATH)

np.save(RATIONALE_EMB_PATH, rationale_embeddings)
with open(RATIONALE_META_PATH, "w") as f:
    json.dump(rationale_meta, f, indent=2)

In [70]:
user_embs  = torch.load(EMB_PATH)
user_embs = user_embs.numpy()
faiss.normalize_L2(user_embs)
user_embs.shape

seed_embs = user_embs[np.array(seed_idxs)]
faiss.normalize_L2(seed_embs)
d = seed_embs.shape[1]
seed_index = faiss.IndexFlatIP(d)
seed_index.add(seed_embs)

_, I_seed = seed_index.search(user_embs, 1)
I_seed = I_seed[:, 0]  

In [None]:
llm_embs = np.load(RATIONALE_EMB_PATH) 
meta  = json.load(open(RATIONALE_META_PATH, 'r'))

In [73]:
meta[0]

{'seed_user_idx': 483,
 'rationale_idx': 1,
 'text': '\n1. Action-Adventure Sci-Fi with a Sense of Fun\n   You gravitate toward high-energy, effects-driven stories set in space or involving futuristic tech—*Star Trek: First Contact*, *Men in Black*, *The Rocketeer*—but you also appreciate a light touch of humor (*Men in Black*, *Men in Black II*) and a spirit of wonder (*Lost in Space*).\n\n2. Strong Character-Driven Drama\n   Beyond the sci-fi thrills, you enjoy grounded, emotionally rich narratives—*Erin Brockovich* and *Breakfast at Tiffany’s*—that hinge on compelling performances and personal stakes.\n\n3. Family-Friendly Animation\n   You like animated adventures that balance childlike charm with smart, witty scripts—*Toy Story* and *The Rescuers*—showing you value heart and humor that play equally to kids and grown-ups.\n\n### Three Films suggestions\n\n1. Galaxy Quest (1999)\n   A loving send-up of space-opera fandom, this action-adventure sci-fi comedy follows washed-up TV acto

In [122]:
seed_to_rid = { entry["seed_user_idx"]: entry["rationale_idx"] for entry in meta }

nearest_seed_user_idx = seed_idxs[I_seed] 
nearest_rid = np.array([seed_to_rid[uid] - 1 for uid in nearest_seed_user_idx])

user_llm_emb = llm_embs[nearest_rid, :]
user_llm_emb.shape

(6041, 384)

In [124]:
fused = np.concatenate([user_embs, user_llm_emb], axis=1)
fused.shape

(6041, 512)

In [None]:
d_fused = fused.shape[1]

ml_folder = os.path.join(DATA_ROOT, "ml-1m")
os.makedirs(ml_folder, exist_ok=True)
feat_file = os.path.join(ml_folder, "ml-1m_user_feat.tsv")

with open(feat_file, "w") as fout:
    header = ["user_id:token"] + [f"f{i}:float" for i in range(d_fused)]
    fout.write("\t".join(header) + "\n")
    for uid_token in range(1, user_embs.shape[0]+1):
        idx = uid_token-1  # zero-based index
        vec = fused[idx]
        row = [str(uid_token)] + [f"{v:.6f}" for v in vec]
        fout.write("\t".join(row) + "\n")

In [None]:
SEED           = 2025
EMB_SIZE       = 512
N_LAYERS       = 3
REG_WEIGHT     = 1e-5
LR             = 1e-3
BATCH_SIZE     = 2048
EPOCHS         = 20

def run_experiment(use_user_features=False):
    config_dict = {
        "data_path": DATA_ROOT,
        "seed": SEED,
        "train_batch_size": BATCH_SIZE,
        "eval_batch_size": BATCH_SIZE,
        "learning_rate": LR,
        "epochs": EPOCHS,
        # "load_col": {"inter": ["user_id", "movie_id", "rating"]},
        # "USER_ID_FIELD":  "user_id",
        # "ITEM_ID_FIELD":  "movie_id",
        # "RATING_FIELD":   "rating",
        # "LABEL_FIELD"       : "label",
        #     "RATING_FIELD"      : "rating",
        # "LABEL_FIELD"       : "rating",
    }

    if use_user_features:
        config_dict.update({
            "USER_SIDE_DATA": True,
            "USER_FEATURE_FIELDS": [f"f{i}" for i in range(d_fused)],
        })
    config = Config(
        model       = LightGCN,
        dataset     = "ml-1m",
        config_dict = config_dict
    )
    init_seed(SEED, reproducibility=True)
    dataset = create_dataset(config)
    train, valid, test = data_preparation(config, dataset)

    model = LightGCN(config, dataset).to(DEVICE)
    trainer = Trainer(config, model)
    best_model = trainer.fit(train, valid, show_progress = True)

    test_result = trainer.evaluate(test, load_best_model=True)
    return test_result

In [None]:
baseline = run_experiment(use_user_features=False)
baseline

OrderedDict([('recall@10', 0.1416),
             ('mrr@10', 0.4298),
             ('ndcg@10', 0.2357),
             ('hit@10', 0.6974),
             ('precision@10', 0.1837)])

In [172]:
enriched = run_experiment(use_user_features=True)

[1;35mTrain     0[0m: 100%|████████████████████████| 394/394 [00:35<00:00, 10.98it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mEvaluate   [0m: 100%|█████████████████████| 6040/6040 [00:22<00:00, 266.47it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mTrain     1[0m: 100%|████████████████████████| 394/394 [00:35<00:00, 11.07it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mEvaluate   [0m: 100%|█████████████████████| 6040/6040 [00:22<00:00, 269.18it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mTrain     2[0m: 100%|████████████████████████| 394/394 [00:35<00:00, 11.08it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mEvaluate   [0m: 100%|█████████████████████| 6040/6040 [00:21<00:00, 278.27it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mTrain     3[0m: 100%|████████████████████████| 394/394 [00:35<00:00, 11.05it/s, [1;33mGPU RAM: 0.44 G/5.93 G[0m][0m
[1;35mEvaluate   [0m: 100%|█████████████████████| 6040/6040 [00:22<00:00, 269.49it/s, [1;33mGPU RAM: 0.44 G/

In [176]:
enriched

OrderedDict([('recall@10', 0.1279),
             ('mrr@10', 0.4063),
             ('ndcg@10', 0.2185),
             ('hit@10', 0.6714),
             ('precision@10', 0.1708)])

In [None]:
EPOCHS         = 60
b= run_experiment(use_user_features=False)

In [None]:
e = run_experiment(use_user_features=True)