### This notebook performs data preprocessing on the MovieLens - 1M dataset to create candidate pools that will be used by both the recommender system and the LLMs to establish equivalent baselines.

# Data Preprocessing

### 1) Convert ratings.dat to csv for ease of reading

In [3]:
import pandas as pd
from pathlib import Path

RAW = Path(r"C:\Users\abdul\ece1508gp\movielens_dataset")
out_csv = RAW / "ratings.csv"

df = pd.read_csv(
    RAW / "ratings.dat",
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"]
)
df.to_csv(out_csv, index=False)
print("wrote:", out_csv, len(df))

wrote: C:\Users\abdul\ece1508gp\movielens_dataset\ratings.csv 1000209


### 2) Convert movies.dat to readable .csv

In [20]:
import pandas as pd

input_path  = r"C:\Users\abdul\ece1508gp\movielens_dataset\movies.dat"
output_path = r"C:\Users\abdul\ece1508gp\movielens_dataset\movies.csv"

df = pd.read_csv(
    input_path,
    sep="::",
    engine="python", 
    header=None,
    names=["movieId", "title", "genres"],
    encoding="latin-1" 
)

df.to_csv(output_path, index=False)

print("Converted:", output_path)

Converted: C:\Users\abdul\ece1508gp\movielens_dataset\movies.csv


### 3) Cap ratings to limit only 6 movies per user in total for both test/train

In [28]:
import pandas as pd

ratings = pd.read_csv(r"C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset\ratings.csv")
ratings = ratings.sort_values(["userId", "timestamp"], ascending=[True, False])

ratings_limited = ratings.groupby("userId").head(5).reset_index(drop=True)
print("Limited ratings shape:", ratings_limited.shape)

# rewrite for calling split later on
ratings_limited.to_csv(r"C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset\ratings.csv", index=False)

Limited ratings shape: (30200, 4)


# Executing Time Aware LOO Split

### 1) Create time-aware leave-one-out split with data preprocessing

In [29]:
import pandas as pd
import numpy as np
from pathlib import Path

def _detect_ts_unit(ts_series):
    vmax = float(ts_series.max())
    return "ms" if vmax > 1e12 else "s"

def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 3.0,
    min_positives: int = 3,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","movieId","rating","timestamp"}   #check for missing columns
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")

    unit = _detect_ts_unit(ratings["timestamp"])   # normalize the timestamps and filter out rows where < rating threshold
    ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)

    pos = ratings[ratings["rating"] >= rating_threshold].copy()

    #drop duplicates and keep only first interaction per user-item
    pos = pos.sort_values(["userId","ts","movieId"], kind="mergesort")
    pos = pos.drop_duplicates(["userId","movieId"], keep="first")

    pos = pos[pos.groupby("userId")["movieId"].transform("size") >= min_positives].copy()   #filtering for min positives

    #rank by time and assign splits, basically last is test, second last is val, rest train
    pos["n"] = pos.groupby("userId")["movieId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"]>=3) & (pos["idx"] == pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","movieId","ts"]].reset_index(drop=True)
    val_targets  = pos[pos["split"]=="val"][["userId","movieId","ts"]].rename(
        columns={"movieId":"val_item","ts":"ts_val"}).reset_index(drop=True)
    test_targets = pos[pos["split"]=="test"][["userId","movieId","ts"]].rename(
        columns={"movieId":"test_item","ts":"ts_test"}).reset_index(drop=True)

    #building user and item id maps from train set only
    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"])
    uids["uid"] = range(len(uids))
    iids = pd.DataFrame(sorted(train["movieId"].unique()), columns=["movieId"])
    iids["iid"] = range(len(iids))

    #creating indexed versions of splits for both easier model training 
    train_idx = (train.merge(uids, on="userId", how="inner")
                      .merge(iids, on="movieId", how="inner"))
    val_idx = None
    if len(val_targets):
        val_idx = (val_targets.merge(uids, on="userId", how="inner")
                              .merge(iids, left_on="val_item", right_on="movieId", how="left")
                              .drop(columns=["movieId"]))
    test_idx = (test_targets.merge(uids, on="userId", how="inner")
                               .merge(iids, left_on="test_item", right_on="movieId", how="left")
                               .drop(columns=["movieId"]))

    #save outputs to local
    sp = out / "splits"
    train.to_parquet(sp / "train.parquet", index=False)
    if len(val_targets):
        val_targets.to_parquet(sp / "val_targets.parquet", index=False)
    test_targets.to_parquet(sp / "test_targets.parquet", index=False)
    uids.to_parquet(sp / "user_id_map.parquet", index=False)
    iids.to_parquet(sp / "item_id_map.parquet", index=False)
    train_idx.to_parquet(sp / "train_indexed.parquet", index=False)
    if val_idx is not None:
        val_idx.to_parquet(sp / "val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp / "test_targets_indexed.parquet", index=False)

    if also_csv:
        train.to_csv(sp / "train.csv", index=False)
        if len(val_targets): val_targets.to_csv(sp / "val_targets.csv", index=False)
        test_targets.to_csv(sp / "test_targets.csv", index=False)
        uids.to_csv(sp / "user_id_map.csv", index=False)
        iids.to_csv(sp / "item_id_map.csv", index=False)
        train_idx.to_csv(sp / "train_indexed.csv", index=False)
        if val_idx is not None:
            val_idx.to_csv(sp / "val_targets_indexed.csv", index=False)
        test_idx.to_csv(sp / "test_targets_indexed.csv", index=False)

    # sanity check counts + stats
    cold_val = int(val_idx["iid"].isna().sum()) if val_idx is not None and "iid" in val_idx else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids)}
Items (TRAIN map): {len(iids)}
TRAIN positives  : {len(train)}
VAL users        : {len(val_targets["userId"].unique()) if len(val_targets) else 0}
TEST users       : {len(test_targets["userId"].unique())}
Cold-start VAL items  : {cold_val}
Cold-start TEST items : {cold_test}
"""
    (sp / "stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

### 2) Actually call the Time Aware LOO Split

In [40]:
time_aware_loo_split(
    ratings_csv=r"C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset\ratings.csv",
    out_dir=r"C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset",
    rating_threshold=3.0,   #setting to 3 to get more positives
    min_positives=4,        
    also_csv=True           
)

Time-aware LOO split summary
Users (TRAIN map): 4675
Items (TRAIN map): 2233
TRAIN positives  : 12557
VAL users        : 4675
TEST users       : 4675
Cold-start VAL items  : 289
Cold-start TEST items : 303



### 3) Load created splits and verify the Users/Items dimensions in splits

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix

SPLITS = Path(r"C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset\splits")

train = pd.read_parquet(SPLITS / "train_indexed.parquet")       
val_idx = pd.read_parquet(SPLITS / "val_targets_indexed.parquet") 
test_idx = pd.read_parquet(SPLITS / "test_targets_indexed.parquet") 

U = int(train["uid"].max()) + 1
I = int(train["iid"].max()) + 1
print("users:", U, "items:", I, "train rows:", len(train))

# generating the actual users×items implicit matrix for als recommender using int32 since Windows was throwing errors with default dtypes 
rows = train["uid"].to_numpy(dtype=np.int32, copy=False)
cols = train["iid"].to_numpy(dtype=np.int32, copy=False)
data = np.ones(len(train), dtype=np.float32)

R = csr_matrix((data, (rows, cols)), shape=(U, I), dtype=np.float32)

#also filtering out the seen items per user for pool generation and evaluation
user_seen = train.groupby("uid")["iid"].apply(set).to_dict()

# Pool Generation

### First tool: Using Popularity (popular movies train-only)

In [42]:
pop = (
    train.groupby("iid").size()
         .sort_values(ascending=False)
)

def top_pop_unseen(u_seen, P=50):
    out = []
    for iid in pop.index:
        if iid not in u_seen:
            out.append(int(iid))
            if len(out) >= P:
                break
    return out

### Second tool: Using item-item k-nearest neighbors (w sklearn)

In [43]:
from sklearn.neighbors import NearestNeighbors

item_users = R.T.tocsr()
item_users.sort_indices()

knn = NearestNeighbors(
    n_neighbors=51,      # 51 cause self will be dropped later
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
knn.fit(item_users)
print("sklearn item–item fitted on", item_users.shape)

def item_neighbors_from_history_sklearn(u_seen, per_item=20):
    C = []
    for iid in u_seen:
        dists, idxs = knn.kneighbors(item_users[iid], n_neighbors=per_item+1, return_distance=True)
        neigh_ids = idxs[0][1:]   # drop self
        C.extend(int(j) for j in neigh_ids)
    return C


sklearn item–item fitted on (2233, 4675)


### Third tool: Lightweight ALS Train

In [44]:
from implicit.als import AlternatingLeastSquares

#arbitrary ALS parameters, just need a trained model for generating pool of recs
als = AlternatingLeastSquares(
    factors=64,
    regularization=0.05,
    iterations=10,
    use_gpu=False
)

# implicit ALS needs item×user matrix
als.fit(R.T.tocsr())
print("ALS trained.")

100%|██████████| 10/10 [00:00<00:00, 63.68it/s]

ALS trained.





### Build candidate pools (w artifacts - popularity/item-item,als)

In [45]:
from tqdm import tqdm
import pyarrow as pa, pyarrow.parquet as pq

def mf_top_unseen(uid: int, Rk):
    ids, scores = als.recommend(
        userid=int(uid),
        user_items=R[int(uid)],
        N=Rk,
        filter_already_liked_items=True,
        recalculate_user=True,
    )
    return [int(i) for i in ids]

def build_pool_for_user(uid: int, # putting back parameters for easy tuning
                        P=200,        
                        K=400,       
                        Rk=200,      
                        knn_per_item=20):  
    seen = user_seen.get(uid, set())
    seen_set = set(seen)

    C = []
    C += top_pop_unseen(seen, P=P) #get popular unseen

    if len(seen):
        C += mf_top_unseen(uid, Rk=Rk) #ALS recs

    if len(seen):
        few = list(seen)[:12]         # only 12 items from history
        C += item_neighbors_from_history_sklearn(few, per_item=knn_per_item)

    # de-dupe movies then drop seen and limit
    dedup = []
    for iid in C:
        if iid in seen_set:
            continue
        if iid in dedup:
            continue
        dedup.append(iid)
        if len(dedup) >= K:
            break

    return dedup


def make_candidates(user_ids, P=200, K=400, Rk=200, knn_per_item=20):
    rows = []
    for u in tqdm(user_ids):
        rows.append({
            "uid": int(u),
            "candidates": build_pool_for_user(
                int(u),
                P=P,
                K=K,
                Rk=Rk,
                knn_per_item=knn_per_item,
            )
        })
    return pd.DataFrame(rows)


#build user lists
users_val  = sorted(val_idx["uid"].unique())  if len(val_idx)  else []
users_test = sorted(test_idx["uid"].unique()) if len(test_idx) else []

#build pools
cand_val  = make_candidates(users_val,  P=20,  K=50, Rk=20, knn_per_item=10) #limited it to 50 candidates for faster testing on LLMs
cand_test = make_candidates(users_test, P=20,  K=50, Rk=20, knn_per_item=10)

#finally save outputs
OUT = SPLITS.parent / "candidates"
OUT.mkdir(exist_ok=True)
cand_val.to_parquet(OUT / "val.parquet", index=False)
cand_test.to_parquet(OUT / "test.parquet", index=False)
print("Wrote candidates to:", OUT)

100%|██████████| 4675/4675 [09:40<00:00,  8.05it/s]
100%|██████████| 4675/4675 [09:46<00:00,  7.97it/s]


Wrote candidates to: C:\Users\abdul\OneDrive\Documents\GitHub\recsysvsllms\movielens_dataset\candidates


### Load & Define Covered Users

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path("C:/Users/abdul/OneDrive/Documents/GitHub/recsysvsllms/movielens_dataset")
SPLITS = ROOT / "splits"
CANDS  = ROOT / "candidates"

train_idx = pd.read_parquet(SPLITS / "train_indexed.parquet")   
val_idx   = pd.read_parquet(SPLITS / "val_targets_indexed.parquet")
test_idx  = pd.read_parquet(SPLITS / "test_targets_indexed.parquet")

cand_val  = pd.read_parquet(CANDS / "val.parquet")  
cand_test = pd.read_parquet(CANDS / "test.parquet")

def coverage_rows(cands_df, targets_df, tgt_name):
    df = cands_df.merge(
        targets_df[["uid", "iid"]].rename(columns={"iid": tgt_name}),
        on="uid", how="inner"
    )

    #drop cold-start targets (iid is NaN) and rows without candidates
    before = len(df)
    df = df[df[tgt_name].notna() & df["candidates"].notna()].copy()
    dropped = before - len(df)
    df["candidates"] = df["candidates"].apply(lambda x: list(x) if isinstance(x, (list, tuple, np.ndarray)) else [])

    #using pandas for nullable Int64 to get rid of NaN issues
    df[tgt_name] = df[tgt_name].astype("Int64")
    df["target_in_pool"] = [t in set(c) for t, c in zip(df[tgt_name], df["candidates"])]

    print(f"{tgt_name}: dropped {dropped} rows (cold-start/invalid); kept {len(df)}")
    return df

val_cov  = coverage_rows(cand_val,  val_idx,  "val_item")
test_cov = coverage_rows(cand_test, test_idx, "test_item")

val_covered  = val_cov[val_cov["target_in_pool"]].copy()
test_covered = test_cov[test_cov["target_in_pool"]].copy()

print(f"Coverage  val={len(val_covered)/len(val_cov):.2%}  test={len(test_covered)/len(test_cov):.2%}")

### Get number of covered Users

In [15]:
def mark_coverage(cands_df: pd.DataFrame, targets_df: pd.DataFrame, tgt_name: str):
    df = cands_df.merge(
        targets_df[["uid","iid"]].rename(columns={"iid": tgt_name}),
        on="uid", how="inner"
    )
    df["candidates"] = df["candidates"].apply(
        lambda x: list(x) if isinstance(x, (list, tuple, np.ndarray, pd.Series)) else []
    )
    # fill NaNs to avoid int() on NaN
    tgt = df[tgt_name].fillna(-1).astype(int).to_numpy()
    df["target_in_pool"] = [int(t) in set(c) for t, c in zip(tgt, df["candidates"])]
    return df[["uid","target_in_pool"]]

val_mark  = mark_coverage(cand_val,  val_idx,  "val_item")
test_mark = mark_coverage(cand_test, test_idx, "test_item")

covered_val_uids  = set(val_mark.loc[val_mark["target_in_pool"], "uid"])
covered_test_uids = set(test_mark.loc[test_mark["target_in_pool"], "uid"])

print(f"Users valid for eval  val={len(covered_val_uids)}  test={len(covered_test_uids)}")

Users valid for eval  val=432  test=404


### Limit to 100 users (reused for LLMs)

In [16]:
from pathlib import Path
import pandas as pd
import numpy as np

def new_coverage_rows(cands_df, targets_df, tgt_name):
    df = cands_df.merge(
        targets_df[["uid", "iid"]].rename(columns={"iid": tgt_name}),
        on="uid", how="inner"
    )

    before = len(df) 
    df = df[df[tgt_name].notna() & df["candidates"].notna()].copy()
    dropped = before - len(df) #dropping rows if target missing or candidates list missing

    df["candidates"] = df["candidates"].apply( 
        lambda x: list(x) if isinstance(x, (list, tuple, np.ndarray, pd.Series)) else [] #ensuring list stype
    )
    df[tgt_name] = df[tgt_name].astype("Int64")
    df["target_in_pool"] = [t in set(c) for t, c in zip(df[tgt_name], df["candidates"])]

    print(f"{tgt_name}: dropped {dropped} rows; kept {len(df)}")
    return df

N = 100 #setting to 100 users for LLM experiments

val_uids_100  = set(list(covered_val_uids)[:N])
test_uids_100 = set(list(covered_test_uids)[:N])

print(f"Using {len(val_uids_100)} val users")
print(f"Using {len(test_uids_100)} test users")

val_idx_100   = val_idx[val_idx["uid"].isin(val_uids_100)].copy() #restricting to only selected users
test_idx_100  = test_idx[test_idx["uid"].isin(test_uids_100)].copy()

cand_val_100  = cand_val[cand_val["uid"].isin(val_uids_100)].reset_index(drop=True)
cand_test_100 = cand_test[cand_test["uid"].isin(test_uids_100)].reset_index(drop=True)

print("\nShapes after restricting:")
print("  val_idx_100   :", val_idx_100.shape)
print("  cand_val_100  :", cand_val_100.shape)
print("  test_idx_100  :", test_idx_100.shape)
print("  cand_test_100 :", cand_test_100.shape)


#saving frozen copies for LLMs
val_idx_100.to_parquet(SPLITS / "val_targets_indexed_100.parquet", index=False)
test_idx_100.to_parquet(SPLITS / "test_targets_indexed_100.parquet", index=False)
cand_val_100.to_parquet(CANDS / "val_100.parquet", index=False)
cand_test_100.to_parquet(CANDS / "test_100.parquet", index=False)

Using 100 val users
Using 100 test users

Shapes after restricting:
  val_idx_100   : (100, 5)
  cand_val_100  : (100, 2)
  test_idx_100  : (100, 5)
  cand_test_100 : (100, 2)


### Sanity check coverage

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path("C:/Users/abdul/OneDrive/Documents/GitHub/recsysvsllms/movielens/data/dataset")
SPLITS = ROOT / "splits"
CANDS  = ROOT / "candidates"

train_idx = pd.read_parquet(SPLITS / "train_indexed.parquet")   

val_eval_targets  = pd.read_parquet(SPLITS / "val_targets_indexed_100.parquet")
test_eval_targets = pd.read_parquet(SPLITS / "test_targets_indexed_100.parquet")

cand_val_eval  = pd.read_parquet(CANDS / "val_100.parquet")
cand_test_eval = pd.read_parquet(CANDS / "test_100.parquet")

# sanity check, should print 100%
def coverage_percent(cands_df, targets_df, tgt_col):
    df = cands_df.merge(targets_df[["uid", tgt_col]], on="uid")
    return np.mean([int(t) in set(c) for t, c in zip(df[tgt_col], df["candidates"])])

print("Eval coverage",
      f"val={coverage_percent(cand_val_eval,  val_eval_targets,  'iid'):.2%}",
      f"test={coverage_percent(cand_test_eval, test_eval_targets, 'iid'):.2%}")

Eval coverage val=100.00% test=100.00%


# Training Recommender System

### Setting ALS

In [33]:
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight 
import time  

U = int(train_idx["uid"].max()) + 1 #building users × items CSR from train_idx 
I = int(train_idx["iid"].max()) + 1

rows = train_idx["uid"].to_numpy(dtype=np.int32, copy=False)
cols = train_idx["iid"].to_numpy(dtype=np.int32, copy=False)
data = np.ones(len(train_idx), dtype=np.float32)

alpha = 15
R = csr_matrix((data, (rows, cols)), shape=(U, I), dtype=np.float32)
X = bm25_weight(R, K1=100, B=0.8).astype(np.float32)

als = AlternatingLeastSquares( #training ALS
    factors=96,
    regularization=0.05,
    iterations=200,
    use_gpu=False,     
    dtype=np.float32,
)

start = time.time()
als.fit(alpha*X)
end = time.time()
print(f"ALS training time: {end - start:.2f} seconds")

U_f = als.user_factors   # get shapes/embeddings
V_f = als.item_factors   

print("ALS shapes  user_factors:", U_f.shape, " item_factors:", V_f.shape, " U:", U, " I:", I)
assert U_f.shape[0] == U and V_f.shape[0] == I

100%|██████████| 200/200 [00:03<00:00, 55.41it/s]

ALS training time: 3.62 seconds
ALS shapes  user_factors: (4675, 96)  item_factors: (2233, 96)  U: 4675  I: 2233





#### Sparsity check

In [34]:
num_users = U
num_items = I
num_interactions = len(train_idx)
sparsity = 1 - (num_interactions / (num_users * num_items))
print(f"Users: {num_users}, Items: {num_items}, Interactions: {num_interactions}")
print(f"Sparsity: {sparsity:.6f}")

Users: 4675, Items: 2233, Interactions: 12557
Sparsity: 0.998797


### Evaluating Baselines

In [25]:
#need to first join targets with candidate pools
val_eval  = (cand_val_eval
             .merge(val_eval_targets[["uid","iid"]]
                    .rename(columns={"iid":"target"}), on="uid"))
test_eval = (cand_test_eval
             .merge(test_eval_targets[["uid","iid"]]
                    .rename(columns={"iid":"target"}), on="uid"))

I   = V_f.shape[0] # setting I to number of items

def score_als(uid: int, cand_iids: list[int]) -> np.ndarray:
    if not cand_iids:
        return np.asarray([])
    u = U_f[int(uid)]
    c = np.array([int(i) for i in cand_iids if 0 <= int(i) < I], dtype=np.int64)
    if c.size == 0:
        return np.asarray([])
    return V_f[c] @ u

def hit_at_k(ranked, tgt, k):
    return 1.0 if tgt in ranked[:k] else 0.0

def ndcg_at_k(ranked, tgt, k):
    for rank, iid in enumerate(ranked[:k], start=1):
        if iid == tgt:
            return 1.0 / np.log2(rank + 1)  # +1 in denom because ranks start at 1
    return 0.0

def eval_pool(df_eval: pd.DataFrame, scorer, ks=[5, 10, 20]):
    results = {f"hr@{k}": [] for k in ks}
    results.update({f"ndcg@{k}": [] for k in ks})
    for _, r in df_eval.iterrows():
        uid  = int(r["uid"])
        cnds = [int(x) for x in (r["candidates"] if isinstance(r["candidates"], (list, tuple, np.ndarray, pd.Series)) else [])]
        tgt  = int(r["target"])
        if not cnds:
            continue
        scores = scorer(uid, cnds)
        if scores.size == 0:
            continue
        order  = np.argsort(-scores)
        ranked = [cnds[i] for i in order]
        for k in ks:
            results[f"hr@{k}"].append(hit_at_k(ranked, tgt, k))
            results[f"ndcg@{k}"].append(ndcg_at_k(ranked, tgt, k))
    return {key: float(np.mean(val)) for key, val in results.items()}

ks = [5, 10, 20]
val_metrics = eval_pool(val_eval, score_als, ks=ks)
test_metrics = eval_pool(test_eval, score_als, ks=ks)

print("[VAL]  ALS", " ".join([f"HR@{k}={val_metrics[f'hr@{k}']:.3f} NDCG@{k}={val_metrics[f'ndcg@{k}']:.3f}" for k in ks]))
print("[TEST] ALS", " ".join([f"HR@{k}={test_metrics[f'hr@{k}']:.3f} NDCG@{k}={test_metrics[f'ndcg@{k}']:.3f}" for k in ks]))

[VAL]  ALS HR@5=0.190 NDCG@5=0.117 HR@10=0.300 NDCG@10=0.155 HR@20=0.440 NDCG@20=0.189
[TEST] ALS HR@5=0.070 NDCG@5=0.051 HR@10=0.260 NDCG@10=0.067 HR@20=0.350 NDCG@20=0.124


### Hyperparameter Tuning

In [None]:
def ndcg_at_k(ranked, tgt, k=10):
    for rank, iid in enumerate(ranked[:k], start=1):
        if iid == tgt:
            return 1.0 / np.log2(rank + 1)
    return 0.0

def hit_at_k(ranked, tgt, k=10):
    return 1.0 if tgt in ranked[:k] else 0.0

def eval_pool(df_eval, scorer, k=10):
    hits, ndcgs = [], []
    for _, r in df_eval.iterrows():
        uid  = int(r["uid"])
        cnds = r["candidates"]
        if not isinstance(cnds, (list, tuple, np.ndarray, pd.Series)) or len(cnds) == 0:
            continue
        cnds = [int(x) for x in cnds]
        scores = scorer(uid, cnds)
        if scores.size == 0:
            continue
        order  = np.argsort(-scores)
        ranked = [cnds[i] for i in order]
        hits.append(hit_at_k(ranked, int(r["target"]), k))
        ndcgs.append(ndcg_at_k(ranked, int(r["target"]), k))
    return float(np.mean(hits)), float(np.mean(ndcgs))

def train_eval_als_once(seed, R, alpha, factors, reg, iters, use_gpu,
                        val_eval, test_eval, k=10):
    X = bm25_weight(R, K1=100, B=0.8).astype(np.float32)
    als = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iters,
        use_gpu=use_gpu,
        dtype=np.float32,
        random_state=seed,
    )
    als.fit(alpha * X)

    U_f = als.user_factors
    V_f = als.item_factors
    I   = V_f.shape[0]

    def score_als(uid, cand_iids):
        if not cand_iids: return np.asarray([])
        u = U_f[int(uid)]
        c = np.array([int(i) for i in cand_iids if 0 <= int(i) < I], dtype=np.int64)
        if c.size == 0: return np.asarray([])
        return V_f[c] @ u

    return eval_pool(val_eval,  score_als, k), eval_pool(test_eval, score_als, k)

param_grid = [ #grid search for hyperparam tuning
    {"factors": 64,  "reg": 0.05, "alpha": 15, "iters": 150},
    {"factors": 96,  "reg": 0.05, "alpha": 15, "iters": 200},
    {"factors": 128, "reg": 0.1,  "alpha": 10, "iters": 200},
    {"factors": 128, "reg": 0.05, "alpha": 20, "iters": 200},
]

results = []
seeds = [0, 1, 2]  # average across a few random seeds

for params in param_grid:
    vals, tests = [], []
    for s in seeds:
        val_metrics, test_metrics = train_eval_als_once(
            seed=s,
            R=R,
            alpha=params["alpha"],
            factors=params["factors"],
            reg=params["reg"],
            iters=params["iters"],
            use_gpu=False,
            val_eval=val_eval,
            test_eval=test_eval,
            k=10,
        )
        vals.append(val_metrics)
        tests.append(test_metrics)

    vals, tests = np.array(vals), np.array(tests)
    val_mean, val_std = vals.mean(0), vals.std(0)
    test_mean, test_std = tests.mean(0), tests.std(0)

    results.append({
        **params,
        "val_HR": val_mean[0], "val_NDCG": val_mean[1],
        "test_HR": test_mean[0], "test_NDCG": test_mean[1],
    })

res_df = pd.DataFrame(results)
print(res_df.sort_values("val_NDCG", ascending=False))


100%|██████████| 150/150 [00:03<00:00, 45.56it/s]
100%|██████████| 150/150 [00:03<00:00, 46.62it/s]
100%|██████████| 150/150 [00:03<00:00, 42.96it/s]
100%|██████████| 200/200 [00:04<00:00, 42.02it/s]
100%|██████████| 200/200 [00:04<00:00, 41.79it/s]
100%|██████████| 200/200 [00:05<00:00, 37.13it/s]
100%|██████████| 200/200 [00:06<00:00, 30.87it/s]
100%|██████████| 200/200 [00:05<00:00, 33.85it/s]
100%|██████████| 200/200 [00:05<00:00, 38.54it/s]
100%|██████████| 200/200 [00:05<00:00, 39.25it/s]
100%|██████████| 200/200 [00:04<00:00, 41.93it/s]
100%|██████████| 200/200 [00:05<00:00, 38.35it/s]
   factors   reg  alpha  iters    val_HR  val_NDCG   test_HR  test_NDCG
0       64  0.05     15    150  0.366667  0.201420  0.260000   0.131664
1       96  0.05     15    200  0.303333  0.151813  0.240000   0.117803
3      128  0.05     20    200  0.293333  0.141607  0.186667   0.092527
2      128  0.10     10    200  0.296667  0.137689  0.180000   0.094043



Best ALS hyperparameters: factors=64, regularization=0.05, alpha=15, iterations=150