In [None]:
# import all necessary packages
import numpy as np, pandas as pd, scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from math import sqrt
import re, gc, warnings
warnings.filterwarnings("ignore")

In [None]:
# set a random generator with fixed seed for reproducibility and consistency
RNG = np.random.default_rng(42)

# a helper function for a text cleaner
# it converts to lowercase, removes extra whitespace, and returns empty string if input is not valid text
def clean_text(s: str) -> str:
  if not isinstance(s, str):
    return ""
  s = s.lower()
  s = re.sub(r"\s+", " ", s) # this replaces multiple spaces/newlines with single space
  return s.strip()

# a helper function that normalizes book titles
# it lowercases text, removes punctuation and special characters, and strips redundant spaces
def normalize_title(s: str) -> str:
  s = clean_text(s)
  s = re.sub(r"[^a-z0-9 ]+", "", s)
  return re.sub(r"\s+", " ", s).strip()

In [None]:
# load the data
from google.colab import drive
drive.mount('/content/drive')

# meta data first
meta_path = "/content/drive/MyDrive/books_data.csv"
meta = pd.read_csv(meta_path)
meta.head()

Mounted at /content/drive


Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [None]:
# load the ratings dataset
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/My Drive/Books_rating.csv"
df = pd.read_csv(file_path)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [None]:
# standardize column names
# remove spaces and make lowercase for consistency
df.columns   = df.columns.str.strip().str.lower()
meta.columns = meta.columns.str.strip().str.lower()

# rename important rating columns to simpler names
df = df.rename(columns={"review/score":"rating", "review/time":"unix_time"})

# clean ratings data
# drop rows missing user_id, title, or rating
# keep only ratings in the valid range [0.5, 5.0]
df = df.dropna(subset=["user_id","title","rating"])
df = df[(df["rating"] >= 0.5) & (df["rating"] <= 5.0)]

# create normalized keys for joining ratings with metadata
df["title_key"]   = df["title"].map(normalize_title)
meta["title_key"] = meta["title"].map(normalize_title)

# keep only useful metadata columns
keep_meta = [c for c in ["title_key","description","authors","publisher","categories"] if c in meta.columns]
meta_small = meta[keep_meta].drop_duplicates("title_key")

# merge ratings (df) with metadata using title_key as join key
# keep all ratings
books = df.merge(meta_small, on="title_key", how="left")

In [None]:
# keep only the columns needed for collaborative filtering
interactions = (df.loc[:, ["user_id","title_key","title","rating","unix_time"]]
                  .dropna(subset=["user_id","title_key","rating"])
                  .query("0.5 <= rating <= 5.0") # enforce valid rating range
                  .reset_index(drop=True)) # reset row index for cleanliness

# shrink memory usage by using lighter datatypes
interactions["user_id"]  = interactions["user_id"].astype("string")
interactions["title_key"]= interactions["title_key"].astype("string")
interactions["title"]    = interactions["title"].astype("string")
interactions["rating"]   = interactions["rating"].astype("float32")

# show dataset size (rows, columns) to confirm shape
print("interactions:", interactions.shape)

interactions: (2438018, 5)


In [None]:
# pick only the metadata columns that are text-like
meta_cols = [c for c in ["title_key","title","description","authors","publisher","categories"] if c in meta.columns]

# build a clean metadata table for items
items_meta = (meta[meta_cols]
              .drop_duplicates("title_key")
              .copy())

# clean up text fields
# replace missing values with empty strings
# apply clean_text() function
for col in items_meta.columns:
    if col != "title_key":
        items_meta[col] = items_meta[col].fillna("").map(clean_text)

# print resulting shape (#rows, #columns)
print("items_meta:", items_meta.shape)

items_meta: (206859, 6)


In [None]:
# select only the metadata columns that contain textual information
# this keeps the identifiers (title_key) and fields like title, description, authors, etc.
meta_cols = [c for c in ["title_key","title","description","authors","publisher","categories"] if c in meta.columns]

# create a new dataframe with only those columns
items_meta = (meta[meta_cols]
              .drop_duplicates("title_key")
              .copy())

# clean each text column
# replace missing values with empty strings
# apply clean_text() to normalize text
for col in items_meta.columns:
    if col != "title_key":
        items_meta[col] = items_meta[col].fillna("").map(clean_text)


# print the shape to confirm how many rows (books) and columns remain
print("items_meta:", items_meta.shape)


items_meta: (206859, 6)


In [None]:
# build a lightweight text corpus from user reviews
# extract only the needed review columns alongside the book key
ratings_text = df.loc[:, ["title_key","review/summary","review/text"]].copy()

# replace missing values with empty strings
ratings_text["review/summary"] = ratings_text["review/summary"].fillna("")
ratings_text["review/text"]    = ratings_text["review/text"].fillna("")


# define a helper function: combine a capped number of reviews per item
def _combine_sample(group, cap=20):
    # take up to 'cap' reviews per item; join and clean
    txt = (" ".join((group["review/summary"] + " " + group["review/text"]).head(cap).tolist())).lower()
    return " ".join(txt.split())[:20000]  # collapse whitespace & cap length


# group reviews by book, apply the combiner, and reset as a new dataframe
items_reviews = (ratings_text.groupby("title_key", sort=False)
                 .apply(_combine_sample)
                 .rename("reviews_blob")
                 .reset_index())

# check how many books (rows) and review blobs we have
print("items_reviews:", items_reviews.shape)

items_reviews: (201321, 2)


In [None]:
# merge item metadata with aggregated review blobs
items = (items_meta.merge(items_reviews, on="title_key", how="left")
                   .fillna({"reviews_blob": ""}))

# define which textual fields to combine into the final content representation
content_parts = [c for c in ["title","description","authors","publisher","categories","reviews_blob"] if c in items.columns]

# concatenate all available text fields into a single string per item
# clean it
items["content_str"] = items[content_parts].agg(" ".join, axis=1).map(clean_text)

# keep only items that actually appear in the interactions table
# so we don’t waste memory modeling items with no user activity
items = items[items["title_key"].isin(interactions["title_key"].unique())].copy()

# retain a minimal schema
items = items.loc[:, ["title_key","title","content_str"]]

# check of shape and preview of the first rows
print("items (final):", items.shape)
items.head(3)


items (final): (201321, 3)


Unnamed: 0,title_key,title,content_str
0,its only art if its well hung,its only art if its well hung!,its only art if its well hung! ['julie strain'...
1,dr seuss american icon,dr. seuss: american icon,dr. seuss: american icon philip nel takes a fa...
2,wonderful worship in smaller churches,wonderful worship in smaller churches,wonderful worship in smaller churches this res...


In [None]:
# filter users and items with too few interactions
# keep only users with ≥ MIN_USER ratings and items with ≥ MIN_ITEM
MIN_USER, MIN_ITEM = 3, 5
u_ok = interactions["user_id"].value_counts()
i_ok = interactions["title_key"].value_counts()


# keep only rows where both user and item meet minimum activity thresholds
mask = interactions["user_id"].isin(u_ok[u_ok>=MIN_USER].index) & \
       interactions["title_key"].isin(i_ok[i_ok>=MIN_ITEM].index)
interactions = interactions.loc[mask].reset_index(drop=True)

# rebuild the items table to include only the filtered items
items = items[items["title_key"].isin(interactions["title_key"].unique())].reset_index(drop=True)

# map string IDs to numeric IDs for matrix factorization / kNN
# create dictionaries mapping original user IDs and item keys to consecutive integers
uid_map = {u:i for i,u in enumerate(interactions["user_id"].unique())}
iid_map = {t:i for i,t in enumerate(interactions["title_key"].unique())}

# apply mappings to create numeric columns 'u' (user index) and 'i' (item index)
interactions["u"] = interactions["user_id"].map(uid_map).astype("int32")
interactions["i"] = interactions["title_key"].map(iid_map).astype("int32")
items["i"]        = items["title_key"].map(iid_map).astype("int32")


# print final shapes to confirm filtering effect
print("interactions:", interactions.shape, "items:", items.shape)

interactions: (1344961, 7) items: (62804, 4)


In [None]:
import numpy as np, gc
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

# -small guardrails to reduce memory
# limit each item's text length to 8,000 characters
# this prevents very long reviews/metadata from blowing up TF-IDF matrix size
items["content_str"] = items["content_str"].astype(str).str.slice(0, 8000)


# flag to choose between TF-IDF and HashingVectorizer
USE_HASHING = False

if not USE_HASHING:
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(
        stop_words="english",     # fewer features
        ngram_range=(1,1),        # unigrams only (big drop in RAM)
        min_df=10,                # ignore ultra-rare terms
        max_df=0.85,              # drop very common terms
        max_features=100_000,     # cap vocab size
        dtype=np.float32          # 32-bit to halve memory
    )

    # fit TF-IDF model and transform item content into sparse feature vectors
    X = tfidf.fit_transform(items["content_str"])

In [None]:
# collaborative filtering (CF),  Item-based kNN
# cuild a sparse user–item matrix and fit neighbors
import numpy as np, scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors


# build sparse ratings matrix
n_users = interactions["u"].max() + 1
n_items = interactions["i"].max() + 1

R = sp.csr_matrix(
    (interactions["rating"].astype("float32"), (interactions["u"], interactions["i"])),
    shape=(n_users, n_items)
)

# adjusted cosine
# compute each user's mean rating (to remove bias)
user_counts = (R != 0).sum(axis=1).A.ravel()
user_sums   = np.array(R.sum(axis=1)).ravel()
user_means  = np.divide(user_sums, user_counts, out=np.full_like(user_sums, np.nan, dtype="float64"), where=user_counts>0)

# fill NaN (for users with no ratings) with global mean rating
user_means  = np.nan_to_num(user_means, nan=float(interactions["rating"].mean()))


# subtract each user’s mean from their ratings
# this mean-centering makes cosine similarity "adjusted cosine"
R_centered = R.copy().astype(np.float32)
rows, _ = R_centered.nonzero()
R_centered.data = R_centered.data - user_means[rows]

# item kNN model
# fit nearest-neighbors on items
# distance = cosine similarity on mean-centered ratings
nn_item = NearestNeighbors(metric="cosine", algorithm="brute")
nn_item.fit(R_centered.T)


# define a helper - get top similar items for a given item index
def similar_items_cf(i_idx: int, topk: int = 50):
    dist, ind = nn_item.kneighbors(R_centered.T[i_idx], n_neighbors=topk+1)
    ind, dist = ind.ravel()[1:], dist.ravel()[1:]  # drop self
    return ind, 1.0 - dist

In [None]:
# define the hybrid recommender function
def recommend_for_user(u_idx: int, alpha: float = 0.6,   # weight on CF (0..1)
                       seeds: int = 5, topk_per_source: int = 100, final_k: int = 10):
  # get user's rated items
  row = R[u_idx]
  rated_items, rated_vals = row.indices, row.data
  if rated_items.size == 0:
      return []  # cold-start fallback could be popularity

  # pick top "seed" items = items user rated highest
  seed_items = rated_items[np.argsort(-rated_vals)[:seeds]]

  # dicts for candidate scores
  cand_ct, cand_cf = {}, {}

  # expand from seed items
  for it in seed_items:
      # content neighbors
      idx_ct, sim_ct = similar_items_content(it, topk=topk_per_source)
      for j, s in zip(idx_ct, sim_ct):
          if j in rated_items: continue
          cand_ct[j] = max(cand_ct.get(j, 0.0), float(s))
      # CF neighbors
      idx_cf, sim_cf = similar_items_cf(it, topk=topk_per_source)
      for j, s in zip(idx_cf, sim_cf):
          if j in rated_items: continue
          cand_cf[j] = max(cand_cf.get(j, 0.0), float(s))

  # merge candidates from both sources
  cand_ids = set(cand_ct) | set(cand_cf)
  if not cand_ids:
    return []

  # mormalize scores
  def norm(d):
      if not d:
        return {}
      v = np.array(list(d.values()))
      lo, hi = v.min(), v.max()
      if hi == lo: return {k: 0.0 for k in d}
      return {k: (val - lo)/(hi - lo) for k,val in d.items()}

  ct_n, cf_n = norm(cand_ct), norm(cand_cf)

  # blend scores
  scores = {j: (1-alpha)*ct_n.get(j,0.0) + alpha*cf_n.get(j,0.0) for j in cand_ids}
  return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:final_k]


In [None]:
import pandas as pd, numpy as np, gc

# copy only necessary columns for evaluation
inter_small = interactions.loc[:, ["u","i","rating","unix_time"]].copy()
inter_small["u"] = inter_small["u"].astype("int32")
inter_small["i"] = inter_small["i"].astype("int32")
inter_small["rating"] = inter_small["rating"].astype("float32")


# case 1: if we have timestamps
if inter_small["unix_time"].notna().any():
    inter_sorted = inter_small.sort_values(["u","unix_time"], kind="mergesort")
    test_idx = inter_sorted.groupby("u", sort=False).tail(1).index
    test  = inter_sorted.loc[test_idx, ["u","i","rating"]].reset_index(drop=True)
    train = inter_sorted.drop(index=test_idx).reset_index(drop=True)
    del inter_sorted

# case 2: no timestamps available
else:
    test_idx = inter_small.groupby("u", sort=False).sample(n=1, random_state=42).index
    test  = inter_small.loc[test_idx, ["u","i","rating"]].reset_index(drop=True)
    train = inter_small.drop(index=test_idx).reset_index(drop=True)

# cleanup
del inter_small; gc.collect()
print("Train/Test:", train.shape, test.shape, "| test users:", test["u"].nunique())


Train/Test: (1176302, 4) (168659, 3) | test users: 168659


In [None]:
import numpy as np, scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors

n_users = int(interactions["u"].max()) + 1
n_items = int(interactions["i"].max()) + 1

# ratings matrix on TRAIN ONLY
R = sp.csr_matrix(
    (train["rating"].astype("float32"), (train["u"], train["i"])),
    shape=(n_users, n_items), dtype=np.float32
)

# user-mean centering
user_counts = (R != 0).sum(axis=1).A.ravel()
user_sums   = np.array(R.sum(axis=1)).ravel()
user_means  = np.divide(user_sums, user_counts,
                        out=np.full_like(user_sums, np.nan, dtype="float64"),
                        where=user_counts>0)
user_means  = np.nan_to_num(user_means, nan=float(train["rating"].mean()))

R_centered = R.copy()
rows, _ = R_centered.nonzero()
R_centered.data = R_centered.data - user_means[rows]

# refit the item-based CF neighbors
nn_item = NearestNeighbors(metric="cosine", algorithm="brute")
nn_item.fit(R_centered.T)

print("R shape:", R.shape, "| centered nnz:", R_centered.nnz)


R shape: (168659, 62804) | centered nnz: 844885


In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# precision / recall at K
def precision_recall_at_k(pred_pairs, gt_set, k=10):
    if not pred_pairs: return 0.0, 0.0
    pred_k = [i for i,_ in pred_pairs[:k]]
    hits = set(pred_k) & gt_set
    return len(hits)/max(1,k), len(hits)/max(1,len(gt_set))

# evaluate recommender on Top-K metrics
def evaluate_topk(train_df, test_df, k=10, alpha=0.6, sample_users=200):
    users = test_df["u"].unique()
    if len(users) > sample_users:
        users = np.random.default_rng(42).choice(users, size=sample_users, replace=False)
    precs, recs, hits, covered = [], [], [], set()
    for u in users:
        gt = set(test_df.loc[test_df["u"]==u, "i"].tolist())
        preds = recommend_for_user(int(u), alpha=alpha, final_k=k)
        covered.update([i for i,_ in (preds or [])])
        if gt:
            p,r = precision_recall_at_k(preds or [], gt, k=k)
            precs.append(p); recs.append(r)
            hits.append(1.0 if any(i in gt for i,_ in (preds or [])) else 0.0)
    return {
        "precision@k": float(np.mean(precs) if precs else 0.0),
        "recall@k":    float(np.mean(recs) if recs else 0.0),
        "hit_rate":    float(np.mean(hits) if hits else 0.0),
        "coverage":    float(len(covered) / max(1, n_items))
    }

# item-mean baseline (TRAIN only)
item_counts = (R != 0).sum(axis=0).A.ravel()
item_sums   = np.array(R.sum(axis=0)).ravel()
item_means  = np.divide(item_sums, item_counts,
                        out=np.full_like(item_sums, np.nan, dtype="float64"),
                        where=item_counts>0)
item_means  = np.nan_to_num(item_means, nan=float(train["rating"].mean()))

# RMSE and MAE for baseline predictor
def evaluate_rmse_mae(test_df):
    if len(test_df)==0: return {"RMSE":np.nan, "MAE":np.nan}
    y_true = test_df["rating"].to_numpy()
    y_pred = np.array([item_means[i] if 0 <= i < len(item_means) else float(train["rating"].mean())
                       for i in test_df["i"]])
    return {"RMSE": sqrt(mean_squared_error(y_true, y_pred)),
            "MAE":  mean_absolute_error(y_true, y_pred)}


In [None]:
# rebuild content vectors + kNN
import numpy as np, gc
from sklearn.neighbors import NearestNeighbors

# make sure items are aligned (i = item index, content_str exists)
items = items.sort_values("i").reset_index(drop=True)
items["content_str"] = items["content_str"].astype(str).str.slice(0, 8000)  # guard for RAM

# try light TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,1),
    min_df=10,
    max_df=0.85,
    max_features=100_000,
    dtype=np.float32
)
X = tfidf.fit_transform(items["content_str"])

# cosine kNN over content space
nn_content = NearestNeighbors(metric="cosine", algorithm="brute")
nn_content.fit(X)

# neighbor lookup function
def similar_items_content(i_idx: int, topk: int = 50):
    """Return neighbor item indices and cosine sims from content model."""
    topk = min(int(topk), X.shape[0]-1)
    dist, ind = nn_content.kneighbors(X[i_idx], n_neighbors=topk+1)
    ind, dist = ind.ravel()[1:], dist.ravel()[1:]  # drop self
    return ind, 1.0 - dist

print("Content matrix:", X.shape, "| ready =", callable(similar_items_content))
gc.collect();

Content matrix: (62804, 59185) | ready = True


In [None]:
# print results
print("Rating baseline:", evaluate_rmse_mae(test))
print("Top-K (alpha=0.6):", evaluate_topk(train, test, k=10, alpha=0.6, sample_users=200))

Rating baseline: {'RMSE': 7.120232402497215, 'MAE': 3.406661982489504}
Top-K (alpha=0.6): {'precision@k': 0.027999999999999997, 'recall@k': 0.28, 'hit_rate': 0.28, 'coverage': 0.023517610343290238}


In [None]:
# neighbor caches to avoid recomputation
ct_cache, cf_cache = {}, {}
CT_TOPK, CF_TOPK = 30, 30  # neighbors to fetch per seed item

# content neighbors (from TF-IDF / content space)
def get_ct_neighbors(i):
    if i in ct_cache: return ct_cache[i]
    k = min(CT_TOPK + 1, X.shape[0])
    dist, ind = nn_content.kneighbors(X[i], n_neighbors=k)
    ind, dist = ind.ravel()[1:], dist.ravel()[1:]   # drop self
    sims = 1.0 - dist
    ct_cache[i] = (ind, sims)
    return ct_cache[i]

# collaborative neighbors (from rating matrix)
def get_cf_neighbors(i):
    if i in cf_cache: return cf_cache[i]
    k = min(CF_TOPK + 1, R_centered.shape[1])
    dist, ind = nn_item.kneighbors(R_centered.T[i], n_neighbors=k)
    ind, dist = ind.ravel()[1:], dist.ravel()[1:]
    sims = 1.0 - dist
    cf_cache[i] = (ind, sims)
    return cf_cache[i]

# hybrid recommender (fast)
def recommend_for_user_fast(u_idx: int, alpha: float = 0.6, seeds: int = 5, final_k: int = 10):
    row = R[u_idx]
    if row.nnz == 0: return []
    rated_items, rated_vals = row.indices, row.data
    seed_items = rated_items[np.argsort(-rated_vals)[:max(1, min(seeds, rated_items.size))]]

    cand_ct, cand_cf = {}, {}
    for it in seed_items:
        idx_ct, sim_ct = get_ct_neighbors(int(it))
        for j, s in zip(idx_ct, sim_ct):
            if j in rated_items: continue
            cand_ct[j] = max(cand_ct.get(j, 0.0), float(s))
        idx_cf, sim_cf = get_cf_neighbors(int(it))
        for j, s in zip(idx_cf, sim_cf):
            if j in rated_items: continue
            cand_cf[j] = max(cand_cf.get(j, 0.0), float(s))

    ids = set(cand_ct) | set(cand_cf)
    if not ids: return []

    # normalize scores to [0,1]
    def norm(d):
        if not d: return {}
        v = np.fromiter(d.values(), dtype=float)
        lo, hi = v.min(), v.max()
        if hi <= lo: return {k:0.0 for k in d}
        return {k:(val-lo)/(hi-lo) for k, val in d.items()}

    ct_n, cf_n = norm(cand_ct), norm(cand_cf)
    scores = {j:(1-alpha)*ct_n.get(j,0.0) + alpha*cf_n.get(j,0.0) for j in ids}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:final_k]

In [None]:
# compute F1 given precision and recall
def f1_from_pr(precision, recall):
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# top-K metrics including F1
def evaluate_topk_fast_with_f1(train_df, test_df, k=10, alpha=0.6, sample_users=100):
    users = test_df["u"].unique()
    if len(users) > sample_users:
        users = np.random.default_rng(42).choice(users, size=sample_users, replace=False)

    # metric collectors
    precs, recs, f1s, hits, covered = [], [], [], [], set()
    for u in users:
        gt = set(test_df.loc[test_df["u"]==u, "i"].tolist())
        preds = recommend_for_user_fast(int(u), alpha=alpha, final_k=k)
        covered.update([i for i,_ in (preds or [])])
        if gt:
            pred_k = [i for i,_ in preds[:k]]
            hitset = set(pred_k) & gt
            p = len(hitset)/k
            r = len(hitset)/max(1, len(gt))
            precs.append(p); recs.append(r)
            f1s.append(f1_from_pr(p, r))
            hits.append(1.0 if hitset else 0.0)

    # aggregate metrics across sampled users
    return {
        "precision@k": float(np.mean(precs) if precs else 0.0),
        "recall@k":    float(np.mean(recs) if recs else 0.0),
        "f1@k":        float(np.mean(f1s)  if f1s  else 0.0),
        "hit_rate":    float(np.mean(hits) if hits else 0.0),
        "coverage":    float(len(covered) / (R.shape[1] or 1))
    }

# quick run
evaluate_topk_fast_with_f1(train, test, k=10, alpha=0.6, sample_users=100)

{'precision@k': 0.026000000000000002,
 'recall@k': 0.26,
 'f1@k': 0.047272727272727265,
 'hit_rate': 0.26,
 'coverage': 0.012801732373734158}

In [None]:
def evaluate_topk_fast(train_df, test_df, k=10, alpha=0.6, sample_users=100):
    users = test_df["u"].unique()
    if len(users) > sample_users:
        users = np.random.default_rng(42).choice(users, size=sample_users, replace=False)
    precs, recs, hits, covered = [], [], [], set()
    for u in users:
        gt = set(test_df.loc[test_df["u"]==u, "i"].tolist())
        preds = recommend_for_user_fast(int(u), alpha=alpha, final_k=k)
        covered.update([i for i,_ in (preds or [])])
        if gt:
            pred_k = [i for i,_ in preds[:k]]
            hitset = set(pred_k) & gt
            precs.append(len(hitset)/k)
            recs.append(len(hitset)/max(1,len(gt)))
            hits.append(1.0 if hitset else 0.0)
    # return averaged metrics across users
    return {
        "precision@k": float(np.mean(precs) if precs else 0.0),
        "recall@k":    float(np.mean(recs) if recs else 0.0),
        "hit_rate":    float(np.mean(hits) if hits else 0.0),
        "coverage":    float(len(covered) / (R.shape[1] or 1))
    }

# quick sweep
alphas = [0.0, 0.25, 0.5, 0.75, 1.0]
alpha_scores = [(a, evaluate_topk_fast(train, test, k=10, alpha=a, sample_users=100)) for a in alphas]
alpha_scores

BEST_ALPHA = max(alpha_scores, key=lambda t: t[1]["precision@k"])[0]
print("BEST_ALPHA:", BEST_ALPHA)

BEST_ALPHA: 0.25


In [None]:
# popularity fallback (train only)
pop_scores = np.array((R != 0).sum(axis=0)).ravel()
popular_items = np.argsort(-pop_scores)
title_lookup = items.set_index("i")["title"].to_dict()

def recommend(u_idx: int, k: int = 10, mode: str = "hybrid", alpha: float = None):
    if alpha is None: alpha = BEST_ALPHA if 'BEST_ALPHA' in globals() else 0.6
    if mode == "popular" or u_idx < 0 or u_idx >= R.shape[0] or R[u_idx].nnz == 0:
        top = [(int(i), float(pop_scores[i])) for i in popular_items[:k]]
    elif mode == "content":
        top = recommend_for_user_fast(u_idx, alpha=0.0, final_k=k)
    elif mode == "cf":
        top = recommend_for_user_fast(u_idx, alpha=1.0, final_k=k)
    else:
        top = recommend_for_user_fast(u_idx, alpha=alpha, final_k=k) or \
              [(int(i), float(pop_scores[i])) for i in popular_items[:k]]
    return [(i, title_lookup.get(i, f"item_{i}"), float(s)) for i, s in (top or [])]

# demo: show 5 recs for a random user
u_demo = int(interactions["u"].sample(1, random_state=7).iloc[0])
recommend(u_demo, k=5, mode="hybrid")


[(np.int64(5695), 'mountains of spices', 0.75),
 (np.int64(13660), 'the collected beowulf', 0.2929676294511777),
 (np.int64(61232), 'symbolic interactionism: perspective and method', 0.25),
 (np.int64(54613), 'the positive power of jesus christ', 0.2330634337978207),
 (np.int64(19925), "god's psychiatry", 0.21434733211052745)]

In [None]:
import numpy as np, pandas as pd, os, json

os.makedirs("artifacts", exist_ok=True)

# reprint final metrics
final_metrics = evaluate_topk_fast(train, test, k=10, alpha=BEST_ALPHA if 'BEST_ALPHA' in globals() else 0.6, sample_users=50)
print("Final Top-K (small sample):", final_metrics)

# save a JSON with settings + metrics
summary = {
    "BEST_ALPHA": float(BEST_ALPHA) if 'BEST_ALPHA' in globals() else 0.6,
    "metrics_sample_users": 50,
    "topk": 10,
    "final_metrics": final_metrics
}
with open("artifacts/summary.json", "w") as f:
    json.dump(summary, f, indent=2)

# export a small recommendations sample (<= 300 rows)
rng = np.random.default_rng(42)
users_pool = test["u"].unique()
sample_n = int(min(30, len(users_pool)))   # 30 users max
users_pick = rng.choice(users_pool, size=sample_n, replace=False)

rows = []
for u in users_pick:
    for i, title, s in recommend(int(u), k=10, mode="hybrid"):
        rows.append({"u": int(u), "i": int(i), "title": title, "score": round(float(s), 6)})
recs_df = pd.DataFrame(rows)
recs_df.to_csv("artifacts/recommendations_sample.csv", index=False)
print("Saved artifacts/recommendations_sample.csv with", len(recs_df), "rows")

# save an items lookup
items.loc[:, ["i","title","title_key"]].to_csv("artifacts/items_lookup.csv", index=False)
print("Saved artifacts/items_lookup.csv")

# quick qualitative check helpers
def similar_titles(query: str, k: int = 5):
    m = items[items["title"].str.contains(query, case=False, na=False)]
    if m.empty:
        print("No titles matching:", query); return []
    i0 = int(m.iloc[0]["i"])
    idx_ct, sim_ct = get_ct_neighbors(i0)   # cached + fast
    k = min(k, len(idx_ct))
    out = []
    for j, s in zip(idx_ct[:k], sim_ct[:k]):
        out.append((int(j), items.loc[items["i"]==j, "title"].values[0], float(s)))
    return out


# demo: show 5 recs for one random user (fast)
u_demo = int(rng.choice(interactions["u"].unique()))
print("\nHybrid recommendations for user", u_demo)
for i, title, score in recommend(u_demo, k=5, mode="hybrid"):
    print(f"  - {title} (i={i}) score={score:.3f}")


Final Top-K (small sample): {'precision@k': 0.018000000000000002, 'recall@k': 0.18, 'hit_rate': 0.18, 'coverage': 0.007292529138271448}
Saved artifacts/recommendations_sample.csv with 300 rows
Saved artifacts/items_lookup.csv

Hybrid recommendations for user 115202
  - the art of jewish cooking (i=30081) score=0.750
  - the mensch chef: or why delicious jewish food isn't an oxymoron (i=39312) score=0.662
  - the complete american-jewish cookbook: in accordance with the jewish dietary laws (i=20867) score=0.598
  - art of jewish cooking (i=50520) score=0.533
  - joan nathan's jewish holiday cookbook (i=53311) score=0.471


In [None]:
def add_f1(metrics_dict):
    m = dict(metrics_dict)
    p, r = float(m.get("precision@k", 0.0)), float(m.get("recall@k", 0.0))
    m["F1@k"] = 0.0 if (p + r) == 0 else 2 * p * r / (p + r)
    return m

# re-run small-sample eval and print with F1
m_small = evaluate_topk_fast(train, test, k=10, alpha=0.6, sample_users=50)
print("Final Top-K (small sample):", add_f1(m_small))

Final Top-K (small sample): {'precision@k': 0.016, 'recall@k': 0.16, 'hit_rate': 0.16, 'coverage': 0.006337175976052481, 'F1@k': 0.029090909090909094}


In [None]:
# human-readable recommendations for ONE user
# requires: recommend_for_user_fast, items (has 'i','title'), interactions, uid_map

# inverse map (index -> original user_id)
inv_uid_map = {v:k for k,v in uid_map.items()}

def recommend_df_for_user(u_idx: int, k: int = 10, alpha: float = 0.6):
    preds = recommend_for_user_fast(u_idx, alpha=alpha, final_k=k) or []
    rows = []
    for rank, (i, score) in enumerate(preds, start=1):
        title = items.loc[items["i"]==i, "title"].values[0] if (items["i"]==i).any() else f"item_{i}"
        rows.append({
            "u_idx": int(u_idx),
            "user_id": inv_uid_map.get(int(u_idx), None),
            "i": int(i),
            "title": title,
            "score": float(score),
            "rank": rank
        })
    import pandas as pd
    return pd.DataFrame(rows, columns=["u_idx","user_id","i","title","score","rank"])

# pick a random test user and preview top-10 recs
u_demo = int(test["u"].sample(1, random_state=7).iloc[0])
df_user_recs = recommend_df_for_user(u_demo, k=10, alpha=0.6)
df_user_recs.head(10)

Unnamed: 0,u_idx,user_id,i,title,score,rank
0,46856,A2BT108J266Y8G,51704,"brave new world,: a novel",0.987923,1
1,46856,A2BT108J266Y8G,39432,pilgrimage and exile: mother marlanne of molokai,0.534552,2
2,46856,A2BT108J266Y8G,19480,jacob's ladder:: wisdom for the heart's ascent...,0.534552,3
3,46856,A2BT108J266Y8G,25403,benedict xvi: way of the cross,0.534552,4
4,46856,A2BT108J266Y8G,2205,"understanding ""our father"": biblical reflectio...",0.534552,5
5,46856,A2BT108J266Y8G,10467,in the eyes of anahita: an adventure in search...,0.534552,6
6,46856,A2BT108J266Y8G,39968,the north star,0.533704,7
7,46856,A2BT108J266Y8G,46968,pocket handbook of christian apologetics,0.530967,8
8,46856,A2BT108J266Y8G,40014,tales of adam,0.530165,9
9,46856,A2BT108J266Y8G,54041,the ransomed heart: a collection of devotional...,0.529683,10


In [None]:
# batch export: recommendations for a SAMPLE of test users
import numpy as np, pandas as pd, os
os.makedirs("artifacts", exist_ok=True)

def export_recommendations(sample_users: int = 200, k: int = 10, alpha: float = 0.6,
                           filename: str = "artifacts/recommendations_final.csv"):
    rng = np.random.default_rng(42)
    users_pool = np.array(test["u"].unique())
    if len(users_pool) > sample_users:
        users_pick = rng.choice(users_pool, size=sample_users, replace=False)
    else:
        users_pick = users_pool

    all_rows = []
    for u in users_pick:
        preds = recommend_for_user_fast(int(u), alpha=alpha, final_k=k) or []
        for rank, (i, score) in enumerate(preds, start=1):
            title = items.loc[items["i"]==i, "title"].values[0] if (items["i"]==i).any() else f"item_{i}"
            all_rows.append({
                "u_idx": int(u),
                "user_id": inv_uid_map.get(int(u), None),
                "i": int(i),
                "title": title,
                "score": float(score),
                "rank": rank
            })

    recs_df = pd.DataFrame(all_rows, columns=["u_idx","user_id","i","title","score","rank"])
    recs_df.to_csv(filename, index=False)
    print(f"Saved {len(recs_df)} rows to {filename}")
    return recs_df

# run export
recs_final = export_recommendations(sample_users=200, k=10, alpha=0.6)
recs_final.head(20)


Saved 1960 rows to artifacts/recommendations_final.csv


Unnamed: 0,u_idx,user_id,i,title,score,rank
0,62432,A1SCBY1NHJC4BD,20345,the book of mormon another testament of jesus ...,0.892193,1
1,62432,A1SCBY1NHJC4BD,54127,answering mormons questions,0.4,2
2,62432,A1SCBY1NHJC4BD,39091,the lives and travels of mormon & moroni,0.391014,3
3,62432,A1SCBY1NHJC4BD,59539,mormon claims answered,0.378457,4
4,62432,A1SCBY1NHJC4BD,20789,the counterfeit gospel of mormonism: the great...,0.370176,5
5,62432,A1SCBY1NHJC4BD,18765,rich dad's guide to becoming rich...without cu...,0.364045,6
6,62432,A1SCBY1NHJC4BD,60167,one-minute answers to anti-mormon questions,0.360886,7
7,62432,A1SCBY1NHJC4BD,27784,"the mormon murders: a true story of greed, for...",0.354382,8
8,62432,A1SCBY1NHJC4BD,2923,mormonism for dummies (for dummies (religion &...,0.352702,9
9,62432,A1SCBY1NHJC4BD,11581,studies of the book of mormon,0.350969,10
