# Recommendation System — Train / Inference Evaluation

This notebook tests all models in the new separated train-once / infer-many framework:

1. Load data and sample a random cutoff day
2. **Training phase** — fit all models on pre-cutoff data and save artifacts to `models/`
3. **Inference phase** — reload models from disk (simulates a fresh server / daily job)
4. **Evaluate** — compare all models offline using post-cutoff interactions as ground truth
5. **Results** — summary table and metric chart

In [3]:
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Resolve project root regardless of kernel working directory
if os.path.basename(os.getcwd()) == 'notebooks':
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
else:
    PROJECT_ROOT = os.getcwd()

if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# ── Model classes ──────────────────────────────────────────────────────────
from functions.ials_implicit_package import IALSImplicitRecommender
from functions.interaction_matrix import InteractionMatrixBuilder
from functions.item_item_knn import ItemItemRecommenderKnn
from functions.model_store import list_saved_models, load_model, save_model
from functions.rs_baseline_models import (
    CooccurrenceRecommender,
    PopularityRecommender,
    TrendingRecommender,
)
from functions import rs_evaluation as rsev

DATA_PATH = os.path.join(PROJECT_ROOT, 'data', 'Home_and_Kitchen_filtered.csv')
MODELS_DIR = os.path.join(PROJECT_ROOT, 'models')

print('Project root :', PROJECT_ROOT)
print('Data path    :', DATA_PATH)
print('Models dir   :', MODELS_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Project root : /Users/vachemacbook/Desktop/RecSystem/RecSystem
Data path    : /Users/vachemacbook/Desktop/RecSystem/RecSystem/data/Home_and_Kitchen_filtered.csv
Models dir   : /Users/vachemacbook/Desktop/RecSystem/RecSystem/models


## 1. Load Data & Sample Cutoff Day

In [4]:
df = pd.read_csv(DATA_PATH)
df = df.sort_values('unixReviewTime').reset_index(drop=True)
df['review_dt'] = pd.to_datetime(df['unixReviewTime'], unit='s')

print(f'Loaded {len(df):,} interactions')
print(f'Time span : {df["review_dt"].min().date()}  →  {df["review_dt"].max().date()}')
print(f'Unique users : {df["reviewerID"].nunique():,}')
print(f'Unique items : {df["asin"].nunique():,}')

  df = pd.read_csv(DATA_PATH)


Loaded 6,898,955 interactions
Time span : 2000-05-02  →  2018-10-04
Unique users : 777,242
Unique items : 189,172


In [5]:
# ── Sample a random cutoff day in the middle of the timeline ───────────────
# We work in whole days so the boundary is clean and easy to reproduce.

def sample_cutoff_day(df, random_state=42, lower_q=0.3, upper_q=0.7):
    """Pick a random day between the given quantiles of the unix timestamp distribution."""
    rng = np.random.default_rng(random_state)
    ts = df['unixReviewTime'].to_numpy()
    low, high = np.quantile(ts, lower_q), np.quantile(ts, upper_q)
    days = np.unique((ts[(ts >= low) & (ts <= high)] // 86_400) * 86_400)
    cutoff_unix = int(rng.choice(days))
    return cutoff_unix, pd.to_datetime(cutoff_unix, unit='s')


CUTOFF_UNIX, CUTOFF_TS = sample_cutoff_day(df)

df_train = df[df['unixReviewTime'] < CUTOFF_UNIX].copy()
df_test  = df[df['unixReviewTime'] > CUTOFF_UNIX].copy()

train_users = set(df_train['reviewerID'].unique())
test_users  = set(df_test['reviewerID'].unique())
ALL_EVAL_USERS = sorted(train_users & test_users)

# ── Subsample eval users for fast iteration ────────────────────────────────
MAX_EVAL_USERS = 2_000
rng_eval = np.random.default_rng(0)
EVAL_USERS = (
    list(rng_eval.choice(ALL_EVAL_USERS, size=MAX_EVAL_USERS, replace=False))
    if len(ALL_EVAL_USERS) > MAX_EVAL_USERS
    else ALL_EVAL_USERS
)

print(f'Cutoff day       : {CUTOFF_TS.date()}  (unix={CUTOFF_UNIX})')
print(f'Train interactions: {len(df_train):,}')
print(f'Test  interactions: {len(df_test):,}')
print(f'Eval users (full) : {len(ALL_EVAL_USERS):,}')
print(f'Eval users (sample): {len(EVAL_USERS):,}')

Cutoff day       : 2015-05-28  (unix=1432771200)
Train interactions: 2,251,903
Test  interactions: 4,643,855
Eval users (full) : 423,925
Eval users (sample): 2,000


## 2. Training Phase

Fit every model on pre-cutoff data, then persist to `models/`.  
In production this entire section runs once per day as a batch job.

In [6]:
# ── Hyperparameters (lightweight defaults for quick testing) ───────────────
KNN_K          = 30
KNN_MIN_USERS  = 10
IALS_FACTORS   = 32
IALS_EPOCHS    = 10
IALS_ALPHA     = 40
MATRIX_MIN_USERS = 10

In [7]:
# ── 1. Popularity ──────────────────────────────────────────────────────────
t = time.time()
pop_model = PopularityRecommender().fit(df_train, CUTOFF_UNIX)
save_model(pop_model, 'popularity', models_dir=MODELS_DIR)
print(f'Popularity trained and saved  ({time.time()-t:.1f}s)')

Saved 'popularity' → /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/popularity.joblib
Popularity trained and saved  (2.5s)


In [8]:
# ── 2. Trending ────────────────────────────────────────────────────────────
t = time.time()
trend_model = TrendingRecommender(n_days=7).fit(df_train, CUTOFF_UNIX)
save_model(trend_model, 'trending', models_dir=MODELS_DIR)
print(f'Trending trained and saved  ({time.time()-t:.1f}s)')

Saved 'trending' → /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/trending.joblib
Trending trained and saved  (3.3s)


In [9]:
# ── 3. Co-occurrence ───────────────────────────────────────────────────────
t = time.time()
cooc_model = CooccurrenceRecommender().fit(df_train, CUTOFF_UNIX)
save_model(cooc_model, 'cooccurrence', models_dir=MODELS_DIR)
print(f'Co-occurrence trained and saved  ({time.time()-t:.1f}s)')

Saved 'cooccurrence' → /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/cooccurrence.joblib
Co-occurrence trained and saved  (8.1s)


In [10]:
# ── 4. Item-Item kNN ───────────────────────────────────────────────────────
t = time.time()
knn_model = ItemItemRecommenderKnn(k=KNN_K, min_users=KNN_MIN_USERS, shrinkage=10).fit(
    df_train, cutoff_time=CUTOFF_UNIX
)
save_model(knn_model, 'item_knn', models_dir=MODELS_DIR)
print(f'Item-Item kNN trained and saved  ({time.time()-t:.1f}s)')

Saved 'item_knn' → /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/item_knn.joblib
Item-Item kNN trained and saved  (13.7s)


In [11]:
# ── 5. IALS ────────────────────────────────────────────────────────────────
t = time.time()
builder = InteractionMatrixBuilder(min_users=MATRIX_MIN_USERS)
train_matrix = builder.build(df_train)
print(f'Interaction matrix: {train_matrix.shape[0]:,} users × {train_matrix.shape[1]:,} items')

ials_model = IALSImplicitRecommender(
    factors=IALS_FACTORS, regularization=0.01, alpha=IALS_ALPHA,
    epochs=IALS_EPOCHS, use_gpu=False
).fit(train_matrix, builder.item_map, builder.items, builder.user_map, builder.users)
save_model(ials_model, 'ials', models_dir=MODELS_DIR)
print(f'IALS trained and saved  ({time.time()-t:.1f}s)')

Interaction matrix: 462,223 users × 41,001 items


  check_blas_config()
100%|██████████| 10/10 [01:06<00:00,  6.61s/it]


Saved 'ials' → /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/ials.joblib
IALS trained and saved  (78.8s)


In [12]:
print('Saved artifacts in models/:')
for name in list_saved_models(MODELS_DIR):
    print(f'  {name}.joblib')

Saved artifacts in models/:
  cooccurrence.joblib
  ials.joblib
  item_knn.joblib
  popularity.joblib
  trending.joblib


## 3. Inference Phase

Load each model from disk — this is what a production server does once at startup.  
From this point on, no raw data is needed; every `recommend()` call is a fast in-memory lookup.

In [13]:
# Load all five models from their joblib artifacts
pop_inf   = load_model(PopularityRecommender,   'popularity',   models_dir=MODELS_DIR)
trend_inf = load_model(TrendingRecommender,     'trending',     models_dir=MODELS_DIR)
cooc_inf  = load_model(CooccurrenceRecommender, 'cooccurrence', models_dir=MODELS_DIR)
knn_inf   = load_model(ItemItemRecommenderKnn,  'item_knn',     models_dir=MODELS_DIR)
ials_inf  = load_model(IALSImplicitRecommender, 'ials',         models_dir=MODELS_DIR)

print('All models loaded. Ready to serve.')

Loaded 'popularity' from /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/popularity.joblib
Loaded 'trending' from /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/trending.joblib
Loaded 'cooccurrence' from /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/cooccurrence.joblib
Loaded 'item_knn' from /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/item_knn.joblib
Loaded 'ials' from /Users/vachemacbook/Desktop/RecSystem/RecSystem/models/ials.joblib
All models loaded. Ready to serve.


In [16]:
# Quick smoke-test: generate recommendations for one known user
demo_user = EVAL_USERS[0]
demo_hist = df_train[df_train['reviewerID'] == demo_user]['asin'].tolist()

print(f'Demo user : {demo_user}')
print(f'History   : {len(demo_hist)} items')
print()
print('Popularity   :', pop_inf.recommend(demo_hist, n=5))
print('Trending     :', trend_inf.recommend(demo_hist, n=5))
print('Co-occurrence:', cooc_inf.recommend(demo_hist, n=5))
print('Item-kNN     :', knn_inf.recommend(demo_hist, top_n=5))
print('IALS         :', ials_inf.recommend(demo_user, demo_hist, top_n=5))

Demo user : A1WMZ7LN1AH4XW
History   : 6 items

Popularity   : ['B0015TMHSI', 'B000ZK5UT6', 'B000YGEVMI', 'B0017XHSAE', 'B00902X68W']
Trending     : ['B00GRAEZNK', 'B00SV0EF4S', 'B00PRDN64M', 'B00R3Z49G6', 'B00T1Q2KRU']
Co-occurrence: ['B000WHE5PC', 'B0014CX87U', 'B00WMK5X08', 'B000RO0TUU', 'B001LGWMHG']
Item-kNN     : ['B003K251HW', 'B00JTSEENS', 'B0090E4HD8', 'B00CZ2OJJO', 'B005OCZT1Y']
IALS         : ['B003A2IDMC', 'B0014X7ARI', 'B00007E7RY', 'B000ZK5UT6', 'B000H0SDD4']


## 4. Offline Evaluation

**Protocol:** train on pre-cutoff interactions → recommend → check whether recommended items
appear in each user's post-cutoff interactions.  
Metrics computed per user then averaged: Precision@K, Recall@K, Hit@K, NDCG@K.

In [17]:
# Pre-compute user histories and future interactions (done once, not per model)
user_history_map = (
    df_train.groupby('reviewerID')['asin']
    .apply(list)
    .to_dict()
)
user_future_map = (
    df_test.groupby('reviewerID')['asin']
    .apply(lambda s: list(set(s)))
    .to_dict()
)

print(f'Users with history: {len(user_history_map):,}')
print(f'Users with future interactions: {len(user_future_map):,}')

Users with history: 484,945
Users with future interactions: 716,201


In [18]:
def evaluate_model(model_name, recommend_fn, eval_users, user_history_map, user_future_map, k=10):
    """Evaluate a model over a set of users, returning averaged metrics.

    recommend_fn signature: (user_id: str, history: list, k: int) -> list[str]
    """
    precs, recs, hits, ndcgs = [], [], [], []

    for uid in eval_users:
        hist   = user_history_map.get(uid, [])
        future = user_future_map.get(uid, [])
        if not future:
            continue

        recs_u = recommend_fn(uid, hist, k)

        precs.append(rsev.calculate_precision_at_k(recs_u, future, k))
        recs.append(rsev.calculate_recall_at_k(recs_u, future, k))
        hits.append(rsev.calculate_hit_at_k(recs_u, future, k))
        ndcgs.append(rsev.calculate_ndcg_at_k(recs_u, future, k))

    avg = lambda xs: float(np.mean(xs)) if xs else 0.0
    return {
        'model':       model_name,
        'k':           k,
        'precision@k': avg(precs),
        'recall@k':    avg(recs),
        'hit@k':       avg(hits),
        'ndcg@k':      avg(ndcgs),
        'n_users':     len(precs),
    }

In [19]:
# Wrap each loaded model into the unified (user_id, history, k) -> recs interface.
# Models that are purely item-based just ignore user_id.

MODELS_TO_EVAL = {
    'popularity':    lambda uid, hist, k: pop_inf.recommend(hist, n=k),
    'trending':      lambda uid, hist, k: trend_inf.recommend(hist, n=k),
    'cooccurrence':  lambda uid, hist, k: cooc_inf.recommend(hist, n=k),
    'item_knn':      lambda uid, hist, k: knn_inf.recommend(hist, top_n=k),
    'ials':          lambda uid, hist, k: ials_inf.recommend(uid, hist, top_n=k),
}

In [20]:
K = 10
all_results = []

for model_name, rec_fn in MODELS_TO_EVAL.items():
    t = time.time()
    result = evaluate_model(
        model_name, rec_fn, EVAL_USERS,
        user_history_map, user_future_map, k=K
    )
    elapsed = time.time() - t
    all_results.append(result)
    print(
        f"{model_name:<15}  "
        f"hit@{K}={result['hit@k']:.4f}  "
        f"ndcg@{K}={result['ndcg@k']:.4f}  "
        f"prec@{K}={result['precision@k']:.4f}  "
        f"recall@{K}={result['recall@k']:.4f}  "
        f"({elapsed:.1f}s, n={result['n_users']:,})"
    )

popularity       hit@10=0.0140  ndcg@10=0.0027  prec@10=0.0014  recall@10=0.0031  (20.7s, n=2,000)
trending         hit@10=0.0020  ndcg@10=0.0004  prec@10=0.0002  recall@10=0.0004  (16.6s, n=2,000)
cooccurrence     hit@10=0.0195  ndcg@10=0.0034  prec@10=0.0022  recall@10=0.0038  (119.1s, n=2,000)
item_knn         hit@10=0.0115  ndcg@10=0.0020  prec@10=0.0012  recall@10=0.0027  (13.1s, n=2,000)
ials             hit@10=0.0160  ndcg@10=0.0025  prec@10=0.0019  recall@10=0.0021  (11.4s, n=2,000)


## 5. Results Summary

In [21]:
results_df = pd.DataFrame(all_results).set_index('model')
display_cols = ['hit@k', 'ndcg@k', 'precision@k', 'recall@k', 'n_users']

summary = results_df[display_cols].copy()
for col in ['hit@k', 'ndcg@k', 'precision@k', 'recall@k']:
    best = summary[col].max()
    summary[col] = summary[col].apply(
        lambda v: f'{v:.4f} ◀' if v == best else f'{v:.4f}'
    )
summary['n_users'] = results_df['n_users'].apply(lambda v: f'{v:,}')

print(summary.to_string())

ImportError: Missing optional dependency 'Jinja2'. DataFrame.style requires jinja2. Use pip or conda to install Jinja2.

In [None]:
# Grouped bar chart — all four metrics side by side per model
metrics   = ['hit@k', 'ndcg@k', 'precision@k', 'recall@k']
models    = results_df.index.tolist()
n_models  = len(models)
n_metrics = len(metrics)

x = np.arange(n_models)
width = 0.18
offsets = np.linspace(-(n_metrics - 1) / 2, (n_metrics - 1) / 2, n_metrics) * width

fig, ax = plt.subplots(figsize=(11, 5))
for i, (metric, offset) in enumerate(zip(metrics, offsets)):
    values = results_df[metric].values
    bars = ax.bar(x + offset, values, width, label=metric)
    for bar, val in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.001,
            f'{val:.3f}', ha='center', va='bottom', fontsize=7
        )

ax.set_xticks(x)
ax.set_xticklabels(models, rotation=15, ha='right')
ax.set_ylabel(f'Score  (K={K})')
ax.set_title(f'Offline Evaluation — All Models @ K={K}  (cutoff: {CUTOFF_TS.date()})')
ax.legend(loc='upper right')
ax.set_ylim(0, results_df[metrics].max().max() * 1.25)
plt.tight_layout()
plt.show()

In [None]:
# Hit@K across multiple K values — useful for understanding depth of signal
K_VALUES = [5, 10, 20, 50]

hit_at_k = {}
for model_name, rec_fn in MODELS_TO_EVAL.items():
    hit_at_k[model_name] = []
    for k in K_VALUES:
        res = evaluate_model(
            model_name, rec_fn, EVAL_USERS,
            user_history_map, user_future_map, k=k
        )
        hit_at_k[model_name].append(res['hit@k'])

fig, ax = plt.subplots(figsize=(8, 5))
for model_name, values in hit_at_k.items():
    ax.plot(K_VALUES, values, marker='o', label=model_name)

ax.set_xlabel('K')
ax.set_ylabel('Hit@K')
ax.set_title(f'Hit@K vs K  (cutoff: {CUTOFF_TS.date()})')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()