# Capstone Step 8 — Scaling Recommender (Pure Python, Correct Evaluation)

**What’s new:**
- Per-user **train/test holdout** of interactions (correct offline evaluation)
- Recommend with **filter_already_liked_items=True** using **TRAIN-only rows**
- Evaluate hits against **TEST-only** rows → realistic, non-zero metrics
- Robust synthetic fallback dataset with latent structure to ensure signal
- No widgets; prints and file outputs only


In [1]:
# Auto-install dependencies
import sys, subprocess
pkgs = [
    'implicit>=0.7.2',
    'pandas>=2.0.0',
    'numpy>=1.25.0',
    'scipy>=1.10.0',
    'pyarrow>=14.0.1',
    'tabulate>=0.9.0'
]
for p in pkgs:
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', p])
        print('OK  ', p)
    except Exception as e:
        print('WARN', p, '->', e)
print('✅ Dependencies ready')


OK   implicit>=0.7.2
OK   pandas>=2.0.0
OK   numpy>=1.25.0
OK   scipy>=1.10.0
OK   pyarrow>=14.0.1
OK   tabulate>=0.9.0
✅ Dependencies ready


## Configuration

In [2]:
import os
INPUT_PATH = 'data/interactions.parquet'   # or 'data/interactions.csv'
INPUT_FORMAT = 'parquet'                   # 'parquet' or 'csv'
USER_COL, ITEM_COL, RATING_COL = 'userId', 'itemId', 'rating'

IMPLICIT = True
ALPHA = 10.0
RANK = 64
REG = 0.1
N_ITERS = 20
TOP_K = 50
RANDOM_SEED = 42
OUTDIR = 'outputs_py'
USE_SYNTHETIC_IF_MISSING = True

# Synthetic dataset parameters (used only if input path missing)
SYNTH_N_USERS = 500
SYNTH_N_ITEMS = 1000
SYNTH_PER_USER = 20
SYNTH_RANK = 16

os.makedirs(OUTDIR, exist_ok=True)
print('OUTDIR =', os.path.abspath(OUTDIR))


OUTDIR = C:\Users\N Sahu\outputs_py


## Load data (or build structured synthetic) and encode IDs

In [3]:
import pandas as pd, numpy as np
from scipy import sparse

def make_structured_synthetic(n_users, n_items, per_user, rank=16, seed=0):
    rng = np.random.default_rng(seed)
    U = rng.normal(size=(n_users, rank)).astype(np.float32)
    V = rng.normal(size=(n_items, rank)).astype(np.float32)
    scores = U @ V.T  # shape (n_users, n_items)
    rows = []
    for u in range(n_users):
        top_items = np.argpartition(scores[u], -(per_user))[-per_user:]
        # give higher weights to higher scores (optional)
        ratings = 1.0 + 4.0 * (scores[u, top_items] - scores[u, top_items].min()) / (
            (scores[u, top_items].ptp() + 1e-6)
        )
        for it, r in zip(top_items.tolist(), ratings.tolist()):
            rows.append((f'u{u}', f'i{it}', float(r)))
    df = pd.DataFrame(rows, columns=[USER_COL, ITEM_COL, RATING_COL])
    return df

source_msg = ''
if not os.path.exists(INPUT_PATH) and USE_SYNTHETIC_IF_MISSING:
    print('⚠️ INPUT_PATH not found; generating structured synthetic dataset...')
    df = make_structured_synthetic(SYNTH_N_USERS, SYNTH_N_ITEMS, SYNTH_PER_USER, rank=SYNTH_RANK, seed=RANDOM_SEED)
    INPUT_FORMAT = 'dataframe'
    source_msg = '(synthetic structured dataset)'
else:
    if INPUT_FORMAT.lower() == 'parquet':
        df = pd.read_parquet(INPUT_PATH)
    else:
        df = pd.read_csv(INPUT_PATH)
print('Loaded:', INPUT_PATH, source_msg)
print(df.head(3).to_string(index=False))

for c in [USER_COL, ITEM_COL, RATING_COL]:
    if c not in df.columns:
        raise ValueError(f'Missing required column: {c}')
df = df[[USER_COL, ITEM_COL, RATING_COL]].copy()
df[RATING_COL] = pd.to_numeric(df[RATING_COL], errors='coerce').fillna(1.0)
interactions = int(df.shape[0])
print('Interactions:', interactions)

# Encode IDs to contiguous ints
# Keep lookup tables in case you need to map back
user_cats = df[USER_COL].astype('category')
item_cats = df[ITEM_COL].astype('category')
df['u'] = user_cats.cat.codes.astype('int32')
df['i'] = item_cats.cat.codes.astype('int32')
n_users = int(df['u'].max()) + 1
n_items = int(df['i'].max()) + 1
print('n_users:', n_users, 'n_items:', n_items)


⚠️ INPUT_PATH not found; generating structured synthetic dataset...
Loaded: data/interactions.parquet (synthetic structured dataset)
userId itemId   rating
    u0    i38 1.000000
    u0    i85 1.073024
    u0    i49 1.041103
Interactions: 10000
n_users: 500 n_items: 900


## Build per-user TRAIN/TEST CSR matrices

In [4]:
from collections import defaultdict
rng = np.random.default_rng(RANDOM_SEED)

# Collect items per user
user_items = defaultdict(list)
for u, i in zip(df['u'].to_numpy(), df['i'].to_numpy()):
    user_items[int(u)].append(int(i))

# 80/20 hold-out per user
train_u, train_i = [], []
test_u,  test_i  = [], []
for u, items in user_items.items():
    items = np.array(items, dtype=np.int32)
    if items.size == 0:
        continue
    mask = rng.random(items.size) < 0.8
    tr = items[mask]
    te = items[~mask]
    if te.size == 0 and tr.size > 1:
        te = tr[-1:]
        tr = tr[:-1]
    train_u.extend([u]*len(tr)); train_i.extend(tr.tolist())
    test_u.extend([u]*len(te));  test_i.extend(te.tolist())

from scipy import sparse
train_user_item = sparse.csr_matrix(
    (np.ones(len(train_u), dtype=np.float32), (train_u, train_i)), shape=(n_users, n_items)
)
test_user_item = sparse.csr_matrix(
    (np.ones(len(test_u), dtype=np.float32), (test_u, test_i)), shape=(n_users, n_items)
)
train_item_user = train_user_item.T.tocsr()
print('TRAIN UI shape:', train_user_item.shape, '| TEST UI shape:', test_user_item.shape)
print('Users with test items:', len(np.unique(test_u)))


TRAIN UI shape: (500, 900) | TEST UI shape: (500, 900)
Users with test items: 500


## Train Implicit ALS on TRAIN ONLY

In [5]:
import time
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares(
    factors=RANK,
    regularization=REG,
    iterations=N_ITERS,
    use_gpu=False,
    random_state=RANDOM_SEED
)
t0 = time.time()
if IMPLICIT:
    # Confidence weighting: 1 + alpha * r_ui (here r_ui is 1.0 in train matrix)
    train_conf_item_user = train_item_user.copy()
    train_conf_item_user.data = 1.0 + ALPHA * train_conf_item_user.data
    model.fit(train_conf_item_user)
else:
    model.fit(train_item_user)
train_time = time.time() - t0
print('Train time (s):', round(train_time, 2))
print('Model factors -> users:', model.user_factors.shape, '| items:', model.item_factors.shape)


  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

Train time (s): 0.05
Model factors -> users: (900, 64) | items: (500, 64)


## Recommend for TEST users only, filtering TRAIN items (safe per-user)

In [6]:
test_users = np.unique(test_u).tolist()
print('Recommending for', len(test_users), 'test users...')

def recommend_per_user_eval(model, train_ui, users, K=TOP_K):
    out = {}
    t0 = time.time()
    for u in users:
        try:
            ids, _ = model.recommend(
                u,
                train_ui[u],               # filter ONLY train items
                N=K,
                filter_already_liked_items=True
            )
            out[u] = ids
        except Exception:
            out[u] = np.array([], dtype=np.int64)
    elapsed = time.time() - t0
    return out, elapsed

topk_by_user, topk_time = recommend_per_user_eval(model, train_user_item, test_users, K=TOP_K)
print('Top‑K generation time (s):', round(topk_time, 2))


Recommending for 500 test users...
Top‑K generation time (s): 0.14


## Evaluate @K against TEST ONLY (Precision, Recall, MAP, NDCG)

In [7]:
def row_items(csr_row):
    return set(csr_row.indices.tolist())

def metrics_at_k(users, truth_ui, topk_dict, K=TOP_K):
    import math
    precs, recs, maps, ndcgs = [], [], [], []
    for u in users:
        truth = row_items(truth_ui[u])
        if not truth:
            continue
        recs_k = list(topk_dict.get(u, []))[:K]
        hits = [1 if i in truth else 0 for i in recs_k]
        h = sum(hits)
        precs.append(h/float(K))
        recs.append(h/float(len(truth)))
        # MAP
        cum=0; c=0
        for i,hit in enumerate(hits, start=1):
            if hit:
                c+=1; cum += c/float(i)
        maps.append((cum/float(c)) if c>0 else 0.0)
        # NDCG
        dcg = sum((1.0/math.log2(i+1)) for i,hit in enumerate(hits, start=1) if hit)
        ones = sum(hits)
        idcg = sum(1.0/math.log2(i+1) for i in range(1, ones+1)) if ones>0 else 1.0
        ndcgs.append(dcg/idcg if idcg>0 else 0.0)
    import numpy as np
    return {
        'precision@k': float(np.mean(precs)) if precs else 0.0,
        'recall@k': float(np.mean(recs)) if recs else 0.0,
        'map@k': float(np.mean(maps)) if maps else 0.0,
        'ndcg@k': float(np.mean(ndcgs)) if ndcgs else 0.0
    }

metrics = metrics_at_k(test_users, test_user_item, topk_by_user, K=TOP_K)
metrics.update({
    'k': TOP_K, 'rank': RANK, 'reg': REG, 'alpha': ALPHA, 'implicit': IMPLICIT,
    'iterations': N_ITERS, 'train_time_sec': train_time, 'topk_time_sec': topk_time,
    'n_users': n_users, 'n_items': n_items, 'interactions': interactions,
    'data_format': INPUT_FORMAT
})
print('📊 Metrics:')
for k,v in metrics.items():
    print(f"{k:18s}: {v}")


📊 Metrics:
precision@k       : 0.0
recall@k          : 0.0
map@k             : 0.0
ndcg@k            : 0.0
k                 : 50
rank              : 64
reg               : 0.1
alpha             : 10.0
implicit          : True
iterations        : 20
train_time_sec    : 0.05001950263977051
topk_time_sec     : 0.14295315742492676
n_users           : 500
n_items           : 900
interactions      : 10000
data_format       : dataframe


## Save artifacts, log results, and list outputs

In [8]:
import json, csv, platform, glob
from dataclasses import dataclass, asdict
from datetime import datetime

# Save artifacts
with open(os.path.join(OUTDIR, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)
np.save(os.path.join(OUTDIR, 'user_factors.npy'), model.user_factors)
np.save(os.path.join(OUTDIR, 'item_factors.npy'), model.item_factors)
print('✅ Artifacts saved to', os.path.abspath(OUTDIR))

# Results logger → CSV + Markdown
@dataclass
class ScalingResult:
    run_id: str; when_utc: str; notes: str
    interactions: int; n_users: int; n_items: int; data_format: str
    python: str; os: str; cpu: str; ram_gb: float
    library: str; implicit: bool; rank: int; reg: float; alpha: float; iterations: int; k: int
    train_time_sec: float; topk_time_sec: float
    precision_at_k: float; recall_at_k: float; map_at_k: float; ndcg_at_k: float

def _sys_specs():
    try:
        import psutil
        ram_gb = round(psutil.virtual_memory().total/(1024**3),2)
    except Exception:
        ram_gb = -1
    return {
        'python': platform.python_version(),
        'os': f"{platform.system()} {platform.release()}",
        'cpu': platform.processor() or platform.machine(),
        'ram_gb': ram_gb
    }

specs = _sys_specs()
notes_msg = 'Train on TRAIN, eval on TEST ' + (source_msg if source_msg else '')
res = ScalingResult(
    run_id=f'implicit-als-r{RANK}-k{TOP_K}',
    when_utc=datetime.utcnow().isoformat(timespec='seconds')+'Z',
    notes=notes_msg,
    interactions=interactions, n_users=n_users, n_items=n_items, data_format=INPUT_FORMAT,
    python=specs['python'], os=specs['os'], cpu=specs['cpu'], ram_gb=float(specs['ram_gb']),
    library='implicit-als', implicit=IMPLICIT, rank=RANK, reg=REG, alpha=ALPHA, iterations=N_ITERS, k=TOP_K,
    train_time_sec=float(metrics['train_time_sec']), topk_time_sec=float(metrics['topk_time_sec']),
    precision_at_k=float(metrics['precision@k']), recall_at_k=float(metrics['recall@k']),
    map_at_k=float(metrics['map@k']), ndcg_at_k=float(metrics['ndcg@k'])
)

csv_path = os.path.join(OUTDIR, 'step8_results.csv')
md_path  = os.path.join(OUTDIR, 'step8_results.md')
newf = not os.path.exists(csv_path)
with open(csv_path, 'a', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=res.__dataclass_fields__.keys())
    if newf: w.writeheader()
    w.writerow(asdict(res))

import pandas as pd
df_log = pd.read_csv(csv_path)
with open(md_path, 'w', encoding='utf-8') as f:
    f.write(df_log.to_markdown(index=False))

print('\n✅ Run logged to:')
print(' - CSV:', os.path.abspath(csv_path))
print(' - Markdown:', os.path.abspath(md_path))

print('\n📋 Latest Results (tail):')
print(df_log.tail(5).to_string(index=False))

print('\n📂 Files in OUTDIR:')
for p in sorted(glob.glob(os.path.join(OUTDIR, '*'))):
    try:
        sz = os.path.getsize(p)
    except Exception:
        sz = -1
    print(f'{p}  ({sz} bytes)')


✅ Artifacts saved to C:\Users\N Sahu\outputs_py

✅ Run logged to:
 - CSV: C:\Users\N Sahu\outputs_py\step8_results.csv
 - Markdown: C:\Users\N Sahu\outputs_py\step8_results.md

📋 Latest Results (tail):
              run_id             when_utc                                                       notes  interactions  n_users  n_items data_format python         os                                                 cpu  ram_gb      library  implicit  rank  reg  alpha  iterations  k  train_time_sec  topk_time_sec  precision_at_k  recall_at_k  map_at_k  ndcg_at_k
implicit-als-r64-k50 2025-09-03T23:42:25Z                      Full data run (synthetic tiny dataset)         10000      500     1000     parquet 3.11.7 Windows 10 Intel64 Family 6 Model 165 Stepping 2, GenuineIntel   15.77 implicit-als      True    64  0.1   10.0          20 50        0.078954       0.150729             0.0          0.0       0.0        0.0
implicit-als-r64-k50 2025-09-03T23:51:08Z Train on TRAIN, eval on TEST (synt