# Repeat Dataset Evaluation Analysis (Stage 9)

This notebook analyzes the repeat order evaluation results from `stage9_repeat_detailed.json`.
The pipeline predicts which vendor a user will re-order from using a two-round LLM approach:
- **Round 1**: Predict top-3 cuisines (using order history + LightGCN CF scores)
- **Round 2**: Rank candidate vendors within those cuisines

**Key Questions:**
1. How well does the pipeline predict repeat orders? (Hit@K, NDCG, MRR)
2. How often is the ground truth vendor in the candidate list?
3. How accurate is Round 1 cuisine prediction?
4. Which cuisines are hardest/easiest to predict?
5. Do temporal patterns (time of day, day of week) affect performance?
6. What characterizes failure cases?

## 1. Setup & Data Loading

In [None]:
import json
import os
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown

# Load the data.
# The results file can be overridden through the STAGE9_REPEAT_JSON environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default, so existing runs behave as before.
data_path = os.environ.get(
    'STAGE9_REPEAT_JSON',
    '/home/zhenkai/personal/Projects/AgenticRecommender/outputs/202601310621/stage9_repeat_detailed.json',
)
# Be explicit about the encoding instead of relying on the platform's locale default.
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)

# The line below indexes data[0]; fail with a clear message instead of an
# IndexError when the file holds no records.
if not data:
    raise ValueError(f"No records found in {data_path}")

print(f"Total records: {len(data)}")
print(f"Fields per record: {list(data[0].keys())}")

## 2. Data Overview

In [None]:
# Build the analysis frame: one row per evaluated sample.
df = pd.DataFrame(data)


def _position_of_cuisine(scored_cuisines, wanted):
    """Return the 1-based position of `wanted` in a list of (cuisine, score) pairs, or 0 if absent."""
    for position, (name, _score) in enumerate(scored_cuisines, start=1):
        if name == wanted:
            return position
    return 0


# Derived columns (added in the same order as before so the column listing is unchanged)
df['gt_in_candidates'] = [
    truth in candidates
    for truth, candidates in zip(df['ground_truth'], df['candidate_vendors'])
]
df['gt_cuisine_in_round1'] = [
    cuisine in predicted
    for cuisine, predicted in zip(df['ground_truth_cuisine'], df['round1_predicted_cuisines'])
]
df['history_len'] = df['order_history_tuples'].apply(len)
# Element 1 of each history tuple is used as the cuisine
df['history_cuisines'] = df['order_history_tuples'].apply(
    lambda orders: [order[1] for order in orders]
)
df['history_unique_cuisines'] = df['history_cuisines'].apply(lambda names: len(set(names)))
# 0 means the ground-truth cuisine does not appear in the LightGCN list at all
df['lightgcn_gt_cuisine_rank'] = [
    _position_of_cuisine(scored, cuisine)
    for scored, cuisine in zip(df['lightgcn_top_cuisines'], df['ground_truth_cuisine'])
]

print("Columns:", list(df.columns))
for heading, column in (
    ("\nBasic Statistics (ground_truth_rank, 0 = not found):", 'ground_truth_rank'),
    ("\nCandidate count stats:", 'candidate_count'),
):
    print(heading)
    print(df[column].describe())

In [None]:
# ── Case Lookup System ────────────────────────────────────────
# Two hashmaps for instant lookup by sample_idx or customer_id.
#
#   lookup(38)                     -> pretty-print case #38
#   case_by_idx[38]                -> raw dict for case #38
#   lookup_customer('abc...')      -> pretty-print all cases for a customer
#   cases_by_customer['abc...']    -> raw list of dicts for a customer

# Build both lookup tables in a single pass over the raw records.
#   case_by_idx:  sample_idx  -> record (a later duplicate sample_idx replaces an earlier one)
#   _by_cust:     customer_id -> records of that customer, in file order
case_by_idx = {}
_by_cust = defaultdict(list)
for record in data:
    case_by_idx[record['sample_idx']] = record
    _by_cust[record['customer_id']].append(record)

# Plain dict, so looking up an unknown customer does not create an empty entry
cases_by_customer = dict(_by_cust)

def lookup(sample_idx):
    """Print a readable report for one test case and return its raw record.

    The case is fetched from ``case_by_idx``. If no case has the given
    ``sample_idx``, a short notice is printed and ``None`` is returned.
    """
    case = case_by_idx.get(sample_idx)
    if case is None:
        print(f"No case with sample_idx={sample_idx}")
        return None

    # --- identity, ground truth and outcome ---
    print(f"Sample {case['sample_idx']}  |  Customer: {case['customer_id']}")
    gt_cuisine = case['ground_truth_cuisine']
    print(f"  Ground truth:       {case['ground_truth']}  (cuisine: {gt_cuisine})")
    print(f"  Target time:        day={case['target_day_of_week']}  hour={case['target_hour']}")
    candidates = case['candidate_vendors']
    print(f"  GT in candidates:   {case['ground_truth'] in candidates}")
    rank = case['ground_truth_rank']
    rank_note = '(NOT FOUND)' if rank == 0 else ''
    print(f"  Ground truth rank:  {rank}  {rank_note}")
    print(f"  Candidate count:    {case['candidate_count']}")

    # --- order history, flagging entries for the ground-truth vendor ---
    history = case['order_history_tuples']
    print(f"\n  Order history ({len(history)} orders):")
    for vendor, cuisine, time_str in history:
        marker = "  << GT vendor" if vendor == case['ground_truth_vendor_id'] else ""
        print(f"    {vendor}||{cuisine} ({time_str}){marker}")

    # --- Round 1 cuisine prediction ---
    predicted = case['round1_predicted_cuisines']
    print(f"\n  Round 1 predicted cuisines: {predicted}")
    print(f"  GT cuisine in R1:          {gt_cuisine in predicted}")

    # --- first five LightGCN cuisines, flagging the ground-truth cuisine ---
    print(f"\n  LightGCN top cuisines:")
    for cuisine, score in case['lightgcn_top_cuisines'][:5]:
        marker = "  << GT cuisine" if cuisine == gt_cuisine else ""
        print(f"    {cuisine}: {score:.3f}{marker}")

    # --- candidates and final ranking, each truncated to ten entries ---
    candidates_more = '...' if len(candidates) > 10 else ''
    print(f"\n  Candidates: {candidates[:10]}{candidates_more}")
    ranking = case['final_ranking']
    ranking_more = '...' if len(ranking) > 10 else ''
    print(f"  Final ranking: {ranking[:10]}{ranking_more}")

    print(f"\n  Timing: R1={case['round1_llm_ms']:.0f}ms  R2={case['round2_llm_ms']:.0f}ms  Total={case['time_ms']:.0f}ms")
    return case

def lookup_customer(customer_id):
    """Print a one-line summary of every test case of a customer and return those records.

    Cases come from ``cases_by_customer``. An unknown ``customer_id`` prints a
    notice and returns an empty list.
    """
    matches = cases_by_customer.get(customer_id, [])
    if len(matches) == 0:
        print(f"No cases for customer_id={customer_id}")
        return []

    print(f"{len(matches)} case(s) for customer {customer_id}:")
    for case in matches:
        in_candidates = case['ground_truth'] in case['candidate_vendors']
        line = (f"  idx={case['sample_idx']}  GT={case['ground_truth']}  "
                f"rank={case['ground_truth_rank']}  GT_in_cands={in_candidates}")
        print(line)
    return matches

# Report the size of each lookup table and show a few customer IDs to try.
print("\n".join([
    f"Lookup indices ready:",
    f"  case_by_idx       -> {len(case_by_idx)} cases (e.g., lookup(38))",
    f"  cases_by_customer -> {len(cases_by_customer)} unique customers (e.g., lookup_customer('...'))",
    f"\nSample customer IDs: {list(cases_by_customer.keys())[:5]}",
]))

In [None]:
# Aggregate metrics over all samples; ground_truth_rank == 0 means "not found".
found = df[df['ground_truth_rank'] > 0]
total = len(df)

rule = "=" * 60
print(rule)
print("AGGREGATE METRICS")
print(rule)

# Hit@K: share of ALL samples whose ground truth is ranked within the top K
for k in (1, 3, 5):
    hits_at_k = (found['ground_truth_rank'] <= k).sum()
    print(f"  Hit@{k}:  {hits_at_k / total * 100:.1f}%  ({hits_at_k}/{total})")

# NDCG with a single relevant item: gain 1/log2(rank + 1), or 0 when not found
ndcg_vals = [
    1.0 / np.log2(rank + 1) if rank > 0 else 0.0
    for rank in df['ground_truth_rank']
]
df['ndcg'] = ndcg_vals

# MRR: reciprocal rank, or 0 when not found
mrr_vals = []
for rank in df['ground_truth_rank']:
    mrr_vals.append(1.0 / rank if rank > 0 else 0.0)
df['mrr'] = mrr_vals

print(f"\n  MRR:    {df['mrr'].mean():.4f}")
# Rows with rank 0 pass the `<= 1` filter but carry ndcg == 0, so they add nothing to the sum
print(f"  NDCG@1: {df.loc[df['ground_truth_rank'] <= 1, 'ndcg'].sum() / total:.4f}")
for cutoff in (3, 5):
    truncated_gains = [
        1/np.log2(r+1) if 0 < r <= cutoff else 0
        for r in df['ground_truth_rank']
    ]
    print(f"  NDCG@{cutoff}: {np.mean(truncated_gains):.4f}")

n_found = len(found)
print(f"\n  GT found rate:      {n_found}/{total} ({n_found/total*100:.1f}%)")
print(f"  Avg rank (found):   {found['ground_truth_rank'].mean():.2f}")
print(f"  Avg candidates:     {df['candidate_count'].mean():.1f}")
print(f"  Avg time (ms):      {df['time_ms'].mean():.0f}")

In [None]:
# Rank distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_all, ax_found = axes


def _bar_color_for_rank(rank):
    """Red for rank 0 (not found), green for rank 1, blue for every other rank."""
    if rank == 0:
        return '#e74c3c'
    return '#2ecc71' if rank == 1 else '#3498db'


# Left panel: every sample, with rank 0 standing for "not found"
rank_counts = df['ground_truth_rank'].value_counts().sort_index()
colors = [_bar_color_for_rank(r) for r in rank_counts.index]
ax_all.bar(rank_counts.index, rank_counts.values, color=colors, edgecolor='black', alpha=0.8)
ax_all.set_xlabel('Ground Truth Rank (0 = not found)')
ax_all.set_ylabel('Count')
ax_all.set_title(f'Rank Distribution (all {len(df)} samples)')
# Dashed line between the "not found" bar and the real ranks
ax_all.axvline(x=0.5, color='red', linestyle='--', alpha=0.5)

# Right panel: only samples whose ground truth was ranked (rank > 0)
found_ranks = found['ground_truth_rank'].value_counts().sort_index()
colors2 = ['#3498db' if r != 1 else '#2ecc71' for r in found_ranks.index]
mean_found_rank = found['ground_truth_rank'].mean()
ax_found.bar(found_ranks.index, found_ranks.values, color=colors2, edgecolor='black', alpha=0.8)
ax_found.set_xlabel('Ground Truth Rank')
ax_found.set_ylabel('Count')
ax_found.set_title(f'Rank Distribution (found cases only, n={len(found)})\nMean rank: {mean_found_rank:.2f}')

plt.tight_layout()
plt.show()

# Textual breakdown of the same distribution
rank_col = df['ground_truth_rank']
not_found = (rank_col == 0).sum()
is_top1 = rank_col == 1
print(f"Rank breakdown:")
print(f"  Not found (rank 0): {not_found} ({not_found/len(df)*100:.1f}%)")
print(f"  Rank 1 (Hit@1):     {is_top1.sum()} ({is_top1.mean()*100:.1f}%)")
print(f"  Rank 2-3:           {rank_col.between(2, 3).sum()}")
print(f"  Rank 4-5:           {rank_col.between(4, 5).sum()}")
print(f"  Rank 6+:            {(rank_col > 5).sum()}")

In [None]:
# Ground truth in candidates analysis
gt_in = df[df['gt_in_candidates']]
gt_not_in = df[~df['gt_in_candidates']]

print(f"Ground Truth in Candidate Vendors:")
print(f"  In candidates:     {len(gt_in)} ({len(gt_in)/len(df)*100:.1f}%)  -> gt_in")
print(f"  NOT in candidates: {len(gt_not_in)} ({len(gt_not_in)/len(df)*100:.1f}%)  -> gt_not_in")

# Performance comparison.
# ground_truth_rank uses 0 for "not found", so a bare `rank <= k` would count
# every not-found sample as a hit. A hit is a rank in 1..k, which matches the
# `== 1` Hit@1 chart below and the `.between(1, k)` checks in later sections.
# `.mean()` on an empty subset yields NaN rather than dividing by zero.
for heading, subset in (
    ("\nPerformance when GT IS in candidates:", gt_in),
    ("\nPerformance when GT is NOT in candidates:", gt_not_in),
):
    print(heading)
    for k in [1, 3, 5]:
        hit = subset['ground_truth_rank'].between(1, k).mean()
        print(f"  Hit@{k}: {hit*100:.1f}%")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left: share of samples whose ground-truth vendor made it into the candidate list
axes[0].pie([len(gt_in), len(gt_not_in)],
       labels=['In candidates', 'Not in candidates'],
       autopct='%1.1f%%', colors=['#2ecc71', '#e74c3c'])
axes[0].set_title('Ground Truth Vendor in Candidates?')

# Right: Hit@1 comparison between the two groups
hit1_in = (gt_in['ground_truth_rank'] == 1).mean() * 100
hit1_not = (gt_not_in['ground_truth_rank'] == 1).mean() * 100
bars = axes[1].bar(['GT in candidates', 'GT not in candidates'], [hit1_in, hit1_not],
                   color=['#2ecc71', '#e74c3c'])
axes[1].set_ylabel('Hit@1 (%)')
axes[1].set_title('Hit@1 by GT Presence in Candidates')
# Value label just above each bar
for bar, val in zip(bars, [hit1_in, hit1_not]):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%', ha='center')

plt.tight_layout()
plt.show()

## 3. Round 1 Cuisine Prediction Accuracy

Round 1 predicts the top-3 cuisines. If the ground truth cuisine isn't predicted, the correct vendor can't appear in candidates.

In [None]:
# Round 1 cuisine prediction accuracy: split samples by whether the
# ground-truth cuisine is among the Round 1 predictions.
cuisine_hit_mask = df['gt_cuisine_in_round1']
r1_correct = df[cuisine_hit_mask]
r1_wrong = df[~cuisine_hit_mask]
n_samples = len(df)

print(f"Round 1 Cuisine Prediction:")
print(f"  GT cuisine in top-3: {len(r1_correct)} ({len(r1_correct)/n_samples*100:.1f}%)")
print(f"  GT cuisine missed:   {len(r1_wrong)} ({len(r1_wrong)/n_samples*100:.1f}%)")

# Even with the cuisine missed, was the GT vendor still ranked?
r1_wrong_but_found = r1_wrong[r1_wrong['ground_truth_rank'] > 0]
print(f"\n  When cuisine missed, GT vendor still found: {len(r1_wrong_but_found)}/{len(r1_wrong)}")

# Hit@K contribution of each group, as a share of ALL samples (not of the group)
for heading, subset in (
    ("\nPerformance when GT cuisine IS predicted:", r1_correct),
    ("\nPerformance when GT cuisine is NOT predicted:", r1_wrong),
):
    print(heading)
    for k in (1, 3, 5):
        hits = subset['ground_truth_rank'].between(1, k).sum()
        print(f"  Hit@{k}: {hits / n_samples*100:.1f}% (of total)")

# LightGCN cuisine ranking vs Round 1 prediction (rank 0 = cuisine absent from the LightGCN list)
gt_cuisine_lgcn_rank = df['lightgcn_gt_cuisine_rank']
print(f"\nLightGCN ranking of GT cuisine:")
print(f"  GT cuisine in LightGCN top-3: {gt_cuisine_lgcn_rank.between(1,3).sum()}")
print(f"  GT cuisine in LightGCN top-5: {gt_cuisine_lgcn_rank.between(1,5).sum()}")
print(f"  GT cuisine not in top-10:     {(gt_cuisine_lgcn_rank == 0).sum()}")

In [None]:
# Confusion: what cuisines does Round 1 predict when it misses the GT cuisine?
print("When Round 1 misses the GT cuisine, what was predicted vs actual:")
print("=" * 60)

# Tally the ground-truth cuisine of every sample Round 1 missed
missed_cuisine_counter = Counter(r1_wrong['ground_truth_cuisine'])

print(f"\nMost frequently missed GT cuisines:")
for cuisine, count in missed_cuisine_counter.most_common(10):
    # Denominator: all samples with this ground-truth cuisine, missed or not
    total_for_cuisine = (df['ground_truth_cuisine'] == cuisine).sum()
    miss_pct = count/total_for_cuisine*100
    print(f"  {cuisine}: missed {count}/{total_for_cuisine} ({miss_pct:.0f}%)")

# Show a few examples
print(f"\nExamples of missed cuisine predictions (showing 3):")
for _, row in r1_wrong.head(3).iterrows():
    history_cuisines = [order[1] for order in row['order_history_tuples']]
    lgcn_rank = row['lightgcn_gt_cuisine_rank']
    lgcn_rank_text = lgcn_rank if lgcn_rank > 0 else 'not in top 10'
    print(f"\n  Sample {row['sample_idx']}:")
    print(f"    GT cuisine:  {row['ground_truth_cuisine']}")
    print(f"    Predicted:   {row['round1_predicted_cuisines']}")
    print(f"    History:     {history_cuisines}")
    print(f"    LightGCN rank of GT cuisine: {lgcn_rank_text}")

## 4. Per-Cuisine Performance Breakdown

In [None]:
# Per-cuisine performance (cuisines with fewer than 3 samples are left out)
cuisine_stats = []
for cuisine in df['ground_truth_cuisine'].unique():
    subset = df[df['ground_truth_cuisine'] == cuisine]
    n = len(subset)
    if n >= 3:
        ranks = subset['ground_truth_rank']
        ranked_only = ranks[ranks > 0]  # drop rank 0 ("not found") before averaging
        cuisine_stats.append({
            'cuisine': cuisine,
            'n': n,
            'hit@1': (ranks == 1).mean(),
            'hit@3': ranks.between(1, 3).mean(),
            'hit@5': ranks.between(1, 5).mean(),
            'found_rate': (ranks > 0).mean(),
            'cuisine_in_r1': subset['gt_cuisine_in_round1'].mean(),
            'avg_rank_found': ranked_only.mean() if len(ranked_only) > 0 else float('nan'),
        })

cuisine_df = pd.DataFrame(cuisine_stats).sort_values('hit@1', ascending=False)

table_rule = "=" * 100
print("Per-Cuisine Performance (sorted by Hit@1, min 3 samples):")
print(table_rule)
print(f"{'Cuisine':<15} {'N':>4}  {'Hit@1':>6}  {'Hit@3':>6}  {'Hit@5':>6}  {'Found%':>7}  {'R1 Cuisine%':>11}  {'AvgRank':>7}")
print("-" * 100)
for _, row in cuisine_df.iterrows():
    print(f"{row['cuisine']:<15} {row['n']:>4}  {row['hit@1']*100:>5.1f}%  {row['hit@3']*100:>5.1f}%  {row['hit@5']*100:>5.1f}%  {row['found_rate']*100:>6.1f}%  {row['cuisine_in_r1']*100:>10.1f}%  {row['avg_rank_found']:>7.2f}")

In [ ]:
# Visualize top cuisines by Hit@1: first 15 rows of the Hit@1-sorted table with at least 5 samples
top_cuisines = cuisine_df[cuisine_df['n'] >= 5].head(15)

fig, ax = plt.subplots(figsize=(12, 5))
x = range(len(top_cuisines))
width = 0.3

# One bar per metric in each group, shifted left / centred / right
for offset, column, label, color in (
    (-width, 'hit@1', 'Hit@1', '#2ecc71'),
    (0, 'hit@3', 'Hit@3', '#3498db'),
    (width, 'hit@5', 'Hit@5', '#9b59b6'),
):
    ax.bar([i + offset for i in x], top_cuisines[column] * 100, width, label=label, color=color)

tick_labels = [f"{r['cuisine']}\n(n={r['n']})" for _, r in top_cuisines.iterrows()]

ax.set_xlabel('Cuisine')
ax.set_ylabel('Hit Rate (%)')
ax.set_title('Hit@K by Cuisine (min 5 samples)')
ax.set_xticks(x)
ax.set_xticklabels(tick_labels, rotation=45, ha='right')
ax.legend()
ax.set_ylim(0, 105)

plt.tight_layout()
plt.show()

## 5. Temporal Patterns

Does performance vary by time of day or day of week?

In [None]:
# Temporal analysis
def _hit_rates_by(column):
    """Group df by `column` and return per-group sample count, Hit@1, Hit@3 and found rate."""
    return df.groupby(column).agg(
        n=('ground_truth_rank', 'size'),
        hit1=('ground_truth_rank', lambda x: (x == 1).mean()),
        hit3=('ground_truth_rank', lambda x: (x.between(1, 3)).mean()),
        found=('ground_truth_rank', lambda x: (x > 0).mean())
    ).reset_index()


def _draw_hit_rates(ax, stats, column, xlabel, title, count_fontsize):
    """Draw Hit@1 bars and a Hit@3 line against `column`, with n=... above each bar."""
    ax.bar(stats[column], stats['hit1'] * 100, color='#2ecc71', alpha=0.7, label='Hit@1')
    ax.plot(stats[column], stats['hit3'] * 100, 'o-', color='#3498db', label='Hit@3')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Hit Rate (%)')
    ax.set_title(title)
    ax.legend()
    # Sample counts, placed slightly above the Hit@1 bar
    for x_pos, hit1_rate, n_samples in zip(stats[column], stats['hit1'], stats['n']):
        ax.text(x_pos, hit1_rate * 100 + 2, f"n={int(n_samples)}", ha='center', fontsize=count_fontsize)


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# By hour
hour_stats = _hit_rates_by('target_hour')
_draw_hit_rates(axes[0], hour_stats, 'target_hour',
                xlabel='Target Hour', title='Performance by Hour of Day', count_fontsize=7)

# By day of week
# NOTE(review): the labels assume 0 = Monday ... 6 = Sunday — confirm against the pipeline's encoding
day_names = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
day_stats = _hit_rates_by('target_day_of_week')
ax = axes[1]
_draw_hit_rates(ax, day_stats, 'target_day_of_week',
                xlabel='Day of Week', title='Performance by Day of Week', count_fontsize=8)
day_ticks = sorted(day_names)
ax.set_xticks(day_ticks)
ax.set_xticklabels([day_names[d] for d in day_ticks])

plt.tight_layout()
plt.show()

## 6. Order History Analysis

How does the user's order history (length, diversity, repeat frequency) affect prediction accuracy?

In [None]:
# Order history analysis
# How many times does the GT vendor appear in order history?
# (element 0 of each history tuple is compared with ground_truth_vendor_id)
df['gt_vendor_in_history'] = df.apply(
    lambda r: sum(1 for t in r['order_history_tuples'] if t[0] == r['ground_truth_vendor_id']),
    axis=1
)
# How many times does the GT cuisine appear in history?
# (element 1 of each history tuple is compared with ground_truth_cuisine)
df['gt_cuisine_in_history'] = df.apply(
    lambda r: sum(1 for t in r['order_history_tuples'] if t[1] == r['ground_truth_cuisine']),
    axis=1
)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# By GT vendor repeat count in history
vendor_repeat_stats = df.groupby('gt_vendor_in_history').agg(
    n=('ground_truth_rank', 'size'),
    hit1=('ground_truth_rank', lambda x: (x == 1).mean())
).reset_index()

ax = axes[0]
ax.bar(vendor_repeat_stats['gt_vendor_in_history'], vendor_repeat_stats['hit1'] * 100,
       color='#9b59b6', edgecolor='black', alpha=0.8)
ax.set_xlabel('# Times GT Vendor in History')
ax.set_ylabel('Hit@1 (%)')
ax.set_title('Hit@1 by GT Vendor Repeat Count')
for _, row in vendor_repeat_stats.iterrows():
    ax.text(row['gt_vendor_in_history'], row['hit1'] * 100 + 1, f"n={int(row['n'])}", ha='center', fontsize=8)

# By unique cuisine count in history
cuisine_div_stats = df.groupby('history_unique_cuisines').agg(
    n=('ground_truth_rank', 'size'),
    hit1=('ground_truth_rank', lambda x: (x == 1).mean())
).reset_index()

ax = axes[1]
ax.bar(cuisine_div_stats['history_unique_cuisines'], cuisine_div_stats['hit1'] * 100,
       color='#e67e22', edgecolor='black', alpha=0.8)
ax.set_xlabel('# Unique Cuisines in History')
ax.set_ylabel('Hit@1 (%)')
ax.set_title('Hit@1 by History Cuisine Diversity')
for _, row in cuisine_div_stats.iterrows():
    ax.text(row['history_unique_cuisines'], row['hit1'] * 100 + 1, f"n={int(row['n'])}", ha='center', fontsize=8)

# By candidate count
# pd.cut uses right-closed intervals: (0,3], (3,5], (5,10], (10,15], (15,21].
# NOTE(review): a candidate_count of 0 or above 21 falls outside every bin,
# becomes NaN and is dropped by the groupby below — confirm 20 is the real maximum.
bins = [0, 3, 5, 10, 15, 21]
df['cand_bin'] = pd.cut(df['candidate_count'], bins=bins, labels=['1-3', '4-5', '6-10', '11-15', '16-20'])
cand_stats = df.groupby('cand_bin', observed=True).agg(
    n=('ground_truth_rank', 'size'),
    hit1=('ground_truth_rank', lambda x: (x == 1).mean()),
    found=('ground_truth_rank', lambda x: (x > 0).mean())
).reset_index()

ax = axes[2]
x = range(len(cand_stats))
ax.bar(x, cand_stats['hit1'] * 100, color='#1abc9c', edgecolor='black', alpha=0.8)
ax.set_xlabel('Candidate Count')
ax.set_ylabel('Hit@1 (%)')
ax.set_title('Hit@1 by Candidate Count')
ax.set_xticks(x)
ax.set_xticklabels(cand_stats['cand_bin'])
# Bars sit at positions 0..len-1, so pair each position with its row directly
# instead of searching the index for every row.
for pos, (_, row) in zip(x, cand_stats.iterrows()):
    ax.text(pos, row['hit1'] * 100 + 1, f"n={int(row['n'])}", ha='center', fontsize=8)

plt.tight_layout()
plt.show()

print(f"GT vendor repeat count in history:")
print(df['gt_vendor_in_history'].value_counts().sort_index())

# Rank 0 is the "not found" sentinel, not a real position. Averaging it in
# pulls the mean down and makes groups with MORE failures look better, so the
# mean rank is taken over found cases only (as "Avg rank (found)" does above).
found_with_history = df[df['ground_truth_rank'] > 0]
print(f"\nMean rank of found cases by vendor repeat count (rank 0 = not found excluded):")
for label, group_mask in (
    ("1x", found_with_history['gt_vendor_in_history'] == 1),
    ("2x", found_with_history['gt_vendor_in_history'] == 2),
    ("3+", found_with_history['gt_vendor_in_history'] >= 3),
):
    # An empty group yields NaN, printed as "nan"
    group_mean_rank = found_with_history.loc[group_mask, 'ground_truth_rank'].mean()
    print(f"  Mean rank when vendor seen {label}: {group_mean_rank:.2f}")

## 7. Failure Analysis

Deep dive into cases where the pipeline fails (rank 0 or high rank).

In [None]:
# Failure analysis: rank 0 (not found) cases
def _share(part, whole):
    """Return `part` as a percentage of `whole`; 0.0 when `whole` is 0.

    Keeps the cell runnable when there are no not-found cases, where
    `len(...) / len(...)` would raise ZeroDivisionError.
    """
    return part / whole * 100 if whole else 0.0


not_found_df = df[df['ground_truth_rank'] == 0].copy()
n_not_found = len(not_found_df)

print(f"NOT FOUND cases (rank 0): {n_not_found} ({_share(n_not_found, len(df)):.1f}%)")
print("=" * 60)

# Why was GT not found?
# 1. GT cuisine not predicted by Round 1
cuisine_missed = not_found_df[~not_found_df['gt_cuisine_in_round1']]
# 2. GT cuisine predicted but vendor not in candidates
cuisine_hit_vendor_miss = not_found_df[not_found_df['gt_cuisine_in_round1'] & ~not_found_df['gt_in_candidates']]
# 3. GT in candidates but ranked 0 (shouldn't happen normally)
in_cands_but_0 = not_found_df[not_found_df['gt_in_candidates']]
# Buckets 1 and 3 can overlap (cuisine missed yet vendor in candidates), so the
# three percentages are not guaranteed to add up to 100%.

print(f"\nBreakdown of NOT FOUND cases:")
print(f"  GT cuisine NOT predicted by R1:           {len(cuisine_missed)} ({_share(len(cuisine_missed), n_not_found):.1f}%)")
print(f"  GT cuisine predicted, vendor NOT in cands: {len(cuisine_hit_vendor_miss)} ({_share(len(cuisine_hit_vendor_miss), n_not_found):.1f}%)")
print(f"  GT in candidates but still rank 0:         {len(in_cands_but_0)} ({_share(len(in_cands_but_0), n_not_found):.1f}%)")

# Cuisine distribution of not-found cases
print(f"\nCuisines most frequently not found:")
nf_cuisines = Counter(not_found_df['ground_truth_cuisine'])
for cuisine, count in nf_cuisines.most_common(10):
    # Denominator: all samples with this ground-truth cuisine, found or not
    total_for_cuisine = (df['ground_truth_cuisine'] == cuisine).sum()
    print(f"  {cuisine}: {count}/{total_for_cuisine} not found ({count/total_for_cuisine*100:.0f}%)")

In [ ]:
# Detailed examples of failure cases: the first sample of each failure type
def _print_failure_header(title, row):
    """Print the type banner, then the sample's identity, ground truth and Round 1 prediction."""
    print(f"\n--- {title} ---")
    print(f"Sample {row['sample_idx']}  |  Customer: {row['customer_id']}")
    print(f"  GT: {row['ground_truth']}  (cuisine: {row['ground_truth_cuisine']})")
    print(f"  R1 predicted: {row['round1_predicted_cuisines']}")


print("FAILURE EXAMPLES:")
print("=" * 80)

# Example 1: Cuisine missed by Round 1
if not cuisine_missed.empty:
    row = cuisine_missed.iloc[0]
    _print_failure_header("Type 1: GT Cuisine Not Predicted", row)
    history_cuisines = [order[1] for order in row['order_history_tuples']]
    print(f"  History cuisines: {history_cuisines}")
    lgcn_top5 = [(c, f'{s:.2f}') for c, s in row['lightgcn_top_cuisines'][:5]]
    print(f"  LightGCN top-5: {lgcn_top5}")

# Example 2: Cuisine hit but vendor not in candidates
if not cuisine_hit_vendor_miss.empty:
    row = cuisine_hit_vendor_miss.iloc[0]
    _print_failure_header("Type 2: GT Cuisine Predicted, But Vendor Not Found", row)
    # Substring match: keeps candidates whose identifier contains the GT cuisine text
    same_cuisine_cands = [cand for cand in row['candidate_vendors'] if row['ground_truth_cuisine'] in cand]
    print(f"  Candidates with same cuisine: {same_cuisine_cands[:5]}")
    print(f"  Total candidates: {row['candidate_count']}")

In [None]:
# High rank (found but ranked poorly) analysis.
# rank > 5 already implies rank > 0, so one comparison selects the same rows.
high_rank = df[df['ground_truth_rank'] > 5]
print(f"Cases ranked > 5 (found but poorly ranked): {len(high_rank)}")
print("=" * 60)

if not high_rank.empty:
    mean_rank = high_rank['ground_truth_rank'].mean()
    mean_candidates = high_rank['candidate_count'].mean()
    cuisine_hit_pct = high_rank['gt_cuisine_in_round1'].mean()*100
    print(f"\n  Mean rank: {mean_rank:.1f}")
    print(f"  Mean candidate count: {mean_candidates:.1f}")
    print(f"  GT cuisine in R1: {cuisine_hit_pct:.1f}%")

    # Show examples
    # (the gt_vendor_in_history column is added by the order-history cell)
    print(f"\nExamples (showing 3):")
    for _, row in high_rank.head(3).iterrows():
        headline = (f"\n  Sample {row['sample_idx']}: rank={row['ground_truth_rank']}, "
                    f"candidates={row['candidate_count']}")
        print(headline)
        print(f"    GT: {row['ground_truth']}")
        print(f"    Top-3 in final ranking: {row['final_ranking'][:3]}")
        print(f"    GT vendor in history {row['gt_vendor_in_history']}x")

## 8. Similar Users Analysis

How do similar user signals affect prediction quality?

In [None]:
# Similar users analysis
def _similarity_scores(users):
    """Collect the 'similarity' value of every entry in a sample's similar_users list."""
    return [u['similarity'] for u in users]


df['num_similar_users'] = df['similar_users'].apply(len)
# Samples without similar users get 0 for both summary scores
df['max_similarity'] = df['similar_users'].apply(
    lambda users: max(_similarity_scores(users)) if users else 0
)
df['mean_similarity'] = df['similar_users'].apply(
    lambda users: np.mean(_similarity_scores(users)) if users else 0
)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_hist, ax_quartile = axes

# Similarity score distribution
median_max_sim = df['max_similarity'].median()
ax_hist.hist(df['max_similarity'], bins=30, color='#3498db', edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Max Similarity Score')
ax_hist.set_ylabel('Count')
ax_hist.set_title('Distribution of Max Similarity Score')
ax_hist.axvline(x=median_max_sim, color='red', linestyle='--', label=f'Median={median_max_sim:.4f}')
ax_hist.legend()

# Hit@1 by similarity quartile
# NOTE(review): pd.qcut raises if quartile edges coincide (e.g. many identical scores) — confirm the data is spread enough
df['sim_quartile'] = pd.qcut(df['max_similarity'], q=4, labels=['Q1 (low)', 'Q2', 'Q3', 'Q4 (high)'])
sim_stats = df.groupby('sim_quartile', observed=True).agg(
    n=('ground_truth_rank', 'size'),
    hit1=('ground_truth_rank', lambda x: (x == 1).mean()),
    found=('ground_truth_rank', lambda x: (x > 0).mean())
).reset_index()

ax = ax_quartile
positions = range(len(sim_stats))
bars = ax.bar(positions, sim_stats['hit1'] * 100, color='#9b59b6', edgecolor='black', alpha=0.8)
ax.set_xlabel('Max Similarity Quartile')
ax.set_ylabel('Hit@1 (%)')
ax.set_title('Hit@1 by User Similarity Quartile')
ax.set_xticks(positions)
ax.set_xticklabels(sim_stats['sim_quartile'])
# Sample count above each bar
for bar, n_samples in zip(bars, sim_stats['n']):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f"n={int(n_samples)}", ha='center', fontsize=8)

plt.tight_layout()
plt.show()

print(f"Similarity stats:")
print(f"  Mean max similarity: {df['max_similarity'].mean():.6f}")
print(f"  Similar users per case: {df['num_similar_users'].mean():.1f} (all have {df['num_similar_users'].unique()})")

## 9. LightGCN Cuisine Signal Quality

How well does LightGCN rank the ground truth cuisine? Does a higher LightGCN ranking for the cuisine translate to better final performance?

In [None]:
# LightGCN cuisine ranking quality: final outcome grouped by the position
# LightGCN gave the ground-truth cuisine (0 = cuisine absent from its list)
lgcn_rank_stats = df.groupby('lightgcn_gt_cuisine_rank').agg(
    n=('ground_truth_rank', 'size'),
    hit1=('ground_truth_rank', lambda x: (x == 1).mean()),
    hit3=('ground_truth_rank', lambda x: (x.between(1, 3)).mean()),
    found=('ground_truth_rank', lambda x: (x > 0).mean()),
    cuisine_in_r1=('gt_cuisine_in_round1', 'mean')
).reset_index()

print("Performance by LightGCN Cuisine Rank (of GT cuisine):")
print("=" * 90)
print(f"{'LightGCN Rank':>13}  {'N':>4}  {'Hit@1':>6}  {'Hit@3':>6}  {'Found%':>7}  {'R1 Predicts Cuisine%':>20}")
print("-" * 90)
for entry in lgcn_rank_stats.itertuples(index=False):
    position = entry.lightgcn_gt_cuisine_rank
    rank_label = 'Not in top 10' if position == 0 else int(position)
    print(f"{str(rank_label):>13}  {int(entry.n):>4}  {entry.hit1*100:>5.1f}%  {entry.hit3*100:>5.1f}%  {entry.found*100:>6.1f}%  {entry.cuisine_in_r1*100:>19.1f}%")

# Visualize (only positions actually present in the LightGCN list)
fig, ax = plt.subplots(figsize=(10, 5))
valid_ranks = lgcn_rank_stats[lgcn_rank_stats['lightgcn_gt_cuisine_rank'] > 0]
rank_positions = valid_ranks['lightgcn_gt_cuisine_rank']
ax.bar(rank_positions, valid_ranks['hit1'] * 100,
       color='#2ecc71', edgecolor='black', alpha=0.8, label='Hit@1')
ax.plot(rank_positions, valid_ranks['found'] * 100,
        'o-', color='#e74c3c', label='Found Rate')
ax.set_xlabel('LightGCN Rank of GT Cuisine')
ax.set_ylabel('Rate (%)')
ax.set_title('Final Performance vs LightGCN Cuisine Ranking')
ax.legend()
# Sample count above each Hit@1 bar
for position, hit1_rate, n_samples in zip(rank_positions, valid_ranks['hit1'], valid_ranks['n']):
    ax.text(position, hit1_rate * 100 + 2,
            f"n={int(n_samples)}", ha='center', fontsize=7)
plt.tight_layout()
plt.show()

## 10. Round 2 Vendor Ranking Quality

When GT is in candidates, how well does Round 2 rank it?

In [None]:
# Round 2 ranking quality (only when GT is in candidates)
gt_in_found = gt_in.copy()
n_in_cands = len(gt_in_found)
ranks_in_cands = gt_in_found['ground_truth_rank']

print(f"When GT vendor IS in candidates ({n_in_cands} cases):")
print("=" * 60)

for k in (1, 3, 5):
    hits = ranks_in_cands.between(1, k).sum()
    print(f"  Hit@{k}: {hits / n_in_cands*100:.1f}%")

# Mean over real ranks only (rank 0 = not found is excluded)
print(f"  Mean rank: {ranks_in_cands[ranks_in_cands > 0].mean():.2f}")

# How often does R2 put GT vendor at rank 1 when it's in candidates?
r2_rank1_given_cand = (ranks_in_cands == 1).sum()
print(f"\n  GT ranked #1 when in candidates: {r2_rank1_given_cand}/{n_in_cands} ({r2_rank1_given_cand/n_in_cands*100:.1f}%)")

# Rank distribution for cases where GT is in candidates
special_rank_colors = {0: '#e74c3c', 1: '#2ecc71'}  # red = not found, green = top hit
fig, ax = plt.subplots(figsize=(8, 4))
in_cand_ranks = ranks_in_cands.value_counts().sort_index()
colors = [special_rank_colors.get(r, '#3498db') for r in in_cand_ranks.index]
ax.bar(in_cand_ranks.index, in_cand_ranks.values, color=colors, edgecolor='black', alpha=0.8)
ax.set_xlabel('Ground Truth Rank')
ax.set_ylabel('Count')
ax.set_title(f'Rank Distribution When GT is in Candidates (n={n_in_cands})')
plt.tight_layout()
plt.show()

# Show examples where GT is in candidates but poorly ranked
poorly_ranked = gt_in_found[ranks_in_cands > 3]
if not poorly_ranked.empty:
    print(f"\nExamples where GT is in candidates but ranked > 3 ({len(poorly_ranked)} cases):")
    for _, row in poorly_ranked.head(3).iterrows():
        print(f"\n  Sample {row['sample_idx']}: rank={row['ground_truth_rank']}, candidates={row['candidate_count']}")
        print(f"    GT: {row['ground_truth']}")
        print(f"    Top-3 ranked: {row['final_ranking'][:3]}")

## 11. Pipeline Bottleneck Analysis

Where does the pipeline lose the most? Break down the failure into pipeline stages.

In [None]:
# Pipeline bottleneck funnel: how many samples satisfy each successive condition.
total = len(df)

# Stage 1: LightGCN cuisine ranking - is GT cuisine in top 10?
# (rank 0 means the cuisine is absent from the LightGCN list)
lgcn_top10 = (df['lightgcn_gt_cuisine_rank'] > 0).sum()
lgcn_top3 = (df['lightgcn_gt_cuisine_rank'].between(1, 3)).sum()
lgcn_top1 = (df['lightgcn_gt_cuisine_rank'] == 1).sum()

# Stage 2: Round 1 predicts GT cuisine.
# Stored under its own name: `r1_correct` is a DataFrame in the Round 1
# section, and rebinding it to an integer here would break that section's
# consumers when cells are re-run out of order.
r1_correct_count = df['gt_cuisine_in_round1'].sum()

# Stage 3: GT vendor in candidates
gt_cand = df['gt_in_candidates'].sum()

# Stage 4: GT vendor found in final ranking
gt_found = (df['ground_truth_rank'] > 0).sum()

# Stage 5: GT vendor at rank 1
gt_rank1 = (df['ground_truth_rank'] == 1).sum()

stages = [
    ('Total samples', total),
    ('LightGCN: GT cuisine in top-10', lgcn_top10),
    ('LightGCN: GT cuisine in top-3', lgcn_top3),
    ('Round 1: GT cuisine predicted', r1_correct_count),
    ('GT vendor in candidates', gt_cand),
    ('GT vendor found in ranking', gt_found),
    ('GT vendor at rank 1', gt_rank1),
]

print("PIPELINE FUNNEL:")
print("=" * 70)
for stage, count in stages:
    # "lost" is relative to all samples, not to the previous stage
    lost = total - count
    print(f"  {stage:<40} {count:>4}/{total}  ({count/total*100:>5.1f}%)  lost: {lost}")

# Visualize funnel
fig, ax = plt.subplots(figsize=(10, 6))
labels = [s[0] for s in stages]
values = [s[1] for s in stages]
colors_funnel = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(stages)))

# Reversed y positions put the first stage at the top of the chart
bars = ax.barh(range(len(stages)-1, -1, -1), values, color=colors_funnel)
ax.set_yticks(range(len(stages)-1, -1, -1))
ax.set_yticklabels(labels)
ax.set_xlabel('Number of Cases')
ax.set_title('Pipeline Funnel: Where Cases Are Lost')

# Count and percentage just right of each bar
for bar, val in zip(bars, values):
    ax.text(bar.get_width() + 5, bar.get_y() + bar.get_height()/2,
            f'{val} ({val/total*100:.1f}%)', va='center')

plt.tight_layout()
plt.show()

## 12. Summary & Key Findings

In [None]:
# Summary: headline numbers rendered as a Markdown report.
# Relies on df['mrr'], `found` and `gt_in` produced by earlier cells.
total = len(df)
found_count = (df['ground_truth_rank'] > 0).sum()
hit1 = (df['ground_truth_rank'] == 1).sum()
hit3 = (df['ground_truth_rank'].between(1, 3)).sum()
hit5 = (df['ground_truth_rank'].between(1, 5)).sum()

# Derive the most common ground-truth cuisines from the data instead of
# hard-coding names, so the claim stays correct on a different evaluation run.
# value_counts() sorts by frequency, most common first.
top_gt_cuisines_text = ", ".join(
    f"{name} ({count})"
    for name, count in df['ground_truth_cuisine'].value_counts().head(3).items()
)

summary = f"""
# Stage 9 Repeat Evaluation Summary

## Dataset
- Total samples: {total}
- Unique customers: {df['customer_id'].nunique()}
- Ground truth cuisines: {df['ground_truth_cuisine'].nunique()}

## Overall Performance
| Metric | Value |
|--------|-------|
| Hit@1 | {hit1/total*100:.1f}% ({hit1}/{total}) |
| Hit@3 | {hit3/total*100:.1f}% ({hit3}/{total}) |
| Hit@5 | {hit5/total*100:.1f}% ({hit5}/{total}) |
| MRR | {df['mrr'].mean():.4f} |
| GT Found Rate | {found_count/total*100:.1f}% ({found_count}/{total}) |
| Avg Rank (when found) | {found['ground_truth_rank'].mean():.2f} |

## Pipeline Breakdown
| Stage | Rate |
|-------|------|
| LightGCN: GT cuisine in top-3 | {(df['lightgcn_gt_cuisine_rank'].between(1,3)).mean()*100:.1f}% |
| Round 1: GT cuisine predicted | {df['gt_cuisine_in_round1'].mean()*100:.1f}% |
| GT vendor in candidates | {df['gt_in_candidates'].mean()*100:.1f}% |
| GT vendor found in ranking | {(df['ground_truth_rank'] > 0).mean()*100:.1f}% |

## Key Findings
1. **Candidate generation is the bottleneck**: {(~df['gt_in_candidates']).sum()} cases ({(~df['gt_in_candidates']).mean()*100:.1f}%) fail because the GT vendor isn't in the candidate list
2. **Round 1 cuisine prediction is strong**: {df['gt_cuisine_in_round1'].mean()*100:.1f}% accuracy on predicting the GT cuisine
3. **When GT is in candidates, ranking works well**: Hit@1 = {(gt_in['ground_truth_rank']==1).mean()*100:.1f}% for cases with GT in candidates
4. **Top cuisines**: {top_gt_cuisines_text} are most common GT cuisines
"""

display(Markdown(summary))

In [ ]:
# Final visualization: overall metrics
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Hit@K curve for all samples vs. samples whose GT vendor was a candidate
ax = axes[0]
ks = range(1, 21)
hit_all = [(df['ground_truth_rank'].between(1, k)).mean() * 100 for k in ks]
hit_in_cand = [(gt_in['ground_truth_rank'].between(1, k)).mean() * 100 for k in ks]

ax.plot(ks, hit_all, 'o-', label='All samples', linewidth=2, markersize=6)
ax.plot(ks, hit_in_cand, 's-', label='GT in candidates only', linewidth=2, markersize=6)
ax.set_xlabel('K')
ax.set_ylabel('Hit@K (%)')
ax.set_title('Hit@K Curves')
ax.legend()
ax.set_xticks([1, 3, 5, 10, 15, 20])
ax.grid(True, alpha=0.3)

# Right: Outcome breakdown pie.
# A pie needs mutually exclusive, exhaustive slices. Each sample is therefore
# assigned to exactly one outcome: a rank-1 hit, or else the EARLIEST pipeline
# stage that failed (cuisine -> candidate list -> ranking). Independent masks
# would double-count e.g. a sample whose cuisine was missed but whose vendor
# was still ranked, and would drop samples in candidates with rank 0.
ax = axes[1]
is_hit1 = df['ground_truth_rank'] == 1
cuisine_ok = df['gt_cuisine_in_round1']
in_cands = df['gt_in_candidates']

success = is_hit1.sum()
r1_miss = (~is_hit1 & ~cuisine_ok).sum()
r1_hit_cand_miss = (~is_hit1 & cuisine_ok & ~in_cands).sum()
cand_hit_rank_miss = (~is_hit1 & cuisine_ok & in_cands).sum()
# The four groups partition the samples by construction; guard against regressions
assert success + r1_miss + r1_hit_cand_miss + cand_hit_rank_miss == len(df)

slices = [success, cand_hit_rank_miss, r1_hit_cand_miss, r1_miss]
labels = [
    f'Hit@1 ({success})',
    f'In cands, not rank 1 ({cand_hit_rank_miss})',
    f'Cuisine OK, vendor miss ({r1_hit_cand_miss})',
    f'Cuisine missed by R1 ({r1_miss})'
]
colors = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']

ax.pie(slices, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
ax.set_title('Outcome Breakdown')

plt.tight_layout()
plt.show()

print("=" * 60)
print("Analysis complete. Use lookup(idx) to inspect individual cases.")
print("=" * 60)