# Repeat Evaluation — Prompt & Response Analysis

Analyze the Gemini 500-sample repeat evaluation results:
- Prompt lengths (Round 1 & Round 2)
- Response lengths (Round 1 & Round 2)
- LLM timing per round
- Candidate counts
- Similar user record counts
- Order history sizes

In [None]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

# Load the Gemini 500-sample detailed results
DETAILED_PATH = Path("../outputs/202601310621/stage9_repeat_detailed.json")

# JSON is UTF-8 by specification; decode explicitly rather than relying on
# the platform's default encoding (which is not UTF-8 everywhere).
with open(DETAILED_PATH, encoding="utf-8") as f:
    data = json.load(f)

print(f"Total samples: {len(data)}")
# Guard the peek at the first record so an empty result file does not raise
# IndexError before any analysis has run.
print(f"Keys: {list(data[0].keys()) if data else []}")

In [None]:
# Extract metrics into a DataFrame (one row per entry in `data`).
rows = []
for d in data:
    # `.get(key, default)` only applies the default when the key is absent.
    # A key that is present but null in the JSON comes back as None, which
    # would crash len()/.split() below, so fall back with `or` instead.
    r1_prompt = d.get('round1_prompt') or ''
    r1_resp = d.get('round1_raw_response') or ''
    r2_prompt = d.get('round2_prompt') or ''
    r2_resp = d.get('round2_raw_response') or ''

    # Count similar user records (sum of each similar user's record_count)
    sim_users = d.get('similar_users') or []
    total_sim_records = sum((u.get('record_count') or 0) for u in sim_users)

    rows.append({
        'sample_idx': d.get('sample_idx'),
        'customer_id': d.get('customer_id'),
        'history_size': len(d.get('order_history_tuples') or []),
        # Lengths are measured in characters and whitespace-separated words.
        'r1_prompt_chars': len(r1_prompt),
        'r1_prompt_words': len(r1_prompt.split()),
        'r1_resp_chars': len(r1_resp),
        'r1_resp_words': len(r1_resp.split()),
        'r1_llm_ms': d.get('round1_llm_ms', 0),
        'r2_prompt_chars': len(r2_prompt),
        'r2_prompt_words': len(r2_prompt.split()),
        'r2_resp_chars': len(r2_resp),
        'r2_resp_words': len(r2_resp.split()),
        'r2_llm_ms': d.get('round2_llm_ms', 0),
        'candidate_count': d.get('candidate_count', 0),
        'similar_users_count': len(sim_users),
        'similar_user_records': total_sim_records,
        # Missing rank defaults to 0, the same value as a miss.
        'ground_truth_rank': d.get('ground_truth_rank', 0),
        'total_time_ms': d.get('time_ms', 0),
    })

df = pd.DataFrame(rows)
print(f"DataFrame shape: {df.shape}")
df.head()

## Summary Statistics

In [None]:
# Columns that go into the summary table, grouped by what they measure.
_round1_cols = ['r1_prompt_chars', 'r1_prompt_words', 'r1_resp_chars', 'r1_resp_words', 'r1_llm_ms']
_round2_cols = ['r2_prompt_chars', 'r2_prompt_words', 'r2_resp_chars', 'r2_resp_words', 'r2_llm_ms']
_retrieval_cols = ['candidate_count', 'similar_users_count', 'similar_user_records']

cols = ['history_size'] + _round1_cols + _round2_cols + _retrieval_cols + ['total_time_ms']

# Percentiles requested from describe(), and the order in which the
# resulting statistics are laid out (one row per metric after transposing).
_percentiles = [.25, .5, .75, .90, .95, .99]
_stat_order = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', '90%', '95%', '99%', 'max']

summary = (
    df[cols]
    .describe(percentiles=_percentiles)
    .transpose()
    .loc[:, _stat_order]
    .round(1)
)
print(summary.to_string())

## Prompt Length Distributions

In [None]:
import matplotlib.pyplot as plt

# One entry per panel, in row-major order of the 2x3 grid:
# (DataFrame column, panel title, bar colour or None for the default,
#  unit suffix appended to the mean in the legend label)
_dist_panels = [
    ('r1_prompt_chars', 'Round 1 Prompt (chars)', None, ''),
    ('r1_resp_chars', 'Round 1 Response (chars)', 'orange', ''),
    ('r1_llm_ms', 'Round 1 LLM Time (ms)', 'green', 'ms'),
    ('r2_prompt_chars', 'Round 2 Prompt (chars)', None, ''),
    ('r2_resp_chars', 'Round 2 Response (chars)', 'orange', ''),
    ('r2_llm_ms', 'Round 2 LLM Time (ms)', 'green', 'ms'),
]

fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# axes.flat walks the grid row by row, matching the order of _dist_panels.
for panel_ax, (column, title, colour, unit) in zip(axes.flat, _dist_panels):
    series = df[column]
    avg = series.mean()

    hist_opts = {'bins': 30, 'edgecolor': 'black', 'alpha': 0.7}
    if colour is not None:
        hist_opts['color'] = colour

    panel_ax.hist(series, **hist_opts)
    panel_ax.set_title(title)
    # Dashed red line marks the mean; its value is shown in the legend.
    panel_ax.axvline(avg, color='red', linestyle='--', label=f"mean={avg:.0f}{unit}")
    panel_ax.legend()

plt.tight_layout()
plt.savefig('../resultExploration/repeat_prompt_response_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## Candidate & Similar User Analysis

In [None]:
# One entry per panel, left to right:
# (DataFrame column, panel title, number of bins, bar colour or None for default)
_count_panels = [
    ('candidate_count', 'Candidate Vendors per Sample', 20, None),
    ('similar_user_records', 'Similar User Records per Sample', 20, 'purple'),
    ('history_size', 'Order History Size', 30, 'teal'),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for panel_ax, (column, title, n_bins, colour) in zip(axes, _count_panels):
    series = df[column]
    avg = series.mean()

    hist_opts = {'bins': n_bins, 'edgecolor': 'black', 'alpha': 0.7}
    if colour is not None:
        hist_opts['color'] = colour

    panel_ax.hist(series, **hist_opts)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Count')
    # Dashed red line marks the mean; its value is shown in the legend.
    panel_ax.axvline(avg, color='red', linestyle='--', label=f"mean={avg:.1f}")
    panel_ax.legend()

plt.tight_layout()
plt.savefig('../resultExploration/repeat_candidate_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## Prompt Length vs. LLM Response Time

In [None]:
# One entry per round, left to right:
# (x column, y column, x label, y label, title, marker colour or None for default)
_scatter_panels = [
    ('r1_prompt_chars', 'r1_llm_ms',
     'Round 1 Prompt Length (chars)', 'Round 1 LLM Time (ms)',
     'R1: Prompt Length vs Response Time', None),
    ('r2_prompt_chars', 'r2_llm_ms',
     'Round 2 Prompt Length (chars)', 'Round 2 LLM Time (ms)',
     'R2: Prompt Length vs Response Time', 'orange'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for panel_ax, (x_col, y_col, x_label, y_label, title, colour) in zip(axes, _scatter_panels):
    # Small, semi-transparent markers so overlapping samples stay readable.
    scatter_opts = {'alpha': 0.3, 's': 10}
    if colour is not None:
        scatter_opts['color'] = colour

    panel_ax.scatter(df[x_col], df[y_col], **scatter_opts)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(title)

plt.tight_layout()
plt.savefig('../resultExploration/repeat_prompt_vs_time.png', dpi=150, bbox_inches='tight')
plt.show()

## Ground Truth Rank Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Rank distribution (exclude 0 = not found)
found = df[df['ground_truth_rank'] > 0]
not_found = df[df['ground_truth_rank'] == 0]

# Series.max() is NaN when no sample was found, and a float when the column
# holds missing values; range() rejects both. Only draw the histogram when
# there is something to plot, and coerce the upper bound to a plain int.
if len(found) > 0:
    max_rank = int(found['ground_truth_rank'].max())
    # One bin per integer rank: edges 1, 2, ..., max_rank + 1.
    axes[0].hist(found['ground_truth_rank'], bins=range(1, max_rank + 2),
                 edgecolor='black', alpha=0.7)
axes[0].set_title(f'GT Rank Distribution (found: {len(found)}/{len(df)})')
axes[0].set_xlabel('Rank')
axes[0].set_ylabel('Count')

# Hit@K cumulative: fraction of ALL samples (found or not) whose ground-truth
# rank is within the top K. Guard the denominator for an empty DataFrame.
n_samples = len(df)
k_values = list(range(1, 21))
hit_rates = [
    (found['ground_truth_rank'] <= k).sum() / n_samples if n_samples else 0.0
    for k in k_values
]

axes[1].plot(k_values, hit_rates, 'o-', markersize=4)
axes[1].set_title('Cumulative Hit Rate')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Hit@K')
axes[1].set_ylim(0, 1)
axes[1].grid(True, alpha=0.3)

# Annotate key points (hit_rates is 0-indexed, K is 1-indexed)
for k in [1, 3, 5]:
    axes[1].annotate(f'Hit@{k}={hit_rates[k-1]:.3f}',
                     xy=(k, hit_rates[k-1]),
                     xytext=(k+1, hit_rates[k-1]-0.05),
                     fontsize=9)

plt.tight_layout()
plt.savefig('../resultExploration/repeat_rank_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## Response Content Analysis

Check for thinking tokens / excessively long responses.

In [None]:
def _raw_text(sample, key):
    """Return ``sample[key]`` as a string, mapping a missing or null value to ''.

    ``dict.get(key, '')`` only covers a missing key; a key stored as JSON null
    comes back as None and would break ``in`` / ``len()`` / ``.split()``.
    """
    return sample.get(key) or ''


# Check for thinking tokens in responses
r1_has_think = sum(1 for d in data if '<think>' in _raw_text(d, 'round1_raw_response'))
r2_has_think = sum(1 for d in data if '<think>' in _raw_text(d, 'round2_raw_response'))
print(f"Round 1 responses with <think> tags: {r1_has_think}/{len(data)}")
print(f"Round 2 responses with <think> tags: {r2_has_think}/{len(data)}")

# Find longest responses
print("\n--- Top 5 Longest Round 1 Responses (chars) ---")
top_r1 = sorted(data, key=lambda d: len(_raw_text(d, 'round1_raw_response')), reverse=True)[:5]
for d in top_r1:
    r = _raw_text(d, 'round1_raw_response')
    # .get() rather than [] so a record without 'sample_idx' prints None
    # instead of raising KeyError.
    print(f"  Sample {d.get('sample_idx')}: {len(r)} chars, {len(r.split())} words")

print("\n--- Top 5 Longest Round 2 Responses (chars) ---")
top_r2 = sorted(data, key=lambda d: len(_raw_text(d, 'round2_raw_response')), reverse=True)[:5]
for d in top_r2:
    r = _raw_text(d, 'round2_raw_response')
    print(f"  Sample {d.get('sample_idx')}: {len(r)} chars, {len(r.split())} words")

# Estimate token counts (rough: chars / 4). Each per-round figure is
# prompt + response, i.e. input + output tokens for that round.
print("\n--- Estimated Token Counts (chars/4 approximation) ---")
r1_tokens = [len(_raw_text(d, 'round1_prompt')) / 4 + len(_raw_text(d, 'round1_raw_response')) / 4 for d in data]
r2_tokens = [len(_raw_text(d, 'round2_prompt')) / 4 + len(_raw_text(d, 'round2_raw_response')) / 4 for d in data]
total_tokens = [r1 + r2 for r1, r2 in zip(r1_tokens, r2_tokens)]

print(f"R1 avg tokens: {np.mean(r1_tokens):.0f} (input+output)")
print(f"R2 avg tokens: {np.mean(r2_tokens):.0f} (input+output)")
print(f"Total avg tokens/sample: {np.mean(total_tokens):.0f}")
print(f"Total tokens (all samples): {np.sum(total_tokens):.0f}")

## Timing Breakdown

In [None]:
# Timing breakdown: mean / median / p95 of the per-round LLM latency and of
# the total per-sample time.
print("=" * 60)
print("TIMING BREAKDOWN")
print("=" * 60)
print(f"Round 1 LLM time:  mean={df['r1_llm_ms'].mean():.0f}ms, median={df['r1_llm_ms'].median():.0f}ms, p95={df['r1_llm_ms'].quantile(0.95):.0f}ms")
print(f"Round 2 LLM time:  mean={df['r2_llm_ms'].mean():.0f}ms, median={df['r2_llm_ms'].median():.0f}ms, p95={df['r2_llm_ms'].quantile(0.95):.0f}ms")
print(f"Total time:        mean={df['total_time_ms'].mean():.0f}ms, median={df['total_time_ms'].median():.0f}ms, p95={df['total_time_ms'].quantile(0.95):.0f}ms")

# Non-LLM overhead: whatever part of the total is not spent in either LLM call
df['overhead_ms'] = df['total_time_ms'] - df['r1_llm_ms'] - df['r2_llm_ms']
print(f"Overhead (non-LLM): mean={df['overhead_ms'].mean():.0f}ms, median={df['overhead_ms'].median():.0f}ms")

# total_time_ms is filled with 0 when a record has no 'time_ms'. Dividing by
# that 0 yields inf (or NaN), and a single inf makes the mean inf. Mask
# non-positive totals to NaN so .mean(), which skips NaN, averages only the
# samples with a usable total.
llm_ms = df['r1_llm_ms'] + df['r2_llm_ms']
valid_total_ms = df['total_time_ms'].where(df['total_time_ms'] > 0)
llm_share = llm_ms / valid_total_ms
print(f"LLM % of total:    {llm_share.mean()*100:.1f}%")

## Impact of History Size on Performance

In [None]:
# Bin history sizes and compute hit@1/3/5 per bin.
# pd.cut intervals are right-closed by default: (0, 5], (5, 10], ..., so a
# history_size of 0 falls outside the first bin and is left as NaN.
# The last edge is +inf so the '100+' bin is genuinely open-ended; with a
# finite cap of 500, larger histories became NaN and were silently dropped
# from the table below.
_history_edges = [0, 5, 10, 20, 50, 100, float('inf')]
_history_labels = ['1-5', '6-10', '11-20', '21-50', '51-100', '100+']
df['history_bin'] = pd.cut(df['history_size'], bins=_history_edges, labels=_history_labels)

# observed=True keeps only bins that actually contain samples.
bin_stats = df.groupby('history_bin', observed=True).agg(
    count=('sample_idx', 'count'),
    # Rank 0 means "not found", so hit@K requires 0 < rank <= K.
    hit1=('ground_truth_rank', lambda x: (x == 1).sum() / len(x)),
    hit3=('ground_truth_rank', lambda x: ((x > 0) & (x <= 3)).sum() / len(x)),
    hit5=('ground_truth_rank', lambda x: ((x > 0) & (x <= 5)).sum() / len(x)),
    avg_r1_prompt=('r1_prompt_chars', 'mean'),
    avg_r2_prompt=('r2_prompt_chars', 'mean'),
    avg_time=('total_time_ms', 'mean'),
).round(3)

print("Performance by History Size:")
print(bin_stats.to_string())