In [None]:
import setup_path  # adds project root to sys.path

In [2]:
! pip install pandas

Collecting pandas
  Downloading pandas-3.0.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (79 kB)
Downloading pandas-3.0.1-cp312-cp312-macosx_10_13_x86_64.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m20.9 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hInstalling collected packages: pandas
Successfully installed pandas-3.0.1


In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directory holding the per-model comparison CSVs (`compare_<model>.csv`).
# The path is relative, so it is resolved against the notebook's working directory.
RESULTS_DIR = Path('../Analysis/results')
# Model identifiers to look for; each one is used to build a CSV filename under RESULTS_DIR.
MODELS = ['gpt-4o', 'claude-3.5-sonnet', 'gemma-3-27b-it', 'qwen2.5-vl-72b-instruct']

ModuleNotFoundError: No module named 'pandas'

## Load per-model CSVs
Run `compare_generated_to_truth.py --all-models` first to populate `Analysis/results/`.

In [None]:
# Read each model's comparison CSV into `dfs`, keyed by model name.
# Models without a CSV are reported and left out rather than aborting the run.
dfs = {}
for model in MODELS:
    candidate = RESULTS_DIR / f'compare_{model}.csv'
    if not candidate.exists():
        print(f'Missing: {candidate}')
        continue
    dfs[model] = pd.read_csv(candidate)

print(f'Loaded {len(dfs)} model(s):', list(dfs))

## Leaderboard
Metrics: lower is better for `vol_rel`, `hausdorff`, and `xi_l2`; a higher `p_value` indicates greater similarity.

In [None]:
SHAPE_METRICS = ['vol_rel', 'hausdorff', 'xi_l2', 'p_value']

# Build one summary record per model: pass rates over all rows, then the
# mean and median of every shape metric over the rows whose status is 'ok'.
rows = []
for model, df in dfs.items():
    n = len(df)
    ok = df.loc[df['status'] == 'ok']

    if n > 0:
        code_pass = (df['code_success'] == True).sum() / n
        shape_pass = len(ok) / n
    else:
        # An empty CSV has no defined rates.
        code_pass = shape_pass = float('nan')

    row = dict(
        model=model,
        n_total=n,
        code_pass_rate=round(code_pass, 3),
        shape_eval_rate=round(shape_pass, 3),
    )

    for m in SHAPE_METRICS:
        vals = ok[m].dropna()
        if len(vals):
            row[f'{m}_mean'] = round(vals.mean(), 4)
            row[f'{m}_median'] = round(vals.median(), 4)
        else:
            # No usable values for this metric: report NaN for both statistics.
            row[f'{m}_mean'] = float('nan')
            row[f'{m}_median'] = float('nan')

    rows.append(row)

leaderboard = pd.DataFrame(rows).set_index('model')
leaderboard

## Metric distributions per model

In [None]:
# Panel titles per metric; the arrow states which direction is better.
metric_labels = {
    'vol_rel':   'Volume rel. diff  (↓ better)',
    'hausdorff': 'Hausdorff dist    (↓ better)',
    'xi_l2':     'ξ(r) L2           (↓ better)',
    'p_value':   'p-value (Fisher)  (↑ better)',
}

model_names = list(dfs.keys())
# Tick label = first two dash-separated parts of the model name
# (names without a dash are kept unchanged).
short_names = ['-'.join(m.split('-')[:2]) for m in model_names]

# One panel per entry in SHAPE_METRICS. squeeze=False keeps `axes` 2-D even
# when there is a single metric, so the row can always be taken with [0].
n_panels = len(SHAPE_METRICS)
fig, axes = plt.subplots(1, n_panels, figsize=(4.5 * n_panels, 5), squeeze=False)
axes = axes[0]

# Loop-invariant values: one colour per model and the box positions (1..N).
colors = plt.cm.tab10(np.linspace(0, 0.6, len(model_names)))
tick_positions = range(1, len(model_names) + 1)

for ax, metric in zip(axes, SHAPE_METRICS):
    # Per model: the metric values of rows with status 'ok', NaNs removed.
    data = [dfs[m].query("status == 'ok'")[metric].dropna().values for m in model_names]
    bp = ax.boxplot(data, patch_artist=True, medianprops=dict(color='black', linewidth=2))
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    # Fall back to the raw metric name if no label is defined for it.
    ax.set_title(metric_labels.get(metric, metric), fontsize=10)
    # Fix tick positions before assigning the labels.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(short_names, rotation=20, ha='right', fontsize=8)
    ax.grid(axis='y', alpha=0.3)

plt.suptitle('Shape Metric Distributions by Model', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

## Code pass rate vs shape quality

In [None]:
shape_metrics_scatter = ['hausdorff', 'xi_l2', 'p_value']

# One panel per metric; squeeze=False keeps `axes` 2-D for any metric count.
n_panels = len(shape_metrics_scatter)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
axes = axes[0]
colors = plt.cm.tab10(np.linspace(0, 0.6, len(model_names)))

for ax, metric in zip(axes, shape_metrics_scatter):
    for color, model in zip(colors, model_names):
        df = dfs[model]
        ok_vals = df.query("status == 'ok'")[metric].dropna()
        # Skip models with nothing to plot *before* dividing by len(df):
        # a non-empty ok_vals guarantees len(df) > 0, so an empty CSV can no
        # longer trigger a divide-by-zero warning here.
        if ok_vals.empty:
            continue
        code_pass = (df['code_success'] == True).sum() / len(df)
        shape_score = ok_vals.median()
        # Point label: the part of the model name before the first dash.
        tag = model.split('-')[0]
        ax.scatter(code_pass, shape_score, color=color, s=120, zorder=3,
                   label=tag)
        ax.annotate(tag, (code_pass, shape_score),
                    textcoords='offset points', xytext=(6, 4), fontsize=8)
    ax.set_xlabel('Code pass rate', fontsize=10)
    ax.set_ylabel(f'Median {metric}', fontsize=10)
    ax.set_title(f'Pass rate vs {metric}', fontsize=10)
    ax.grid(alpha=0.3)

plt.suptitle('Code Executability vs Shape Quality', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

## Error breakdown per model

In [None]:
from metrics import code_error_bins
import os

# Root directory containing one sub-directory per model.
GEN_ROOT = Path('../generated')

# squeeze=False always returns a 2-D array of axes, so the single-model case
# needs no special handling; [0] selects the only row.
fig, axes = plt.subplots(1, len(dfs), figsize=(5 * len(dfs), 4), squeeze=False)
axes = axes[0]

for ax, model in zip(axes, model_names):
    gen_dir = GEN_ROOT / model
    if not gen_dir.exists():
        ax.set_title(f'{model}\n(dir missing)')
        continue
    # `bins` is read below as a mapping: label -> record with a 'fraction' entry.
    bins = code_error_bins(str(gen_dir))
    labels = list(bins.keys())
    fracs = [bins[k]['fraction'] for k in labels]
    bar_colors = ['#2ecc71' if k == 'passed' else '#e74c3c' for k in labels]
    # Draw bars at explicit numeric positions so the tick locations are fixed
    # before labels are attached; set_xticklabels on un-fixed ticks is
    # discouraged by matplotlib and emits a warning.
    positions = list(range(len(labels)))
    bars = ax.bar(positions, fracs, color=bar_colors, edgecolor='white', linewidth=0.5)
    for bar, frac in zip(bars, fracs):
        # Percentage label just above each bar.
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                f'{frac:.0%}', ha='center', va='bottom', fontsize=9)
    ax.set_ylim(0, 1.12)
    ax.set_title(model, fontsize=9)
    ax.set_ylabel('Fraction of files')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=25, ha='right', fontsize=8)
    ax.grid(axis='y', alpha=0.3)

plt.suptitle('Code Error Breakdown by Model', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

## Per-sample inspection
Pick a model and inspect its best and worst shapes by Hausdorff distance.

In [None]:
MODEL = 'gpt-4o'   # change as needed

# Fail with an actionable message if the chosen model's CSV was not loaded.
if MODEL not in dfs:
    raise KeyError(f'{MODEL!r} was not loaded; available models: {list(dfs)}')

# Rank only rows that actually have a Hausdorff value. sort_values places NaN
# last by default, so without the dropna the "worst 5" could be rows with no
# measurement at all.
df = (
    dfs[MODEL]
    .query("status == 'ok'")
    .dropna(subset=['hausdorff'])
    .sort_values('hausdorff')
)

report_cols = ['sample_id', 'vol_rel', 'hausdorff', 'xi_l2', 'p_value']

print(f'=== {MODEL}: best 5 by Hausdorff ===')
print(df[report_cols].head(5).to_string(index=False))

print(f'\n=== {MODEL}: worst 5 by Hausdorff ===')
print(df[report_cols].tail(5).to_string(index=False))