# Captain Analysis

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import glob
import re

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from battleship.run_captain_benchmarks import rebuild_captain_summary_from_results
from battleship.utils import resolve_project_path
from battleship.agents import EIGCalculator, CodeQuestion, Question
from battleship.game import Board

from analysis import (
    CAPTAIN_TYPE_LABELS,
    MODEL_DISPLAY_NAMES,
    human_round_summaries,
    build_competitor_column,
    compute_pairwise_win_rates,
    plot_grouped_winrate_heatmap,
)

In [None]:
%config InlineBackend.figure_format = 'retina'

# Notebook-wide seaborn appearance: "whitegrid" axes style, "talk" context
# scaling, and the tab10 color cycle as the default palette.
sns.set_style("whitegrid")
sns.set_context("talk")
sns.set_palette("tab10")

In [None]:
# Name of the human experiment; it doubles as the folder name under data/.
HUMAN_EXPERIMENT_NAME = "battleship-final-data"

# Input folder for the human data, and the folder that figures are written to.
PATH_DATA = os.path.join("data", HUMAN_EXPERIMENT_NAME)
PATH_EXPORT = os.path.join(PATH_DATA, "export")

# Folder (joined with each run directory name below) holding the captain
# benchmark runs.
CAPTAIN_EXPERIMENT_PATH = "experiments/collaborative/captain_benchmarks/"

## Data loading

### Human data

In [None]:
# Load the human round summaries from PATH_DATA into a DataFrame and tag every
# row with llm="Human" so the rows stay identifiable once model rows are added.
human_df = pd.DataFrame(
    human_round_summaries(experiment_path=PATH_DATA)
).assign(llm="Human")
human_df

### Model data

In [None]:
# (label, run directory name) for every captain benchmark run to load.
# The label is stored in the "llm" column of the resulting rows.
model_round_data_unresolved_paths = [
    ("gpt-4o", "run_2025_08_25_16_28_19"),
    ("gpt-5", "run_2025_08_25_22_02_29"),
    ("llama-4-scout", "run_2025_08_26_17_56_46"),
    ("Baseline", "run_2025_08_26_17_23_23"),
]

# Resolve each run directory through resolve_project_path.
model_round_data_paths = [
    (name, resolve_project_path(os.path.join(CAPTAIN_EXPERIMENT_PATH, path)))
    for name, path in model_round_data_unresolved_paths
]

# Build one summary frame per run. A missing run directory is reported and
# skipped rather than being handed to the loader anyway.
# The per-run frame is named run_df so it does not clobber the notebook-level
# `df` that later cells build.
dfs = []
for name, path in model_round_data_paths:
    if not os.path.exists(path):
        print(f"The path {path} does not exist.")
        continue
    run_df = pd.DataFrame(rebuild_captain_summary_from_results(path))
    if run_df.empty:
        continue
    run_df["llm"] = name
    run_df["run_dir"] = path  # retain run directory for downstream file access
    dfs.append(run_df)

# An empty frame (rather than an error) when no run produced any rows.
model_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
model_df

In [None]:
# Stack the human rows and the model rows into a single table.
df = pd.concat([human_df, model_df], ignore_index=True)

primary_columns = ["captain_type_display", "llm_display", "board_id", "seed"]

# Ordered categorical of captain labels; category order follows the order of
# the values in CAPTAIN_TYPE_LABELS (duplicates removed, first occurrence kept).
df["captain_type_display"] = pd.Categorical(
    df["captain_type"].map(CAPTAIN_TYPE_LABELS),
    categories=list(dict.fromkeys(CAPTAIN_TYPE_LABELS.values())),
    ordered=True,
)

# Ordered categorical of llm labels: "Human", "Baseline", then the labels from
# MODEL_DISPLAY_NAMES that occur in the data (in that mapping's order), then
# any label present in the data but not covered by the above. The last step
# matters because pd.Categorical turns values outside `categories` into NaN,
# which would silently drop those rows from sorting and grouping. Duplicates
# are skipped because Categorical requires unique categories.
observed_llm_names = list(df["llm"].dropna().unique())
llm_categories = ["Human", "Baseline"]
for candidate in [*MODEL_DISPLAY_NAMES.values(), *observed_llm_names]:
    if candidate in observed_llm_names and candidate not in llm_categories:
        llm_categories.append(candidate)

df["llm_display"] = pd.Categorical(
    df["llm"],
    categories=llm_categories,
    ordered=True,
)

# Move primary columns to the front
df = df[primary_columns + [col for col in df.columns if col not in primary_columns]]

# Sort the DataFrame by primary columns
df = df.sort_values(by=primary_columns, ascending=True).reset_index(drop=True)

df

## Precision/Recall Stats

In [None]:
# For every captain category (including ones with no rows), list the distinct
# llm labels that have rows for that captain.
print("\nBreakdown by captain_type_display:")
for captain_type in df['captain_type_display'].cat.categories:
    rows_for_captain = df['captain_type_display'] == captain_type
    llms = df.loc[rows_for_captain, 'llm'].unique()
    print(f"{captain_type}: {llms}")


# Fixed color per llm label, taken from the colorblind-friendly Okabe–Ito
# palette, so every figure in the notebook colors a given llm the same way.
llm_palette = dict(
    [
        ("Human", "#009E73"),  # green
        ("Baseline", "#0072B2"),  # blue
        ("llama-4-scout", "#CC79A7"),  # purple
        ("gpt-4o", "#E69F00"),  # orange (close in hue to gpt-5)
        ("gpt-5", "#D55E00"),  # vermillion
    ]
)

In [None]:
# Quick seaborn view: F1 score per captain type, one box per llm label.
fig, ax = plt.subplots(figsize=(6, 4))

sns.boxplot(
    x="captain_type_display",
    y="f1_score",
    hue="llm",
    data=df,
    palette=llm_palette,
    ax=ax,
)

ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
plt.xticks(rotation=90)

# Legend sits outside the axes, to the right of the plot area.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1), title="")

sns.despine()
plt.show()

In [None]:
# Hand-placed boxplot of F1 per captain type and llm. Unlike the seaborn
# version, each captain's boxes are centered on its tick using only the llms
# that have data for that captain, so no empty slots are reserved.
fig, ax = plt.subplots(figsize=(6, 4))

# Fallback color for an llm label that has no entry in llm_palette; used for
# both the boxes and the legend so the two can never disagree.
fallback_color = "#808080"

# Prepare ordered captain categories that actually appear in the data
captain_categories = [
    c for c in df["captain_type_display"].cat.categories
    if c in df["captain_type_display"].values
]

# Largest number of llm groups present for any captain (for consistent box
# widths). observed=True restricts the count to captains that have rows;
# max(1, ...) keeps the division below well-defined when no llm is present.
llm_counts = df.groupby("captain_type_display", observed=True)["llm"].nunique()
max_llms = max(1, int(llm_counts.max())) if len(llm_counts) > 0 else 1

# Base positions for each captain on the x axis
x_positions = np.arange(len(captain_categories))

# Box width: leave some padding between captain groups
group_width = 0.5  # total width occupied by boxes for one captain
box_width = group_width / max_llms

# Ensure grid lines are drawn below plot elements and only horizontal gridlines are shown
ax.set_axisbelow(True)
ax.xaxis.grid(False)
ax.yaxis.grid(True)

# Use the llm_display categorical ordering so order is consistent across plots
llm_order = list(df["llm_display"].cat.categories) if "llm_display" in df.columns else sorted(df["llm"].unique())

for i, captain in enumerate(captain_categories):
    present_llms_unsorted = df[df["captain_type_display"] == captain]["llm"].unique()
    # Preserve the display order
    present_llms = [llm for llm in llm_order if llm in present_llms_unsorted]

    m = len(present_llms)
    if m == 0:
        continue

    # Offsets to center m boxes around the captain x position
    offsets = (np.arange(m) - (m - 1) / 2.0) * box_width

    for j, llm in enumerate(present_llms):
        subset = df[(df["captain_type_display"] == captain) & (df["llm"] == llm)]["f1_score"].dropna()
        if subset.empty:
            continue

        pos = x_positions[i] + offsets[j]
        color = llm_palette.get(llm, fallback_color)

        # Use matplotlib's boxplot to place each box at the computed numeric position
        bp = ax.boxplot(subset.values,
                        positions=[pos],
                        widths=box_width * 0.9,
                        patch_artist=True,
                        manage_ticks=False)

        # Style the box elements
        for element in ["boxes", "whiskers", "caps", "medians"]:
            plt.setp(bp[element], color=color)
        for patch in bp["boxes"]:
            patch.set(facecolor=color, alpha=0.6)

        # Make fliers (outliers) less visually distinctive: smaller, lower-alpha, and same color as box
        if "fliers" in bp:
            for f in bp["fliers"]:
                f.set(marker='o', markersize=3, markerfacecolor=color, markeredgecolor=color, alpha=0.35, markeredgewidth=0)

# Create legend handles for llm types present in the full DataFrame, in llm_display order
from matplotlib.patches import Patch
all_present_llms = [llm for llm in llm_order if llm in df["llm"].unique()]
legend_handles = [
    Patch(facecolor=llm_palette.get(k, fallback_color), label=k, alpha=0.6)
    for k in all_present_llms
]

ax.legend(handles=legend_handles, loc="upper left", bbox_to_anchor=(1, 1), title="")

# Final formatting
ax.set_xticks(x_positions)
ax.set_xticklabels(captain_categories, rotation=90)
ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
ax.set_xlim(-0.5, len(captain_categories) - 0.5)

sns.despine()

# Make sure the export folder exists so savefig does not fail on a fresh checkout.
os.makedirs(PATH_EXPORT, exist_ok=True)
plt.savefig(os.path.join(PATH_EXPORT, "captain_f1_boxplot.pdf"), dpi=300, bbox_inches="tight")

In [None]:
# Strip plot of individual F1 scores per captain type, colored by llm label;
# points are semi-transparent so overlapping rounds stay visible.
fig, ax = plt.subplots(figsize=(6, 4))

sns.stripplot(
    x="captain_type_display",
    y="f1_score",
    hue="llm",
    data=df,
    palette=llm_palette,
    alpha=0.7,
    ax=ax,
)

ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
plt.xticks(rotation=90)

# Legend sits outside the axes, to the right of the plot area.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1), title="")

sns.despine()
plt.show()

In [None]:
# Swarm plot of individual F1 scores per captain type, colored by llm label.
# Point alpha is left at the seaborn default.
fig, ax = plt.subplots(figsize=(8, 6))

sns.swarmplot(
    x="captain_type_display",
    y="f1_score",
    hue="llm",
    data=df,
    palette=llm_palette,
    ax=ax,
)

ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
plt.xticks(rotation=90)

# Legend sits outside the axes, to the right of the plot area.
ax.legend(loc="upper left", bbox_to_anchor=(1, 1), title="")

sns.despine()
plt.show()


In [None]:
# Empirical CDF of F1 scores, one curve per captain type.
sns.displot(
    df,
    x="f1_score",
    hue="captain_type_display",
    kind="ecdf",
)

In [None]:
# Total shots per round = hits + misses, stored on df for later cells.
df["move_count"] = df["hits"].add(df["misses"])

# Bar chart of move count per raw captain_type (not the display label).
sns.barplot(x="captain_type", y="move_count", hue="captain_type", data=df)
plt.xticks(rotation=45)

## Win rates

In [None]:
# Add the "competitor" column built from llm_display and captain_type_display
# (LLM first | Captain).
df = build_competitor_column(
    df,
    llm_col="llm_display",
    captain_col="captain_type_display",
    out_col="competitor",
)

# Pairwise win rates between competitors on F1 (higher is better), compared
# per board_id.
win_results_f1_comp = compute_pairwise_win_rates(
    df,
    metric="f1_score",
    higher_is_better=True,
    competitor_col="competitor",
    board_col="board_id",
)

print("Mean board win rate matrix (F1, competitor-level):")
display(win_results_f1_comp["mean_board_win_rate_matrix"])

# NOTE: only the mean-board matrix is shown here. Earlier, now-removed variants
# of this cell also displayed win_results_f1_comp["weighted_win_rate_matrix"],
# an "aggregate" summary table, and the same analysis for metric="move_count"
# with higher_is_better=False.

In [None]:
# Grouped win-rate heatmap of the weighted F1 win-rate matrix, saved as a PDF
# in PATH_EXPORT.
base_matrix = win_results_f1_comp["weighted_win_rate_matrix"].copy().astype(float)

# All display options for the helper, collected in one place.
heatmap_options = dict(
    llm_palette=llm_palette,
    cmap="cividis",
    annotate=True,
    captain_tick_fontsize=6,
    row_alpha=1.0,
    col_alpha=1.0,
    show_group_separators=True,
    separator_width=4,
    shade_rows=True,
    shade_cols=True,
    group_label_rotation=90,
    group_label_fontsize=10,
    output_path=os.path.join(PATH_EXPORT, "f1_winrate_heatmap.pdf"),
    title=None,
)

fig, ax = plot_grouped_winrate_heatmap(base_matrix, **heatmap_options)

## EIG Stats

In [None]:
# --- Theoretical Maximum EIG (shared) -------------------------------------------
# Defines EIG_EPSILON and THEORETICAL_MAX_EIG once; the ECDF-delta and
# running-max cells below read these globals instead of recomputing them.
import numpy as np
from battleship.agents import binary_entropy

# Noise / error probability parameter.
EIG_EPSILON = 0.1

# Upper bound used as a reference line in the plots: entropy at p=0.5 minus
# the entropy at the error probability.
THEORETICAL_MAX_EIG = (
    binary_entropy(p=0.5)
    - binary_entropy(EIG_EPSILON)
)
print(f"Set THEORETICAL_MAX_EIG={THEORETICAL_MAX_EIG:.4f} for epsilon={EIG_EPSILON}")

In [None]:
# --- Model EIG Extraction (All Runs) -----------------------------------------
# Build EIG table directly from model_df metadata (no filename round parsing).
# Produces model_eig_df with one row per question entry that has both an "eig"
# value and a "question" block in a round's captain.json.
from pathlib import Path
import json
import pandas as pd


def nested_question_text(block):
    """Return ``block["question"]["text"]``, or None if a level is not a dict."""
    if not isinstance(block, dict):
        return None
    inner = block.get("question")
    return inner.get("text") if isinstance(inner, dict) else None


def score_candidates(raw_candidates):
    """Convert the raw ``eig_questions`` list into (text, eig, is_best) tuples.

    Returns None for an empty or missing list. ``is_best`` is True for every
    candidate whose EIG equals the maximum non-None EIG, and None for all
    candidates when no candidate has an EIG. A candidate that is not a dict
    contributes (None, None, ...) so positions in the list are preserved.
    """
    if not raw_candidates:
        return None
    pairs = [
        (nested_question_text(c.get("question")), c.get("eig"))
        if isinstance(c, dict)
        else (None, None)
        for c in raw_candidates
    ]
    scored = [ev for _, ev in pairs if ev is not None]
    if not scored:
        return [(qt, ev, None) for qt, ev in pairs]
    best = max(scored)
    return [(qt, ev, ev == best) for qt, ev in pairs]


# Guard: ensure model_df exists
if 'model_df' not in globals() or model_df.empty:
    model_eig_df = pd.DataFrame()
else:
    # We expect columns: llm, run_dir, round_id, captain_type
    required = {"llm", "run_dir", "round_id", "captain_type"}
    missing = required - set(model_df.columns)
    if missing:
        print(f"Missing columns in model_df: {missing}; cannot extract EIG.")
        model_eig_df = pd.DataFrame()
    else:
        records = []
        # Iterate unique (llm, run_dir, round_id, captain_type)
        for (llm, run_dir, round_id, captain_type) in (
            model_df[["llm", "run_dir", "round_id", "captain_type"]]
            .drop_duplicates()
            .itertuples(index=False, name=None)
        ):
            captain_json = Path(run_dir) / "rounds" / f"round_{round_id}" / "captain" / "captain.json"
            if not captain_json.exists():
                # Rounds without a captain.json are skipped silently.
                continue
            try:
                with captain_json.open() as f:
                    entries = json.load(f)
            except (OSError, ValueError) as err:
                # ValueError covers json.JSONDecodeError and decoding errors.
                # Report the file instead of dropping it without a trace.
                print(f"Skipping unreadable {captain_json}: {err}")
                continue
            if not isinstance(entries, list):
                print(f"Skipping {captain_json}: expected a list of entries, got {type(entries).__name__}.")
                continue
            for q_idx, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    continue
                eig_value = entry.get("eig")
                q_block = entry.get("question")
                if eig_value is None or q_block is None:
                    continue
                q_text = nested_question_text(q_block) or "No question text"

                records.append(
                    {
                        "llm": llm,
                        "captain_type": captain_type,
                        "run_dir": run_dir,
                        "round_id": round_id,
                        "question_idx": q_idx,
                        "question": q_text,
                        "eig": eig_value,
                        "eig_candidates": score_candidates(entry.get("eig_questions")),
                    }
                )
        model_eig_df = pd.DataFrame(records)
        if not model_eig_df.empty:
            model_eig_df = model_eig_df.sort_values(["llm", "round_id", "question_idx"]).reset_index(drop=True)

# Add captain_type_display similar to main df
if not model_eig_df.empty and 'CAPTAIN_TYPE_LABELS' in globals():
    model_eig_df['captain_type_display'] = model_eig_df['captain_type'].map(CAPTAIN_TYPE_LABELS)
    # Preserve order used elsewhere, restricted to the labels that occur here
    cat_order = [x for x in dict.fromkeys(CAPTAIN_TYPE_LABELS.values()) if x in model_eig_df['captain_type_display'].unique()]
    model_eig_df['captain_type_display'] = pd.Categorical(model_eig_df['captain_type_display'], categories=cat_order, ordered=True)

# Derive llm_display from the main df's llm -> llm_display pairs when
# available; otherwise fall back to the raw llm label.
if not model_eig_df.empty and 'llm_display' not in model_eig_df.columns:
    if 'df' in globals() and 'llm_display' in df.columns:
        llm_display_map = (
            df.dropna(subset=['llm_display'])
              .drop_duplicates('llm')
              [['llm','llm_display']]
              .set_index('llm')['llm_display']
              .to_dict()
        )
        model_eig_df['llm_display'] = model_eig_df['llm'].map(llm_display_map).fillna(model_eig_df['llm'])
    else:
        model_eig_df['llm_display'] = model_eig_df['llm']

# Align categorical ordering with main df if available
if not model_eig_df.empty and 'df' in globals() and 'llm_display' in df.columns and hasattr(df['llm_display'], 'cat'):
    model_eig_df['llm_display'] = pd.Categorical(
        model_eig_df['llm_display'],
        categories=list(df['llm_display'].cat.categories),
        ordered=True,
    )

model_eig_df

In [None]:
# --- Per-Captain EIG Distribution --------------------------------------------
# Boxplot of question EIG per captain type, one box per llm label (outliers
# hidden). Requires model_eig_df from the extraction cell.
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

if 'model_eig_df' in globals() and not model_eig_df.empty:
    # Prefer the display label for the x axis; fall back to the raw type.
    if 'captain_type_display' in model_eig_df.columns:
        x_col = 'captain_type_display'
    else:
        x_col = 'captain_type'

    plt.figure(figsize=(8, 4.5))
    ax = sns.boxplot(
        x=x_col,
        y='eig',
        hue='llm',
        data=model_eig_df,
        palette=llm_palette,
        showfliers=False,
    )

    ax.set_title('Per-Captain EIG Distribution (Models)')
    ax.set_xlabel('Captain Type')
    ax.set_ylabel('EIG')
    plt.xticks(rotation=45, ha='right')
    ax.legend(title='LLM', bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
else:
    print("model_eig_df is empty; run the EIG extraction cell first.")

In [None]:
# Quick inspection: list the columns available in model_eig_df.
model_eig_df.columns

In [None]:
# Complementary ECDF (1 - CDF) of question EIG, one facet per captain type and
# one curve per llm label.
# NOTE: sns.displot returns a FacetGrid, not an Axes, despite the name `ax`.
ax = sns.displot(
    model_eig_df,
    x="eig",
    hue="llm",
    col="captain_type_display",
    kind="ecdf",
    complementary=True,
    palette=llm_palette,
)

In [None]:
# --- ECDF Delta (LLM vs EIG captain types) -----------------------------------
# For each llm that has both captain types, draw the two (complementary) ECDFs
# of question EIG on one axis, shade the gap between them, and report the
# integrated absolute gap per llm.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Configuration
_ecdf_categories = ("LLM", "EIG")  # (baseline, enhanced)
complementary = True  # match earlier visualization style (survival curves)
alpha_fill = 0.18
linewidth = 2.0

# NumPy 2.0 renamed trapz to trapezoid and deprecated the old name; use
# whichever the installed version provides.
_integrate_trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

if 'model_eig_df' not in globals() or model_eig_df.empty:
    print("model_eig_df empty; run extraction first.")
else:
    # Ensure required captain types exist
    present_cats = set(model_eig_df['captain_type_display'].dropna().unique())
    missing = [c for c in _ecdf_categories if c not in present_cats]
    if missing:
        print(f"Missing categories for delta plot: {missing}")
    else:
        fig, ax = plt.subplots(figsize=(7.5, 5))

        area_rows = []

        # Iterate in palette order so colors and ordering match other figures.
        for llm in [c for c in llm_palette.keys() if c in model_eig_df['llm'].unique()]:
            sub = model_eig_df[model_eig_df['llm'] == llm]
            # Need both categories for this llm
            if not set(_ecdf_categories).issubset(set(sub['captain_type_display'].unique())):
                continue
            data_a = sub[sub['captain_type_display'] == _ecdf_categories[0]]['eig'].dropna().values
            data_b = sub[sub['captain_type_display'] == _ecdf_categories[1]]['eig'].dropna().values
            if len(data_a) == 0 or len(data_b) == 0:
                continue

            # Build common grid: every distinct EIG value seen in either sample
            grid = np.unique(np.concatenate([data_a, data_b]))
            n_a = len(data_a)
            n_b = len(data_b)
            # ECDF values using searchsorted (right) for F(x) = P(X <= x)
            y_a = np.searchsorted(np.sort(data_a), grid, side='right') / n_a
            y_b = np.searchsorted(np.sort(data_b), grid, side='right') / n_b
            if complementary:
                y_a = 1 - y_a
                y_b = 1 - y_b

            color = llm_palette.get(llm, '#444444')
            # Plot baseline (LLM) dashed, enhanced (EIG) solid
            ax.plot(grid, y_a, linestyle='--', color=color, linewidth=linewidth, alpha=0.9)
            ax.plot(grid, y_b, linestyle='-', color=color, linewidth=linewidth, alpha=0.9)

            # Shade region between curves
            ax.fill_between(grid, y_a, y_b, color=color, alpha=alpha_fill, linewidth=0)

            # Approximate absolute area difference (integral of |delta|) for reference
            area_diff = _integrate_trapezoid(np.abs(y_b - y_a), grid)
            area_rows.append({'llm': llm, 'area_abs_diff': area_diff})

        # Use shared THEORETICAL_MAX_EIG
        if 'THEORETICAL_MAX_EIG' in globals():
            ax.axvline(THEORETICAL_MAX_EIG, color='k', linestyle=':', linewidth=1.5)
            ymin, ymax = ax.get_ylim()
            ax.text(
                THEORETICAL_MAX_EIG * 1.01,
                ymax * 0.97,
                f"Theoretical Max EIG (ε={EIG_EPSILON})≈{THEORETICAL_MAX_EIG:.3f}",
                rotation=90,
                va='top',
                ha='left',
                fontsize=9,
                color='k'
            )

        ax.set_xlim(left=0)
        ax.set_xlabel('EIG')
        ax.set_ylabel('Proportion (CDF)' if not complementary else 'Proportion (1 - CDF)')
        ax.set_title(r"$\Delta$EIG w/r/t Base LLM")
        ax.grid(alpha=0.3, linestyle="-")

        # Simplified legend: only distinguish line styles (color meaning shown elsewhere)
        from matplotlib.lines import Line2D
        style_handles = [
            Line2D([0,1],[0,1], color='k', linestyle='--', linewidth=linewidth, label=_ecdf_categories[0]),
            Line2D([0,1],[0,1], color='k', linestyle='-', linewidth=linewidth, label=_ecdf_categories[1]),
        ]
        ax.legend(handles=style_handles, loc='lower left', frameon=True, title='Captain Type')

        sns.despine()
        plt.tight_layout()

        # Make sure the export folder exists so savefig does not fail.
        os.makedirs(PATH_EXPORT, exist_ok=True)
        plt.savefig(
            os.path.join(PATH_EXPORT, "eig_ecdf_delta.pdf"),
            bbox_inches='tight',
            dpi=300,
        )

        plt.show()

        if area_rows:
            area_df = pd.DataFrame(area_rows).sort_values('area_abs_diff', ascending=False)
            display(area_df.reset_index(drop=True))

### Growth of max EIG with number of candidate questions
We fix the captain type to `EIG` and, for every row in `model_eig_df` (one row per question asked in a model round), compute the running maximum EIG obtainable when only the first k candidate questions are considered (k = 1..N). We then average these curves across rows for each LLM and plot one line per LLM, with a shaded band of ±1 standard error.

In [None]:
# Running max EIG vs number of candidate questions per LLM
# For each question row of the chosen captain type, take the candidate EIGs in
# stored order, compute their running maximum, then average the curves per llm.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Configurable captain type display to analyze
CAPTAIN_TYPE_DISPLAY = "EIG"  # change this to any value present in model_eig_df['captain_type_display']


def extract_eigs(candidates):
    """Return the non-None EIG values of a candidate list as floats, in order.

    Accepts items shaped as (text, eig, ...) sequences or as dicts carrying the
    value under "eig", "EIG" or "value" (first such key with a non-None value).
    Anything that is not a list yields []. Values that cannot be converted to
    float are skipped.
    """
    if not isinstance(candidates, list):
        return []
    vals = []
    for item in candidates:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            val = item[1]
            if val is not None:
                try:
                    vals.append(float(val))
                except (TypeError, ValueError):
                    # Unparseable value: leave it out of the curve.
                    pass
        elif isinstance(item, dict):
            for k in ["eig", "EIG", "value"]:
                if k in item and item[k] is not None:
                    try:
                        vals.append(float(item[k]))
                    except (TypeError, ValueError):
                        pass
                    break
    return vals


# Fail fast with an explicit exception (an assert would be stripped under -O).
required_cols = {"llm_display", "captain_type_display", "eig_candidates"}
missing = required_cols - set(model_eig_df.columns)
if missing:
    raise KeyError(f"model_eig_df missing required columns: {missing}")

subset = model_eig_df[model_eig_df["captain_type_display"] == CAPTAIN_TYPE_DISPLAY].copy()
if subset.empty:
    print(f"No rows with captain_type_display == '{CAPTAIN_TYPE_DISPLAY}'.")
else:
    # Show a small sample of the raw candidate structure for sanity checking.
    preview = None
    for v in subset["eig_candidates"]:
        if isinstance(v, list) and v:
            preview = v[:3]
            break
    print("Preview eig_candidates (first 3 of first non-empty row):", preview)

    # Long-format table: one record per (row, k) with the running max at k.
    records = []
    for _, row in subset.iterrows():
        eigs = extract_eigs(row["eig_candidates"])
        if not eigs:
            continue
        running_max = np.maximum.accumulate(eigs)
        for k, val in enumerate(running_max, start=1):
            records.append({
                "llm_display": row["llm_display"],
                "k": k,
                "running_max_eig": val,
            })

    curve_df = pd.DataFrame(records)
    if curve_df.empty:
        print("No candidate EIG values extracted.")
    else:
        agg = (
            curve_df.groupby(["llm_display", "k"])
            ["running_max_eig"].agg(["mean", "count", "std"]).reset_index()
        )
        # Standard error of the mean; a zero count maps to NaN instead of inf.
        agg["se"] = agg["std"] / np.sqrt(agg["count"]).replace(0, np.nan)

        fig, ax = plt.subplots(figsize=(7, 4.5))
        palette = llm_palette if 'llm_palette' in globals() else None
        for llm_name, g in agg.groupby("llm_display"):
            g_sorted = g.sort_values("k")
            color = palette.get(llm_name, None) if palette else None
            ax.plot(g_sorted["k"], g_sorted["mean"], label=llm_name, color=color)
            if g_sorted["se"].notna().any():
                ax.fill_between(
                    g_sorted["k"],
                    g_sorted["mean"] - g_sorted["se"],
                    g_sorted["mean"] + g_sorted["se"],
                    alpha=0.18,
                    color=color,
                )
        # Add theoretical max EIG horizontal line if available
        if 'THEORETICAL_MAX_EIG' in globals():
            ax.axhline(THEORETICAL_MAX_EIG, color='k', linestyle=':', linewidth=1.25, label=f"Theoretical max (ε={EIG_EPSILON})")
            xmin, xmax = ax.get_xlim()
            x_center = 0.5 * (xmin + xmax)
            ax.text(
                x_center,
                THEORETICAL_MAX_EIG * 0.98,
                f"Theoretical Max EIG (ε={EIG_EPSILON})≈{THEORETICAL_MAX_EIG:.3f}",
                va="top",
                ha="center",
                fontsize=9,
                color="k",
                bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.6, edgecolor="none"),
            )

        ax.set_xlabel("Number of candidate questions considered (k)")
        ax.set_ylabel("Max EIG")
        ax.grid(alpha=0.3, linestyle="-")

        sns.despine()
        plt.tight_layout()

        # Make sure the export folder exists so savefig does not fail.
        os.makedirs(PATH_EXPORT, exist_ok=True)
        plt.savefig(
            os.path.join(PATH_EXPORT, "eig_max_vs_k.pdf"),
            bbox_inches='tight',
            dpi=300
        )

        plt.show()
        display(agg.head())

### Human question EIG

In [None]:
# --- Human Question EIG Calculation ------------------------------------------
# Computes EIG for human questions and caches to human_eig_df.csv (≈1–2 min first run).
from pathlib import Path
import os, json
import pandas as pd

# CSV cache for the computed human EIG table, relative to the notebook's
# working directory.
CACHE_PATH = Path("human_eig_df.csv")

# JSON file with the human question/answer interactions, resolved through
# resolve_project_path.
INPUT_JSON_PATH = resolve_project_path(
    "experiments/collaborative/spotter_benchmarks/o4-mini_CodeSpotterModel_True.json"
)


def load_human_interactions(json_path: Path) -> list[dict]:
    """Read the JSON file at ``json_path`` and return its parsed content."""
    return json.loads(json_path.read_text())


def build_human_df(entries: list[dict]) -> pd.DataFrame:
    """Build one row per usable human interaction.

    Entries lacking either a "question" or an "occTiles" key are skipped.

    Args:
        entries: Parsed interaction records (dicts).

    Returns:
        DataFrame with columns question, program, board_state (the entry's
        "occTiles"), answer (lower-cased, with "true"/"false" mapped to
        "yes"/"no"), true_answer (stored as given) and correct
        (``answer == true_answer``).
    """
    rows = []
    for e in entries:
        if not ("question" in e and "occTiles" in e):
            continue
        # A null answer becomes "" and a non-string answer (e.g. a JSON
        # boolean) is stringified first, so .lower() cannot raise.
        raw_answer = e.get("answer")
        answer = "" if raw_answer is None else str(raw_answer).lower()
        # Normalize boolean text answers.
        if answer == "true":
            answer = "yes"
        elif answer == "false":
            answer = "no"
        # NOTE(review): true_answer is compared as-is; this presumably relies
        # on it already being a lower-case "yes"/"no" string — confirm.
        true_answer = e.get("true_answer")
        rows.append(
            {
                "question": e.get("question"),
                "program": e.get("program"),
                "board_state": e.get("occTiles"),
                "answer": answer,
                "true_answer": true_answer,
                "correct": answer == true_answer,
            }
        )
    return pd.DataFrame(rows)


# Load the human EIG table from the CSV cache when present; otherwise compute
# it from the raw interactions and write the cache.
if CACHE_PATH.exists():
    # NOTE: columns read back from CSV are whatever read_csv infers (e.g.
    # board_state comes back as text); delete the cache file to recompute.
    human_eig_df = pd.read_csv(CACHE_PATH)
else:
    raw_entries = load_human_interactions(Path(INPUT_JSON_PATH))
    human_eig_df = build_human_df(raw_entries)
    # Keep only correctly answered questions. The .copy() makes this an
    # independent frame, so adding a column below does not write into a slice
    # of the unfiltered frame (pandas SettingWithCopyWarning).
    human_eig_df = human_eig_df[human_eig_df["correct"]].copy()

    # NOTE(review): epsilon=0 here, while EIG_EPSILON used for the theoretical
    # maximum elsewhere in the notebook is 0.1 — confirm that human and model
    # EIG values are meant to be compared under different noise settings.
    eig_calculator = EIGCalculator(samples=1000, timeout=15, epsilon=0)
    human_eig_df["calculated_eig"] = None

    for idx, row in human_eig_df.iterrows():
        code_question = CodeQuestion(
            question=Question(row["question"]),
            fn_text=row["program"],
            translation_prompt="",
            completion={},
        )
        board = Board.from_occ_tiles(row["board_state"])  # reconstruct board
        human_eig_df.at[idx, "calculated_eig"] = eig_calculator(code_question, board)

    human_eig_df.to_csv(CACHE_PATH, index=False)

human_eig_df

In [None]:
# --- Compare EIG Distributions (Model vs Human) ------------------------------
# Side-by-side boxplot of all model question EIGs vs. the human question EIGs,
# followed by the mean of each group.
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Coerce both series to numeric once. When freshly computed, the human column
# is object-typed and may contain None, so the plot and the averages must use
# the same cleaned values.
model_eig_values = pd.to_numeric(model_eig_df["eig"], errors="coerce")
human_eig_values = pd.to_numeric(human_eig_df["calculated_eig"], errors="coerce")

plot_data = pd.DataFrame(
    {
        "EIG": pd.concat([model_eig_values, human_eig_values], ignore_index=True),
        "Source": ["model"] * len(model_eig_values) + ["human"] * len(human_eig_values),
    }
)

ax = sns.boxplot(data=plot_data, x="Source", y="EIG", palette="Set2")
ax.set(
    title="EIG Distribution (Model vs Human)",
    xlabel="Source",
    ylabel="EIG",
)
ax.grid(axis="y", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()

avg_model = model_eig_values.mean()
avg_human = human_eig_values.mean()
print(f"Average model EIG: {avg_model:.4f}")
print(f"Average human EIG: {avg_human:.4f}")