# Captain Analysis

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import glob
import re

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from battleship.run_captain_benchmarks import rebuild_captain_summary_from_results
from battleship.utils import resolve_project_path
from battleship.agents import EIGCalculator, CodeQuestion, Question
from battleship.game import Board

from analysis import CAPTAIN_TYPE_LABELS, MODEL_DISPLAY_NAMES
from analysis import human_round_summaries

In [None]:
# Render inline figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

# set seaborn color palette
sns.set_palette("tab10")

# set seaborn style
sns.set_style("whitegrid")
# "talk" context: seaborn's preset with enlarged fonts/lines for presentations.
sns.set_context("talk")

In [None]:
# Human experiment data lives under data/<experiment name>/, with exports in
# its "export" subdirectory.
HUMAN_EXPERIMENT_NAME = "battleship-final-data"
PATH_DATA = os.path.join("data", HUMAN_EXPERIMENT_NAME)
PATH_EXPORT = os.path.join(PATH_DATA, "export")

# Project-relative directory that the captain benchmark run folders are joined onto.
CAPTAIN_EXPERIMENT_PATH = "experiments/collaborative/captain_benchmarks/"

## Data loading

### Human data

In [None]:
# Build the human round summaries as a DataFrame and tag every row with
# llm="Human" so it can sit next to the model runs in one frame.
human_df = pd.DataFrame(
    human_round_summaries(experiment_path=PATH_DATA)
).assign(llm="Human")
human_df

### Model data

In [None]:
# (llm label, run directory name) for each captain benchmark run to load.
model_round_data_unresolved_paths = [
    ("gpt-4o", "run_2025_08_25_16_28_19"),
    ("gpt-5", "run_2025_08_25_22_02_29"),
    ("llama-4-scout", "run_2025_08_26_17_56_46"),
    ("Baseline", "run_2025_08_26_17_23_23"),
]

# Resolve every run directory relative to the project root.
model_round_data_paths = []
for label, run_dir in model_round_data_unresolved_paths:
    run_path = resolve_project_path(os.path.join(CAPTAIN_EXPERIMENT_PATH, run_dir))
    model_round_data_paths.append((label, run_path))

# Report (but do not skip) run directories that are missing on disk.
missing_paths = [run_path for _, run_path in model_round_data_paths if not os.path.exists(run_path)]
for run_path in missing_paths:
    print(f"The path {run_path} does not exist.")

# One summary frame per run, each tagged with the label of the run it came from.
dfs = [
    pd.DataFrame(rebuild_captain_summary_from_results(run_path)).assign(llm=label)
    for label, run_path in model_round_data_paths
]

model_df = pd.concat(dfs, ignore_index=True)
model_df

In [None]:
# Stack the human and model round summaries into a single frame.
combined = pd.concat([human_df, model_df], ignore_index=True)

primary_columns = ["captain_type_display", "llm_display", "board_id", "seed"]

# Display labels in the order they first appear in CAPTAIN_TYPE_LABELS
# (dict.fromkeys drops duplicates while keeping that order).
captain_label_order = list(dict.fromkeys(CAPTAIN_TYPE_LABELS.values()))

# "Human" and "Baseline" first, then every display name that occurs in the data.
# NOTE(review): membership is tested against the raw `llm` values, so this
# presumably relies on MODEL_DISPLAY_NAMES values matching those labels;
# anything outside the category list becomes NaN - confirm.
llms_present = combined["llm"].unique()
llm_order = ["Human", "Baseline"] + [
    display_name
    for display_name in MODEL_DISPLAY_NAMES.values()
    if display_name in llms_present
]

# Ordered categoricals make sorting and plotting follow the intended order.
df = combined.assign(
    captain_type_display=pd.Categorical(
        combined["captain_type"].map(CAPTAIN_TYPE_LABELS),
        categories=captain_label_order,
        ordered=True,
    ),
    llm_display=pd.Categorical(
        combined["llm"],
        categories=llm_order,
        ordered=True,
    ),
)

# Primary columns first, everything else in its existing order, then sort rows
# by the primary columns.
other_columns = [col for col in df.columns if col not in primary_columns]
df = (
    df[primary_columns + other_columns]
    .sort_values(by=primary_columns, ascending=True)
    .reset_index(drop=True)
)

df

## Precision/Recall Stats

In [None]:
# List which llm labels contribute rows to each captain type.
print("\nBreakdown by captain_type_display:")
captain_display = df["captain_type_display"]
for label in captain_display.cat.categories:
    llms_for_label = df.loc[captain_display == label, "llm"].unique()
    print(f"{label}: {llms_for_label}")


# Fixed colour per llm label, taken from the colorblind-friendly Okabe–Ito palette.
llm_palette = {
    # green
    "Human": "#009E73",
    # blue
    "Baseline": "#0072B2",
    # purple
    "llama-4-scout": "#CC79A7",
    # orange (similar to gpt-5)
    "gpt-4o": "#E69F00",
    # vermillion
    "gpt-5": "#D55E00",
}

In [None]:
# Box plot of f1_score per captain type, with one box per llm label.
g = sns.catplot(
    data=df,
    kind="box",
    x="captain_type_display",
    y="f1_score",
    hue="llm",
    palette=llm_palette,
)

# Vertical tick labels keep the captain type names from overlapping.
plt.xticks(rotation=90)
plt.ylabel("Firing Accuracy (F1)")
plt.xlabel("Captain Type")

# Blank out the legend title (the default would be the hue column name).
g._legend.set_title("")

In [None]:
# Same comparison as a strip plot: one semi-transparent point per row of df.
strip_options = dict(
    kind="strip",
    x="captain_type_display",
    y="f1_score",
    hue="llm",
    palette=llm_palette,
    alpha=0.7,
)
g = sns.catplot(data=df, **strip_options)

# Vertical tick labels keep the captain type names from overlapping.
plt.xticks(rotation=90)
plt.ylabel("Firing Accuracy (F1)")
plt.xlabel("Captain Type")

# Blank out the legend title (the default would be the hue column name).
g._legend.set_title("")


In [None]:
# Axes-level strip plot on an explicit 6x4 figure, with the legend outside the axes.
fig, ax = plt.subplots(figsize=(6, 4))

sns.stripplot(
    data=df, x="captain_type_display", y="f1_score",
    hue="llm", palette=llm_palette, alpha=0.7, ax=ax,
)
sns.despine()

# `ax` is the only (and current) axes, so labelling it directly is equivalent
# to going through pyplot.
ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
plt.xticks(rotation=90)

# Anchor the legend's upper-left corner at the top-right corner of the axes.
ax.legend(title="", loc="upper left", bbox_to_anchor=(1, 1))

plt.show()

In [None]:
# Swarm plot variant on an 8x6 figure: points are spread out instead of overlapping.
fig, ax = plt.subplots(figsize=(8, 6))

# No alpha here, unlike the strip plots (the original left it commented out).
sns.swarmplot(
    data=df, x="captain_type_display", y="f1_score",
    hue="llm", palette=llm_palette, ax=ax,
)
sns.despine()

# `ax` is the only (and current) axes, so labelling it directly is equivalent
# to going through pyplot.
ax.set_xlabel("Captain Type")
ax.set_ylabel("Firing Accuracy (F1)")
plt.xticks(rotation=90)

# Anchor the legend's upper-left corner at the top-right corner of the axes.
ax.legend(title="", loc="upper left", bbox_to_anchor=(1, 1))

plt.show()


In [None]:
# Empirical CDF of f1_score, one step curve per raw captain_type.
plt.figure(figsize=(10, 6))

for type_name, type_rows in df.groupby('captain_type'):
    # Sorted non-missing scores; the i-th smallest score maps to probability i / n.
    sorted_scores = np.sort(type_rows['f1_score'].dropna())
    n_scores = len(sorted_scores)
    cumulative = np.arange(1, n_scores + 1) / n_scores
    plt.step(sorted_scores, cumulative, where='post', label=type_name)

plt.title('CDF of F1 Score by Captain Type')
plt.xlabel('F1 Score')
plt.ylabel('Cumulative Probability')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# A move is a shot that either hit or missed, so the count is their sum.
df["move_count"] = df["hits"].add(df["misses"])

# Bar plot of move_count per raw captain type (seaborn's default bar statistic).
sns.barplot(
    data=df,
    x="captain_type",
    y="move_count",
    hue="captain_type",
)
plt.xticks(rotation=45)

## EIG Stats

In [None]:
# Path to the run directory whose captain logs are analysed. Swap in the
# commented-out path to inspect a different captain run.
# base_path = resolve_project_path("experiments/collaborative/captain_benchmarks/run_combined/run_4o_mapeig_cot_captain")
base_path = resolve_project_path(
    "experiments/collaborative/captain_benchmarks/run_combined/run_4o_llmdecision_captain"
)

# Find all captain.json files in subdirectories
captain_files = glob.glob(os.path.join(base_path, '**/captain/captain.json'), recursive=True)

# Dictionary to store eig values by file (not populated in this cell)
eig_values_by_file = {}
# One record per logged question that carries an EIG value
eig_data_list = []
# Fixed column set so the frame keeps its schema even when no record is found
eig_columns = ['round_id', 'question_idx', 'question', 'eig', 'eig_questions']

# The round id is the alphanumeric run that follows 'round_' in the relative path
round_id_pattern = re.compile(r'round_([a-zA-Z0-9]+)')

for file_path in captain_files:
    rel_path = os.path.relpath(file_path, base_path)
    match = round_id_pattern.search(rel_path)
    round_id = match.group(1) if match else None

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for idx, datum in enumerate(data):
        # Skip entries without an EIG value or without a question
        if datum.get('eig') is None or datum.get('question') is None:
            continue

        asked = datum['question'].get('question')
        question_text = asked['text'] if asked and 'text' in asked else "No question text"

        # Candidate questions scored alongside the asked one; may be missing,
        # None or empty, in which case the raw value is stored unchanged.
        eig_questions = datum.get("eig_questions", [])
        if eig_questions:
            scored = [(q['question']['question']['text'], q['eig']) for q in eig_questions]
            # default=None: a candidate list whose EIGs are all None must not
            # crash the whole extraction with "max() arg is an empty sequence".
            max_eig = max((eig for _, eig in scored if eig is not None), default=None)
            # (text, eig, is_best): is_best flags the candidate(s) with the top EIG
            eig_questions = [
                (text, eig, eig is not None and eig == max_eig)
                for text, eig in scored
            ]

        eig_data_list.append({
            'round_id': round_id,
            'question_idx': idx,
            'question': question_text,
            'eig': datum['eig'],
            'eig_questions': eig_questions,
        })

# Create DataFrame from the list
model_eig_df = pd.DataFrame(eig_data_list, columns=eig_columns)

model_eig_df

In [None]:
# Histogram of the EIG values collected in model_eig_df.
model_eig_df["eig"].hist()

In [None]:
# /////////////////////////////////////////////////
# This cell calculates EIG for human questions (and saves it to notebooks/human_eig_df.csv)
# Caution: This will take 1-2 mins to run if human_eig_df.csv doesn't exist in the notebooks directory
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Spotter benchmark output that supplies the code translations ("program")
# of the human questions.
input_json_path = resolve_project_path(
    "experiments/collaborative/spotter_benchmarks/o4-mini_CodeSpotterModel_True.json"
)

def extract_questions_and_boards_to_dataframe(json_path):
    """
    Extract every asked question, with the board state at the time it was
    asked, from a JSON file and return the data as a pandas DataFrame.

    Entries lacking a "question" or "occTiles" key are skipped. The answer is
    normalised so that "true"/"false" (any casing) and JSON booleans become
    "yes"/"no"; any other value is kept as-is. "correct" records whether the
    normalised answer equals the entry's "true_answer".

    Args:
        json_path (str): Path to the input JSON file (a list of entry dicts).

    Returns:
        pd.DataFrame: Columns "question", "program", "board_state", "answer",
            "true_answer" and "correct". The columns are present even when no
            entry qualifies, so callers can always filter on "correct".

    Raises:
        FileNotFoundError: If json_path does not exist.
        KeyError: If a qualifying entry lacks "program", "answer" or
            "true_answer".
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"The file {json_path} does not exist.")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    columns = ["question", "program", "board_state", "answer", "true_answer", "correct"]
    boolean_words = {"true": "yes", "false": "no"}

    extracted_data = []

    for entry in data:
        if "question" not in entry or "occTiles" not in entry:
            continue

        answer = entry["answer"]
        if isinstance(answer, bool):
            # JSON true/false arrives as a Python bool, which has no .lower()
            answer = "yes" if answer else "no"
        elif isinstance(answer, str):
            answer = boolean_words.get(answer.lower(), answer)

        true_answer = entry["true_answer"]

        extracted_data.append({
            "question": entry["question"],
            "program": entry["program"],
            "board_state": entry["occTiles"],
            "answer": answer,
            "true_answer": true_answer,
            "correct": answer == true_answer,
        })

    return pd.DataFrame(extracted_data, columns=columns)


# Reuse the cached EIG table when present; otherwise compute it and cache it.
# The cache path is relative to the current working directory.
if os.path.exists('human_eig_df.csv'):
    human_eig_df = pd.read_csv('human_eig_df.csv')
else:
    human_eig_df = extract_questions_and_boards_to_dataframe(input_json_path)
    # Keep only rows whose answer matched the true answer. The explicit copy
    # makes the frame independent of the unfiltered one, so adding a column
    # below does not raise SettingWithCopyWarning.
    human_eig_df = human_eig_df[human_eig_df['correct'] == True].copy()

    eig_calculator = EIGCalculator(samples=1000, timeout=15, epsilon=0)

    # EIG per row, gathered in row order and assigned as one column afterwards
    calculated_eigs = []

    for _, row in human_eig_df.iterrows():
        # Wrap the question text and its code translation for the calculator
        code_question = CodeQuestion(
            question=Question(row["question"]),
            fn_text=row["program"],
            translation_prompt="",
            completion={}
        )

        # Convert board_state to a Board instance
        board = Board.from_occ_tiles(row["board_state"])

        # Calculate EIG
        calculated_eigs.append(eig_calculator(code_question, board))

    human_eig_df["calculated_eig"] = calculated_eigs

    human_eig_df.to_csv('human_eig_df.csv', index=False)

human_eig_df

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Cast both EIG columns to float up front: the human column can be object
# dtype when it was just computed rather than read from the CSV cache, and
# the plot and the means need numeric values.
model_eig_values = model_eig_df["eig"].astype(float)
human_eig_values = human_eig_df["calculated_eig"].astype(float)

# Long-form frame: one row per EIG value, labelled with the frame it came from
plot_data = pd.DataFrame({
    'EIG Values': pd.concat([model_eig_values, human_eig_values], ignore_index=True),
    'Source': ['model_eig_df'] * len(model_eig_values) + ['human_eig_df'] * len(human_eig_values),
})

# Box plot comparing the two EIG distributions
sns.boxplot(data=plot_data, x='Source', y='EIG Values', palette='Set2')

# Add labels and title
plt.title('Distribution of EIG Values by Source')
plt.xlabel('Source')
plt.ylabel('EIG Values')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Calculate and print the average EIG values for both distributions
avg_model_eig = model_eig_values.mean()
avg_human_eig = human_eig_values.mean()

print(f"Average EIG for model_eig_df: {avg_model_eig:.4f}")
print(f"Average EIG for human_eig_df: {avg_human_eig:.4f}")