# Spotter Benchmark Analysis

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

from experiments.collaborative.analysis import (
    load_dataset,
    get_gold_answer_dataset,
    MODEL_DISPLAY_NAMES,
    get_spotter_type_short,
)
from battleship.run_spotter_benchmarks import rebuild_summary_from_results

In [None]:
# Render inline matplotlib figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

# Global seaborn defaults for every figure in this notebook.
# Colour cycle: the "Set2" palette.
sns.set_palette("Set2")

# Axes style "white" and "talk" context as the baseline; individual plotting
# cells may override these locally with context managers.
sns.set_style("white")
sns.set_context("talk")

In [None]:
# Name of the experiment whose data this notebook analyses.
EXPERIMENT_NAME = "battleship-final-data"

# Directory layout: data/<experiment>/ holds the dataset, and its export/
# subfolder is the target used when figures are saved.
PATH_DATA, PATH_EXPORT = (
    os.path.join("data", EXPERIMENT_NAME),
    os.path.join("data", EXPERIMENT_NAME, "export"),
)

# Gold-annotated variant of the experiment dataset (use_gold=True).
df_gold = load_dataset(use_gold=True, experiment_path=PATH_DATA)

In [None]:
# Benchmark runs are stored under spotter_benchmarks/<run id>/.
#
# Alternative kept for reference: read a run's pre-built summary file directly.
#     RESULTS_PATH = os.path.join(
#         "spotter_benchmarks", "run_2025_07_09_16_55_45", "summary.json"
#     )
#     df = pd.read_json(RESULTS_PATH)
#
# Currently the summary records are produced by rebuild_summary_from_results
# from the run directory below and wrapped in a DataFrame.
run_dir = os.path.join("spotter_benchmarks", "run_2025_07_11_18_32_51")
results = rebuild_summary_from_results(run_dir)
df = pd.DataFrame(results)

In [None]:
# Add display names and categorizations for analysis
def add_display_fields(df):
    """Return a sorted copy of ``df`` with display-oriented columns added.

    Two ordered categorical columns are added:

    * ``spotter_type_short`` -- the value of ``get_spotter_type_short`` for
      each row's ``spotter_type`` / ``use_cot`` pair, ordered
      ``Base < + CoT < + Code < + CoT + Code``.
    * ``llm_display_name`` -- ``MODEL_DISPLAY_NAMES[llm]`` when the model is
      listed there, otherwise the raw ``llm`` value. Categories follow the
      order of ``MODEL_DISPLAY_NAMES``; fallback names come last.

    Args:
        df: DataFrame with ``spotter_type``, ``use_cot`` and ``llm`` columns.

    Returns:
        A new DataFrame sorted by ``llm_display_name`` and then
        ``spotter_type_short``. The input frame is not modified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # Add spotter type categorization. Zipping the two columns (rather than
    # DataFrame.apply(axis=1)) also works when the frame has no rows.
    spotter_labels = [
        get_spotter_type_short(spotter_type, use_cot)
        for spotter_type, use_cot in zip(df["spotter_type"], df["use_cot"])
    ]
    df["spotter_type_short"] = pd.Categorical(
        spotter_labels,
        categories=["Base", "+ CoT", "+ Code", "+ CoT + Code"],
        ordered=True,
    )

    # Add model display name, falling back to the raw identifier for models
    # without an entry in MODEL_DISPLAY_NAMES.
    display_names = df["llm"].map(lambda x: MODEL_DISPLAY_NAMES.get(x, x))

    # Category order follows MODEL_DISPLAY_NAMES. Values are de-duplicated
    # because pd.Categorical rejects repeated categories, and fallback names
    # are appended in order of first appearance: a value missing from the
    # category list would otherwise silently turn into NaN.
    categories = list(dict.fromkeys(MODEL_DISPLAY_NAMES.values()))
    known_names = set(categories)
    categories.extend(
        name
        for name in dict.fromkeys(display_names.dropna())
        if name not in known_names
    )
    df["llm_display_name"] = pd.Categorical(
        display_names, categories=categories, ordered=True
    )

    return df.sort_values(by=["llm_display_name", "spotter_type_short"])

# Rebind df to the frame returned by add_display_fields (display columns
# added, rows sorted) and show it as the cell output.
df = add_display_fields(df)
df


In [None]:
# Distinct model display names present in the results.
df["llm_display_name"].unique()

In [None]:
# Sanity check: how often each raw answer string occurs (missing values
# are counted as their own bar).
answer_counts = df["answer_text"].value_counts(dropna=False)
answer_counts.plot(kind="bar")

In [None]:
# Paired label sequences derived from the gold-annotated dataset; print both
# lengths as a quick check that they line up.
gold_labels, human_labels = get_gold_answer_dataset(df_gold)
n_gold, n_human = len(gold_labels), len(human_labels)
print(n_gold, n_human)

In [None]:
# Score the human answers against the gold labels.
label_kwargs = {"y_true": gold_labels, "y_pred": human_labels}

# Human-readable report first ...
print(classification_report(**label_kwargs))

# ... then the same report as a dict to pull out the overall accuracy, which
# serves as the human reference line in the accuracy plot.
human_report = classification_report(**label_kwargs, output_dict=True)
human_accuracy_baseline = human_report["accuracy"]
print(f"Human accuracy baseline: {human_accuracy_baseline:.2%}")

In [None]:
# Fixed seed for the bootstrap behind the 95% confidence intervals, so the
# error bars are identical every time the notebook is re-run.
BOOTSTRAP_SEED = 0

with sns.plotting_context(context="talk"), sns.axes_style("whitegrid"):
    # Explicit figure/axes handles instead of implicit pyplot state.
    fig, ax = plt.subplots(figsize=(6, 8))

    # One horizontal bar per (model, spotter variant) showing the aggregated
    # `is_correct` value with a bootstrapped 95% confidence interval.
    sns.barplot(
        data=df,
        x="is_correct",
        y="llm_display_name",
        hue="spotter_type_short",
        errorbar=("ci", 95),
        seed=BOOTSTRAP_SEED,
        err_kws={
            "color": "gray",
            "linewidth": 1,
        },
        capsize=0.2,
        ax=ax,
    )

    # Dashed vertical reference line at the human accuracy baseline.
    ax.axvline(
        human_accuracy_baseline,
        color="#4b4f73",
        linestyle="--",
        linewidth=2.0,
        label="Human Performance",
    )

    ax.set_ylabel("")
    ax.set_xlabel("Gold Answer Accuracy")

    # Accuracy is a proportion, so pin the axis to the full [0, 1] range.
    ax.set_xlim(0.0, 1.0)

    # Shrink the model-name tick labels so long names fit.
    for tick_label in ax.get_yticklabels():
        tick_label.set_fontsize(12)

    # Rebuild the legend outside the axes; this also picks up the labelled
    # human-performance line in addition to the hue entries.
    ax.legend(title="Spotter Models", bbox_to_anchor=(1.05, 1), loc="upper left")

    # Uncomment to export the figure:
    # fig.savefig(
    #     os.path.join(PATH_EXPORT, "spotter_accuracy.pdf"),
    #     bbox_inches="tight",
    #     dpi=300,
    # )