In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import ast
import sys
from io import StringIO

sys.path.insert(0, '..')

import numpy as np
import pandas as pd

from battleship.agents import RandomCaptain, MAPCaptain, ProbabilisticCaptain, CodeSpotterModel, LLMDecisionCaptain, EIGAutoCaptain
from battleship.agents import CacheMode
from battleship.board import Board
from battleship.game import BattleshipGame

In [None]:
# Raw inputs: the gold-annotation CSV and the per-round CSV.
stage_df = pd.read_csv("/home/ubuntu/repo_battleship/temp/gold_annotations_partial.csv")
round_df = pd.read_csv("/home/ubuntu/repo_battleship/battleship/experiments/collaborative/battleship-final-data/round.csv")
goldAnnotations = ["answer", "ambiguous", "contextual", "unanswerable"]

# Two-column lookup from round `id` to `board_id`.
board_ids = round_df.loc[:, ["id", "board_id"]]

# Columns of the annotation table that the rest of the notebook works with.
stage_columns = [
    "roundID",
    "index",
    "questionID",
    "messageText",
    "messageType",
    "occTiles",
    "goldAnswer",
]
filtered_stage_df = stage_df.loc[:, stage_columns]

# Attach the board id to every annotated row; rows without a matching round are dropped.
df = pd.merge(
    filtered_stage_df,
    board_ids,
    left_on="roundID",
    right_on="id",
    how="inner",
)

In [None]:
def game_completed(hits, misses, occTiles, board_id, max_moves=40):
    """Return whether a game finished within the move budget.

    A game counts as completed when no more than ``max_moves`` shots were taken
    and the tiles of ``occTiles`` that are neither -1 nor 0 sit in exactly the
    same positions as such tiles on the target board.

    Args:
        hits: Number of shots that hit.
        misses: Number of shots that missed.
        occTiles: Numeric numpy array describing the final visible board.
        board_id: Trial id passed to ``Board.from_trial_id`` to load the target board.
        max_moves: Largest allowed value of ``hits + misses``. Defaults to 40.

    Returns:
        bool: ``True`` if the game was completed, ``False`` otherwise.
    """
    # Over budget: no need to load the target board at all.
    if hits + misses > max_moves:
        return False

    def mask(board_array):
        # True wherever the tile value is neither -1 nor 0.
        return (board_array != -1) & (board_array != 0)

    target = Board.convert_to_numeric(Board.from_trial_id(board_id).to_symbolic_array())
    # Cast so both return paths yield a plain Python bool rather than np.bool_.
    return bool(np.all(mask(occTiles) == mask(target)))

# Number of question messages sent in each round.
question_counts_df = (
    df[df["messageType"] == "question"]
    .groupby("roundID")
    .size()
    .reset_index(name="question_number")
)

# Drop any `question_number` column left by an earlier run of this cell, so that
# re-running it does not create `question_number_x` / `question_number_y`.
# NOTE: the inner merge removes rounds that contain no question messages.
df = df.drop(columns=["question_number"], errors="ignore").merge(
    question_counts_df, on="roundID", how="inner"
)

# One row per round: the row with the largest `index` value.
result = df.loc[df.groupby("roundID")["index"].idxmax()][
    ["roundID", "occTiles", "board_id", "question_number"]
]

# Skip rounds whose final board has no hits or misses recorded (every tile is -1).
# `occTiles` is stored as text without spaces, hence the string comparison.
untouched_board = str(np.full((8, 8), -1).tolist()).replace(" ", "")
result = result[result["occTiles"] != untouched_board]

data = []
for occ_tiles_text, board_id, n_questions in zip(
    result["occTiles"], result["board_id"], result["question_number"]
):
    # literal_eval parses the nested-list text without executing arbitrary code.
    occTiles = np.array(ast.literal_eval(occ_tiles_text))
    misses = np.sum(occTiles == 0)
    hits = np.sum((occTiles != -1) & (occTiles != 0))
    data.append({
        "captainType": "human",
        "boardId": board_id,
        "hits": hits,
        "misses": misses,
        "gameCompleted": game_completed(hits, misses, occTiles, board_id),
        # `result` holds one row per round, so the row's own count can be used directly.
        "questionsAsked": n_questions,
    })
human_results_df = pd.DataFrame(data)
human_results_df

In [None]:
# Keep only the human games flagged as completed.
completed_mask = human_results_df["gameCompleted"]
human_good = human_results_df.loc[completed_mask]
human_good

In [None]:
from matplotlib import pyplot as plt

# Total shots (hits plus misses) for each completed human game.
moves = human_good["hits"] + human_good["misses"]

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(moves, edgecolor='black')
ax.set_xlabel("Number of Moves")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Moves in Human Results")
plt.show()

In [None]:

# Full experiment grid, kept for reference: boards "B01".."B18" and seeds 1..7.
ALL_BOARD_IDS = ["B" + str(i).zfill(2) for i in range(1, 19)]
ALL_SEEDS = range(1, 7 + 1)

# Reduced grid that is actually run below.
# Swap in ALL_SEEDS / ALL_BOARD_IDS for the full sweep.
seeds = [1]
board_ids = ["B01", "B02"]

# Spotter handed to EIGAutoCaptain; its `board_id` is re-pointed before each game in the loop.
eig_spotter = CodeSpotterModel(
    board_id="B01",
    board_experiment="collaborative",
    model_string="openai/gpt-4o",
    temperature=None,
    use_cot=True,
)

captains = {
    "RandomCaptain": RandomCaptain(seed=42),
    "MAPCaptain": MAPCaptain(seed=42, n_samples=100),
    "EIGAutoCaptain": EIGAutoCaptain(
        seed=42,
        samples=1000,
        model_string="openai/gpt-4o",
        spotter=eig_spotter,
        use_cot=False,
        cache_mode=CacheMode.WRITE_ONLY,
    ),
}

# One record per (captain, seed, board) game.
data = []
for cap_name, captain in captains.items():
    print("Starting with captain", cap_name)
    for seed in seeds:
        for board_id in board_ids:
            if cap_name == "EIGAutoCaptain":
                eig_spotter.board_id = board_id
            # The same captain instance is reused across games; only its seed attribute changes.
            captain.seed = seed

            board = Board.from_trial_id(board_id)
            # NOTE(review): max_questions=1 / max_moves=1 look like smoke-test limits;
            # confirm before using these results for analysis.
            game = BattleshipGame(
                board_target=board,
                max_questions=1,
                max_moves=1,
                captain=captain,
                spotter=CodeSpotterModel(
                    board_id,
                    "collaborative",
                    cache_mode=CacheMode.WRITE_ONLY,
                    model_string="openai/gpt-4o",
                    temperature=None,
                    use_cot=True,
                ),
            )
            game.play()
            hits = game.hits
            misses = game.misses
            data.append({
                "captainType": cap_name,
                "boardId": board_id,
                "hits": hits,
                "misses": misses,
                "gameCompleted": game.is_won(),
                "questionsAsked": game.question_count,
            })

agent_results_df = pd.DataFrame(data)


In [None]:
# Display the `game` object left over from the final iteration of the simulation loop.
game

In [None]:
# Stack human and agent results, then add precision = hits / (hits + misses).
combined_results = pd.concat([human_results_df, agent_results_df])
results_df = combined_results.assign(
    precision=lambda frame: frame["hits"] / (frame["hits"] + frame["misses"])
)
results_df

In [None]:
# Load results from CSV; this replaces any `results_df` built earlier in the session.
results_df = pd.read_csv("/home/ubuntu/repo_battleship/temp/total_results_04-04.csv")

# Total shots per game, capped at 40. `clip` is vectorized and leaves missing values
# as NaN instead of silently turning them into 40.
results_df["moves"] = (results_df["hits"] + results_df["misses"]).clip(upper=40)
results_df

In [None]:
# Mean precision and completion rate, first per captain type and then for human
# games bucketed by how many questions were asked.

# Per-captain-type metrics over every row (the "human" captain type is included here).
by_captain = results_df.groupby("captainType")
avg_precision_agents = by_captain["precision"].mean()
frac_completed_agents = by_captain["gameCompleted"].mean()

# Human rows only, copied so the new column does not touch `results_df`.
human_mask = results_df["captainType"] == "human"
results_df_humans = results_df[human_mask].copy()

# Three buckets on `questionsAsked`; rows matching none of them are labelled "unknown".
questions_asked = results_df_humans["questionsAsked"]
conditions = [
    questions_asked < 5,
    questions_asked.between(5, 10),
    questions_asked > 10,
]
choices = ["below 5", "between 5 and 10", "above 10"]
results_df_humans["question_group"] = np.select(conditions, choices, default="unknown")

# Same two metrics, per question bucket.
by_question_group = results_df_humans.groupby("question_group")
avg_precision_humans = by_question_group["precision"].mean()
frac_completed_humans = by_question_group["gameCompleted"].mean()

# Captain-type rows first, question-bucket rows after.
avg_precision = pd.concat([avg_precision_agents, avg_precision_humans])
frac_completed = pd.concat([frac_completed_agents, frac_completed_humans])

avg_precision, frac_completed

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Stacked histogram of per-game precision in `results_df`, coloured by captain type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=results_df,
    x='precision',
    hue='captainType',
    stat="proportion",
    element="bars",
    multiple="stack",
    bins=30,
    ax=ax,
)
ax.set_title('Distribution of Precision by Captain Type')
ax.set_xlabel('Precision')
ax.set_ylabel('Proportion')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Games that took fewer than 40 moves.
under_move_cap = results_df["moves"].lt(40)
completed_df = results_df.loc[under_move_cap]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Stacked histogram of move counts in `completed_df`, coloured by captain type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=completed_df,
    x='moves',
    hue='captainType',
    stat="proportion",
    element="bars",
    multiple="stack",
    bins=30,
    ax=ax,
)
# The title previously said "Precision" (copied from the precision plot); this figure shows moves.
ax.set_title('Distribution of Moves by Captain Type')
ax.set_xlabel('Number of Moves')
ax.set_ylabel('Proportion')
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Two panels side by side: mean precision (left) and completion rate (right).
fig, (ax_precision, ax_completed) = plt.subplots(1, 2, figsize=(12, 5))

avg_precision.plot.bar(ax=ax_precision)
ax_precision.set_ylabel("Average Precision")
ax_precision.set_title("Average Precision by Captain Type")

frac_completed.plot.bar(ax=ax_completed)
ax_completed.set_ylabel("Fraction of Completed Games")
ax_completed.set_title("Fraction of Games Completed by Captain Type")

fig.tight_layout()
plt.show()