In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "battleship")))

from agents import CodeSpotterModel
from agents import DirectSpotterModel
from run_spotter_benchmarks import load_data, benchmark_on_rounds

# Root directory that holds the input CSVs. It defaults to the original
# checkout location, so behaviour is unchanged unless BATTLESHIP_REPO_ROOT is
# set; export that variable to run the notebook from another machine/checkout.
REPO_ROOT = os.environ.get("BATTLESHIP_REPO_ROOT", "/home/ubuntu/repo_battleship")

stages_csv = os.path.join(REPO_ROOT, "temp", "gold_annotations_partial.csv")
rounds_csv = os.path.join(
    REPO_ROOT,
    "battleship",
    "experiments",
    "collaborative",
    "battleship-final-data",
    "round.csv",
)

# load_data's two return values are unpacked as `df` and
# `rounds_questions_dict`; both are passed to benchmark_on_rounds later on.
df, rounds_questions_dict = load_data(
    stages_path=stages_csv,
    rounds_path=rounds_csv,
    goldAnnotations=["answer", "ambiguous", "contextual", "unanswerable"],
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Accuracy results are collected in a dictionary:
#   key   -> language model string
#   value -> list of accuracies in spotter_models order,
#            i.e. [DirectSpotterModel, CodeSpotterModel]
results = {}
QUESTIONS = 5
ROUNDS = 3

spotter_models = [DirectSpotterModel, CodeSpotterModel]
language_models = [
    "openai/gpt-4o",
    "anthropic/claude-3.5-sonnet",
    "meta-llama/llama-3.3-70b-instruct",
]

# Arguments that are identical for every benchmark run.
shared_benchmark_kwargs = dict(
    df=df,
    rounds_question_ids=rounds_questions_dict,
    max_rounds=ROUNDS,
    max_questions=QUESTIONS,
    use_cache=False,
)

# Run every (language model, spotter) pairing. The inner loop order fixes the
# position of each spotter's accuracy inside results[llm].
for llm in language_models:
    llm_accuracies = results[llm] = []
    for spotter in spotter_models:
        print(f"Benchmarking {spotter.__name__} with language model {llm}")
        accuracy, failed = benchmark_on_rounds(
            model=spotter,
            model_string=llm,
            **shared_benchmark_kwargs,
        )
        llm_accuracies.append(accuracy)
        print(f"Accuracy: {accuracy}")

In [None]:
# Prepare data for the plot
labels = language_models
x = np.arange(len(labels))  # one x position per language model
width = 0.35  # width of each bar

fig, ax = plt.subplots(figsize=(8, 6))

# Each results[model] list holds the DirectSpotter accuracy at index 0 and the
# CodeSpotterModel accuracy at index 1.
direct = [results[model][0] for model in labels]
code = [results[model][1] for model in labels]

# Two bars per language model, placed side by side around the tick position.
rects1 = ax.bar(x - width / 2, direct, width, label='DirectSpotter')
rects2 = ax.bar(x + width / 2, code, width, label='CodeSpotterModel')

ax.set_ylabel('Accuracy')
ax.set_title('Accuracy by Language Model and Spotter Model')
ax.set_xticks(x)
# Rotated labels keep the default centre alignment unless told otherwise, which
# leaves long model names visually detached from their ticks. Anchoring the
# right end of each label at the tick keeps label and bar group lined up.
ax.set_xticklabels(labels, rotation=45, ha='right', rotation_mode='anchor')
ax.legend()

# Annotate bars with the accuracy values.
def autolabel(rects):
    """Write each bar's height, formatted to two decimals, just above the bar.

    The text is drawn on the notebook-level ``ax``. ``rects`` is iterated and
    each item must provide ``get_height()``, ``get_x()`` and ``get_width()``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.2f}',
            xy=(bar_center_x, bar_height),
            xytext=(0, 3),  # shift the text 3 points upward from the bar top
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Label both bar groups, then fit the layout and render the figure.
for bar_group in (rects1, rects2):
    autolabel(bar_group)

fig.tight_layout()
plt.show()