# BoardgameQA Debate Experiments

This notebook runs experiments using the debate and judge modules on the BoardgameQA dataset.


In [None]:
import logging

# Configure the root logger once for the whole notebook: DEBUG threshold, and
# each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Logger named after this module, for the notebook's own messages.
logger = logging.getLogger(__name__)

In [None]:
import json
from pathlib import Path
from typing import Tuple
from collections import Counter, defaultdict
import random
import re

from debate.debate import DebateTwoPlayers
from debate.judge import JudgeManager
from debate.types import DebateScenario
from debate.baseline import BaselineManager

# Model names ordered from most to least powerful, so MODELS[0] is the
# strongest entry and MODELS[-1] the weakest.
MODELS = [
    "gemini-1.5-pro",
    "gemini-1.5-flash",
    "gemini-1.5-flash-8b",
]

# Dataset difficulty levels, listed from ZeroConflict up to HighConflict.
# "Main" is the medium level.
LEVELS = ["ZeroConflict", "LowConflict", "Main", "HighConflict"]

In [None]:
def load_boardgame_qa(base_path: str = "BoardgameQA") -> dict:
    """Load the depth-2 test split of every BoardgameQA difficulty level.

    Args:
        base_path: Directory containing one ``BoardgameQA-<level>-depth2``
            folder per entry of ``LEVELS``.

    Returns:
        Dict mapping each level name in ``LEVELS`` to the parsed content of
        that level's ``test.json``.

    Raises:
        FileNotFoundError: If a level's ``test.json`` does not exist.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    root = Path(base_path)
    ds = {}
    for level in LEVELS:
        path = root / f"BoardgameQA-{level}-depth2" / "test.json"
        # Read as UTF-8 explicitly: without `encoding`, open() falls back to
        # the platform's locale encoding, which can mis-decode non-ASCII text.
        with open(path, encoding="utf-8") as f:
            ds[level] = json.load(f)
    return ds


def sample_data(
    ds: dict, sample_size: float = 0.1, seed: "int | None" = None
) -> dict:
    """Draw a label-stratified random sample from every difficulty level.

    Within each level the examples are grouped by ``example["label"]`` and
    ``int(len(group) * sample_size)`` examples are drawn without replacement
    from each group, so the label proportions of the input are roughly kept.

    Args:
        ds: Dict mapping a level name to a list of example dicts; every
            example must have a ``"label"`` key.
        sample_size: Fraction of each label group to keep.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used, exactly as before.

    Returns:
        Dict with the same level names as ``ds``, each mapped to the list of
        sampled examples for that level.

    Raises:
        ValueError: If the computed per-label count is negative or larger than
            the group (for instance when ``sample_size`` is well above 1).
    """
    # Both the `random` module and a `random.Random` instance expose `.sample`.
    rng = random if seed is None else random.Random(seed)

    sampled_data = {}
    for level, data in ds.items():
        by_label = defaultdict(list)
        for example in data:
            by_label[example["label"]].append(example)

        sampled_data[level] = [
            picked
            for group in by_label.values()
            for picked in rng.sample(group, int(len(group) * sample_size))
        ]
    return sampled_data


def convert_to_scenarios(examples: list) -> list[DebateScenario]:
    """Convert BoardgameQA examples to DebateScenario objects.

    Args:
        examples: Example dicts; each must provide the full text under
            ``"example"`` and the gold answer under ``"label"``.

    Returns:
        One ``DebateScenario`` per input example, in input order, each with
        the fixed answer options ``["proved", "disproved", "unknown"]``.
    """

    def split_question(example: str) -> Tuple[str, str]:
        """Split the example text into the game description and the question.

        The question is whatever follows the last "period + whitespace" in
        the text; everything before it, stripped, is the game description.
        If the text contains no such separator, the whole text is the
        question and the game description is empty.
        """
        question = re.split(r"\.\s+", example)[-1]
        # The last split piece is always a suffix of the text, so remove it by
        # length. str.replace would also delete any earlier occurrence of the
        # same text inside the game description.
        game = example[: len(example) - len(question)].strip()
        return game, question

    scenarios = []
    for ex in examples:
        situation, question = split_question(ex["example"])
        scenarios.append(
            DebateScenario(
                situation=situation,
                question=question,
                answer_options=["proved", "disproved", "unknown"],
                label=ex["label"],
            )
        )
    return scenarios

In [None]:
# Read every difficulty level from disk, then keep a label-stratified subset.
ds = load_boardgame_qa()
sampled_ds = sample_data(ds, sample_size=0.102)  # ~100 examples per level

# Report, for each level, how many examples were kept and the count per label.
for level in sampled_ds:
    examples = sampled_ds[level]
    print(f"{level}: {len(examples)} examples")
    label_counts = Counter([ex["label"] for ex in examples])
    print(f"  {dict(label_counts)}")

## Run Experiments

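We'll test three model configurations, each across all four difficulty levels:

1. **Same capability** – both debaters and the judge use `gemini-1.5-pro`.
2. **Stronger judge** – `gemini-1.5-flash` debaters are judged by `gemini-1.5-pro`.
3. **Stronger debaters** – `gemini-1.5-pro` debaters are judged by `gemini-1.5-flash`.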


In [None]:
def run_experiment(
    scenario: DebateScenario,
    debater_models: list[str],
    judge_models: list[str],
    experiment_name: str,
):
    """Run baseline, debates and judgments for one scenario and save each stage.

    Creates the ``results`` directory if needed and writes three files into
    it: ``<experiment_name>_baseline.json``, ``<experiment_name>_debates.json``
    and ``<experiment_name>_judgments.json``.

    Args:
        scenario: The single scenario to run. It is wrapped in a one-element
            list for the baseline and passed as-is to the debate.
        debater_models: Model names handed to ``DebateTwoPlayers``.
        judge_models: Model names used for both the baseline and the judging.
        experiment_name: Prefix of the three result file names.
    """
    Path("results").mkdir(exist_ok=True)

    # Stage 1: baseline, run with the judge models on this scenario only.
    baseline_runner = BaselineManager(models=judge_models, scenarios=[scenario])
    baseline_runner.run()
    baseline_runner.save_results(f"results/{experiment_name}_baseline.json")

    # Stage 2: three debate passes on the same object: default, swap=True and
    # all_wrong=True. NOTE(review): what each flag does is defined in
    # debate.debate, not visible here.
    debate_runner = DebateTwoPlayers(
        scenario=scenario,
        debater_models=debater_models,
        word_limit=300,
        max_debate_rounds=3,
    )
    for run_options in ({}, {"swap": True}, {"all_wrong": True}):
        debate_runner.run(**run_options)
    debate_runner.save_results(f"results/{experiment_name}_debates.json")

    # Stage 3: judge the records collected by the debate passes.
    judge_runner = JudgeManager(
        records=debate_runner.get_records(), judge_models=judge_models
    )
    judge_runner.run()
    judge_runner.save_results(f"results/{experiment_name}_judgments.json")

In [None]:
# Smoke test: send the first "Main" scenario through the whole pipeline, using
# the last (least powerful) entry of MODELS for both debaters and the judge.
scenario = convert_to_scenarios(sampled_ds["Main"])[0]
judge_models = [MODELS[-1]]
debater_models = [MODELS[-1], MODELS[-1]]
run_experiment(
    scenario=scenario,
    debater_models=debater_models,
    judge_models=judge_models,
    experiment_name="test2",
)


In [None]:
# Show the attributes of the scenario built in the previous cell.
vars(scenario)

In [None]:
# Deliberate stop for "Run All". This replaces a print of an undefined name,
# which halted execution through a NameError. The cells below launch the full
# experiment runs; execute them by hand when you actually want them.
raise RuntimeError(
    "Intentional stop: the cells below launch the full experiment runs. "
    "Run them manually."
)

In [None]:
# Configuration 1: Same capability debaters and judge
debater_models = [MODELS[0]] * 2  # Same model for both debaters
judge_model = [MODELS[0]]  # Same model for judge (passed as a list of models)

for level in LEVELS:
    scenarios = convert_to_scenarios(sampled_ds[level])

    exp_name = f"{level}_same_capability"
    print(f"Running {exp_name}...")
    # run_experiment takes exactly one DebateScenario per call, so feed the
    # scenarios one at a time. Each gets its own indexed file prefix so that a
    # later scenario does not overwrite the results of an earlier one.
    for scenario_idx, level_scenario in enumerate(scenarios):
        run_experiment(
            level_scenario,
            debater_models,
            judge_model,
            f"{exp_name}_{scenario_idx:04d}",
        )

In [None]:
# Configuration 2: Stronger judge
debater_models = [MODELS[1]] * 2  # Weaker debaters
judge_model = [MODELS[0]]  # Stronger judge (passed as a list of models)

for level in LEVELS:
    scenarios = convert_to_scenarios(sampled_ds[level])

    exp_name = f"{level}_stronger_judge"
    print(f"Running {exp_name}...")
    # run_experiment takes exactly one DebateScenario per call, so feed the
    # scenarios one at a time. Each gets its own indexed file prefix so that a
    # later scenario does not overwrite the results of an earlier one.
    for scenario_idx, level_scenario in enumerate(scenarios):
        run_experiment(
            level_scenario,
            debater_models,
            judge_model,
            f"{exp_name}_{scenario_idx:04d}",
        )

In [None]:
# Configuration 3: Stronger debaters
debater_models = [MODELS[0]] * 2  # Stronger debaters
judge_model = [MODELS[1]]  # Weaker judge (passed as a list of models)

for level in LEVELS:
    scenarios = convert_to_scenarios(sampled_ds[level])

    exp_name = f"{level}_stronger_debaters"
    print(f"Running {exp_name}...")
    # run_experiment takes exactly one DebateScenario per call, so feed the
    # scenarios one at a time. Each gets its own indexed file prefix so that a
    # later scenario does not overwrite the results of an earlier one.
    for scenario_idx, level_scenario in enumerate(scenarios):
        run_experiment(
            level_scenario,
            debater_models,
            judge_model,
            f"{exp_name}_{scenario_idx:04d}",
        )

## Analyze Results

Load and analyze the experiment results:


In [None]:
def load_results(pattern: str = "results/*_judgments.json") -> dict:
    """Load every JSON result file matching ``pattern``.

    Args:
        pattern: Glob pattern, resolved relative to the current working
            directory.

    Returns:
        Dict mapping each matching file's stem (name without ``.json``) to its
        parsed JSON content, inserted in sorted path order. Empty when nothing
        matches. Files in different directories that share a stem collide; the
        one that sorts last wins.

    Raises:
        json.JSONDecodeError: If a matching file is not valid JSON.
    """
    results = {}
    # Path.glob yields entries in filesystem-dependent order; sorting keeps
    # the key order (and any table built from it) stable between runs.
    for path in sorted(Path(".").glob(pattern)):
        # Read as UTF-8 explicitly rather than the platform default encoding.
        with open(path, encoding="utf-8") as f:
            results[path.stem] = json.load(f)
    return results


def analyze_results(results: dict):
    """Compute per-experiment, per-model judgment accuracy.

    Args:
        results: Dict mapping an experiment name to a dict of
            ``model -> list of judgment records``. Each record must have
            ``"judgment"`` and ``"ground_truth"`` keys.

    Returns:
        A ``defaultdict(dict)`` mapping experiment name -> model -> a dict
        with ``"accuracy"``, ``"total_judgments"`` and ``"correct_judgments"``.
        A model with no records gets an accuracy of 0. An experiment with no
        models gets no entry at all.
    """
    summary = defaultdict(dict)

    for experiment, per_model in results.items():
        for model_name, records in per_model.items():
            # A record counts as correct when the verdict equals the gold label.
            n_correct = 0
            for record in records:
                if record["judgment"] == record["ground_truth"]:
                    n_correct += 1
            n_total = len(records)

            # Guard against division by zero for an empty record list.
            if n_total > 0:
                score = n_correct / n_total
            else:
                score = 0

            summary[experiment][model_name] = {
                "accuracy": score,
                "total_judgments": n_total,
                "correct_judgments": n_correct,
            }

    return summary


import pandas as pd

# Read every saved judgment file and compute its accuracy figures.
results = load_results()
analysis = analyze_results(results)

# One row per experiment (result-file stem), one column per model; each cell
# holds that model's metrics dict. `display` is provided by the notebook.
df = pd.DataFrame.from_dict(analysis, orient="index")
display(df)