# Experiment Design
* model selection
    1. openai/gpt-4o-mini
    2. google/gemini-2.5-flash-preview-05-20
    3. deepseek/deepseek-chat-v3-0324:free
    4. deepseek/deepseek-r1-0528:free

* temperature: 0.3
* token limit: none
* dataset: general_wordnet_dataset
* max_turns: 5

* failure definition
    * If the guesser does not guess the target word within max_turns, the game counts as a failure.
    * If the hinter or the guesser does not produce output in the required format, we re-prompt up to 3 times. If the output is still malformed, the game counts as a failure.

* What should we collect?
   example: {
  "run_id": "gpt4o_haiku_idx042",
  "hinter_model": "gpt-4o",
  "guesser_model": "claude-haiku",
  "temperature": 0.3,
  "domain": "general",
  "target_word": "apple",
  "taboo_words": ["fruit", "iphone", "mac", "red", "tech"],
  "success": true,
  "turn_count": 3,
  "total_tokens": 147,
  "turns": [
    {
      "turn_id": 1,
      "hinter_prompt": "...",
      "hinter_output": "...",
      "hint_tokens": 18,
      "hint_violate": false,
      "hint_format_ok": true,
      
      "guesser_prompt": "...",
      "guesser_output": "banana",
      "guess_tokens": 5,
      "correct": false,
      "rank_delta": 10
    },
    {
      "turn_id": 2,
      ...
    },
    ...
  ]
}
* Game set
    * init prompt:
        hinter:
            system: |
                You are **Hinter** in a word-guessing Taboo Game.
                – Your goal: help the Guesser identify the *target word*.
                – Hard rules:
                  1. **Do NOT** use or partially spell any taboo words.
                  2. Produce **exactly one line** that starts with ‘Clue: ’.
                  3. Keep the clue ≤{hint_len} tokens.
                – After each turn you will get the Guesser’s guess and must output the next clue.
                – If you break a rule, the game is immediately lost.
            user: |
                ### Game context
                Target word: **{target_word}**
                Taboo list (must not appear in your clue): {taboo_words}
                
                Previously given clues:
                {history_clues}
                
                Provide your next clue now.

        guesser:
            system: |
                You are **Guesser** in a word-guessing Taboo Game.
                – Read the latest clue and output **one single word** as your guess.
                – Output must start with ‘Guess: ’ followed by your word.
                – Respond with exactly one line; no explanations.
            user: |
                ### Game context
                Latest clue from Hinter:
                {latest_clue}
                
                Your previous guesses (for context): {history_guesses}
                
                Provide your next guess now.


In [None]:
# Import required libraries
import json
import random
import time
import requests
from typing import Dict, List, Any, Optional
import logging
from datetime import datetime
import os

# Setup logging
# Configure the root logger once: INFO level, with a timestamp and the
# severity name prepended to every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the functions and classes defined below.
logger = logging.getLogger(__name__)


In [None]:
# 1. Load Dataset
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """
    Read the game dataset from a UTF-8 encoded JSON file.

    Args:
        dataset_path: Location of the dataset JSON file

    Returns:
        The parsed JSON content (expected to be a list of game entries)

    Raises:
        Exception: Any error raised while opening, parsing or measuring the
            file is logged and then re-raised unchanged.
    """
    try:
        with open(dataset_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
        entry_count = len(entries)
        logger.info(f"Successfully loaded dataset with {entry_count} entries from {dataset_path}")
    except Exception as exc:
        logger.error(f"Failed to load dataset from {dataset_path}: {exc}")
        raise
    else:
        return entries

# Load the general wordnet dataset as specified in experiment design
# NOTE: the path is relative, so it resolves against the current working
# directory of the notebook kernel.
DATASET_PATH = "data/wordnet_dataset.json"
dataset = load_dataset(DATASET_PATH)

# Display dataset info
# Preview of the first entry. This indexes dataset[0] and the keys 'target',
# 'taboo', 'category' and 'definition' directly, so an empty dataset or an
# entry missing one of those keys raises here.
print(f"Dataset loaded: {len(dataset)} entries")
print(f"First example:")
print(f"Target: {dataset[0]['target']}")
print(f"Taboo words: {dataset[0]['taboo']}")
print(f"Category: {dataset[0]['category']}")
print(f"Definition: {dataset[0]['definition']}")


In [None]:
# 2. Setup OpenRouter API
def load_api_keys(keys_path: str = "api_keys.json") -> Dict[str, str]:
    """
    Read API credentials from a UTF-8 encoded JSON file.

    Args:
        keys_path: Location of the JSON file holding the API keys

    Returns:
        The parsed JSON content (expected to map key names to key values)

    Raises:
        Exception: Any error raised while opening or parsing the file is
            logged and then re-raised unchanged.
    """
    try:
        with open(keys_path, 'r', encoding='utf-8') as handle:
            credentials = json.load(handle)
        logger.info("API keys loaded successfully")
    except Exception as exc:
        logger.error(f"Failed to load API keys from {keys_path}: {exc}")
        raise
    else:
        return credentials

class OpenRouterClient:
    """
    Thin HTTP wrapper around the OpenRouter chat-completions endpoint.
    """

    def __init__(self, api_key: str):
        # Bearer-token auth header is built once and reused for every request.
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def call_model(self,
                   model: str,
                   messages: List[Dict[str, str]],
                   temperature: float = 0.3) -> Dict[str, Any]:
        """
        POST a chat-completion request for the given model.

        Args:
            model: Model identifier (e.g., 'openai/gpt-4o-mini')
            messages: Chat messages, each a dict with 'role' and 'content'
            temperature: Sampling temperature sent with the request

        Returns:
            The decoded JSON body of the HTTP response

        Raises:
            requests.exceptions.RequestException: Transport or HTTP-status
                failure (logged, then re-raised).
            Exception: Any other failure (logged, then re-raised).
        """
        # No max_tokens entry: the experiment design specifies no token limit.
        request_body = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
        }

        try:
            http_response = requests.post(
                self.base_url,
                headers=self.headers,
                json=request_body,
                timeout=30
            )
            http_response.raise_for_status()
            parsed = http_response.json()
            logger.debug(f"API call successful for model {model}")
        except requests.exceptions.RequestException as exc:
            logger.error(f"API call failed for model {model}: {exc}")
            raise
        except Exception as exc:
            logger.error(f"Unexpected error in API call: {exc}")
            raise

        return parsed

# Initialize OpenRouter client
# load_api_keys() reads "api_keys.json" by default; the parsed file must
# contain an "OPENROUTER_API_KEY" entry or the lookup below raises KeyError.
api_keys = load_api_keys()
openrouter_client = OpenRouterClient(api_keys["OPENROUTER_API_KEY"])

# Models under test. The identifiers mirror the "model selection" list in the
# experiment design notes at the top of this file.
MODELS = [
    "openai/gpt-4o-mini",
    "google/gemini-2.5-flash-preview-05-20",  # Exact name from experiment design
    "deepseek/deepseek-chat-v3-0324:free",  # Exact name from experiment design
    "deepseek/deepseek-r1-0528:free",  # Exact name from experiment design
]

print("OpenRouter client initialized successfully")
print(f"Available models for experiment: {MODELS}")


In [None]:
# 3. Experiment Configuration
# Experiment parameters from design document
EXPERIMENT_CONFIG = {
    "temperature": 0.3,  # Sampling temperature passed to every model call
    "max_turns": 5,  # A game is played for at most this many hint/guess turns
    "hint_len": 20,  # Maximum hint length; checked with count_tokens_rough (whitespace-separated words)
    "domain": "general",  # Copied verbatim into each game result
    "max_reprompt_attempts": 3,  # Total attempts per hint / per guess before the turn fails
}

# Prompt templates from experiment design
# Placeholders in braces are filled with str.format at call time.
# The prompt text itself is part of the experiment; do not edit it casually.

# Placeholder: {hint_len}
HINTER_SYSTEM_PROMPT = """You are **Hinter** in a word-guessing Taboo Game.
– Your goal: help the Guesser identify the *target word*.
– Hard rules:
  1. **Do NOT** use or partially spell any taboo words.
  2. Produce **exactly one line** that starts with 'Clue: '.
  3. Keep the clue ≤{hint_len} tokens.
– After each turn you will get the Guesser's guess and must output the next clue.
– If you break a rule, the game is immediately lost."""

# Placeholders: {target_word}, {taboo_words}, {history_clues}
HINTER_USER_TEMPLATE = """### Game context
Target word: **{target_word}**
Taboo list (must not appear in your clue): {taboo_words}

Previously given clues:
{history_clues}

Provide your next clue now."""

# No placeholders; used as-is.
GUESSER_SYSTEM_PROMPT = """You are **Guesser** in a word-guessing Taboo Game.
– Read the latest clue and output **one single word** as your guess.
– Output must start with 'Guess: ' followed by your word.
– Respond with exactly one line; no explanations."""

# Placeholders: {latest_clue}, {history_guesses}
GUESSER_USER_TEMPLATE = """### Game context
Latest clue from Hinter:
{latest_clue}

Your previous guesses (for context): {history_guesses}

Provide your next guess now."""

# Echo the active configuration so it is visible in the cell output.
print("Experiment configuration loaded:")
print(f"Temperature: {EXPERIMENT_CONFIG['temperature']}")
print(f"Max turns: {EXPERIMENT_CONFIG['max_turns']}")
print(f"Max hint length: {EXPERIMENT_CONFIG['hint_len']} tokens")
print(f"Domain: {EXPERIMENT_CONFIG['domain']}")
print(f"Max reprompt attempts: {EXPERIMENT_CONFIG['max_reprompt_attempts']}")


In [None]:
# 4. Game Logic and Validation Functions
import re

def validate_hint_format(hint_output: str) -> tuple[bool, str]:
    """
    Check that the hinter's reply begins with the mandatory 'Clue: ' prefix.

    Only the first line of the whitespace-trimmed reply is inspected; any
    further lines are ignored.

    Args:
        hint_output: Raw output from hinter model

    Returns:
        Tuple of (is_valid, cleaned_hint); cleaned_hint is "" when invalid
    """
    prefix = "Clue: "
    head = hint_output.strip().partition('\n')[0].strip()

    if head.startswith(prefix):
        return True, head[len(prefix):].strip()
    return False, ""

def validate_guess_format(guess_output: str) -> tuple[bool, str]:
    """
    Check that the guesser's reply is 'Guess: ' followed by exactly one word.

    Only the first line of the whitespace-trimmed reply is inspected; any
    further lines are ignored.

    Args:
        guess_output: Raw output from guesser model

    Returns:
        Tuple of (is_valid, cleaned_guess); cleaned_guess is "" when invalid
    """
    prefix = "Guess: "
    head = guess_output.strip().partition('\n')[0].strip()

    if not head.startswith(prefix):
        return False, ""

    # Anything other than a single whitespace-free token is rejected.
    words = head[len(prefix):].split()
    if len(words) == 1:
        return True, words[0]
    return False, ""

def check_taboo_violation(hint: str, taboo_words: List[str]) -> bool:
    """
    Report whether the hint uses any taboo word, case-insensitively.

    A taboo word counts as used when it appears as a whole word, or, for
    taboo words of three or more characters, anywhere inside the hint text.

    Args:
        hint: The hint text to check
        taboo_words: List of forbidden words

    Returns:
        True if violation found, False otherwise
    """
    lowered_hint = hint.lower()

    def _is_used(word: str) -> bool:
        needle = word.lower()
        whole_word = re.search(r'\b' + re.escape(needle) + r'\b', lowered_hint) is not None
        embedded = len(needle) >= 3 and needle in lowered_hint
        return whole_word or embedded

    return any(_is_used(word) for word in taboo_words)

def count_tokens_rough(text: str) -> int:
    """
    Approximate a token count by counting whitespace-separated words.

    Args:
        text: Text to measure

    Returns:
        Number of whitespace-separated chunks in the text
    """
    chunks = text.split()
    return len(chunks)

print("Game validation functions loaded successfully")


In [None]:
# 5. Main Game Engine
class TabooGame:
    """
    Main game engine for running single taboo game instance.

    `_play_turn` relies on `self._get_hint` and `self._get_guess`, which are
    not defined in this class body and must be available on the class before
    a game is played.
    """

    def __init__(self,
                 openrouter_client: OpenRouterClient,
                 hinter_model: str,
                 guesser_model: str,
                 config: Dict[str, Any]):
        self.client = openrouter_client
        self.hinter_model = hinter_model
        self.guesser_model = guesser_model
        self.config = config

        # Game state
        self.turns = []
        self.history_clues = []
        self.history_guesses = []
        self.success = False
        self.total_tokens = 0

    def play_single_game(self, game_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Play a single taboo game with given target word and taboo list.

        Args:
            game_data: Dictionary containing 'target', 'taboo', etc.

        Returns:
            Complete game result dictionary
        """
        target_word = game_data['target']
        taboo_words = game_data['taboo']

        logger.info(f"Starting game - Target: {target_word}, Hinter: {self.hinter_model}, Guesser: {self.guesser_model}")

        # Initialize game state
        self.turns = []
        self.history_clues = []
        self.history_guesses = []
        self.success = False
        self.total_tokens = 0

        # Generate unique run ID
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        hinter_short = self.hinter_model.split('/')[-1][:8]
        guesser_short = self.guesser_model.split('/')[-1][:8]
        run_id = f"{hinter_short}_{guesser_short}_{timestamp}"

        # Play the game turn by turn
        for turn in range(1, self.config['max_turns'] + 1):
            turn_result = self._play_turn(turn, target_word, taboo_words)
            self.turns.append(turn_result)

            # A taboo violation loses the game immediately, so it is checked
            # before the guess: a correct guess cannot rescue a rule break.
            if turn_result.get('hint_violate', False):
                logger.info(f"Game failed due to taboo violation in turn {turn}")
                break
            if not turn_result.get('hint_format_ok', True) or not turn_result.get('guesser_format_ok', True):
                logger.info(f"Game failed due to format issues in turn {turn}")
                break
            if turn_result.get('correct', False):
                self.success = True
                logger.info(f"Game won in turn {turn}!")
                break

        # Compile final result
        result = {
            "run_id": run_id,
            "hinter_model": self.hinter_model,
            "guesser_model": self.guesser_model,
            "temperature": self.config['temperature'],
            "domain": self.config['domain'],
            "target_word": target_word,
            "taboo_words": taboo_words,
            "success": self.success,
            "turn_count": len(self.turns),
            "total_tokens": self.total_tokens,
            "turns": self.turns
        }

        return result

    def _play_turn(self, turn_id: int, target_word: str, taboo_words: List[str]) -> Dict[str, Any]:
        """
        Play a single turn of the game.

        Args:
            turn_id: Current turn number (1-indexed)
            target_word: The target word to guess
            taboo_words: List of forbidden words

        Returns:
            Turn result dictionary
        """
        logger.info(f"Playing turn {turn_id}")

        # Generate hint
        hint_result = self._get_hint(target_word, taboo_words)

        # If hint generation failed, end turn
        if not hint_result['hint_format_ok']:
            return {
                "turn_id": turn_id,
                **hint_result,
                "guesser_prompt": "",
                "guesser_output": "",
                "guess_tokens": 0,
                "guesser_format_ok": False,
                "correct": False
            }

        # Add hint to history
        self.history_clues.append(hint_result['hint'])

        # Generate guess
        guess_result = self._get_guess(hint_result['hint'])

        # Add guess to history
        if guess_result['guesser_format_ok']:
            self.history_guesses.append(guess_result['guess'])

        # Check if guess is correct
        correct = (guess_result.get('guess', '').lower() == target_word.lower())

        # Combine results
        turn_result = {
            "turn_id": turn_id,
            **hint_result,
            **guess_result,
            "correct": correct
        }

        self.total_tokens += hint_result.get('hint_tokens', 0) + guess_result.get('guess_tokens', 0)

        return turn_result

print("TabooGame class loaded successfully")


In [None]:
# 5. Main Game Engine
class TabooGame:
    """
    Main game engine for running single taboo game instance.

    `play_single_game` calls `self._play_turn`, which is not defined in this
    class body and must be available on the class before a game is played.
    """

    def __init__(self,
                 openrouter_client: OpenRouterClient,
                 hinter_model: str,
                 guesser_model: str,
                 config: Dict[str, Any]):
        self.client = openrouter_client
        self.hinter_model = hinter_model
        self.guesser_model = guesser_model
        self.config = config

        # Game state
        self.turns = []
        self.history_clues = []
        self.history_guesses = []
        self.success = False
        self.total_tokens = 0

    def play_single_game(self, game_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Play a single taboo game with given target word and taboo list.

        The game stops at the first turn that has a taboo violation, a format
        failure, or a correct guess, and otherwise after `max_turns` turns.
        Only a correct guess on a rule-abiding turn counts as success.

        Args:
            game_data: Dictionary containing 'target', 'taboo', etc.

        Returns:
            Complete game result dictionary
        """
        target_word = game_data['target']
        taboo_words = game_data['taboo']

        logger.info(f"Starting game - Target: {target_word}, Hinter: {self.hinter_model}, Guesser: {self.guesser_model}")

        # Initialize game state (the instance may be reused for several games)
        self.turns = []
        self.history_clues = []
        self.history_guesses = []
        self.success = False
        self.total_tokens = 0

        # Generate run ID from the shortened model names and a timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        hinter_short = self.hinter_model.split('/')[-1][:8]
        guesser_short = self.guesser_model.split('/')[-1][:8]
        run_id = f"{hinter_short}_{guesser_short}_{timestamp}"

        # Play the game turn by turn
        for turn in range(1, self.config['max_turns'] + 1):
            turn_result = self._play_turn(turn, target_word, taboo_words)
            self.turns.append(turn_result)

            # A taboo violation loses the game immediately, so it is checked
            # before the guess: a correct guess cannot rescue a rule break.
            if turn_result.get('hint_violate', False):
                logger.info(f"Game failed due to taboo violation in turn {turn}")
                break
            if not turn_result.get('hint_format_ok', True) or not turn_result.get('guesser_format_ok', True):
                logger.info(f"Game failed due to format issues in turn {turn}")
                break
            if turn_result.get('correct', False):
                self.success = True
                logger.info(f"Game won in turn {turn}!")
                break

        # Compile final result
        result = {
            "run_id": run_id,
            "hinter_model": self.hinter_model,
            "guesser_model": self.guesser_model,
            "temperature": self.config['temperature'],
            "domain": self.config['domain'],
            "target_word": target_word,
            "taboo_words": taboo_words,
            "success": self.success,
            "turn_count": len(self.turns),
            "total_tokens": self.total_tokens,
            "turns": self.turns
        }

        return result

print("TabooGame class loaded successfully")


In [None]:
# 6. Game Methods Implementation
def _play_turn(self, turn_id: int, target_word: str, taboo_words: List[str]) -> Dict[str, Any]:
    """
    Run one hint/guess exchange and return its record.

    Args:
        turn_id: Current turn number (1-indexed)
        target_word: The word the guesser has to find
        taboo_words: Words the hinter is not allowed to use

    Returns:
        Dict holding 'turn_id', every field of the hint result, every field
        of the guess result (blank placeholders when no guess was requested)
        and 'correct'.
    """
    logger.info(f"Playing turn {turn_id}")

    header = {"turn_id": turn_id}
    hint_info = self._get_hint(target_word, taboo_words)

    if not hint_info['hint_format_ok']:
        # No usable hint: the guesser is never called for this turn.
        no_guess = {
            "guesser_prompt": "",
            "guesser_output": "",
            "guess_tokens": 0,
            "guesser_format_ok": False,
            "correct": False
        }
        return header | hint_info | no_guess

    clue = hint_info['hint']
    self.history_clues.append(clue)

    guess_info = self._get_guess(clue)
    if guess_info['guesser_format_ok']:
        self.history_guesses.append(guess_info['guess'])

    # Case-insensitive comparison of the guess against the target word.
    is_correct = guess_info.get('guess', '').lower() == target_word.lower()

    record = header | hint_info | guess_info | {"correct": is_correct}

    tokens_this_turn = hint_info.get('hint_tokens', 0) + guess_info.get('guess_tokens', 0)
    self.total_tokens += tokens_this_turn

    return record

def _get_hint(self, target_word: str, taboo_words: List[str]) -> Dict[str, Any]:
    """
    Generate hint from hinter model with validation and retry logic.

    The same prompt is re-sent up to `max_reprompt_attempts` times while the
    reply has the wrong format, is too long, or the call raises. A taboo
    violation is recorded but never retried.

    Args:
        target_word: The target word to hint for
        taboo_words: List of forbidden words

    Returns:
        Dictionary with keys 'hinter_prompt', 'hinter_output', 'hint',
        'hint_tokens', 'hint_violate' and 'hint_format_ok'. When every
        attempt fails, 'hint_format_ok' is False and the other fields are
        blank.
    """
    # Prepare history string: one "Turn N: clue" entry per line.
    if self.history_clues:
        history_clues_str = "\n".join(
            f"Turn {turn_no}: {clue}"
            for turn_no, clue in enumerate(self.history_clues, start=1)
        )
    else:
        history_clues_str = "None"

    # Build prompt
    system_prompt = HINTER_SYSTEM_PROMPT.format(hint_len=self.config['hint_len'])
    user_prompt = HINTER_USER_TEMPLATE.format(
        target_word=target_word,
        taboo_words=taboo_words,
        history_clues=history_clues_str
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Human-readable copy of the prompt stored with the turn record.
    hinter_prompt = f"System: {system_prompt}\n\nUser: {user_prompt}"

    # Try multiple times ONLY if format is wrong (as per experiment design)
    for attempt in range(self.config['max_reprompt_attempts']):
        try:
            # Call model (no max_tokens as per experiment design)
            response = self.client.call_model(
                model=self.hinter_model,
                messages=messages,
                temperature=self.config['temperature']
            )

            hint_output = response['choices'][0]['message']['content']
            hint_tokens = response['usage']['completion_tokens']

            # Validate format
            format_ok, hint = validate_hint_format(hint_output)
            if not format_ok:
                logger.warning(f"Hint format invalid on attempt {attempt+1}: {hint_output}")
                continue  # Retry for format issues

            # Check token count (this is also a format issue)
            hint_length = count_tokens_rough(hint)
            if hint_length > self.config['hint_len']:
                logger.warning(f"Hint too long on attempt {attempt+1}: {hint_length} tokens")
                continue  # Retry for length issues

            # Check taboo violation (DO NOT retry for this, just record it)
            hint_violate = check_taboo_violation(hint, taboo_words)

            # Return result (even if taboo violation - no retry for violations)
            return {
                "hinter_prompt": hinter_prompt,
                "hinter_output": hint_output,
                "hint": hint,
                "hint_tokens": hint_tokens,
                "hint_violate": hint_violate,
                "hint_format_ok": True
            }

        except Exception as e:
            # Best effort: a failed call or malformed response uses up one attempt.
            logger.error(f"Error in hint generation attempt {attempt+1}: {e}")

    # All attempts failed
    return {
        "hinter_prompt": hinter_prompt,
        "hinter_output": "",
        "hint": "",
        "hint_tokens": 0,
        "hint_violate": False,
        "hint_format_ok": False
    }

def _get_guess(self, latest_hint: str) -> Dict[str, Any]:
    """
    Generate guess from guesser model with validation and retry logic.

    The same prompt is re-sent up to `max_reprompt_attempts` times while the
    reply has the wrong format or the call raises.

    Args:
        latest_hint: The most recent hint from hinter

    Returns:
        Dictionary with keys 'guesser_prompt', 'guesser_output', 'guess',
        'guess_tokens' and 'guesser_format_ok'. When every attempt fails,
        'guesser_format_ok' is False and the other fields are blank.
    """
    # Prepare history string
    history_guesses_str = ", ".join(self.history_guesses) if self.history_guesses else "None"

    # Build prompt
    user_prompt = GUESSER_USER_TEMPLATE.format(
        latest_clue=latest_hint,
        history_guesses=history_guesses_str
    )

    messages = [
        {"role": "system", "content": GUESSER_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt}
    ]

    # Human-readable copy of the prompt stored with the turn record.
    guesser_prompt = f"System: {GUESSER_SYSTEM_PROMPT}\n\nUser: {user_prompt}"

    # Try multiple times if format is wrong
    for attempt in range(self.config['max_reprompt_attempts']):
        try:
            # Call model (no max_tokens as per experiment design)
            response = self.client.call_model(
                model=self.guesser_model,
                messages=messages,
                temperature=self.config['temperature']
            )

            guess_output = response['choices'][0]['message']['content']
            guess_tokens = response['usage']['completion_tokens']

            # Validate format
            format_ok, guess = validate_guess_format(guess_output)
            if format_ok:
                return {
                    "guesser_prompt": guesser_prompt,
                    "guesser_output": guess_output,
                    "guess": guess,
                    "guess_tokens": guess_tokens,
                    "guesser_format_ok": True
                }

            logger.warning(f"Guess format invalid on attempt {attempt+1}: {guess_output}")

        except Exception as e:
            # Best effort: a failed call or malformed response uses up one attempt.
            logger.error(f"Error in guess generation attempt {attempt+1}: {e}")

    # All attempts failed
    return {
        "guesser_prompt": guesser_prompt,
        "guesser_output": "",
        "guess": "",
        "guess_tokens": 0,
        "guesser_format_ok": False
    }

# Add methods to TabooGame class
# The three functions above are written as plain module-level functions that
# take `self`; assigning them as class attributes makes them regular methods
# of every TabooGame instance.
TabooGame._play_turn = _play_turn
TabooGame._get_hint = _get_hint
TabooGame._get_guess = _get_guess

print("Game methods implementation completed")


In [None]:
# 7. Experiment Runner
class ExperimentRunner:
    """
    Main experiment runner to conduct taboo game experiments across different model pairs.

    Every ordered (hinter, guesser) pair of `models` is played, including a
    model paired with itself.
    """

    def __init__(self,
                 openrouter_client: "OpenRouterClient",
                 dataset: List[Dict[str, Any]],
                 models: List[str],
                 config: Dict[str, Any]):
        self.client = openrouter_client
        self.dataset = dataset
        self.models = models
        self.config = config
        self.results = []

    def run_experiment(self,
                      num_games_per_pair: int = 5,
                      sample_dataset: bool = True,
                      save_results: bool = True) -> List[Dict[str, Any]]:
        """
        Run complete experiment with all model combinations.

        Args:
            num_games_per_pair: Number of games to run for each hinter-guesser pair
                (capped at the dataset size)
            sample_dataset: Whether to randomly sample from dataset; when
                False the first entries of the dataset are used
            save_results: Whether to save results to file

        Returns:
            List of all game results
        """
        logger.info("Starting taboo game experiment")
        logger.info(f"Models: {self.models}")
        logger.info(f"Games per pair: {num_games_per_pair}")
        logger.info(f"Dataset size: {len(self.dataset)}")

        self.results = []
        # A pair can never play more games than the dataset has entries, so
        # the progress total is based on the capped count.
        games_per_pair = min(num_games_per_pair, len(self.dataset))
        total_combinations = len(self.models) * len(self.models) * games_per_pair
        game_count = 0

        # Test all hinter-guesser combinations
        for hinter_model in self.models:
            for guesser_model in self.models:
                logger.info(f"Testing pair: Hinter={hinter_model}, Guesser={guesser_model}")

                # Sample games for this pair
                if sample_dataset:
                    game_samples = random.sample(self.dataset, games_per_pair)
                else:
                    game_samples = self.dataset[:games_per_pair]

                # Run games for this pair
                for game_data in game_samples:
                    game_count += 1
                    logger.info(f"Running game {game_count}/{total_combinations} - Target: {game_data['target']}")

                    # Create game instance
                    game = TabooGame(
                        openrouter_client=self.client,
                        hinter_model=hinter_model,
                        guesser_model=guesser_model,
                        config=self.config
                    )

                    try:
                        # Play the game
                        result = game.play_single_game(game_data)
                        self.results.append(result)

                        # Log result
                        status = "SUCCESS" if result['success'] else "FAILURE"
                        logger.info(f"Game {game_count} completed: {status} in {result['turn_count']} turns")

                    except Exception as e:
                        logger.error(f"Game {game_count} failed with error: {e}")
                        # Create failure record with the same top-level keys
                        # as a normal result, plus 'error'.
                        failure_result = {
                            "run_id": f"failed_{game_count}",
                            "hinter_model": hinter_model,
                            "guesser_model": guesser_model,
                            "temperature": self.config.get('temperature'),
                            "domain": self.config.get('domain'),
                            "target_word": game_data['target'],
                            "taboo_words": game_data['taboo'],
                            "success": False,
                            "error": str(e),
                            "turn_count": 0,
                            "total_tokens": 0,
                            "turns": []
                        }
                        self.results.append(failure_result)

                    # Small delay to avoid rate limits
                    time.sleep(1)

        # Save results if requested
        if save_results:
            self._save_results()

        logger.info(f"Experiment completed! Total games: {len(self.results)}")
        return self.results

    def _save_results(self):
        """Save experiment results to a timestamped JSON file in the working directory."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"taboo_experiment_results_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        logger.info(f"Results saved to {filename}")

    @staticmethod
    def _role_stats(games: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarise game count, success rate and average turns for one model in one role."""
        if not games:
            return {"total_games": 0, "success_rate": 0, "avg_turns": 0}

        game_total = len(games)
        return {
            "total_games": game_total,
            "success_rate": sum(1 for g in games if g.get('success', False)) / game_total,
            "avg_turns": sum(g.get('turn_count', 0) for g in games) / game_total
        }

    def get_summary_stats(self) -> Dict[str, Any]:
        """
        Generate summary statistics from experiment results.

        Returns:
            Dictionary with overall totals plus, per model, statistics for
            its games as hinter and as guesser; {"error": ...} when there
            are no results yet
        """
        if not self.results:
            return {"error": "No results available"}

        total_games = len(self.results)
        won_games = [r for r in self.results if r.get('success', False)]
        successful_games = len(won_games)

        # Per-model statistics
        model_stats = {
            model: {
                "as_hinter": self._role_stats(
                    [r for r in self.results if r.get('hinter_model') == model]
                ),
                "as_guesser": self._role_stats(
                    [r for r in self.results if r.get('guesser_model') == model]
                )
            }
            for model in self.models
        }

        return {
            "total_games": total_games,
            "successful_games": successful_games,
            "overall_success_rate": successful_games / total_games,
            "avg_turns_when_successful": sum(r['turn_count'] for r in won_games) / successful_games if successful_games > 0 else 0,
            "total_tokens": sum(r.get('total_tokens', 0) for r in self.results),
            "model_statistics": model_stats
        }

print("ExperimentRunner class loaded successfully")


In [None]:
# 8. Example Usage and Testing
# Create experiment runner
experiment_runner = ExperimentRunner(
    openrouter_client=openrouter_client,
    dataset=dataset,
    models=MODELS,
    config=EXPERIMENT_CONFIG
)

# Summarise what the runner was configured with. "\n" is a real newline here
# so the banner is separated from the previous output by a blank line.
print("Experiment setup completed!")
print("\nReady to run experiments with:")
print(f"- {len(MODELS)} models: {MODELS}")
print(f"- {len(dataset)} games in dataset")
print(f"- Temperature: {EXPERIMENT_CONFIG['temperature']}")
print(f"- Max turns per game: {EXPERIMENT_CONFIG['max_turns']}")
print(f"- Max hint length: {EXPERIMENT_CONFIG['hint_len']} tokens")

# Example: Run a small test experiment (uncomment to execute)
# Nothing is run here; the lines below only print the commands to use.
print("\n" + "="*50)
print("EXAMPLE: To run a small test experiment, execute:")
print("# results = experiment_runner.run_experiment(num_games_per_pair=2)")
print("# stats = experiment_runner.get_summary_stats()")
print("# print(json.dumps(stats, indent=2))")
print("="*50)


In [None]:
# 9. Quick Test - Single Game Demo
def run_single_game_demo():
    """
    Run a single game demonstration to test the system.

    Picks one random entry from the module-level ``dataset``, builds a
    ``TabooGame`` with ``MODELS[0]`` as hinter and ``MODELS[1]`` as guesser,
    plays the game once and prints the outcome followed by a turn-by-turn
    breakdown.

    Any exception raised while playing the game or printing its result is
    caught, reported together with its traceback, and not re-raised.
    The function returns nothing.
    """
    print("Running single game demonstration...")

    # Pick a sample game
    sample_game = random.choice(dataset)
    print("\nSample game selected:")
    print(f"Target word: {sample_game['target']}")
    print(f"Taboo words: {sample_game['taboo']}")
    print(f"Definition: {sample_game['definition']}")

    # Create game instance with the first two configured models
    demo_game = TabooGame(
        openrouter_client=openrouter_client,
        hinter_model=MODELS[0],   # First model as hinter
        guesser_model=MODELS[1],  # Second model as guesser
        config=EXPERIMENT_CONFIG,
    )

    print("\nGame setup:")
    print(f"Hinter: {MODELS[0]}")
    print(f"Guesser: {MODELS[1]}")
    print(f"Max turns: {EXPERIMENT_CONFIG['max_turns']}")

    try:
        # Play the game
        result = demo_game.play_single_game(sample_game)

        print("\n" + "=" * 60)
        print("GAME RESULT:")
        print(f"Success: {result['success']}")
        print(f"Turns played: {result['turn_count']}")
        print(f"Total tokens: {result['total_tokens']}")

        # Show turn-by-turn details.
        # NOTE(review): the per-turn keys read below ('hint', 'guess',
        # 'hint_format_ok', 'guesser_format_ok', ...) must match what
        # play_single_game records -- confirm against TabooGame.
        print("\nTurn-by-turn breakdown:")
        for turn in result['turns']:
            print(f"  Turn {turn['turn_id']}:")
            if turn.get('hint_format_ok', False):
                print(f"    Hint: {turn.get('hint', 'N/A')}")
                print(f"    Taboo violation: {turn.get('hint_violate', False)}")
            else:
                print("    Hint: [FORMAT ERROR]")

            if turn.get('guesser_format_ok', False):
                print(f"    Guess: {turn.get('guess', 'N/A')}")
                print(f"    Correct: {turn.get('correct', False)}")
            else:
                print("    Guess: [FORMAT ERROR]")
            print()

        print("=" * 60)

    except Exception as e:
        # Demo boundary: report the failure instead of propagating it, so a
        # broken game run does not abort the notebook cell.
        print(f"Demo failed with error: {e}")
        import traceback
        traceback.print_exc()

# The demo is not run automatically; this statement only prints a reminder
# of the call that starts it.
print("To run a single game demo, execute: run_single_game_demo()")
