In [4]:
!pip install SciencePlots
!pip3 install SciencePlots

# 用法示例
import matplotlib.pyplot as plt
plt.style.use(['science', 'no-latex'])
import numpy as np

plt.style.use(['science', 'grid'])

x = np.linspace(0, 10, 100)
y1 = np.sin(x)
y2 = np.cos(x)

plt.plot(x, y1, label='sin(x)')
plt.plot(x, y2, label='cos(x)')
plt.xlabel('x')
plt.ylabel('Value')
plt.legend()
plt.title('A beautiful scientific plot')
plt.tight_layout()
plt.savefig('plot.pdf')
plt.show()




OSError: 'science' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [1]:
# 导入必要的库
import json
import pandas as pd
import random
import time
import requests
import os
from typing import Dict, List, Any
from datetime import datetime

# 加载数据集
def load_dataset(dataset_path: str = "data/dataset.json") -> List[Dict]:
    """加载Taboo数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 加载数据集
print("📚 正在加载数据集...")
dataset = load_dataset()
print(f"✅ 数据集加载完成，共{len(dataset)}条记录")

# 显示数据集样本
print("\n📋 数据集样本:")
sample = random.choice(dataset)
print(f"   目标词: {sample['target']}")
print(f"   类别: {sample.get('category', 'unknown')}")
print(f"   禁用词: {sample['taboo']}")
print(f"   词义数: {len(sample.get('senses', []))}")


📚 正在加载数据集...
✅ 数据集加载完成，共300条记录

📋 数据集样本:
   目标词: recovery
   类别: chemistry
   禁用词: ['forest', 'return', 'advance', 'rapid', 'state']
   词义数: 3


In [2]:
# 数据集统计信息
print("📊 数据集基本统计:")
print("=" * 40)

# 类别分布统计
categories = {}
taboo_counts = []
sense_counts = []

for item in dataset:
    # 统计类别
    category = item.get('category', 'unknown')
    categories[category] = categories.get(category, 0) + 1
    
    # 统计禁用词数量
    taboo_counts.append(len(item.get('taboo', [])))
    
    # 统计词义数量
    sense_counts.append(len(item.get('senses', [])))

print(f"\n🏷️ 类别分布 (Top 5):")
sorted_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)
for i, (category, count) in enumerate(sorted_categories[:5], 1):
    percentage = count / len(dataset) * 100
    print(f"   {i}. {category}: {count} 条 ({percentage:.1f}%)")

print(f"\n🚫 禁用词统计:")
print(f"   平均数量: {sum(taboo_counts) / len(taboo_counts):.1f}")
print(f"   范围: {min(taboo_counts)} - {max(taboo_counts)}")

print(f"\n💭 词义统计:")
print(f"   平均数量: {sum(sense_counts) / len(sense_counts):.1f}")
print(f"   范围: {min(sense_counts)} - {max(sense_counts)}")

print(f"\n✅ 数据集统计完成，质量良好，可用于实验")

# 设置随机种子用于实验
random.seed(240)
print("\n🎲 随机种子已设置为 240，确保实验可复现")


📊 数据集基本统计:

🏷️ 类别分布 (Top 5):
   1. general: 100 条 (33.3%)
   2. chemistry: 50 条 (16.7%)
   3. cs: 50 条 (16.7%)
   4. finance: 50 条 (16.7%)
   5. philosophy: 50 条 (16.7%)

🚫 禁用词统计:
   平均数量: 5.0
   范围: 5 - 5

💭 词义统计:
   平均数量: 3.1
   范围: 1 - 23

✅ 数据集统计完成，质量良好，可用于实验

🎲 随机种子已设置为 240，确保实验可复现


In [3]:
# 设置API客户端
def load_api_keys(keys_path: str = "api_keys.json") -> Dict[str, str]:
    """加载API密钥"""
    with open(keys_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class OpenRouterClient:
    """OpenRouter API客户端"""
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
    
    def call_model(self, model: str, messages: List[Dict[str, str]], temperature: float = 0.3) -> str:
        """调用模型API"""
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": 2000
        }
        response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content'].strip()
        
        # 防止乱码：只保留ASCII可打印字符
        import re
        content = re.sub(r'[^\x20-\x7E]', '', content)
        return content

# 初始化API客户端
try:
    api_keys = load_api_keys()
    client = OpenRouterClient(api_keys["OPENROUTER_API_KEY"])
    print("✅ API客户端初始化成功")
except Exception as e:
    print(f"❌ API客户端初始化失败: {e}")
    client = None

# 定义测试模型
TEST_MODELS = [
    "openai/gpt-4o",
    "google/gemini-2.5-pro", 
    "deepseek/deepseek-chat-v3-0324",
    "anthropic/claude-sonnet-4"
]

print(f"🤖 实验模型: {len(TEST_MODELS)} 个")
for i, model in enumerate(TEST_MODELS, 1):
    print(f"   {i}. {model}")


✅ API客户端初始化成功
🤖 实验模型: 4 个
   1. openai/gpt-4o
   2. google/gemini-2.5-pro
   3. deepseek/deepseek-chat-v3-0324
   4. anthropic/claude-sonnet-4


In [4]:
import json
import random
import time
import requests
import pandas as pd
from typing import Dict, List, Any
from datetime import datetime
import os

# 加载数据集
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """加载Taboo游戏数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    return dataset

# 加载预生成的数据集
DATASET_PATH = "data/dataset.json"
dataset = load_dataset(DATASET_PATH)
print(f"✅ 数据集加载成功: {len(dataset)} 条记录")
print(f"📁 数据集路径: {DATASET_PATH}")

# 显示第一个样本
if dataset:
    sample = dataset[0]
    print(f"\n📋 数据样本:")
    print(f"   目标词: {sample['target']}")
    print(f"   禁用词: {sample['taboo']}")
    print(f"   类别: {sample.get('category', 'N/A')}")
    if sample.get('senses'):
        print(f"   定义: {sample['senses'][0].get('definition', 'N/A')[:100]}...")


✅ 数据集加载成功: 300 条记录
📁 数据集路径: data/dataset.json

📋 数据样本:
   目标词: crotonbug
   禁用词: ['common', 'croton', 'europe', 'german', 'states']
   类别: general
   定义: small light-brown cockroach brought to United States from Europe; a common household pest...


In [5]:
# 通用实验方法 - 核心函数

def safe_text_cleanup(text: str, max_length: int = 200) -> str:
    """安全清理文本，防止乱码和超长内容"""
    if not text:
        return ""
    import re
    cleaned = re.sub(r'[^\x20-\x7E\n\r\t]', '', str(text))
    if len(cleaned) > max_length:
        cleaned = cleaned[:max_length] + "..."
    return cleaned

def robust_api_call(client, model: str, base_prompt: str, expected_prefix: str, max_retries: int = 3):
    """健壮的API调用，包含重试机制和格式验证"""
    failed_outputs = []
    
    for attempt in range(1, max_retries + 1):
        try:
            if attempt == 1:
                prompt = base_prompt
            else:
                prev_output = failed_outputs[-1] if failed_outputs else "Unknown"
                format_reminder = f"""

⚠️ FORMAT ERROR DETECTED ⚠️
Your previous response was: "{prev_output}"

REQUIRED FORMAT:
- You MUST start with exactly '{expected_prefix}' (including square brackets)
- Do NOT add any text before {expected_prefix}

Try again with the exact format:"""
                prompt = base_prompt + format_reminder
            
            response = client.call_model(model, [{"role": "user", "content": prompt}])
            
            if response.strip().upper().startswith(expected_prefix.upper()):
                return {
                    'success': True,
                    'response': response,
                    'attempts': attempt,
                    'error': None,
                    'failed_outputs': failed_outputs
                }
            else:
                safe_response = safe_text_cleanup(response, max_length=150)
                failed_outputs.append(safe_response)
                
                if attempt == max_retries:
                    all_failed = " | ".join(failed_outputs)
                    return {
                        'success': False,
                        'response': f"FORMAT_ERROR_EXCEEDED: {safe_response}",
                        'attempts': attempt,
                        'error': f"Failed after {max_retries} attempts. Expected '{expected_prefix}'. All failed outputs: {all_failed}",
                        'failed_outputs': failed_outputs
                    }
                time.sleep(0.5)
                
        except Exception as e:
            safe_error = safe_text_cleanup(str(e), max_length=150)
            error_msg = f"API error (attempt {attempt}/{max_retries}): {safe_error}"
            
            if attempt == max_retries:
                return {
                    'success': False,
                    'response': None,
                    'attempts': attempt,
                    'error': error_msg,
                    'failed_outputs': failed_outputs
                }
            time.sleep(1.0)
    
    return {
        'success': False,
        'response': None,
        'attempts': max_retries,
        'error': "Max retries exceeded",
        'failed_outputs': failed_outputs
    }

def extract_guess_word(response: str) -> str:
    """从响应中提取猜测词"""
    if response.startswith("FORMAT_ERROR_EXCEEDED"):
        return "FORMAT_ERROR"
    
    if '[GUESS]' in response.upper():
        import re
        match = re.search(r'\[GUESS\]\s*(.+)', response, re.IGNORECASE)
        if match:
            guess_part = match.group(1).strip()
            guess = guess_part.split()[0] if guess_part.split() else ""
            return guess.strip('.,!?;:"\'()[]{}')
    
    if 'Guess:' in response:
        guess_part = response.split('Guess:')[1].strip()
        guess = guess_part.split()[0] if guess_part.split() else ""
        return guess.strip('.,!?;:"\'()[]{}')
    
    return "INVALID_FORMAT"

def extract_clue_text(response: str) -> str:
    """从响应中提取线索文本"""
    if response.startswith("FORMAT_ERROR_EXCEEDED"):
        return "FORMAT_ERROR"
    
    if '[CLUE]' in response.upper():
        import re
        match = re.search(r'\[CLUE\]\s*(.+)', response, re.IGNORECASE | re.DOTALL)
        if match:
            return match.group(1).strip()
    
    if 'Clue:' in response:
        return response.split('Clue:')[1].strip()
    
    return "INVALID_FORMAT"

def check_taboo_violation(hint, taboo_words):
    """检查hinter的线索是否违反了禁用词规则"""
    import re
    hint_lower = hint.lower()
    
    for taboo in taboo_words:
        taboo_lower = taboo.lower()
        # 检查完整单词匹配
        if re.search(r'\b' + re.escape(taboo_lower) + r'\b', hint_lower):
            return True
        # 检查部分拼写（至少3个字符）
        if len(taboo_lower) >= 3 and taboo_lower in hint_lower:
            return True
    
    return False

print("✅ 核心工具函数已定义")


✅ 核心工具函数已定义


In [6]:
# 核心游戏逻辑和通用实验运行器

def enhanced_play_taboo_game(client, hinter_model, guesser_model, target_word, taboo_words, max_turns=5):
    """增强版Taboo游戏，包含taboo violation检查"""
    
    conversation_history = []
    taboo_list = ', '.join(taboo_words)
    
    # 系统提示
    hinter_system = f"""You are playing a Taboo word game as the Hinter.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[CLUE]' (including square brackets)
- Do NOT add any text before [CLUE]
- Example: [CLUE] This is a large mammal with a trunk

GAME RULES:
1. NEVER use any of these taboo words: {taboo_list}
2. Give helpful clues to help the guesser find the target word
3. You can see the conversation history and build on previous clues
4. Be creative but follow the format exactly
5. Use BRITISH ENGLISH spelling (e.g., colour not color, centre not center)

Target word: {target_word}
Taboo words (ABSOLUTELY FORBIDDEN): {taboo_list}"""

    guesser_system = """You are playing a Taboo word game as the Guesser.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[GUESS]' (including square brackets)
- Do NOT add any text before [GUESS]
- Example: [GUESS] elephant

GAME RULES:
1. Make your best guess based on all the clues you've received
2. You can see the conversation history
3. Give only ONE word as your guess after [GUESS]
4. Use BRITISH ENGLISH spelling (e.g., colour not color, centre not center)"""

    # 记录统计信息
    total_hinter_attempts = 0
    total_guesser_attempts = 0
    format_errors = []
    hinter_failed_outputs = []
    guesser_failed_outputs = []

    for turn in range(1, max_turns + 1):
        # 构建Hinter提示
        if turn == 1:
            hinter_prompt = f"{hinter_system}\n\nProvide your first clue:"
        else:
            history_text = "\n".join([f"Turn {i}: {msg}" for i, msg in enumerate(conversation_history, 1)])
            hinter_prompt = f"{hinter_system}\n\nConversation so far:\n{history_text}\n\nThe guesser hasn't found the word yet. Provide your next clue:"
        
        # Hinter给出线索（带重试）
        hinter_result = robust_api_call(client, hinter_model, hinter_prompt, "[CLUE]", max_retries=3)
        total_hinter_attempts += hinter_result['attempts']
        
        if hinter_result.get('failed_outputs'):
            hinter_failed_outputs.extend(hinter_result['failed_outputs'])
        
        if not hinter_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(hinter_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"Turn {turn} Hinter: {hinter_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"HINTER_{error_type}",
                'error': f"{error_type}: {hinter_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        # 提取线索并检查taboo violation
        hint_text = extract_clue_text(hinter_result['response'])
        
        # 🚨 关键：检查是否违反taboo words规则
        taboo_violated = check_taboo_violation(hint_text, taboo_words)
        if taboo_violated:
            # 违规立即失败！
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': 'TABOO_VIOLATION: Hinter违反规则',
                'error': f'TABOO_VIOLATION: Hinter在第{turn}轮违反规则，说了禁用词: {hint_text}',
                'failure_reason': 'TABOO_VIOLATION',
                'taboo_violation_turn': turn,
                'taboo_violation_hint': hint_text,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        conversation_history.append(f"Hinter: {hinter_result['response']}")
        
        # 构建Guesser提示
        history_text = "\n".join([f"Turn {i}: {msg}" for i, msg in enumerate(conversation_history, 1)])
        guesser_prompt = f"{guesser_system}\n\nConversation so far:\n{history_text}\n\nWhat is your guess?"
        
        # Guesser进行猜测（带重试）
        guesser_result = robust_api_call(client, guesser_model, guesser_prompt, "[GUESS]", max_retries=3)
        total_guesser_attempts += guesser_result['attempts']
        
        if guesser_result.get('failed_outputs'):
            guesser_failed_outputs.extend(guesser_result['failed_outputs'])
        
        if not guesser_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(guesser_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"Turn {turn} Guesser: {guesser_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"GUESSER_{error_type}",
                'error': f"{error_type}: {guesser_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        conversation_history.append(f"Guesser: {guesser_result['response']}")
        guess = extract_guess_word(guesser_result['response'])
        
        # 检查是否成功
        if guess.lower() == target_word.lower():
            return {
                'success': True,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': guess,
                'failure_reason': None,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        # 如果不是最后一轮，添加反馈
        if turn < max_turns:
            conversation_history.append(f"System: '{guess}' is not correct. Try again!")
    
    # 达到最大轮数仍未成功
    return {
        'success': False,
        'turns': max_turns,
        'conversation': conversation_history,
        'final_guess': guess if 'guess' in locals() else 'N/A',
        'failure_reason': 'MAX_TURNS_EXCEEDED',
        'total_hinter_attempts': total_hinter_attempts,
        'total_guesser_attempts': total_guesser_attempts,
        'format_errors': format_errors,
        'hinter_failed_outputs': hinter_failed_outputs,
        'guesser_failed_outputs': guesser_failed_outputs,
        'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
        'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
    }

print("✅ 增强版游戏函数已定义（包含严格的taboo violation检查）")


✅ 增强版游戏函数已定义（包含严格的taboo violation检查）


In [7]:
# 统一的Taboo实验运行器
def run_taboo_experiment(client, models, dataset, config):
    """统一的Taboo实验运行器，支持测试和全量模式"""
    
    # 配置参数
    experiment_type = config.get('experiment_type', 'test')
    experiment_mode = config.get('experiment_mode', 'simple')  # 'simple' 或 'grouped_by_hinter'
    max_turns = config.get('max_turns', 5)
    output_dir = config.get('output_dir', 'results')
    fixed_word = config.get('fixed_word', None)
    
    # 创建输出目录
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    if experiment_mode == 'grouped_by_hinter':
        return run_grouped_experiment(client, models, dataset, config, timestamp)
    else:
        return run_simple_experiment(client, models, dataset, config, timestamp)

def run_simple_experiment(client, models, dataset, config, timestamp):
    """简单模式：测试实验，使用固定词汇"""
    experiment_type = config.get('experiment_type', 'test')
    output_dir = config.get('output_dir', 'results')
    fixed_word = config.get('fixed_word', None)
    max_turns = config.get('max_turns', 5)
    
    # 输出设置
    output_path = f"{output_dir}/test_results_{timestamp}.csv"
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"🚀 开始执行{experiment_type}实验...")
    print(f"📁 输出路径: {output_path}")
    
    # 使用固定词汇
    if not fixed_word:
        fixed_word = random.choice(dataset)
    
    target_word = fixed_word['target']
    taboo_words = fixed_word['taboo']
    print(f"🎯 测试词: {target_word}")
    print(f"🚫 禁用词: {taboo_words}")
    
    total_games = len(models) ** 2  # 每个模型对组合1场游戏
    print(f"📊 总游戏数: {total_games}")
    
    all_results = []
    game_counter = 0
    
    # 运行所有模型组合
    for hinter_model in models:
        for guesser_model in models:
            game_counter += 1
            pair_name = f"{hinter_model.split('/')[-1]}→{guesser_model.split('/')[-1]}"
            
            print(f"🔄 游戏 {game_counter}/{total_games} ({game_counter/total_games*100:.1f}%): {pair_name}")
            
            start_time = time.time()
            
            # 执行游戏
            game_result = enhanced_play_taboo_game(client, hinter_model, guesser_model, 
                                                 target_word, taboo_words, max_turns)
            
            duration = round(time.time() - start_time, 2)
            
            # 记录结果
            result = {
                'game_id': game_counter,
                'hinter_model': hinter_model,
                'guesser_model': guesser_model,
                'target_word': target_word,
                'category': fixed_word.get('category', 'unknown'),
                'taboo_words': '|'.join(taboo_words),
                'success': game_result['success'],
                'turns_used': game_result['turns'],
                'final_guess': game_result['final_guess'],
                'failure_reason': game_result.get('failure_reason', None),
                'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                'all_hints': ' | '.join(game_result['all_hints']),
                'all_guesses': ' | '.join(game_result['all_guesses']),
                'conversation': ' | '.join(game_result['conversation']),
                'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                'format_errors': ' | '.join(game_result.get('format_errors', [])),
                'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                'duration_seconds': duration,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            
            if 'error' in game_result:
                result['error'] = game_result['error']
            
            all_results.append(result)
            
            # 显示结果
            status = "✅ 成功" if game_result['success'] else "❌ 失败"
            failure_info = ""
            if not game_result['success'] and game_result.get('failure_reason'):
                failure_reason = game_result['failure_reason']
                if failure_reason == 'TABOO_VIOLATION':
                    failure_info = " (违反禁用词规则)"
                elif failure_reason == 'FORMAT_FAILURE':
                    failure_info = " (格式错误超3次)"
                elif failure_reason == 'API_FAILURE':
                    failure_info = " (API调用失败)"
                elif failure_reason == 'MAX_TURNS_EXCEEDED':
                    failure_info = " (达到最大轮数)"
            
            print(f"   {status}{failure_info} | {game_result['turns']}轮 | 最终猜测: {game_result['final_guess']}")
            time.sleep(0.5)
    
    return save_and_analyze_results(all_results, output_path, experiment_type)

print("✅ 统一实验运行器已定义")


✅ 统一实验运行器已定义


In [12]:
# 支持函数 - 全量实验和结果分析
def run_grouped_experiment(client, models, dataset, config, timestamp):
    """分组模式：全量实验，按hinter模型分组，遍历所有词汇，每50个游戏保存一个批次文件"""
    experiment_type = config.get('experiment_type', 'formal')
    output_dir = config.get('output_dir', 'results')
    max_turns = config.get('max_turns', 5)
    batch_size = config.get('batch_size', 50)  # 每批次保存的游戏数
    
    main_exp_dir = f"{output_dir}/taboo_experiment_zengliang28_20250717_130855"
    os.makedirs(main_exp_dir, exist_ok=True)
    print(f"📁 主实验目录: {main_exp_dir}")
    
    # 全量实验配置：每个模型对遍历所有300个词
    print(f"📊 数据集词汇数: {len(dataset)}")
    print(f"🤖 模型组合数: {len(models)}×{len(models)} = {len(models)**2}")
    print(f"🎮 总游戏数: {len(dataset) * len(models)**2:,}")
    print(f"💾 批次大小: 每{batch_size}个游戏保存一个文件")
    
    all_experiment_results = []
    batch_files = []  # 记录所有批次文件路径
    
    # 按hinter模型分组执行
    for i, hinter_model in enumerate(models, 1):
        hinter_name = hinter_model.split('/')[-1]
        print(f"\\n🎯 第{i}/{len(models)}组: Hinter = {hinter_name}")
        
        # 为每个hinter模型创建子目录
        hinter_dir = f"{main_exp_dir}/{hinter_name}_as_hinter"
        os.makedirs(hinter_dir, exist_ok=True)
        
        # 运行当前hinter模型与所有guesser模型的组合
        hinter_results = []
        current_batch = []
        total_games_for_hinter = len(models) * len(dataset)
        game_counter = 0
        batch_counter = 0
        
        for guesser_model in models:
            guesser_name = guesser_model.split('/')[-1]
            pair_name = f"{hinter_name}→{guesser_name}"
            
            print(f"   🔄 运行组合: {pair_name}")
            
            # 遍历所有词汇
            for word_idx, word_data in enumerate(dataset):
                game_counter += 1
                
                target_word = word_data['target']
                taboo_words = word_data['taboo']
                
                start_time = time.time()
                
                # 执行游戏
                game_result = enhanced_play_taboo_game(client, hinter_model, guesser_model, 
                                                     target_word, taboo_words, max_turns)
                
                duration = round(time.time() - start_time, 2)
                
                # 记录结果
                result = {
                    'game_id': f"{hinter_name}_{game_counter}",
                    'word_index': word_idx,
                    'hinter_model': hinter_model,
                    'guesser_model': guesser_model,
                    'target_word': target_word,
                    'category': word_data.get('category', 'unknown'),
                    'taboo_words': '|'.join(taboo_words),
                    'success': game_result['success'],
                    'turns_used': game_result['turns'],
                    'final_guess': game_result['final_guess'],
                    'failure_reason': game_result.get('failure_reason', None),
                    'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                    'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                    'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                    'all_hints': ' | '.join(game_result['all_hints']),
                    'all_guesses': ' | '.join(game_result['all_guesses']),
                    'conversation': ' | '.join(game_result['conversation']),
                    'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                    'format_errors': ' | '.join(game_result.get('format_errors', [])),
                    'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                    'duration_seconds': duration,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                if 'error' in game_result:
                    result['error'] = game_result['error']
                
                hinter_results.append(result)
                current_batch.append(result)
                all_experiment_results.append(result)
                
                # 每batch_size个游戏保存一个批次文件
                if len(current_batch) >= batch_size:
                    batch_counter += 1
                    batch_file_path = f"{hinter_dir}/batch_{batch_counter:03d}.csv"
                    batch_df = pd.DataFrame(current_batch)
                    batch_df.to_csv(batch_file_path, index=False, encoding='utf-8')
                    batch_files.append(batch_file_path)
                    
                    # 进度显示
                    progress = (game_counter / total_games_for_hinter) * 100
                    success_in_batch = sum(r['success'] for r in current_batch)
                    batch_success_rate = success_in_batch / len(current_batch) * 100
                    
                    print(f"      💾 批次{batch_counter:03d}: {len(current_batch)}场游戏已保存")
                    print(f"      📈 进度: {game_counter}/{total_games_for_hinter} ({progress:.1f}%)")
                    print(f"      📊 批次成功率: {batch_success_rate:.1f}%")
                    
                    # 清空当前批次
                    current_batch = []
                
                time.sleep(0.3)  # API调用间隔
        
        # 保存剩余的游戏（如果有）
        if current_batch:
            batch_counter += 1
            batch_file_path = f"{hinter_dir}/batch_{batch_counter:03d}.csv"
            batch_df = pd.DataFrame(current_batch)
            batch_df.to_csv(batch_file_path, index=False, encoding='utf-8')
            batch_files.append(batch_file_path)
            
            success_in_batch = sum(r['success'] for r in current_batch)
            batch_success_rate = success_in_batch / len(current_batch) * 100
            print(f"      💾 最后批次{batch_counter:03d}: {len(current_batch)}场游戏已保存")
            print(f"      📊 批次成功率: {batch_success_rate:.1f}%")
        
        # 保存当前hinter模型的汇总结果
        hinter_df = pd.DataFrame(hinter_results)
        hinter_csv_path = f"{hinter_dir}/{hinter_name}_summary.csv"
        hinter_df.to_csv(hinter_csv_path, index=False, encoding='utf-8')
        
        # 统计当前hinter模型的结果
        success_count = sum(r['success'] for r in hinter_results)
        success_rate = success_count / len(hinter_results) * 100
        
        print(f"   ✅ {hinter_name}组完成: {len(hinter_results)}场游戏, 成功率: {success_rate:.1f}%")
        print(f"   💾 汇总结果已保存: {hinter_csv_path}")
        print(f"   📁 批次文件数: {batch_counter}个")
        
        # 失败原因统计
        print_failure_summary(hinter_df)
    
    # 保存全量实验的最终汇总结果
    final_csv_path = f"{main_exp_dir}/complete_experiment_results.csv"
    print(f"\n🔄 开始生成最终汇总文件...")
    print(f"📊 总批次文件数: {len(batch_files)}")
    
    return save_and_analyze_grouped_results(all_experiment_results, final_csv_path, main_exp_dir, models, batch_files)

def save_and_analyze_results(all_results, output_path, experiment_type):
    """保存并分析实验结果"""
    if all_results:
        df = pd.DataFrame(all_results)
        df.to_csv(output_path, index=False, encoding='utf-8')
        
        # 统计分析
        total_success = sum(r['success'] for r in all_results)
        success_rate = total_success / len(all_results) * 100
        
        print(f"\\n✅ {experiment_type}实验完成！")
        print(f"📁 结果文件: {output_path}")
        print(f"📊 总游戏数: {len(all_results):,}")
        print(f"📈 成功率: {success_rate:.1f}%")
        
        print_failure_summary(df)
        return df
    else:
        print("❌ 没有成功的实验记录")
        return None

def save_and_analyze_grouped_results(all_experiment_results, final_csv_path, main_exp_dir, models, batch_files=None):
    """保存并分析分组实验结果"""
    if all_experiment_results:
        final_df = pd.DataFrame(all_experiment_results)
        final_df.to_csv(final_csv_path, index=False, encoding='utf-8')
        
        # 全量实验统计
        total_success = sum(r['success'] for r in all_experiment_results)
        total_games = len(all_experiment_results)
        overall_success_rate = total_success / total_games * 100
        
        print(f"\\n🎉 全量实验完成！")
        print(f"📁 最终汇总文件: {final_csv_path}")
        print(f"📊 总游戏数: {total_games:,}场")
        print(f"📈 整体成功率: {overall_success_rate:.1f}%")
        
        if batch_files:
            print(f"📦 批次文件数: {len(batch_files)}个")
            print(f"💾 平均每批次: {total_games / len(batch_files):.1f}场游戏")
        
        # 按hinter模型的成功率统计
        print(f"\\n📊 各Hinter模型成功率:")
        for model in models:
            model_name = model.split('/')[-1]
            model_games = final_df[final_df['hinter_model'] == model]
            model_success = sum(model_games['success'])
            model_rate = model_success / len(model_games) * 100 if len(model_games) > 0 else 0
            print(f"   {model_name}: {model_success}/{len(model_games)} ({model_rate:.1f}%)")
        
        print_failure_summary(final_df, prefix="整体")
        print(f"\\n💾 所有数据已保存至目录: {main_exp_dir}")
        
        # 批次文件总结
        if batch_files:
            print(f"\\n📂 批次文件详情:")
            for batch_file in batch_files:
                file_name = os.path.basename(batch_file)
                print(f"   📄 {file_name}")
        
        return final_df
    else:
        print("❌ 全量实验失败，没有成功的游戏记录")
        return None

def print_failure_summary(df, prefix=""):
    """打印失败原因统计"""
    failed_games = df[df['success'] == False]
    if len(failed_games) > 0:
        title = f"{prefix}失败原因统计:" if prefix else "失败原因统计:"
        print(f"\\n📉 {title}")
        failure_counts = failed_games['failure_reason'].value_counts()
        for reason, count in failure_counts.items():
            percentage = count / len(failed_games) * 100
            if reason == 'TABOO_VIOLATION':
                print(f"   🚫 违反禁用词规则: {count} 场 ({percentage:.1f}%)")
            elif reason == 'FORMAT_FAILURE':
                print(f"   🔤 格式错误超限: {count} 场 ({percentage:.1f}%)")
            elif reason == 'API_FAILURE':
                print(f"   🌐 API调用失败: {count} 场 ({percentage:.1f}%)")
            elif reason == 'MAX_TURNS_EXCEEDED':
                print(f"   ⏱️ 轮数耗尽: {count} 场 ({percentage:.1f}%)")

print("✅ 支持函数已定义")


✅ 支持函数已定义


In [52]:
# 执行测试实验
print("🧪 开始执行测试实验...")

# 选择一个测试词
test_word_data = random.choice(dataset)

config = {
    'experiment_type': 'test',
    'experiment_mode': 'simple',  # 简单模式
    'max_turns': 5,
    'output_dir': 'results',
    'fixed_word': test_word_data
}

# 运行测试实验
test_results = run_taboo_experiment(client, TEST_MODELS, dataset, config)

if test_results is not None:
    print(f"✅ 测试实验完成，共{len(test_results)}场游戏")
    
    # 显示测试结果统计
    print("\n📊 测试结果统计:")
    for model in TEST_MODELS:
        model_name = model.split('/')[-1]
        model_as_hinter = test_results[test_results['hinter_model'] == model]
        model_as_guesser = test_results[test_results['guesser_model'] == model]
        
        hinter_success = sum(model_as_hinter['success']) if len(model_as_hinter) > 0 else 0
        guesser_success = sum(model_as_guesser['success']) if len(model_as_guesser) > 0 else 0
        
        print(f"   {model_name}: Hinter {hinter_success}/4, Guesser {guesser_success}/4")
else:
    print("❌ 测试实验失败")


🧪 开始执行测试实验...
🚀 开始执行test实验...
📁 输出路径: results/test_results_20250711_232922.csv
🎯 测试词: colouration
🚫 禁用词: ['colour', 'timber', 'timbre', 'capture', 'color']
📊 总游戏数: 16
🔄 游戏 1/16 (6.2%): gpt-4o→gpt-4o
   ✅ 成功 | 4轮 | 最终猜测: colouration
🔄 游戏 2/16 (12.5%): gpt-4o→gemini-2.5-pro
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 3/16 (18.8%): gpt-4o→deepseek-chat-v3-0324
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 4/16 (25.0%): gpt-4o→claude-sonnet-4
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 5/16 (31.2%): gemini-2.5-pro→gpt-4o
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 6/16 (37.5%): gemini-2.5-pro→gemini-2.5-pro
   ✅ 成功 | 1轮 | 最终猜测: colouration
🔄 游戏 7/16 (43.8%): gemini-2.5-pro→deepseek-chat-v3-0324
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 8/16 (50.0%): gemini-2.5-pro→claude-sonnet-4
   ✅ 成功 | 1轮 | 最终猜测: colouration
🔄 游戏 9/16 (56.2%): deepseek-chat-v3-0324→gpt-4o
   ✅ 成功 | 4轮 | 最终猜测: colouration
🔄 游戏 10/16 (62.5%): deepseek-chat-v3-0324→gemini-2.5-pro
   ✅ 成功 | 2轮 | 最终猜测: colouration
🔄 游戏 11/16 (68.8%): deepseek-chat-v3-03

In [None]:
# 执行全量实验（按hinter模型分组，批次保存）
print("🚀 开始执行全量实验...")
print("💡 新功能：批次保存机制")
print("   • 每300个游戏自动保存一个批次文件")
print("   • 即使实验中断，已完成的批次数据也会保留")
print("   • 方便监控实验进度和调试问题")

config = {
    'experiment_type': 'formal',
    'experiment_mode': 'grouped_by_hinter',  # 分组模式
    'max_turns': 5,
    'output_dir': 'results',
    'batch_size': 300  # 每300个游戏保存一个批次文件
}

# 运行全量实验
formal_results = run_taboo_experiment(client, TEST_MODELS, dataset, config)

if formal_results is not None:
    print(f"\\n🎉 全量实验完成！共{len(formal_results):,}场游戏")
    print("\\n💡 关键改进：")
    print("   ✅ 遍历所有300个词汇，而非随机选择")
    print("   ✅ 按hinter模型分组执行和保存")
    print("   ✅ 统一的实验架构，测试和全量共享代码")
    print("   ✅ 严格的taboo words违规检查")
    print("   ✅ 批次保存机制，每300个游戏保存一个文件")
    print("   ✅ 防止实验中断数据丢失，支持断点续传")
else:
    print("❌ 全量实验失败")


In [13]:
# 执行Quick80 WordNet数据集实验
print("🧪 开始执行Quick80 WordNet数据集实验...")

# 加载Quick80数据集
QUICK80_DATASET_PATH = "quick80_from_wordnet_only.json"

def load_quick80_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """加载Quick80 WordNet数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    return dataset

# 加载Quick80数据集
quick80_dataset = load_quick80_dataset(QUICK80_DATASET_PATH)
print(f"✅ Quick80数据集加载成功: {len(quick80_dataset)} 条记录")
print(f"📁 数据集路径: {QUICK80_DATASET_PATH}")

# 显示数据集样本
if quick80_dataset:
    sample = quick80_dataset[0]
    print(f"\n📋 Quick80数据样本:")
    print(f"   目标词: {sample['target']}")
    print(f"   词性: {sample['part_of_speech']}")
    print(f"   禁用词: {sample['taboo']}")
    print(f"   类别: {sample.get('category', 'N/A')}")
    if sample.get('senses'):
        print(f"   定义: {sample['senses'][0].get('definition', 'N/A')}")

# 统计Quick80数据集信息
print(f"\n📊 Quick80数据集统计:")
categories = {}
pos_counts = {}
taboo_counts = []

for item in quick80_dataset:
    # 统计类别
    category = item.get('category', 'unknown')
    categories[category] = categories.get(category, 0) + 1
    
    # 统计词性
    pos = item.get('part_of_speech', 'unknown')
    pos_counts[pos] = pos_counts.get(pos, 0) + 1
    
    # 统计禁用词数量
    taboo_counts.append(len(item.get('taboo', [])))

print(f"🏷️ 类别分布:")
for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
    percentage = count / len(quick80_dataset) * 100
    print(f"   {category}: {count} 条 ({percentage:.1f}%)")

print(f"\n📝 词性分布:")
for pos, count in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = count / len(quick80_dataset) * 100
    print(f"   {pos}: {count} 条 ({percentage:.1f}%)")

print(f"\n🚫 禁用词统计:")
print(f"   平均数量: {sum(taboo_counts) / len(taboo_counts):.1f}")
print(f"   范围: {min(taboo_counts)} - {max(taboo_counts)}")

print(f"\n🎲 设置随机种子为 42，确保实验可复现")
random.seed(42)


🧪 开始执行Quick80 WordNet数据集实验...
✅ Quick80数据集加载成功: 28 条记录
📁 数据集路径: quick80_from_wordnet_only.json

📋 Quick80数据样本:
   目标词: obtrusively
   词性: adv
   禁用词: ['manner', 'obtrusive', 'unobtrusively', 'noticeably', 'intrusively']
   类别: general
   定义: in an obtrusive manner

📊 Quick80数据集统计:
🏷️ 类别分布:
   general: 28 条 (100.0%)

📝 词性分布:
   adv: 20 条 (71.4%)
   verb: 8 条 (28.6%)

🚫 禁用词统计:
   平均数量: 5.0
   范围: 5 - 5

🎲 设置随机种子为 42，确保实验可复现


In [14]:
# Quick80全量实验预处理
print("🚀 准备执行Quick80数据集全量实验...")
print("🔄 实验模式: 全量遍历所有词汇")

print(f"\n📈 实验规模分析:")
print(f"   • 词汇总数: {len(quick80_dataset)} 个")
print(f"   • 词性分布: {dict(sorted(pos_counts.items(), key=lambda x: x[1], reverse=True))}")
print(f"   • 模型数量: {len(TEST_MODELS)} 个")
print(f"   • 每个词汇游戏数: {len(TEST_MODELS)**2} 场")
print(f"   • 总游戏数: {len(quick80_dataset) * len(TEST_MODELS)**2:,} 场")

# 检查API客户端是否可用
if client is None:
    print("❌ API客户端未初始化，无法执行实验")
else:
    print(f"\n✅ API客户端已就绪")
    print(f"🤖 参与实验的模型:")
    for i, model in enumerate(TEST_MODELS, 1):
        print(f"   {i}. {model}")
    
    print(f"\n🎮 实验执行计划:")
    print(f"   • 按Hinter模型分组执行")
    print(f"   • 每个Hinter模型: {len(TEST_MODELS)} × {len(quick80_dataset)} = {len(TEST_MODELS) * len(quick80_dataset)} 场游戏")
    print(f"   • 批次保存: 每50场游戏保存一个文件")
    print(f"   • 预计批次数: ~{(len(quick80_dataset) * len(TEST_MODELS)**2) // 50 + 1} 个文件")
    
    # 显示一些样本词汇
    print(f"\n📋 样本词汇预览:")
    sample_words = random.sample(quick80_dataset, min(3, len(quick80_dataset)))
    for i, word in enumerate(sample_words, 1):
        print(f"   {i}. {word['target']} ({word['part_of_speech']}) - 禁用词: {word['taboo'][:3]}...")


🚀 准备执行Quick80数据集全量实验...
🔄 实验模式: 全量遍历所有词汇

📈 实验规模分析:
   • 词汇总数: 28 个
   • 词性分布: {'adv': 20, 'verb': 8}
   • 模型数量: 4 个
   • 每个词汇游戏数: 16 场
   • 总游戏数: 448 场

✅ API客户端已就绪
🤖 参与实验的模型:
   1. openai/gpt-4o
   2. google/gemini-2.5-pro
   3. deepseek/deepseek-chat-v3-0324
   4. anthropic/claude-sonnet-4

🎮 实验执行计划:
   • 按Hinter模型分组执行
   • 每个Hinter模型: 4 × 28 = 112 场游戏
   • 批次保存: 每50场游戏保存一个文件
   • 预计批次数: ~9 个文件

📋 样本词汇预览:
   1. dispensed (verb) - 禁用词: ['bestow', 'parcel', 'allot']...
   2. past (adv) - 禁用词: ['given', 'point', 'pass']...
   3. obtrusively (adv) - 禁用词: ['manner', 'obtrusive', 'unobtrusively']...


In [15]:
# 执行Quick80全量实验
if client is not None:
    print("🚀 开始执行Quick80全量实验...")
    print("💡 实验规模:")
    print(f"   • 数据集规模: {len(quick80_dataset)} 个词汇")
    print(f"   • 模型组合: {len(TEST_MODELS)}×{len(TEST_MODELS)} = {len(TEST_MODELS)**2}")
    print(f"   • 总游戏数: {len(quick80_dataset) * len(TEST_MODELS)**2:,} 场")
    print(f"   • 预计时间: ~{len(quick80_dataset) * len(TEST_MODELS)**2 * 0.5 / 60:.1f} 分钟")
    
    # 配置Quick80全量实验参数
    quick80_full_config = {
        'experiment_type': 'quick80_full',
        'experiment_mode': 'grouped_by_hinter',  # 使用分组模式进行全量实验
        'max_turns': 5,
        'output_dir': 'results',
        'batch_size': 50  # 每50个游戏保存一个批次文件
    }
    
    print(f"\n📋 全量实验配置:")
    print(f"   实验类型: {quick80_full_config['experiment_type']}")
    print(f"   实验模式: {quick80_full_config['experiment_mode']}")
    print(f"   最大轮数: {quick80_full_config['max_turns']}")
    print(f"   批次大小: {quick80_full_config['batch_size']} 游戏/批次")
    print(f"   输出目录: {quick80_full_config['output_dir']}")
    
    # 运行Quick80全量实验，复用现有的实验框架
    quick80_results = run_taboo_experiment(client, TEST_MODELS, quick80_dataset, quick80_full_config)
    
    if quick80_results is not None:
        print(f"\n🎉 Quick80全量实验完成！")
        print(f"📊 实验规模总结: {len(quick80_results):,} 场游戏")
        
        # 总体成功率统计
        total_success = sum(quick80_results['success'])
        success_rate = total_success / len(quick80_results) * 100
        print(f"📈 总体成功率: {total_success:,}/{len(quick80_results):,} ({success_rate:.1f}%)")
        
        # 按Hinter模型统计成功率
        print(f"\n🎭 各Hinter模型表现:")
        for model in TEST_MODELS:
            model_name = model.split('/')[-1]
            model_games = quick80_results[quick80_results['hinter_model'] == model]
            model_success = sum(model_games['success'])
            model_rate = (model_success / len(model_games) * 100) if len(model_games) > 0 else 0
            print(f"   {model_name}: {model_success:,}/{len(model_games):,} ({model_rate:.1f}%)")
        
        # 按Guesser模型统计成功率
        print(f"\n🔍 各Guesser模型表现:")
        for model in TEST_MODELS:
            model_name = model.split('/')[-1]
            model_games = quick80_results[quick80_results['guesser_model'] == model]
            model_success = sum(model_games['success'])
            model_rate = (model_success / len(model_games) * 100) if len(model_games) > 0 else 0
            print(f"   {model_name}: {model_success:,}/{len(model_games):,} ({model_rate:.1f}%)")
        
        # 按词性分析成功率
        if 'part_of_speech' in quick80_results.columns or any('part_of_speech' in word for word in quick80_dataset):
            print(f"\n📝 按词性分析成功率:")
            # 需要从原始数据集匹配词性
            word_pos_map = {word['target']: word['part_of_speech'] for word in quick80_dataset}
            quick80_results['word_pos'] = quick80_results['target_word'].map(word_pos_map)
            
            for pos in quick80_results['word_pos'].unique():
                pos_games = quick80_results[quick80_results['word_pos'] == pos]
                pos_success = sum(pos_games['success'])
                pos_rate = (pos_success / len(pos_games) * 100) if len(pos_games) > 0 else 0
                print(f"   {pos}: {pos_success:,}/{len(pos_games):,} ({pos_rate:.1f}%)")
        
        # 失败原因分析
        failed_games = quick80_results[quick80_results['success'] == False]
        if len(failed_games) > 0:
            print(f"\n❌ 失败原因分析 ({len(failed_games):,} 场失败):")
            failure_reasons = failed_games['failure_reason'].value_counts()
            for reason, count in failure_reasons.items():
                percentage = count / len(failed_games) * 100
                if reason == 'TABOO_VIOLATION':
                    print(f"   🚫 违反禁用词规则: {count:,} 场 ({percentage:.1f}%)")
                elif reason == 'FORMAT_FAILURE':
                    print(f"   🔤 格式错误超限: {count:,} 场 ({percentage:.1f}%)")
                elif reason == 'API_FAILURE':
                    print(f"   🌐 API调用失败: {count:,} 场 ({percentage:.1f}%)")
                elif reason == 'MAX_TURNS_EXCEEDED':
                    print(f"   ⏱️ 轮数耗尽: {count:,} 场 ({percentage:.1f}%)")
                else:
                    print(f"   ❓ {reason}: {count:,} 场 ({percentage:.1f}%)")
        else:
            print(f"\n🎉 所有游戏都成功了！没有失败案例。")
        
        # 轮数效率分析
        successful_games = quick80_results[quick80_results['success'] == True]
        if len(successful_games) > 0:
            print(f"\n🔄 游戏轮数效率分析:")
            avg_turns = successful_games['turns_used'].mean()
            print(f"   平均轮数: {avg_turns:.2f} 轮")
            print(f"   轮数分布:")
            turn_counts = successful_games['turns_used'].value_counts().sort_index()
            for turns, count in turn_counts.items():
                percentage = count / len(successful_games) * 100
                print(f"     {turns}轮: {count:,} 场 ({percentage:.1f}%)")
        
        # 保存信息
        print(f"\n💾 实验数据保存信息:")
        print(f"   📁 主目录: results/taboo_experiment_[timestamp]/")
        print(f"   📄 按Hinter模型分组的批次文件")
        print(f"   📋 完整汇总文件: complete_experiment_results.csv")
        print(f"   🔍 建议查看各Hinter模型的子目录获取详细结果")
        
    else:
        print("❌ Quick80全量实验失败")
else:
    print("❌ 无法执行Quick80全量实验：API客户端未初始化")


🚀 开始执行Quick80全量实验...
💡 实验规模:
   • 数据集规模: 28 个词汇
   • 模型组合: 4×4 = 16
   • 总游戏数: 448 场
   • 预计时间: ~3.7 分钟

📋 全量实验配置:
   实验类型: quick80_full
   实验模式: grouped_by_hinter
   最大轮数: 5
   批次大小: 50 游戏/批次
   输出目录: results
📁 主实验目录: results/taboo_experiment_zengliang28_20250717_130855
📊 数据集词汇数: 28
🤖 模型组合数: 4×4 = 16
🎮 总游戏数: 448
💾 批次大小: 每50个游戏保存一个文件
\n🎯 第1/4组: Hinter = gpt-4o
   🔄 运行组合: gpt-4o→gpt-4o
   🔄 运行组合: gpt-4o→gemini-2.5-pro
      💾 批次001: 50场游戏已保存
      📈 进度: 50/112 (44.6%)
      📊 批次成功率: 52.0%
   🔄 运行组合: gpt-4o→deepseek-chat-v3-0324
   🔄 运行组合: gpt-4o→claude-sonnet-4
      💾 批次002: 50场游戏已保存
      📈 进度: 100/112 (89.3%)
      📊 批次成功率: 60.0%
      💾 最后批次003: 12场游戏已保存
      📊 批次成功率: 83.3%
   ✅ gpt-4o组完成: 112场游戏, 成功率: 58.9%
   💾 汇总结果已保存: results/taboo_experiment_zengliang28_20250717_130855/gpt-4o_as_hinter/gpt-4o_summary.csv
   📁 批次文件数: 3个
\n📉 失败原因统计:
   ⏱️ 轮数耗尽: 31 场 (67.4%)
   🔤 格式错误超限: 9 场 (19.6%)
   🚫 违反禁用词规则: 6 场 (13.0%)
\n🎯 第2/4组: Hinter = gemini-2.5-pro
   🔄 运行组合: gemini-2.5-pro→gpt-4o
   🔄