In [2]:
# 导入必要的库
import json
import pandas as pd
import random
import time
import requests
import os
from typing import Dict, List, Any
from datetime import datetime

# 加载数据集
def load_dataset(dataset_path: str = "quick80_dataset.json") -> List[Dict]:
    """加载Taboo数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 加载数据集
print("📚 正在加载数据集...")
dataset = load_dataset()
print(f"✅ 数据集加载完成，共{len(dataset)}条记录")

# 显示数据集样本
print("\n📋 数据集样本:")
sample = random.choice(dataset)
print(f"   目标词: {sample['target']}")
print(f"   类别: {sample.get('category', 'unknown')}")
print(f"   禁用词: {sample['taboo']}")
print(f"   词义数: {len(sample.get('senses', []))}")


📚 正在加载数据集...
✅ 数据集加载完成，共80条记录

📋 数据集样本:
   目标词: substance
   类别: philosophy
   禁用词: ['center', 'centre', 'kernel', 'marrow', 'matter']
   词义数: 7


In [3]:
# 数据集统计信息
print("📊 数据集基本统计:")
print("=" * 40)

# 类别分布统计
categories = {}
taboo_counts = []
sense_counts = []

for item in dataset:
    # 统计类别
    category = item.get('category', 'unknown')
    categories[category] = categories.get(category, 0) + 1
    
    # 统计禁用词数量
    taboo_counts.append(len(item.get('taboo', [])))
    
    # 统计词义数量
    sense_counts.append(len(item.get('senses', [])))

print(f"\n🏷️ 类别分布 (Top 5):")
sorted_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)
for i, (category, count) in enumerate(sorted_categories[:5], 1):
    percentage = count / len(dataset) * 100
    print(f"   {i}. {category}: {count} 条 ({percentage:.1f}%)")

print(f"\n🚫 禁用词统计:")
print(f"   平均数量: {sum(taboo_counts) / len(taboo_counts):.1f}")
print(f"   范围: {min(taboo_counts)} - {max(taboo_counts)}")

print(f"\n💭 词义统计:")
print(f"   平均数量: {sum(sense_counts) / len(sense_counts):.1f}")
print(f"   范围: {min(sense_counts)} - {max(sense_counts)}")

print(f"\n✅ 数据集统计完成，质量良好，可用于实验")

# 设置随机种子用于实验
random.seed(240)
print("\n🎲 随机种子已设置为 240，确保实验可复现")


📊 数据集基本统计:

🏷️ 类别分布 (Top 5):
   1. general: 55 条 (68.8%)
   2. philosophy: 8 条 (10.0%)
   3. finance: 7 条 (8.8%)
   4. chemistry: 5 条 (6.2%)
   5. cs: 5 条 (6.2%)

🚫 禁用词统计:
   平均数量: 5.0
   范围: 5 - 5

💭 词义统计:
   平均数量: 1.9
   范围: 1 - 23

✅ 数据集统计完成，质量良好，可用于实验

🎲 随机种子已设置为 240，确保实验可复现


In [4]:
# 设置API客户端
def load_api_keys(keys_path: str = "api_keys.json") -> Dict[str, str]:
    """加载API密钥"""
    with open(keys_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class OpenRouterClient:
    """OpenRouter API客户端"""
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
    
    def call_model(self, model: str, messages: List[Dict[str, str]], temperature: float = 0.3) -> str:
        """调用模型API"""
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": 2000
        }
        response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content'].strip()
        
        # 防止乱码：只保留ASCII可打印字符
        import re
        content = re.sub(r'[^\x20-\x7E]', '', content)
        return content

# 初始化API客户端
try:
    api_keys = load_api_keys()
    client = OpenRouterClient(api_keys["OPENROUTER_API_KEY"])
    print("✅ API客户端初始化成功")
except Exception as e:
    print(f"❌ API客户端初始化失败: {e}")
    client = None

# 定义测试模型 - 包含Kimi模型
TEST_MODELS = [
    "openai/gpt-4o",
    "google/gemini-2.5-flash", 
    "deepseek/deepseek-chat-v3-0324",
    "anthropic/claude-sonnet-4"
]

# Kimi实验专用模型列表 (5个模型)
KIMI_MODELS = [
    "moonshotai/kimi-k2",  # Kimi模型
    "openai/gpt-4o",
    "google/gemini-2.5-flash", 
    "deepseek/deepseek-chat-v3-0324",
    "anthropic/claude-sonnet-4"
]

print(f"🤖 原始测试模型: {len(TEST_MODELS)} 个")
for i, model in enumerate(TEST_MODELS, 1):
    print(f"   {i}. {model}")

print(f"\n🌙 Kimi实验模型: {len(KIMI_MODELS)} 个")
for i, model in enumerate(KIMI_MODELS, 1):
    print(f"   {i}. {model}")
    
print(f"\n📊 Kimi实验配置:")
print(f"   • 总模型数: {len(KIMI_MODELS)}")
print(f"   • 模型对组合: {len(KIMI_MODELS)}×{len(KIMI_MODELS)} = {len(KIMI_MODELS)**2}")
print(f"   • 包含自对自组合: {len(KIMI_MODELS)}个")
print(f"   • 5+5-1对实验: 我们将运行所有25对组合 ✅")


✅ API客户端初始化成功
🤖 原始测试模型: 4 个
   1. openai/gpt-4o
   2. google/gemini-2.5-flash
   3. deepseek/deepseek-chat-v3-0324
   4. anthropic/claude-sonnet-4

🌙 Kimi实验模型: 5 个
   1. moonshotai/kimi-k2
   2. openai/gpt-4o
   3. google/gemini-2.5-flash
   4. deepseek/deepseek-chat-v3-0324
   5. anthropic/claude-sonnet-4

📊 Kimi实验配置:
   • 总模型数: 5
   • 模型对组合: 5×5 = 25
   • 包含自对自组合: 5个
   • 5+5-1对实验: 我们将运行所有25对组合 ✅


In [5]:
import json
import random
import time
import requests
import pandas as pd
from typing import Dict, List, Any
from datetime import datetime
import os

# 加载数据集
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """加载Taboo游戏数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    return dataset

# 加载预生成的数据集
DATASET_PATH = "quick80_dataset.json"
dataset = load_dataset(DATASET_PATH)
print(f"✅ 数据集加载成功: {len(dataset)} 条记录")
print(f"📁 数据集路径: {DATASET_PATH}")

# 显示第一个样本
if dataset:
    sample = dataset[0]
    print(f"\n📋 数据样本:")
    print(f"   目标词: {sample['target']}")
    print(f"   禁用词: {sample['taboo']}")
    print(f"   类别: {sample.get('category', 'N/A')}")
    if sample.get('senses'):
        print(f"   定义: {sample['senses'][0].get('definition', 'N/A')[:100]}...")


✅ 数据集加载成功: 80 条记录
📁 数据集路径: quick80_dataset.json

📋 数据样本:
   目标词: behaviorism
   禁用词: ['approach', 'behavior', 'emphasizes', 'measurable', 'observable']
   类别: philosophy
   定义: an approach to psychology that emphasizes observable measurable behavior...


In [6]:
import json
import random
import time
import requests
import pandas as pd
from typing import Dict, List, Any
from datetime import datetime
import os

# 加载数据集
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """加载Taboo游戏数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    return dataset

# 加载预生成的数据集
DATASET_PATH = "data/dataset.json"
dataset = load_dataset(DATASET_PATH)
print(f"✅ 数据集加载成功: {len(dataset)} 条记录")
print(f"📁 数据集路径: {DATASET_PATH}")

# 显示第一个样本
if dataset:
    sample = dataset[0]
    print(f"\n📋 数据样本:")
    print(f"   目标词: {sample['target']}")
    print(f"   禁用词: {sample['taboo']}")
    print(f"   类别: {sample.get('category', 'N/A')}")
    if sample.get('senses'):
        print(f"   定义: {sample['senses'][0].get('definition', 'N/A')[:100]}...")


✅ 数据集加载成功: 300 条记录
📁 数据集路径: data/dataset.json

📋 数据样本:
   目标词: crotonbug
   禁用词: ['common', 'croton', 'europe', 'german', 'states']
   类别: general
   定义: small light-brown cockroach brought to United States from Europe; a common household pest...


In [7]:
# 核心游戏逻辑和通用实验运行器

def enhanced_play_taboo_game(client, hinter_model, guesser_model, target_word, taboo_words, max_turns=5):
    """增强版Taboo游戏，包含taboo violation检查"""
    
    conversation_history = []
    taboo_list = ', '.join(taboo_words)
    
    # 系统提示
    hinter_system = f"""You are playing a Taboo word game as the Hinter.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[CLUE]' (including square brackets)
- Do NOT add any text before [CLUE]
- Example: [CLUE] This is a large mammal with a trunk

GAME RULES:
1. NEVER use any of these taboo words: {taboo_list}
2. Give helpful clues to help the guesser find the target word
3. You can see the conversation history and build on previous clues
4. Be creative but follow the format exactly
5. Use BRITISH ENGLISH spelling (e.g., colour not color, centre not center)

Target word: {target_word}
Taboo words (ABSOLUTELY FORBIDDEN): {taboo_list}"""

    guesser_system = """You are playing a Taboo word game as the Guesser.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[GUESS]' (including square brackets)
- Do NOT add any text before [GUESS]
- Example: [GUESS] elephant

GAME RULES:
1. Make your best guess based on all the clues you've received
2. You can see the conversation history
3. Give only ONE word as your guess after [GUESS]
4. Use BRITISH ENGLISH spelling (e.g., colour not color, centre not center)"""

    # 记录统计信息
    total_hinter_attempts = 0
    total_guesser_attempts = 0
    format_errors = []
    hinter_failed_outputs = []
    guesser_failed_outputs = []

    for turn in range(1, max_turns + 1):
        # 构建Hinter提示
        if turn == 1:
            hinter_prompt = f"{hinter_system}\n\nProvide your first clue:"
        else:
            history_text = "\n".join([f"Turn {i}: {msg}" for i, msg in enumerate(conversation_history, 1)])
            hinter_prompt = f"{hinter_system}\n\nConversation so far:\n{history_text}\n\nThe guesser hasn't found the word yet. Provide your next clue:"
        
        # Hinter给出线索（带重试）
        hinter_result = robust_api_call(client, hinter_model, hinter_prompt, "[CLUE]", max_retries=3)
        total_hinter_attempts += hinter_result['attempts']
        
        if hinter_result.get('failed_outputs'):
            hinter_failed_outputs.extend(hinter_result['failed_outputs'])
        
        if not hinter_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(hinter_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"Turn {turn} Hinter: {hinter_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"HINTER_{error_type}",
                'error': f"{error_type}: {hinter_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        # 提取线索并检查taboo violation
        hint_text = extract_clue_text(hinter_result['response'])
        
        # 🚨 关键：检查是否违反taboo words规则
        taboo_violated = check_taboo_violation(hint_text, taboo_words)
        if taboo_violated:
            # 违规立即失败！
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': 'TABOO_VIOLATION: Hinter违反规则',
                'error': f'TABOO_VIOLATION: Hinter在第{turn}轮违反规则，说了禁用词: {hint_text}',
                'failure_reason': 'TABOO_VIOLATION',
                'taboo_violation_turn': turn,
                'taboo_violation_hint': hint_text,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        conversation_history.append(f"Hinter: {hinter_result['response']}")
        
        # 构建Guesser提示
        history_text = "\n".join([f"Turn {i}: {msg}" for i, msg in enumerate(conversation_history, 1)])
        guesser_prompt = f"{guesser_system}\n\nConversation so far:\n{history_text}\n\nWhat is your guess?"
        
        # Guesser进行猜测（带重试）
        guesser_result = robust_api_call(client, guesser_model, guesser_prompt, "[GUESS]", max_retries=3)
        total_guesser_attempts += guesser_result['attempts']
        
        if guesser_result.get('failed_outputs'):
            guesser_failed_outputs.extend(guesser_result['failed_outputs'])
        
        if not guesser_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(guesser_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"Turn {turn} Guesser: {guesser_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"GUESSER_{error_type}",
                'error': f"{error_type}: {guesser_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        conversation_history.append(f"Guesser: {guesser_result['response']}")
        guess = extract_guess_word(guesser_result['response'])
        
        # 检查是否成功
        if guess.lower() == target_word.lower():
            return {
                'success': True,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': guess,
                'failure_reason': None,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
            }
        
        # 如果不是最后一轮，添加反馈
        if turn < max_turns:
            conversation_history.append(f"System: '{guess}' is not correct. Try again!")
    
    # 达到最大轮数仍未成功
    return {
        'success': False,
        'turns': max_turns,
        'conversation': conversation_history,
        'final_guess': guess if 'guess' in locals() else 'N/A',
        'failure_reason': 'MAX_TURNS_EXCEEDED',
        'total_hinter_attempts': total_hinter_attempts,
        'total_guesser_attempts': total_guesser_attempts,
        'format_errors': format_errors,
        'hinter_failed_outputs': hinter_failed_outputs,
        'guesser_failed_outputs': guesser_failed_outputs,
        'all_hints': [msg for msg in conversation_history if msg.startswith('Hinter:')],
        'all_guesses': [msg for msg in conversation_history if msg.startswith('Guesser:')]
    }

print("✅ 增强版游戏函数已定义（包含严格的taboo violation检查）")


✅ 增强版游戏函数已定义（包含严格的taboo violation检查）


In [8]:
# 统一的Taboo实验运行器
def run_taboo_experiment(client, models, dataset, config):
    """统一的Taboo实验运行器，支持测试和全量模式"""
    
    # 配置参数
    experiment_type = config.get('experiment_type', 'test')
    experiment_mode = config.get('experiment_mode', 'simple')  # 'simple' 或 'grouped_by_hinter'
    max_turns = config.get('max_turns', 5)
    output_dir = config.get('output_dir', 'results')
    fixed_word = config.get('fixed_word', None)
    
    # 创建输出目录
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    if experiment_mode == 'grouped_by_hinter':
        return run_grouped_experiment(client, models, dataset, config, timestamp)
    else:
        return run_simple_experiment(client, models, dataset, config, timestamp)

def run_simple_experiment(client, models, dataset, config, timestamp):
    """简单模式：测试实验，使用固定词汇"""
    experiment_type = config.get('experiment_type', 'test')
    output_dir = config.get('output_dir', 'results')
    fixed_word = config.get('fixed_word', None)
    max_turns = config.get('max_turns', 5)
    
    # 输出设置
    output_path = f"{output_dir}/test_results_{timestamp}.csv"
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"🚀 开始执行{experiment_type}实验...")
    print(f"📁 输出路径: {output_path}")
    
    # 使用固定词汇
    if not fixed_word:
        fixed_word = random.choice(dataset)
    
    target_word = fixed_word['target']
    taboo_words = fixed_word['taboo']
    print(f"🎯 测试词: {target_word}")
    print(f"🚫 禁用词: {taboo_words}")
    
    total_games = len(models) ** 2  # 每个模型对组合1场游戏
    print(f"📊 总游戏数: {total_games}")
    
    all_results = []
    game_counter = 0
    
    # 运行所有模型组合
    for hinter_model in models:
        for guesser_model in models:
            game_counter += 1
            pair_name = f"{hinter_model.split('/')[-1]}→{guesser_model.split('/')[-1]}"
            
            print(f"🔄 游戏 {game_counter}/{total_games} ({game_counter/total_games*100:.1f}%): {pair_name}")
            
            start_time = time.time()
            
            # 执行游戏
            game_result = enhanced_play_taboo_game(client, hinter_model, guesser_model, 
                                                 target_word, taboo_words, max_turns)
            
            duration = round(time.time() - start_time, 2)
            
            # 记录结果
            result = {
                'game_id': game_counter,
                'hinter_model': hinter_model,
                'guesser_model': guesser_model,
                'target_word': target_word,
                'category': fixed_word.get('category', 'unknown'),
                'taboo_words': '|'.join(taboo_words),
                'success': game_result['success'],
                'turns_used': game_result['turns'],
                'final_guess': game_result['final_guess'],
                'failure_reason': game_result.get('failure_reason', None),
                'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                'all_hints': ' | '.join(game_result['all_hints']),
                'all_guesses': ' | '.join(game_result['all_guesses']),
                'conversation': ' | '.join(game_result['conversation']),
                'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                'format_errors': ' | '.join(game_result.get('format_errors', [])),
                'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                'duration_seconds': duration,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            
            if 'error' in game_result:
                result['error'] = game_result['error']
            
            all_results.append(result)
            
            # 显示结果
            status = "✅ 成功" if game_result['success'] else "❌ 失败"
            failure_info = ""
            if not game_result['success'] and game_result.get('failure_reason'):
                failure_reason = game_result['failure_reason']
                if failure_reason == 'TABOO_VIOLATION':
                    failure_info = " (违反禁用词规则)"
                elif failure_reason == 'FORMAT_FAILURE':
                    failure_info = " (格式错误超3次)"
                elif failure_reason == 'API_FAILURE':
                    failure_info = " (API调用失败)"
                elif failure_reason == 'MAX_TURNS_EXCEEDED':
                    failure_info = " (达到最大轮数)"
            
            print(f"   {status}{failure_info} | {game_result['turns']}轮 | 最终猜测: {game_result['final_guess']}")
            time.sleep(0.5)
    
    return save_and_analyze_results(all_results, output_path, experiment_type)

print("✅ 统一实验运行器已定义")


✅ 统一实验运行器已定义


In [9]:
# 支持函数 - 全量实验和结果分析
def run_grouped_experiment(client, models, dataset, config, timestamp):
    """分组模式：全量实验，按hinter模型分组，遍历所有词汇，每50个游戏保存一个批次文件"""
    experiment_type = config.get('experiment_type', 'formal')
    output_dir = config.get('output_dir', 'results')
    max_turns = config.get('max_turns', 5)
    batch_size = config.get('batch_size', 50)  # 每批次保存的游戏数
    
    main_exp_dir = f"{output_dir}/taboo_experiment_{timestamp}"
    os.makedirs(main_exp_dir, exist_ok=True)
    print(f"📁 主实验目录: {main_exp_dir}")
    
    # 全量实验配置：每个模型对遍历所有300个词
    print(f"📊 数据集词汇数: {len(dataset)}")
    print(f"🤖 模型组合数: {len(models)}×{len(models)} = {len(models)**2}")
    print(f"🎮 总游戏数: {len(dataset) * len(models)**2:,}")
    print(f"💾 批次大小: 每{batch_size}个游戏保存一个文件")
    
    all_experiment_results = []
    batch_files = []  # 记录所有批次文件路径
    
    # 按hinter模型分组执行
    for i, hinter_model in enumerate(models, 1):
        hinter_name = hinter_model.split('/')[-1]
        print(f"\\n🎯 第{i}/{len(models)}组: Hinter = {hinter_name}")
        
        # 为每个hinter模型创建子目录
        hinter_dir = f"{main_exp_dir}/{hinter_name}_as_hinter"
        os.makedirs(hinter_dir, exist_ok=True)
        
        # 运行当前hinter模型与所有guesser模型的组合
        hinter_results = []
        current_batch = []
        total_games_for_hinter = len(models) * len(dataset)
        game_counter = 0
        batch_counter = 0
        
        for guesser_model in models:
            guesser_name = guesser_model.split('/')[-1]
            pair_name = f"{hinter_name}→{guesser_name}"
            
            print(f"   🔄 运行组合: {pair_name}")
            
            # 遍历所有词汇
            for word_idx, word_data in enumerate(dataset):
                game_counter += 1
                
                target_word = word_data['target']
                taboo_words = word_data['taboo']
                
                start_time = time.time()
                
                # 执行游戏
                game_result = enhanced_play_taboo_game(client, hinter_model, guesser_model, 
                                                     target_word, taboo_words, max_turns)
                
                duration = round(time.time() - start_time, 2)
                
                # 记录结果
                result = {
                    'game_id': f"{hinter_name}_{game_counter}",
                    'word_index': word_idx,
                    'hinter_model': hinter_model,
                    'guesser_model': guesser_model,
                    'target_word': target_word,
                    'category': word_data.get('category', 'unknown'),
                    'taboo_words': '|'.join(taboo_words),
                    'success': game_result['success'],
                    'turns_used': game_result['turns'],
                    'final_guess': game_result['final_guess'],
                    'failure_reason': game_result.get('failure_reason', None),
                    'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                    'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                    'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                    'all_hints': ' | '.join(game_result['all_hints']),
                    'all_guesses': ' | '.join(game_result['all_guesses']),
                    'conversation': ' | '.join(game_result['conversation']),
                    'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                    'format_errors': ' | '.join(game_result.get('format_errors', [])),
                    'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                    'duration_seconds': duration,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                if 'error' in game_result:
                    result['error'] = game_result['error']
                
                hinter_results.append(result)
                current_batch.append(result)
                all_experiment_results.append(result)
                
                # 每batch_size个游戏保存一个批次文件
                if len(current_batch) >= batch_size:
                    batch_counter += 1
                    batch_file_path = f"{hinter_dir}/batch_{batch_counter:03d}.csv"
                    batch_df = pd.DataFrame(current_batch)
                    batch_df.to_csv(batch_file_path, index=False, encoding='utf-8')
                    batch_files.append(batch_file_path)
                    
                    # 进度显示
                    progress = (game_counter / total_games_for_hinter) * 100
                    success_in_batch = sum(r['success'] for r in current_batch)
                    batch_success_rate = success_in_batch / len(current_batch) * 100
                    
                    print(f"      💾 批次{batch_counter:03d}: {len(current_batch)}场游戏已保存")
                    print(f"      📈 进度: {game_counter}/{total_games_for_hinter} ({progress:.1f}%)")
                    print(f"      📊 批次成功率: {batch_success_rate:.1f}%")
                    
                    # 清空当前批次
                    current_batch = []
                
                time.sleep(0.3)  # API调用间隔
        
        # 保存剩余的游戏（如果有）
        if current_batch:
            batch_counter += 1
            batch_file_path = f"{hinter_dir}/batch_{batch_counter:03d}.csv"
            batch_df = pd.DataFrame(current_batch)
            batch_df.to_csv(batch_file_path, index=False, encoding='utf-8')
            batch_files.append(batch_file_path)
            
            success_in_batch = sum(r['success'] for r in current_batch)
            batch_success_rate = success_in_batch / len(current_batch) * 100
            print(f"      💾 最后批次{batch_counter:03d}: {len(current_batch)}场游戏已保存")
            print(f"      📊 批次成功率: {batch_success_rate:.1f}%")
        
        # 保存当前hinter模型的汇总结果
        hinter_df = pd.DataFrame(hinter_results)
        hinter_csv_path = f"{hinter_dir}/{hinter_name}_summary.csv"
        hinter_df.to_csv(hinter_csv_path, index=False, encoding='utf-8')
        
        # 统计当前hinter模型的结果
        success_count = sum(r['success'] for r in hinter_results)
        success_rate = success_count / len(hinter_results) * 100
        
        print(f"   ✅ {hinter_name}组完成: {len(hinter_results)}场游戏, 成功率: {success_rate:.1f}%")
        print(f"   💾 汇总结果已保存: {hinter_csv_path}")
        print(f"   📁 批次文件数: {batch_counter}个")
        
        # 失败原因统计
        print_failure_summary(hinter_df)
    
    # 保存全量实验的最终汇总结果
    final_csv_path = f"{main_exp_dir}/complete_experiment_results.csv"
    print(f"\n🔄 开始生成最终汇总文件...")
    print(f"📊 总批次文件数: {len(batch_files)}")
    
    return save_and_analyze_grouped_results(all_experiment_results, final_csv_path, main_exp_dir, models, batch_files)

def save_and_analyze_results(all_results, output_path, experiment_type):
    """保存并分析实验结果"""
    if all_results:
        df = pd.DataFrame(all_results)
        df.to_csv(output_path, index=False, encoding='utf-8')
        
        # 统计分析
        total_success = sum(r['success'] for r in all_results)
        success_rate = total_success / len(all_results) * 100
        
        print(f"\\n✅ {experiment_type}实验完成！")
        print(f"📁 结果文件: {output_path}")
        print(f"📊 总游戏数: {len(all_results):,}")
        print(f"📈 成功率: {success_rate:.1f}%")
        
        print_failure_summary(df)
        return df
    else:
        print("❌ 没有成功的实验记录")
        return None

def save_and_analyze_grouped_results(all_experiment_results, final_csv_path, main_exp_dir, models, batch_files=None):
    """保存并分析分组实验结果"""
    if all_experiment_results:
        final_df = pd.DataFrame(all_experiment_results)
        final_df.to_csv(final_csv_path, index=False, encoding='utf-8')
        
        # 全量实验统计
        total_success = sum(r['success'] for r in all_experiment_results)
        total_games = len(all_experiment_results)
        overall_success_rate = total_success / total_games * 100
        
        print(f"\\n🎉 全量实验完成！")
        print(f"📁 最终汇总文件: {final_csv_path}")
        print(f"📊 总游戏数: {total_games:,}场")
        print(f"📈 整体成功率: {overall_success_rate:.1f}%")
        
        if batch_files:
            print(f"📦 批次文件数: {len(batch_files)}个")
            print(f"💾 平均每批次: {total_games / len(batch_files):.1f}场游戏")
        
        # 按hinter模型的成功率统计
        print(f"\\n📊 各Hinter模型成功率:")
        for model in models:
            model_name = model.split('/')[-1]
            model_games = final_df[final_df['hinter_model'] == model]
            model_success = sum(model_games['success'])
            model_rate = model_success / len(model_games) * 100 if len(model_games) > 0 else 0
            print(f"   {model_name}: {model_success}/{len(model_games)} ({model_rate:.1f}%)")
        
        print_failure_summary(final_df, prefix="整体")
        print(f"\\n💾 所有数据已保存至目录: {main_exp_dir}")
        
        # 批次文件总结
        if batch_files:
            print(f"\\n📂 批次文件详情:")
            for batch_file in batch_files:
                file_name = os.path.basename(batch_file)
                print(f"   📄 {file_name}")
        
        return final_df
    else:
        print("❌ 全量实验失败，没有成功的游戏记录")
        return None

def print_failure_summary(df, prefix=""):
    """打印失败原因统计"""
    failed_games = df[df['success'] == False]
    if len(failed_games) > 0:
        title = f"{prefix}失败原因统计:" if prefix else "失败原因统计:"
        print(f"\\n📉 {title}")
        failure_counts = failed_games['failure_reason'].value_counts()
        for reason, count in failure_counts.items():
            percentage = count / len(failed_games) * 100
            if reason == 'TABOO_VIOLATION':
                print(f"   🚫 违反禁用词规则: {count} 场 ({percentage:.1f}%)")
            elif reason == 'FORMAT_FAILURE':
                print(f"   🔤 格式错误超限: {count} 场 ({percentage:.1f}%)")
            elif reason == 'API_FAILURE':
                print(f"   🌐 API调用失败: {count} 场 ({percentage:.1f}%)")
            elif reason == 'MAX_TURNS_EXCEEDED':
                print(f"   ⏱️ 轮数耗尽: {count} 场 ({percentage:.1f}%)")

print("✅ 支持函数已定义")


✅ 支持函数已定义


In [10]:
# 加载80个样本的数据集
def load_quick80_dataset():
    """加载80个样本的快速测试数据集"""
    quick80_path = "quick80_dataset.json"
    try:
        with open(quick80_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ 找不到文件: {quick80_path}")
        return None

print("📚 正在加载80个样本数据集...")
quick80_dataset = load_quick80_dataset()

if quick80_dataset:
    print(f"✅ Quick80数据集加载成功: {len(quick80_dataset)} 条记录")
    
    # 显示数据集样本
    print(f"\n📋 Quick80数据集样本:")
    sample = quick80_dataset[0]
    print(f"   目标词: {sample['target']}")
    print(f"   类别: {sample.get('category', 'unknown')}")
    print(f"   禁用词: {sample['taboo']}")
    if sample.get('senses'):
        print(f"   定义: {sample['senses'][0].get('definition', 'N/A')[:100]}...")
        
    # 统计类别分布
    categories = {}
    for item in quick80_dataset:
        category = item.get('category', 'unknown')
        categories[category] = categories.get(category, 0) + 1
    
    print(f"\n🏷️ Quick80类别分布:")
    for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
        percentage = count / len(quick80_dataset) * 100
        print(f"   {category}: {count} 条 ({percentage:.1f}%)")
else:
    print("❌ Quick80数据集加载失败，使用完整数据集的前80个样本作为备选")
    if 'dataset' in locals() and len(dataset) >= 80:
        quick80_dataset = dataset[:80]
        print(f"✅ 使用完整数据集前80个样本: {len(quick80_dataset)} 条记录")
    else:
        print("❌ 完整数据集也不可用，无法进行实验")
        quick80_dataset = None


📚 正在加载80个样本数据集...
✅ Quick80数据集加载成功: 80 条记录

📋 Quick80数据集样本:
   目标词: behaviorism
   类别: philosophy
   禁用词: ['approach', 'behavior', 'emphasizes', 'measurable', 'observable']
   定义: an approach to psychology that emphasizes observable measurable behavior...

🏷️ Quick80类别分布:
   general: 55 条 (68.8%)
   philosophy: 8 条 (10.0%)
   finance: 7 条 (8.8%)
   chemistry: 5 条 (6.2%)
   cs: 5 条 (6.2%)


In [11]:
# 🌙 Kimi实验 - 简洁版本（替代重复的多个cell）
print("🌙 Kimi实验配置")
print("=" * 50)

# 检查先决条件
if not quick80_dataset:
    print("❌ Quick80数据集未加载")
elif not client:
    print("❌ API客户端未初始化")
else:
    # 创建9对模型组合
    kimi_model = "moonshotai/kimi-k2"
    other_models = [m for m in KIMI_MODELS if m != kimi_model]
    
    model_pairs = []
    # Kimi作为hinter (5对)
    for guesser in KIMI_MODELS:
        model_pairs.append((kimi_model, guesser))
    # 其他模型作为hinter，Kimi作为guesser (4对)
    for hinter in other_models:
        model_pairs.append((hinter, kimi_model))
    
    print(f"📊 实验配置:")
    print(f"   • 数据集: {len(quick80_dataset)} 个词汇 ✅")
    print(f"   • 模型对: {len(model_pairs)} 对组合 ✅")
    print(f"   • 总游戏: {len(quick80_dataset) * len(model_pairs)} 场")
    
    print(f"\\n📋 9对组合详情:")
    for i, (h, g) in enumerate(model_pairs, 1):
        h_name, g_name = h.split('/')[-1], g.split('/')[-1]
        role = ""
        if h == kimi_model and g == kimi_model:
            role = " (Kimi自对自)"
        elif h == kimi_model:
            role = " (Kimi作Hinter)"
        elif g == kimi_model:
            role = " (Kimi作Guesser)"
        print(f"   {i:2d}. {h_name} → {g_name}{role}")
    
    print(f"\\n✅ Kimi实验配置完成！")
    print(f"🧹 建议: 删除Cell 15-18的重复内容，只保留这个版本。")
    print(f"📝 下一步: 实现实际的实验执行代码。")


🌙 Kimi实验配置
📊 实验配置:
   • 数据集: 80 个词汇 ✅
   • 模型对: 9 对组合 ✅
   • 总游戏: 720 场
\n📋 9对组合详情:
    1. kimi-k2 → kimi-k2 (Kimi自对自)
    2. kimi-k2 → gpt-4o (Kimi作Hinter)
    3. kimi-k2 → gemini-2.5-flash (Kimi作Hinter)
    4. kimi-k2 → deepseek-chat-v3-0324 (Kimi作Hinter)
    5. kimi-k2 → claude-sonnet-4 (Kimi作Hinter)
    6. gpt-4o → kimi-k2 (Kimi作Guesser)
    7. gemini-2.5-flash → kimi-k2 (Kimi作Guesser)
    8. deepseek-chat-v3-0324 → kimi-k2 (Kimi作Guesser)
    9. claude-sonnet-4 → kimi-k2 (Kimi作Guesser)
\n✅ Kimi实验配置完成！
🧹 建议: 删除Cell 15-18的重复内容，只保留这个版本。
📝 下一步: 实现实际的实验执行代码。


In [12]:
# 添加缺失的辅助函数
import re

def robust_api_call(client, model, prompt, expected_format, max_retries=3):
    """带重试的API调用函数"""
    attempts = 0
    failed_outputs = []
    
    for attempt in range(max_retries):
        attempts += 1
        try:
            messages = [{"role": "user", "content": prompt}]
            response = client.call_model(model, messages)
            
            # 检查格式
            if expected_format in response:
                return {
                    'success': True,
                    'response': response,
                    'attempts': attempts,
                    'failed_outputs': failed_outputs
                }
            else:
                failed_outputs.append(response)
                if attempt < max_retries - 1:
                    time.sleep(1)  # 重试前等待
                    continue
                    
        except Exception as e:
            failed_outputs.append(f"API Error: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # API错误后等待更长时间
                continue
    
    # 所有重试都失败
    return {
        'success': False,
        'response': f"FORMAT_ERROR_EXCEEDED_{max_retries}",
        'error': f"Format check failed after {max_retries} attempts",
        'attempts': attempts,
        'failed_outputs': failed_outputs
    }

def extract_clue_text(hinter_response):
    """从hinter响应中提取线索文本"""
    if '[CLUE]' in hinter_response:
        return hinter_response.split('[CLUE]', 1)[1].strip()
    return hinter_response.strip()

def extract_guess_word(guesser_response):
    """从guesser响应中提取猜测词汇"""
    if '[GUESS]' in guesser_response:
        guess_part = guesser_response.split('[GUESS]', 1)[1].strip()
        # 提取第一个单词
        words = guess_part.split()
        return words[0] if words else guess_part
    return guesser_response.strip().split()[0] if guesser_response.strip() else ""

def check_taboo_violation(hint_text, taboo_words):
    """检查线索是否违反taboo words规则"""
    hint_lower = hint_text.lower()
    for taboo_word in taboo_words:
        taboo_lower = taboo_word.lower()
        # 检查完整单词匹配
        if re.search(r'\b' + re.escape(taboo_lower) + r'\b', hint_lower):
            return True
    return False

print("✅ 辅助函数已定义")


✅ 辅助函数已定义


In [None]:
# 🌙 Kimi实验执行 - 实际运行代码
print("🌙 开始执行Kimi实验")
print("=" * 50)

# 检查先决条件
if not quick80_dataset:
    print("❌ Quick80数据集未加载，实验无法进行")
elif not client:
    print("❌ API客户端未初始化，实验无法进行")
else:
    # 创建9对模型组合
    kimi_model = "moonshotai/kimi-k2"
    other_models = [m for m in KIMI_MODELS if m != kimi_model]
    
    model_pairs = []
    # Kimi作为hinter (5对)
    for guesser in KIMI_MODELS:
        model_pairs.append((kimi_model, guesser))
    # 其他模型作为hinter，Kimi作为guesser (4对)
    for hinter in other_models:
        model_pairs.append((hinter, kimi_model))
    
    print(f"📊 实验配置确认:")
    print(f"   • 数据集: {len(quick80_dataset)} 个词汇")
    print(f"   • 模型对: {len(model_pairs)} 对组合")
    print(f"   • 总游戏: {len(quick80_dataset) * len(model_pairs)} 场")
    print(f"   • 预计时长: {len(quick80_dataset) * len(model_pairs) * 0.5 / 60:.1f} 分钟")
    
    # 创建结果输出目录
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = f"results/kimi_experiment_{timestamp}"
    os.makedirs(results_dir, exist_ok=True)
    
    print(f"📁 结果目录: {results_dir}")
    
    # 初始化结果收集
    all_results = []
    game_counter = 0
    total_games = len(quick80_dataset) * len(model_pairs)
    
    print(f"\\n🚀 开始执行实验...")
    start_time = time.time()
    
    # 执行实验
    for pair_idx, (hinter_model, guesser_model) in enumerate(model_pairs, 1):
        hinter_name = hinter_model.split('/')[-1]
        guesser_name = guesser_model.split('/')[-1]
        pair_name = f"{hinter_name}→{guesser_name}"
        
        print(f"\\n📋 第{pair_idx}/{len(model_pairs)}组: {pair_name}")
        
        # 对这个模型组合运行所有80个词汇
        pair_results = []
        
        for word_idx, word_data in enumerate(quick80_dataset):
            game_counter += 1
            target_word = word_data['target']
            taboo_words = word_data['taboo']
            
            # 进度显示
            if word_idx % 10 == 0 or word_idx < 5:
                progress = (game_counter / total_games) * 100
                print(f"   🎯 词汇 {word_idx+1}/80: {target_word} ({progress:.1f}%)")
            
            # 执行单场游戏
            game_start = time.time()
            try:
                game_result = enhanced_play_taboo_game(
                    client, hinter_model, guesser_model, 
                    target_word, taboo_words, max_turns=5
                )
                
                duration = round(time.time() - game_start, 2)
                
                # 记录结果
                result = {
                    'game_id': f"kimi_{game_counter:04d}",
                    'pair_index': pair_idx,
                    'word_index': word_idx + 1,
                    'hinter_model': hinter_model,
                    'guesser_model': guesser_model,
                    'hinter_name': hinter_name,
                    'guesser_name': guesser_name,
                    'pair_name': pair_name,
                    'target_word': target_word,
                    'category': word_data.get('category', 'unknown'),
                    'taboo_words': '|'.join(taboo_words),
                    'success': game_result['success'],
                    'turns_used': game_result['turns'],
                    'final_guess': game_result['final_guess'],
                    'failure_reason': game_result.get('failure_reason', None),
                    'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                    'conversation_turns': len(game_result['conversation']),
                    'all_hints': ' | '.join(game_result.get('all_hints', [])),
                    'all_guesses': ' | '.join(game_result.get('all_guesses', [])),
                    'conversation': ' | '.join(game_result['conversation']),
                    'duration_seconds': duration,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                
                # 添加错误信息（如果有）
                if 'error' in game_result:
                    result['error'] = game_result['error']
                
                pair_results.append(result)
                all_results.append(result)
                
                # 简短的结果显示
                if word_idx < 5 or word_idx % 20 == 19:
                    status = "✅" if game_result['success'] else "❌"
                    failure_info = ""
                    if not game_result['success'] and game_result.get('failure_reason'):
                        reason_map = {
                            'TABOO_VIOLATION': '违规',
                            'FORMAT_FAILURE': '格式',
                            'API_FAILURE': 'API',
                            'MAX_TURNS_EXCEEDED': '超时'
                        }
                        failure_info = f" ({reason_map.get(game_result['failure_reason'], game_result['failure_reason'])})"
                    
                    print(f"      {status} {target_word}: {game_result['turns']}轮{failure_info}")
                
            except Exception as e:
                print(f"      ❌ 游戏执行错误: {target_word} - {str(e)}")
                # 记录错误结果
                error_result = {
                    'game_id': f"kimi_{game_counter:04d}",
                    'pair_index': pair_idx,
                    'word_index': word_idx + 1,
                    'hinter_model': hinter_model,
                    'guesser_model': guesser_model,
                    'target_word': target_word,
                    'success': False,
                    'error': str(e),
                    'failure_reason': 'EXECUTION_ERROR',
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                pair_results.append(error_result)
                all_results.append(error_result)
            
            # API调用间隔
            time.sleep(0.3)
        
        # 保存当前模型对的结果
        pair_success = sum(1 for r in pair_results if r['success'])
        pair_success_rate = pair_success / len(pair_results) * 100
        
        print(f"   📊 {pair_name}组完成: {pair_success}/{len(pair_results)} ({pair_success_rate:.1f}%)")
        
        # 保存单个模型对的结果文件
        pair_file = f"{results_dir}/{hinter_name}_vs_{guesser_name}_{timestamp}.csv"
        pair_df = pd.DataFrame(pair_results)
        pair_df.to_csv(pair_file, index=False, encoding='utf-8')
        print(f"   💾 结果已保存: {os.path.basename(pair_file)}")
    
    # 保存汇总结果
    print(f"\\n📊 实验汇总统计:")
    total_duration = time.time() - start_time
    total_success = sum(1 for r in all_results if r['success'])
    overall_success_rate = total_success / len(all_results) * 100
    
    print(f"   ⏱️  总耗时: {total_duration/60:.1f} 分钟")
    print(f"   🎮 总游戏数: {len(all_results):,} 场")
    print(f"   ✅ 总成功数: {total_success:,} 场")
    print(f"   📈 总成功率: {overall_success_rate:.1f}%")
    
    # 保存完整汇总结果
    summary_file = f"{results_dir}/kimi_experiment_summary_{timestamp}.csv"
    summary_df = pd.DataFrame(all_results)
    summary_df.to_csv(summary_file, index=False, encoding='utf-8')
    
    print(f"\\n💾 完整结果已保存:")
    print(f"   📁 目录: {results_dir}")
    print(f"   📄 汇总文件: {os.path.basename(summary_file)}")
    print(f"   📄 单独文件: {len(model_pairs)} 个模型对文件")
    
    # 快速分析
    print(f"\\n📈 按模型角色成功率:")
    kimi_as_hinter = summary_df[summary_df['hinter_name'] == 'kimi-k2']
    kimi_as_guesser = summary_df[summary_df['guesser_name'] == 'kimi-k2']
    
    if len(kimi_as_hinter) > 0:
        kimi_hinter_success = sum(kimi_as_hinter['success']) / len(kimi_as_hinter) * 100
        print(f"   🌙 Kimi作Hinter: {kimi_hinter_success:.1f}%")
    
    if len(kimi_as_guesser) > 0:
        kimi_guesser_success = sum(kimi_as_guesser['success']) / len(kimi_as_guesser) * 100
        print(f"   🌙 Kimi作Guesser: {kimi_guesser_success:.1f}%")
    
    print(f"\\n🎉 Kimi实验执行完成！")


🌙 开始执行Kimi实验
📊 实验配置确认:
   • 数据集: 80 个词汇
   • 模型对: 9 对组合
   • 总游戏: 720 场
   • 预计时长: 6.0 分钟
📁 结果目录: results/kimi_experiment_20250717_125711
\n🚀 开始执行实验...
\n📋 第1/9组: kimi-k2→kimi-k2
   🎯 词汇 1/80: behaviorism (0.1%)
      ❌ behaviorism: 5轮 (超时)
   🎯 词汇 2/80: carcharhinus (0.3%)
      ✅ carcharhinus: 3轮
   🎯 词汇 3/80: aphrodisiac (0.4%)
      ✅ aphrodisiac: 1轮
   🎯 词汇 4/80: futures (0.6%)
      ✅ futures: 1轮
   🎯 词汇 5/80: stylus (0.7%)
      ✅ stylus: 1轮
   🎯 词汇 11/80: debenture (1.5%)
      ✅ parentage: 3轮
   🎯 词汇 21/80: biological (2.9%)
   🎯 词汇 31/80: quaggy (4.3%)
      ✅ stomatal: 2轮
   🎯 词汇 41/80: dishearten (5.7%)
   🎯 词汇 51/80: backtracking (7.1%)
      ❌ wickedly: 5轮 (违规)
   🎯 词汇 61/80: tonight (8.5%)
   🎯 词汇 71/80: centennially (9.9%)
      ✅ brooks: 2轮
   📊 kimi-k2→kimi-k2组完成: 69/80 (86.2%)
   💾 结果已保存: kimi-k2_vs_kimi-k2_20250717_125711.csv
\n📋 第2/9组: kimi-k2→gpt-4o
   🎯 词汇 1/80: behaviorism (11.2%)
      ❌ behaviorism: 5轮 (违规)
   🎯 词汇 2/80: carcharhinus (11.4%)
      ✅ carcharhin

In [None]:
# 🔧 恢复并完成中断的Kimi实验
print("🔧 检查并完成中断的Kimi实验")
print("=" * 50)

# 检查已完成的文件
results_dir = "results/kimi_experiment_20250717_125711"
if os.path.exists(results_dir):
    existing_files = [f for f in os.listdir(results_dir) if f.endswith('.csv')]
    print(f"📁 找到实验目录: {results_dir}")
    print(f"📄 已有文件数: {len(existing_files)}")
    
    # 定义所有应该存在的模型对
    kimi_model = "moonshotai/kimi-k2"
    all_models = [
        "moonshotai/kimi-k2",
        "openai/gpt-4o", 
        "google/gemini-2.5-flash",
        "deepseek/deepseek-chat-v3-0324",
        "anthropic/claude-sonnet-4"
    ]
    
    expected_pairs = []
    # Kimi作为hinter (5对)
    for guesser in all_models:
        guesser_name = guesser.split('/')[-1]
        expected_pairs.append(f"kimi-k2_vs_{guesser_name}_20250717_125711.csv")
    
    # 其他模型作为hinter，Kimi作为guesser (4对)
    for hinter in all_models:
        if hinter != kimi_model:
            hinter_name = hinter.split('/')[-1]
            expected_pairs.append(f"{hinter_name}_vs_kimi-k2_20250717_125711.csv")
    
    print(f"\\n📋 应有文件清单 ({len(expected_pairs)}个):")
    missing_files = []
    
    for i, expected_file in enumerate(expected_pairs, 1):
        if expected_file in existing_files:
            print(f"   ✅ {i:2d}. {expected_file}")
        else:
            print(f"   ❌ {i:2d}. {expected_file} (缺失)")
            missing_files.append(expected_file)
    
    print(f"\\n📊 状态总结:")
    print(f"   • 已完成: {len(existing_files)}/{len(expected_pairs)} 个文件")
    print(f"   • 缺失: {len(missing_files)} 个文件")
    
    if missing_files:
        print(f"\\n🚧 需要补充完成的文件:")
        for missing in missing_files:
            print(f"   📝 {missing}")
            
        # 如果只缺少一个文件，立即执行
        if len(missing_files) == 1:
            missing_file = missing_files[0]
            if "claude-sonnet-4_vs_kimi-k2" in missing_file:
                print(f"\\n🚀 开始执行缺失的实验: claude-sonnet-4 → kimi-k2")
                
                hinter_model = "anthropic/claude-sonnet-4"
                guesser_model = "moonshotai/kimi-k2"
                hinter_name = "claude-sonnet-4"
                guesser_name = "kimi-k2"
                
                # 加载数据集
                if 'quick80_dataset' not in locals() or not quick80_dataset:
                    quick80_dataset = load_quick80_dataset()
                
                if quick80_dataset and client:
                    print(f"   📚 数据集: {len(quick80_dataset)} 个词汇")
                    print(f"   🤖 模型对: {hinter_name} → {guesser_name}")
                    
                    # 执行实验
                    pair_results = []
                    game_counter = 640  # 前8组已完成640场游戏
                    
                    start_time = time.time()
                    print(f"\\n⏱️  开始时间: {datetime.now().strftime('%H:%M:%S')}")
                    
                    for word_idx, word_data in enumerate(quick80_dataset):
                        game_counter += 1
                        target_word = word_data['target']
                        taboo_words = word_data['taboo']
                        
                        # 进度显示
                        if word_idx % 10 == 0 or word_idx < 5:
                            print(f"   🎯 词汇 {word_idx+1}/80: {target_word}")
                        
                        # 执行游戏
                        game_start = time.time()
                        try:
                            game_result = enhanced_play_taboo_game(
                                client, hinter_model, guesser_model,
                                target_word, taboo_words, max_turns=5
                            )
                            
                            duration = round(time.time() - game_start, 2)
                            
                            # 记录结果
                            result = {
                                'game_id': f"kimi_{game_counter:04d}",
                                'pair_index': 9,
                                'word_index': word_idx + 1,
                                'hinter_model': hinter_model,
                                'guesser_model': guesser_model,
                                'hinter_name': hinter_name,
                                'guesser_name': guesser_name,
                                'pair_name': f"{hinter_name}→{guesser_name}",
                                'target_word': target_word,
                                'category': word_data.get('category', 'unknown'),
                                'taboo_words': '|'.join(taboo_words),
                                'success': game_result['success'],
                                'turns_used': game_result['turns'],
                                'final_guess': game_result['final_guess'],
                                'failure_reason': game_result.get('failure_reason', None),
                                'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                                'conversation_turns': len(game_result['conversation']),
                                'all_hints': ' | '.join(game_result.get('all_hints', [])),
                                'all_guesses': ' | '.join(game_result.get('all_guesses', [])),
                                'conversation': ' | '.join(game_result['conversation']),
                                'duration_seconds': duration,
                                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                            }
                            
                            if 'error' in game_result:
                                result['error'] = game_result['error']
                            
                            pair_results.append(result)
                            
                            # 简短结果显示
                            if word_idx < 5 or word_idx % 20 == 19:
                                status = "✅" if game_result['success'] else "❌"
                                failure_info = ""
                                if not game_result['success'] and game_result.get('failure_reason'):
                                    reason_map = {
                                        'TABOO_VIOLATION': '违规',
                                        'FORMAT_FAILURE': '格式',
                                        'API_FAILURE': 'API',
                                        'MAX_TURNS_EXCEEDED': '超时'
                                    }
                                    failure_info = f" ({reason_map.get(game_result['failure_reason'], game_result['failure_reason'])})"
                                
                                print(f"      {status} {target_word}: {game_result['turns']}轮{failure_info}")
                        
                        except Exception as e:
                            print(f"      ❌ 游戏执行错误: {target_word} - {str(e)}")
                            # 记录错误结果
                            error_result = {
                                'game_id': f"kimi_{game_counter:04d}",
                                'pair_index': 9,
                                'word_index': word_idx + 1,
                                'hinter_model': hinter_model,
                                'guesser_model': guesser_model,
                                'target_word': target_word,
                                'success': False,
                                'error': str(e),
                                'failure_reason': 'EXECUTION_ERROR',
                                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                            }
                            pair_results.append(error_result)
                        
                        time.sleep(0.3)  # API调用间隔
                    
                    # 计算结果
                    duration = time.time() - start_time
                    pair_success = sum(1 for r in pair_results if r['success'])
                    pair_success_rate = pair_success / len(pair_results) * 100
                    
                    print(f"\\n📊 {hinter_name}→{guesser_name} 完成:")
                    print(f"   ⏱️  耗时: {duration/60:.1f} 分钟")
                    print(f"   🎮 游戏数: {len(pair_results)}")
                    print(f"   ✅ 成功数: {pair_success}")
                    print(f"   📈 成功率: {pair_success_rate:.1f}%")
                    
                    # 保存结果文件
                    pair_file = f"{results_dir}/{hinter_name}_vs_{guesser_name}_20250717_125711.csv"
                    pair_df = pd.DataFrame(pair_results)
                    pair_df.to_csv(pair_file, index=False, encoding='utf-8')
                    print(f"   💾 结果已保存: {os.path.basename(pair_file)}")
                    
                    print(f"\\n✅ 最后一组实验完成！")
                else:
                    print("❌ 数据集或API客户端不可用")
    else:
        print(f"\\n🎉 所有9组实验已完成！")
    
else:
    print("❌ 未找到实验结果目录，可能需要重新开始实验")


In [None]:
# 📊 生成Kimi实验最终汇总分析
print("📊 生成Kimi实验最终汇总分析")
print("=" * 50)

results_dir = "results/kimi_experiment_20250717_125711"

if os.path.exists(results_dir):
    # 收集所有结果文件
    csv_files = [f for f in os.listdir(results_dir) if f.endswith('.csv') and f != 'kimi_experiment_summary_20250717_125711.csv']
    
    if len(csv_files) >= 9:  # 确保有完整的9个文件
        print(f"📁 找到 {len(csv_files)} 个结果文件")
        
        # 合并所有结果
        all_results = []
        
        for csv_file in csv_files:
            file_path = os.path.join(results_dir, csv_file)
            try:
                df = pd.read_csv(file_path, encoding='utf-8')
                all_results.extend(df.to_dict('records'))
                print(f"   ✅ {csv_file}: {len(df)} 条记录")
            except Exception as e:
                print(f"   ❌ {csv_file}: 读取失败 - {e}")
        
        if all_results:
            print(f"\\n📊 汇总统计:")
            total_games = len(all_results)
            total_success = sum(1 for r in all_results if r['success'])
            overall_success_rate = total_success / total_games * 100
            
            print(f"   🎮 总游戏数: {total_games:,} 场")
            print(f"   ✅ 总成功数: {total_success:,} 场") 
            print(f"   📈 总成功率: {overall_success_rate:.1f}%")
            
            # 保存汇总文件
            summary_df = pd.DataFrame(all_results)
            summary_file = f"{results_dir}/kimi_experiment_summary_20250717_125711.csv"
            summary_df.to_csv(summary_file, index=False, encoding='utf-8')
            print(f"\\n💾 汇总文件已保存: {os.path.basename(summary_file)}")
            
            # 详细分析
            print(f"\\n🎭 按角色分析:")
            kimi_as_hinter = summary_df[summary_df['hinter_name'] == 'kimi-k2']
            kimi_as_guesser = summary_df[summary_df['guesser_name'] == 'kimi-k2']
            
            if len(kimi_as_hinter) > 0:
                hinter_success = sum(kimi_as_hinter['success'])
                hinter_rate = hinter_success / len(kimi_as_hinter) * 100
                print(f"   🌙 Kimi作Hinter: {hinter_success}/{len(kimi_as_hinter)} ({hinter_rate:.1f}%)")
            
            if len(kimi_as_guesser) > 0:
                guesser_success = sum(kimi_as_guesser['success'])
                guesser_rate = guesser_success / len(kimi_as_guesser) * 100
                print(f"   🌙 Kimi作Guesser: {guesser_success}/{len(kimi_as_guesser)} ({guesser_rate:.1f}%)")
            
            # 按模型对分析
            print(f"\\n👥 各模型对成功率:")
            pair_stats = summary_df.groupby('pair_name').agg({
                'success': ['count', 'sum'],
                'turns_used': 'mean'
            }).round(1)
            
            for pair_name in sorted(pair_stats.index):
                count = int(pair_stats.loc[pair_name, ('success', 'count')])
                success = int(pair_stats.loc[pair_name, ('success', 'sum')])
                rate = success / count * 100
                avg_turns = pair_stats.loc[pair_name, ('turns_used', 'mean')]
                
                # 标记Kimi的角色
                if 'kimi-k2→' in pair_name:
                    role = "(🌙H)"
                elif '→kimi-k2' in pair_name:
                    role = "(🌙G)"
                else:
                    role = ""
                
                print(f"   {pair_name:<35} {role:<5}: {success:2d}/{count} ({rate:5.1f}%) 平均{avg_turns:.1f}轮")
            
            # 失败原因分析
            failed_df = summary_df[summary_df['success'] == False]
            if len(failed_df) > 0:
                print(f"\\n📉 失败原因分析 ({len(failed_df)}场失败):")
                failure_counts = failed_df['failure_reason'].value_counts()
                
                for reason, count in failure_counts.items():
                    percentage = count / len(failed_df) * 100
                    reason_map = {
                        'TABOO_VIOLATION': '🚫 违反禁用词',
                        'FORMAT_FAILURE': '🔤 格式错误', 
                        'API_FAILURE': '🌐 API失败',
                        'MAX_TURNS_EXCEEDED': '⏱️ 轮数耗尽',
                        'EXECUTION_ERROR': '💥 执行错误'
                    }
                    reason_name = reason_map.get(reason, reason)
                    print(f"   {reason_name}: {count} 场 ({percentage:.1f}%)")
            
            # 按词汇类别分析
            if 'category' in summary_df.columns:
                print(f"\\n🏷️ 按词汇类别成功率:")
                category_stats = summary_df.groupby('category').agg({
                    'success': ['count', 'sum']
                }).round(1)
                
                for category in sorted(category_stats.index):
                    count = int(category_stats.loc[category, ('success', 'count')])
                    success = int(category_stats.loc[category, ('success', 'sum')])
                    rate = success / count * 100
                    print(f"   {category:<12}: {success:3d}/{count:3d} ({rate:5.1f}%)")
            
            print(f"\\n🎉 Kimi实验完整分析完成！")
            print(f"📁 所有结果保存在: {results_dir}")
            print(f"📄 汇总文件: kimi_experiment_summary_20250717_125711.csv")
            
        else:
            print("❌ 无法读取结果数据")
    else:
        print(f"❌ 结果文件不完整，期望9个，实际{len(csv_files)}个")
        
else:
    print("❌ 实验结果目录不存在")


In [1]:
# 🚀 运行最后一组实验: claude-sonnet-4 → kimi-k2
print("🚀 运行最后一组实验: claude-sonnet-4 → kimi-k2")
print("=" * 50)

# 检查是否需要运行
results_dir = "results/kimi_experiment_20250717_125711"
missing_file = "claude-sonnet-4_vs_kimi-k2_20250717_125711.csv"
missing_path = f"{results_dir}/{missing_file}"

if os.path.exists(missing_path):
    print(f"✅ 文件已存在: {missing_file}")
    print("无需重复运行实验")
else:
    print(f"❌ 缺失文件: {missing_file}")
    print("开始执行最后一组实验...")
    
    # 实验配置
    hinter_model = "anthropic/claude-sonnet-4"
    guesser_model = "moonshotai/kimi-k2"
    hinter_name = "claude-sonnet-4"
    guesser_name = "kimi-k2"
    
    print(f"🤖 模型对: {hinter_name} → {guesser_name}")
    print(f"📚 数据集: {len(quick80_dataset)} 个词汇")
    print(f"📁 输出目录: {results_dir}")
    
    # 执行实验
    pair_results = []
    game_counter = 640  # 前8组已完成640场游戏
    
    start_time = time.time()
    print(f"\\n⏱️  开始时间: {datetime.now().strftime('%H:%M:%S')}")
    
    for word_idx, word_data in enumerate(quick80_dataset):
        game_counter += 1
        target_word = word_data['target']
        taboo_words = word_data['taboo']
        
        # 进度显示
        if word_idx % 10 == 0 or word_idx < 5:
            progress = (word_idx + 1) / len(quick80_dataset) * 100
            print(f"   🎯 词汇 {word_idx+1:2d}/80: {target_word:<15} ({progress:5.1f}%)")
        
        # 执行游戏
        game_start = time.time()
        try:
            game_result = enhanced_play_taboo_game(
                client, hinter_model, guesser_model,
                target_word, taboo_words, max_turns=5
            )
            
            duration = round(time.time() - game_start, 2)
            
            # 记录结果
            result = {
                'game_id': f"kimi_{game_counter:04d}",
                'pair_index': 9,
                'word_index': word_idx + 1,
                'hinter_model': hinter_model,
                'guesser_model': guesser_model,
                'hinter_name': hinter_name,
                'guesser_name': guesser_name,
                'pair_name': f"{hinter_name}→{guesser_name}",
                'target_word': target_word,
                'category': word_data.get('category', 'unknown'),
                'taboo_words': '|'.join(taboo_words),
                'success': game_result['success'],
                'turns_used': game_result['turns'],
                'final_guess': game_result['final_guess'],
                'failure_reason': game_result.get('failure_reason', None),
                'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                'conversation_turns': len(game_result['conversation']),
                'all_hints': ' | '.join(game_result.get('all_hints', [])),
                'all_guesses': ' | '.join(game_result.get('all_guesses', [])),
                'conversation': ' | '.join(game_result['conversation']),
                'duration_seconds': duration,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            
            if 'error' in game_result:
                result['error'] = game_result['error']
            
            pair_results.append(result)
            
            # 结果显示
            if word_idx < 5 or word_idx % 20 == 19 or word_idx == len(quick80_dataset) - 1:
                status = "✅" if game_result['success'] else "❌"
                failure_info = ""
                if not game_result['success'] and game_result.get('failure_reason'):
                    reason_map = {
                        'TABOO_VIOLATION': '违规',
                        'FORMAT_FAILURE': '格式',
                        'API_FAILURE': 'API',
                        'MAX_TURNS_EXCEEDED': '超时'
                    }
                    failure_info = f" ({reason_map.get(game_result['failure_reason'], game_result['failure_reason'])})"
                
                print(f"      {status} {target_word:<15}: {game_result['turns']}轮{failure_info}")
        
        except Exception as e:
            print(f"      ❌ 游戏执行错误: {target_word} - {str(e)}")
            # 记录错误结果
            error_result = {
                'game_id': f"kimi_{game_counter:04d}",
                'pair_index': 9,
                'word_index': word_idx + 1,
                'hinter_model': hinter_model,
                'guesser_model': guesser_model,
                'hinter_name': hinter_name,
                'guesser_name': guesser_name,
                'pair_name': f"{hinter_name}→{guesser_name}",
                'target_word': target_word,
                'category': word_data.get('category', 'unknown'),
                'success': False,
                'turns_used': 0,
                'final_guess': 'EXECUTION_ERROR',
                'error': str(e),
                'failure_reason': 'EXECUTION_ERROR',
                'has_taboo_violation': False,
                'conversation_turns': 0,
                'all_hints': '',
                'all_guesses': '',
                'conversation': '',
                'duration_seconds': 0,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            pair_results.append(error_result)
        
        time.sleep(0.3)  # API调用间隔
    
    # 计算和显示结果
    total_duration = time.time() - start_time
    pair_success = sum(1 for r in pair_results if r['success'])
    pair_success_rate = pair_success / len(pair_results) * 100
    
    print(f"\\n📊 {hinter_name}→{guesser_name} 实验完成:")
    print(f"   ⏱️  总耗时: {total_duration/60:.1f} 分钟")
    print(f"   🎮 游戏数: {len(pair_results)}")
    print(f"   ✅ 成功数: {pair_success}")
    print(f"   📈 成功率: {pair_success_rate:.1f}%")
    
    # 失败分析
    failed_results = [r for r in pair_results if not r['success']]
    if failed_results:
        print(f"   📉 失败数: {len(failed_results)}")
        failure_reasons = {}
        for r in failed_results:
            reason = r.get('failure_reason', 'UNKNOWN')
            failure_reasons[reason] = failure_reasons.get(reason, 0) + 1
        
        for reason, count in failure_reasons.items():
            print(f"      {reason}: {count}场")
    
    # 保存结果文件
    try:
        os.makedirs(results_dir, exist_ok=True)
        pair_df = pd.DataFrame(pair_results)
        pair_df.to_csv(missing_path, index=False, encoding='utf-8')
        print(f"\\n💾 结果已保存: {missing_file}")
        print(f"   📄 文件路径: {missing_path}")
        print(f"   📊 记录数: {len(pair_results)}")
        
        print(f"\\n🎉 最后一组实验完成！现在所有9组实验都已就绪。")
        
    except Exception as e:
        print(f"\\n❌ 保存失败: {str(e)}")


🚀 运行最后一组实验: claude-sonnet-4 → kimi-k2


NameError: name 'os' is not defined

In [None]:
# 📈 结果分析（实验完成后运行）
print("📈 Kimi实验结果分析")
print("=" * 40)

# 分析最近的实验结果
def analyze_kimi_results():
    """分析Kimi实验结果"""
    
    # 查找最新的结果目录
    results_base = "results"
    if not os.path.exists(results_base):
        print("❌ 未找到results目录")
        return
    
    # 查找kimi实验目录
    kimi_dirs = [d for d in os.listdir(results_base) if d.startswith("kimi_experiment_")]
    if not kimi_dirs:
        print("❌ 未找到Kimi实验结果")
        return
    
    # 使用最新的实验目录
    latest_dir = max(kimi_dirs)
    results_dir = os.path.join(results_base, latest_dir)
    
    print(f"📁 分析目录: {latest_dir}")
    
    # 查找汇总文件
    summary_files = [f for f in os.listdir(results_dir) if f.startswith("kimi_experiment_summary_")]
    if not summary_files:
        print("❌ 未找到汇总文件")
        return
    
    summary_file = os.path.join(results_dir, summary_files[0])
    
    try:
        # 加载结果数据
        df = pd.read_csv(summary_file, encoding='utf-8')
        print(f"✅ 加载数据: {len(df)} 条记录")
        
        # 基本统计
        total_games = len(df)
        total_success = sum(df['success'])
        overall_success_rate = total_success / total_games * 100
        
        print(f"\\n📊 基本统计:")
        print(f"   🎮 总游戏数: {total_games:,} 场")
        print(f"   ✅ 成功数: {total_success:,} 场")
        print(f"   📈 总成功率: {overall_success_rate:.1f}%")
        
        # 按角色分析
        print(f"\\n🎭 按角色分析:")
        kimi_as_hinter = df[df['hinter_name'] == 'kimi-k2']
        kimi_as_guesser = df[df['guesser_name'] == 'kimi-k2']
        
        if len(kimi_as_hinter) > 0:
            hinter_success = sum(kimi_as_hinter['success'])
            hinter_rate = hinter_success / len(kimi_as_hinter) * 100
            print(f"   🌙 Kimi作Hinter: {hinter_success}/{len(kimi_as_hinter)} ({hinter_rate:.1f}%)")
        
        if len(kimi_as_guesser) > 0:
            guesser_success = sum(kimi_as_guesser['success'])
            guesser_rate = guesser_success / len(kimi_as_guesser) * 100
            print(f"   🌙 Kimi作Guesser: {guesser_success}/{len(kimi_as_guesser)} ({guesser_rate:.1f}%)")
        
        # 按模型对分析
        print(f"\\n👥 按模型对分析:")
        pair_stats = df.groupby('pair_name').agg({
            'success': ['count', 'sum'],
            'turns_used': 'mean'
        }).round(2)
        
        for pair_name in pair_stats.index:
            count = pair_stats.loc[pair_name, ('success', 'count')]
            success = pair_stats.loc[pair_name, ('success', 'sum')]
            rate = success / count * 100
            avg_turns = pair_stats.loc[pair_name, ('turns_used', 'mean')]
            print(f"   {pair_name}: {success}/{count} ({rate:.1f}%) 平均{avg_turns:.1f}轮")
        
        # 失败原因分析
        failed_df = df[df['success'] == False]
        if len(failed_df) > 0:
            print(f"\\n📉 失败原因分析:")
            failure_counts = failed_df['failure_reason'].value_counts()
            for reason, count in failure_counts.items():
                percentage = count / len(failed_df) * 100
                reason_map = {
                    'TABOO_VIOLATION': '🚫 违反禁用词',
                    'FORMAT_FAILURE': '🔤 格式错误',
                    'API_FAILURE': '🌐 API失败',
                    'MAX_TURNS_EXCEEDED': '⏱️ 轮数耗尽',
                    'EXECUTION_ERROR': '💥 执行错误'
                }
                reason_name = reason_map.get(reason, reason)
                print(f"   {reason_name}: {count} 场 ({percentage:.1f}%)")
        
        # 按类别分析
        if 'category' in df.columns:
            print(f"\\n🏷️ 按词汇类别分析:")
            category_stats = df.groupby('category').agg({
                'success': ['count', 'sum']
            })
            
            for category in category_stats.index:
                count = category_stats.loc[category, ('success', 'count')]
                success = category_stats.loc[category, ('success', 'sum')]
                rate = success / count * 100
                print(f"   {category}: {success}/{count} ({rate:.1f}%)")
        
        print(f"\\n💾 详细数据位置: {summary_file}")
        return df
        
    except Exception as e:
        print(f"❌ 分析错误: {str(e)}")
        return None

# 执行分析
print("🔍 开始分析最新的Kimi实验结果...")
result_df = analyze_kimi_results()
