In [1]:
# 导入必要的库
import json
import pandas as pd
import random
import time
import requests
import os
from typing import Dict, List, Any
from datetime import datetime

# 加载Quick80数据集
def load_dataset(dataset_path: str = "quick80_dataset.json") -> List[Dict]:
    """加载Taboo数据集"""
    with open(dataset_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# 加载数据集
print("📚 正在加载Quick80数据集...")
dataset = load_dataset()
print(f"✅ 数据集加载完成，共{len(dataset)}条记录")

# 显示数据集样本
print("\n📋 数据集样本:")
sample = random.choice(dataset)
print(f"   目标词: {sample['target']}")
print(f"   类别: {sample.get('category', 'unknown')}")
print(f"   禁用词: {sample['taboo']}")
print(f"   词义数: {len(sample.get('senses', []))}")


📚 正在加载Quick80数据集...
✅ 数据集加载完成，共80条记录

📋 数据集样本:
   目标词: atomize
   类别: chemistry
   禁用词: ['atomise', 'perfume', 'spray', 'nuke', 'zap']
   词义数: 3


In [2]:
# Quick80数据集统计信息
print("📊 Quick80数据集基本统计:")
print("=" * 40)

# 类别分布统计
categories = {}
taboo_counts = []
sense_counts = []

for item in dataset:
    # 统计类别
    category = item.get('category', 'unknown')
    categories[category] = categories.get(category, 0) + 1
    
    # 统计禁用词数量
    taboo_counts.append(len(item.get('taboo', [])))
    
    # 统计词义数量
    sense_counts.append(len(item.get('senses', [])))

print(f"\n🏷️ 类别分布:")
sorted_categories = sorted(categories.items(), key=lambda x: x[1], reverse=True)
for i, (category, count) in enumerate(sorted_categories, 1):
    percentage = count / len(dataset) * 100
    print(f"   {i}. {category}: {count} 条 ({percentage:.1f}%)")

print(f"\n🚫 禁用词统计:")
print(f"   平均数量: {sum(taboo_counts) / len(taboo_counts):.1f}")
print(f"   范围: {min(taboo_counts)} - {max(taboo_counts)}")

print(f"\n💭 词义统计:")
print(f"   平均数量: {sum(sense_counts) / len(sense_counts):.1f}")
print(f"   范围: {min(sense_counts)} - {max(sense_counts)}")

print(f"\n✅ Quick80数据集统计完成，适合定量分析实验")

# 设置随机种子用于实验
random.seed(240)
print("\n🎲 随机种子已设置为 240，确保实验可复现")


📊 Quick80数据集基本统计:

🏷️ 类别分布:
   1. general: 55 条 (68.8%)
   2. philosophy: 8 条 (10.0%)
   3. finance: 7 条 (8.8%)
   4. chemistry: 5 条 (6.2%)
   5. cs: 5 条 (6.2%)

🚫 禁用词统计:
   平均数量: 5.0
   范围: 5 - 5

💭 词义统计:
   平均数量: 1.9
   范围: 1 - 23

✅ Quick80数据集统计完成，适合定量分析实验

🎲 随机种子已设置为 240，确保实验可复现


In [3]:
# 定量分析模型配置
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# 定义定量分析使用的模型
QUANTITATIVE_MODELS = [
    "deepseek/deepseek-chat-v3-0324",  # deepseek-v3  
    "google/gemini-2.5-flash"  # gemini-2.5flash
]

# 温度参数设置
TEMPERATURE_VALUES = [0.1, 0.3, 0.7]

# 禁忌词数量设置 
TABOO_COUNTS = [1, 3, 5]

# Hinter提示长度设置（单词个数）
HINT_WORD_COUNTS = [1, 5, 10]

print("✅ Quick80定量分析配置已加载")
print(f"🤖 分析模型: {len(QUANTITATIVE_MODELS)} 个")
for i, model in enumerate(QUANTITATIVE_MODELS, 1):
    model_name = model.split('/')[-1]
    print(f"   {i}. {model_name}")

print(f"🌡️ 温度参数: {TEMPERATURE_VALUES}")
print(f"🚫 禁忌词数量: {TABOO_COUNTS}")
print(f"💬 Hinter提示长度（单词个数）: {HINT_WORD_COUNTS}")
print(f"📊 Quick80数据集词汇总数: {len(dataset)}")


✅ Quick80定量分析配置已加载
🤖 分析模型: 2 个
   1. deepseek-chat-v3-0324
   2. gemini-2.5-flash
🌡️ 温度参数: [0.1, 0.3, 0.7]
🚫 禁忌词数量: [1, 3, 5]
💬 Hinter提示长度（单词个数）: [1, 5, 10]
📊 Quick80数据集词汇总数: 80


In [4]:
# API客户端设置
def load_api_keys(keys_path: str = "api_keys.json") -> Dict[str, str]:
    """加载API密钥"""
    with open(keys_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class QuantitativeOpenRouterClient:
    """定量分析专用的API客户端，支持温度控制和长度验证"""
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
    
    def call_model_with_temperature(self, model: str, messages: List[Dict[str, str]], 
                                  temperature: float = 0.3, max_tokens: int = 2000) -> str:
        """调用模型API，支持自定义温度参数"""
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content'].strip()
        
        # 防止乱码：只保留ASCII可打印字符
        content = re.sub(r'[^\\x20-\\x7E]', '', content)
        return content

# 初始化API客户端
try:
    api_keys = load_api_keys()
    quant_client = QuantitativeOpenRouterClient(api_keys["OPENROUTER_API_KEY"])
    print("✅ Quick80定量分析API客户端初始化成功")
except Exception as e:
    print(f"❌ API客户端初始化失败: {e}")
    quant_client = None


✅ Quick80定量分析API客户端初始化成功


In [5]:
# Quick80禁忌词数量控制和数据集处理
def create_taboo_variants(word_data, target_counts=[1, 3, 5]):
    """为单个词汇创建不同禁忌词数量的变体"""
    variants = {}
    original_taboo = word_data['taboo']
    
    for count in target_counts:
        if count <= len(original_taboo):
            # 选择前N个禁忌词
            selected_taboo = original_taboo[:count]
            variants[count] = {
                **word_data,
                'taboo': selected_taboo,
                'original_taboo_count': len(original_taboo),
                'variant_taboo_count': count
            }
        else:
            # 如果要求的数量超过原始禁忌词数量，则使用全部
            variants[count] = {
                **word_data,
                'taboo': original_taboo,
                'original_taboo_count': len(original_taboo),
                'variant_taboo_count': len(original_taboo)
            }
    
    return variants

def prepare_quantitative_dataset(dataset, sample_size=80):
    """准备定量分析数据集，为每个词创建不同禁忌词数量的变体"""
    # 使用全部80个词汇或指定数量
    sample_words = random.sample(dataset, min(sample_size, len(dataset)))
    
    quantitative_dataset = []
    
    for word_data in sample_words:
        variants = create_taboo_variants(word_data, TABOO_COUNTS)
        
        for taboo_count, variant in variants.items():
            variant['experiment_id'] = f"{word_data['target']}_{taboo_count}taboo"
            quantitative_dataset.append(variant)
    
    return quantitative_dataset

# 创建Quick80定量分析数据集
print("📊 正在准备Quick80定量分析数据集...")
quant_dataset = prepare_quantitative_dataset(dataset, sample_size=80)  # 使用全部80个词汇

print(f"✅ Quick80定量分析数据集准备完成")
print(f"📋 原始词汇数: 80")
print(f"📊 分析条目数: {len(quant_dataset)} (每个词汇3个禁忌词变体)")

# 统计各类别的分布
taboo_distribution = {}
category_distribution = {}

for item in quant_dataset:
    # 禁忌词数量分布
    taboo_count = item['variant_taboo_count']
    taboo_distribution[taboo_count] = taboo_distribution.get(taboo_count, 0) + 1
    
    # 类别分布
    category = item.get('category', 'unknown')
    category_distribution[category] = category_distribution.get(category, 0) + 1

print(f"\n🚫 禁忌词数量分布:")
for count, num in sorted(taboo_distribution.items()):
    print(f"   {count}个禁忌词: {num} 条")

print(f"\n🏷️ 类别分布:")
for category, num in sorted(category_distribution.items(), key=lambda x: x[1], reverse=True):
    print(f"   {category}: {num} 条")

# 显示示例
print(f"\n📋 数据集示例:")
sample_item = quant_dataset[0]
print(f"   实验ID: {sample_item['experiment_id']}")
print(f"   目标词: {sample_item['target']}")
print(f"   原始禁忌词数: {sample_item['original_taboo_count']}")
print(f"   当前禁忌词: {sample_item['taboo']} ({sample_item['variant_taboo_count']}个)")


📊 正在准备Quick80定量分析数据集...
✅ Quick80定量分析数据集准备完成
📋 原始词汇数: 80
📊 分析条目数: 240 (每个词汇3个禁忌词变体)

🚫 禁忌词数量分布:
   1个禁忌词: 80 条
   3个禁忌词: 80 条
   5个禁忌词: 80 条

🏷️ 类别分布:
   general: 165 条
   philosophy: 24 条
   finance: 21 条
   cs: 15 条
   chemistry: 15 条

📋 数据集示例:
   实验ID: farm_1taboo
   目标词: farm
   原始禁忌词数: 5
   当前禁忌词: ['farmer'] (1个)


In [6]:
# 实验总览
# 计算实验规模
total_models = len(QUANTITATIVE_MODELS)
total_combinations = total_models * total_models  # hinter x guesser
total_temperatures = len(TEMPERATURE_VALUES)
total_hint_lengths = len(HINT_WORD_COUNTS)
total_dataset_variants = len(quant_dataset)

total_experiments = total_combinations * total_temperatures * total_hint_lengths * total_dataset_variants

print("🎯 Quick80定量分析实验规模:")
print("=" * 50)
print(f"📊 数据集: Quick80 ({len(dataset)} 个原始词汇)")
print(f"🤖 模型: {total_models} 个")
print(f"🔄 模型组合: {total_combinations} 个 (每个模型既做Hinter又做Guesser)")
print(f"🌡️ 温度参数: {total_temperatures} 个")
print(f"💬 Hinter提示长度: {total_hint_lengths} 个")
print(f"📝 数据集变体: {total_dataset_variants} 个 (80词汇 × 3种禁忌词数量)")
print(f"\n🎮 总实验数: {total_experiments:,} 个")
print(f"⏱️ 预计耗时: {total_experiments * 0.5 / 3600:.1f} 小时 (按每个实验0.5秒估算)")

print(f"\n💡 实验设计特点:")
print(f"   ✅ 涵盖Quick80数据集的全部80个词汇")
print(f"   ✅ 测试3个目标模型的完整性能")
print(f"   ✅ 分析4个温度参数的影响")
print(f"   ✅ 验证3种Hinter提示长度的效果")
print(f"   ✅ 比较3种禁忌词数量的难度")
print(f"   ✅ 支持小规模测试和完整实验")

print(f"\n📋 关键更新:")
print(f"   🔄 数据集: 从dataset.json (300词) 改为 quick80_dataset.json (80词)")
print(f"   📊 实验规模: 更加聚焦，适合快速迭代和深度分析")
print(f"   💾 数据质量: Quick80是高质量精选词汇集")


🎯 Quick80定量分析实验规模:
📊 数据集: Quick80 (80 个原始词汇)
🤖 模型: 2 个
🔄 模型组合: 4 个 (每个模型既做Hinter又做Guesser)
🌡️ 温度参数: 3 个
💬 Hinter提示长度: 3 个
📝 数据集变体: 240 个 (80词汇 × 3种禁忌词数量)

🎮 总实验数: 8,640 个
⏱️ 预计耗时: 1.2 小时 (按每个实验0.5秒估算)

💡 实验设计特点:
   ✅ 涵盖Quick80数据集的全部80个词汇
   ✅ 测试3个目标模型的完整性能
   ✅ 分析4个温度参数的影响
   ✅ 验证3种Hinter提示长度的效果
   ✅ 比较3种禁忌词数量的难度
   ✅ 支持小规模测试和完整实验

📋 关键更新:
   🔄 数据集: 从dataset.json (300词) 改为 quick80_dataset.json (80词)
   📊 实验规模: 更加聚焦，适合快速迭代和深度分析
   💾 数据质量: Quick80是高质量精选词汇集


In [7]:
# 小规模测试实验
print("🧪 开始Quick80小规模测试...")
print("📊 测试配置:")
print(f"   模型: {[m.split('/')[-1] for m in QUANTITATIVE_MODELS]}")
print(f"   温度: {TEMPERATURE_VALUES[:2]}")  # 只用前2个温度
print(f"   禁忌词数量: {TABOO_COUNTS}")
print(f"   提示长度: {HINT_WORD_COUNTS[:2]}")  # 只用前2个长度
print(f"   测试词汇数: 5")

# 创建小规模测试数据集
test_dataset = prepare_quantitative_dataset(dataset, sample_size=5)  # 只用5个词汇
print(f"   测试数据集: {len(test_dataset)} 条记录")

# 计算测试实验数
total_test_experiments = len(QUANTITATIVE_MODELS) * len(QUANTITATIVE_MODELS) * len(test_dataset) * 2 * 2  # 2温度 x 2长度
print(f"   总测试实验数: {total_test_experiments}")
print(f"   预计耗时: {total_test_experiments * 0.5 / 60:.1f} 分钟")

print(f"\n⚠️ 这是一个小规模测试，用于验证系统功能")
print(f"💡 如果测试成功，您可以运行完整的Quick80定量分析实验")


🧪 开始Quick80小规模测试...
📊 测试配置:
   模型: ['deepseek-chat-v3-0324', 'gemini-2.5-flash']
   温度: [0.1, 0.3]
   禁忌词数量: [1, 3, 5]
   提示长度: [1, 5]
   测试词汇数: 5
   测试数据集: 15 条记录
   总测试实验数: 240
   预计耗时: 2.0 分钟

⚠️ 这是一个小规模测试，用于验证系统功能
💡 如果测试成功，您可以运行完整的Quick80定量分析实验


In [16]:
# 简化版Taboo游戏函数 - 用于Quick80测试
def simple_taboo_game(client, hinter_model, guesser_model, target_word, taboo_words, 
                     temperature=0.3, hint_word_count=5, max_turns=5):
    """简化版Taboo游戏，用于Quick80定量分析"""
    
    conversation_history = []
    taboo_list = ', '.join(taboo_words)
    
    # 系统提示
    hinter_system = f"""You are playing a Taboo word game as the Hinter.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[CLUE]' (including square brackets)
- Your clue must contain exactly {hint_word_count} words
- Do NOT add any text before [CLUE]

GAME RULES:
1. NEVER use any of these taboo words: {taboo_list}
2. Give helpful clues to help the guesser find the target word
3. Use BRITISH ENGLISH spelling

Target word: {target_word}
Taboo words (ABSOLUTELY FORBIDDEN): {taboo_list}"""

    guesser_system = """You are playing a Taboo word game as the Guesser.

CRITICAL FORMAT REQUIREMENT:
- You MUST start your response with exactly '[GUESS]' (including square brackets)
- Give only ONE word as your guess after [GUESS]

GAME RULES:
1. Make your best guess based on the clues
2. Use BRITISH ENGLISH spelling"""

    for turn in range(1, max_turns + 1):
        try:
            # Hinter给出线索
            if turn == 1:
                hinter_prompt = f"{hinter_system}\n\nProvide your first clue:"
            else:
                history_text = "\n".join(conversation_history)
                hinter_prompt = f"{hinter_system}\n\nConversation so far:\n{history_text}\n\nProvide your next clue:"
            
            hinter_response = client.call_model_with_temperature(
                hinter_model, [{"role": "user", "content": hinter_prompt}], temperature
            )
            
            # 检查格式和禁忌词
            if not hinter_response.strip().upper().startswith('[CLUE]'):
                return {
                    'success': False, 'turns': turn, 'final_guess': 'FORMAT_ERROR',
                    'failure_reason': 'HINTER_FORMAT_ERROR', 'temperature': temperature,
                    'hint_word_count': hint_word_count, 'conversation': conversation_history
                }
            
            # 提取线索文本
            import re
            match = re.search(r'\[CLUE\]\s*(.+)', hinter_response, re.IGNORECASE)
            if match:
                hint_text = match.group(1).strip()
            else:
                return {
                    'success': False, 'turns': turn, 'final_guess': 'INVALID_CLUE',
                    'failure_reason': 'CLUE_EXTRACTION_ERROR', 'temperature': temperature,
                    'hint_word_count': hint_word_count, 'conversation': conversation_history
                }
            
            # 检查禁忌词违规
            hint_lower = hint_text.lower()
            for taboo in taboo_words:
                if taboo.lower() in hint_lower:
                    return {
                        'success': False, 'turns': turn, 'final_guess': 'TABOO_VIOLATION',
                        'failure_reason': 'TABOO_VIOLATION', 'temperature': temperature,
                        'hint_word_count': hint_word_count, 'conversation': conversation_history,
                        'taboo_violation_hint': hint_text
                    }
            
            # 检查提示长度
            word_count = len(hint_text.split())
            if word_count != hint_word_count:
                return {
                    'success': False, 'turns': turn, 'final_guess': 'HINT_LENGTH_ERROR',
                    'failure_reason': 'HINT_LENGTH_FAILURE', 'temperature': temperature,
                    'hint_word_count': hint_word_count, 'conversation': conversation_history,
                    'actual_word_count': word_count
                }
            
            conversation_history.append(f"Hinter: {hinter_response}")
            
            # Guesser进行猜测
            history_text = "\n".join(conversation_history)
            guesser_prompt = f"{guesser_system}\n\nConversation so far:\n{history_text}\n\nWhat is your guess?"
            
            guesser_response = client.call_model_with_temperature(
                guesser_model, [{"role": "user", "content": guesser_prompt}], temperature
            )
            
            # 检查guesser格式
            if not guesser_response.strip().upper().startswith('[GUESS]'):
                return {
                    'success': False, 'turns': turn, 'final_guess': 'GUESSER_FORMAT_ERROR',
                    'failure_reason': 'GUESSER_FORMAT_ERROR', 'temperature': temperature,
                    'hint_word_count': hint_word_count, 'conversation': conversation_history
                }
            
            conversation_history.append(f"Guesser: {guesser_response}")
            
            # 提取猜测
            match = re.search(r'\[GUESS\]\s*(.+)', guesser_response, re.IGNORECASE)
            if match:
                guess = match.group(1).strip().split()[0]  # 取第一个词
                guess = guess.strip('.,!?;:"\'()[]{}')
            else:
                guess = 'INVALID_GUESS'
            
            # 检查是否成功
            if guess.lower() == target_word.lower():
                return {
                    'success': True, 'turns': turn, 'final_guess': guess,
                    'failure_reason': None, 'temperature': temperature,
                    'hint_word_count': hint_word_count, 'conversation': conversation_history
                }
            
            # 如果不是最后一轮，添加反馈
            if turn < max_turns:
                conversation_history.append(f"System: '{guess}' is not correct. Try again!")
        
        except Exception as e:
            return {
                'success': False, 'turns': turn, 'final_guess': 'API_ERROR',
                'failure_reason': 'API_FAILURE', 'temperature': temperature,
                'hint_word_count': hint_word_count, 'conversation': conversation_history,
                'error': str(e)[:200]
            }
    
    # 达到最大轮数仍未成功
    return {
        'success': False, 'turns': max_turns, 'final_guess': guess if 'guess' in locals() else 'N/A',
        'failure_reason': 'MAX_TURNS_EXCEEDED', 'temperature': temperature,
        'hint_word_count': hint_word_count, 'conversation': conversation_history
    }

print("✅ 简化版Taboo游戏函数已定义")


✅ 简化版Taboo游戏函数已定义


In [18]:
# 🚀 运行Quick80小规模定量分析实验
if quant_client is not None:
    print("🚀 开始运行Quick80小规模定量分析实验...")
    
    # 实验配置
    test_models = QUANTITATIVE_MODELS
    test_temperatures = [0.1, 0.3]  # 只用2个温度进行测试
    test_hint_lengths = [1, 5]      # 只用2个提示长度进行测试
    test_sample = test_dataset[:15]  # 只用前15个数据项（5词汇×3禁忌词变体）
    
    total_test_experiments = len(test_models) * len(test_models) * len(test_sample) * len(test_temperatures) * len(test_hint_lengths)
    
    print(f"📊 测试配置:")
    print(f"   模型: {[m.split('/')[-1] for m in test_models]}")
    print(f"   温度: {test_temperatures}")
    print(f"   提示长度: {test_hint_lengths}")
    print(f"   数据项: {len(test_sample)} 个")
    print(f"   总实验数: {total_test_experiments}")
    print(f"   预计耗时: {total_test_experiments * 0.8 / 60:.1f} 分钟")
    
    # 运行实验
    results = []
    experiment_counter = 0
    start_time = time.time()
    
    for hinter_model in test_models:
        for guesser_model in test_models:
            for temperature in test_temperatures:
                for hint_length in test_hint_lengths:
                    for data_item in test_sample:
                        experiment_counter += 1
                        
                        # 进度显示
                        progress = (experiment_counter / total_test_experiments) * 100
                        hinter_name = hinter_model.split('/')[-1]
                        guesser_name = guesser_model.split('/')[-1]
                        
                        print(f\"\\r🔄 实验 {experiment_counter}/{total_test_experiments} ({progress:.1f}%) | {hinter_name}→{guesser_name} | T={temperature} | L={hint_length} | {data_item['target']}\", end=\"\")\n",
                        \n",
                        # 执行游戏\n",
                        game_start = time.time()\n",
                        game_result = simple_taboo_game(\n",
                            quant_client, hinter_model, guesser_model,\n",
                            data_item['target'], data_item['taboo'],\n",
                            temperature=temperature,\n",
                            hint_word_count=hint_length,\n",
                            max_turns=5\n",
                        )\n",
                        game_duration = round(time.time() - game_start, 2)\n",
                        \n",
                        # 记录结果\n",
                        result = {\n",
                            'experiment_id': f\"{experiment_counter:04d}\",\n",
                            'hinter_model': hinter_model,\n",
                            'guesser_model': guesser_model,\n",
                            'target_word': data_item['target'],\n",
                            'category': data_item.get('category', 'unknown'),\n",
                            'taboo_words': '|'.join(data_item['taboo']),\n",
                            'taboo_count': len(data_item['taboo']),\n",
                            'variant_taboo_count': data_item.get('variant_taboo_count', len(data_item['taboo'])),\n",
                            'temperature': temperature,\n",
                            'hint_word_count': hint_length,\n",
                            'success': game_result['success'],\n",
                            'turns_used': game_result['turns'],\n",
                            'final_guess': game_result['final_guess'],\n",
                            'failure_reason': game_result.get('failure_reason', None),\n",
                            'taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',\n",
                            'hint_length_violation': game_result.get('failure_reason') == 'HINT_LENGTH_FAILURE',\n",
                            'actual_word_count': game_result.get('actual_word_count', hint_length),\n",
                            'conversation_length': len(game_result.get('conversation', [])),\n",
                            'duration_seconds': game_duration,\n",
                            'timestamp': datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
                        }\n",
                        \n",
                        if 'error' in game_result:\n",
                            result['error'] = game_result['error']\n",
                        if 'taboo_violation_hint' in game_result:\n",
                            result['taboo_violation_hint'] = game_result['taboo_violation_hint']\n",
                        \n",
                        results.append(result)\n",
                        \n",
                        # 每10个实验显示一次中间统计\n",
                        if experiment_counter % 10 == 0:\n",
                            recent_results = results[-10:]\n",
                            recent_success = sum(r['success'] for r in recent_results)\n",
                            print(f\"\\n   📊 最近10个: 成功{recent_success}/10 ({recent_success*10}%)\")\n",
                        \n",
                        time.sleep(0.5)  # API调用间隔\n",
    \n",
    "    total_duration = time.time() - start_time\n",
    "    \n",
    "    print(f\"\\n\\n✅ Quick80小规模实验完成！\")\n",
    "    print(f\"⏱️ 总耗时: {total_duration/60:.1f} 分钟\")\n",
    "    print(f\"📊 实验结果: {len(results)} 个\")\n",
    "    \n",
    "    # 基本统计\n",
    "    if results:\n",
    "        df = pd.DataFrame(results)\n",
    "        \n",
    "        total_success = sum(r['success'] for r in results)\n",
    "        success_rate = total_success / len(results) * 100\n",
    "        \n",
    "        print(f\"📈 整体成功率: {success_rate:.1f}% ({total_success}/{len(results)})\")\n",
    "        \n",
    "        # 按温度统计\n",
    "        print(f\"\\n🌡️ 温度影响:\")\n",
    "        for temp in test_temperatures:\n",
    "            temp_results = df[df['temperature'] == temp]\n",
    "            temp_success = temp_results['success'].sum()\n",
    "            temp_rate = temp_success / len(temp_results) * 100 if len(temp_results) > 0 else 0\n",
    "            print(f\"   T={temp}: {temp_success}/{len(temp_results)} ({temp_rate:.1f}%)\")\n",
    "        \n",
    "        # 按提示长度统计\n",
    "        print(f\"\\n💬 提示长度影响:\")\n",
    "        for length in test_hint_lengths:\n",
    "            length_results = df[df['hint_word_count'] == length]\n",
    "            length_success = length_results['success'].sum()\n",
    "            length_rate = length_success / len(length_results) * 100 if len(length_results) > 0 else 0\n",
    "            print(f\"   {length}词: {length_success}/{len(length_results)} ({length_rate:.1f}%)\")\n",
    "        \n",
    "        # 按模型统计\n",
    "        print(f\"\\n🤖 模型性能 (Hinter):\")\n",
    "        for model in test_models:\n",
    "            model_results = df[df['hinter_model'] == model]\n",
    "            model_success = model_results['success'].sum()\n",
    "            model_rate = model_success / len(model_results) * 100 if len(model_results) > 0 else 0\n",
    "            model_name = model.split('/')[-1]\n",
    "            print(f\"   {model_name}: {model_success}/{len(model_results)} ({model_rate:.1f}%)\")\n",
    "        \n",
    "        # 失败原因统计\n",
    "        failed_results = df[df['success'] == False]\n",
    "        if len(failed_results) > 0:\n",
    "            print(f\"\\n❌ 失败原因:\")\n",
    "            failure_counts = failed_results['failure_reason'].value_counts()\n",
    "            for reason, count in failure_counts.items():\n",
    "                percentage = count / len(failed_results) * 100\n",
    "                print(f\"   {reason}: {count} 次 ({percentage:.1f}%)\")\n",
    "        \n",
    "        # 保存结果\n",
    "        timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "        output_path = f\"results/quick80_test_{timestamp}.csv\"\n",
    "        os.makedirs(\"results\", exist_ok=True)\n",
    "        df.to_csv(output_path, index=False, encoding='utf-8')\n",
    "        print(f\"\\n💾 结果已保存: {output_path}\")\n",
    "        \n",
    "        print(f\"\\n🎉 Quick80小规模实验成功完成！\")\n",
    "        print(f\"💡 如果结果满意，可以运行完整的Quick80定量分析实验\")\n",
    "    \n",
    "else:\n",
    "    print(\"❌ API客户端未初始化，无法运行实验\")\n",
    "    print(\"💡 请检查api_keys.json文件是否存在且包含正确的API密钥\")"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "vscode": {
     "languageId": "raw"
    }
   },
   "source": [
    "# 执行Quick80完整定量分析实验（可选）\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 🚀 运行Quick80完整定量分析实验（取消注释以运行）\n",
    "\"\"\"\n",
    "if quant_client is not None and input(\"是否运行完整实验？(输入'yes'确认): \") == 'yes':\n",
    "    print(\"🚀 开始运行Quick80完整定量分析实验...\")\n",
    "    \n",
    "    # 使用全部配置\n",
    "    full_models = QUANTITATIVE_MODELS\n",
    "    full_temperatures = TEMPERATURE_VALUES  # 全部4个温度\n",
    "    full_hint_lengths = HINT_WORD_COUNTS    # 全部3个提示长度\n",
    "    full_dataset = quant_dataset            # 全部240个数据项\n",
    "    \n",
    "    total_full_experiments = len(full_models) * len(full_models) * len(full_dataset) * len(full_temperatures) * len(full_hint_lengths)\n",
    "    \n",
    "    print(f\"📊 完整实验配置:\")\n",
    "    print(f\"   模型组合: {len(full_models)}×{len(full_models)} = {len(full_models)**2}\")\n",
    "    print(f\"   温度值: {len(full_temperatures)} 个\")\n",
    "    print(f\"   提示长度: {len(full_hint_lengths)} 个\")\n",
    "    print(f\"   数据项: {len(full_dataset)} 个\")\n",
    "    print(f\"   总实验数: {total_full_experiments:,}\")\n",
    "    print(f\"   预计耗时: {total_full_experiments * 0.8 / 3600:.1f} 小时\")\n",
    "    \n",
    "    # 这里可以添加完整实验的代码...\n",
    "    print(\"\\n💡 完整实验代码可以基于上面的小规模实验进行扩展\")\n",
    "    \n",
    "else:\n",
    "    print(\"💡 完整实验未运行\")\n",
    "\"\"\"\n",
    "\n",
    "print(\"💡 完整实验代码已准备，需要时可以取消注释运行\")\n",
    "print(\"📋 建议: 先分析小规模实验结果，确认配置无误后再运行完整实验\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 204)