In [23]:
import sys
!{sys.executable} -m pip install OpenHowNet jieba requests pandas numpy




In [24]:
# 1. 导入依赖和设置环境
import json
import pandas as pd
import random
import time
import requests
import os
import jieba
import re
from typing import Dict, List, Any, Tuple
from datetime import datetime
from collections import Counter

# 安装和导入OpenHowNet
try:
    import OpenHowNet
    print("✅ OpenHowNet已导入")
except ImportError:
    print("⚠️ 正在安装OpenHowNet...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "OpenHowNet"])
    import OpenHowNet
    print("✅ OpenHowNet安装并导入成功")

print("🚀 中文Taboo实验环境初始化完成")
print("📋 实验目标: 使用OpenHowNet构建100个中文词汇的Taboo数据集")
print("🎯 词性分布: 名词、动词、形容词、副词各25个")


✅ OpenHowNet已导入
🚀 中文Taboo实验环境初始化完成
📋 实验目标: 使用OpenHowNet构建100个中文词汇的Taboo数据集
🎯 词性分布: 名词、动词、形容词、副词各25个


In [25]:
# 深入测试OpenHowNet API
if hownet_dict:
    print("🔍 深入测试OpenHowNet API结构...")
    
    # 测试词汇
    test_words = ["计算机", "学习", "美丽", "快速"]
    
    for test_word in test_words:
        print(f"\n📝 测试词汇: {test_word}")
        try:
            senses = hownet_dict.get_sense(test_word)
            if senses:
                print(f"   义项数量: {len(senses)}")
                sense = senses[0]
                print(f"   第一个义项类型: {type(sense)}")
                
                # 检查所有可能的属性
                if hasattr(sense, '__dict__'):
                    attrs = list(sense.__dict__.keys())
                    print(f"   所有属性: {attrs}")
                else:
                    # 尝试常见属性
                    common_attrs = ['zh_word', 'en_word', 'pos', 'zh_grammar', 'definition', 'Def', 'def']
                    available_attrs = []
                    for attr in common_attrs:
                        if hasattr(sense, attr):
                            value = getattr(sense, attr)
                            available_attrs.append(f"{attr}={value}")
                    print(f"   可用属性: {available_attrs}")
                
                # 尝试调用一些方法
                try:
                    if hasattr(sense, 'zh_word'):
                        print(f"   中文词: {sense.zh_word}")
                    if hasattr(sense, 'pos'):
                        print(f"   词性(pos): {sense.pos}")
                    if hasattr(sense, 'zh_grammar'):
                        print(f"   中文词性(zh_grammar): {sense.zh_grammar}")
                except Exception as e:
                    print(f"   属性访问错误: {e}")
            else:
                print(f"   未找到义项")
        except Exception as e:
            print(f"   测试错误: {e}")
    
    # 测试词汇列表方法
    print(f"\n📊 词汇获取方法测试:")
    try:
        zh_words = hownet_dict.get_zh_words()
        print(f"   get_zh_words() 返回类型: {type(zh_words)}")
        print(f"   词汇数量: {len(zh_words)}")
        print(f"   前10个词汇: {list(zh_words)[:10]}")
    except Exception as e:
        print(f"   get_zh_words() 错误: {e}")
        
else:
    print("❌ hownet_dict 为 None，跳过测试")


🔍 深入测试OpenHowNet API结构...

📝 测试词汇: 计算机
   义项数量: 1
   第一个义项类型: <class 'OpenHowNet.Sense.Sense'>
   所有属性: ['No', 'en_word', 'en_grammar', 'zh_word', 'zh_grammar', 'Def', 'sememes']
   中文词: 计算机
   中文词性(zh_grammar): noun

📝 测试词汇: 学习
   义项数量: 5
   第一个义项类型: <class 'OpenHowNet.Sense.Sense'>
   所有属性: ['No', 'en_word', 'en_grammar', 'zh_word', 'zh_grammar', 'Def', 'sememes']
   中文词: 学习
   中文词性(zh_grammar): verb

📝 测试词汇: 美丽
   义项数量: 3
   第一个义项类型: <class 'OpenHowNet.Sense.Sense'>
   所有属性: ['No', 'en_word', 'en_grammar', 'zh_word', 'zh_grammar', 'Def', 'sememes']
   中文词: 美丽
   中文词性(zh_grammar): adj

📝 测试词汇: 快速
   义项数量: 4
   第一个义项类型: <class 'OpenHowNet.Sense.Sense'>
   所有属性: ['No', 'en_word', 'en_grammar', 'zh_word', 'zh_grammar', 'Def', 'sememes']
   中文词: 快速
   中文词性(zh_grammar): adj

📊 词汇获取方法测试:
   get_zh_words() 返回类型: <class 'list'>
   词汇数量: 135009
   前10个词汇: ['', '深圳乐家精品服务公寓', '临床表现为', '休息', '打小算盘', '湖底', '赏格', '超短裙', '襑', '岐']


In [26]:
# 2. 初始化OpenHowNet和中文处理工具
print("🔧 正在初始化OpenHowNet和中文处理工具...")

# 初始化OpenHowNet实例
try:
    # 首先尝试下载数据
    print("📥 正在下载OpenHowNet数据...")
    OpenHowNet.download()
    print("✅ OpenHowNet数据下载完成")
except Exception as e:
    print(f"⚠️ 下载过程中出现警告: {e}")

try:
    hownet_dict = OpenHowNet.HowNetDict()
    print("✅ OpenHowNet词典加载成功")
    
    # 测试API方法
    zh_words = hownet_dict.get_zh_words()
    print(f"📚 词典包含中文词汇数量: {len(zh_words)} 个")
    
    # 测试get_sense方法
    test_sense = hownet_dict.get_sense("计算机")
    if test_sense:
        print(f"✅ API测试成功，'计算机'有 {len(test_sense)} 个义项")
        print(f"📋 第一个义项数据结构:")
        print(f"   类型: {type(test_sense[0])}")
        print(f"   内容: {test_sense[0]}")
        if hasattr(test_sense[0], '__dict__'):
            print(f"   属性: {list(test_sense[0].__dict__.keys())}")
    else:
        print("⚠️ 测试词汇'计算机'未找到义项")
        
except Exception as e:
    print(f"❌ OpenHowNet初始化失败: {e}")
    print("🔄 尝试重新初始化...")
    hownet_dict = None

# 设置jieba分词
jieba.setLogLevel(20)  # 减少jieba的日志输出
print("✅ jieba分词工具已配置")

# 设置随机种子
random.seed(42)
print("🎲 随机种子已设置为42，确保实验可复现")


🔧 正在初始化OpenHowNet和中文处理工具...
📥 正在下载OpenHowNet数据...


resources/resources.zip: 72948KB [00:06, 10870.59KB/s]                          


✅ OpenHowNet数据下载完成
Initializing OpenHowNet succeeded!
✅ OpenHowNet词典加载成功
📚 词典包含中文词汇数量: 135009 个
✅ API测试成功，'计算机'有 1 个义项
📋 第一个义项数据结构:
   类型: <class 'OpenHowNet.Sense.Sense'>
   内容: No.255809|computer|计算机
   属性: ['No', 'en_word', 'en_grammar', 'zh_word', 'zh_grammar', 'Def', 'sememes']
✅ jieba分词工具已配置
🎲 随机种子已设置为42，确保实验可复现


In [27]:
# 检查并重新初始化 HowNet（如果需要）
print("🔍 检查 HowNet 词典状态...")

try:
    # 检查 hownet_dict 是否已定义且有效
    if 'hownet_dict' not in globals():
        print("⚠️ hownet_dict 未定义，正在初始化...")
        hownet_dict = None
    elif hownet_dict is None:
        print("⚠️ hownet_dict 为 None，正在重新初始化...")
    else:
        print("✅ hownet_dict 已存在且有效")
        
    # 如果需要，重新初始化
    if hownet_dict is None:
        try:
            import OpenHowNet
            print("📥 正在初始化 OpenHowNet...")
            hownet_dict = OpenHowNet.HowNetDict()
            print("✅ OpenHowNet 词典重新初始化成功")
            
            # 测试功能
            test_words = hownet_dict.get_zh_words()
            print(f"📚 词典包含 {len(test_words)} 个中文词汇")
            
        except Exception as e:
            print(f"❌ OpenHowNet 初始化失败: {e}")
            hownet_dict = None
            
except Exception as e:
    print(f"❌ 检查过程出错: {e}")
    hownet_dict = None

print(f"🎯 最终状态: hownet_dict = {type(hownet_dict) if hownet_dict else 'None'}")


🔍 检查 HowNet 词典状态...
✅ hownet_dict 已存在且有效
🎯 最终状态: hownet_dict = <class 'OpenHowNet.HowNetDict.HowNetDict'>


In [28]:
# 3. 中文词汇数据集构建工具函数

def get_pos_mapping():
    """HowNet词性到标准词性的映射"""
    return {
        # 名词类
        'N': 'noun', 'noun': 'noun',
        # 动词类  
        'V': 'verb', 'verb': 'verb',
        # 形容词类
        'A': 'adj', 'adj': 'adj', 'a': 'adj',
        # 副词类
        'D': 'adv', 'adv': 'adv', 'd': 'adv'
    }

def is_valid_chinese_word(word: str) -> bool:
    """检查是否为有效的中文词汇"""
    if not word or len(word) < 1:
        return False
    
    # 检查是否包含中文字符
    chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')
    if not chinese_pattern.search(word):
        return False
    
    # 过滤过长或过短的词
    if len(word) > 6 or len(word) < 1:
        return False
    
    # 过滤包含特殊字符的词
    special_chars = ['·', '—', '…', '〈', '〉', '《', '》', '「', '」']
    if any(char in word for char in special_chars):
        return False
    
    return True

def extract_similar_words_from_hownet(target_word: str, target_pos: str, hownet_dict, max_count: int = 10) -> List[str]:
    """从HowNet中提取与目标词相似的词汇作为禁用词候选"""
    similar_words = set()
    
    try:
        # 获取目标词的义项
        word_senses = hownet_dict.get_sense(target_word)
        if not word_senses:
            return []
        
        # 从第一个义项开始提取相似词
        primary_sense = word_senses[0]
        
        # 方法1: 从义项中提取相关词汇
        try:
            # 获取所有包含该词的相关义项信息
            for sense in word_senses[:3]:  # 取前3个义项
                # 提取中文词 - 使用属性而不是字典访问
                if hasattr(sense, 'zh_word') and sense.zh_word and is_valid_chinese_word(sense.zh_word) and sense.zh_word != target_word:
                    similar_words.add(sense.zh_word)
                    
                # 尝试获取同义词 - 不同的属性名称
                syn_attrs = ['syn', 'synonyms', 'similar_words']
                for syn_attr in syn_attrs:
                    if hasattr(sense, syn_attr):
                        syn_data = getattr(sense, syn_attr)
                        if syn_data:
                            # 处理不同的同义词数据格式
                            if isinstance(syn_data, list):
                                for syn_item in syn_data[:5]:
                                    if isinstance(syn_item, str):
                                        syn_word = syn_item
                                    elif hasattr(syn_item, 'text'):
                                        syn_word = syn_item.text
                                    elif hasattr(syn_item, 'word'):
                                        syn_word = syn_item.word
                                    else:
                                        continue
                                    
                                    if is_valid_chinese_word(syn_word) and syn_word != target_word:
                                        similar_words.add(syn_word)
                        break
        except Exception as e:
            print(f"⚠️ 提取同义词时出错: {e}")
            pass
        
        # 方法2: 从义原定义中提取关键词
        try:
            for sense in word_senses[:2]:  # 取前2个义项
                # 尝试不同的定义属性名称
                definition = ""
                def_attrs = ['Def', 'definition', 'def', 'meaning']
                for def_attr in def_attrs:
                    if hasattr(sense, def_attr):
                        definition = getattr(sense, def_attr)
                        if definition:
                            break
                
                if definition:
                    # 使用jieba分词提取定义中的中文词汇
                    words_in_def = jieba.lcut(definition)
                    for word in words_in_def:
                        if is_valid_chinese_word(word) and word != target_word and len(word) >= 2:
                            similar_words.add(word)
        except Exception as e:
            print(f"⚠️ 提取定义关键词时出错: {e}")
            pass
        
        # 方法3: 从相关词汇中提取（基于词性）
        try:
            # 获取相同词性的相关词汇
            target_pos = ""
            pos_attrs = ['zh_grammar', 'pos', 'part_of_speech']
            for pos_attr in pos_attrs:
                if hasattr(primary_sense, pos_attr):
                    target_pos = getattr(primary_sense, pos_attr)
                    if target_pos:
                        break
            
            if target_pos:
                # 从已获得的相关词汇中进一步筛选（限制数量避免过度搜索）
                similar_words_list = list(similar_words)[:10]  # 减少搜索范围
                for word in similar_words_list:
                    try:
                        related_senses = hownet_dict.get_sense(word)
                        if related_senses:
                            for related_sense in related_senses[:1]:  # 只检查第一个义项
                                related_pos = ""
                                for pos_attr in pos_attrs:
                                    if hasattr(related_sense, pos_attr):
                                        related_pos = getattr(related_sense, pos_attr)
                                        if related_pos:
                                            break
                                
                                if related_pos == target_pos:
                                    # 尝试获取相关词汇
                                    if hasattr(related_sense, 'zh_word') and related_sense.zh_word:
                                        related_word = related_sense.zh_word
                                        if is_valid_chinese_word(related_word) and related_word != target_word:
                                            similar_words.add(related_word)
                    except:
                        continue
        except Exception as e:
            print(f"⚠️ 基于词性提取相关词时出错: {e}")
            pass
    
    except Exception as e:
        print(f"⚠️ 提取 {target_word} 的相似词时出错: {e}")
    
    # 过滤并返回结果
    result = [word for word in similar_words if is_valid_chinese_word(word)][:max_count]
    return result

print("✅ 中文词汇处理工具函数已定义")


✅ 中文词汇处理工具函数已定义


In [29]:
# 修复的中文Taboo数据集构建（基于OpenHowNet）
print("🏗️ 开始构建中文Taboo数据集...")

def build_chinese_taboo_dataset_corrected(hownet_dict, target_count_per_pos: int = 25):
    """构建中文Taboo数据集 - 修复版"""
    
    if hownet_dict is None:
        print("❌ HowNet词典未初始化，无法构建数据集")
        return []
    
    pos_mapping = get_pos_mapping()
    target_pos_list = ['noun', 'verb', 'adj', 'adv']
    dataset = []
    
    print(f"📊 目标: 每个词性 {target_count_per_pos} 个词，总计 {target_count_per_pos * 4} 个词")
    
    # 获取HowNet中文词汇表 - 使用正确的API方法
    try:
        print("🔍 正在获取HowNet中文词汇...")
        zh_words = hownet_dict.get_zh_words()
        chinese_vocab = [word for word in zh_words if is_valid_chinese_word(word)]
        print(f"📚 HowNet中文词汇总数: {len(chinese_vocab)} 个")
    except Exception as e:
        print(f"❌ 获取中文词汇失败: {e}")
        print("💡 提示: 确保OpenHowNet数据已正确下载")
        return []
    
    # 按词性分组收集词汇
    words_by_pos = {pos: [] for pos in target_pos_list}
    
    print("🔍 正在分析词汇词性...")
    progress_count = 0
    
    for word in chinese_vocab[:5000]:  # 限制处理前5000个词汇以节省时间
        progress_count += 1
        if progress_count % 500 == 0:
            print(f"   已处理 {progress_count}/{min(5000, len(chinese_vocab))} 个词汇")
        
        try:
            # 获取词汇的义项信息
            senses = hownet_dict.get_sense(word)
            if not senses:
                continue
            
            # 获取主要词性
            primary_sense = senses[0]
            
            # 尝试不同的词性获取方法
            pos_info = None
            for attr in ['zh_grammar', 'pos', 'part_of_speech']:
                if hasattr(primary_sense, attr):
                    pos_info = getattr(primary_sense, attr)
                    if pos_info:
                        break
            
            if not pos_info:
                continue
                
            # 映射到标准词性
            standard_pos = pos_mapping.get(pos_info, None)
            if standard_pos and standard_pos in target_pos_list:
                words_by_pos[standard_pos].append({
                    'word': word,
                    'senses': senses,
                    'primary_pos': standard_pos
                })
        
        except Exception:
            continue
    
    print(f"\n📈 词性分布统计:")
    for pos, words in words_by_pos.items():
        print(f"   {pos}: {len(words)} 个候选词")
    
    # 为每个词性随机选择指定数量的词汇
    print(f"\n🎯 开始选择目标词汇并生成禁用词...")
    
    for pos in target_pos_list:
        available_words = words_by_pos[pos]
        if len(available_words) == 0:
            print(f"⚠️ {pos} 词性无可用词汇，跳过")
            continue
            
        selected_count = min(target_count_per_pos, len(available_words))
        
        # 随机选择词汇
        selected_words = random.sample(available_words, selected_count)
        print(f"\n🔄 正在处理 {pos} 类词汇 ({selected_count} 个)...")
        
        for i, word_info in enumerate(selected_words):
            target_word = word_info['word']
            senses = word_info['senses']
            
            print(f"   处理 {i+1}/{selected_count}: {target_word}")
            
            # 生成禁用词
            taboo_words = extract_similar_words_from_hownet(
                target_word, pos, hownet_dict, max_count=8
            )
            
            # 确保至少有5个禁用词
            if len(taboo_words) < 5:
                # 根据词性添加通用禁用词
                generic_mapping = {
                    'noun': ['东西', '物品', '事物', '对象', '名词'],
                    'verb': ['动作', '行为', '做', '进行', '活动'],
                    'adj': ['特征', '性质', '状态', '形容', '描述'],
                    'adv': ['方式', '程度', '如何', '状况', '修饰']
                }
                
                generic_taboos = generic_mapping.get(pos, ['相关', '概念', '词汇', '内容', '意思'])
                for generic in generic_taboos:
                    if generic not in taboo_words and generic != target_word:
                        taboo_words.append(generic)
                        if len(taboo_words) >= 5:
                            break
            
            # 将 Sense 对象转换为可序列化的字典格式
            serializable_senses = []
            for sense in senses:
                sense_dict = {
                    'zh_word': getattr(sense, 'zh_word', ''),
                    'en_word': getattr(sense, 'en_word', ''),
                    'zh_grammar': getattr(sense, 'zh_grammar', ''),
                    'en_grammar': getattr(sense, 'en_grammar', ''),
                    'Def': getattr(sense, 'Def', ''),
                    'No': getattr(sense, 'No', ''),
                    'sememes': str(getattr(sense, 'sememes', []))  # 转换为字符串
                }
                serializable_senses.append(sense_dict)
            
            # 构建数据集条目
            entry = {
                'target': target_word,
                'part_of_speech': pos,
                'taboo': taboo_words[:5],  # 确保正好5个禁用词
                'category': 'chinese_hownet',
                'senses': serializable_senses,  # 使用可序列化的版本
                'metadata': {
                    'sense_count': len(senses),
                    'taboo_count': len(taboo_words[:5]),
                    'source': 'openhownet_corrected'
                }
            }
            
            dataset.append(entry)
    
    return dataset

# 构建数据集
if hownet_dict:
    chinese_dataset = build_chinese_taboo_dataset_corrected(hownet_dict, target_count_per_pos=25)
    print(f"\n✅ 中文Taboo数据集构建完成！")
    print(f"📊 总词汇数: {len(chinese_dataset)} 个")
else:
    print("❌ OpenHowNet未正确初始化，无法构建数据集")
    chinese_dataset = []


🏗️ 开始构建中文Taboo数据集...
📊 目标: 每个词性 25 个词，总计 100 个词
🔍 正在获取HowNet中文词汇...
📚 HowNet中文词汇总数: 130347 个
🔍 正在分析词汇词性...
   已处理 500/5000 个词汇
   已处理 1000/5000 个词汇
   已处理 1500/5000 个词汇
   已处理 2000/5000 个词汇
   已处理 2500/5000 个词汇
   已处理 3000/5000 个词汇
   已处理 3500/5000 个词汇
   已处理 4000/5000 个词汇
   已处理 4500/5000 个词汇
   已处理 5000/5000 个词汇

📈 词性分布统计:
   noun: 2528 个候选词
   verb: 1257 个候选词
   adj: 490 个候选词
   adv: 77 个候选词

🎯 开始选择目标词汇并生成禁用词...

🔄 正在处理 noun 类词汇 (25 个)...
   处理 1/25: 人工岛
   处理 2/25: 靳
   处理 3/25: 近几个月来
   处理 4/25: 虚数
   处理 5/25: 供需矛盾
   处理 6/25: 苦竹
   处理 7/25: 长发
   处理 8/25: 多米诺骨牌
   处理 9/25: 单位
   处理 10/25: 协奏曲
   处理 11/25: 盐浓度
   处理 12/25: 冤假错案
   处理 13/25: 饧
   处理 14/25: 军博
   处理 15/25: 球果
   处理 16/25: 日光节约时间
   处理 17/25: 海伦娜
   处理 18/25: 康马县
   处理 19/25: 入场券
   处理 20/25: 咬翼片
   处理 21/25: 双轨
   处理 22/25: 冰镩
   处理 23/25: 送信人
   处理 24/25: 海商法
   处理 25/25: 生命迹象

🔄 正在处理 verb 类词汇 (25 个)...
   处理 1/25: 虚掷
   处理 2/25: 解除
   处理 3/25: 矍
   处理 4/25: 打八折
   处理 5/25: 猛涨
   处理 6/25: 老羞成怒
   处理 7/25: 身心交瘁
   处

In [30]:
# 4. 构建中文Taboo数据集
print("🏗️ 开始构建中文Taboo数据集...")

def build_chinese_taboo_dataset(hownet_dict, target_count_per_pos: int = 25) -> List[Dict[str, Any]]:
    """构建中文Taboo数据集"""
    
    if hownet_dict is None:
        print("❌ HowNet词典未初始化，无法构建数据集")
        return []
    
    pos_mapping = get_pos_mapping()
    target_pos_list = ['noun', 'verb', 'adj', 'adv']
    dataset = []
    
    print(f"📊 目标: 每个词性 {target_count_per_pos} 个词，总计 {target_count_per_pos * 4} 个词")
    
    # 获取HowNet中文词汇表
    chinese_vocab = hownet_dict.get_zh_words()
    chinese_vocab = [word for word in chinese_vocab if is_valid_chinese_word(word)]
    print(f"📚 HowNet中文词汇总数: {len(chinese_vocab)} 个")
    
    # 按词性分组收集词汇
    words_by_pos = {pos: [] for pos in target_pos_list}
    
    print("🔍 正在分析词汇词性...")
    progress_count = 0
    
    for word in chinese_vocab:
        progress_count += 1
        if progress_count % 1000 == 0:
            print(f"   已处理 {progress_count}/{len(chinese_vocab)} 个词汇")
        
        try:
            # 获取词汇的义项信息
            senses = hownet_dict.get_sense(word)
            if not senses:
                continue
            
            # 获取主要词性 - OpenHowNet的Sense对象使用属性而不是字典
            primary_sense = senses[0]
            
            # 尝试不同的词性获取方法
            pos_info = None
            if hasattr(primary_sense, 'zh_grammar'):
                pos_info = primary_sense.zh_grammar
            elif hasattr(primary_sense, 'pos'):
                pos_info = primary_sense.pos
            elif hasattr(primary_sense, 'part_of_speech'):
                pos_info = primary_sense.part_of_speech
            
            if not pos_info:
                continue
                
            # 映射到标准词性
            standard_pos = pos_mapping.get(pos_info, None)
            if standard_pos and standard_pos in target_pos_list:
                words_by_pos[standard_pos].append({
                    'word': word,
                    'senses': senses,
                    'primary_pos': standard_pos
                })
        
        except Exception:
            continue
    
    print("\n📈 词性分布统计:")
    for pos, words in words_by_pos.items():
        print(f"   {pos}: {len(words)} 个候选词")
    
    # 为每个词性随机选择指定数量的词汇
    print("\n🎯 开始选择目标词汇并生成禁用词...")
    
    for pos in target_pos_list:
        available_words = words_by_pos[pos]
        if len(available_words) < target_count_per_pos:
            print(f"⚠️ {pos} 词性可用词汇不足 ({len(available_words)} < {target_count_per_pos})")
            selected_count = len(available_words)
        else:
            selected_count = target_count_per_pos
        
        # 随机选择词汇
        selected_words = random.sample(available_words, selected_count)
        print(f"\n🔄 正在处理 {pos} 类词汇 ({selected_count} 个)...")
        
        for i, word_info in enumerate(selected_words):
            target_word = word_info['word']
            senses = word_info['senses']
            
            print(f"   处理 {i+1}/{selected_count}: {target_word}")
            
            # 生成禁用词
            taboo_words = extract_similar_words_from_hownet(
                target_word, pos, hownet_dict, max_count=8
            )
            
            # 如果禁用词不够，添加一些通用的相关词
            if len(taboo_words) < 5:
                # 使用jieba分词从定义中提取更多词汇
                for sense in senses[:2]:  # 只取前两个义项
                    # 正确使用属性访问而不是字典访问
                    definition = getattr(sense, 'Def', '') if hasattr(sense, 'Def') else ''
                    def_words = jieba.lcut(definition)
                    for def_word in def_words:
                        if (is_valid_chinese_word(def_word) and 
                            def_word != target_word and 
                            len(def_word) >= 2 and 
                            def_word not in taboo_words):
                            taboo_words.append(def_word)
                            if len(taboo_words) >= 5:
                                break
            
            # 确保至少有5个禁用词
            taboo_words = taboo_words[:5]  # 限制为5个
            if len(taboo_words) < 5:
                # 如果还是不够，添加一些通用词汇
                generic_taboos = ['东西', '事物', '物品', '概念', '内容']
                for generic in generic_taboos:
                    if generic not in taboo_words and generic != target_word:
                        taboo_words.append(generic)
                        if len(taboo_words) >= 5:
                            break
            
            # 将 Sense 对象转换为可序列化的字典格式
            serializable_senses = []
            for sense in senses:
                sense_dict = {
                    'zh_word': getattr(sense, 'zh_word', ''),
                    'en_word': getattr(sense, 'en_word', ''),
                    'zh_grammar': getattr(sense, 'zh_grammar', ''),
                    'en_grammar': getattr(sense, 'en_grammar', ''),
                    'Def': getattr(sense, 'Def', ''),
                    'No': getattr(sense, 'No', ''),
                    'sememes': str(getattr(sense, 'sememes', []))  # 转换为字符串
                }
                serializable_senses.append(sense_dict)
            
            # 构建数据集条目
            entry = {
                'target': target_word,
                'part_of_speech': pos,
                'taboo': taboo_words[:5],  # 确保正好5个禁用词
                'category': 'chinese_general',
                'senses': serializable_senses,  # 使用可序列化的版本
                'metadata': {
                    'sense_count': len(senses),
                    'taboo_count': len(taboo_words[:5]),
                    'source': 'openhownet'
                }
            }
            
            dataset.append(entry)
    
    return dataset

# 构建数据集
chinese_dataset = build_chinese_taboo_dataset(hownet_dict, target_count_per_pos=25)
print(f"\n✅ 中文Taboo数据集构建完成！")
print(f"📊 总词汇数: {len(chinese_dataset)} 个")


🏗️ 开始构建中文Taboo数据集...
📊 目标: 每个词性 25 个词，总计 100 个词
📚 HowNet中文词汇总数: 130347 个
🔍 正在分析词汇词性...
   已处理 1000/130347 个词汇
   已处理 2000/130347 个词汇
   已处理 3000/130347 个词汇
   已处理 4000/130347 个词汇
   已处理 5000/130347 个词汇
   已处理 6000/130347 个词汇
   已处理 7000/130347 个词汇
   已处理 8000/130347 个词汇
   已处理 9000/130347 个词汇
   已处理 10000/130347 个词汇
   已处理 11000/130347 个词汇
   已处理 12000/130347 个词汇
   已处理 13000/130347 个词汇
   已处理 14000/130347 个词汇
   已处理 15000/130347 个词汇
   已处理 16000/130347 个词汇
   已处理 17000/130347 个词汇
   已处理 18000/130347 个词汇
   已处理 19000/130347 个词汇
   已处理 20000/130347 个词汇
   已处理 21000/130347 个词汇
   已处理 22000/130347 个词汇
   已处理 23000/130347 个词汇
   已处理 24000/130347 个词汇
   已处理 25000/130347 个词汇
   已处理 26000/130347 个词汇
   已处理 27000/130347 个词汇
   已处理 28000/130347 个词汇
   已处理 29000/130347 个词汇
   已处理 30000/130347 个词汇
   已处理 31000/130347 个词汇
   已处理 32000/130347 个词汇
   已处理 33000/130347 个词汇
   已处理 34000/130347 个词汇
   已处理 35000/130347 个词汇
   已处理 36000/130347 个词汇
   已处理 37000/130347 个词汇
   已处理 38000/130347 个词汇
   已处理 390

In [1]:
import random
import json

# 如果chinese_dataset还没加载，先加载
with open('taboo_benchmark/data/chinese_dataset.json', 'r', encoding='utf-8') as f:
    chinese_dataset = json.load(f)

# 按词性分组
pos_groups = {'noun': [], 'verb': [], 'adj': [], 'adv': []}
for item in chinese_dataset:
    pos = item.get('part_of_speech')
    if pos in pos_groups:
        pos_groups[pos].append(item)

# 每类随机抽取10个
sampled_dataset = []
for pos, items in pos_groups.items():
    sampled = random.sample(items, min(10, len(items)))
    sampled_dataset.extend(sampled)

# 检查结果
for pos in pos_groups:
    count = len([x for x in sampled_dataset if x['part_of_speech'] == pos])
    print(f"{pos}: {count} 个")

# 如需保存
with open('taboo_benchmark/data/chinese_dataset_sample10.json', 'w', encoding='utf-8') as f:
    json.dump(sampled_dataset, f, ensure_ascii=False, indent=2)
print("已保存到 taboo_benchmark/data/chinese_dataset_sample10.json")

FileNotFoundError: [Errno 2] No such file or directory: 'taboo_benchmark/data/chinese_dataset.json'

In [31]:
# 5. 数据集统计分析
print("📊 中文Taboo数据集统计分析:")
print("=" * 50)

# 基本统计
total_words = len(chinese_dataset)
print(f"📝 总词汇数: {total_words}")

# 词性分布
pos_counts = {}
taboo_counts = []
sense_counts = []

for item in chinese_dataset:
    pos = item.get('part_of_speech', 'unknown')
    pos_counts[pos] = pos_counts.get(pos, 0) + 1
    taboo_counts.append(len(item.get('taboo', [])))
    sense_counts.append(len(item.get('senses', [])))

print(f"\n🏷️ 词性分布:")
for pos, count in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = count / total_words * 100
    print(f"   {pos}: {count} 个 ({percentage:.1f}%)")

print(f"\n🚫 禁用词统计:")
print(f"   平均数量: {sum(taboo_counts) / len(taboo_counts):.1f}")
print(f"   范围: {min(taboo_counts)} - {max(taboo_counts)}")

print(f"\n💭 义项统计:")
print(f"   平均数量: {sum(sense_counts) / len(sense_counts):.1f}")
print(f"   范围: {min(sense_counts)} - {max(sense_counts)}")

# 显示数据样本
print(f"\n📋 数据样本 (随机5个):")
sample_items = random.sample(chinese_dataset, min(5, len(chinese_dataset)))
for i, item in enumerate(sample_items, 1):
    print(f"\n   样本 {i}:")
    print(f"     目标词: {item['target']}")
    print(f"     词性: {item['part_of_speech']}")
    print(f"     禁用词: {item['taboo']}")
    if item.get('senses') and len(item['senses']) > 0:
        # senses 现在是字典列表，直接访问字典键
        sense = item['senses'][0]
        definition = sense.get('Def', '无定义')
        if definition and definition != '无定义':
            print(f"     定义: {definition[:50]}...")
        else:
            print(f"     定义: 无定义")

print(f"\n✅ 统计分析完成")


📊 中文Taboo数据集统计分析:
📝 总词汇数: 100

🏷️ 词性分布:
   noun: 25 个 (25.0%)
   verb: 25 个 (25.0%)
   adj: 25 个 (25.0%)
   adv: 25 个 (25.0%)

🚫 禁用词统计:
   平均数量: 5.0
   范围: 5 - 5

💭 义项统计:
   平均数量: 1.8
   范围: 1 - 12

📋 数据样本 (随机5个):

   样本 1:
     目标词: 娴静
     词性: adj
     禁用词: ['东西', '事物', '物品', '概念', '内容']
     定义: {gracious|雅}...

   样本 2:
     目标词: 总算
     词性: adv
     禁用词: ['功能', '时间', '特性', '东西', '事物']
     定义: {FuncWord|功能词:comment={?}}...

   样本 3:
     目标词: 玫瑰紫
     词性: adj
     禁用词: ['东西', '事物', '物品', '概念', '内容']
     定义: {red|红}...

   样本 4:
     目标词: 脚踩两只船
     词性: verb
     禁用词: ['得罪', '回避', '东西', '事物', '物品']
     定义: {evade|回避:content={offend|得罪}}...

   样本 5:
     目标词: 老气横秋
     词性: adj
     禁用词: ['消极', '东西', '事物', '物品', '概念']
     定义: {inactive|消极}...

✅ 统计分析完成


In [33]:
# 完整的中文Taboo实验
print("🧪 完整中文Taboo实验系统...")

def run_full_chinese_experiment(client, models, dataset, experiment_name="chinese_taboo"):
    """运行完整的中文Taboo实验"""
    
    if not client:
        print("❌ API客户端未初始化，无法执行实验")
        return None
    
    # 实验配置
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    experiment_id = f"{experiment_name}_{timestamp}"
    
    print(f"\n🎯 完整实验配置:")
    print(f"   实验ID: {experiment_id}")
    print(f"   词汇数量: {len(dataset)}")
    print(f"   模型数量: {len(models)}")
    print(f"   总游戏数: {len(dataset) * len(models) * len(models)}")
    
    # 创建结果目录
    results_dir = f"results/{experiment_id}"
    os.makedirs(results_dir, exist_ok=True)
    
    # 分批处理以避免过长时间运行
    batch_size = 10  # 每批处理10个词汇
    all_results = []
    
    print(f"\n🚀 开始完整实验（分批处理）...")
    
    # 按批次处理数据集
    for batch_start in range(0, len(dataset), batch_size):
        batch_end = min(batch_start + batch_size, len(dataset))
        batch_dataset = dataset[batch_start:batch_end]
        batch_num = (batch_start // batch_size) + 1
        total_batches = (len(dataset) + batch_size - 1) // batch_size
        
        print(f"\n📦 处理批次 {batch_num}/{total_batches} (词汇 {batch_start+1}-{batch_end})...")
        
        batch_results = []
        game_counter = batch_start * len(models) * len(models)
        
        for word_data in batch_dataset:
            target_word = word_data['target']
            taboo_words = word_data['taboo']
            
            print(f"\n🎯 词汇: {target_word} ({word_data['part_of_speech']})\")")
            print(f"🚫 禁用词: {taboo_words}")
            
            for hinter_model in models:
                for guesser_model in models:
                    game_counter += 1
                    hinter_name = hinter_model.split('/')[-1]
                    guesser_name = guesser_model.split('/')[-1]
                    
                    print(f"  🔄 游戏 {game_counter}: {hinter_name} → {guesser_name}")
                    
                    try:\n",
                        start_time = time.time()
                        
                        # 执行游戏
                        game_result = play_chinese_taboo_game(
                            client, hinter_model, guesser_model, 
                            target_word, taboo_words, max_turns=5
                        )
                        
                        duration = round(time.time() - start_time, 2)
                        
                        # 构建详细结果记录
                        result = {
                            'experiment_id': experiment_id,
                            'batch_num': batch_num,
                            'game_id': game_counter,
                            'target_word': target_word,
                            'part_of_speech': word_data['part_of_speech'],
                            'category': word_data['category'],
                            'taboo_words': '|'.join(taboo_words),
                            'hinter_model': hinter_model,
                            'guesser_model': guesser_model,
                            'model_pair': f\"{hinter_name}_vs_{guesser_name}\",
                            'success': game_result['success'],\n",
                            'turns_used': game_result['turns'],
                            'final_guess': game_result['final_guess'],
                            'failure_reason': game_result.get('failure_reason', None),
                            'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                            'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                            'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                            'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                            'all_hints': ' | '.join(game_result.get('all_hints', [])),
                            'all_guesses': ' | '.join(game_result.get('all_guesses', [])),
                            'conversation': ' | '.join(game_result.get('conversation', [])),
                            'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                            'hinter_attempts': game_result.get('total_hinter_attempts', 0),
                            'guesser_attempts': game_result.get('total_guesser_attempts', 0),
                            'format_errors': ' | '.join(game_result.get('format_errors', [])),
                            'hinter_failed_outputs': ' | '.join(game_result.get('hinter_failed_outputs', [])),
                            'guesser_failed_outputs': ' | '.join(game_result.get('guesser_failed_outputs', [])),
                            'duration_seconds': duration,
                            'timestamp': datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"),
                            'language': 'chinese',
                            'dataset_source': word_data.get('metadata', {}).get('source', 'unknown')
                        }
                        
                        if 'error' in game_result:
                            result['error'] = game_result['error']
                        
                        batch_results.append(result)
                        
                        # 显示结果
                        status = \"✅ 成功\" if game_result['success'] else \"❌ 失败\"
                        extra_info = \"\"
                        if not game_result['success']:
                            reason = game_result.get('failure_reason', 'unknown')
                            if reason == 'TABOO_VIOLATION':
                                extra_info = \" (违规)\"
                            elif reason == 'FORMAT_FAILURE':
                                extra_info = \" (格式)\"
                            elif reason == 'MAX_TURNS_EXCEEDED':
                                extra_info = \" (轮数)\"
                        
                        print(f\"     {status}{extra_info} | {game_result['turns']}轮 | {duration}s | {game_result['final_guess']}\")\n",
                        
                    except Exception as e:\n",
                        print(f\"     ❌ 执行异常: {str(e)[:50]}...\")
                        # 记录异常
                        error_result = {\n",
                            'experiment_id': experiment_id,
                            'batch_num': batch_num,
                            'game_id': game_counter,
                            'target_word': target_word,
                            'hinter_model': hinter_model,\n",
                            'guesser_model': guesser_model,\n",
                            'success': False,\n",
                            'failure_reason': 'EXCEPTION',\n",
                            'error': str(e),\n",
                            'timestamp': datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"),\n",
                            'language': 'chinese'\n",
                        }\n",
                        batch_results.append(error_result)\n",
                    \n",
                    time.sleep(0.3)  # API调用间隔\n",
        \n",
        # 保存批次结果\n",
        batch_df = pd.DataFrame(batch_results)\n",
        batch_file = f\"{results_dir}/batch_{batch_num:03d}.csv\"\n",
        batch_df.to_csv(batch_file, index=False, encoding='utf-8-sig')\n",
        print(f\"💾 批次 {batch_num} 结果已保存: {batch_file}\")\n",
        \n",
        all_results.extend(batch_results)\n",
        \n",
        # 显示批次统计\n",
        batch_success = len([r for r in batch_results if r.get('success', False)])\n",
        batch_total = len(batch_results)\n",
        print(f\"📊 批次 {batch_num} 成功率: {batch_success}/{batch_total} ({batch_success/batch_total*100:.1f}%)\")\n",
    \n",
    # 保存完整结果\n",
    complete_df = pd.DataFrame(all_results)\n",
    complete_file = f\"{results_dir}/complete_experiment_results.csv\"\n",
    complete_df.to_csv(complete_file, index=False, encoding='utf-8-sig')\n",
    \n",
    print(f\"\\n🎉 完整实验完成！\")\n",
    print(f\"📁 结果目录: {results_dir}\")\n",
    print(f\"📊 总游戏数: {len(all_results)}\")\n",
    \n",
    # 生成实验报告\n",
    generate_experiment_report(all_results, results_dir, experiment_id)\n",
    \n",
    return all_results\n",

def generate_experiment_report(results, results_dir, experiment_id):
    \"\"\"生成详细的实验报告\"\"\"
    print(f\"\\n📋 生成实验报告...\")\n",
    \n",
    total_games = len(results)\n",
    successful_games = [r for r in results if r.get('success', False)]\n",
    success_rate = len(successful_games) / total_games * 100 if total_games > 0 else 0\n",
    \n",
    # 按模型统计\n",
    models_used = list(set([r.get('hinter_model', 'unknown') for r in results if 'hinter_model' in r]))\n",
    model_stats = {}\n",
    \n",
    for model in models_used:\n",
        model_name = model.split('/')[-1] if '/' in model else model\n",
        \n",
        # 作为hinter的表现\n",
        as_hinter = [r for r in results if r.get('hinter_model') == model]\n",
        hinter_success = len([r for r in as_hinter if r.get('success', False)])\n",
        \n",
        # 作为guesser的表现\n",
        as_guesser = [r for r in results if r.get('guesser_model') == model]\n",
        guesser_success = len([r for r in as_guesser if r.get('success', False)])\n",
        \n",
        model_stats[model_name] = {\n",
            'as_hinter': {'total': len(as_hinter), 'success': hinter_success, 'rate': hinter_success/len(as_hinter)*100 if as_hinter else 0},\n",
            'as_guesser': {'total': len(as_guesser), 'success': guesser_success, 'rate': guesser_success/len(as_guesser)*100 if as_guesser else 0}\n",
        }\n",
    \n",
    # 失败原因统计\n",
    failure_reasons = {}\n",
    failed_games = [r for r in results if not r.get('success', True)]\n",
    for game in failed_games:\n",
        reason = game.get('failure_reason', 'unknown')\n",
        failure_reasons[reason] = failure_reasons.get(reason, 0) + 1\n",
    \n",
    # 词性表现\n",
    pos_stats = {}\n",
    for result in results:\n",
        pos = result.get('part_of_speech', 'unknown')\n",
        if pos not in pos_stats:\n",
            pos_stats[pos] = {'total': 0, 'success': 0}\n",
        pos_stats[pos]['total'] += 1\n",
        if result.get('success', False):\n",
            pos_stats[pos]['success'] += 1\n",
    \n",
    # 生成报告\n",
    report = {\n",
        'experiment_info': {\n",
            'experiment_id': experiment_id,\n",
            'total_games': total_games,\n",
            'successful_games': len(successful_games),\n",
            'success_rate': round(success_rate, 2),\n",
            'completion_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),\n",
            'models_tested': len(models_used),\n",
            'language': 'chinese'\n",
        },\n",
        'model_performance': model_stats,\n",
        'failure_analysis': failure_reasons,\n",
        'pos_performance': {pos: {'total': stats['total'], 'success': stats['success'], 'rate': round(stats['success']/stats['total']*100, 1)} for pos, stats in pos_stats.items()}\n",
    }\n",
    \n",
    # 保存报告\n",
    report_file = f\"{results_dir}/experiment_report.json\"\n",
    with open(report_file, 'w', encoding='utf-8') as f:\n",
        json.dump(report, f, ensure_ascii=False, indent=2)\n",
    \n",
    # 显示报告摘要\n",
    print(f\"\\n📈 实验结果摘要:\")\n",
    print(f\"   总游戏数: {total_games}\")\n",
    print(f\"   成功游戏: {len(successful_games)}\")\n",
    print(f\"   整体成功率: {success_rate:.1f}%\")\n",
    \n",
    print(f\"\\n🤖 模型表现:\")\n",
    for model_name, stats in model_stats.items():\n",
        print(f\"   {model_name}:\")\n",
        print(f\"     作为提示者: {stats['as_hinter']['success']}/{stats['as_hinter']['total']} ({stats['as_hinter']['rate']:.1f}%)\")\n",
        print(f\"     作为猜测者: {stats['as_guesser']['success']}/{stats['as_guesser']['total']} ({stats['as_guesser']['rate']:.1f}%)\")\n",
    \n",
    if failure_reasons:\n",
        print(f\"\\n❌ 失败原因分析:\")\n",
        for reason, count in failure_reasons.items():\n",
            print(f\"   {reason}: {count} 次 ({count/len(failed_games)*100:.1f}%)\")\n",
    \n",
    print(f\"\\n📝 报告已保存: {report_file}\")\n",

print(\"✅ 完整中文Taboo实验系统已定义\")"


SyntaxError: unexpected character after line continuation character (91919430.py, line 58)

In [None]:
# 6. 保存中文数据集
print("💾 保存中文Taboo数据集...")

# 创建数据目录
data_dir = "data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"📁 创建数据目录: {data_dir}")

# 保存完整数据集
chinese_dataset_path = os.path.join(data_dir, "chinese_dataset.json")
with open(chinese_dataset_path, 'w', encoding='utf-8') as f:
    json.dump(chinese_dataset, f, ensure_ascii=False, indent=2)
print(f"✅ 完整数据集已保存: {chinese_dataset_path}")

# 创建简化版数据集（用于快速测试）
simplified_dataset = []
for item in chinese_dataset:
    simplified_item = {
        'target': item['target'],
        'part_of_speech': item['part_of_speech'],
        'taboo': item['taboo'],
        'category': item['category']
    }
    simplified_dataset.append(simplified_item)

simplified_path = os.path.join(data_dir, "chinese_dataset_simple.json")
with open(simplified_path, 'w', encoding='utf-8') as f:
    json.dump(simplified_dataset, f, ensure_ascii=False, indent=2)
print(f"✅ 简化数据集已保存: {simplified_path}")

# 创建安全的样本数据（移除可能不可序列化的内容）
safe_sample_items = []
for item in sample_items:
    safe_item = {
        'target': item['target'],
        'part_of_speech': item['part_of_speech'],
        'taboo': item['taboo'],
        'category': item['category'],
        'sense_count': len(item.get('senses', [])),
        'first_definition': item['senses'][0].get('Def', '无定义')[:100] if item.get('senses') else '无定义'
    }
    safe_sample_items.append(safe_item)

# 生成数据集报告
report = {
    'dataset_info': {
        'total_words': len(chinese_dataset),
        'creation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'source': 'OpenHowNet',
        'language': 'Chinese',
        'pos_distribution': pos_counts,
        'avg_taboo_count': sum(taboo_counts) / len(taboo_counts),
        'avg_sense_count': sum(sense_counts) / len(sense_counts)
    },
    'sample_data': safe_sample_items
}

report_path = os.path.join(data_dir, "chinese_dataset_report.json")
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, ensure_ascii=False, indent=2)
print(f"✅ 数据集报告已保存: {report_path}")

print(f"\n🎉 中文Taboo数据集构建完成！")
print(f"📁 数据文件位置:")
print(f"   完整版: {chinese_dataset_path}")
print(f"   简化版: {simplified_path}")
print(f"   报告: {report_path}")


💾 保存中文Taboo数据集...


TypeError: Object of type Sense is not JSON serializable

In [None]:
# 7. API客户端设置（支持中文模型）
print("🔧 设置中文Taboo实验API客户端...")

def load_api_keys(keys_path: str = "api_keys.json") -> Dict[str, str]:
    """加载API密钥"""
    with open(keys_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class ChineseTabooClient:
    """中文Taboo游戏专用API客户端"""
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
    
    def call_model(self, model: str, messages: List[Dict[str, str]], temperature: float = 0.3) -> str:
        """调用模型API"""
        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": 2000
        }
        
        response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content'].strip()
        
        return content

# 初始化API客户端
try:
    api_keys = load_api_keys()
    chinese_client = ChineseTabooClient(api_keys["OPENROUTER_API_KEY"])
    print("✅ 中文Taboo API客户端初始化成功")
except Exception as e:
    print(f"❌ API客户端初始化失败: {e}")
    chinese_client = None

# 定义支持中文的测试模型
CHINESE_TEST_MODELS = [
    "openai/gpt-4o",  # GPT-4o 支持中文
    "google/gemini-2.5-flash",  # Gemini 支持中文
    "deepseek/deepseek-chat-v3-0324",  # DeepSeek 中文模型
    "anthropic/claude-sonnet-4",  # Claude 支持中文
    "moonshotai/kimi-k2",  # kimi-k2 的API路径
]

# 中文Taboo游戏逻辑
def play_chinese_taboo_game(client, hinter_model, guesser_model, target_word, taboo_words, max_turns=5):
    """执行一场中文Taboo游戏"""
    
    conversation = []
    all_hints = []
    all_guesses = []
    format_errors = []
    hinter_failed_outputs = []
    guesser_failed_outputs = []
    total_hinter_attempts = 0
    total_guesser_attempts = 0
    
    try:
        for turn in range(1, max_turns + 1):
            # Hinter给出提示
            hinter_prompt = f"""你正在玩中文Taboo游戏。你需要用中文描述一个词汇，让对方猜出这个词。

目标词汇: {target_word}
禁用词汇: {', '.join(taboo_words)}

规则:
1. 你必须用中文描述目标词汇
2. 你不能直接说出目标词汇或任何禁用词汇
3. 你的回答只能包含描述文字，不要包含其他内容
4. 保持描述简洁但有帮助

请给出你的描述:"""

            # 获取hinter的提示
            hint = None
            hinter_attempts = 0
            while hint is None and hinter_attempts < 3:
                hinter_attempts += 1
                total_hinter_attempts += 1
                try:
                    hint_response = client.call_model(hinter_model, [{"role": "user", "content": hinter_prompt}])
                    hint = hint_response.strip()
                    
                    # 检查是否违反禁用词
                    hint_lower = hint.lower()
                    target_lower = target_word.lower()
                    
                    if target_lower in hint_lower:
                        return {
                            'success': False,
                            'failure_reason': 'TABOO_VIOLATION',
                            'turns': turn,
                            'final_guess': '',
                            'taboo_violation_turn': turn,
                            'taboo_violation_hint': hint,
                            'conversation': conversation,
                            'all_hints': all_hints,
                            'all_guesses': all_guesses,
                            'format_errors': format_errors,
                            'hinter_failed_outputs': hinter_failed_outputs,
                            'guesser_failed_outputs': guesser_failed_outputs,
                            'total_hinter_attempts': total_hinter_attempts,
                            'total_guesser_attempts': total_guesser_attempts
                        }
                    
                    for taboo_word in taboo_words:
                        if taboo_word.lower() in hint_lower:
                            return {
                                'success': False,
                                'failure_reason': 'TABOO_VIOLATION',
                                'turns': turn,
                                'final_guess': '',
                                'taboo_violation_turn': turn,
                                'taboo_violation_hint': hint,
                                'conversation': conversation,
                                'all_hints': all_hints,
                                'all_guesses': all_guesses,
                                'format_errors': format_errors,
                                'hinter_failed_outputs': hinter_failed_outputs,
                                'guesser_failed_outputs': guesser_failed_outputs,
                                'total_hinter_attempts': total_hinter_attempts,
                                'total_guesser_attempts': total_guesser_attempts
                            }
                    
                    break
                    
                except Exception as e:
                    if hinter_attempts == 3:
                        return {
                            'success': False,
                            'failure_reason': 'API_FAILURE',
                            'turns': turn,
                            'final_guess': '',
                            'error': f"Hinter API failure: {e}",
                            'conversation': conversation,
                            'all_hints': all_hints,
                            'all_guesses': all_guesses,
                            'format_errors': format_errors,
                            'hinter_failed_outputs': hinter_failed_outputs,
                            'guesser_failed_outputs': guesser_failed_outputs,
                            'total_hinter_attempts': total_hinter_attempts,
                            'total_guesser_attempts': total_guesser_attempts
                        }
                    hinter_failed_outputs.append(str(e))
                    time.sleep(1)
            
            if hint is None:
                return {
                    'success': False,
                    'failure_reason': 'FORMAT_FAILURE',
                    'turns': turn,
                    'final_guess': '',
                    'conversation': conversation,
                    'all_hints': all_hints,
                    'all_guesses': all_guesses,
                    'format_errors': format_errors,
                    'hinter_failed_outputs': hinter_failed_outputs,
                    'guesser_failed_outputs': guesser_failed_outputs,
                    'total_hinter_attempts': total_hinter_attempts,
                    'total_guesser_attempts': total_guesser_attempts
                }
            
            all_hints.append(hint)
            conversation.append(f"第{turn}轮 提示: {hint}")
            
            # Guesser猜测
            conversation_history = "\n".join(conversation)
            guesser_prompt = f"""你正在玩中文Taboo游戏。根据对方的描述，你需要猜出他想让你猜的词汇。

游戏历史:
{conversation_history}

请根据以上描述猜测目标词汇。你的回答只需要包含你猜测的词汇，不要包含其他解释。

你的猜测:"""

            # 获取guesser的猜测
            guess = None
            guesser_attempts = 0
            while guess is None and guesser_attempts < 3:
                guesser_attempts += 1
                total_guesser_attempts += 1
                try:
                    guess_response = client.call_model(guesser_model, [{"role": "user", "content": guesser_prompt}])
                    guess = guess_response.strip()
                    
                    # 简单的格式检查
                    if len(guess) > 50:  # 回答太长，可能包含解释
                        # 尝试提取最可能的词汇
                        words = jieba.lcut(guess)
                        chinese_words = [w for w in words if is_valid_chinese_word(w)]
                        if chinese_words:
                            guess = chinese_words[0]
                        else:
                            guess = guess[:10]  # 截取前10个字符
                    
                    break
                    
                except Exception as e:
                    if guesser_attempts == 3:
                        return {
                            'success': False,
                            'failure_reason': 'API_FAILURE',
                            'turns': turn,
                            'final_guess': '',
                            'error': f"Guesser API failure: {e}",
                            'conversation': conversation,
                            'all_hints': all_hints,
                            'all_guesses': all_guesses,
                            'format_errors': format_errors,
                            'hinter_failed_outputs': hinter_failed_outputs,
                            'guesser_failed_outputs': guesser_failed_outputs,
                            'total_hinter_attempts': total_hinter_attempts,
                            'total_guesser_attempts': total_guesser_attempts
                        }
                    guesser_failed_outputs.append(str(e))
                    time.sleep(1)
            
            if guess is None:
                return {
                    'success': False,
                    'failure_reason': 'FORMAT_FAILURE',
                    'turns': turn,
                    'final_guess': '',
                    'conversation': conversation,
                    'all_hints': all_hints,
                    'all_guesses': all_guesses,
                    'format_errors': format_errors,
                    'hinter_failed_outputs': hinter_failed_outputs,
                    'guesser_failed_outputs': guesser_failed_outputs,
                    'total_hinter_attempts': total_hinter_attempts,
                    'total_guesser_attempts': total_guesser_attempts
                }
            
            all_guesses.append(guess)
            conversation.append(f"第{turn}轮 猜测: {guess}")
            
            # 检查是否猜对
            if guess.lower().strip() == target_word.lower().strip():
                return {
                    'success': True,
                    'turns': turn,
                    'final_guess': guess,
                    'conversation': conversation,
                    'all_hints': all_hints,
                    'all_guesses': all_guesses,
                    'format_errors': format_errors,
                    'hinter_failed_outputs': hinter_failed_outputs,
                    'guesser_failed_outputs': guesser_failed_outputs,
                    'total_hinter_attempts': total_hinter_attempts,
                    'total_guesser_attempts': total_guesser_attempts
                }
        
        # 如果所有轮次都用完了还没猜对
        return {
            'success': False,
            'failure_reason': 'MAX_TURNS_EXCEEDED',
            'turns': max_turns,
            'final_guess': all_guesses[-1] if all_guesses else '',
            'conversation': conversation,
            'all_hints': all_hints,
            'all_guesses': all_guesses,
            'format_errors': format_errors,
            'hinter_failed_outputs': hinter_failed_outputs,
            'guesser_failed_outputs': guesser_failed_outputs,
            'total_hinter_attempts': total_hinter_attempts,
            'total_guesser_attempts': total_guesser_attempts
        }
        
    except Exception as e:
        return {
            'success': False,
            'failure_reason': 'EXCEPTION',
            'turns': 0,
            'final_guess': '',
            'error': str(e),
            'conversation': conversation,
            'all_hints': all_hints,
            'all_guesses': all_guesses,
            'format_errors': format_errors,
            'hinter_failed_outputs': hinter_failed_outputs,
            'guesser_failed_outputs': guesser_failed_outputs,
            'total_hinter_attempts': total_hinter_attempts,
            'total_guesser_attempts': total_guesser_attempts
        }

print("✅ 中文Taboo游戏逻辑已定义")

print(f"🤖 中文实验模型列表 ({len(CHINESE_TEST_MODELS)} 个):")
for i, model in enumerate(CHINESE_TEST_MODELS, 1):
    print(f"   {i}. {model}")

print(f"\n💡 选择较少模型进行测试以节省成本和时间")


In [None]:
# 8. 运行完整中文Taboo实验
print("🚀 开始运行完整中文Taboo实验...")

# 选择实验规模
experiment_scales = {
    'quick': {'count': 5, 'desc': '快速测试（5个词汇）'},
    'medium': {'count': 20, 'desc': '中等规模（20个词汇）'},
    'full': {'count': 100, 'desc': '完整实验（100个词汇）'}
}

# 设置实验规模 - 可以修改这里选择不同规模
EXPERIMENT_SCALE = 'quick'  # 改为 'medium' 或 'full' 来运行更大规模实验

scale_config = experiment_scales[EXPERIMENT_SCALE]
experiment_dataset = chinese_dataset[:scale_config['count']]

print(f"📊 实验配置: {scale_config['desc']}")
print(f"🎯 词汇数量: {len(experiment_dataset)}")
print(f"🤖 模型数量: {len(CHINESE_TEST_MODELS)}")
print(f"🎮 总游戏数: {len(experiment_dataset) * len(CHINESE_TEST_MODELS) * len(CHINESE_TEST_MODELS)}")

if chinese_client and chinese_dataset:
    try:
        # 运行实验
        experiment_results = run_full_chinese_experiment(
            chinese_client, 
            CHINESE_TEST_MODELS, 
            experiment_dataset,
            f"chinese_taboo_{EXPERIMENT_SCALE}"
        )
        
        print(f"\n🎉 {scale_config['desc']}完成！")
        
    except Exception as e:
        print(f"❌ 实验执行失败: {e}")
        print("💡 请检查API密钥和网络连接")
        
else:
    print("❌ 无法运行实验：API客户端或数据集未准备就绪")
    if not chinese_client:
        print("   - API客户端未初始化")
    if not chinese_dataset:
        print("   - 数据集未构建")


In [None]:
# 启动中文Taboo完整实验
print("🚀 启动中文Taboo完整实验...")

# 实验选项：用户可以选择实验规模
EXPERIMENT_OPTIONS = {
    'quick': {'size': 5, 'name': '快速测试'},
    'medium': {'size': 20, 'name': '中等规模'}, 
    'full': {'size': 100, 'name': '完整实验'}
}

# 默认选择快速测试
selected_option = 'quick'
selected_size = EXPERIMENT_OPTIONS[selected_option]['size']

print(f"📊 实验规模: {EXPERIMENT_OPTIONS[selected_option]['name']} ({selected_size} 个词汇)")
print(f"🤖 测试模型: {len(CHINESE_TEST_MODELS)} 个")
print(f"🎯 预计游戏数: {selected_size * len(CHINESE_TEST_MODELS) * len(CHINESE_TEST_MODELS)}")

# 检查必要组件
if 'chinese_client' in globals() and chinese_client and 'chinese_dataset' in globals() and chinese_dataset:
    print("✅ 所有组件已就绪，可以开始实验")
    
    # 选择数据集子集
    test_dataset = chinese_dataset[:selected_size]
    
    print(f"\\n🎯 选择的测试词汇:")
    for i, item in enumerate(test_dataset[:5], 1):  # 显示前5个
        print(f"   {i}. {item['target']} ({item['part_of_speech']}) - 禁用词: {item['taboo']}")
    if len(test_dataset) > 5:
        print(f"   ... 还有 {len(test_dataset) - 5} 个词汇")
    
    print(f"\\n💡 要执行完整实验，请运行下一个cell")
    print(f"💡 要修改实验规模，请修改上面的 selected_option 变量")
    
else:
    print("❌ 缺少必要组件:")
    if 'chinese_client' not in globals() or not chinese_client:
        print("   - API客户端未初始化")
    if 'chinese_dataset' not in globals() or not chinese_dataset:
        print("   - 中文数据集未准备好")
    print("💡 请先运行前面的cell来初始化这些组件")


In [None]:
# 执行中文Taboo完整实验
print("🎮 执行中文Taboo完整实验...")

def execute_chinese_experiment(dataset, models, client):
    """执行中文Taboo实验"""
    
    # 创建结果目录
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = f"results/chinese_experiment_{timestamp}"
    os.makedirs(results_dir, exist_ok=True)
    
    print(f"📁 结果保存目录: {results_dir}")
    
    all_results = []
    game_counter = 0
    total_games = len(dataset) * len(models) * len(models)
    
    print(f"🚀 开始执行 {total_games} 个游戏...")
    
    for word_idx, word_data in enumerate(dataset, 1):
        target_word = word_data['target']
        taboo_words = word_data['taboo']
        pos = word_data['part_of_speech']
        
        print(f"\\n🎯 词汇 {word_idx}/{len(dataset)}: {target_word} ({pos})")
        print(f"🚫 禁用词: {taboo_words}")
        
        word_success = 0
        word_total = 0
        
        for hinter_model in models:
            for guesser_model in models:
                game_counter += 1
                word_total += 1
                
                hinter_name = hinter_model.split('/')[-1]
                guesser_name = guesser_model.split('/')[-1]
                
                print(f"  🔄 {game_counter}/{total_games}: {hinter_name}→{guesser_name}", end=" ")
                
                try:
                    start_time = time.time()
                    
                    # 执行游戏
                    game_result = play_chinese_taboo_game(
                        client, hinter_model, guesser_model, 
                        target_word, taboo_words, max_turns=5
                    )
                    
                    duration = round(time.time() - start_time, 2)
                    
                    # 记录结果
                    result = {
                        'game_id': game_counter,
                        'word_index': word_idx,
                        'target_word': target_word,
                        'part_of_speech': pos,
                        'taboo_words': '|'.join(taboo_words),
                        'hinter_model': hinter_model,
                        'guesser_model': guesser_model,
                        'success': game_result['success'],
                        'turns_used': game_result['turns'],
                        'final_guess': game_result['final_guess'],
                        'failure_reason': game_result.get('failure_reason', ''),
                        'duration_seconds': duration,
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    
                    all_results.append(result)
                    
                    if game_result['success']:
                        word_success += 1
                        print(f"✅ {game_result['turns']}轮 {duration}s")
                    else:
                        reason = game_result.get('failure_reason', 'unknown')
                        if reason == 'TABOO_VIOLATION':
                            print(f"❌ 违规 {duration}s")
                        elif reason == 'MAX_TURNS_EXCEEDED':
                            print(f"❌ 轮数 {duration}s")
                        else:
                            print(f"❌ {reason[:10]} {duration}s")
                    
                except Exception as e:
                    print(f"❌ 异常: {str(e)[:20]}...")
                    result = {
                        'game_id': game_counter,
                        'target_word': target_word,
                        'hinter_model': hinter_model,
                        'guesser_model': guesser_model,
                        'success': False,
                        'failure_reason': 'EXCEPTION',
                        'error': str(e),
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    all_results.append(result)
                
                time.sleep(0.1)  # 避免API限制
        
        # 显示词汇小结
        word_rate = word_success / word_total * 100 if word_total > 0 else 0
        print(f"  📊 '{target_word}' 成功率: {word_success}/{word_total} ({word_rate:.1f}%)")
    
    # 保存完整结果
    results_df = pd.DataFrame(all_results)
    results_file = f"{results_dir}/complete_results.csv"
    results_df.to_csv(results_file, index=False, encoding='utf-8-sig')
    
    # 生成报告
    total_success = len([r for r in all_results if r.get('success', False)])
    overall_rate = total_success / len(all_results) * 100 if all_results else 0
    
    print(f"\\n🎉 实验完成！")
    print(f"📊 整体结果:")
    print(f"   总游戏数: {len(all_results)}")
    print(f"   成功数: {total_success}")
    print(f"   成功率: {overall_rate:.1f}%")
    
    # 按模型统计
    print(f"\\n🤖 模型表现:")
    for model in models:
        model_name = model.split('/')[-1]
        
        # 作为提示者
        as_hinter = [r for r in all_results if r.get('hinter_model') == model]
        hinter_success = len([r for r in as_hinter if r.get('success', False)])
        hinter_rate = hinter_success / len(as_hinter) * 100 if as_hinter else 0
        
        # 作为猜测者  
        as_guesser = [r for r in all_results if r.get('guesser_model') == model]
        guesser_success = len([r for r in as_guesser if r.get('success', False)])
        guesser_rate = guesser_success / len(as_guesser) * 100 if as_guesser else 0
        
        print(f"   {model_name}:")
        print(f"     提示者: {hinter_success}/{len(as_hinter)} ({hinter_rate:.1f}%)")
        print(f"     猜测者: {guesser_success}/{len(as_guesser)} ({guesser_rate:.1f}%)")
    
    print(f"\\n📁 详细结果已保存: {results_file}")
    return all_results, results_dir

# 执行实验
if 'test_dataset' in globals() and test_dataset and chinese_client:
    print("\\n" + "="*60)
    print("🧪 开始中文Taboo完整实验")
    print("="*60)
    
    experiment_results, results_directory = execute_chinese_experiment(
        test_dataset, CHINESE_TEST_MODELS, chinese_client
    )
    
    print(f"\\n✅ 实验完成！结果保存在: {results_directory}")
    
else:
    print("❌ 无法执行实验：")
    if 'test_dataset' not in globals():
        print("   - 测试数据集未定义，请先运行上一个cell")
    if 'chinese_client' not in globals() or not chinese_client:
        print("   - API客户端未初始化")


In [None]:
# 8. 中文Taboo游戏核心逻辑和工具函数
print("🎮 定义中文Taboo游戏核心逻辑...")

def safe_chinese_text_cleanup(text: str, max_length: int = 300) -> str:
    """安全清理中文文本"""
    if not text:
        return ""
    
    # 保留中文字符、英文字符、数字和常用标点
    import re
    cleaned = re.sub(r'[^\u4e00-\u9fff\w\s\.,!?;:"\'()[\]{}\-]', '', str(text))
    
    if len(cleaned) > max_length:
        cleaned = cleaned[:max_length] + "..."
    
    return cleaned

def extract_chinese_clue_text(response: str) -> str:
    """从响应中提取中文线索文本"""
    if "FORMAT_ERROR_EXCEEDED" in response:
        return "FORMAT_ERROR"
    
    # 检查中文格式标记
    if '[线索]' in response or '[CLUE]' in response.upper():
        import re
        # 优先匹配中文标记
        match = re.search(r'\[线索\]\s*(.+)', response, re.DOTALL)
        if not match:
            match = re.search(r'\[CLUE\]\s*(.+)', response, re.IGNORECASE | re.DOTALL)
        
        if match:
            return match.group(1).strip()
    
    # 备用格式
    if '线索:' in response or 'Clue:' in response:
        if '线索:' in response:
            return response.split('线索:')[1].strip()
        else:
            return response.split('Clue:')[1].strip()
    
    return "INVALID_FORMAT"

def extract_chinese_guess_word(response: str) -> str:
    """从响应中提取中文猜测词"""
    if "FORMAT_ERROR_EXCEEDED" in response:
        return "FORMAT_ERROR"
    
    # 检查中文格式标记
    if '[猜测]' in response or '[GUESS]' in response.upper():
        import re
        # 优先匹配中文标记
        match = re.search(r'\[猜测\]\s*(.+)', response)
        if not match:
            match = re.search(r'\[GUESS\]\s*(.+)', response, re.IGNORECASE)
        
        if match:
            guess_part = match.group(1).strip()
            # 提取第一个中文词汇
            chinese_words = re.findall(r'[\u4e00-\u9fff]+', guess_part)
            if chinese_words:
                return chinese_words[0]
    
    # 备用格式
    if '猜测:' in response or 'Guess:' in response:
        if '猜测:' in response:
            guess_part = response.split('猜测:')[1].strip()
        else:
            guess_part = response.split('Guess:')[1].strip()
        
        chinese_words = re.findall(r'[\u4e00-\u9fff]+', guess_part)
        if chinese_words:
            return chinese_words[0]
    
    return "INVALID_FORMAT"

def check_chinese_taboo_violation(hint: str, taboo_words: List[str]) -> bool:
    """检查中文线索是否违反禁用词规则"""
    hint_cleaned = re.sub(r'[^\u4e00-\u9fff]', '', hint.lower())
    
    for taboo in taboo_words:
        taboo_cleaned = re.sub(r'[^\u4e00-\u9fff]', '', taboo.lower())
        
        # 检查完整匹配
        if taboo_cleaned in hint_cleaned:
            return True
        
        # 检查部分匹配（对于较长的词）
        if len(taboo_cleaned) >= 2:
            # 检查是否包含禁用词的主要部分
            if len(taboo_cleaned) >= 3:
                core_part = taboo_cleaned[:2]  # 取前两个字符作为核心
                if core_part in hint_cleaned:
                    return True
    
    return False

def robust_chinese_api_call(client, model: str, base_prompt: str, expected_prefix: str, max_retries: int = 3):
    """健壮的中文API调用"""
    failed_outputs = []
    
    for attempt in range(1, max_retries + 1):
        try:
            if attempt == 1:
                prompt = base_prompt
            else:
                prev_output = failed_outputs[-1] if failed_outputs else "未知"
                format_reminder = f"""

⚠️ 格式错误 ⚠️
您之前的回复是: "{prev_output}"

必需格式:
- 您必须以 '{expected_prefix}' 开头（包括方括号）
- 不要在 {expected_prefix} 前添加任何文字

请使用正确格式重试:"""
                prompt = base_prompt + format_reminder
            
            response = client.call_model(model, [{"role": "user", "content": prompt}])
            
            if (response.strip().startswith(expected_prefix) or 
                response.strip().upper().startswith(expected_prefix.upper())):
                return {
                    'success': True,
                    'response': response,
                    'attempts': attempt,
                    'error': None,
                    'failed_outputs': failed_outputs
                }
            else:
                safe_response = safe_chinese_text_cleanup(response, max_length=150)
                failed_outputs.append(safe_response)
                
                if attempt == max_retries:
                    all_failed = " | ".join(failed_outputs)
                    return {
                        'success': False,
                        'response': f"FORMAT_ERROR_EXCEEDED: {safe_response}",
                        'attempts': attempt,
                        'error': f"尝试 {max_retries} 次后失败。期望格式 '{expected_prefix}'。所有失败输出: {all_failed}",
                        'failed_outputs': failed_outputs
                    }
                time.sleep(0.5)
                
        except Exception as e:
            safe_error = safe_chinese_text_cleanup(str(e), max_length=150)
            error_msg = f"API错误 (尝试 {attempt}/{max_retries}): {safe_error}"
            
            if attempt == max_retries:
                return {
                    'success': False,
                    'response': None,
                    'attempts': attempt,
                    'error': error_msg,
                    'failed_outputs': failed_outputs
                }
            time.sleep(1.0)
    
    return {
        'success': False,
        'response': None,
        'attempts': max_retries,
        'error': "超出最大重试次数",
        'failed_outputs': failed_outputs
    }

print("✅ 中文Taboo游戏核心逻辑已定义")


In [None]:
# 9. 中文Taboo游戏主函数
def play_chinese_taboo_game(client, hinter_model, guesser_model, target_word, taboo_words, max_turns=5):
    """中文Taboo游戏主函数"""
    
    conversation_history = []
    taboo_list = '、'.join(taboo_words)
    
    # 中文系统提示 - Hinter
    hinter_system = f"""您正在玩中文Taboo游戏，担任提示者角色。

关键格式要求:
- 您必须以 '[线索]' 开头回复（包括方括号）
- 不要在 [线索] 前添加任何文字
- 示例: [线索] 这是一种有长鼻子的大型哺乳动物

游戏规则:
1. 绝对不能使用这些禁用词: {taboo_list}
2. 给出有用的线索帮助猜测者找到目标词
3. 可以看到对话历史，在之前线索基础上补充
4. 要有创意但严格遵循格式
5. 使用标准中文表达

目标词: {target_word}
禁用词（绝对禁止）: {taboo_list}"""

    # 中文系统提示 - Guesser
    guesser_system = """您正在玩中文Taboo游戏，担任猜测者角色。

关键格式要求:
- 您必须以 '[猜测]' 开头回复（包括方括号）
- 不要在 [猜测] 前添加任何文字
- 示例: [猜测] 大象

游戏规则:
1. 根据收到的所有线索进行最佳猜测
2. 可以看到对话历史
3. 在 [猜测] 后只给出一个中文词汇作为答案
4. 使用标准中文词汇"""

    # 记录统计信息
    total_hinter_attempts = 0
    total_guesser_attempts = 0
    format_errors = []
    hinter_failed_outputs = []
    guesser_failed_outputs = []

    for turn in range(1, max_turns + 1):
        # 构建Hinter提示
        if turn == 1:
            hinter_prompt = f"{hinter_system}\n\n请提供您的第一个线索:"
        else:
            history_text = "\n".join([f"第{i}轮: {msg}" for i, msg in enumerate(conversation_history, 1)])
            hinter_prompt = f"{hinter_system}\n\n对话历史:\n{history_text}\n\n猜测者还没有找到答案。请提供下一个线索:"
        
        # Hinter给出线索
        hinter_result = robust_chinese_api_call(client, hinter_model, hinter_prompt, "[线索]", max_retries=3)
        total_hinter_attempts += hinter_result['attempts']
        
        if hinter_result.get('failed_outputs'):
            hinter_failed_outputs.extend(hinter_result['failed_outputs'])
        
        if not hinter_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(hinter_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"第{turn}轮 提示者: {hinter_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"HINTER_{error_type}",
                'error': f"{error_type}: {hinter_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('提示者:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('猜测者:')]
            }
        
        # 提取线索并检查taboo violation
        hint_text = extract_chinese_clue_text(hinter_result['response'])
        
        # 检查是否违反禁用词规则
        taboo_violated = check_chinese_taboo_violation(hint_text, taboo_words)
        if taboo_violated:
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': '违反禁用词规则: 提示者违规',
                'error': f'违反禁用词规则: 提示者在第{turn}轮违反规则，使用了禁用词: {hint_text}',
                'failure_reason': 'TABOO_VIOLATION',
                'taboo_violation_turn': turn,
                'taboo_violation_hint': hint_text,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('提示者:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('猜测者:')]
            }
        
        conversation_history.append(f"提示者: {hinter_result['response']}")
        
        # 构建Guesser提示
        history_text = "\n".join([f"第{i}轮: {msg}" for i, msg in enumerate(conversation_history, 1)])
        guesser_prompt = f"{guesser_system}\n\n对话历史:\n{history_text}\n\n您的猜测是什么?"
        
        # Guesser进行猜测
        guesser_result = robust_chinese_api_call(client, guesser_model, guesser_prompt, "[猜测]", max_retries=3)
        total_guesser_attempts += guesser_result['attempts']
        
        if guesser_result.get('failed_outputs'):
            guesser_failed_outputs.extend(guesser_result['failed_outputs'])
        
        if not guesser_result['success']:
            error_type = "FORMAT_FAILURE" if "FORMAT_ERROR_EXCEEDED" in str(guesser_result.get('response', '')) else "API_FAILURE"
            format_errors.append(f"第{turn}轮 猜测者: {guesser_result['error']}")
            
            return {
                'success': False,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': f"GUESSER_{error_type}",
                'error': f"{error_type}: {guesser_result['error']}",
                'failure_reason': error_type,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('提示者:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('猜测者:')]
            }
        
        conversation_history.append(f"猜测者: {guesser_result['response']}")
        guess = extract_chinese_guess_word(guesser_result['response'])
        
        # 检查是否成功
        if guess == target_word:
            return {
                'success': True,
                'turns': turn,
                'conversation': conversation_history,
                'final_guess': guess,
                'failure_reason': None,
                'total_hinter_attempts': total_hinter_attempts,
                'total_guesser_attempts': total_guesser_attempts,
                'format_errors': format_errors,
                'hinter_failed_outputs': hinter_failed_outputs,
                'guesser_failed_outputs': guesser_failed_outputs,
                'all_hints': [msg for msg in conversation_history if msg.startswith('提示者:')],
                'all_guesses': [msg for msg in conversation_history if msg.startswith('猜测者:')]
            }
        
        # 如果不是最后一轮，添加反馈
        if turn < max_turns:
            conversation_history.append(f"系统: '{guess}' 不正确。请继续！")
    
    # 达到最大轮数仍未成功
    return {
        'success': False,
        'turns': max_turns,
        'conversation': conversation_history,
        'final_guess': guess if 'guess' in locals() else 'N/A',
        'failure_reason': 'MAX_TURNS_EXCEEDED',
        'total_hinter_attempts': total_hinter_attempts,
        'total_guesser_attempts': total_guesser_attempts,
        'format_errors': format_errors,
        'hinter_failed_outputs': hinter_failed_outputs,
        'guesser_failed_outputs': guesser_failed_outputs,
        'all_hints': [msg for msg in conversation_history if msg.startswith('提示者:')],
        'all_guesses': [msg for msg in conversation_history if msg.startswith('猜测者:')]
    }

print("✅ 中文Taboo游戏主函数已定义")


In [None]:
# 10. 执行中文Taboo测试实验
print("🧪 开始执行中文Taboo测试实验...")

def run_chinese_test_experiment(client, models, dataset, num_test_words=3):
    """运行中文Taboo测试实验"""
    
    if not client:
        print("❌ API客户端未初始化，无法执行实验")
        return None
    
    print(f"\n🎯 测试配置:")
    print(f"   测试词汇数: {num_test_words}")
    print(f"   模型数量: {len(models)}")
    print(f"   总游戏数: {num_test_words * len(models) * len(models)}")
    
    # 随机选择测试词汇
    test_words = random.sample(dataset, min(num_test_words, len(dataset)))
    print(f"\n📋 选择的测试词汇:")
    for i, word_data in enumerate(test_words, 1):
        print(f"   {i}. {word_data['target']} ({word_data['part_of_speech']}) - 禁用词: {word_data['taboo']}")
    
    all_results = []
    total_games = len(test_words) * len(models) * len(models)
    game_counter = 0
    
    print(f"\n🚀 开始执行实验...")
    
    for word_data in test_words:
        target_word = word_data['target']
        taboo_words = word_data['taboo']
        
        print(f"\n🎯 测试词汇: {target_word}")
        print(f"🚫 禁用词: {taboo_words}")
        
        for hinter_model in models:
            for guesser_model in models:
                game_counter += 1
                hinter_name = hinter_model.split('/')[-1]
                guesser_name = guesser_model.split('/')[-1]
                pair_name = f"{hinter_name}→{guesser_name}"
                
                print(f"  🔄 游戏 {game_counter}/{total_games}: {pair_name}")
                
                start_time = time.time()
                
                try:
                    # 执行游戏
                    game_result = play_chinese_taboo_game(
                        client, hinter_model, guesser_model, 
                        target_word, taboo_words, max_turns=5
                    )
                    
                    duration = round(time.time() - start_time, 2)
                    
                    # 记录结果
                    result = {
                        'game_id': game_counter,
                        'target_word': target_word,
                        'part_of_speech': word_data['part_of_speech'],
                        'category': word_data['category'],
                        'taboo_words': '|'.join(taboo_words),
                        'hinter_model': hinter_model,
                        'guesser_model': guesser_model,
                        'success': game_result['success'],
                        'turns_used': game_result['turns'],
                        'final_guess': game_result['final_guess'],
                        'failure_reason': game_result.get('failure_reason', None),
                        'taboo_violation_turn': game_result.get('taboo_violation_turn', None),
                        'taboo_violation_hint': game_result.get('taboo_violation_hint', None),
                        'has_taboo_violation': game_result.get('failure_reason') == 'TABOO_VIOLATION',
                        'all_hints': ' | '.join(game_result['all_hints']),
                        'all_guesses': ' | '.join(game_result['all_guesses']),
                        'conversation': ' | '.join(game_result['conversation']),
                        'total_api_attempts': game_result.get('total_hinter_attempts', 0) + game_result.get('total_guesser_attempts', 0),
                        'format_errors': ' | '.join(game_result.get('format_errors', [])),
                        'has_format_errors': len(game_result.get('format_errors', [])) > 0,
                        'duration_seconds': duration,
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'language': 'chinese',
                        'dataset_source': 'openhownet'
                    }
                    
                    if 'error' in game_result:
                        result['error'] = game_result['error']
                    
                    all_results.append(result)
                    
                    # 显示结果
                    status = "✅ 成功" if game_result['success'] else "❌ 失败"
                    failure_info = ""
                    if not game_result['success'] and game_result.get('failure_reason'):
                        failure_reason = game_result['failure_reason']
                        if failure_reason == 'TABOO_VIOLATION':
                            failure_info = " (违反禁用词)"
                        elif failure_reason == 'FORMAT_FAILURE':
                            failure_info = " (格式错误)"
                        elif failure_reason == 'API_FAILURE':
                            failure_info = " (API失败)"
                        elif failure_reason == 'MAX_TURNS_EXCEEDED':
                            failure_info = " (轮数耗尽)"
                    
                    print(f"     {status}{failure_info} | {game_result['turns']}轮 | 最终猜测: {game_result['final_guess']}")
                    
                except Exception as e:
                    print(f"     ❌ 游戏执行异常: {e}")
                    # 记录异常结果
                    result = {
                        'game_id': game_counter,
                        'target_word': target_word,
                        'hinter_model': hinter_model,
                        'guesser_model': guesser_model,
                        'success': False,
                        'failure_reason': 'EXCEPTION',
                        'error': str(e),
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'language': 'chinese'
                    }
                    all_results.append(result)
                
                time.sleep(0.5)  # API调用间隔
    
    return all_results

# 执行测试实验
if chinese_client:
    test_results = run_chinese_test_experiment(
        chinese_client, CHINESE_TEST_MODELS, chinese_dataset, num_test_words=3
    )
    
    if test_results:
        print(f"\n🎉 中文Taboo测试实验完成！")
        print(f"📊 总游戏数: {len(test_results)}")
        
        # 统计结果
        successful_games = [r for r in test_results if r['success']]
        success_rate = len(successful_games) / len(test_results) * 100
        print(f"📈 成功率: {len(successful_games)}/{len(test_results)} ({success_rate:.1f}%)")
        
        # 保存测试结果
        test_results_path = f"results/chinese_test_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        os.makedirs("results", exist_ok=True)
        
        df_results = pd.DataFrame(test_results)
        df_results.to_csv(test_results_path, index=False, encoding='utf-8-sig')
        print(f"💾 测试结果已保存: {test_results_path}")
        
        # 按模型统计
        print(f"\n📊 各模型表现:")
        for model in CHINESE_TEST_MODELS:
            model_name = model.split('/')[-1]
            model_as_hinter = [r for r in test_results if r['hinter_model'] == model]
            model_as_guesser = [r for r in test_results if r['guesser_model'] == model]
            
            hinter_success = len([r for r in model_as_hinter if r['success']])
            guesser_success = len([r for r in model_as_guesser if r['success']])
            
            print(f"   {model_name}:")
            if len(model_as_hinter) > 0:
                print(f"     作为提示者: {hinter_success}/{len(model_as_hinter)} ({hinter_success/len(model_as_hinter)*100:.1f}%)")
            if len(model_as_guesser) > 0:
                print(f"     作为猜测者: {guesser_success}/{len(model_as_guesser)} ({guesser_success/len(model_as_guesser)*100:.1f}%)")
    else:
        print("❌ 测试实验失败")
else:
    print("❌ 无法执行测试实验：API客户端未初始化")


In [None]:
# 结果分析与可视化
print("📊 中文Taboo实验结果深度分析")
print("=" * 60)

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict
import glob

# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

def load_experiment_results():
    """加载所有实验结果文件"""
    results_files = glob.glob("results/**/chinese_*.csv", recursive=True)
    results_files.extend(glob.glob("results/**/*chinese*.csv", recursive=True))
    
    if not results_files:
        print("⚠️ 未找到实验结果文件")
        return None
    
    print(f"🔍 找到 {len(results_files)} 个结果文件:")
    for file in results_files:
        print(f"   - {file}")
    
    # 加载最新的结果文件
    latest_file = max(results_files, key=lambda x: x.split('_')[-1] if '_' in x else x)
    print(f"\n📂 加载最新结果文件: {latest_file}")
    
    try:
        df = pd.read_csv(latest_file, encoding='utf-8-sig')
        print(f"✅ 成功加载 {len(df)} 条记录")
        return df
    except Exception as e:
        print(f"❌ 加载失败: {e}")
        return None

def analyze_overall_performance(df):
    """整体性能分析"""
    print("\n🎯 整体性能分析")
    print("-" * 40)
    
    total_games = len(df)
    successful_games = len(df[df['success'] == True])
    success_rate = successful_games / total_games * 100 if total_games > 0 else 0
    
    print(f"总游戏数: {total_games}")
    print(f"成功游戏数: {successful_games}")
    print(f"整体成功率: {success_rate:.1f}%")
    
    # 失败原因分析
    failed_games = df[df['success'] == False]
    if len(failed_games) > 0:
        failure_reasons = failed_games['failure_reason'].value_counts()
        print(f"\n❌ 失败原因分布:")
        for reason, count in failure_reasons.items():
            percentage = count / len(failed_games) * 100
            print(f"   {reason}: {count} 次 ({percentage:.1f}%)")
    
    return {
        'total_games': total_games,
        'successful_games': successful_games,
        'success_rate': success_rate,
        'failure_reasons': failure_reasons if len(failed_games) > 0 else None
    }

def analyze_model_performance(df):
    """模型性能分析"""
    print("\n🤖 模型性能分析")
    print("-" * 40)
    
    models = list(set(df['hinter_model'].unique()) | set(df['guesser_model'].unique()))
    model_stats = {}
    
    for model in models:
        model_name = model.split('/')[-1] if '/' in model else model
        
        # 作为提示者的表现
        as_hinter = df[df['hinter_model'] == model]
        hinter_success = len(as_hinter[as_hinter['success'] == True])
        hinter_total = len(as_hinter)
        hinter_rate = hinter_success / hinter_total * 100 if hinter_total > 0 else 0
        
        # 作为猜测者的表现
        as_guesser = df[df['guesser_model'] == model]
        guesser_success = len(as_guesser[as_guesser['success'] == True])
        guesser_total = len(as_guesser)
        guesser_rate = guesser_success / guesser_total * 100 if guesser_total > 0 else 0
        
        model_stats[model_name] = {
            'hinter': {'success': hinter_success, 'total': hinter_total, 'rate': hinter_rate},
            'guesser': {'success': guesser_success, 'total': guesser_total, 'rate': guesser_rate}
        }
        
        print(f"\n{model_name}:")
        print(f"  作为提示者: {hinter_success}/{hinter_total} ({hinter_rate:.1f}%)")
        print(f"  作为猜测者: {guesser_success}/{guesser_total} ({guesser_rate:.1f}%)")
    
    return model_stats

def analyze_pos_performance(df):
    """词性性能分析"""
    print("\n📝 词性性能分析")
    print("-" * 40)
    
    if 'part_of_speech' not in df.columns:
        print("⚠️ 数据中未找到词性信息")
        return None
    
    pos_stats = {}
    pos_groups = df.groupby('part_of_speech')
    
    for pos, group in pos_groups:
        total = len(group)
        success = len(group[group['success'] == True])
        rate = success / total * 100 if total > 0 else 0
        
        pos_stats[pos] = {'total': total, 'success': success, 'rate': rate}
        print(f"{pos}: {success}/{total} ({rate:.1f}%)")
    
    return pos_stats

def create_visualizations(df, model_stats, pos_stats, overall_stats):
    """创建可视化图表"""
    print("\n📈 生成可视化图表")
    print("-" * 40)
    
    # 创建子图
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('中文Taboo实验结果分析', fontsize=16, fontweight='bold')
    
    # 1. 整体成功率饼图
    ax1 = axes[0, 0]
    success_data = [overall_stats['successful_games'], 
                   overall_stats['total_games'] - overall_stats['successful_games']]
    labels = [f'成功 ({overall_stats["success_rate"]:.1f}%)', 
              f'失败 ({100-overall_stats["success_rate"]:.1f}%)']
    colors = ['#2E8B57', '#DC143C']
    
    ax1.pie(success_data, labels=labels, colors=colors, autopct='%d', startangle=90)
    ax1.set_title('整体成功率分布')
    
    # 2. 模型性能对比
    ax2 = axes[0, 1]
    if model_stats:
        model_names = list(model_stats.keys())
        hinter_rates = [stats['hinter']['rate'] for stats in model_stats.values()]
        guesser_rates = [stats['guesser']['rate'] for stats in model_stats.values()]
        
        x = np.arange(len(model_names))
        width = 0.35
        
        bars1 = ax2.bar(x - width/2, hinter_rates, width, label='作为提示者', color='skyblue')
        bars2 = ax2.bar(x + width/2, guesser_rates, width, label='作为猜测者', color='lightcoral')
        
        ax2.set_xlabel('模型')
        ax2.set_ylabel('成功率 (%)')
        ax2.set_title('各模型性能对比')
        ax2.set_xticks(x)
        ax2.set_xticklabels([name[:10] for name in model_names], rotation=45)
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)
    
    # 3. 词性性能分析
    ax3 = axes[1, 0]
    if pos_stats:
        pos_names = list(pos_stats.keys())
        pos_rates = [stats['rate'] for stats in pos_stats.values()]
        pos_totals = [stats['total'] for stats in pos_stats.values()]
        
        bars = ax3.bar(pos_names, pos_rates, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
        ax3.set_ylabel('成功率 (%)')
        ax3.set_title('各词性表现')
        ax3.grid(axis='y', alpha=0.3)
        
        # 在柱子上添加数据标签
        for bar, total in zip(bars, pos_totals):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%\n(n={total})', ha='center', va='bottom', fontsize=9)
    
    # 4. 失败原因分析
    ax4 = axes[1, 1]
    if overall_stats['failure_reasons'] is not None and len(overall_stats['failure_reasons']) > 0:
        failure_reasons = overall_stats['failure_reasons']
        reasons = list(failure_reasons.keys())
        counts = list(failure_reasons.values())
        
        # 简化失败原因名称
        reason_mapping = {
            'TABOO_VIOLATION': '禁用词违规',
            'MAX_TURNS_EXCEEDED': '轮数耗尽',
            'FORMAT_FAILURE': '格式错误',
            'API_FAILURE': 'API失败',
            'EXCEPTION': '异常错误'
        }
        
        simplified_reasons = [reason_mapping.get(r, r) for r in reasons]
        
        wedges, texts, autotexts = ax4.pie(counts, labels=simplified_reasons, autopct='%1.1f%%', startangle=90)
        ax4.set_title('失败原因分布')
    else:
        ax4.text(0.5, 0.5, '无失败数据', ha='center', va='center', transform=ax4.transAxes)
        ax4.set_title('失败原因分布')
    
    plt.tight_layout()
    
    # 保存图表
    plot_filename = f"results/chinese_experiment_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    print(f"📊 图表已保存: {plot_filename}")
    
    plt.show()

def generate_detailed_report(df, model_stats, pos_stats, overall_stats):
    """生成详细分析报告"""
    print("\n📋 生成详细分析报告")
    print("-" * 40)
    
    report = {
        'experiment_summary': {
            'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_games': overall_stats['total_games'],
            'success_rate': overall_stats['success_rate'],
            'dataset_size': len(df['target_word'].unique()) if 'target_word' in df.columns else 0
        },
        'overall_performance': overall_stats,
        'model_performance': model_stats,
        'pos_performance': pos_stats
    }
    
    # 添加词汇级别分析
    if 'target_word' in df.columns:
        word_analysis = {}
        for word in df['target_word'].unique():
            word_games = df[df['target_word'] == word]
            word_success = len(word_games[word_games['success'] == True])
            word_total = len(word_games)
            word_rate = word_success / word_total * 100 if word_total > 0 else 0
            
            word_analysis[word] = {
                'success_rate': word_rate,
                'total_games': word_total,
                'success_count': word_success
            }
        
        # 找出最难和最容易的词汇
        sorted_words = sorted(word_analysis.items(), key=lambda x: x[1]['success_rate'])
        
        report['word_analysis'] = {
            'most_difficult': sorted_words[:5],  # 最难的5个词
            'easiest': sorted_words[-5:],  # 最容易的5个词
            'all_words': word_analysis
        }
        
        print(f"最难词汇 (成功率最低):")
        for word, stats in sorted_words[:5]:
            print(f"  {word}: {stats['success_rate']:.1f}% ({stats['success_count']}/{stats['total_games']})")
        
        print(f"\n最容易词汇 (成功率最高):")
        for word, stats in sorted_words[-5:]:
            print(f"  {word}: {stats['success_rate']:.1f}% ({stats['success_count']}/{stats['total_games']})")
    
    # 保存报告
    report_filename = f"results/chinese_experiment_detailed_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_filename, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    
    print(f"\n💾 详细报告已保存: {report_filename}")
    return report

# 主要分析流程
print("🚀 开始结果分析...")

# 1. 加载数据
df_results = load_experiment_results()

if df_results is not None:
    print(f"\n📊 数据概览:")
    print(f"   数据维度: {df_results.shape}")
    print(f"   列名: {list(df_results.columns)}")
    
    # 2. 整体性能分析
    overall_performance = analyze_overall_performance(df_results)
    
    # 3. 模型性能分析
    model_performance = analyze_model_performance(df_results)
    
    # 4. 词性性能分析
    pos_performance = analyze_pos_performance(df_results)
    
    # 5. 创建可视化
    create_visualizations(df_results, model_performance, pos_performance, overall_performance)
    
    # 6. 生成详细报告
    detailed_report = generate_detailed_report(df_results, model_performance, pos_performance, overall_performance)
    
    print(f"\n🎉 结果分析完成！")
    print(f"📈 关键指标:")
    print(f"   整体成功率: {overall_performance['success_rate']:.1f}%")
    print(f"   测试模型数: {len(model_performance)}")
    print(f"   词性覆盖: {len(pos_performance) if pos_performance else 0}")
    
else:
    print("❌ 无法进行结果分析：未找到有效的实验结果文件")
    print("💡 请先运行实验生成结果文件")

print("\n" + "=" * 60)
print("📋 分析完成 - 所有图表和报告已保存到 results/ 目录")
