In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Chart style.
# Apply the seaborn / matplotlib styles FIRST: both calls overwrite the
# font-related rcParams (font.family / font.sans-serif), so configuring the
# CJK-capable font list before them would be silently discarded and the
# Chinese titles/labels used throughout this notebook would render as boxes.
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8')

# Fonts able to render Chinese text, tried in order; DejaVu Sans is the
# generic fallback shipped with matplotlib.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
# Use the ASCII hyphen for negative numbers; some CJK fonts lack the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Shared colour palette used by every figure below.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']


In [None]:
# Load the Taboo dataset (a JSON array of word records).
with open('data/dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# One row per word record.
dataset_df = pd.DataFrame(dataset)

# Extract the concreteness score from the nested ``metadata`` dict.
# Records that lack ``metadata`` show up as NaN (a float) after DataFrame
# construction, so guard the ``.get`` call instead of raising AttributeError.
dataset_df['concreteness_score'] = dataset_df['metadata'].apply(
    lambda meta: meta.get('concreteness_score') if isinstance(meta, dict) else None
)

print(f"数据集总词汇数: {len(dataset_df)}")
print(f"词性分布: {dataset_df['part_of_speech'].value_counts().to_dict()}")
print(f"领域分布: {dataset_df['category'].value_counts().to_dict()}")
print(f"有抽象程度评分的词汇数: {dataset_df['concreteness_score'].notna().sum()}")

dataset_df.head()


In [None]:
# Load the experiment results (one row per played game).
results_df = pd.read_csv('results/taboo_experiment_20250712_004918/complete_experiment_results.csv')

print(f"实验结果总数: {len(results_df)}")
print(f"参与实验的模型: {results_df['hinter_model'].unique().tolist()}")
print(f"总体成功率: {results_df['success'].mean():.3f}")

# Human-readable display names for the model identifiers.
model_name_mapping = {
    'anthropic/claude-sonnet-4': 'Claude Sonnet 4',
    'openai/gpt-4o': 'GPT-4o',
    'google/gemini-2.5-pro': 'Gemini 2.5 Pro',
    'deepseek/deepseek-chat-v3-0324': 'DeepSeek Chat V3'
}

# Fall back to the raw identifier for any model missing from the mapping.
# A bare ``.map`` would yield NaN for those rows, and every later
# ``groupby('hinter_model_clean')`` would then silently drop their games.
results_df['hinter_model_clean'] = (
    results_df['hinter_model'].map(model_name_mapping).fillna(results_df['hinter_model'])
)
results_df['guesser_model_clean'] = (
    results_df['guesser_model'].map(model_name_mapping).fillna(results_df['guesser_model'])
)

results_df.head()


In [None]:
# Attach per-word attributes from the dataset to every game result.
# Build a slim lookup table keyed by the same column name the results use.
dataset_info = (
    dataset_df[['target', 'part_of_speech', 'category', 'concreteness_score']]
    .rename(columns={'target': 'target_word'})
)

# The join key must be unique on the dataset side: a duplicated target word
# would duplicate every matching game row and inflate all downstream counts
# and rates. Keep the first occurrence and report what was dropped.
duplicate_targets = dataset_info['target_word'].duplicated()
if duplicate_targets.any():
    print(f"Warning: dropped {int(duplicate_targets.sum())} duplicate target_word rows from the dataset lookup")
dataset_info = dataset_info[~duplicate_targets].copy()

# Left join keeps every game even when its word has no dataset entry;
# ``validate`` makes pandas raise if the lookup is not unique after all.
merged_df = results_df.merge(dataset_info, on='target_word', how='left', validate='many_to_one')

print(f"合并后数据量: {len(merged_df)}")
print(f"成功匹配词汇信息的比例: {merged_df['part_of_speech'].notna().mean():.3f}")

merged_df.head()


In [None]:
# Per-model overview: game count, wins, success rate, mean turns and
# taboo-violation rate, rounded to three decimals.
overview_spec = {
    'success': ['count', 'sum', 'mean'],
    'turns_used': 'mean',
    'has_taboo_violation': 'mean',
}
model_success = merged_df.groupby('hinter_model_clean').agg(overview_spec).round(3)
model_success.columns = ['总游戏数', '成功数', '成功率', '平均轮数', '违规率']
# Best-performing model first.
model_success = model_success.sort_values('成功率', ascending=False)

print("各模型表现总览:")
print(model_success)

# Two side-by-side bar charts: success rate (left) and mean turns (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
bar_colors = colors[:len(model_success)]

# (axis, column to plot, title, vertical label offset, label format)
panels = [
    (ax1, '成功率', '各模型成功率对比', 0.01, '{:.3f}'),
    (ax2, '平均轮数', '各模型平均轮数对比', 0.05, '{:.2f}'),
]

for axis, column, title, label_offset, label_format in panels:
    rects = axis.bar(model_success.index, model_success[column], color=bar_colors)
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.set_ylabel(column)
    axis.tick_params(axis='x', rotation=45)

    # Print each bar's value just above it.
    for rect in rects:
        bar_height = rect.get_height()
        axis.text(rect.get_x() + rect.get_width()/2., bar_height + label_offset,
                  label_format.format(bar_height), ha='center', va='bottom', fontweight='bold')

# Success rate is a proportion, so pin its axis to [0, 1].
ax1.set_ylim(0, 1)

plt.tight_layout()
plt.show()


In [None]:
# Turn-count analysis, restricted to games that ended in success.
successful_games = merged_df[merged_df['success'] == True]

# Number of turns covered by the per-turn and cumulative tables below.
max_turn = 5
turn_range = range(1, max_turn + 1)

# Raw counts of successful games per (model, turns used), and the same as
# row-wise proportions (each model's row sums to 1).
turns_analysis = successful_games.groupby(['hinter_model_clean', 'turns_used']).size().unstack(fill_value=0)
turns_pct = turns_analysis.div(turns_analysis.sum(axis=1), axis=0)

model_key = successful_games['hinter_model_clean']
turns_used = successful_games['turns_used']

# Share of each model's successful games that finished on exactly turn N.
# The mean of a boolean mask per group equals count(matching) / count(all).
turn_success_rates = {
    f'第{turn}轮成功率': turns_used.eq(turn).groupby(model_key).mean()
    for turn in turn_range
}

turn_success_df = pd.DataFrame(turn_success_rates).fillna(0)
print("各模型在不同轮数的成功率分布:")
print(turn_success_df.round(3))

# Share of each model's successful games that finished within the first N turns.
cumulative_success = {
    f'前{turn}轮累积成功率': turns_used.le(turn).groupby(model_key).mean()
    for turn in turn_range
}

cumulative_df = pd.DataFrame(cumulative_success).fillna(0)
print("\n各模型的累积成功率:")
print(cumulative_df.round(3))

# Four-panel figure with the detailed turn analysis.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# 1. Per-turn share, grouped by model.
turn_success_df.plot(kind='bar', ax=ax1, color=colors[:len(turn_success_df.columns)])
ax1.set_title('各模型在不同轮数的成功率', fontsize=14, fontweight='bold')
ax1.set_ylabel('成功率')
ax1.set_xlabel('模型')
ax1.legend(title='轮数', bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.tick_params(axis='x', rotation=45)

# 2. Stacked bars of the turn distribution.
# pandas' plot() has no ``legend_kws`` argument; unknown keywords are passed
# on to matplotlib's bar call, which rejects them. Style the legend through
# the Axes instead.
turns_pct.plot(kind='bar', stacked=True, ax=ax2, colormap='viridis')
ax2.legend(title='轮数', bbox_to_anchor=(1.05, 1), loc='upper left')
ax2.set_title('成功案例的轮数分布（百分比）', fontsize=14, fontweight='bold')
ax2.set_ylabel('比例')
ax2.set_xlabel('模型')
ax2.tick_params(axis='x', rotation=45)

# 3. Cumulative curves, one line per model.
turns = list(turn_range)
for i, model in enumerate(cumulative_df.index):
    rates = [cumulative_df.loc[model, f'前{turn}轮累积成功率'] for turn in turns]
    # Wrap around the palette so more models than colours cannot raise IndexError.
    ax3.plot(turns, rates, 'o-', linewidth=2, label=model, color=colors[i % len(colors)])

ax3.set_title('累积成功率曲线', fontsize=14, fontweight='bold')
ax3.set_xlabel('轮数')
ax3.set_ylabel('累积成功率')
ax3.legend()
ax3.grid(True, alpha=0.3)
ax3.set_xticks(turns)
ax3.set_ylim(0, 1)

# 4. First-turn wins versus wins on any later turn.
first_turn_vs_others = pd.DataFrame({
    '第1轮成功率': turn_success_df['第1轮成功率'],
    '其他轮成功率': 1 - turn_success_df['第1轮成功率']
})

first_turn_vs_others.plot(kind='bar', ax=ax4, color=['#2ca02c', '#ff7f0e'])
ax4.set_title('第1轮 vs 其他轮次成功率对比', fontsize=14, fontweight='bold')
ax4.set_ylabel('比例')
ax4.set_xlabel('模型')
ax4.legend(title='成功轮次')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Text summary. The "overall" figures are unweighted means across models.
print(f"\n📈 轮数分析总结:")
print(f"  • 总体第1轮成功率: {turn_success_df['第1轮成功率'].mean():.1%}")
print(f"  • 总体前3轮累积成功率: {cumulative_df['前3轮累积成功率'].mean():.1%}")
print(f"  • 第1轮成功率最高的模型: {turn_success_df['第1轮成功率'].idxmax()} ({turn_success_df['第1轮成功率'].max():.1%})")
print(f"  • 前3轮累积成功率最高的模型: {cumulative_df['前3轮累积成功率'].idxmax()} ({cumulative_df['前3轮累积成功率'].max():.1%})")


In [None]:
# Success rate broken down by part of speech and hinter model.
pos_success = (
    merged_df
    .groupby(['part_of_speech', 'hinter_model_clean'])
    .agg(**{
        '游戏数': ('success', 'count'),
        '成功率': ('success', 'mean'),
        '平均轮数': ('turns_used', 'mean'),
    })
    .round(3)
    .reset_index()
)

print("按词性的成功率分析:")
# Rows: part of speech, columns: model, cells: success rate.
pos_pivot = pos_success.pivot(index='part_of_speech', columns='hinter_model_clean', values='成功率')
print(pos_pivot.round(3))

# Figure: heatmap on top, overall per-POS bars underneath.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Heatmap of success rate per (part of speech, model).
sns.heatmap(pos_pivot, annot=True, fmt='.3f', cmap='YlOrRd',
            cbar_kws={'label': '成功率'}, ax=ax1)
ax1.set_title('各模型在不同词性上的成功率热力图', fontsize=14, fontweight='bold')
ax1.set_xlabel('模型')
ax1.set_ylabel('词性')

# Overall success rate per part of speech, lowest first so the best ends up on top.
overall_pos = merged_df.groupby('part_of_speech')['success'].mean().sort_values(ascending=True)
row_positions = range(len(overall_pos))
bars = ax2.barh(row_positions, overall_pos.values, color=colors[:len(overall_pos)])
ax2.set_yticks(row_positions)
ax2.set_yticklabels(overall_pos.index)
ax2.set_title('不同词性的整体成功率', fontsize=14, fontweight='bold')
ax2.set_xlabel('成功率')

# Write each bar's value just past its right end.
for rect in bars:
    bar_length = rect.get_width()
    ax2.text(bar_length + 0.01, rect.get_y() + rect.get_height()/2.,
             f'{bar_length:.3f}', ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Keep only games whose target word carries a concreteness score.
has_score = merged_df['concreteness_score'].notna()
concrete_df = merged_df[has_score].copy()

print(f"有抽象程度评分的实验数据: {len(concrete_df)} 条")

# Bucket the score into four right-closed intervals: (0,2], (2,3], (3,4], (4,5].
level_bins = [0, 2, 3, 4, 5]
level_labels = ['高抽象(0-2)', '中抽象(2-3)', '中具体(3-4)', '高具体(4-5)']
concrete_df['concreteness_level'] = pd.cut(
    concrete_df['concreteness_score'], bins=level_bins, labels=level_labels
)

# Aggregate per (concreteness level, model).
level_spec = {'success': ['count', 'mean'], 'turns_used': 'mean'}
concrete_success = (
    concrete_df.groupby(['concreteness_level', 'hinter_model_clean']).agg(level_spec).round(3)
)
concrete_success.columns = ['游戏数', '成功率', '平均轮数']
concrete_success = concrete_success.reset_index()

print("按抽象程度的成功率分析:")
# Rows: concreteness level, columns: model, cells: success rate.
concrete_pivot = concrete_success.pivot(
    index='concreteness_level', columns='hinter_model_clean', values='成功率'
)
print(concrete_pivot.round(3))

# Figure: grouped bars per level (left), raw score vs success scatter (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Grouped bars: one cluster per concreteness level, one bar per model.
concrete_pivot.plot(kind='bar', ax=ax1, color=colors[:len(concrete_pivot.columns)])
ax1.set_title('各模型在不同抽象程度词汇上的成功率', fontsize=14, fontweight='bold')
ax1.set_xlabel('抽象程度')
ax1.set_ylabel('成功率')
ax1.set_ylim(0, 1)
ax1.tick_params(axis='x', rotation=45)
ax1.legend(title='模型', bbox_to_anchor=(1.05, 1), loc='upper left')

# Scatter: mean success at each distinct score value, one colour per model.
for idx, model_name in enumerate(concrete_df['hinter_model_clean'].unique()):
    rows_for_model = concrete_df[concrete_df['hinter_model_clean'] == model_name]
    rate_per_score = rows_for_model.groupby('concreteness_score')['success'].mean()
    ax2.scatter(rate_per_score.index, rate_per_score.values,
                label=model_name, color=colors[idx], alpha=0.7, s=50)

ax2.set_title('抽象程度与成功率的关系', fontsize=14, fontweight='bold')
ax2.set_xlabel('具体程度评分 (1=抽象, 5=具体)')
ax2.set_ylabel('成功率')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Success rate broken down by category (domain) and hinter model.
category_success = merged_df.groupby(['category', 'hinter_model_clean']).agg({
    'success': ['count', 'mean'],
    'turns_used': 'mean'
}).round(3)

category_success.columns = ['游戏数', '成功率', '平均轮数']
category_success = category_success.reset_index()

print("按领域的成功率分析:")
# Rows: category, columns: model, cells: success rate.
category_pivot = category_success.pivot(index='category', columns='hinter_model_clean', values='成功率')
print(category_pivot.round(3))

# Optionally load the specialised-domain datasets for a size overview.
# Defined outside the ``try`` so the name always exists afterwards.
domain_datasets = {}
domains = ['cs', 'biology', 'law', 'literature', 'medical']

try:
    for domain in domains:
        try:
            # Explicit UTF-8, consistent with the other dataset loads in this
            # notebook; the platform default encoding may differ.
            with open(f'hpc_taboo/data/{domain}_wordnet_dataset.json', 'r', encoding='utf-8') as f:
                domain_datasets[domain] = json.load(f)
        except FileNotFoundError:
            # A missing domain file is expected and only reported.
            print(f"未找到{domain}数据集")

    print(f"\n加载了{len(domain_datasets)}个专业领域数据集")
    for domain, data in domain_datasets.items():
        print(f"{domain}: {len(data)} 个词汇")

except Exception as e:
    # Best effort: any other failure (e.g. malformed JSON) is reported and the
    # analysis continues with whatever was loaded so far.
    print(f"加载专业领域数据时出错: {e}")

# Category figure.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

if len(category_pivot) > 1:
    # Several categories: clusters along x are categories, bars are models,
    # so the x-axis label must name the category, not the model.
    category_pivot.plot(kind='bar', ax=ax, color=colors[:len(category_pivot.columns)])
    ax.set_title('各模型在不同领域的成功率', fontsize=14, fontweight='bold')
    ax.set_xlabel('领域')
    ax.legend(title='模型')
else:
    # Only one category present: show one bar per model instead.
    overall_category = merged_df.groupby('hinter_model_clean')['success'].mean()
    bars = ax.bar(overall_category.index, overall_category.values, color=colors[:len(overall_category)])
    ax.set_title('各模型在通用领域的成功率', fontsize=14, fontweight='bold')
    ax.set_xlabel('模型')

    # Print each bar's value just above it.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.3f}', ha='center', va='bottom', fontweight='bold')

ax.set_ylabel('成功率')
ax.tick_params(axis='x', rotation=45)
ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()


In [None]:
# Radar chart comparing the models on several performance dimensions.
from math import pi
from sklearn.preprocessing import MinMaxScaler

# Raw per-model metrics, rounded to three decimals:
#   success             -> plain success rate
#   turns_used          -> 1 / mean turns over successful games (higher = fewer turns)
#   has_taboo_violation -> 1 - violation rate (higher = better rule compliance)
metric_spec = {
    'success': 'mean',
    'turns_used': lambda turns: 1/(turns[merged_df.loc[turns.index, 'success']].mean()),
    'has_taboo_violation': lambda flags: 1-flags.mean(),
}
model_metrics = merged_df.groupby('hinter_model_clean').agg(metric_spec).round(3)

# Share of each model's successful games that were solved on the first turn.
won_games = merged_df[merged_df['success'] == True]
first_success_rate = won_games.groupby('hinter_model_clean').apply(
    lambda grp: (grp['turns_used'] == 1).sum() / len(grp)
)
model_metrics['第一次成功率'] = first_success_rate

model_metrics.columns = ['成功率', '效率指标', '规则遵守', '第一次成功率']

# Min-max scale every metric column to [0, 1] across models.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(model_metrics)
model_metrics_scaled = pd.DataFrame(
    scaled_values,
    index=model_metrics.index,
    columns=model_metrics.columns,
)

print("模型综合性能指标:")
print(model_metrics)

# Polar axes for the radar chart.
fig, ax = plt.subplots(1, 1, figsize=(10, 10), subplot_kw=dict(projection='polar'))

# One spoke per metric, evenly spaced; repeat the first angle to close the polygon.
n_axes = len(model_metrics.columns)
angles = [k / float(n_axes) * 2 * pi for k in range(n_axes)]
angles += angles[:1]

for idx, (model_name, scaled_row) in enumerate(model_metrics_scaled.iterrows()):
    polygon = scaled_row.tolist()
    polygon += polygon[:1]  # close the polygon

    ax.plot(angles, polygon, 'o-', linewidth=2, label=model_name, color=colors[idx])
    ax.fill(angles, polygon, alpha=0.25, color=colors[idx])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(model_metrics.columns)
ax.set_ylim(0, 1)
ax.set_title('模型综合性能雷达图', fontsize=16, fontweight='bold', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Error analysis: how often each failure reason occurs among failed games.
is_failed = merged_df['success'] == False
failure_analysis = merged_df.loc[is_failed, 'failure_reason'].value_counts()

print("失败原因分布:")
print(failure_analysis)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left: pie chart of failure reasons.
ax1.pie(
    failure_analysis.values,
    labels=failure_analysis.index,
    autopct='%1.1f%%',
    colors=colors[:len(failure_analysis)],
)
ax1.set_title('失败原因分布', fontsize=14, fontweight='bold')

# Right: taboo-violation rate per model.
violation_by_model = merged_df.groupby('hinter_model_clean')['has_taboo_violation'].mean()
bars = ax2.bar(
    violation_by_model.index,
    violation_by_model.values,
    color=colors[:len(violation_by_model)],
)
ax2.set_title('各模型违规率对比', fontsize=14, fontweight='bold')
ax2.set_ylabel('违规率')
ax2.tick_params(axis='x', rotation=45)

# Print each bar's value just above it.
for rect in bars:
    bar_height = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    ax2.text(label_x, bar_height + 0.005,
             f'{bar_height:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Difficulty analysis: a word's difficulty is the mean number of turns its
# successful games needed.
successful_games = merged_df[merged_df['success'] == True]
turns_per_word = successful_games.groupby('target_word')['turns_used'].agg(['mean', 'count'])
word_difficulty = turns_per_word.reset_index()
# Keep only words that were solved at least twice.
word_difficulty = word_difficulty.loc[lambda frame: frame['count'] >= 2]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left: histogram of per-word mean turns, with the overall mean marked.
average_difficulty = word_difficulty['mean'].mean()
ax1.hist(word_difficulty['mean'], bins=20, color=colors[0], alpha=0.7, edgecolor='black')
ax1.axvline(average_difficulty, color='red', linestyle='--',
            label=f'平均难度: {average_difficulty:.2f}轮')
ax1.set_title('词汇难度分布（基于平均轮数）', fontsize=14, fontweight='bold')
ax1.set_xlabel('平均轮数')
ax1.set_ylabel('词汇数量')
ax1.legend()

# Right: per-model success rate against mean turns over successful games.
model_turns_success = merged_df.groupby('hinter_model_clean').agg({
    'success': 'mean',
    'turns_used': lambda turns: turns[merged_df.loc[turns.index, 'success']].mean()
})

ax2.scatter(
    model_turns_success['turns_used'],
    model_turns_success['success'],
    s=100,
    color=colors[:len(model_turns_success)],
    alpha=0.7,
)

# Label every point with its model name.
for model_name, point in model_turns_success.iterrows():
    ax2.annotate(model_name, (point['turns_used'], point['success']),
                 xytext=(5, 5), textcoords='offset points', fontsize=10)

ax2.set_title('成功率与效率的关系', fontsize=14, fontweight='bold')
ax2.set_xlabel('平均轮数（成功案例）')
ax2.set_ylabel('成功率')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
from scipy import stats
from itertools import combinations

# Pairwise chi-square tests of success rate between hinter models.
# Rows without a model label are excluded, matching how groupby treats them.
models = merged_df['hinter_model_clean'].dropna().unique()

# Boolean success outcome per model (``== True`` maps anything else to False).
model_success_data = {
    model: merged_df.loc[merged_df['hinter_model_clean'] == model, 'success'] == True
    for model in models
}

print("模型间成功率差异的统计检验 (Chi-square test):")
print("="*60)

# Symmetric matrix of pairwise p-values; the diagonal stays NaN.
results_matrix = pd.DataFrame(index=models, columns=models, dtype=float)

for model1, model2 in combinations(models, 2):
    data1 = model_success_data[model1]
    data2 = model_success_data[model2]

    # Build the 2x2 contingency table directly from counts. Passing two
    # concatenated Series to pd.crosstab would align them on their indexes,
    # which do not correspond (original row labels vs. fresh 0..n-1 labels).
    contingency_table = pd.DataFrame(
        {
            'success': [int(data1.sum()), int(data2.sum())],
            'failure': [int((~data1).sum()), int((~data2).sum())],
        },
        index=[model1, model2],
    )

    try:
        chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    except ValueError:
        # chi2_contingency rejects tables with a zero expected frequency,
        # e.g. when both models succeeded (or failed) in every game.
        print(f"{model1} vs {model2}: 无法检验 (期望频数为0)")
        continue

    results_matrix.loc[model1, model2] = p_value
    results_matrix.loc[model2, model1] = p_value

    significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
    print(f"{model1} vs {model2}: p = {p_value:.4f} {significance}")

# NOTE: the p-values above are not corrected for multiple comparisons.
print("\n*** p < 0.001, ** p < 0.01, * p < 0.05, ns = not significant")


In [None]:
# Print the final summary report.
# Relies on tables built in earlier cells: model_success, turn_success_df,
# cumulative_df and concrete_df.
print("="*80)
print("                          TABOO游戏实验总结报告")
print("="*80)

print(f"\n📊 实验规模:")
print(f"  • 总游戏数: {len(merged_df):,}")
# Count models straight from the data rather than from a variable defined in
# the statistics cell, so this cell does not depend on that cell having run.
print(f"  • 参与模型: {merged_df['hinter_model_clean'].nunique()} 个")
print(f"  • 测试词汇: {merged_df['target_word'].nunique():,} 个")
print(f"  • 总体成功率: {merged_df['success'].mean():.1%}")

print(f"\n🏆 模型排名 (按成功率):")
model_ranking = model_success.sort_values('成功率', ascending=False)
for i, (model, data) in enumerate(model_ranking.iterrows(), 1):
    print(f"  {i}. {model}: {data['成功率']:.1%} (平均{data['平均轮数']:.1f}轮)")

print(f"\n⚡ 效率分析:")
first_turn_ranking = turn_success_df.sort_values('第1轮成功率', ascending=False)
fewest_turns = model_success.sort_values('平均轮数')
print(f"  • 第1轮成功率最高: {first_turn_ranking.index[0]} ({first_turn_ranking.iloc[0]['第1轮成功率']:.1%})")
print(f"  • 前3轮累积成功率最高: {cumulative_df['前3轮累积成功率'].idxmax()} ({cumulative_df['前3轮累积成功率'].max():.1%})")
print(f"  • 平均轮数最少: {fewest_turns.index[0]} ({fewest_turns.iloc[0]['平均轮数']:.1f}轮)")

print(f"\n📝 词性分析:")
if 'part_of_speech' in merged_df.columns:
    pos_overall = merged_df.groupby('part_of_speech')['success'].mean().sort_values(ascending=False)
    # Guard against an empty result (e.g. no word matched the dataset).
    if not pos_overall.empty:
        print(f"  • 最容易的词性: {pos_overall.index[0]} ({pos_overall.iloc[0]:.1%})")
        print(f"  • 最困难的词性: {pos_overall.index[-1]} ({pos_overall.iloc[-1]:.1%})")

print(f"\n🎯 抽象程度分析:")
if len(concrete_df) > 0:
    # ``concreteness_level`` is categorical: without observed=True, bins that
    # contain no games appear with a NaN mean, sort last, and would then be
    # reported as the "hardest" level.
    concrete_overall = (
        concrete_df.groupby('concreteness_level', observed=True)['success']
        .mean()
        .dropna()
        .sort_values(ascending=False)
    )
    if not concrete_overall.empty:
        print(f"  • 最容易的抽象程度: {concrete_overall.index[0]} ({concrete_overall.iloc[0]:.1%})")
        print(f"  • 最困难的抽象程度: {concrete_overall.index[-1]} ({concrete_overall.iloc[-1]:.1%})")

print(f"\n⚠️  规则遵守:")
violation_ranking = merged_df.groupby('hinter_model_clean')['has_taboo_violation'].mean().sort_values()
print(f"  • 违规率最低: {violation_ranking.index[0]} ({violation_ranking.iloc[0]:.1%})")
print(f"  • 违规率最高: {violation_ranking.index[-1]} ({violation_ranking.iloc[-1]:.1%})")

print(f"\n🔍 关键发现:")
best_model = model_ranking.index[0]
best_success_rate = model_ranking.iloc[0]['成功率']
worst_model = model_ranking.index[-1]
worst_success_rate = model_ranking.iloc[-1]['成功率']

print(f"  • {best_model} 表现最佳，成功率达到 {best_success_rate:.1%}")
print(f"  • 最佳与最差模型的成功率差距为 {best_success_rate - worst_success_rate:.1%}")
# Mean turns over successful games only; ``== True`` yields a proper boolean
# mask even if the column is not strictly boolean, as in the other cells.
print(f"  • 平均游戏轮数为 {merged_df[merged_df['success'] == True]['turns_used'].mean():.1f} 轮")

print("\n" + "="*80)


In [None]:
# Persist the key per-model results to a timestamped CSV in the working directory.
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Start from the overview table, then add (or overwrite) columns looked up by
# model name from tables built in earlier cells.
model_summary = model_success.copy()
columns_by_source = [
    ('第1轮成功率', turn_success_df['第1轮成功率']),
    ('前3轮累积成功率', cumulative_df['前3轮累积成功率']),
    ('违规率', violation_by_model),
]
for column_name, source_series in columns_by_source:
    model_summary[column_name] = model_summary.index.map(source_series)

model_summary.to_csv(f'analysis_results_{timestamp}.csv', encoding='utf-8')
print(f"\n✅ 分析结果已保存到: analysis_results_{timestamp}.csv")

print("\n🎉 数据分析完成！")


In [None]:
import json
import random
import time
import requests
import pandas as pd
from typing import Dict, List, Any
from datetime import datetime
import os

# Dataset loader
def load_dataset(dataset_path: str) -> List[Dict[str, Any]]:
    """Read a Taboo game dataset from a UTF-8 encoded JSON file.

    Args:
        dataset_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    with open(dataset_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

# Load the pre-generated dataset and report where it came from.
DATASET_PATH = "data/dataset.json"
dataset = load_dataset(DATASET_PATH)
print(f"✅ 数据集加载成功: {len(dataset)} 条记录")
print(f"📁 数据集路径: {DATASET_PATH}")

# Preview the first record, if there is one.
if len(dataset) > 0:
    sample = dataset[0]
    print(f"\n📋 数据样本:")
    print(f"   目标词: {sample['target']}")
    print(f"   禁用词: {sample['taboo']}")
    print(f"   类别: {sample.get('category', 'N/A')}")
    senses = sample.get('senses')
    if senses:
        # Show at most the first 100 characters of the first sense's definition.
        first_definition = senses[0].get('definition', 'N/A')
        print(f"   定义: {first_definition[:100]}...")


✅ 数据集加载成功: 300 条记录
📁 数据集路径: data/dataset.json

📋 数据样本:
   目标词: regent
   禁用词: ['board', 'members', 'trustee', 'committee', 'governing']
   类别: general
   定义: members of a governing board...
