# 不同表现分段学生的Tutor辅导提升效果分析

本notebook分析不同基础表现水平的学生，在接受tutor辅导后的Task4准确率提升情况。其中Task4准确率指智能体预测该学生能否答对题目（Yes/No）的预测准确率。

## 分析思路
1. 从baseline和tutoring_only两个实验结果中读取数据
2. 计算每个学生在baseline条件下的Task4准确率（作为学生基础水平）
3. 将学生按照baseline表现分为不同分段（如：低、中低、中高、高）
4. 统计每个分段学生在baseline和tutoring_only条件下的Task4准确率
5. 绘图展示不同分段的提升效果


In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Font setup for Chinese text in figures: list fonts to try in order and
# turn off the unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Pandas display options used when DataFrames are printed below.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)


## 1. 数据加载和预处理


In [None]:
# Load the pickled results of both experiment conditions.
print("正在加载实验数据...")

# NOTE(review): unpickling can execute arbitrary code; only load result files
# that come from a trusted source.

# Baseline condition.
with open('experiment_results_baseline_qwen-plus.pkl', 'rb') as baseline_file:
    baseline_data = pickle.load(baseline_file)

# Tutoring-only condition.
with open('experiment_results_tutoring_only_qwen-plus_v0.pkl', 'rb') as tutoring_file:
    tutoring_data = pickle.load(tutoring_file)

print(f"Baseline数据类型: {type(baseline_data)}")
print(f"Tutoring数据类型: {type(tutoring_data)}")

# When a payload is a dict, report its size and show the first few keys.
if isinstance(baseline_data, dict):
    baseline_keys = list(baseline_data)
    print(f"Baseline数据条目数: {len(baseline_keys)}")
    print(f"Baseline数据键示例: {baseline_keys[:3]}")

if isinstance(tutoring_data, dict):
    tutoring_keys = list(tutoring_data)
    print(f"Tutoring数据条目数: {len(tutoring_keys)}")
    print(f"Tutoring数据键示例: {tutoring_keys[:3]}")

print("\n数据加载完成！")


In [None]:
# Flatten the result dictionaries into DataFrames for easier analysis.
def dict_to_dataframe(data_dict):
    """
    Convert an experiment-result dictionary into a DataFrame.

    Expected layout: {(student_id, question_id): {task_results}}.
    Each entry becomes one row: the two key parts are stored in the
    ``student_id`` and ``question_id`` columns, followed by every field of
    the result mapping (a result field with the same name overrides the key
    part).
    """
    def _to_row(key, payload):
        sid, qid = key
        row = {'student_id': sid, 'question_id': qid}
        row.update(payload)
        return row

    return pd.DataFrame([_to_row(key, payload) for key, payload in data_dict.items()])

# Convert both result dictionaries.
baseline_df = dict_to_dataframe(baseline_data)
tutoring_df = dict_to_dataframe(tutoring_data)

banner = "=" * 80

# Shape, column names and a three-row preview of the baseline frame.
print(banner)
print("Baseline DataFrame信息:")
print(banner)
print(f"形状: {baseline_df.shape}")
print(f"列名: {baseline_df.columns.tolist()}")
print("\n前3行:")
print(baseline_df.head(3))

# Same overview for the tutoring frame.
print("\n" + banner)
print("Tutoring DataFrame信息:")
print(banner)
print(f"形状: {tutoring_df.shape}")
print(f"列名: {tutoring_df.columns.tolist()}")
print("\n前3行:")
print(tutoring_df.head(3))


In [None]:
# Task 4 accuracy: compare predicted_task4_score with true_score.
# Task 4: the agent predicts whether the student answers the question correctly (Yes/No).

def calculate_task4_accuracy(df):
    """
    Score the agent's Task 4 predictions against the students' true results.

    Two columns are added to a copy of ``df``:

    * ``true_answer``: ``true_score`` mapped to ``'Yes'`` (1) or ``'No'`` (0);
      any other value becomes NaN.
    * ``task4_correct``: 1 where ``predicted_task4_score`` equals
      ``true_answer``, otherwise 0 (rows with a NaN ``true_answer`` therefore
      count as incorrect).

    Args:
        df: DataFrame with ``true_score`` and ``predicted_task4_score`` columns.

    Returns:
        A new DataFrame with the two columns added. The input frame is not
        modified, so callers that keep a reference to it do not see columns
        appear as a side effect.
    """
    # Work on a copy instead of writing new columns into the caller's frame.
    scored = df.copy()

    # Express the ground truth in the same Yes/No vocabulary as the prediction.
    scored['true_answer'] = scored['true_score'].map({1: 'Yes', 0: 'No'})

    # 1 when the prediction matches the ground truth, else 0.
    scored['task4_correct'] = (scored['predicted_task4_score'] == scored['true_answer']).astype(int)

    return scored

# Score both datasets.
baseline_df = calculate_task4_accuracy(baseline_df)
tutoring_df = calculate_task4_accuracy(tutoring_df)

# Overall Task 4 accuracy per condition, and the gap between them.
baseline_overall = baseline_df['task4_correct'].mean()
tutoring_overall = tutoring_df['task4_correct'].mean()

print("Task4准确率计算完成！")
print(f"\nBaseline整体Task4准确率: {baseline_overall:.2%}")
print(f"Tutoring整体Task4准确率: {tutoring_overall:.2%}")
print(f"整体提升: {(tutoring_overall - baseline_overall):.2%}")


## 2. 学生表现分段和统计


In [None]:
# Per-student Task 4 accuracy under the baseline condition (used as the
# student's starting level), plus the number of questions per student.
per_student = baseline_df.groupby('student_id')
student_baseline_accuracy = pd.DataFrame({
    'baseline_accuracy': per_student['task4_correct'].mean(),
    'question_count': per_student['question_id'].count(),
})

rule = "=" * 80
baseline_scores = student_baseline_accuracy['baseline_accuracy']

print(rule)
print("学生Baseline表现统计:")
print(rule)
print(student_baseline_accuracy.describe())
print(f"\n学生总数: {len(student_baseline_accuracy)}")
print(f"平均答题数: {student_baseline_accuracy['question_count'].mean():.1f}")
print(f"平均准确率: {baseline_scores.mean():.2%}")

# Visualise the distribution of per-student baseline accuracy:
# a histogram on the left and a box plot on the right.
plt.figure(figsize=(12, 5))

# Left panel: histogram with a dashed red line at the mean.
plt.subplot(1, 2, 1)
plt.hist(student_baseline_accuracy['baseline_accuracy'], bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Baseline Task4准确率')
plt.ylabel('学生人数')
plt.title('学生Baseline表现分布')
plt.axvline(student_baseline_accuracy['baseline_accuracy'].mean(), 
            color='red', linestyle='--', label=f'平均值: {student_baseline_accuracy["baseline_accuracy"].mean():.2%}')
plt.legend()
plt.grid(axis='y', alpha=0.3)

# Right panel: box plot of the same values.
plt.subplot(1, 2, 2)
plt.boxplot(student_baseline_accuracy['baseline_accuracy'])
plt.ylabel('Baseline Task4准确率')
plt.title('学生Baseline表现箱线图')
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Split students into four bands by baseline accuracy, using quartile cut
# points: low (0-25%), mid-low (25%-50%), mid-high (50%-75%), high (75%-100%).

quartiles = student_baseline_accuracy['baseline_accuracy'].quantile([0.25, 0.5, 0.75])
q1_cut, median_cut, q3_cut = quartiles[0.25], quartiles[0.5], quartiles[0.75]

heading_rule = "=" * 80
print(heading_rule)
print("四分位数:")
print(heading_rule)
print(f"25%分位数 (Q1): {q1_cut:.2%}")
print(f"50%分位数 (Q2/中位数): {median_cut:.2%}")
print(f"75%分位数 (Q3): {q3_cut:.2%}")

# Map a baseline accuracy to its performance band.
def assign_performance_segment(accuracy):
    """
    Return the band label for ``accuracy`` based on the module-level
    ``quartiles`` cut points.

    Upper bounds are inclusive: a value equal to a cut point falls in the
    lower band. Anything above the 75% cut point (or not comparable, such
    as NaN) gets the top band.
    """
    upper_bounds = (
        (0.25, '低水平 (Q1)'),
        (0.5, '中低水平 (Q2)'),
        (0.75, '中高水平 (Q3)'),
    )
    for level, label in upper_bounds:
        if accuracy <= quartiles[level]:
            return label
    return '高水平 (Q4)'

# Label every student with a performance band.
baseline_column = student_baseline_accuracy['baseline_accuracy']
student_baseline_accuracy['segment'] = baseline_column.apply(assign_performance_segment)

divider = "=" * 80

# Number of students per band.
print("\n" + divider)
print("各分段学生数量:")
print(divider)
segment_counts = student_baseline_accuracy['segment'].value_counts().sort_index()
print(segment_counts)

# Accuracy range, mean and head count for each band.
print("\n" + divider)
print("各分段准确率范围:")
print(divider)
for segment in ['低水平 (Q1)', '中低水平 (Q2)', '中高水平 (Q3)', '高水平 (Q4)']:
    in_segment = student_baseline_accuracy['segment'] == segment
    segment_data = student_baseline_accuracy.loc[in_segment, 'baseline_accuracy']
    print(f"{segment}: {segment_data.min():.2%} ~ {segment_data.max():.2%}, "
          f"平均: {segment_data.mean():.2%}, 学生数: {len(segment_data)}")


In [None]:
# Per-student Task 4 accuracy under the tutoring condition.
student_tutoring_accuracy = tutoring_df.groupby('student_id').agg({
    'task4_correct': 'mean',
    'question_id': 'count'
}).rename(columns={
    'task4_correct': 'tutoring_accuracy',
    'question_id': 'tutoring_question_count'
})

# Inner join: only students present in both conditions are compared.
student_comparison = student_baseline_accuracy.join(student_tutoring_accuracy, how='inner')

# Absolute improvement (difference of the two accuracy fractions).
student_comparison['improvement'] = student_comparison['tutoring_accuracy'] - student_comparison['baseline_accuracy']

# Relative improvement is undefined for a zero baseline. Dividing by 0 would
# give +/-inf (or NaN for 0/0), and a single inf makes every mean over this
# column infinite. Mask zero baselines to NaN so those students are skipped
# by the means instead.
nonzero_baseline = student_comparison['baseline_accuracy'].where(
    student_comparison['baseline_accuracy'] != 0
)
student_comparison['improvement_pct'] = (student_comparison['improvement'] / nonzero_baseline) * 100

print("=" * 80)
print("学生Baseline vs Tutoring对比:")
print("=" * 80)
print(f"参与对比的学生数: {len(student_comparison)}")
print(f"\nBaseline平均准确率: {student_comparison['baseline_accuracy'].mean():.2%}")
print(f"Tutoring平均准确率: {student_comparison['tutoring_accuracy'].mean():.2%}")
print(f"平均绝对提升: {student_comparison['improvement'].mean():.2%}")
print(f"平均相对提升: {student_comparison['improvement_pct'].mean():.2f}%")

# The five students with the largest and the smallest absolute improvement.
display_columns = ['segment', 'baseline_accuracy', 'tutoring_accuracy', 'improvement']
top_5_improved = student_comparison.nlargest(5, 'improvement')[display_columns]
bottom_5_improved = student_comparison.nsmallest(5, 'improvement')[display_columns]

for heading, ranked in (
    ("提升最大的5名学生:", top_5_improved),
    ("提升最小的5名学生:", bottom_5_improved),
):
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)
    for idx, row in ranked.iterrows():
        print(f"学生{idx}: {row['segment']}, Baseline={row['baseline_accuracy']:.2%}, "
              f"Tutoring={row['tutoring_accuracy']:.2%}, 提升={row['improvement']:+.2%}")


In [None]:
# Aggregate Task 4 accuracy statistics per performance band.
segment_stats = student_comparison.groupby('segment').agg({
    'baseline_accuracy': ['mean', 'std', 'count'],
    'tutoring_accuracy': ['mean', 'std'],
    'improvement': ['mean', 'std', 'min', 'max'],
    'improvement_pct': 'mean'
})

# Flatten the two-level column index into names like 'improvement_mean'.
segment_stats.columns = ['_'.join(col).strip() for col in segment_stats.columns.values]

# Put the rows in band order, but only for bands that actually have students.
# Reindexing on the full list would insert an all-NaN row for every empty
# band (possible when quartile cut points tie), which makes later
# `segment in segment_stats.index` checks always true and makes
# int(row['baseline_accuracy_count']) raise ValueError on NaN.
segment_order = ['低水平 (Q1)', '中低水平 (Q2)', '中高水平 (Q3)', '高水平 (Q4)']
present_segments = [seg for seg in segment_order if seg in segment_stats.index]
segment_stats = segment_stats.reindex(present_segments)

print("=" * 80)
print("各分段Task4准确率统计:")
print("=" * 80)
print(segment_stats)

# A more readable per-band comparison table.
print("\n" + "=" * 80)
print("各分段详细对比:")
print("=" * 80)
print(f"{'分段':<15} {'学生数':<8} {'Baseline准确率':<18} {'Tutoring准确率':<18} {'绝对提升':<18} {'相对提升%':<12}")
print("-" * 100)
for segment in present_segments:
    row = segment_stats.loc[segment]
    print(f"{segment:<15} "
          f"{int(row['baseline_accuracy_count']):<8} "
          f"{row['baseline_accuracy_mean']:.2%} ± {row['baseline_accuracy_std']:.2%}  "
          f"{row['tutoring_accuracy_mean']:.2%} ± {row['tutoring_accuracy_std']:.2%}  "
          f"{row['improvement_mean']:+.2%} ± {row['improvement_std']:.2%}  "
          f"{row['improvement_pct_mean']:+.1f}%")


## 3. 可视化分析


In [None]:
# Collect the per-band values to plot (means and stds converted from
# fractions to percentages).
segment_order = ['低水平 (Q1)', '中低水平 (Q2)', '中高水平 (Q3)', '高水平 (Q4)']
segments = []
baseline_means = []
tutoring_means = []
baseline_stds = []
tutoring_stds = []
improvements = []

for segment in segment_order:
    if segment in segment_stats.index:
        row = segment_stats.loc[segment]
        segments.append(segment.replace(' (Q', '\n(Q'))  # line break before "(Qn)" for nicer tick labels
        baseline_means.append(row['baseline_accuracy_mean'] * 100)
        tutoring_means.append(row['tutoring_accuracy_mean'] * 100)
        baseline_stds.append(row['baseline_accuracy_std'] * 100)
        tutoring_stds.append(row['tutoring_accuracy_std'] * 100)
        improvements.append(row['improvement_mean'] * 100)

# 2x2 figure comparing Baseline and Tutoring.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Panel 1: grouped bar chart per band, with std error bars.
ax1 = axes[0, 0]
x = np.arange(len(segments))
width = 0.35

bars1 = ax1.bar(x - width/2, baseline_means, width, label='Baseline', 
                color='#5B9BD5', alpha=0.8, yerr=baseline_stds, capsize=5)
bars2 = ax1.bar(x + width/2, tutoring_means, width, label='Tutoring', 
                color='#70AD47', alpha=0.8, yerr=tutoring_stds, capsize=5)

ax1.set_xlabel('学生表现分段', fontsize=12, fontweight='bold')
ax1.set_ylabel('Task4准确率 (%)', fontsize=12, fontweight='bold')
ax1.set_title('不同分段学生的Task4准确率对比', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(segments)
ax1.legend(fontsize=11)
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim([0, 100])

# Write the value above each bar.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{height:.1f}%', ha='center', va='bottom', fontsize=9)

# Panel 2: mean improvement per band (red when negative, green otherwise).
ax2 = axes[0, 1]
colors = ['#C55A5A' if imp < 0 else '#70AD47' for imp in improvements]
bars3 = ax2.bar(segments, improvements, color=colors, alpha=0.8, edgecolor='black')

ax2.set_xlabel('学生表现分段', fontsize=12, fontweight='bold')
ax2.set_ylabel('准确率提升 (百分点)', fontsize=12, fontweight='bold')
ax2.set_title('Tutoring相对Baseline的提升幅度', fontsize=14, fontweight='bold')
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
ax2.grid(axis='y', alpha=0.3)

# Value labels: above positive bars, below the others.
for bar in bars3:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + (0.5 if height > 0 else -0.5),
            f'{height:+.1f}', ha='center', va='bottom' if height > 0 else 'top', 
            fontsize=10, fontweight='bold')

# Panel 3: line chart of both conditions, with the gap between them shaded.
ax3 = axes[1, 0]
ax3.plot(segments, baseline_means, marker='o', linewidth=2.5, markersize=10, 
         label='Baseline', color='#5B9BD5')
ax3.plot(segments, tutoring_means, marker='s', linewidth=2.5, markersize=10, 
         label='Tutoring', color='#70AD47')
ax3.fill_between(range(len(segments)), baseline_means, tutoring_means, 
                  alpha=0.2, color='green')

ax3.set_xlabel('学生表现分段', fontsize=12, fontweight='bold')
ax3.set_ylabel('Task4准确率 (%)', fontsize=12, fontweight='bold')
ax3.set_title('准确率变化趋势', fontsize=14, fontweight='bold')
ax3.legend(fontsize=11)
ax3.grid(True, alpha=0.3)
ax3.set_ylim([0, 100])

# Panel 4: box plot of per-student improvement (percentage points) in each band.
ax4 = axes[1, 1]
improvement_by_segment = [
    student_comparison[student_comparison['segment'] == seg]['improvement'].values * 100
    for seg in segment_order
]
bp = ax4.boxplot(improvement_by_segment, labels=[s.replace(' (Q', '\n(Q') for s in segment_order],
                  patch_artist=True, showmeans=True)

# Colour the boxes.
# NOTE(review): the last two bands share the same fill colour here - confirm this is intended.
for patch, color in zip(bp['boxes'], ['#C55A5A', '#F4B183', '#70AD47', '#70AD47']):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

ax4.set_xlabel('学生表现分段', fontsize=12, fontweight='bold')
ax4.set_ylabel('准确率提升 (百分点)', fontsize=12, fontweight='bold')
ax4.set_title('各分段提升幅度分布', fontsize=14, fontweight='bold')
ax4.axhline(y=0, color='red', linestyle='--', linewidth=1.5, label='零提升线')
ax4.legend(fontsize=10)
ax4.grid(axis='y', alpha=0.3)

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('tutoring_improvement_by_segment.png', dpi=300, bbox_inches='tight')
plt.show()

print("图表已保存为 'tutoring_improvement_by_segment.png'")


In [None]:
# Extra analysis: per-student view of the change between conditions.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Panel 1: scatter of baseline vs tutoring accuracy, coloured by band.
ax1 = axes[0]
segment_colors = {
    '低水平 (Q1)': '#C55A5A',
    '中低水平 (Q2)': '#F4B183', 
    '中高水平 (Q3)': '#9DC3E6',
    '高水平 (Q4)': '#70AD47'
}

for segment in segment_order:
    segment_data = student_comparison[student_comparison['segment'] == segment]
    ax1.scatter(segment_data['baseline_accuracy'] * 100, 
               segment_data['tutoring_accuracy'] * 100,
               label=segment, alpha=0.6, s=100, 
               color=segment_colors[segment], edgecolors='black', linewidth=0.5)

# y = x reference line: points above it improved, points below it declined.
ax1.plot([0, 100], [0, 100], 'k--', alpha=0.5, linewidth=2, label='y=x (无变化)')

ax1.set_xlabel('Baseline准确率 (%)', fontsize=12, fontweight='bold')
ax1.set_ylabel('Tutoring准确率 (%)', fontsize=12, fontweight='bold')
ax1.set_title('个体学生准确率变化散点图', fontsize=14, fontweight='bold')
ax1.legend(fontsize=10, loc='upper left')
ax1.grid(True, alpha=0.3)
ax1.set_xlim([0, 100])
ax1.set_ylim([0, 100])
ax1.set_aspect('equal')

# Panel 2: overlaid histograms of improvement (percentage points), one per band.
ax2 = axes[1]
for segment in segment_order:
    segment_data = student_comparison[student_comparison['segment'] == segment]
    ax2.hist(segment_data['improvement'] * 100, 
            bins=15, alpha=0.5, label=segment, 
            color=segment_colors[segment], edgecolor='black')

ax2.axvline(x=0, color='red', linestyle='--', linewidth=2, label='零提升线')
ax2.set_xlabel('准确率提升 (百分点)', fontsize=12, fontweight='bold')
ax2.set_ylabel('学生人数', fontsize=12, fontweight='bold')
ax2.set_title('提升幅度分布直方图', fontsize=14, fontweight='bold')
ax2.legend(fontsize=10)
ax2.grid(axis='y', alpha=0.3)

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('individual_student_improvement_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("图表已保存为 'individual_student_improvement_analysis.png'")


## 4. 统计显著性检验


In [None]:
# Paired t-test per performance band: is the baseline -> tutoring change
# statistically significant?
from scipy import stats

print("=" * 80)
print("统计显著性检验 (配对t检验)")
print("=" * 80)
# The header label is bound to a name first: an apostrophe cannot be written
# inside a single-quoted literal in an f-string replacement field without a
# backslash, and the escaped form previously used here was a SyntaxError.
effect_size_label = "Cohen's d"
print(f"{'分段':<15} {'t统计量':<12} {'p值':<12} {'显著性':<15} {effect_size_label:<12}")
print("-" * 80)

for segment in segment_order:
    segment_data = student_comparison[student_comparison['segment'] == segment]

    if len(segment_data) >= 2:  # a paired test needs at least two students
        baseline = segment_data['baseline_accuracy'].values
        tutoring = segment_data['tutoring_accuracy'].values

        # Paired t-test (tutoring minus baseline).
        t_stat, p_value = stats.ttest_rel(tutoring, baseline)

        # Effect size for paired samples: mean difference over the SD of the
        # differences. ddof=1 gives the sample SD, the same estimate the
        # paired t statistic is built on (numpy's default ddof=0 would
        # overstate d for small bands).
        diff = tutoring - baseline
        diff_sd = diff.std(ddof=1)
        cohens_d = diff.mean() / diff_sd if diff_sd > 0 else 0

        # Significance stars; a NaN p-value falls through to "ns".
        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns (不显著)"

        print(f"{segment:<15} {t_stat:>11.3f} {p_value:>11.4f} {significance:<15} {cohens_d:>11.3f}")
    else:
        print(f"{segment:<15} {'N/A':<12} {'N/A':<12} {'样本不足':<15} {'N/A':<12}")

print("\n注: *** p<0.001, ** p<0.01, * p<0.05, ns=不显著")
print("Cohen's d效应量: |d|<0.2为小效应, 0.2-0.8为中等效应, >0.8为大效应")

## 5. 结论总结


In [None]:
# Print the final analysis report.
print("=" * 80)
print("Tutor辅导提升效果分析报告".center(80))
print("=" * 80)

print("\n【1. 整体效果】")
print(f"  • 参与学生总数: {len(student_comparison)}")
print(f"  • Baseline平均准确率: {student_comparison['baseline_accuracy'].mean():.2%}")
print(f"  • Tutoring平均准确率: {student_comparison['tutoring_accuracy'].mean():.2%}")
print(f"  • 平均绝对提升: {student_comparison['improvement'].mean():+.2%}")
print(f"  • 提升学生比例: {(student_comparison['improvement'] > 0).sum() / len(student_comparison):.1%}")
print(f"  • 下降学生比例: {(student_comparison['improvement'] < 0).sum() / len(student_comparison):.1%}")

# Report only bands that contain at least one compared student.
# segment_stats can carry an all-NaN row for an empty band, in which case an
# index-membership check alone is not enough: int(NaN) raises ValueError and
# NaN means silently break the trend comparison below.
reported_segments = [
    seg for seg in segment_order
    if seg in segment_stats.index and (student_comparison['segment'] == seg).any()
]

print("\n【2. 分段效果】")
for segment in reported_segments:
    row = segment_stats.loc[segment]
    segment_data = student_comparison[student_comparison['segment'] == segment]

    print(f"\n  {segment}:")
    print(f"    - 学生数: {int(row['baseline_accuracy_count'])}")
    print(f"    - Baseline: {row['baseline_accuracy_mean']:.2%}")
    print(f"    - Tutoring: {row['tutoring_accuracy_mean']:.2%}")
    print(f"    - 平均提升: {row['improvement_mean']:+.2%}")
    print(f"    - 提升范围: {row['improvement_min']:+.2%} ~ {row['improvement_max']:+.2%}")
    print(f"    - 提升学生占比: {(segment_data['improvement'] > 0).sum() / len(segment_data):.1%}")

print("\n【3. 关键发现】")
# Mean improvement per reported band, in band order.
improvement_means = segment_stats.loc[reported_segments, 'improvement_mean'].dropna()

if not improvement_means.empty:
    # Band with the largest mean improvement.
    best_segment = improvement_means.idxmax()
    best_improvement = improvement_means.loc[best_segment]
    print(f"  • 提升最显著的分段: {best_segment} (平均提升 {best_improvement:+.2%})")

    # Band with the smallest mean improvement.
    worst_segment = improvement_means.idxmin()
    worst_improvement = improvement_means.loc[worst_segment]
    print(f"  • 提升最不明显的分段: {worst_segment} (平均提升 {worst_improvement:+.2%})")

# Trend: compare the lowest reported band with the highest one.
improvements_by_segment = improvement_means.tolist()
if len(improvements_by_segment) >= 2:
    if improvements_by_segment[0] > improvements_by_segment[-1]:
        print(f"  • 趋势: 低水平学生从Tutoring中获益更多")
    elif improvements_by_segment[0] < improvements_by_segment[-1]:
        print(f"  • 趋势: 高水平学生从Tutoring中获益更多")
    else:
        print(f"  • 趋势: 各水平学生获益相当")

print("\n" + "=" * 80)
