In [11]:
from scipy.stats import chi2_contingency
import numpy as np
from sklearn.metrics import f1_score
import pandas as pd
from itertools import combinations

In [12]:
def accuracy_chi_square_test(df, subset_col, gold_col, pred_col, subset_names=None):
    """
    Chi-square test of whether prediction accuracy differs across subsets.

    One contingency-table row ``[n_correct, n_incorrect]`` is built per subset
    and the whole table is handed to ``scipy.stats.chi2_contingency``. The
    table, the chi-square statistic and the p-value are printed to stdout.

    Parameters
    ----------
    df : DataFrame holding the prediction results.
    subset_col : name of the column that identifies the subset
        (e.g. 'direct_source').
    gold_col : name of the column with the gold labels (e.g. 'gold_judgment').
    pred_col : name of the column with the model predictions
        (e.g. 'model_pred').
    subset_names : optional list of subset names to compare; when omitted,
        every distinct value found in ``subset_col`` is used.

    Returns
    -------
    accuracy_table : list of ``[n_correct, n_incorrect]`` rows, one per subset,
        in the order of ``subset_names``.
    p_value : p-value of the chi-square test.
    """
    groups = df[subset_col].unique() if subset_names is None else subset_names

    def _hits_and_misses(name):
        # A prediction counts as correct when it equals the gold label.
        rows = df.loc[df[subset_col] == name]
        hits = (rows[gold_col] == rows[pred_col]).sum()
        return [hits, len(rows) - hits]

    accuracy_table = [_hits_and_misses(name) for name in groups]

    chi2, p, _dof, _expected = chi2_contingency(accuracy_table)

    print("Contingency Table:", accuracy_table)
    print(f"Chi-square stat = {chi2:.4f}, p-value = {p:.4f}")
    return accuracy_table, p

def pairwise_accuracy_chi_square(df, subset_col, gold_col, pred_col, subset_names=None, alpha=0.05):
    """
    Pairwise chi-square tests of accuracy between subsets, Bonferroni-corrected.

    For every unordered pair of subsets a 2x2 table
    ``[[correct_a, wrong_a], [correct_b, wrong_b]]`` is tested with
    ``scipy.stats.chi2_contingency`` using its default arguments (per the SciPy
    documentation these include Yates' continuity correction for 2x2 tables).
    A pair is flagged significant when its p-value is below
    ``alpha / number_of_pairs``.

    Parameters
    ----------
    df : DataFrame holding the prediction results.
    subset_col : name of the column that identifies the subset.
    gold_col : name of the column with the gold labels.
    pred_col : name of the column with the model predictions.
    subset_names : optional iterable of subset names to compare; when omitted,
        every distinct value found in ``subset_col`` is used.
    alpha : family-wise significance level before the Bonferroni correction.

    Returns
    -------
    DataFrame with one row per pair and the columns 'comparison',
    'accuracy_a', 'accuracy_b', 'p_value' and 'significant'. When fewer than
    two subsets are available the frame is empty but has the same columns.

    Raises
    ------
    ValueError
        If one of the subsets has no rows in ``df`` (a chi-square test is not
        defined for an empty group), or if ``chi2_contingency`` itself rejects
        the table.
    """
    if subset_names is None:
        subset_names = df[subset_col].unique()

    result_columns = ['comparison', 'accuracy_a', 'accuracy_b', 'p_value', 'significant']

    comparisons = list(combinations(subset_names, 2))
    if not comparisons:
        # Fewer than two subsets means there is nothing to compare; returning
        # here also avoids dividing alpha by zero below.
        return pd.DataFrame(columns=result_columns)

    # Bonferroni correction: split alpha evenly over all pairwise tests.
    corrected_alpha = alpha / len(comparisons)

    def _correct_and_total(name):
        # Number of rows whose prediction equals the gold label, and row count.
        sub_df = df[df[subset_col] == name]
        if len(sub_df) == 0:
            # Fail with an actionable message instead of SciPy's generic
            # "expected frequencies has a zero element" error.
            raise ValueError(
                f"Subset {name!r} has no rows in column {subset_col!r}; "
                "cannot run a chi-square test on an empty group."
            )
        return (sub_df[gold_col] == sub_df[pred_col]).sum(), len(sub_df)

    results = []
    for a, b in comparisons:
        correct_a, total_a = _correct_and_total(a)
        correct_b, total_b = _correct_and_total(b)

        contingency_table = [
            [correct_a, total_a - correct_a],
            [correct_b, total_b - correct_b],
        ]
        _chi2, p, _dof, _expected = chi2_contingency(contingency_table)

        results.append({
            'comparison': f"{a} vs {b}",
            'accuracy_a': correct_a / total_a,
            'accuracy_b': correct_b / total_b,
            'p_value': p,
            'significant': p < corrected_alpha
        })

    return pd.DataFrame(results, columns=result_columns)

In [13]:
def bootstrap_macro_f1(y_true, y_pred, n_bootstrap=1000):
    """
    Percentile-bootstrap 95% interval for the macro-averaged F1 score.

    Draws ``n_bootstrap`` resamples of the (gold, prediction) pairs with
    replacement, each the same size as the input, using NumPy's global random
    state, and scores every resample with ``f1_score(..., average='macro')``.

    Parameters
    ----------
    y_true : sequence of gold labels.
    y_pred : sequence of predicted labels, aligned position-wise with y_true.
    n_bootstrap : number of bootstrap resamples.

    Returns
    -------
    (lower, upper) : the 2.5th and 97.5th percentiles of the resampled scores.
    """
    n = len(y_true)
    gold = np.array(y_true)
    guess = np.array(y_pred)

    def _one_resample_score():
        # The same index draw is applied to both arrays so pairs stay aligned.
        picks = np.random.choice(n, n, replace=True)
        return f1_score(gold[picks], guess[picks], average='macro')

    scores = [_one_resample_score() for _ in range(n_bootstrap)]

    lower = np.percentile(scores, 2.5)
    upper = np.percentile(scores, 97.5)
    return lower, upper

In [14]:
# Fix the seed of NumPy's global RNG: bootstrap_macro_f1 draws its resamples
# with np.random.choice, so seeding here makes the reported intervals repeatable.
np.random.seed(42)

In [15]:
# Per-model prediction files to evaluate, one CSV per model/configuration.
result_files = [
    'Qwen3_4B_FS_UND.csv',
    'Qwen3_32B_FS_UND.csv',
    'Qwen3_4B_UND_FS_DSPy_CoT_no_theory_examples_wording_A.csv',
    'Qwen3_4B_DSPy_FS_UND.csv',
    'DS14B_FS_UND_correct.csv',
    'DS_R1_API_FS_UND.csv',
    'DS_V3_FS_UND.csv',
    'llama_3_3_70B_FS_UND.csv',
    'Llama_70B_DSPy_CoT_FS_UND.csv',
]

# Subsets to compare and the column names shared by every test below.
subset_order = ['CoCoNot', 'IN3', 'CLAMBER']
column_kwargs = dict(
    subset_col='direct_source',
    gold_col='gold_judgment',
    pred_col='model_pred',
)

for file_name in result_files:
    df = pd.read_csv(file_name)
    print(f'Printing results for {file_name}:')

    # Omnibus test across all three subsets (prints its own summary).
    accuracy_chi_square_test(df, subset_names=subset_order, **column_kwargs)
    print()

    # Pairwise follow-up tests with Bonferroni-corrected significance.
    pairwise_df = pairwise_accuracy_chi_square(df, subset_names=subset_order, **column_kwargs)
    print(pairwise_df)
    print()

    # Bootstrap confidence interval of macro-F1 for each subset.
    for subset in subset_order:
        subset_df = df[df['direct_source'] == subset]
        lower, upper = bootstrap_macro_f1(subset_df['gold_judgment'], subset_df['model_pred'])
        print(f"{subset} Macro-F1 95% CI: [{lower:.4f}, {upper:.4f}]")

    print("-----------------------")

Printing results for Qwen3_4B_FS_UND.csv:
Contingency Table: [[126, 30], [219, 80], [258, 142]]
Chi-square stat = 15.9247, p-value = 0.0003

           comparison  accuracy_a  accuracy_b   p_value  significant
0      CoCoNot vs IN3    0.807692    0.732441  0.096072        False
1  CoCoNot vs CLAMBER    0.807692    0.645000  0.000287         True
2      IN3 vs CLAMBER    0.732441    0.645000  0.017559        False

CoCoNot Macro-F1 95% CI: [0.7048, 0.8430]
IN3 Macro-F1 95% CI: [0.6823, 0.7792]
CLAMBER Macro-F1 95% CI: [0.5944, 0.6874]
-----------------------
Printing results for Qwen3_32B_FS_UND.csv:
Contingency Table: [[127, 29], [215, 84], [254, 146]]
Chi-square stat = 18.1017, p-value = 0.0001

           comparison  accuracy_a  accuracy_b   p_value  significant
0      CoCoNot vs IN3    0.814103    0.719064  0.034612        False
1  CoCoNot vs CLAMBER    0.814103    0.635000  0.000068         True
2      IN3 vs CLAMBER    0.719064    0.635000  0.023890        False

CoCoNot Macro-F1 