In [1]:
import pandas as pd
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import numpy as np
from scipy.stats import ttest_rel
from sklearn.metrics import f1_score
from tqdm import tqdm
import seaborn as sns

In [2]:
def merge_and_evaluate_correctness(df_incontext, df_zero, pred_col_theory, pred_col_no_theory, suffix_incontext='_theory', suffix_zero='_no_theory'):
    """Pair two prediction tables on ``request`` and compare their correctness.

    The two frames are inner-joined on the ``request`` column. Each prediction
    column is scored against the gold label (1 = match, 0 = mismatch) and a
    paired t-test (``scipy.stats.ttest_rel``) is run on the two 0/1 columns
    for all rows, for rows whose gold label is ``'underspecified'`` and for
    rows whose gold label is ``'fully specified'``. Results are printed.

    Parameters
    ----------
    df_incontext, df_zero : pandas.DataFrame
        Prediction tables. Both must contain a ``request`` column.
    pred_col_theory, pred_col_no_theory : str
        Names of the prediction columns *in the merged frame*, i.e. including
        the suffix pandas appended to overlapping column names.
    suffix_incontext, suffix_zero : str
        Suffixes applied to overlapping columns of ``df_incontext`` and
        ``df_zero`` respectively.

    Returns
    -------
    (merged, theory_better, no_theory_better)
        ``merged`` is the joined frame with ``correct_theory`` and
        ``correct_no_theory`` added; the other two are its rows where exactly
        one of the two predictions is correct.

    Raises
    ------
    KeyError
        If a prediction column or any gold-label column is missing after the
        merge.
    """
    import warnings  # local import so the cell does not depend on the import cell

    merged = pd.merge(
        df_incontext,
        df_zero,
        on="request",
        suffixes=(suffix_incontext, suffix_zero),
    )

    # A repeated key turns the join into many-to-many, so one request would be
    # counted several times in the "paired" sample. Warn instead of failing so
    # existing callers keep working.
    if merged["request"].duplicated().any():
        warnings.warn(
            "Duplicate 'request' values after the merge: some requests are "
            "counted more than once in the paired tests.",
            stacklevel=2,
        )

    # pandas only appends a suffix to columns present in BOTH frames. If just
    # one frame carries `gold_judgment`, it survives the merge unsuffixed.
    gold_col = "gold_judgment" + suffix_incontext
    if gold_col not in merged.columns and "gold_judgment" in merged.columns:
        gold_col = "gold_judgment"
    gold = merged[gold_col]

    merged["correct_theory"] = (merged[pred_col_theory] == gold).astype(int)
    merged["correct_no_theory"] = (merged[pred_col_no_theory] == gold).astype(int)

    # NOTE(review): the outcomes are binary; a test designed for paired binary
    # data (e.g. McNemar) may be a better fit than a t-test — confirm.
    subsets = (
        ("Overall", merged),
        ("UND", merged[gold == 'underspecified']),
        ("FS", merged[gold == 'fully specified']),
    )
    # Run every test before printing so a failure leaves no partial report.
    results = [
        (label, ttest_rel(part["correct_theory"], part["correct_no_theory"]))
        for label, part in subsets
    ]

    # Results
    for label, (t_stat, p_value) in results:
        print(f"{label} paired t-test:")
        print(f"t-statistic = {t_stat:.4f}")
        print(f"p-value     = {p_value:.4f}")

    # Rows on which exactly one of the two predictions is correct.
    theory_better = merged[(merged["correct_theory"] == 1) & (merged["correct_no_theory"] == 0)]
    no_theory_better = merged[(merged["correct_theory"] == 0) & (merged["correct_no_theory"] == 1)]

    print(f"\nNumber of cases where theory is correct and no_theory is not: {len(theory_better)}")
    print(f"Number of cases where no_theory is correct and theory is not: {len(no_theory_better)}")

    return merged, theory_better, no_theory_better

In [10]:
# Load every prediction table used by the comparison cells below.
# NOTE(review): this cell's execution count is higher than that of the cells below which
# consume these frames, so the notebook was run out of order — confirm it survives
# Restart Kernel -> Run All.
# NOTE(review): this frame is not referenced by any visible cell — confirm it is still needed.
Qwen3_4B_UND_FS_DSPy_CoT_instructions_examples_wording_A  = pd.read_csv('../DSPy_incontext_onehop.csv')
# Frames used as the second ("no theory") argument of the comparisons below.
Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B = pd.read_csv('../clean_exp_1/Qwen3_4B_FS_UND.csv')
Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B = pd.read_csv('../clean_exp_1/Qwen3_4B_DSPy_FS_UND.csv')
Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A = pd.read_csv('../clean_exp_1/Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A.csv')

# Qwen3-4B_UND/FS_DSPy_CoT_instructions_examples_wording_A
# presumably a re-run of that configuration; verify against the files' provenance.
repro_test_1 =  pd.read_csv('../Reproductivity/test_1_complete.csv')
# NOTE(review): `repro_test_1_o` is not referenced by any visible cell.
repro_test_1_o =  pd.read_csv('../Reproductivity/test_1.csv')

# Frames used as the first ("theory") argument of the later comparison sections.
# These two paths are relative to the notebook's own directory, unlike the '../' paths above.
Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A = pd.read_csv('Qwen3_4B_UND_FS_DSPy_CoT_definitions_examples_wording_A.csv')
Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B = pd.read_csv('Qwen3_4B_UND_FS_DSPy_CoT_definitions_examples_wording_B.csv')

# Qwen3-4B DSPy CoT 

In [7]:
# Keep only the rows whose `direct_source` is not 'AmbigQA'.
repro_test_1_without = repro_test_1.loc[repro_test_1['direct_source'].ne('AmbigQA')]

# Same filter for the three frames used as the second argument of the comparisons.
Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B_without = Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B.loc[
    Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B['direct_source'].ne('AmbigQA')
]
Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B_without = Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B.loc[
    Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B['direct_source'].ne('AmbigQA')
]
Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A_without = Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A.loc[
    Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A['direct_source'].ne('AmbigQA')
]

# Filtered reproduction run vs. the filtered NL / wording-B frame.
df_merge_nl, df_theory_better_nl, df_no_theory_better_nl = merge_and_evaluate_correctness(
    df_incontext=repro_test_1_without,
    df_zero=Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 1.8012
p-value     = 0.0720
UND paired t-test:
t-statistic = 0.1138
p-value     = 0.9094
FS paired t-test:
t-statistic = 2.2999
p-value     = 0.0219

Number of cases where theory is correct and no_theory is not: 101
Number of cases where no_theory is correct and theory is not: 77


In [8]:
# Filtered reproduction run vs. the filtered DSPy CoT / wording-B frame.
df_merge_dspy_b, df_theory_better_dspy_b, df_no_theory_better_dspy_b = merge_and_evaluate_correctness(
    df_incontext=repro_test_1_without,
    df_zero=Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 2.0217
p-value     = 0.0435
UND paired t-test:
t-statistic = 4.6949
p-value     = 0.0000
FS paired t-test:
t-statistic = -1.5414
p-value     = 0.1239

Number of cases where theory is correct and no_theory is not: 103
Number of cases where no_theory is correct and theory is not: 76


In [9]:
# Filtered reproduction run vs. the filtered DSPy CoT / wording-A frame.
df_merge_dspy_a, df_theory_better_dspy_a, df_no_theory_better_dspy_a = merge_and_evaluate_correctness(
    df_incontext=repro_test_1_without,
    df_zero=Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 2.2932
p-value     = 0.0221
UND paired t-test:
t-statistic = 4.4857
p-value     = 0.0000
FS paired t-test:
t-statistic = -0.9331
p-value     = 0.3513

Number of cases where theory is correct and no_theory is not: 101
Number of cases where no_theory is correct and theory is not: 71


In [17]:
# Persist the disagreement rows of the comparison against the NL frame.
df_theory_better_nl.to_csv('df_theory_better_dspy_vs_nl.csv')
df_no_theory_better_nl.to_csv('df_no_theory_better_dspy_vs_nl.csv')

# The DSPy comparisons are bound to the `_dspy_b` / `_dspy_a` names; the unsuffixed
# `df_theory_better_dspy` / `df_no_theory_better_dspy` are never assigned in this notebook,
# so the previous version of this cell raised NameError on a fresh kernel.
# NOTE(review): both variants are exported under explicit file names because it is not
# clear which one the old 'df_*_better_dspy_vs_dspy.csv' files were meant to hold — confirm.
df_theory_better_dspy_b.to_csv('df_theory_better_dspy_vs_dspy_b.csv')
df_no_theory_better_dspy_b.to_csv('df_no_theory_better_dspy_vs_dspy_b.csv')

df_theory_better_dspy_a.to_csv('df_theory_better_dspy_vs_dspy_a.csv')
df_no_theory_better_dspy_a.to_csv('df_no_theory_better_dspy_vs_dspy_a.csv')

# Qwen3-4B_UND/FS_DSPy_CoT_definitions_examples_wording_A

In [11]:
# Keep only the rows whose `direct_source` is not 'AmbigQA'.
Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A_without = Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A.loc[
    Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A['direct_source'].ne('AmbigQA')
]

In [12]:
# Filtered definitions+examples / wording-A frame vs. the filtered NL / wording-B frame.
df_merge_nl, df_theory_better_nl, df_no_theory_better_nl = merge_and_evaluate_correctness(
    df_incontext=Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A_without,
    df_zero=Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 1.6487
p-value     = 0.0996
UND paired t-test:
t-statistic = -0.5693
p-value     = 0.5694
FS paired t-test:
t-statistic = 2.5947
p-value     = 0.0098

Number of cases where theory is correct and no_theory is not: 109
Number of cases where no_theory is correct and theory is not: 86


In [13]:
# Filtered definitions+examples / wording-A frame vs. the filtered DSPy CoT / wording-B frame.
df_merge_dspy_b, df_theory_better_dspy_b, df_no_theory_better_dspy_b = merge_and_evaluate_correctness(
    df_incontext=Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A_without,
    df_zero=Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 1.9744
p-value     = 0.0487
UND paired t-test:
t-statistic = 4.0992
p-value     = 0.0000
FS paired t-test:
t-statistic = -1.0315
p-value     = 0.3029

Number of cases where theory is correct and no_theory is not: 100
Number of cases where no_theory is correct and theory is not: 74


In [14]:
# Filtered definitions+examples / wording-A frame vs. the filtered DSPy CoT / wording-A frame.
df_merge_dspy_a, df_theory_better_dspy_a, df_no_theory_better_dspy_a = merge_and_evaluate_correctness(
    df_incontext=Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_A_without,
    df_zero=Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 2.1971
p-value     = 0.0283
UND paired t-test:
t-statistic = 3.7218
p-value     = 0.0002
FS paired t-test:
t-statistic = -0.4122
p-value     = 0.6804

Number of cases where theory is correct and no_theory is not: 102
Number of cases where no_theory is correct and theory is not: 73


# Qwen3-4B_UND/FS_DSPy_CoT_definitions_examples_wording_B

In [15]:
# Keep only the rows whose `direct_source` is not 'AmbigQA'.
Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B_without = Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B.loc[
    Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B['direct_source'].ne('AmbigQA')
]

In [16]:
# Filtered definitions+examples / wording-B frame vs. the filtered NL / wording-B frame.
df_merge_nl, df_theory_better_nl, df_no_theory_better_nl = merge_and_evaluate_correctness(
    df_incontext=Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B_without,
    df_zero=Qwen3_4B_UND_FS_NL_no_theory_no_example_wording_B_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 1.7912
p-value     = 0.0736
UND paired t-test:
t-statistic = -0.1124
p-value     = 0.9106
FS paired t-test:
t-statistic = 2.5027
p-value     = 0.0127

Number of cases where theory is correct and no_theory is not: 102
Number of cases where no_theory is correct and theory is not: 78


In [17]:
# Filtered definitions+examples / wording-B frame vs. the filtered DSPy CoT / wording-B frame.
# The first target must be `df_merge_dspy_b` (not `f_merge_dspy_b`); otherwise
# `df_merge_dspy_b` keeps the merged frame from the previous section's comparison.
df_merge_dspy_b, df_theory_better_dspy_b, df_no_theory_better_dspy_b = merge_and_evaluate_correctness(
    Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B_without,
    Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_B_without,
    'model_pred_theory',
    'model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 1.9267
p-value     = 0.0543
UND paired t-test:
t-statistic = 4.2061
p-value     = 0.0000
FS paired t-test:
t-statistic = -1.2819
p-value     = 0.2006

Number of cases where theory is correct and no_theory is not: 112
Number of cases where no_theory is correct and theory is not: 85


In [18]:
# Filtered definitions+examples / wording-B frame vs. the filtered DSPy CoT / wording-A frame.
df_merge_dspy_a, df_theory_better_dspy_a, df_no_theory_better_dspy_a = merge_and_evaluate_correctness(
    df_incontext=Qwen3_4B_UND_FS_DSPy_definitions_examples_wording_B_without,
    df_zero=Qwen3_4B_UND_FS_DSPy_CoT_no_theory_no_example_wording_A_without,
    pred_col_theory='model_pred_theory',
    pred_col_no_theory='model_pred_no_theory',
)

Overall paired t-test:
t-statistic = 2.2540
p-value     = 0.0245
UND paired t-test:
t-statistic = 3.9447
p-value     = 0.0001
FS paired t-test:
t-statistic = -0.7501
p-value     = 0.4536

Number of cases where theory is correct and no_theory is not: 104
Number of cases where no_theory is correct and theory is not: 74


# Result comparisons

In [5]:
def find_model_pred_differences(df1, df2, key_column="request"):
    """Return the rows on which the two frames' ``model_pred`` values differ.

    The frames are joined on ``key_column``; overlapping columns get the
    suffixes ``_df1`` / ``_df2``. The result holds the key column, the gold
    judgment taken from ``df1`` and both predictions.
    """
    # Join the two frames on the shared key.
    joined = pd.merge(df1, df2, on=key_column, suffixes=("_df1", "_df2"))

    # Boolean mask of the rows whose predictions disagree.
    disagrees = joined["model_pred_df1"].ne(joined["model_pred_df2"])

    wanted = [key_column, 'gold_judgment_df1', "model_pred_df1", "model_pred_df2"]
    return joined.loc[disagrees, wanted]