# Read data

In [23]:
import os
import json
import pandas as pd

def load_evaluation_data(eval_path="center1"):
    """
    Load gold labels and all model predictions from an evaluation directory.

    The directory must contain ``gold.json``. Every other regular ``*.json``
    file whose name does not contain "gold" is loaded as one set of
    predictions. Each JSON file is read as an object whose keys become
    ``pt_No`` and whose values become the cell contents.

    Non-JSON entries (``.DS_Store``, notebook checkpoint directories, stray
    text files, ...) are ignored instead of being passed to ``json.load``.
    Prediction files are processed in sorted name order so the resulting
    column order does not depend on the order returned by ``os.listdir``.

    Args:
        eval_path (str): Path to the evaluation data directory (center1 or center2)

    Returns:
        pd.DataFrame: One row per entry of gold.json with columns ``pt_No``,
        ``gold`` and one column per prediction file (file name without the
        ``.json`` suffix). Because predictions are left-merged onto the gold
        rows, a patient missing from a prediction file gets NaN in that column.

    Raises:
        FileNotFoundError: If ``eval_path`` or ``gold.json`` does not exist.
    """
    # JSON is decoded explicitly as UTF-8 rather than with the platform default.
    with open(os.path.join(eval_path, "gold.json"), 'r', encoding="utf-8") as f:
        gold = json.load(f)

    df_eval = pd.DataFrame(list(gold.items()), columns=["pt_No", "gold"])

    # Walk every entry of the evaluation directory in a deterministic order.
    for file in sorted(os.listdir(eval_path)):
        file_path = os.path.join(eval_path, file)
        # Skip the gold file(s), anything that is not a .json file, and directories.
        if "gold" in file or not file.endswith(".json") or not os.path.isfile(file_path):
            continue
        print(file)
        with open(file_path, 'r', encoding="utf-8") as f:
            predictions = json.load(f)
        model_name = file[:-len(".json")]
        tmp = pd.DataFrame(list(predictions.items()), columns=["pt_No", model_name])
        # Left merge keeps exactly the gold rows, in gold order.
        df_eval = pd.merge(df_eval, tmp, on="pt_No", how="left")

    return df_eval



# Evaluation

In [10]:
def evaluate_micro(golds, preds):
    """
    Micro-averaged Precision, Recall and F1.

    ``golds`` and ``preds`` are iterated in parallel with ``zip``; each element
    is converted to a ``set`` of labels for that sample. True positives, false
    positives and false negatives are pooled over all samples before the
    metrics are computed.

    Returns:
        tuple: (precision, recall, f1). Any metric whose denominator is zero
        is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # A zero denominator yields 0.0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    hits = spurious = missed = 0
    for expected, predicted in zip(golds, preds):
        expected, predicted = set(expected), set(predicted)
        hits += len(expected & predicted)       # true positives
        spurious += len(predicted - expected)   # false positives
        missed += len(expected - predicted)     # false negatives

    precision = _ratio(hits, hits + spurious)
    recall = _ratio(hits, hits + missed)
    f1 = _ratio(2 * precision * recall, precision + recall)
    return precision, recall, f1

In [11]:
def evaluate_macro_sample(golds, preds):
    """
    Sample-level macro average of Precision, Recall and F1.

    P, R and F1 are computed independently for every (gold, pred) pair, each
    element being converted to a ``set`` of labels, and the per-sample values
    are then averaged with equal weight per sample.

    Returns:
        tuple: (macro_p, macro_r, macro_f1). A per-sample metric with a zero
        denominator counts as 0.0; with no samples all three values are 0.0.
    """
    def _ratio(numerator, denominator):
        # A zero denominator yields 0.0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    per_sample = []
    for expected, predicted in zip(golds, preds):
        expected, predicted = set(expected), set(predicted)
        hits = len(expected & predicted)
        # tp + fp is the size of the predicted set; tp + fn the size of the gold set.
        precision = _ratio(hits, len(predicted))
        recall = _ratio(hits, len(expected))
        f1 = _ratio(2 * precision * recall, precision + recall)
        per_sample.append((precision, recall, f1))

    if not per_sample:
        return 0.0, 0.0, 0.0

    # Average each metric across all samples.
    n_samples = len(per_sample)
    precisions, recalls, f1_scores = zip(*per_sample)
    return (
        sum(precisions) / n_samples,
        sum(recalls) / n_samples,
        sum(f1_scores) / n_samples,
    )

In [12]:
def evaluate_df(df, micro="micro", save=True, output_file="results/for_plot/test.csv"):
    """
    Score every prediction column of ``df`` against its ``gold`` column.

    Every column other than ``gold`` and ``pt_No`` is treated as one model's
    predictions. Columns are processed in sorted name order and one summary
    line per column is printed.

    Args:
        df (pd.DataFrame): Must contain a ``gold`` column; each cell of ``gold``
            and of the prediction columns is passed to the metric function as
            one sample's labels.
        micro (str): ``"micro"`` uses ``evaluate_micro``; any other value uses
            ``evaluate_macro_sample``.
        save (bool): If True, write the result table to ``output_file`` as CSV.
        output_file (str): CSV destination; its parent directory is created
            when it does not exist yet. Ignored when ``save`` is False.

    Returns:
        pd.DataFrame: Columns ``Models``, ``Precision``, ``Recall``,
        ``F1_score``; one row per successfully evaluated column. The columns
        are present even when no column could be evaluated.
    """
    golds = df["gold"]
    results = []  # one dict per evaluated prediction column

    for col in sorted(df.columns):  # alphabetical order
        if col in ("gold", "pt_No"):
            continue
        try:
            if micro == "micro":
                precision, recall, f1 = evaluate_micro(golds, df[col])
            else:
                precision, recall, f1 = evaluate_macro_sample(golds, df[col])
        except Exception as e:
            # Best effort: report the failing column (e.g. a cell that is not
            # iterable) and carry on with the remaining columns.
            print(col)
            print(e)
            continue

        print(f"{col} --- Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
        results.append({
            "Models": col,
            "Precision": precision,
            "Recall": recall,
            "F1_score": f1
        })

    # Fixed columns keep the frame (and the CSV header) well-formed when empty.
    results_df = pd.DataFrame(results, columns=["Models", "Precision", "Recall", "F1_score"])

    if save:
        # to_csv does not create missing directories, so do it here.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        results_df.to_csv(output_file, index=False)
        print(f"\nEvaluation results saved to {output_file}")
    return results_df

In [26]:
# ---- center1: all patients, micro then macro averaging ----
df_eval = load_evaluation_data(eval_path="center1")
for averaging, output_file in [
    ("micro", "evaluation_dataframes/evaluation_micro_center1.csv"),
    ("macro", "evaluation_dataframes/evaluation_macro_center1.csv"),
]:
    evaluate_df(df_eval, averaging, save=True, output_file=output_file)

# ---- center1: restrict to the train / validation split ----
with open("../data_source/center1/train_val.json", "r") as f:
    train_val = json.load(f)

# IDs from the split file are cast to str before being matched against pt_No.
train_ids = [str(i) for i in train_val["train"]]
df_eval_train = df_eval[df_eval["pt_No"].isin(train_ids)]
val_ids = [str(i) for i in train_val["val"]]
df_eval_val = df_eval[df_eval["pt_No"].isin(val_ids)]

# Validation set only, micro then macro averaging.
for averaging, output_file in [
    ("micro", "evaluation_dataframes/evaluation_micro_val_center1.csv"),
    ("macro", "evaluation_dataframes/evaluation_macro_val_center1.csv"),
]:
    evaluate_df(df_eval_val, averaging, save=True, output_file=output_file)

# ---- center2: all patients ----
df_eval = load_evaluation_data(eval_path="center2")

output_file = "evaluation_dataframes/evaluation_micro_center2.csv"
evaluate_df(df_eval, "micro", save=True, output_file=output_file)

# Left as the final bare expression so the macro table is displayed as the cell output.
output_file = "evaluation_dataframes/evaluation_macro_center2.csv"
evaluate_df(df_eval, "macro", save=True, output_file=output_file)



qwen3_4B_targeted_sft_4.json
qwen3_14B_targeted_sft_2.json
ds_v3_comprehensive_modified.json
qwen3_4B_comprehensive_3.json
grok3_comprehensive.json
qwen3_8B_targeted_1.json
gemma327B_comprehensive.json
ds_r1_comprehensive_modified.json
qwen3_8B_comprehensive_5.json
claude3_7_reasoning_comprehensive_nothink.json
qwen3_4B_targeted_2.json
qwen3_32B_targeted_4.json
qwen3_32B_targeted_5.json
ds_r1_targeted.json
qwen3_4B_targeted_3.json
qwen3_8B_comprehensive_4.json
qwen3_32B_targeted_sft_1.json
qwq32B_comprehensive_modified_sft_1.json
grok3_mini_comprehensive.json
qwen3_4B_comprehensive_2.json
qwen3_4B_targeted_sft_5.json
qwen3_14B_targeted_sft_3.json
ds_v3_comprehensive_nothink.json
qwen3reasoning_235_comprehensive.json
gemini_2_5_flash_comprehensive_modified.json
gemini_2_5_pro_comprehensive_modified.json
ds_v3_comprehensive.json
gemini_2_5_pro_comprehensive.json
qwen3_8B_comprehensive_3.json
qwen3_8B_targeted_sft_1.json
qwen3_32B_comprehensive_1.json
qwen3_4B_targeted_4.json
qwen3_32B_ta

Unnamed: 0,Models,Precision,Recall,F1_score
0,human,0.783654,0.696154,0.722253
1,qwen3_14B_comprehensive_1,0.389423,0.384936,0.375962
2,qwen3_14B_comprehensive_2,0.394551,0.391346,0.389835
3,qwen3_14B_comprehensive_3,0.366209,0.368269,0.358761
4,qwen3_14B_comprehensive_4,0.287179,0.300000,0.286951
...,...,...,...,...
72,qwq32B_comprehensive_modified_sft_1,0.847756,0.786538,0.802473
73,qwq32B_comprehensive_modified_sft_2,0.872115,0.814423,0.831777
74,qwq32B_comprehensive_modified_sft_3,0.862179,0.792308,0.816484
75,qwq32B_comprehensive_modified_sft_4,0.823718,0.750321,0.775229


In [27]:
# Strict evaluation
# ---- center1: validation set only ----
df_eval = load_evaluation_data(eval_path="strict/center1")

# Train/validation split information
with open("../data_source/center1/train_val.json", "r") as f:
    train_val = json.load(f)

# IDs from the split file are cast to str before being matched against pt_No.
train_ids = [str(i) for i in train_val["train"]]
df_eval_train = df_eval[df_eval["pt_No"].isin(train_ids)]
val_ids = [str(i) for i in train_val["val"]]
df_eval_val = df_eval[df_eval["pt_No"].isin(val_ids)]

for averaging, strict_output_file in [
    ("micro", "evaluation_dataframes/evaluation_micro_val_center1_strict.csv"),
    ("macro", "evaluation_dataframes/evaluation_macro_val_center1_strict.csv"),
]:
    evaluate_df(df_eval_val, averaging, save=True, output_file=strict_output_file)

# ---- center2: all patients ----
df_eval = load_evaluation_data(eval_path="strict/center2")
evaluate_df(df_eval, "micro", save=True, output_file="evaluation_dataframes/evaluation_micro_center2_strict.csv")
# Left as the final bare expression so the macro table is displayed as the cell output.
evaluate_df(df_eval, "macro", save=True, output_file="evaluation_dataframes/evaluation_macro_center2_strict.csv")

gemini_2_5_pro_comprehensive_modified_strict.json
ds_r1_comprehensive_modified_strict.json
qwen3_4B_targeted_sft_1_strict.json
human.json
qwq32B_comprehensive_modified_sft_1_strict.json
qwen3_8B_targeted_sft_1_strict.json
human_strict.json
qwen3_32B_targeted_sft_1_strict.json
gemini_2_5_flash_comprehensive_modified_strict.json
qwen3_14B_targeted_sft_1_strict.json
o1_comprehensive_modified_strict.json
ds_r1_comprehensive_modified_strict --- Precision: 0.5942, Recall: 0.5541, F1: 0.5734
gemini_2_5_flash_comprehensive_modified_strict --- Precision: 0.6267, Recall: 0.6351, F1: 0.6309
gemini_2_5_pro_comprehensive_modified_strict --- Precision: 0.3333, Recall: 0.4459, F1: 0.3815
human --- Precision: 0.4068, Recall: 0.3243, F1: 0.3609
human_strict --- Precision: 0.4068, Recall: 0.3243, F1: 0.3609
o1_comprehensive_modified_strict --- Precision: 0.6207, Recall: 0.4865, F1: 0.5455
qwen3_14B_targeted_sft_1_strict --- Precision: 0.5000, Recall: 0.4595, F1: 0.4789
qwen3_32B_targeted_sft_1_strict --

Unnamed: 0,Models,Precision,Recall,F1_score
0,ds_r1_comprehensive_strict,0.719551,0.716667,0.713828
1,gemini_2_5_flash_comprehensive_strict,0.794872,0.789744,0.786538
2,gemini_2_5_pro_comprehensive_strict,0.678526,0.726282,0.696479
3,human,0.519231,0.519231,0.519231
4,human_strict,0.764423,0.676923,0.703022
5,o1_comprehensive_strict,0.68109,0.631731,0.646154
6,qwen3_14B_targeted_sft_1_strict,0.719551,0.689744,0.698214
7,qwen3_32B_targeted_sft_1_strict,0.764423,0.759295,0.750717
8,qwen3_4B_targeted_sft_1_strict,0.689744,0.675,0.677534
9,qwen3_8B_targeted_sft_1_strict,0.578846,0.575641,0.576923


## Sensitivity analysis

In [None]:
# Load the prompts so they can be separated into four quartiles.
# Note: prompts including patients' data are not publicly available; this path is an example.
import json
prompt_path = "prompts/center1/comprehensive_prompts.json" # not public

# The prompts contain Chinese text, so decode the JSON explicitly as UTF-8
# instead of relying on the platform default encoding.
with open(prompt_path, "r", encoding="utf-8") as f:
    prompts = json.load(f)

{'31214485': '你是一名资深外科医生，你的任务是根据患者的资料，判断患者术后出现哪些并发症，并发症的诊断依据:\n\n1. 急性肾损伤\n- 定义: 术后七天内，符合 KDIGO 标准：\n  - I 级: 肌酐 1.5–1.9 倍基线或尿量减少 6–12 小时\n  - II 级: 肌酐 2–2.9 倍基线或尿量减少 >12 小时\n  - III 级: 肌酐 ≥3 倍基线或需要肾替代治疗\n2. 急性呼吸窘迫综合征 (ARDS)\n- 在已知临床损伤或新发/加重呼吸症状后一周内出现双侧浸润，不能完全用积液、肺叶/肺塌陷或结节解释\n- 呼吸衰竭不能完全用心脏衰竭或液体超负荷解释。若无风险因素需通过客观评估（如超声心动图）排除肺水肿\n- 氧合水平：\n  - 轻度：PaO2/FiO2 26.7-40.0 kPa（200-300 mmHg），\n  - 中度：PaO2/FiO2 13.3-26.6 kPa（100-200 mmHg）\n  - 重度：PaO2/FiO2 ≤13.3 kPa（100 mmHg）\n3. 吻合口破裂\n   - 定义: 手术连接处内容物流出，包括胃肠道、胆道、胰管等，可导致发热、脓肿或器官衰竭\n   - 分级:\n     - 轻度: 无症状，仅影像学发现\n     - 中度: 需要治疗，无永久伤害\n     - 重度: 需手术干预或导致器官功能受损\n4. 心律失常\n   - ECG 证实的心律异常\n   - 分级：常规分级\n5. 心脏骤停\n   - 心跳机械活动停止，无循环体征\n   - 诊断依据：ECG 或临床检查\n   - 无分级\n6. 心源性肺水肿\n   - 心功能障碍导致的肺泡液体积聚\n   - 分级：常规分级\n7. 深静脉血栓\n   - 超声、静脉造影或CT等影像学检查发现的静脉系统中新形成的血栓\n   - 分级：常规分级\n8. 谵妄\n   - 每满足以下标准之一，患者得一分：\n     - 注意力不集中\n     - 定向障碍\n     - 幻觉-妄想-精神病\n     - 精神运动性激动或迟缓\n     - 不当言语或情绪\n     - 睡眠/觉醒周期紊乱或症状波动\n   - 当得分达到 4 分时可诊断为谵妄\n   - 无分级\n9. 胃肠道出血\n   -

In [None]:
# Use Qwen2.5-7B tokenizer to divide prompts into 4 quarters based on token count.
# Relies on `prompts` (dict: pt_No -> prompt text) loaded earlier in the notebook.
from transformers import AutoTokenizer
import numpy as np
import pandas as pd

# Load Qwen2.5-7B tokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

# Calculate token count for each prompt
token_counts = {}
for pt_id, prompt in prompts.items():
    tokens = tokenizer.encode(prompt)
    token_counts[pt_id] = len(tokens)

# Convert token counts to DataFrame for analysis
token_df = pd.DataFrame(list(token_counts.items()), columns=['pt_No', 'token_count'])

# Calculate quartiles (25th, 50th and 75th percentile of the token counts)
q1, q2, q3 = np.percentile(token_df['token_count'], [25, 50, 75])

# Divide prompts into 4 quarters; the bin edges are 0, the three quartiles and +inf
token_df['quarter'] = pd.cut(
    token_df['token_count'], 
    bins=[0, q1, q2, q3, float('inf')], 
    labels=['Q1', 'Q2', 'Q3', 'Q4']
)

# Display statistics for each quarter.
# 'quarter' is categorical: pass observed=False explicitly to keep the current
# grouping behaviour and avoid the pandas FutureWarning about the changing default.
token_df.groupby('quarter', observed=False).agg({
    'token_count': ['min', 'max', 'mean', 'count']
})


  from .autonotebook import tqdm as notebook_tqdm
  token_df.groupby('quarter').agg({


Unnamed: 0_level_0,token_count,token_count,token_count,token_count
Unnamed: 0_level_1,min,max,mean,count
quarter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Q1,3751,5036,4564.945946,37
Q2,5053,5884,5454.944444,36
Q3,5907,7591,6694.5,36
Q4,7666,14606,10610.486486,37


In [None]:
# Sensitivity analysis: micro-averaged metrics per token-count quarter.
# Relies on token_df (pt_No, token_count, quarter) built in the previous cell and on
# load_evaluation_data / evaluate_df defined earlier in the notebook.

# pt_No -> quarter mapping
pt_to_quarter = dict(zip(token_df['pt_No'], token_df['quarter']))

# Load evaluation results data
df_eval = load_evaluation_data(eval_path="center1")

# Keep only the gold labels and the two prediction columns being compared
# (ds_r1_comprehensive and human)
df_eval_sensitivity = df_eval[['pt_No', 'gold', 'ds_r1_comprehensive', 'human']]

# Median token count for each quarter.
# observed=False is passed explicitly because 'quarter' is categorical; it keeps
# the current behaviour and avoids the pandas FutureWarning about the default.
quarter_median_counts = token_df.groupby('quarter', observed=False)['token_count'].median()

# Collect one result frame per quarter and concatenate once at the end,
# instead of growing a DataFrame inside the loop.
quarter_results = []
for q in token_df['quarter'].unique():
    print(q)
    pts_q = token_df[token_df['quarter'] == q]['pt_No'].tolist()
    df_eval_sensitivity_q = df_eval_sensitivity[df_eval_sensitivity['pt_No'].isin(pts_q)]
    # save=False: nothing is written here, so output_file is not used by evaluate_df.
    df_eval_results = evaluate_df(df_eval_sensitivity_q, "micro", save=False, output_file=f"results/for_plot/evaluation_sensitivity_{q}.csv")
    df_eval_results["quarter"] = q
    df_eval_results["median_token_count"] = quarter_median_counts[q]
    quarter_results.append(df_eval_results)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_sensitivity_analysis = pd.concat(quarter_results, ignore_index=True) if quarter_results else pd.DataFrame()

df_sensitivity_analysis.to_csv("evaluation_dataframes/sensitivity_queters.csv", index=False)


  quarter_median_counts = token_df.groupby('quarter')['token_count'].median()


Q4
ds_r1_hard_original --- Precision: 0.7674, Recall: 0.7253, F1: 0.7458
human --- Precision: 0.7077, Recall: 0.5055, F1: 0.5897
Q3
ds_r1_hard_original --- Precision: 0.6724, Recall: 0.7091, F1: 0.6903
human --- Precision: 0.5116, Recall: 0.4000, F1: 0.4490
Q2
ds_r1_hard_original --- Precision: 0.6346, Recall: 0.6471, F1: 0.6408
human --- Precision: 0.6190, Recall: 0.5098, F1: 0.5591
Q1
ds_r1_hard_original --- Precision: 0.6809, Recall: 0.8205, F1: 0.7442
human --- Precision: 0.7368, Recall: 0.7179, F1: 0.7273
