In [1]:
import pandas as pd

# ==== Helpers ====

def ensure_pair_id(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` with a ``pair_id`` column.

    A frame that already has ``pair_id`` is returned unchanged. Otherwise the
    current index is moved into a new ``pair_id`` column via ``reset_index``.
    """
    if "pair_id" in frame.columns:
        return frame
    return frame.reset_index().rename(columns={"index": "pair_id"})


def coalesce_rest_scores(
    frame: pd.DataFrame,
    columns=("score_gpt4omini", "score_clinicalbert"),
    suffix: str = "_rest",
) -> pd.DataFrame:
    """Fill gaps in each score column from its ``<column><suffix>`` twin.

    Values already present in the main column win; only missing entries are
    taken from the suffixed column. The suffixed columns are left in place.
    Returns a new frame; the input is not modified.
    """
    filled = frame.copy()
    for column in columns:
        twin = f"{column}{suffix}"
        if twin in filled.columns:
            filled[column] = filled[column].fillna(filled[twin])
    return filled


def merge_scores(
    synthetic: pd.DataFrame,
    subset: pd.DataFrame,
    restset: pd.DataFrame,
    deepseek: pd.DataFrame,
) -> pd.DataFrame:
    """Left-join every score table onto ``synthetic`` by ``pair_id``.

    ``subset`` and ``restset`` both carry ``score_gpt4omini`` and
    ``score_clinicalbert``. The rest-file values arrive in ``*_rest`` columns
    and are then folded into the main columns, so a pair scored in either
    file ends up with a value in ``score_gpt4omini`` / ``score_clinicalbert``.
    The ``*_rest`` columns are kept so the exported schema only gains
    information.
    """
    merged = synthetic.merge(
        subset[["pair_id", "score_gpt4omini", "score_gemini", "score_clinicalbert"]],
        on="pair_id",
        how="left",
    )
    merged = merged.merge(
        restset[["pair_id", "score_gpt4omini", "score_clinicalbert"]],
        on="pair_id",
        how="left",
        suffixes=("", "_rest"),
    )
    # Without this step the main score columns stay NaN for every pair that
    # was scored only in the rest file.
    merged = coalesce_rest_scores(merged)
    return merged.merge(
        deepseek[["pair_id", "score_deepseek"]],
        on="pair_id",
        how="left",
    )


def add_derived_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with the clinical-change flag and length features.

    Adds ``is_clinical_change`` (1 when ``Transform_Type`` is
    "Clinically Significant" or "Hallucinated", else 0), whitespace-token
    counts ``len_orig`` / ``len_syn``, and ``len_ratio`` = len_syn / len_orig
    with the denominator floored at 1.
    """
    derived = frame.copy()

    # a) Whether the transform changes the clinical meaning.
    derived["is_clinical_change"] = derived["Transform_Type"].isin(
        ["Clinically Significant", "Hallucinated"]
    ).astype(int)

    # b) Length features. Missing text is replaced by "" first so it counts
    #    as 0 words instead of being stringified to the one-word text "nan".
    derived["len_orig"] = derived["Original"].fillna("").astype(str).str.split().str.len()
    derived["len_syn"] = derived["Synthetic"].fillna("").astype(str).str.split().str.len()
    # Floor the denominator at 1 to avoid dividing by zero on empty originals.
    derived["len_ratio"] = derived["len_syn"] / derived["len_orig"].clip(lower=1)
    return derived


# The guard is true when this runs as a notebook cell or a script, and the
# statements below stay at module level so their variables remain globals.
if __name__ == "__main__":
    # ==== 1. Load the input files ====
    synthetic = pd.read_csv("synthetic_reports.csv")  # contains Original, Synthetic, Transform_Type, etc.
    subset = pd.read_csv("subset_scored.csv")         # has GPT4o-mini, Gemini, ClinicalBERT
    restset = pd.read_csv("rest_scored.csv")          # has GPT4o-mini, ClinicalBERT
    deepseek = pd.read_csv("deepseek_scored.csv")     # has DeepSeek-specific scores

    # Make sure every frame has pair_id; fall back to the row index if not.
    # NOTE(review): the fallback numbers rows 0..n-1 per file, which only
    # matches synthetic's ids if that file has the same rows in the same
    # order -- confirm for the subset/rest files.
    synthetic = ensure_pair_id(synthetic)
    subset = ensure_pair_id(subset)
    restset = ensure_pair_id(restset)
    deepseek = ensure_pair_id(deepseek)

    # ==== 2. Standardize the score column names ====
    # score_gemini is selected from subset under that exact name at merge
    # time, so it needs no rename entry.
    subset = subset.rename(columns={
        "Score_LLM": "score_gpt4omini",
        "Score_Embed": "score_clinicalbert",
    })

    restset = restset.rename(columns={
        "Score_LLM": "score_gpt4omini",
        "Score_Embed": "score_clinicalbert",
    })

    deepseek = deepseek.rename(columns={
        "Score_LLM": "score_deepseek",
    })

    # ==== 3. Merge (left joins on pair_id) ====
    eval_formal = merge_scores(synthetic, subset, restset, deepseek)

    # ==== 4. Derived columns ====
    eval_formal = add_derived_columns(eval_formal)

    # ==== 5. Export ====
    eval_formal.to_csv("eval_formal.csv", index=False)

    print("合并完成 ✅ eval_formal.csv 已保存")
    print(eval_formal.head())


合并完成 ✅ eval_formal.csv 已保存
   pair_id                                           Original  \
0        0  A right chest tube remains in place, a small r...   
1        1  A right chest tube remains in place, a small r...   
2        2  A right chest tube remains in place, a small r...   
3        3  A right chest tube remains in place, a small r...   
4        4  A right chest tube remains in place, a small r...   

                                           Synthetic          Transform_Type  \
0  A right chest tube is still in place, and a sm...             Paraphrased   
1  A right chest tube has been removed, a large r...  Clinically Significant   
2  A right chest tube remains in place, a small r...                   Noisy   
3  A right chest tube remains in place, with a sm...      Neutral Distractor   
4  A right chest tube remains in place, and an ov...            Hallucinated   

     Specialty Source  score_gpt4omini  score_gemini  score_clinicalbert  \
0  chest x-ray  CXRdf    