In [1]:
# ==== 基本依赖 ====
import os, re, math
import pandas as pd
import torch
from torch.nn.functional import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
from openai import OpenAI
from google import genai

# ==== Environment ====
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Only needed for the later robustness check; nothing in this cell uses it.
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
# No Gemini key is read here: genai.Client() is built without arguments below
# (presumably it picks GEMINI_API_KEY up from the environment - verify).

# ==== Models and clients ====
# Primary (fast) GPT scorer.
GPT_MODEL = "gpt-4o-mini"
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

# Gemini scorer, used only for the spot-check subset.
GEM_MODEL = "gemini-2.5-flash"
gem_client = genai.Client()

# ClinicalBERT baseline: load once, switch to eval mode, and place on GPU when available.
BERT_NAME = "emilyalsentzer/Bio_ClinicalBERT"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(BERT_NAME)
bert_model = AutoModel.from_pretrained(BERT_NAME)
bert_model.to(device)
bert_model.eval()

# ==== Load the base table ====
# Required columns: Original, Synthetic, Transform_Type, Specialty, Source
pairs = pd.read_csv("synthetic_reports.csv")
pairs = pairs.reset_index().rename(columns={"index": "pair_id"})

# Rows that share the same Original text get the same group_id.
# sort=False numbers the groups in order of first appearance.
pairs["group_id"] = pairs.groupby("Original", sort=False).ngroup()


# ==== Spot-check fraction (a single number) ====
SUBSET_FRAC = 0.2  # adjust between 20% and 30% as needed

all_groups = pd.Index(pairs["group_id"].unique())

# Sample whole groups, then sort the chosen ids so they are easy to inspect.
sampled_groups = all_groups.to_series().sample(frac=SUBSET_FRAC, random_state=42)
sampled_groups = sampled_groups.sort_values()

# Split into the spot-check subset and the remainder, both ordered by pair_id.
in_subset = pairs["group_id"].isin(sampled_groups)
subset = pairs[in_subset].sort_values("pair_id").reset_index(drop=True)
rest = pairs[~in_subset].sort_values("pair_id").reset_index(drop=True)

print(f"总组数: {pairs['group_id'].nunique()} | 抽检组数: {subset['group_id'].nunique()} | "
      f"subset条数: {len(subset)} | rest条数: {len(rest)}")

# ==== Shared helper: pull a numeric score out of a model reply ====
# Matches the first standalone integer or decimal, e.g. "12" or "12.3".
num_pattern = re.compile(r"(?<!\d)(\d+(?:\.\d+)?)(?!\d)")

def extract_score(text: str):
    """Return the first number found in *text*, clamped to the range 0-10.

    Args:
        text: Raw model reply. ``None`` or an empty string is treated as
            "no score" instead of raising.

    Returns:
        The clamped score as a float, or ``None`` when *text* contains no
        number.
    """
    # A model may return no content at all; search an empty string then.
    m = num_pattern.search(text or "")
    if m is None:
        return None
    val = float(m.group(1))
    # Conservative clamp to 0-10 in case the model answers out of range.
    return max(0.0, min(10.0, val))

# ==== Scoring prompt (no JSON; the model is asked to reply with a bare number from 0 to 10) ====
# {A} and {B} are filled in with str.format by the scoring functions.
SCORE_PROMPT = """You are a medical report evaluator.
Compare clinical meaning only (findings, laterality, devices, measurements, diagnoses).
Score from 0 to 10 where 10 = identical clinical meaning and 0 = contradictory/irrelevant.

Scoring instructions:
Assign a similarity score from 0.0 to 10.0 (one decimal place):
10.0: Completely identical in all clinically meaningful aspects.
8-9.9: Only very minor differences (e.g., "small" vs. "mild"; equivalent negative findings; different but clinically irrelevant wording).
6-7.9: Noticeable but not clinically significant differences (e.g., "small" vs. "moderate" effusion; more detail in one report, but the overall meaning is similar).
3-5.9: Clinically significant but not directly contradictory (e.g., one report notes a finding the other omits; or laterality mismatch, but not a direct contradiction).
0-2.9: Reports are clinically contradictory (e.g., one says "definite pneumonia," the other says "no pneumonia"; or one says "fracture present," the other "no fracture").
Return ONLY a single number (0-10, one decimal). No words.

Report A (original):
{A}

Report B (candidate):
{B}
"""

# ==== GPT-4o-mini scoring ====
def score_gpt(a: str, b: str) -> float:
    """Ask the GPT model how closely report *b* matches report *a*.

    Args:
        a: Original report text.
        b: Candidate report text.

    Returns:
        The value ``extract_score`` parses from the model reply; this is
        ``None`` when the reply is empty or contains no number.
    """
    prompt = SCORE_PROMPT.format(A=a, B=b)
    resp = gpt_client.chat.completions.create(
        model=GPT_MODEL,
        messages=[
            {"role":"system","content":"You are a precise clinical text evaluator."},
            {"role":"user","content":prompt}
        ],
        temperature=0,
        stream=False
    )
    # message.content can be None; fall back to an empty reply so the
    # pair is recorded as "no score" instead of raising AttributeError.
    out = (resp.choices[0].message.content or "").strip()
    return extract_score(out)

# ==== Gemini scoring (model chosen by GEM_MODEL) ====
def score_gemini(a: str, b: str, *, max_retries: int = 5, base_delay: float = 2.0) -> float:
    """Ask the Gemini model how closely report *b* matches report *a*.

    Server-side errors (for example "503 UNAVAILABLE: The model is
    overloaded") are retried with exponential backoff, so one transient
    failure does not abort a long scoring run.

    Args:
        a: Original report text.
        b: Candidate report text.
        max_retries: How many times to retry after a server error.
        base_delay: Seconds to wait before the first retry; the wait doubles
            on every further retry.

    Returns:
        The value ``extract_score`` parses from the model reply; this is
        ``None`` when the reply is empty or contains no number.

    Raises:
        google.genai.errors.ServerError: If the server still fails after the
            last retry.
    """
    import time
    from google.genai import errors as genai_errors

    prompt = SCORE_PROMPT.format(A=a, B=b)
    last_attempt = max(0, max_retries)
    for attempt in range(last_attempt + 1):
        try:
            response = gem_client.models.generate_content(model=GEM_MODEL, contents=prompt)
        except genai_errors.ServerError:
            if attempt == last_attempt:
                raise
            time.sleep(base_delay * 2 ** attempt)
            continue
        out = (response.text or "").strip()
        return extract_score(out)

# ==== ClinicalBERT cosine similarity (rescaled to 0-10) ====
def cls_embedding(text: str) -> torch.Tensor:
    """Embed *text* as the model's hidden state at the first token position.

    The input is truncated to at most 512 tokens and the forward pass runs
    without gradient tracking. The result is moved to the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Position 0 is the [CLS] token; for a single text this is [1, hidden].
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze(0).cpu()

def score_bert(a: str, b: str) -> float:
    """Score two texts by the cosine similarity of their [CLS] embeddings.

    The cosine value in [-1, 1] is mapped linearly onto [0, 10], clamped to
    that range, and rounded to two decimals.
    """
    similarity = cosine_similarity(cls_embedding(a), cls_embedding(b), dim=0).item()
    rescaled = (similarity + 1.0) / 2.0 * 10.0
    clamped = max(0.0, min(10.0, rescaled))
    return round(clamped, 2)

# ==== Minimal progress display ====
def progress(i, n, prefix="Progress"):
    """Overwrite the current console line with the percentage completed.

    Args:
        i: Zero-based index of the item that was just finished.
        n: Total number of items. When it is 0 there is nothing to do, so
            100% is shown instead of dividing by zero.
        prefix: Label printed before the percentage.
    """
    pct = int((i + 1) * 100 / n) if n else 100
    # "\r" with end="" keeps the display on one line.
    print(f"\r{prefix}: {pct}%", end="", flush=True)

# =========================
# (1) Spot-check subset: Gemini + GPT + BERT
# =========================
# Rows are appended one at a time so that sub_rows still holds every
# finished row if a scoring call raises part-way through.
sub_rows = []
N = len(subset)
for i, r in enumerate(subset.itertuples(index=False)):
    a, b = r.Original, r.Synthetic
    s_gpt = score_gpt(a, b)
    s_gemini = score_gemini(a, b)
    s_bert = score_bert(a, b)

    row = {"pair_id": r.pair_id, "Original": a, "Synthetic": b}
    row.update((col, getattr(r, col)) for col in ("Transform_Type", "Specialty", "Source"))
    row.update({
        "score_gpt4omini": s_gpt,
        "score_gemini": s_gemini,
        "score_clinicalbert": s_bert,
    })
    sub_rows.append(row)
    progress(i, N, prefix="Subset scoring")

print()  # finish the progress line
subset_scored = pd.DataFrame(sub_rows)
subset_scored.to_csv("subset_scored.csv", index=False)
print(f"✅ subset_scored.csv 保存完成：{len(subset_scored)}")

# =========================
# (2) Remaining pairs: GPT + BERT only
# =========================
rest_rows = []
N = len(rest)
for i, r in enumerate(rest.itertuples(index=False)):
    a, b = r.Original, r.Synthetic
    s_gpt = score_gpt(a, b)
    s_bert = score_bert(a, b)

    row = {"pair_id": r.pair_id, "Original": a, "Synthetic": b}
    row.update((col, getattr(r, col)) for col in ("Transform_Type", "Specialty", "Source"))
    row.update({"score_gpt4omini": s_gpt, "score_clinicalbert": s_bert})
    rest_rows.append(row)
    progress(i, N, prefix="Rest scoring")

print()
rest_scored = pd.DataFrame(rest_rows)
rest_scored.to_csv("rest_scored.csv", index=False)
print(f"✅ rest_scored.csv 保存完成：{len(rest_scored)}")

# (Optional) sanity-check the column names of both outputs.
for scored_frame in (subset_scored, rest_scored):
    print(scored_frame.columns.tolist())


总组数: 1605 | 抽检组数: 321 | subset条数: 1605 | rest条数: 6420
Subset scoring: 30%

ServerError: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}

In [2]:
import pandas as pd
# Persist the rows scored so far, plus the subset/rest split, so that
# scoring can be resumed from disk.
partial_scores = pd.DataFrame(sub_rows)
partial_scores.to_csv("subset_scored_partial.csv", index=False)
print("已保存 subset_scored_partial.csv")
for frame, csv_path in ((subset, "subset_pairs.csv"), (rest, "rest_pairs.csv")):
    frame.to_csv(csv_path, index=False)

已保存 subset_scored_partial.csv


In [3]:
import pandas as pd

# Load the full spot-check set and the rows that were already scored.
subset_all = pd.read_csv("subset_pairs.csv")
done_df = pd.read_csv("subset_scored_partial.csv")

done_ids = set(done_df["pair_id"])
already_done = subset_all["pair_id"].isin(done_ids)
todo = subset_all[~already_done].copy()

print(f"待续跑条数: {len(todo)}")

# Score what is left. Rows are appended one at a time so more_rows keeps
# every finished row if a scoring call raises part-way through.
more_rows = []
N = len(todo)
for i, r in enumerate(todo.itertuples(index=False)):
    a, b = r.Original, r.Synthetic
    s_gpt = score_gpt(a, b)
    s_gemini = score_gemini(a, b)
    s_bert = score_bert(a, b)

    row = {"pair_id": r.pair_id, "Original": a, "Synthetic": b}
    row.update((col, getattr(r, col)) for col in ("Transform_Type", "Specialty", "Source"))
    row.update({
        "score_gpt4omini": s_gpt,
        "score_gemini": s_gemini,
        "score_clinicalbert": round(s_bert, 2),
    })
    more_rows.append(row)

    # Simple inline progress display.
    pct = int((i + 1) * 100 / N)
    print(f"\rResume subset scoring: {pct}%", end="", flush=True)
print()

# Merge old and new rows, order by pair_id, keep one row per pair_id, save.
merged = pd.concat([done_df, pd.DataFrame(more_rows)], ignore_index=True)
merged = merged.sort_values("pair_id")
subset_scored = merged.drop_duplicates("pair_id", keep="last")
subset_scored.to_csv("subset_scored.csv", index=False)
print(f"✅ subset_scored.csv 完成：{len(subset_scored)}")


待续跑条数: 1112
Resume subset scoring: 84%

ServerError: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}

In [4]:
import pandas as pd

# Write the in-memory more_rows to disk as the second partial file.
second_partial = pd.DataFrame(more_rows)
second_partial.to_csv("subset_scored_partial2.csv", index=False)
print("✔ 保存 subset_scored_partial2.csv")


✔ 保存 subset_scored_partial2.csv


In [6]:
import pandas as pd

# Load the full spot-check set (the result of whole-group sampling).
subset_all = pd.read_csv("subset_pairs.csv")

# The two partial result files written so far.
p1 = pd.read_csv("subset_scored_partial.csv")
p2 = pd.read_csv("subset_scored_partial2.csv")

# Work out which pair_ids have no score yet.
done_ids = set(p1["pair_id"]).union(p2["pair_id"])
still_missing = ~subset_all["pair_id"].isin(done_ids)
todo = subset_all[still_missing].copy()
print("最后待跑条数:", len(todo))

# Score the remaining pairs.
final_rows = []
N = len(todo)
for i, r in enumerate(todo.itertuples(index=False)):
    a, b = r.Original, r.Synthetic
    s_gpt = score_gpt(a, b)
    s_gemini = score_gemini(a, b)
    s_bert = score_bert(a, b)

    row = {"pair_id": r.pair_id, "Original": a, "Synthetic": b}
    row.update((col, getattr(r, col)) for col in ("Transform_Type", "Specialty", "Source"))
    row.update({
        "score_gpt4omini": s_gpt,
        "score_gemini": s_gemini,
        "score_clinicalbert": round(s_bert, 2),
    })
    final_rows.append(row)

    # Simple inline progress display; an empty todo list counts as 100%.
    pct = int((i + 1) * 100 / N) if N else 100
    print(f"\rResume last 16%: {pct}%", end="", flush=True)
print()

# Merge the three parts, order by pair_id, keep one row per pair_id.
p3 = pd.DataFrame(final_rows)
merged = pd.concat([p1, p2, p3], ignore_index=True)
merged = merged.sort_values("pair_id")
subset_scored = merged.drop_duplicates("pair_id", keep="last")

# (Optional) round every score column that is present to two decimals.
score_cols = ["score_gpt4omini", "score_gemini", "score_clinicalbert"]
for col in score_cols:
    if col not in subset_scored:
        continue
    subset_scored[col] = subset_scored[col].round(2)

subset_scored.to_csv("subset_scored.csv", index=False)
print(f"✅ subset_scored.csv 完成：{len(subset_scored)} 条")


最后待跑条数: 169
Resume last 16%: 100%
✅ subset_scored.csv 完成：1605 条


In [7]:
# =========================
# (2) Remaining pairs: GPT + BERT only
# =========================
rest_rows = []
N = len(rest)
for i, r in enumerate(rest.itertuples(index=False)):
    a, b = r.Original, r.Synthetic
    s_gpt = score_gpt(a, b)
    s_bert = score_bert(a, b)

    row = {"pair_id": r.pair_id, "Original": a, "Synthetic": b}
    row.update((col, getattr(r, col)) for col in ("Transform_Type", "Specialty", "Source"))
    row.update({"score_gpt4omini": s_gpt, "score_clinicalbert": s_bert})
    rest_rows.append(row)
    progress(i, N, prefix="Rest scoring")

print()
rest_scored = pd.DataFrame(rest_rows)
rest_scored.to_csv("rest_scored.csv", index=False)
print(f"✅ rest_scored.csv 保存完成：{len(rest_scored)}")

Rest scoring: 100%
✅ rest_scored.csv 保存完成：6420


In [8]:
# deepseek 抽样
import os, re
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

# ================= Configuration =================
DS_MAX_PAIRS   = 200               # sample at most 200 pairs (about 40 groups of 5)
DEEPSEEK_KEY   = os.getenv("DEEPSEEK_API_KEY")
if not DEEPSEEK_KEY:
    # Explicit raise instead of assert, so the check also runs under "python -O".
    raise RuntimeError("请在环境变量中设置 DEEPSEEK_API_KEY")
# DeepSeek is reached through the OpenAI client by overriding base_url.
ds_client      = OpenAI(api_key=DEEPSEEK_KEY, base_url="https://api.deepseek.com")
DS_MODEL       = "deepseek-reasoner"
SUBSET_PATH    = "subset_pairs.csv"
REST_PATH      = "rest_pairs.csv"
ALL_PATH       = "synthetic_reports.csv"   # fallback when the split files are missing

# ================ Load the full pair table (with pair_id / group_id) =================
if not (os.path.exists(SUBSET_PATH) and os.path.exists(REST_PATH)):
    # Fallback: rebuild pair_id from the row order of the raw table.
    df_all = pd.read_csv(ALL_PATH)
    df_all = df_all.reset_index().rename(columns={"index":"pair_id"})
else:
    df_sub = pd.read_csv(SUBSET_PATH)
    df_rst = pd.read_csv(REST_PATH)
    df_all = pd.concat([df_sub, df_rst], ignore_index=True)

# Derive group_id from Original when the loaded table does not carry one.
if "group_id" not in df_all.columns:
    df_all["group_id"] = df_all.groupby("Original", sort=False).ngroup()

# Keep only the required columns, in a fixed order, sorted by pair_id.
base_cols = ["pair_id","group_id","Original","Synthetic","Transform_Type","Specialty","Source"]
df_all = df_all[base_cols]
df_all = df_all.sort_values("pair_id").reset_index(drop=True)

# ================ Group sampling (whole groups are drawn together) =================
n_groups_total = df_all["group_id"].nunique()
# The divisor 5 is the expected number of pairs per group.
n_groups_need  = min(DS_MAX_PAIRS // 5, n_groups_total)
unique_groups = pd.Series(df_all["group_id"].unique())
sampled_groups = unique_groups.sample(n=n_groups_need, random_state=123).sort_values()

in_sample = df_all["group_id"].isin(sampled_groups)
sample_df = df_all[in_sample].sort_values("pair_id").reset_index(drop=True)

# Cap at DS_MAX_PAIRS in case the groups yield more rows than expected.
if len(sample_df) > DS_MAX_PAIRS:
    sample_df = sample_df.iloc[:DS_MAX_PAIRS].copy()

print(f"DeepSeek 抽样：组数 {n_groups_need}/{n_groups_total}  → 样本对数 {len(sample_df)}")

# ================ Scoring: DeepSeek =================
# Matches the first standalone integer or decimal in a reply.
num_pattern = re.compile(r"(?<!\d)(\d+(?:\.\d+)?)(?!\d)")

SCORE_PROMPT = """You are a medical report evaluator.
Compare clinical meaning only (findings, laterality, devices, measurements, diagnoses).
Score from 0 to 10 where 10 = identical clinical meaning and 0 = contradictory/irrelevant.

Scoring instructions:
Assign a similarity score from 0.0 to 10.0 (one decimal place):
10.0: Completely identical in all clinically meaningful aspects.
8-9.9: Only very minor differences (e.g., "small" vs. "mild"; equivalent negative findings; different but clinically irrelevant wording).
6-7.9: Noticeable but not clinically significant differences (e.g., "small" vs. "moderate" effusion; more detail in one report, but the overall meaning is similar).
3-5.9: Clinically significant but not directly contradictory (e.g., one report notes a finding the other omits; or laterality mismatch, but not a direct contradiction).
0-2.9: Reports are clinically contradictory (e.g., one says "definite pneumonia," the other says "no pneumonia"; or one says "fracture present," the other "no fracture").
Return ONLY a single number (0-10, one decimal). No words.

Report A (original):
{A}

Report B (candidate):
{B}
"""

def extract_score(text: str):
    """Return the first number in *text*, clamped to 0-10 and rounded to 2 places.

    A falsy *text* (``None`` or ``""``) and a reply with no number both give
    ``None``.
    """
    found = num_pattern.search(text if text else "")
    if found is None:
        return None
    raw = float(found.group(1))
    # Conservative clamp to 0-10, then keep two decimals.
    clamped = max(0.0, min(10.0, raw))
    return round(clamped, 2)

def score_deepseek(a: str, b: str) -> float:
    """Ask the DeepSeek model how closely report *b* matches report *a*.

    Args:
        a: Original report text.
        b: Candidate report text.

    Returns:
        The value ``extract_score`` parses from the model reply; this is
        ``None`` when the reply is empty or contains no number.
    """
    prompt = SCORE_PROMPT.format(A=a, B=b)
    resp = ds_client.chat.completions.create(
        model=DS_MODEL,
        messages=[
            {"role":"system","content":"You are a precise clinical text evaluator."},
            {"role":"user","content":prompt}
        ],
        temperature=0,
        stream=False
    )
    # message.content can be None; fall back to an empty reply so the
    # pair is recorded as "no score" instead of raising AttributeError.
    out = (resp.choices[0].message.content or "").strip()
    return extract_score(out)

# Progress display
def progress(i, n, prefix="DeepSeek scoring"):
    """Overwrite the current console line with the percentage completed.

    Args:
        i: Zero-based index of the item that was just finished.
        n: Total number of items. When it is 0 there is nothing to do, so
            100% is shown instead of dividing by zero.
        prefix: Label printed before the percentage.
    """
    pct = int((i + 1) * 100 / n) if n else 100
    # "\r" with end="" keeps the display on one line.
    print(f"\r{prefix}: {pct}%", end="", flush=True)

# Score every sampled pair with DeepSeek. Rows are appended one at a time so
# that rows still holds every finished row if a call raises part-way through.
rows = []
N = len(sample_df)
copied_cols = ("pair_id", "Original", "Synthetic", "Transform_Type", "Specialty", "Source")
for i, r in enumerate(sample_df.itertuples(index=False)):
    s = score_deepseek(r.Original, r.Synthetic)
    row = {col: getattr(r, col) for col in copied_cols}
    row["score_deepseek"] = s
    rows.append(row)
    progress(i, N)

print()  # finish the progress line

deepseek_scored = pd.DataFrame(rows)
deepseek_scored = deepseek_scored.sort_values("pair_id")
deepseek_scored.to_csv("deepseek_scored.csv", index=False)
print(f"✅ deepseek_scored.csv 保存完成：{len(deepseek_scored)} 条")


DeepSeek 抽样：组数 40/1605  → 样本对数 200
DeepSeek scoring: 100%
✅ deepseek_scored.csv 保存完成：200 条
