In [1]:
import pandas as pd
import os
import torch
from dotenv import load_dotenv
from openai import OpenAI
from google import genai
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm

# 1. Read the API keys (from the environment or a .env file) and build the clients
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")

# The OpenAI client falls back to the OPENAI_API_KEY environment variable when
# api_key is None, so a missing DeepSeek key would silently send the OpenAI
# key to api.deepseek.com. Fail fast instead.
if not deepseek_api_key:
    raise RuntimeError("DEEPSEEK_API_KEY is not set")

client_openai = OpenAI(api_key=openai_api_key)
# DeepSeek exposes an OpenAI-compatible endpoint, so the same client class is reused.
client_deepseek = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")
# Hand over the key that was just read rather than relying on the SDK's own
# environment lookup (which is still used as a fallback when the key is None).
client_gemini = genai.Client(api_key=gemini_api_key)

# 2. Load the embedding models: one tokenizer/encoder pair per checkpoint
tokenizer_bio, model_bio = (
    AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1"),
    AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1"),
)
tokenizer_clin, model_clin = (
    AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT"),
    AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT"),
)

# 3. Scoring functions, one per model family

# Instruction shared by every LLM judge so that all models answer the same question.
_PROMPT_HEADER = (
    "You are a medical expert. How similar are the following two medical reports? Give a score from 0 to 10 and briefly explain your reasoning. Be precise and concise, avoid lengthy details.\n\n"
)

# Tried in order, from most to least specific, so that an incidental number in
# the explanation (e.g. "the 2 reports") is not mistaken for the score.
_SCORE_PATTERNS = (
    re.compile(r"(\d+(?:\.\d+)?)\s*(?:/|out of)\s*10\b", re.IGNORECASE),  # "8/10", "8 out of 10"
    re.compile(r"\bscore\b[^\d\n]{0,20}?(\d+(?:\.\d+)?)", re.IGNORECASE),  # "Score: 8"
    re.compile(r"(\d+(?:\.\d+)?)"),  # any number
)


def _build_prompt(original, synthetic):
    """Return the judging prompt for one original/synthetic report pair."""
    return (
        f"{_PROMPT_HEADER}"
        f"Report A (Original):\n{original}\n\n"
        f"Report B (Synthetic):\n{synthetic}"
    )


def _extract_score(text):
    """Return the first plausible 0-10 score found in *text*, or None."""
    for pattern in _SCORE_PATTERNS:
        for match in pattern.finditer(text):
            value = float(match.group(1))
            # The prompt asks for 0-10; anything else is not a score.
            if 0.0 <= value <= 10.0:
                return value
    return None


def llm_score(original, synthetic, client, model="gpt-4o"):
    """Ask an OpenAI-compatible chat model to rate the similarity of two reports.

    Args:
        original: Text of the original report.
        synthetic: Text of the synthetic report.
        client: Object exposing ``chat.completions.create`` (OpenAI-style).
        model: Model name passed to the API.

    Returns:
        ``(score, content)`` where ``score`` is a float in [0, 10] or None when
        no score could be parsed, and ``content`` is the stripped reply text.
        When the request fails or the reply is empty, ``score`` is None and
        ``content`` is ``"<model>_ERROR: <reason>"``.
    """
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": _build_prompt(original, synthetic)}],
            temperature=0,
        )
        content = (resp.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort: one failed request must not abort a long evaluation run.
        return None, f"{model}_ERROR: {e}"
    if not content:
        return None, f"{model}_ERROR: empty response"
    return _extract_score(content), content


def deepseek_score(original, synthetic, client):
    """Rate similarity with DeepSeek's reasoner model.

    ``client`` is an OpenAI-style client whose base URL already points at the
    DeepSeek endpoint; the return value is the same as ``llm_score``.
    """
    return llm_score(original, synthetic, client, model="deepseek-reasoner")


def gemini_score(original, synthetic, client):
    """Ask Gemini to rate the similarity of two reports.

    Args:
        original: Text of the original report.
        synthetic: Text of the synthetic report.
        client: Object exposing ``models.generate_content`` (google-genai style).

    Returns:
        ``(score, content)`` with the same meaning as ``llm_score``; failures
        and empty replies are reported as ``(None, "GEMINI_ERROR: <reason>")``.
    """
    try:
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=_build_prompt(original, synthetic),
            # Same sampling temperature as the chat-completions judges.
            config={"temperature": 0},
        )
        # ``text`` may be None (e.g. no candidate text was returned).
        content = (response.text or "").strip()
    except Exception as e:
        # Best effort: one failed request must not abort a long evaluation run.
        return None, f"GEMINI_ERROR: {e}"
    if not content:
        return None, "GEMINI_ERROR: empty response"
    return _extract_score(content), content

def bert_similarity(original, synthetic, tokenizer, model):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is tokenized (truncated to 512 tokens), run through ``model``
    without gradient tracking, and represented by the mean of the last hidden
    state over the token axis.

    Args:
        original: First text.
        synthetic: Second text.
        tokenizer: Callable returning a mapping of model inputs as tensors.
        model: Callable accepting those inputs and returning an object with a
            ``last_hidden_state`` tensor.

    Returns:
        The cosine similarity as a plain Python float.
    """
    # Run on whatever device the model lives on; callables without a
    # ``device`` attribute are used as-is.
    device = getattr(model, "device", None)

    def get_emb(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            output = model(**inputs)
        # Mean over the token axis -> one vector per input text.
        return output.last_hidden_state.mean(dim=1)

    emb1 = get_emb(original)
    emb2 = get_emb(synthetic)
    # Staying in torch avoids the CPU-only ``.numpy()`` round trip.
    sim = torch.nn.functional.cosine_similarity(emb1, emb2, dim=1)
    return sim[0].item()

# 4. Load the data and score every original/synthetic pair with each judge
synthetic_df = pd.read_csv("synthetic_1000_gpt3.5-turbo.csv")

results = []
try:
    # ``idx`` is the 0-based position in the frame (not the index label), so
    # the checkpoint arithmetic below holds for any DataFrame index.
    for idx, (_, row) in enumerate(
        tqdm(synthetic_df.iterrows(), total=len(synthetic_df))
    ):
        orig, synth = row["Original"], row["Synthetic"]
        meta = {k: row[k] for k in ["Label", "Source", "Specialty"]}
        result = {
            "Original": orig, "Synthetic": synth, **meta
        }

        # GPT-4o
        score_4o, explain_4o = llm_score(orig, synth, client_openai, model="gpt-4o")
        result["GPT-4o_Score"] = score_4o
        result["GPT-4o_Explain"] = explain_4o

        # GPT-3.5-turbo
        score_35, explain_35 = llm_score(orig, synth, client_openai, model="gpt-3.5-turbo")
        result["GPT-3.5_Score"] = score_35
        result["GPT-3.5_Explain"] = explain_35

        # DeepSeek
        score_ds, explain_ds = deepseek_score(orig, synth, client_deepseek)
        result["DeepSeek_Score"] = score_ds
        result["DeepSeek_Explain"] = explain_ds

        # Gemini
        score_gem, explain_gem = gemini_score(orig, synth, client_gemini)
        result["Gemini_Score"] = score_gem
        result["Gemini_Explain"] = explain_gem

        # BioBERT
        result["BioBERT_sim"] = bert_similarity(orig, synth, tokenizer_bio, model_bio)

        # ClinicalBERT
        result["ClinicalBERT_sim"] = bert_similarity(orig, synth, tokenizer_clin, model_clin)

        results.append(result)

        # Checkpoint every 60 rows; each file holds all rows scored so far and
        # is named synthetic_eval_1.csv, synthetic_eval_2.csv, ...
        if (idx + 1) % 60 == 0:
            file_idx = (idx + 1) // 60
            pd.DataFrame(results).to_csv(f"synthetic_eval_{file_idx}.csv", index=False)
            print(f"已保存 {idx + 1} 条到 synthetic_eval_{file_idx}.csv ...")
except BaseException:
    # Interrupted or crashed: keep the rows scored since the last checkpoint.
    # A separate file name is used so that a complete results file from an
    # earlier run is never replaced by a partial one.
    if results:
        pd.DataFrame(results).to_csv("synthetic_sample_results_partial.csv", index=False)
    raise

# Final save
results_df = pd.DataFrame(results)
results_df.to_csv("synthetic_sample_results.csv", index=False)





  1%|          | 60/6000 [49:32<75:38:46, 45.85s/it]

已保存 60 条到 synthetic_eval_1.csv ...


  2%|▏         | 120/6000 [1:35:17<73:33:20, 45.03s/it]

已保存 120 条到 synthetic_eval_2.csv ...


  3%|▎         | 180/6000 [2:23:03<78:49:35, 48.76s/it]

已保存 180 条到 synthetic_eval_3.csv ...


  4%|▍         | 240/6000 [3:08:40<72:48:28, 45.50s/it]

已保存 240 条到 synthetic_eval_4.csv ...


  5%|▌         | 300/6000 [3:56:26<79:01:11, 49.91s/it]

已保存 300 条到 synthetic_eval_5.csv ...


  6%|▌         | 360/6000 [4:48:21<68:46:12, 43.90s/it] 

已保存 360 条到 synthetic_eval_6.csv ...


  7%|▋         | 420/6000 [5:36:47<77:12:05, 49.81s/it]

已保存 420 条到 synthetic_eval_7.csv ...


  8%|▊         | 480/6000 [6:24:56<77:00:48, 50.23s/it]

已保存 480 条到 synthetic_eval_8.csv ...


  9%|▉         | 540/6000 [7:16:24<74:44:08, 49.28s/it]

已保存 540 条到 synthetic_eval_9.csv ...


 10%|█         | 600/6000 [8:08:23<72:54:54, 48.61s/it]

已保存 600 条到 synthetic_eval_10.csv ...


 11%|█         | 660/6000 [9:00:47<73:52:24, 49.80s/it]

已保存 660 条到 synthetic_eval_11.csv ...


 12%|█▏        | 720/6000 [9:52:22<114:49:08, 78.29s/it]

已保存 720 条到 synthetic_eval_12.csv ...


 13%|█▎        | 780/6000 [10:42:40<72:25:47, 49.95s/it]

已保存 780 条到 synthetic_eval_13.csv ...


 14%|█▍        | 840/6000 [11:32:36<70:09:54, 48.95s/it]

已保存 840 条到 synthetic_eval_14.csv ...


 15%|█▌        | 900/6000 [12:24:49<59:06:31, 41.72s/it]

已保存 900 条到 synthetic_eval_15.csv ...


 16%|█▌        | 960/6000 [13:14:41<66:43:53, 47.67s/it]

已保存 960 条到 synthetic_eval_16.csv ...


 17%|█▋        | 1020/6000 [14:07:21<68:53:27, 49.80s/it]

已保存 1020 条到 synthetic_eval_17.csv ...


 17%|█▋        | 1028/6000 [14:15:32<68:57:52, 49.93s/it]


KeyboardInterrupt: 