In [None]:
import pandas as pd

# Load the master table of reports
df = pd.read_csv("data/complete_reports.csv")

# Draw 1000 rows (pandas samples without replacement by default) with a fixed
# seed so the draw is repeatable, then renumber the index from 0
df_sample = (
    df.sample(n=1000, random_state=42)
    .copy()
    .reset_index(drop=True)
)

# Show the column names so the presence of 'Original' can be checked by eye
print(df_sample.columns)


Index(['Original', 'Source', 'Specialty'], dtype='object')


Paraphrased

Clinically Significant

Noisy/Ambiguous

Neutral Distractor

Omission

Hallucinated

In [7]:
# prompt to generate the synthetic reports
import random

# Prompt prefix for each LLM-generated variant. The keys double as the "Label"
# values written to the synthetic dataset; each prefix is followed by the
# original report text when the prompt is built.
INSTRUCTIONS = {
    "Paraphrased": (
        "Paraphrase the following medical report without changing its clinical meaning:"
    ),
    "Clinically Significant": (
        "Modify the following report to introduce a clinically significant change "
        "(for example, reverse or alter a key finding):"
    ),
    "Noisy": (
        "Add vague, ambiguous, or contradictory statements to the following medical report, "
        "while keeping the main findings:"
    ),
    "Neutral": (
        "Add a sentence of irrelevant but harmless information to the following medical report, "
        "without changing the clinical findings:"
    ),
}

def script_omission(text, rng=None):
    """Return `text` with one randomly chosen sentence removed.

    Sentences are the pieces obtained by splitting on '. '. Texts that split
    into two or fewer pieces are returned unchanged.

    Args:
        text: Report text to shorten.
        rng: Optional object exposing ``randint(a, b)`` (for example a seeded
            ``random.Random``). Defaults to the module-level ``random``.

    Returns:
        The text with one piece dropped, or `text` itself when it is too short.
    """
    source = random if rng is None else rng
    sents = text.split('. ')
    if len(sents) <= 2:
        return text

    omit = source.randint(0, len(sents) - 1)
    dropped_last = omit == len(sents) - 1
    sents.pop(omit)
    shortened = '. '.join(sents)

    # Every piece except the last had its '.' consumed by the split delimiter,
    # so when the last piece is dropped the new final sentence needs it back.
    if dropped_last and not shortened.endswith('.'):
        shortened += '.'
    return shortened

def script_hallucinated(text):
    """Return `text` followed by a space and one randomly chosen fabricated finding.

    The finding is drawn with the module-level ``random`` from a fixed set of
    four canned sentences.
    """
    fabricated_findings = (
        "A previously undocumented mass was detected in the left lung.",
        "Imaging reveals possible early-stage sarcoidosis.",
        "An unexpected metabolic anomaly is noted.",
        "No radiological evidence of infectious disease is present, but an incidental cyst is seen in the right lobe.",
    )
    prefix = text + " "
    return prefix + random.choice(fabricated_findings)



In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if any) before reading the API key
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
# Module-level client used by gpt_generate()
client = OpenAI(api_key=openai_api_key)

def gpt_generate(original, instruction, model="gpt-4o", temperature=0.7):
    """Ask a chat model to rewrite a report according to an instruction.

    The prompt is the instruction, a blank line, then the original report,
    sent as a single user message through the module-level ``client``.

    Args:
        original: Report text to transform.
        instruction: Instruction placed before the report.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        ValueError: If the response carries no choices or no text content.
        Exception: Anything raised by the API client is propagated unchanged.
    """
    prompt = f"{instruction}\n\n{original}"
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    if not resp.choices:
        raise ValueError(f"{model} returned no choices")
    content = resp.choices[0].message.content
    # content can be absent (e.g. a refusal); fail with a readable message
    # instead of an AttributeError on None.strip()
    if content is None:
        raise ValueError(f"{model} returned no text content")
    return content.strip()



In [9]:
# Smoke test: one paraphrase request before launching the full run.
original = "Chest X-ray shows no acute cardiopulmonary abnormality."
# Read the prompt from INSTRUCTIONS so this check exercises the same text the
# full generation loop sends, rather than a copy that can drift.
instruction = INSTRUCTIONS["Paraphrased"]
print(gpt_generate(original, instruction, model="gpt-3.5-turbo"))


The chest X-ray does not reveal any acute abnormalities related to the heart or lungs.


In [10]:
# Build six synthetic variants for every sampled report and save them to CSV.
LLM_LABELS = ["Paraphrased", "Clinically Significant", "Noisy", "Neutral"]
SYNTHETIC_CSV = "synthetic_1000_gpt3.5-turbo.csv"
CHECKPOINT_EVERY = 20  # originals between progress messages / checkpoint writes

synthetic_data = []
# enumerate() gives a position-based counter, so progress reporting does not
# depend on df_sample carrying a 0..N-1 index.
for n_done, (_, row) in enumerate(df_sample.iterrows(), start=1):
    orig = row["Original"]
    meta = {"Source": row["Source"], "Specialty": row["Specialty"]}

    # Four variants written by the LLM
    for label in LLM_LABELS:
        try:
            synthetic = gpt_generate(orig, INSTRUCTIONS[label], model="gpt-3.5-turbo")
        except Exception as e:
            # Keep the row, but store the failure text in place of a report;
            # such rows can be found later by their "LLM_ERROR: " prefix.
            synthetic = f"LLM_ERROR: {e}"
        synthetic_data.append({
            "Original": orig,
            "Synthetic": synthetic,
            "Label": label,
            **meta
        })

    # Two variants produced by local scripts (omission first, then hallucination)
    for label, make_variant in (("Omission", script_omission),
                                ("Hallucinated", script_hallucinated)):
        synthetic_data.append({
            "Original": orig,
            "Synthetic": make_variant(orig),
            "Label": label,
            **meta
        })

    if n_done % CHECKPOINT_EVERY == 0:
        # Write what exists so far: an interruption then costs at most
        # CHECKPOINT_EVERY originals instead of the whole run.
        pd.DataFrame(synthetic_data).to_csv(SYNTHETIC_CSV, index=False)
        print(f"Processed {n_done} / {len(df_sample)} originals...")

# Final save
synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df.to_csv(SYNTHETIC_CSV, index=False)


Processed 20 / 1000 originals...
Processed 40 / 1000 originals...
Processed 60 / 1000 originals...
Processed 80 / 1000 originals...
Processed 100 / 1000 originals...
Processed 120 / 1000 originals...
Processed 140 / 1000 originals...
Processed 160 / 1000 originals...
Processed 180 / 1000 originals...
Processed 200 / 1000 originals...
Processed 220 / 1000 originals...
Processed 240 / 1000 originals...
Processed 260 / 1000 originals...
Processed 280 / 1000 originals...
Processed 300 / 1000 originals...
Processed 320 / 1000 originals...
Processed 340 / 1000 originals...
Processed 360 / 1000 originals...
Processed 380 / 1000 originals...
Processed 400 / 1000 originals...
Processed 420 / 1000 originals...
Processed 440 / 1000 originals...
Processed 460 / 1000 originals...
Processed 480 / 1000 originals...
Processed 500 / 1000 originals...
Processed 520 / 1000 originals...
Processed 540 / 1000 originals...
Processed 560 / 1000 originals...
Processed 580 / 1000 originals...
Processed 600 / 10

In [None]:
import pandas as pd
import os
import torch
from dotenv import load_dotenv
from openai import OpenAI
from google import genai
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import re
from tqdm import tqdm

# 1. Read the API keys and initialise one client per provider
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")

client_openai = OpenAI(api_key=openai_api_key)
# DeepSeek is reached through the OpenAI client class pointed at its own base URL
client_deepseek = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")
# Pass the key explicitly, like the other two clients, so the value loaded
# above is the one actually used rather than whatever the SDK finds on its own.
client_gemini = genai.Client(api_key=gemini_api_key)

# 2. Load the embedding models
def _load_encoder(checkpoint):
    """Load the tokenizer, then the model, for one Hugging Face checkpoint name."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    return tokenizer, model


tokenizer_bio, model_bio = _load_encoder("dmis-lab/biobert-base-cased-v1.1")
tokenizer_clin, model_clin = _load_encoder("emilyalsentzer/Bio_ClinicalBERT")

# 3. Scoring functions, one per model family

# Instruction shared by every LLM judge; the two reports are appended to it.
_SCORING_PROMPT_HEADER = (
    "You are a medical expert. How similar are the following two medical reports? Give a score from 0 to 10 and briefly explain your reasoning. Be precise and concise, avoid lengthy details.\n\n"
)

_NUMBER = r'(\d+(?:\.\d+)?)'
# "8/10", "7.5 / 10", "9 out of 10"
_OUT_OF_TEN_RE = re.compile(_NUMBER + r'\s*(?:/|out of)\s*10(?!\d)', re.IGNORECASE)
# "Score: 8", "score of 8", "**Score:** 8"
_LABELLED_RE = re.compile(r'score\s*(?:is|of)?[\s:*=\-]*' + _NUMBER, re.IGNORECASE)
_FIRST_NUMBER_RE = re.compile(_NUMBER)


def _build_scoring_prompt(original, synthetic):
    """Return the judge prompt for one (original, synthetic) report pair."""
    return (
        _SCORING_PROMPT_HEADER
        + f"Report A (Original):\n{original}\n\n"
        + f"Report B (Synthetic):\n{synthetic}"
    )


def _parse_score(content):
    """Extract the similarity score from a judge's reply, or None if absent.

    Explicit forms ("N/10", "N out of 10", "score: N") are preferred and must
    lie in 0..10. Only if none is found does this fall back to the first
    number in the text, because replies often mention other numbers (counts,
    measurements) before stating the score.
    """
    for pattern in (_OUT_OF_TEN_RE, _LABELLED_RE):
        for match in pattern.finditer(content):
            value = float(match.group(1))
            if 0 <= value <= 10:
                return value
    match = _FIRST_NUMBER_RE.search(content)
    return float(match.group(1)) if match else None


def llm_score(original, synthetic, client, model="gpt-4o"):
    """Score report similarity with a chat-completions style client.

    Args:
        original: Original report text.
        synthetic: Synthetic report text to compare against it.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name passed to the API.

    Returns:
        ``(score, content)`` where score is a float or None when no number
        could be parsed, and content is the stripped reply. On any exception
        returns ``(None, "<model>_ERROR: <message>")`` instead of raising.
    """
    prompt = _build_scoring_prompt(original, synthetic)
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        content = resp.choices[0].message.content.strip()
        return _parse_score(content), content
    except Exception as e:
        return None, f"{model}_ERROR: {e}"


def deepseek_score(original, synthetic, client):
    """Score with DeepSeek; `client` is an OpenAI-format client whose base URL
    was set when it was constructed."""
    return llm_score(original, synthetic, client, model="deepseek-reasoner")


def gemini_score(original, synthetic, client, model="gemini-2.5-pro"):
    """Score report similarity with a client exposing ``models.generate_content``.

    Uses the same prompt and score parsing as llm_score.

    Returns:
        ``(score, content)``; on any exception ``(None, "GEMINI_ERROR: <message>")``.
    """
    prompt = _build_scoring_prompt(original, synthetic)
    try:
        response = client.models.generate_content(
            model=model, contents=prompt
        )
        content = response.text
        return _parse_score(content), content
    except Exception as e:
        return None, f"GEMINI_ERROR: {e}"

def bert_similarity(original, synthetic, tokenizer, model):
    """Cosine similarity between mean-pooled encoder outputs of two texts.

    Each text is tokenized on its own (truncated to at most 512 tokens) and
    run through `model` without gradient tracking; `last_hidden_state` is
    averaged over dim 1 (presumably the token axis; confirm against the model
    in use) to give one vector per text.

    Returns:
        The single similarity value taken from sklearn's cosine_similarity
        result matrix.
    """
    pooled = []
    for text in (original, synthetic):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            pooled.append(hidden.mean(dim=1).numpy())
    first, second = pooled
    return cosine_similarity(first, second)[0][0]

# 4. Load the synthetic dataset and score every (original, synthetic) pair
synthetic_df = pd.read_csv("synthetic_1000_gpt3.5-turbo.csv")

PARTIAL_CSV = "synthetic_eval_partial.csv"
SAVE_EVERY = 50  # rows between checkpoint writes

# (column prefix, scorer) pairs; list order fixes the output column order
LLM_SCORERS = [
    ("GPT-4o", lambda o, s: llm_score(o, s, client_openai, model="gpt-4o")),
    ("GPT-3.5", lambda o, s: llm_score(o, s, client_openai, model="gpt-3.5-turbo")),
    ("DeepSeek", lambda o, s: deepseek_score(o, s, client_deepseek)),
    ("Gemini", lambda o, s: gemini_score(o, s, client_gemini)),
]
# (column name, tokenizer, model) triples for the embedding similarities
EMBEDDING_SCORERS = [
    ("BioBERT_sim", tokenizer_bio, model_bio),
    ("ClinicalBERT_sim", tokenizer_clin, model_clin),
]

# Resume from the last checkpoint when it belongs to this dataset, so an
# interrupted multi-hour run does not start again from row 0.
results = []
if os.path.exists(PARTIAL_CSV):
    partial_df = pd.read_csv(PARTIAL_CSV)
    n_partial = len(partial_df)
    head = synthetic_df.iloc[:n_partial]
    same_run = (
        n_partial <= len(synthetic_df)
        and {"Original", "Label"} <= set(partial_df.columns)
        and partial_df["Label"].tolist() == head["Label"].tolist()
        and partial_df["Original"].fillna("").tolist() == head["Original"].fillna("").tolist()
    )
    if same_run:
        results = partial_df.to_dict("records")
        print(f"Resuming after {n_partial} rows found in {PARTIAL_CSV}")
    else:
        print(f"{PARTIAL_CSV} does not match the current dataset; starting from row 0")

n_done = len(results)
# iloc keeps the original index labels, so idx continues where the checkpoint stopped
for idx, row in tqdm(synthetic_df.iloc[n_done:].iterrows(),
                     total=len(synthetic_df), initial=n_done):
    orig, synth = row["Original"], row["Synthetic"]
    meta = {k: row[k] for k in ["Label", "Source", "Specialty"]}
    result = {
        "Original": orig, "Synthetic": synth, **meta
    }

    # LLM judges: one score column and one explanation column each
    for prefix, scorer in LLM_SCORERS:
        score, explanation = scorer(orig, synth)
        result[f"{prefix}_Score"] = score
        result[f"{prefix}_Explain"] = explanation

    # Embedding-based similarities
    for column, tokenizer, model in EMBEDDING_SCORERS:
        result[column] = bert_similarity(orig, synth, tokenizer, model)

    results.append(result)

    # Periodic checkpoint so an interruption loses at most SAVE_EVERY rows
    if (idx+1) % SAVE_EVERY == 0:
        pd.DataFrame(results).to_csv(PARTIAL_CSV, index=False)
        print(f"已保存 {idx+1} 条...")

# Final save
results_df = pd.DataFrame(results)
results_df.to_csv("synthetic_eval_results.csv", index=False)


  1%|          | 50/6000 [26:00<52:08:18, 31.55s/it]

已保存 50 条...


  1%|▏         | 77/6000 [39:46<54:55:07, 33.38s/it]

: 

: 