In [None]:
"""
1. Paraphrased (semantically equivalent): Reworded with no change in meaning.
2. Clinically Significant: Subtle edits that reverse or alter findings (e.g., adding/removing "no", changing “mild” to “severe”).
3. Noisy: Add vague, contradictory, or irrelevant statements.
4. Neutral Distractor: Shuffle or add non-impactful content.
5. Hallucinated: Add medically inaccurate or incorrect/fake content.

"""



In [None]:
import os
import re
import random
import pandas as pd
from copy import deepcopy
from dotenv import load_dotenv
from openai import OpenAI

# ============== Environment & client ==============
# Read the API key from the process environment; load_dotenv() is called first
# so that a local .env file (if present) can supply it.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)  # same setup as in your example

# ============== Load the sampled data ==============
# Columns: Original, Source, Specialty
df = pd.read_csv("Stratified_sampling_cleaned.csv")

# ============== Unified prompt ==============
# One template is used for every LLM-generated category. It always lists all
# category rules; the two placeholders {ORIGINAL_REPORT} and
# {CATEGORY_DESCRIPTION} are filled in with str.format() by llm_generate().
PROMPT_TEMPLATE = """You are a medical report rewriting assistant.
You will receive an original medical report. Your task is to create ONE synthetic version according to the given category rules, preserving the overall style, structure, and medical terminology of the original report.

Original report:
{ORIGINAL_REPORT}

Category: {CATEGORY_DESCRIPTION}

Instructions:
- Keep the medical style consistent with the original.
- Do not include explanations, reasons, or any extra commentary in your output. Output only the synthetic report text.
- Maintain realistic medical details unless the category requires alteration (e.g., Hallucinated).
- Keep the report length roughly similar to the original unless specified otherwise.

Category definitions and rules:
1. Paraphrased (semantically equivalent):
   - Reword the report without changing its meaning.
   - Keep all factual medical findings identical.
   - Only alter phrasing, sentence structure, or synonyms.

2. Clinically Significant:
   - Make subtle but important changes to clinical findings that reverse or alter the diagnosis.
   - Examples: add/remove “no” to reverse meaning, change “mild” to “severe”, “normal” to “abnormal”.
   - Only one or two changes; keep other details exactly the same.

3. Neutral Distractor:
   - Shuffle sentences or add non-impactful content (e.g., “The patient tolerated the procedure well”).
   - Added content must not alter medical findings or diagnoses.
   - All clinical conclusions remain the same.

4. Hallucinated:
   - Insert medically inaccurate or impossible findings (e.g., mention organs not related to the exam, impossible measurements).
   - Keep most of the report intact, but add 1-2 fabricated medical facts.
   - Fabrications should be obvious to a medical expert but still sound professional.

Now rewrite the report according to the selected category.
"""

# Mapping: category name -> description text substituted into the prompt's
# "Category:" line. "Noisy" is intentionally absent: that category is produced
# by generate_noisy_version() rather than by the LLM.
CATEGORY_MAP = {
    "Paraphrased": "Paraphrased (semantically equivalent)",
    "Clinically Significant": "Clinically Significant",
    "Neutral Distractor": "Neutral Distractor",
    "Hallucinated": "Hallucinated",
}

# ============== Noisy generation (rule-based script) ==============
def generate_noisy_version(text: str) -> str:
    """
    Return a vaguer, more hedged variant of ``text`` without editing its findings.

    The perturbation is deliberately minimal and randomized:

    1. In one randomly chosen sentence that contains no negation/exclusion
       word, the first "is" / "are" / "shows" / "demonstrates" (tried in that
       priority order) is replaced by a random weakening phrase. Negated
       sentences are skipped so that positive/negative meaning is not touched.
    2. The first standalone number gets "approximately" / "about" / "around"
       inserted in front of it; the number itself is unchanged.
    3. With 30% probability an advisory sentence is appended.

    Sentences are re-joined with a single space, so whitespace between
    sentences is normalized. Empty or whitespace-only input is returned as is.

    Args:
        text: The original report text.

    Returns:
        The perturbed report text.
    """
    weakening_phrases = ["likely", "appears to be", "suggests", "possibly", "may represent", "cannot rule out"]
    approx_phrases = ["approximately", "about", "around"]
    advisory_phrases = ["Further evaluation is recommended.", "Clinical correlation is advised."]

    # Nothing to perturb; also avoids emitting a dangling ". <advisory>".
    if not text.strip():
        return text

    # Word-boundary matching catches negations next to punctuation or line
    # breaks ("no\n", "without,"), which space-padded substring checks miss.
    negation_re = re.compile(r"\b(?:no|not|without|absent|negative\s+for)\b", re.IGNORECASE)
    verb_patterns = (r"\bis\b", r"\bare\b", r"\bshows\b", r"\bdemonstrates\b")

    # Rough sentence split: break on whitespace that follows ., ! or ?
    sentences = re.split(r'(?<=[\.\!\?])\s+', text)
    order = list(range(len(sentences)))
    random.shuffle(order)

    # 1) Weaken exactly one verb in one non-negated sentence (minimal perturbation).
    for i in order:
        sentence = sentences[i]
        if not sentence.strip() or negation_re.search(sentence):
            continue
        weakened = None
        for pat in verb_patterns:
            if re.search(pat, sentence):
                weakened = re.sub(pat, random.choice(weakening_phrases), sentence, count=1)
                break
        if weakened is not None:
            sentences[i] = weakened
            break

    noisy_text = " ".join(sentences)

    # 2) Blur the first standalone number. Insert at the match position rather
    #    than using str.replace(): the same digits may occur earlier inside
    #    another token (e.g. the "2" in "L12"), which must not be edited.
    m = re.search(r"\b(\d+(\.\d+)?)\b", noisy_text)
    if m:
        start = m.start(1)
        noisy_text = f"{noisy_text[:start]}{random.choice(approx_phrases)} {noisy_text[start:]}"

    # 3) 30% chance of appending an advisory sentence (no change to conclusions).
    if random.random() < 0.3:
        if not noisy_text.strip().endswith(('.', '!', '?')):
            noisy_text += "."
        noisy_text += " " + random.choice(advisory_phrases)

    return noisy_text

# ============== LLM generation ==============
def llm_generate(original_text: str, transform_type: str, model_name: str = "gpt-4o", temperature: float = 0.7) -> str:
    """
    Generate one synthetic report with the unified prompt.

    The prompt always contains the full set of category rules; the current
    category is passed in through the "Category:" line.

    Args:
        original_text: The original report to rewrite.
        transform_type: A key of CATEGORY_MAP (e.g. "Paraphrased").
        model_name: Chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        KeyError: If ``transform_type`` is not a key of CATEGORY_MAP.
        RuntimeError: If the response carries no text content.
    """
    category_desc = CATEGORY_MAP[transform_type]
    prompt = PROMPT_TEMPLATE.format(
        ORIGINAL_REPORT=original_text,
        CATEGORY_DESCRIPTION=category_desc
    )

    resp = client.chat.completions.create(
        model=model_name,          # e.g. gpt-4o / gpt-4o-mini
        messages=[
            {"role": "system", "content": "You are a helpful assistant specialized in clinical text rewriting."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        stream=False
    )

    # message.content is nullable in the API response; fail with a clear
    # message instead of an AttributeError on None.strip().
    content = resp.choices[0].message.content
    if content is None:
        raise RuntimeError(
            f"Model {model_name!r} returned no text content for transform type {transform_type!r}"
        )
    return content.strip()

# ============== Batch generation + simple percentage progress ==============
# For every report in df, produce one synthetic variant per transform type and
# write all pairs to synthetic_reports.csv. "Noisy" is generated by the local
# script; every other type goes through the LLM.
transform_types = ["Paraphrased", "Clinically Significant", "Noisy", "Neutral Distractor", "Hallucinated"]

total_pairs = len(df) * len(transform_types)
done = 0
last_pct = -1

synthetic_rows = []
completed = False

# try/finally: a long run can fail part-way (API error, manual interrupt).
# Whatever has been generated so far is still written to disk, so the work
# is not lost; the exception itself is not swallowed.
try:
    for _, row in df.iterrows():
        original_text = row["Original"]
        for t_type in transform_types:
            if t_type == "Noisy":
                synthetic_text = generate_noisy_version(original_text)
            else:
                synthetic_text = llm_generate(original_text, t_type, model_name="gpt-4o", temperature=0.7)

            synthetic_rows.append({
                "Original": original_text,
                "Synthetic": synthetic_text,
                "Transform_Type": t_type,
                "Specialty": row["Specialty"],
                "Source": row["Source"]
            })

            # Progress as a percentage, refreshed in place on a single line;
            # only reprint when the integer percentage changes.
            done += 1
            pct = int(done * 100 / total_pairs)
            if pct != last_pct:
                print(f"\rProgress: {pct}%", end="", flush=True)
                last_pct = pct
    completed = True
finally:
    print()  # end the progress line

    synthetic_df = pd.DataFrame(synthetic_rows)
    synthetic_df.to_csv("synthetic_reports.csv", index=False)
    if completed:
        print(f"✅ 生成完成：{len(synthetic_df)} 条合成报告，输出 synthetic_reports.csv")
    else:
        print(f"⚠️ Stopped after {done}/{total_pairs} pairs; partial results saved to synthetic_reports.csv")



Progress: 90%

APITimeoutError: Request timed out.

In [2]:
# Smoke test on a single report (change the index to try a different one).
row = df.iloc[0]
original_text, specialty, source = row["Original"], row["Specialty"], row["Source"]

# The five transform categories to exercise.
transform_types = ["Paraphrased", "Clinically Significant", "Noisy", "Neutral Distractor", "Hallucinated"]

test_rows = []
for t_type in transform_types:
    # "Noisy" is produced by the local script; everything else by the LLM
    # with its default model and temperature.
    synthetic_text = (
        generate_noisy_version(original_text)
        if t_type == "Noisy"
        else llm_generate(original_text, t_type)
    )
    record = {
        "Original": original_text,
        "Synthetic": synthetic_text,
        "Transform_Type": t_type,
        "Specialty": specialty,
        "Source": source
    }
    test_rows.append(record)

# Collect into a DataFrame; the bare expression displays it in the notebook.
test_df = pd.DataFrame(test_rows)
test_df


Unnamed: 0,Original,Synthetic,Transform_Type,Specialty,Source
0,"A right chest tube remains in place, a small r...","A right chest tube is still present, and a sma...",Paraphrased,chest x-ray,CXRdf
1,"A right chest tube remains in place, a small r...","A right chest tube has been removed, and a lar...",Clinically Significant,chest x-ray,CXRdf
2,"A right chest tube remains in place, a small r...","A right chest tube remains in place, a small r...",Noisy,chest x-ray,CXRdf
3,"A right chest tube remains in place, a small r...",The right chest tube remains in place; a small...,Neutral Distractor,chest x-ray,CXRdf
4,"A right chest tube remains in place, a small r...","A right chest tube remains in place, with a sm...",Hallucinated,chest x-ray,CXRdf
