In [1]:
%pip -q install -U metapub pdfplumber pandas lxml requests beautifulsoup4 tqdm
%pip -q install -U "transformers>=4.38" "accelerate>=0.26" "bitsandbytes>=0.43" "safetensors"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Step 1：建立目录与配置

In [2]:
import os, json, re, time
from pathlib import Path

# Corpus layout: a single root folder (relative to the working directory)
# holding one sub-folder for per-paper data and one for aggregated outputs.
CORPUS_DIR = Path("Annane_Sepsis_Corpus")
PAPERS_DIR = CORPUS_DIR.joinpath("papers")
OUT_DIR = CORPUS_DIR.joinpath("out")

# Create both working folders up front; re-running the cell is harmless.
for _dir in (PAPERS_DIR, OUT_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

# Echo the absolute locations so the reader can see where files will land.
for _label, _dir in (("CORPUS_DIR", CORPUS_DIR), ("PAPERS_DIR", PAPERS_DIR), ("OUT_DIR", OUT_DIR)):
    print(f"{_label}:", _dir.resolve())


CORPUS_DIR: D:\2026projects\courses-HoNLP\Annane_Sepsis_Corpus
PAPERS_DIR: D:\2026projects\courses-HoNLP\Annane_Sepsis_Corpus\papers
OUT_DIR: D:\2026projects\courses-HoNLP\Annane_Sepsis_Corpus\out


## Step 2: Topic 检索 + 下载 PDF + sections.json

In [3]:
from metapub import PubMedFetcher, FindIt
import requests
import pdfplumber
from tqdm.auto import tqdm

# Shared metapub PubMedFetcher instance; pmid_search() issues its queries through it.
fetcher = PubMedFetcher()

# PubMed query strings, one per corpus topic. Every query starts with the same
# author + sepsis clause and differs only in the topical filter appended to it.
_AUTHOR_SEPSIS_CLAUSE = '(Annane D[Author] OR Djillali A[Author]) AND (sepsis OR "septic shock")'

TOPICS = {
    topic_name: f"{_AUTHOR_SEPSIS_CLAUSE} AND {topic_clause}"
    for topic_name, topic_clause in (
        ("sepsis_diagnosis_criteria", '(diagnosis OR criteria OR definition OR "Sepsis-3")'),
        ("sepsis_treatment_plan", "(treatment OR therapy OR management OR guideline OR hydrocortisone)"),
        ("steroid_sensitivity", "(corticosteroid OR hydrocortisone OR steroid) AND (response OR sensitivity OR resistance)"),
    )
}

# How many papers to collect for each topic before moving on to the next one.
TARGET_PER_TOPIC = 10

def pmid_search(query, retmax=200):
    """Run `query` through the shared metapub fetcher and return the hits as a list of PMID strings.

    `retmax` is forwarded unchanged to `fetcher.pmids_for_query`.
    """
    return list(map(str, fetcher.pmids_for_query(query, retmax=retmax)))

def is_pdf_bytes(b: bytes) -> bool:
    """Return True when the payload begins with the ``%PDF`` magic marker."""
    magic = b"%PDF"
    return b[:len(magic)] == magic

def download_pdf_from_url(url, out_path: Path, timeout=60):
    """Fetch `url` and store the body at `out_path` only if it looks like a PDF.

    Returns True when the file was written, False when the response body does
    not start with the PDF marker. HTTP error statuses propagate through
    `raise_for_status()`.
    """
    response = requests.get(url, timeout=timeout, headers={"User-Agent":"Mozilla/5.0"})
    response.raise_for_status()
    payload = response.content
    looks_like_pdf = is_pdf_bytes(payload)
    if looks_like_pdf:
        out_path.write_bytes(payload)
    return looks_like_pdf

def find_pdf_url(pmid: str):
    """Ask metapub's FindIt for a full-text URL for `pmid`.

    Best-effort lookup: returns the URL when FindIt reports a non-empty one,
    and None both when there is no URL and when the lookup raises.
    """
    try:
        found_url = FindIt(pmid).url
    except Exception:
        return None
    return found_url if found_url else None

def pdf_to_text(pdf_path: Path, max_pages=10):
    """Extract text from the first `max_pages` pages of a PDF with pdfplumber.

    Pages whose extracted text is empty or whitespace-only are skipped; the
    remaining page texts are joined with newlines.
    """
    with pdfplumber.open(str(pdf_path)) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages[:max_pages]]
    return "\n".join(chunk for chunk in page_texts if chunk.strip())

def sectionize_simple(full_text: str):
    """Split extracted paper text into rough Abstract / Method / Conclusion parts.

    This is a deliberately crude keyword splitter, not a real heading detector:
    * everything before the first METHOD(S)/PATIENT(S)/DESIGN/SETTING keyword is
      the abstract;
    * the method part runs from that keyword to the first CONCLUSION(S)/RESULTS
      keyword found *after* it;
    * the rest is the conclusion part.

    When no method keyword exists, the first 4000 characters are returned as the
    abstract and the other sections stay empty.

    Returns a dict with exactly the keys "Abstract", "Method", "Conclusion".
    """
    t = re.sub(r"\s+", " ", full_text or "").strip()
    sections = {"Abstract":"", "Method":"", "Conclusion":""}

    m = re.search(r"\b(METHODS?|PATIENTS?|DESIGN|SETTING)\b", t, flags=re.I)
    if not m:
        sections["Abstract"] = t[:4000]  # fallback
        return sections

    sections["Abstract"] = t[:m.start()].strip()

    # Look for the conclusion marker only after the method marker. Searching
    # from the start of the text would stop at an early "results" inside the
    # abstract and leave the Conclusion section empty.
    c = re.compile(r"\b(CONCLUSION(S)?|RESULTS)\b", flags=re.I).search(t, m.end())
    if c:
        sections["Method"] = t[m.start():c.start()].strip()
        sections["Conclusion"] = t[c.start():].strip()
    else:
        sections["Method"] = t[m.start():].strip()

    return sections

def save_paper(pmid: str, topic: str, pdf_bytes_ok: bool, pdf_path: Path, sections: dict, meta: dict):
    """Persist one paper under ``PAPERS_DIR/PMID_<pmid>`` and return that folder.

    When `pdf_bytes_ok` is true the file at `pdf_path` is copied to
    ``paper.pdf`` inside the folder. A ``sections.json`` describing the paper
    (pmid, topic, stored PDF path or null, sectionizer name, sections, meta)
    is always written.
    """
    paper_dir = PAPERS_DIR / f"PMID_{pmid}"
    paper_dir.mkdir(parents=True, exist_ok=True)

    stored_pdf = paper_dir / "paper.pdf"
    recorded_pdf_path = None
    if pdf_bytes_ok:
        # The download lives in a scratch file; copy it next to sections.json.
        stored_pdf.write_bytes(pdf_path.read_bytes())
        recorded_pdf_path = str(stored_pdf)

    payload = json.dumps(
        {
            "pmid": pmid,
            "topic": topic,
            "pdf_path": recorded_pdf_path,
            "sectionizer": "simple_pdfplumber",
            "sections": sections,
            "meta": meta,
        },
        ensure_ascii=False,
        indent=2,
    )
    (paper_dir / "sections.json").write_text(payload, encoding="utf-8")
    return paper_dir

def already_ready(paper_dir: Path):
    """Return True when the folder already holds both ``paper.pdf`` and ``sections.json``."""
    required_files = ("paper.pdf", "sections.json")
    return all((paper_dir / name).exists() for name in required_files)

# Scratch file reused for every download attempt; save_paper() copies accepted
# PDFs out of it into the per-paper folder.
tmp_pdf = OUT_DIR / "tmp_download.pdf"

for topic, query in TOPICS.items():
    print("\n" + "="*60)
    print("Topic:", topic)
    print("Query:", query)

    pmids = pmid_search(query, retmax=500)
    print("PMIDs:", len(pmids))

    saved = 0
    for pmid in tqdm(pmids[:500], desc=f"collect {topic}"):
        paper_dir = PAPERS_DIR / f"PMID_{pmid}"
        if already_ready(paper_dir):
            # Already on disk (the folder name does not depend on the topic),
            # so it counts toward this topic's quota without a new download.
            saved += 1
            if saved >= TARGET_PER_TOPIC:
                break
            continue

        url = find_pdf_url(pmid)
        if not url:
            continue

        # Deliberately best-effort: many publisher links fail or return HTML,
        # so any download problem simply moves on to the next PMID.
        try:
            ok_pdf = download_pdf_from_url(url, tmp_pdf)
        except Exception:
            ok_pdf = False

        if not ok_pdf:
            continue

        # Extract text, sectionize and persist. A single unreadable PDF must not
        # abort the whole collection run, so failures are reported and skipped.
        try:
            ft = pdf_to_text(tmp_pdf, max_pages=12)
            secs = sectionize_simple(ft)
            meta = {"findit_url": url}
            save_paper(pmid, topic, True, tmp_pdf, secs, meta)
        except Exception as exc:
            print(f"⚠️ {topic}: PMID {pmid} skipped ({type(exc).__name__}: {exc})")
            continue

        saved += 1
        print(f"✅ {topic}: PMID {pmid} saved ({saved}/{TARGET_PER_TOPIC})")
        if saved >= TARGET_PER_TOPIC:
            break

    print(f"==> Done {topic}: {saved}/{TARGET_PER_TOPIC}")


  from .autonotebook import tqdm as notebook_tqdm



Topic: sepsis_diagnosis_criteria
Query: (Annane D[Author] OR Djillali A[Author]) AND (sepsis OR "septic shock") AND (diagnosis OR criteria OR definition OR "Sepsis-3")
PMIDs: 141


collect sepsis_diagnosis_criteria:   0%|          | 0/141 [00:00<?, ?it/s][32m2026-02-10 10:27:33[0m [35mLAPTOP-4NMFDBK9[0m [34mmetapub.findit[2552][0m [1;30mINFO[0m FindIt Cache initialized at C:\Users\27858\.cache\findit.db
collect sepsis_diagnosis_criteria:  18%|█▊        | 26/141 [00:14<01:06,  1.73it/s]


==> Done sepsis_diagnosis_criteria: 10/10

Topic: sepsis_treatment_plan
Query: (Annane D[Author] OR Djillali A[Author]) AND (sepsis OR "septic shock") AND (treatment OR therapy OR management OR guideline OR hydrocortisone)
PMIDs: 228


collect sepsis_treatment_plan:  14%|█▎        | 31/228 [00:15<01:41,  1.94it/s]


==> Done sepsis_treatment_plan: 10/10

Topic: steroid_sensitivity
Query: (Annane D[Author] OR Djillali A[Author]) AND (sepsis OR "septic shock") AND (corticosteroid OR hydrocortisone OR steroid) AND (response OR sensitivity OR resistance)
PMIDs: 75


collect steroid_sensitivity:  31%|███       | 23/75 [00:10<00:23,  2.25it/s]

==> Done steroid_sensitivity: 10/10





## 读取一篇 READY paper

In [4]:
def get_first_ready_paper():
    """Return ``(paper_dir, sections_record)`` for the first usable paper.

    Folders are visited in sorted order. A paper is usable when it has both
    ``sections.json`` and ``paper.pdf`` and at least one section with
    non-blank text. Raises RuntimeError when no folder qualifies.
    """
    for paper_dir in sorted(PAPERS_DIR.glob("PMID_*")):
        sections_file = paper_dir / "sections.json"
        if not sections_file.exists() or not (paper_dir / "paper.pdf").exists():
            continue
        record = json.loads(sections_file.read_text(encoding="utf-8"))
        for section_text in (record.get("sections") or {}).values():
            if (section_text or "").strip():
                return paper_dir, record
    raise RuntimeError("No READY paper found")

def clean_pdf_text(t: str) -> str:
    """Drop boilerplate lines from extracted PDF text and return the rest as one string.

    The text is processed line by line: whitespace inside each line is
    collapsed, then a line is discarded when it matches a known header/footer
    pattern (journal running head, DOI, volume, copyright, ...) or is shorter
    than 25 characters (typically author fragments or page furniture). The
    surviving lines are joined with single spaces.

    Whitespace is normalized per line rather than across the whole text.
    Collapsing newlines first would merge everything into a single line, so one
    leading header such as "Critical Care ..." would delete the entire document.
    """
    t = (t or "").replace("\x00", " ")

    # Conservative list of header / citation / affiliation line shapes to drop.
    drop_patterns = [
        re.compile(p)
        for p in (
            r"^critical care.*$",                 # journal running head
            r"^intensive care med.*$",            # journal running head
            r"^doi\s*:\s*\S+.*$",                 # DOI line
            r"^vol\s*\d+.*$",                     # volume line
            r"^review.*clinical review.*$",       # review banner
            r"^guidelines.*$",                    # guidelines banner
            r"^copyright.*$",                     # copyright
            r"^open access.*$",                   # open access
        )
    ]

    kept = []
    for raw_line in t.split("\n"):
        ln = re.sub(r"\s+", " ", raw_line).strip()
        if not ln:
            continue
        low = ln.lower()
        if any(p.match(low) for p in drop_patterns):
            continue
        # Very short lines look like author lists or page furniture; skip them.
        if len(ln) < 25:
            continue
        kept.append(ln)

    return " ".join(kept)

def paper_text_from_sections(obj, max_chars=12000):
    """Flatten a sections.json record into tagged text.

    Each non-empty known section is rendered as a ``[NAME]`` header line
    followed by its text, and sections are separated by a blank line:

        [ABSTRACT]
        ...

        [METHOD]
        ...

    Header-on-its-own-line is the layout parse_section_blocks recognises.
    Sections whose key is not in the known list are ignored. When no section
    has text, the record's top-level "abstract" field is used instead. The
    result is truncated to `max_chars` characters.
    """
    # Known section names, in output order (covers more than Method/Conclusion).
    known_sections = (
        "Abstract", "Introduction", "Background",
        "Methods", "Method", "Patients", "Materials",
        "Results", "Discussion", "Conclusion",
        "Recommendation", "Guideline",
    )
    section_map = obj.get("sections") or {}

    rendered = []
    for name in known_sections:
        body = (section_map.get(name) or "").strip()
        if body:
            rendered.append(f"[{name.upper()}]\n{body}")

    text = "\n\n".join(rendered).strip()

    if not text:
        # Fallback: no section text at all, use the record-level abstract.
        abstract_only = (obj.get("abstract") or "").strip()
        if abstract_only:
            text = f"[ABSTRACT]\n{abstract_only}"

    return text[:max_chars]





## 加载 Gemma（GPU 上用 float16，CPU 上用 float32；当前代码未启用 4-bit 量化）

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Location of the local Gemma checkpoint. Set the GEMMA_MODEL_PATH environment
# variable to run on another machine; the original path stays the default.
MODEL_PATH = os.environ.get("GEMMA_MODEL_PATH", r"C:\Users\27858\hf_gemma_2b_it")

# Device-dependent settings are decided once and reused below.
_use_cuda = torch.cuda.is_available()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Half precision with automatic device placement on GPU, full precision on CPU.
# NOTE(review): no quantization config is passed, so the weights are not
# loaded in 4-bit here.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype=torch.float16 if _use_cuda else torch.float32,
    device_map="auto" if _use_cuda else None,
    low_cpu_mem_usage=True,   # lower peak RAM while loading
)

if not _use_cuda:
    model.to("cpu")

model.eval()
print("Loaded Gemma OK.")


def gemma_generate(prompt: str, max_new_tokens=160, temperature=0.2, max_input_tokens=1024):
    """Generate a completion for `prompt` with the module-level Gemma model.

    Args:
        prompt: user message; wrapped in the tokenizer's chat template when one
            is available, otherwise tokenized as plain text.
        max_new_tokens: upper bound on generated tokens.
        temperature: sampling temperature; ``None`` or a value <= 0 selects
            greedy decoding.
        max_input_tokens: longer inputs keep only their last
            `max_input_tokens` tokens.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "user", "content": prompt}]
        enc = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        )
        # Depending on the tokenizer version this is a tensor or an encoding object.
        input_ids = enc.input_ids if hasattr(enc, "input_ids") else enc
    else:
        enc = tokenizer(prompt, return_tensors="pt")
        input_ids = enc["input_ids"]

    if input_ids.shape[-1] > max_input_tokens:
        # NOTE(review): keeping the tail drops the leading template/BOS tokens
        # and the start of the instructions - confirm this is intended.
        input_ids = input_ids[:, -max_input_tokens:]

    input_ids = input_ids.to(model.device)

    # Single un-padded sequence, so every position is a real token. Passing the
    # mask explicitly is needed because pad_token_id == eos_token_id below
    # prevents generate() from inferring it (it warns otherwise).
    attention_mask = torch.ones_like(input_ids)

    do_sample = temperature is not None and temperature > 0

    with torch.no_grad():
        out = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=float(temperature) if do_sample else 1.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated tokens.
    gen_ids = out[0, input_ids.shape[-1]:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()


# Smoke test: temperature=0.0 makes gemma_generate decode without sampling.
print(gemma_generate("输出一个词:OK", max_new_tokens=20, temperature=0.0))


Loading weights: 100%|██████████| 164/164 [00:09<00:00, 16.53it/s, Materializing param=model.norm.weight]                               


Loaded Gemma OK.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


OK 是一个词，表示同意或接受。


## 核心修复：句子编号法（保证 quote 100% 来自文章）

In [None]:
import re, json

# -----------------------------
# 1) 基础工具：句子切分 + 去噪
# -----------------------------
# Legacy noise markers from the first version of the sentence filter.
# NOTE(review): nothing in the visible notebook reads this list any more (the
# active filter, is_noise_sentence, relies on NOISE_PATTERNS). It is kept so
# that any external reference keeps working. The disabled first implementation
# that used to sit here inside a bare string literal has been removed.
BAD_PATTERNS = [
    r"\bdoi\b", r"\bvol\b", r"\bissue\b", r"\bpages?\b", r"copyright",
    r"\bpmid\b", r"\bpmc\b", r"http[s]?://", r"@", r"conflict of interest",
    r"funding", r"affiliation", r"corresponding author"
]

# Regexes (matched against lower-cased sentences) marking publisher / header
# boilerplate that should never be picked as a claim or evidence quote.
NOISE_PATTERNS = [
    r"\bissn\b", r"\bdoi\b", r"\bvol\.?\b", r"\bissue\b",
    r"author manuscript", r"published in final edited form",
    r"springer", r"cochrane", r"wiley", r"elsevier",
    r"all rights reserved", r"copyright",
    r"\btable of contents\b", r"\bcontents\b",
    r"guidelines committee", r"surviving sepsis campaign",  # usually surface as titles / rosters
    r"\babstract\b.*\bonline\b",
    # Same header fused the other way round, e.g. "Online ISSN ... Abstract ...";
    # the pattern above only covers "abstract" appearing before "online".
    r"\bonline\b.*\babstract\b",
]

def is_noise_sentence(s: str) -> bool:
    """Heuristically decide whether a sentence is boilerplate rather than content.

    A sentence is noise when it is empty or shorter than 40 characters after
    stripping, when it looks like an author/affiliation roster (six or more
    commas and none of the content keywords), or when it matches any regex in
    NOISE_PATTERNS.
    """
    stripped = s.strip() if s else ""
    if len(stripped) < 40:
        return True

    low = stripped.lower()

    # Roster check: many commas and no word that signals an actual statement.
    content_keywords = ("increase", "reduce", "associated", "defined", "recommend",
                        "should", "mortality", "diagnos", "treat")
    comma_heavy = low.count(",") >= 6
    if comma_heavy and not any(word in low for word in content_keywords):
        return True

    return any(re.search(pattern, low) for pattern in NOISE_PATTERNS)


def split_sentences_en(text: str):
    """Split mostly-English text into sentences and drop short or noisy ones.

    Line breaks are flattened to spaces, then the text is cut after ``.``,
    ``?``, ``!`` or ``;`` when followed by whitespace. Pieces of 20 characters
    or fewer and pieces rejected by is_noise_sentence are discarded.
    """
    cleaned = (text or "").replace("\x00", " ")
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned).strip()
    if not cleaned:
        return []

    # Flatten line blocks first, then cut on sentence-ending punctuation.
    flat = " ".join(chunk.strip() for chunk in re.split(r"[\r\n]+", cleaned) if chunk.strip())

    sentences = []
    for piece in re.split(r"(?<=[\.\?\!])\s+|(?<=;)\s+", flat):
        piece = piece.strip() if piece else ""
        if len(piece) > 20 and not is_noise_sentence(piece):
            sentences.append(piece)
    return sentences

# -----------------------------
# 2) 从你现有 text（包含 [SECTION]）中解析出 block
# -----------------------------
def parse_section_blocks(text: str):
    """Split ``[SECTION]``-tagged text into ``(section_name, section_text)`` pairs.

    Two header layouts are accepted:
      1) ``[METHOD]`` alone on a line, body on the following lines;
      2) ``[METHOD] patients...`` with the body starting on the same line.

    A bracket only counts as a header when its content starts with a letter,
    so numeric citations at the start of a line (``[12] Smith et al. ...``)
    stay inside the current section instead of opening a bogus one.

    Text before the first header is reported under "Unknown". Section names
    are title-cased ("ABSTRACT" -> "Abstract"). Headers without any body text
    produce no entry. Returns an empty list for empty input.
    """
    if not text:
        return []

    # Header = '[' + name starting with a (unicode) letter + ']' + optional body.
    header_re = re.compile(r"^\[([^\W\d_][^\[\]]*)\]\s*(.*)$")

    blocks = []
    cur_sec = "Unknown"
    buf = []

    def flush():
        nonlocal buf, cur_sec
        if buf:
            blocks.append((cur_sec.title(), "\n".join(buf).strip()))
            buf = []

    for line in text.splitlines():
        line = line.rstrip()

        m = header_re.match(line.strip())
        if m:
            flush()
            cur_sec = m.group(1).strip()
            rest = (m.group(2) or "").strip()
            if rest:
                buf.append(rest)  # body text sharing the header line
        else:
            if line.strip():
                buf.append(line)

    flush()
    return blocks


# -----------------------------
# 3) 构造“编号句子列表”，按 section 权重优先
# -----------------------------
# Section priority used when ranking candidate sentences (higher = preferred).
# Declared by tier so each section's relative standing is visible at a glance.
_SEC_WEIGHT_TIERS = (
    (5, ("Abstract", "Conclusion", "Recommendation", "Guideline")),
    (4, ("Results",)),
    (3, ("Discussion",)),
    (2, ("Introduction", "Background", "Unknown")),
    (1, ("Method", "Methods", "Patients", "Materials")),
)
SEC_WEIGHT = {
    section: weight
    for weight, sections in _SEC_WEIGHT_TIERS
    for section in sections
}

def _split_sentences_unfiltered(text):
    """Cut text after ., ?, ! or ; (followed by whitespace) without any noise filtering."""
    flat = " ".join(part.strip() for part in re.split(r"[\r\n]+", (text or "").replace("\x00", " ")) if part.strip())
    flat = re.sub(r"[ \t]+", " ", flat)
    return [p.strip() for p in re.split(r"(?<=[\.\?\!])\s+|(?<=;)\s+", flat) if p and p.strip()]


def build_indexed_sentences_by_section(text: str, max_sents=80):
    """Build the ranked, numbered sentence list for a ``[SECTION]``-tagged text.

    Sentences are collected per section, ranked by SEC_WEIGHT (stable, so the
    document order is kept within a weight tier) and capped at `max_sents`.

    If the noise filter leaves fewer than 8 candidates, the text is re-split
    with no noise filtering at all (only a 25-character minimum). The
    fallback uses its own splitter because split_sentences_en applies the
    noise filter internally, so calling it again could never recover the
    rejected sentences.

    Returns:
        (sents, indexed): `sents` is a list of ``{"sec", "sent"}`` dicts and
        `indexed` is the same list rendered as ``"<n>. [SEC] sentence"`` lines.
        Both are empty when no section block is found.
    """
    blocks = parse_section_blocks(text)
    if not blocks:
        return [], ""

    # (weight, section, sentence); split_sentences_en already removes noise.
    cand = [
        (SEC_WEIGHT.get(sec, 2), sec, s)
        for sec, sec_text in blocks
        for s in split_sentences_en(sec_text)
    ]

    # Filter was too aggressive: degrade to unfiltered sentences.
    if len(cand) < 8:
        cand = [
            (SEC_WEIGHT.get(sec, 2), sec, s)
            for sec, sec_text in blocks
            for s in _split_sentences_unfiltered(sec_text)
            if len(s) >= 25
        ]

    cand.sort(key=lambda x: x[0], reverse=True)
    cand = cand[:max_sents]

    # Keep the section next to each sentence for later quoting.
    sents = [{"sec": sec, "sent": sent} for _, sec, sent in cand]

    indexed = "\n".join(
        f"{i}. [{item['sec'].upper()}] {item['sent']}"
        for i, item in enumerate(sents, start=1)
    )

    return sents, indexed

# -----------------------------
# 4) 让 Gemma 只输出固定文本格式（避免 JSON 崩）
# -----------------------------
def build_quote_pick_prompt(paper_id, topic, indexed_sents_text):
    """Build the prompt asking the model to pick one CLAIM and three evidence sentence numbers.

    The model is told to answer in a fixed four-line ``CLAIM:/E1:/E2:/E3:``
    format (no JSON), which is what parse_pick_output reads. The returned
    prompt is stripped of leading/trailing whitespace.
    """
    prompt_lines = [
        "你是医学论证结构抽取助手。",
        "",
        "请从下面“编号句子”里挑选：",
        "- CLAIM：1句话，必须是观点/结论/诊断依据/治疗建议相关，不要选作者/期刊/DOI/引用列表。",
        "- E1/E2/E3：3句话，分别作为支持或补充 CLAIM 的证据（尽量来自不同 section）。",
        "",
        "严格按照下面格式输出（不要输出其它内容，不要JSON，不要解释）：",
        "CLAIM: <编号>",
        "E1: <编号>",
        "E2: <编号>",
        "E3: <编号>",
        "",
        f"paper_id={paper_id}",
        f"topic={topic}",
        "",
        "编号句子：",
        f"{indexed_sents_text}",
    ]
    return "\n".join(prompt_lines).strip()

def parse_pick_output(raw: str, n_sents: int):
    """Parse the model's sentence picks and repair them into valid indices.

    Expected model output:
        CLAIM: 12
        E1: 5
        E2: 8
        E3: 3

    Args:
        raw: model output text.
        n_sents: number of numbered sentences that were offered (1-based ids).

    Returns:
        ``(claim_id, [e1, e2, e3])``. Every id lies in ``1..n_sents``. The
        evidence ids never equal the claim id and never repeat, as long as
        enough distinct sentences exist; with fewer than four sentences the
        list is padded by repeating the last evidence id (or the claim id when
        there is only one sentence).

    Raises:
        ValueError: when `raw` is empty or `n_sents` is smaller than 1.
    """
    if not raw:
        raise ValueError("Empty model output")
    if n_sents < 1:
        raise ValueError("n_sents must be >= 1")

    def grab(tag):
        m = re.search(rf"\b{tag}\s*:\s*(\d+)", raw, flags=re.IGNORECASE)
        return int(m.group(1)) if m else None

    def clamp(i):
        return max(1, min(i, n_sents))

    # Missing (or zero) claim falls back to the first sentence.
    claim_id = clamp(grab("CLAIM") or 1)

    # Keep the model's picks in order, clamped, without repeats or the claim.
    ev_ids = []
    for tag in ("E1", "E2", "E3"):
        picked = grab(tag)
        if picked is None:
            continue
        picked = clamp(picked)
        if picked != claim_id and picked not in ev_ids:
            ev_ids.append(picked)

    # Fill the gaps with the lowest unused sentence numbers.
    for i in range(1, n_sents + 1):
        if len(ev_ids) >= 3:
            break
        if i != claim_id and i not in ev_ids:
            ev_ids.append(i)

    # Not enough distinct sentences: repeat an in-range id instead of inventing one.
    while len(ev_ids) < 3:
        ev_ids.append(ev_ids[-1] if ev_ids else claim_id)

    return claim_id, ev_ids[:3]

# -----------------------------
# 5) 主函数：pick_quotes_by_ids（你后面流程直接用它）
# -----------------------------
def pick_quotes_by_ids(paper_id, topic, text, max_sents=80):
    """Pick one claim quote and exactly three evidence quotes from a tagged paper text.

    Selection is purely heuristic (no model call): sentences arrive ranked by
    section weight from build_indexed_sentences_by_section, the first one
    becomes the claim, and evidence is taken from the following sentences,
    preferring sections other than the claim's.

    Returns a dict with "paper_id", "topic", "claim_quote",
    "evidence_quotes" (always three strings, each prefixed with its
    ``[SECTION]`` tag) and "debug". When no sentence can be extracted, the
    first 500 characters of `text` (or a fixed notice) are used for all quotes.
    """
    sents, indexed = build_indexed_sentences_by_section(text, max_sents=max_sents)

    # Fallback: nothing usable was extracted.
    if not sents:
        fallback = (text or "").strip()[:500] or "No usable text extracted from paper sections."
        return {
            "paper_id": paper_id,
            "topic": topic,
            "claim_quote": fallback,
            "evidence_quotes": [fallback, fallback, fallback],
            "debug": {"reason": "no_sents"}
        }

    claim = sents[0]
    rest = sents[1:]

    # First choice: sentences from a different section than the claim.
    evidences = [item for item in rest if item["sec"] != claim["sec"]][:3]

    # Top up in ranked order, skipping sentences that were already chosen.
    for item in rest:
        if len(evidences) >= 3:
            break
        if item not in evidences:
            evidences.append(item)

    # Fewer than four sentences overall: pad so callers can always index three
    # evidence quotes (build_ce_from_quotes_prompt reads positions 0..2).
    while len(evidences) < 3:
        evidences.append(evidences[-1] if evidences else claim)

    claim_quote = f"[{claim['sec'].upper()}] {claim['sent']}"
    evidence_quotes = [f"[{e['sec'].upper()}] {e['sent']}" for e in evidences[:3]]

    return {
        "paper_id": paper_id,
        "topic": topic,
        "claim_quote": claim_quote,
        "evidence_quotes": evidence_quotes,
        "debug": {"n_sents": len(sents), "mode": "heuristic"}
    }


# -----------------------------
# 6) （可选）JSON 提取工具：给后续 CE 用
#     如果你后面还需要模型输出 JSON，可以保留它
# -----------------------------
def extract_json_anywhere(text: str):
    """Extract the first JSON object embedded in model output.

    The text may carry prose before or after the object. Parsing starts at the
    first ``{`` and uses the real JSON decoder, so braces inside string values
    do not confuse the object boundary. If that fails, the output is assumed to
    be truncated and is repaired by closing an unterminated string and any
    unclosed braces before parsing again.

    Curly double quotes are normalized to straight quotes, but only after an
    attempt on the untouched text, so valid JSON that merely contains curly
    quotes inside a value is not damaged.

    Raises:
        ValueError: for empty input or when no ``{`` exists.
        json.JSONDecodeError (a ValueError): when the object cannot be parsed
        even after repair.
    """
    if not text:
        raise ValueError("Empty model output")

    stripped = text.strip()
    t = stripped.replace("“", '"').replace("”", '"')
    start = t.find("{")
    if start == -1:
        raise ValueError("No '{' found in output.")

    decoder = json.JSONDecoder()
    for candidate in (stripped, t):
        pos = candidate.find("{")
        if pos == -1:
            continue
        try:
            obj, _ = decoder.raw_decode(candidate, pos)
            return obj
        except json.JSONDecodeError:
            continue

    # Truncated output: count braces outside of strings and close what is open.
    depth = 0
    in_string = False
    escaped = False
    for ch in t[start:]:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1

    cand = t[start:]
    if escaped:
        cand = cand[:-1]  # drop a dangling backslash at the cut-off point
    if in_string:
        cand += '"'
    if depth > 0:
        cand += "}" * depth
    return json.loads(cand)


## Step 3：第二阶段——把 quote 变成 claim/evidence 的可视化结构（生成 argument_tree）

In [22]:
import re, json

def extract_first_json_object(raw: str):
    """Extract the first JSON object from model output that may include prose or a markdown fence.

    A leading ```` ``` ```` / ```` ```json ```` fence and a trailing fence are
    removed first. Parsing starts at the first ``{`` with the real JSON
    decoder, so braces inside string values do not end the object early. If
    decoding fails, the output is treated as truncated: an unterminated string
    and any unclosed braces are closed before a final parse attempt.

    Raises:
        ValueError: for empty input or when no ``{`` exists.
        json.JSONDecodeError (a ValueError): when parsing fails even after repair.
    """
    if not raw:
        raise ValueError("empty output")

    def unfence(s):
        s = re.sub(r"^```(?:json)?\s*", "", s)
        return re.sub(r"\s*```$", "", s)

    plain = unfence(raw.strip())
    t = unfence(raw.strip().replace("“", '"').replace("”", '"'))

    start = t.find("{")
    if start == -1:
        raise ValueError("no { found")

    # Try the untouched text first so curly quotes inside values survive.
    decoder = json.JSONDecoder()
    for candidate in (plain, t):
        pos = candidate.find("{")
        if pos == -1:
            continue
        try:
            obj, _ = decoder.raw_decode(candidate, pos)
            return obj
        except json.JSONDecodeError:
            continue

    # Truncated output: track string state so only structural braces are counted.
    depth = 0
    in_string = False
    escaped = False
    for ch in t[start:]:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1

    cand = t[start:]
    if escaped:
        cand = cand[:-1]  # drop a dangling backslash at the cut-off point
    if in_string:
        cand += '"'
    if depth > 0:
        cand += "}" * depth
    return json.loads(cand)

def normalize_ce_obj(obj: dict, paper_id: str, topic: str):
    """Bring a parsed claim/evidence object into the canonical shape.

    Output:
        {"paper_id": ..., "topic": ..., "claim_text": str,
         "evidence_summaries": [str, str, str]}

    Two input layouts are understood:
      A) {"paper_id", "topic", "claim_text", "evidence_summaries": [...]}
      B) {"CLAIM": "...", "E1": "...", "E2": "...", "E3": "..."}

    Layout B is chosen whenever a "CLAIM" or "E1" key is present; it always
    takes `paper_id`/`topic` from the arguments, while layout A prefers the
    values stored in `obj`. Missing evidence is padded with a generic sentence
    and an empty claim becomes "Claim.".

    Raises:
        ValueError: when `obj` is not a dict.
    """
    if not isinstance(obj, dict):
        raise ValueError("CE output is not dict")

    filler = "Supports the claim based on the quoted sentence."

    def _exactly_three(items):
        # Pad generously, then cut: same result as appending until length 3.
        return (list(items) + [filler] * 3)[:3]

    uses_upper_keys = "CLAIM" in obj or "E1" in obj
    if uses_upper_keys:
        claim = (obj.get("CLAIM") or obj.get("claim") or "").strip()
        stripped = [(obj.get(key) or "").strip() for key in ("E1", "E2", "E3")]
        return {
            "paper_id": paper_id,
            "topic": topic,
            "claim_text": claim or "Claim.",
            "evidence_summaries": _exactly_three(x for x in stripped if x),
        }

    claim = (obj.get("claim_text") or obj.get("claim") or "").strip()
    raw_evidence = obj.get("evidence_summaries") or obj.get("evidence") or []
    if isinstance(raw_evidence, str):
        raw_evidence = [raw_evidence]
    elif not isinstance(raw_evidence, list):
        raw_evidence = []
    evidence = [str(x).strip() for x in raw_evidence if str(x).strip()]

    return {
        "paper_id": obj.get("paper_id", paper_id),
        "topic": obj.get("topic", topic),
        "claim_text": claim or "Claim.",
        "evidence_summaries": _exactly_three(evidence),
    }

def parse_ce_text_fallback(raw: str):
    """Parse line-oriented model output when no JSON was produced.

    Expected shape:
        CLAIM: ...
        E1: ...
        E2: ...
        E3: ...

    Matching is case-insensitive. The last ``CLAIM:`` line wins, every
    ``E<number>:`` line contributes one evidence entry in order, and the
    evidence list is padded or cut to exactly three items.

    Returns:
        ``(claim, [e1, e2, e3])``; `claim` is "" when no claim line exists.
    """
    filler = "Supports the claim based on the quoted sentence."
    claim = ""
    evidence = []

    for line in map(str.strip, raw.splitlines()):
        if not line:
            continue
        upper = line.upper()
        if upper.startswith("CLAIM:"):
            claim = line.partition(":")[2].strip()
        elif re.match(r"^E\d+\s*:", upper):
            evidence.append(line.partition(":")[2].strip())

    evidence.extend([filler] * (3 - len(evidence)))
    return claim, evidence[:3]

def build_ce_from_quotes_prompt(paper_id, topic, claim_quote, evidence_quotes):
    """Build the prompt asking the model to summarize a claim quote and three evidence quotes as JSON.

    The model is instructed to answer with a strict JSON object using the keys
    "CLAIM", "E1", "E2", "E3".

    `evidence_quotes` may hold fewer than three entries (or be None); missing
    slots are filled with `claim_quote`, the same padding build_tree_from_ce
    applies, so a short list no longer raises IndexError. Extra entries beyond
    the third are ignored. `paper_id` and `topic` are accepted for interface
    symmetry and do not appear in the prompt.
    """
    quotes = list(evidence_quotes or [])
    while len(quotes) < 3:
        quotes.append(claim_quote)

    return f"""
你是医学论证结构抽取助手。

下面给你：
- claim 原文句
- 3条 evidence 原文句

请你只输出“严格 JSON”，不要输出解释/markdown。
必须使用下面键名（大小写必须一致）：
{{
  "CLAIM": "用一句话概括 claim",
  "E1": "用一句话概括 evidence1 如何支持/反驳",
  "E2": "用一句话概括 evidence2 如何支持/反驳",
  "E3": "用一句话概括 evidence3 如何支持/反驳"
}}

claim 原文：
\"\"\"{claim_quote}\"\"\"

evidence 原文：
1) \"\"\"{quotes[0]}\"\"\"
2) \"\"\"{quotes[1]}\"\"\"
3) \"\"\"{quotes[2]}\"\"\"
""".strip()

def gemma_json_call_ce(prompt, max_new_tokens=220, temperature=0.2, retries=3):
    """Ask Gemma for claim/evidence summaries and parse whatever shape comes back.

    Always returns:
        {"claim_text": str, "evidence_summaries": [e1, e2, e3]}

    Accepted model output shapes, tried in this order on every attempt:
      A) JSON with "CLAIM"/"E1"/"E2"/"E3" keys (even if not fully valid JSON);
      B) JSON with "claim_text"/"evidence_summaries";
      C) plain text lines ``CLAIM: ...`` / ``E1: ...`` / ``E2: ...`` / ``E3: ...``.

    When nothing can be parsed the call is retried (up to `retries` extra
    times) with a stricter format instruction placed in front of the original
    prompt, a lower temperature and a slightly smaller token budget. The
    instruction is added once to the original prompt rather than stacked on
    every retry, so the prompt does not keep growing.

    Raises:
        RuntimeError: when every attempt fails; the message carries the start
        of the last raw output.
    """
    base_prompt = prompt
    retry_prefix = (
        "只输出下面格式（不要解释，不要总结，不要markdown）：\n"
        "CLAIM: ...\nE1: ...\nE2: ...\nE3: ...\n\n"
    )
    last = None

    def _normalize(claim_text, ev_list):
        """Coerce to the canonical shape: non-empty claim and exactly three evidence strings."""
        claim_text = str(claim_text or "").strip()
        if isinstance(ev_list, str):
            ev_list = [ev_list]  # a bare string must not be split into characters
        elif not isinstance(ev_list, (list, tuple)):
            ev_list = []
        ev_list = [str(x).strip() for x in ev_list if x is not None and str(x).strip()]

        if not claim_text:
            claim_text = "Claim."

        while len(ev_list) < 3:
            ev_list.append("Supports the claim.")
        return {"claim_text": claim_text, "evidence_summaries": ev_list[:3]}

    def _regex_grab(raw: str):
        """Pull CLAIM/E1/E2/E3 values straight out of the text without requiring valid JSON."""
        t = raw.replace("\r", "\n")

        def grab(key):
            # Quoted form first: "CLAIM": "...", matched non-greedily up to the
            # closing quote that is followed by a comma, newline or brace.
            m = re.search(rf'"{key}"\s*:\s*"(.+?)"\s*(,|\n|}})', t, flags=re.S)
            if m:
                return m.group(1).strip()

            # Then the unquoted line form: CLAIM: xxx
            m = re.search(rf'^{key}\s*:\s*(.+)$', t, flags=re.I|re.M)
            if m:
                return m.group(1).strip()

            return ""

        claim = grab("CLAIM")
        e1 = grab("E1")
        e2 = grab("E2")
        e3 = grab("E3")

        ev = [x for x in [e1, e2, e3] if x]
        if claim or ev:
            return _normalize(claim, ev)

        return None

    for attempt in range(retries + 1):
        raw = gemma_generate(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
        last = raw
        print(f"[CE attempt {attempt}] head:", raw[:120].replace("\n", " "))

        # ---- 1) strict / repaired JSON ----
        obj = None
        try:
            obj = extract_json_anywhere(raw)
        except Exception:
            obj = None

        if isinstance(obj, dict):
            # Already in the canonical field layout.
            if "claim_text" in obj and "evidence_summaries" in obj:
                return _normalize(obj.get("claim_text"), obj.get("evidence_summaries"))

            # CLAIM/E1/E2/E3 layout; null values are skipped instead of
            # becoming the literal string "None".
            claim = obj.get("CLAIM") or obj.get("claim") or obj.get("Claim")
            ev = []
            for k in ["E1","E2","E3","e1","e2","e3"]:
                if obj.get(k) is not None and str(obj[k]).strip():
                    ev.append(str(obj[k]).strip())
            if claim or ev:
                return _normalize(claim, ev)

        # ---- 2) invalid JSON: regex extraction of CLAIM/E1/E2/E3 ----
        grabbed = _regex_grab(raw)
        if grabbed is not None:
            return grabbed

        # ---- 3) plain text CLAIM:/E1:... lines (also handles indented lines) ----
        claim_text = ""
        ev = []
        for line in raw.splitlines():
            l = line.strip()
            if not l:
                continue
            if l.upper().startswith("CLAIM:"):
                claim_text = l.split(":", 1)[1].strip()
            elif re.match(r"^E[1-3]:", l, flags=re.I):
                ev.append(l.split(":", 1)[1].strip())

        if claim_text or ev:
            return _normalize(claim_text, ev)

        # ---- 4) retry with a stricter instruction and a lower temperature ----
        prompt = retry_prefix + base_prompt
        max_new_tokens = max(160, int(max_new_tokens * 0.85))
        temperature = 0.1

    raise RuntimeError("CE parse failed. Last output head:\n" + (last or "")[:800])



def parse_ce_text_output(text, paper_id, topic):
    """Parse the model's claim/evidence answer into the canonical CE dict.

    The CE prompt asks for JSON, so a JSON object with "CLAIM"/"E<n>" keys is
    tried first. When no complete object is present (plain text or truncated
    JSON) the text is read line by line, accepting both ``CLAIM: ...`` and
    ``"CLAIM": "...",`` style lines. Only ``CLAIM`` and ``E<number>`` keys are
    recognised (case-insensitive); other lines that merely start with "E" are
    ignored.

    Returns:
        {"paper_id", "topic", "claim_text", "evidence_summaries"} with exactly
        three evidence strings; gaps are padded with "Supports the claim." and
        a missing claim becomes "Claim.".
    """
    text = text or ""
    claim_text = ""
    evidences = []

    # 1) Complete JSON object somewhere in the output.
    lo, hi = text.find("{"), text.rfind("}")
    parsed = None
    if 0 <= lo < hi:
        try:
            parsed = json.loads(text[lo:hi + 1])
        except ValueError:
            parsed = None
    if isinstance(parsed, dict):
        by_key = {str(k).upper(): v for k, v in parsed.items()}
        claim_text = str(by_key.get("CLAIM") or "").strip()
        ev_keys = sorted((k for k in by_key if re.fullmatch(r"E\d+", k)), key=lambda k: int(k[1:]))
        for key in ev_keys:
            value = str(by_key[key] or "").strip()
            if value:
                evidences.append(value)

    # 2) Line-based parsing (plain text answers and cut-off JSON).
    if not (claim_text or evidences):
        line_re = re.compile(r'^[\s{,"\']*(CLAIM|E\d+)["\']?\s*:\s*(.*)$', re.I)
        for line in text.splitlines():
            m = line_re.match(line.strip())
            if not m:
                continue
            value = m.group(2).strip()
            if value[:1] in ('"', "'"):
                # JSON-style value: drop the trailing comma/brace and the quotes.
                value = value.rstrip(",} \t").strip("\"'").strip()
            if m.group(1).upper() == "CLAIM":
                claim_text = value
            else:
                evidences.append(value)

    while len(evidences) < 3:
        evidences.append("Supports the claim.")

    return {
        "paper_id": paper_id,
        "topic": topic,
        "claim_text": claim_text or "Claim.",
        "evidence_summaries": evidences[:3],
    }



def build_tree_from_ce(ce_obj, claim_quote, evidence_quotes, paper_id=None, topic=None):
    """Assemble the argument tree: one claim root with three supporting evidence children.

    Args:
        ce_obj: dict with "claim_text" and "evidence_summaries" (and optionally
            "paper_id"/"topic", used when the keyword arguments are not given).
        claim_quote: verbatim sentence backing the claim.
        evidence_quotes: verbatim evidence sentences; padded with `claim_quote`
            when fewer than three are supplied.
        paper_id, topic: override the values stored in `ce_obj`.

    Returns:
        {"paper_id", "topic", "root"} where the root's text falls back to the
        first 160 characters of `claim_quote` if the claim summary is empty,
        and each child text is cut to 220 characters.

    `ce_obj` is left untouched: the summaries are copied before padding, so the
    caller's list does not grow as a side effect.
    """
    paper_id = paper_id or ce_obj.get("paper_id", "PMID:UNKNOWN")
    topic = topic or ce_obj.get("topic", "UNKNOWN")

    # Guarantee three quotes so the indexing below cannot fail.
    evidence_quotes = list(evidence_quotes or [])
    while len(evidence_quotes) < 3:
        evidence_quotes.append(claim_quote)

    claim_text = str(ce_obj.get("claim_text") or "").strip()  # tolerate None

    root = {
        "id": "C0",
        "type": "claim",
        "text": claim_text or claim_quote[:160],
        "source": {"paper_id": paper_id, "section": "Unknown", "quote": claim_quote},
        "children": []
    }

    raw_sums = ce_obj.get("evidence_summaries") or []
    sums = [raw_sums] if isinstance(raw_sums, str) else list(raw_sums)
    while len(sums) < 3:
        sums.append("Supports the claim based on the quoted sentence.")

    for i in range(3):
        root["children"].append({
            "id": f"E{i+1}",
            "type": "evidence",
            "polarity": "support",
            "text": str(sums[i])[:220],
            "source": {"paper_id": paper_id, "section": "Unknown", "quote": evidence_quotes[i]},
            "children": []
        })

    return {"paper_id": paper_id, "topic": topic, "root": root}



def validate_tree_min(tree):
    """Minimal structural check of an argument tree.

    Returns ``(ok, message)``: the root must exist with type "claim" and must
    carry a list of at least three children.
    """
    root = tree.get("root")
    root_is_claim = bool(root) and root.get("type") == "claim"
    if not root_is_claim:
        return False, "root invalid"

    children = root.get("children")
    has_enough_evidence = isinstance(children, list) and len(children) >= 3
    if not has_enough_evidence:
        return False, "need >=3 evidence"

    return True, "ok"



## 单篇测试：生成 argument_tree.json

In [23]:
# Single-paper dry run: pick quotes, summarize them with Gemma, build and save the tree.
paper_dir, obj = get_first_ready_paper()
paper_id = f"PMID:{obj['pmid']}"
topic = obj["topic"]
text = paper_text_from_sections(obj)

print("TEXT LEN:", len(text), "HEAD:", text[:200].replace("\n"," "))

q_obj = pick_quotes_by_ids(paper_id, topic, text, max_sents=80)
print("Claim quote:", q_obj["claim_quote"][:200])
print("Evidence 1:", q_obj["evidence_quotes"][0][:200])

ce_prompt = build_ce_from_quotes_prompt(paper_id, topic, q_obj["claim_quote"], q_obj["evidence_quotes"])
raw = gemma_generate(ce_prompt, max_new_tokens=220, temperature=0.2)
print("[CE raw head]", raw[:200])

# The prompt requests JSON, so parse JSON first. The line-based parser alone
# cannot read `"CLAIM": "..."` lines and would silently store placeholder
# text ("Claim." / "Supports the claim.") in the saved tree.
try:
    ce = normalize_ce_obj(extract_first_json_object(raw), paper_id, topic)
except (ValueError, AttributeError, TypeError):
    ce = parse_ce_text_output(raw, paper_id, topic)

tree = build_tree_from_ce(ce, q_obj["claim_quote"], q_obj["evidence_quotes"], paper_id=paper_id, topic=topic)

ok, msg = validate_tree_min(tree)
print("validate:", ok, msg)

out_path = paper_dir / "argument_tree.json"
out_path.write_text(json.dumps(tree, ensure_ascii=False, indent=2), encoding="utf-8")
print("saved:", out_path)


TEXT LEN: 12000 HEAD: [ABSTRACT] Critical Care April 2004 Vol 8 No 2 Prigent et al. Review Clinical review: Corticotherapy in sepsis Helene Prigent1, Virginie Maxime2 and Djillali Annane3 1Senior Resident, Service de Réani
Claim quote: [ABSTRACT] Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the main controversial issues in critical care medicine.
Evidence 1: [ABSTRACT] These agents were commonly used to treat sepsis until the end of the 1980s, when several randomized trials casted serious doubt on any benefit from high-dose glucocorticoids.
[CE raw head] {
  "CLAIM": "Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the main controversial issues in critical care medicine.",
  "E1": "These agents we
validate: True ok
saved: Annane_Sepsis_Corpus\papers\PMID_15025773\argument_tree.json


In [24]:
# Run the ce_prompt built in the previous cell through gemma_json_call_ce
# (up to 3 retries) and show the normalized result.
ce = gemma_json_call_ce(ce_prompt, max_new_tokens=220, temperature=0.2, retries=3)
print(ce)

[CE attempt 0] head: {   "CLAIM": "Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the
{'claim_text': 'Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the main controversial issues in critical care medicine.', 'evidence_summaries': ['These agents were commonly used to treat sepsis until the end of the 1980s, when several randomized trials casted serious doubt on any benefit from high-dose glucocorticoids.', 'Later, important progress in our understanding of the role played by the hypothalamic–pituitary– adrenal axis in the response to sepsis, and of the mechanisms of action of glucocorticoids led us to reconsider their use in septic shock.', 'The present review summarizes the basics of the physiological response of the hypothalamic–pituitary–adrenal axis to stress, including regulation of glucocorticoid synthesis, the cellular mechanisms of action of glucocorticoids, and how th

## Step 4：批量生成 out/argument_trees.jsonl（并为每篇文章生成 argument_tree.json）

In [None]:
OUT_JSONL = OUT_DIR / "argument_trees.jsonl"

def iter_ready_papers(max_chars=12000, min_chars=400):
    """Yield ``(paper_dir, sections_obj, text)`` for every paper ready for extraction.

    A paper is "ready" when its ``PMID_*`` folder under ``PAPERS_DIR`` contains a
    readable ``sections.json`` and the text produced by
    ``paper_text_from_sections`` is at least ``min_chars`` characters long.

    Parameters
    ----------
    max_chars : int
        Forwarded to ``paper_text_from_sections`` as ``max_chars``.
    min_chars : int
        Papers whose text is shorter than this are skipped.

    Yields
    ------
    tuple
        ``(paper_dir, obj, text)`` where ``obj`` is the parsed ``sections.json``.
    """
    for paper_dir in sorted(PAPERS_DIR.glob("PMID_*")):
        sections_path = paper_dir / "sections.json"
        if not sections_path.exists():
            continue
        try:
            obj = json.loads(sections_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses. One unreadable file must not abort the whole batch.
            print("⚠️ skip unreadable sections.json:", sections_path, "->", repr(exc))
            continue
        text = paper_text_from_sections(obj, max_chars=max_chars)
        if len(text) < min_chars:
            continue
        yield paper_dir, obj, text

ok_cnt, fail_cnt = 0, 0

# Mode "w" truncates the combined JSONL, so every run rebuilds it from scratch.
with OUT_JSONL.open("w", encoding="utf-8") as f:
    for paper_dir, obj, text in iter_ready_papers():
        # Provisional label (folder name) so a failure is still attributable
        # when sections.json has no usable "pmid".
        paper_id = paper_dir.name

        try:
            # Metadata lookups live inside the try: a sections.json without
            # "pmid"/"topic" now counts as one FAIL instead of aborting the batch.
            paper_id = f"PMID:{obj['pmid']}"
            topic = obj["topic"]

            print("\n" + "="*28)
            print("Processing:", paper_id)
            print("TEXT LEN:", len(text))

            # 1) Pick the claim / evidence quotes.
            q_obj = pick_quotes_by_ids(paper_id, topic, text, max_sents=80)
            print("Claim quote:", q_obj["claim_quote"][:120])
            print("Evidence 1:", q_obj["evidence_quotes"][0][:120])

            # 2) Quotes -> claim / evidence summaries.
            ce_prompt = build_ce_from_quotes_prompt(
                paper_id, topic,
                q_obj["claim_quote"],
                q_obj["evidence_quotes"]
            )

            ce = gemma_json_call_ce(ce_prompt, max_new_tokens=220, temperature=0.2, retries=3)

            # Fill in paper_id/topic on the parsed output so later key lookups
            # do not raise KeyError.
            ce["paper_id"] = paper_id
            ce["topic"] = topic

            # 3) Assemble the tree (quotes are written into it).
            tree = build_tree_from_ce(ce, q_obj["claim_quote"], q_obj["evidence_quotes"], paper_id=paper_id, topic=topic)

            ok2, msg2 = validate_tree_min(tree)
            if not ok2:
                raise ValueError(msg2)

            # Per-paper pretty-printed copy, then one compact line in the JSONL.
            (paper_dir / "argument_tree.json").write_text(
                json.dumps(tree, ensure_ascii=False, indent=2),
                encoding="utf-8"
            )
            f.write(json.dumps(tree, ensure_ascii=False) + "\n")
            f.flush()

            ok_cnt += 1
            print("✅ OK", paper_id)

        except Exception as e:
            # Deliberate best-effort: log the failure and continue with the next paper.
            fail_cnt += 1
            print("❌ FAIL", paper_id, "->", repr(e))

print("\nDONE. OK:", ok_cnt, "FAIL:", fail_cnt)
print("WROTE:", OUT_JSONL)



Processing: PMID:15025773
TEXT LEN: 12000
Claim quote: [ABSTRACT] Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the ma
Evidence 1: [ABSTRACT] These agents were commonly used to treat sepsis until the end of the 1980s, when several randomized trials ca
[CE attempt 0] head: {   "CLAIM": "Online ISSN 1466-609X) Abstract The use of glucocorticoids (corticotherapy) in severe sepsis is one of the
✅ OK PMID:15025773

Processing: PMID:23361625
TEXT LEN: 12000
Claim quote: [ABSTRACT] Dellinger Surviving Sepsis Campaign: International Mitchell M.
Evidence 1: [ABSTRACT] Levy Guidelines for Management of Severe Sepsis Andrew Rhodes Djillali Annane and Septic Shock, 2012 Herwig G
[CE attempt 0] head: {   "CLAIM": "Dellinger Surviving Sepsis Campaign is a randomized, controlled trial comparing two treatment arms for pat
✅ OK PMID:23361625

Processing: PMID:26633262
TEXT LEN: 12000
Claim quote: [ABSTRACT] CochraneDatabaseofSystematicReviews Co

## Step5：jsonl → nodes.tsv / edges.tsv

In [31]:
import json, re
import pandas as pd

# Tab-separated outputs of this step: flattened tree nodes and parent -> child edges.
OUT_NODES = OUT_DIR / "argument_nodes.tsv"
OUT_EDGES = OUT_DIR / "argument_edges.tsv"

# ---------- 1) 抽 section：优先用 source.section；没有就从 quote 前缀推断 ----------
def infer_section(node):
    """Return the section name of a tree node.

    ``source.section`` is used when it is non-empty and not "unknown"
    (case-insensitive). Otherwise a leading ``[TAG]`` on ``source.quote`` is
    mapped to its display name. Returns "Unknown" when neither is available.
    """
    source = node.get("source") or {}
    declared = (source.get("section") or "").strip()
    if declared and declared.lower() != "unknown":
        return declared

    display_names = {
        "abstract": "Abstract",
        "method": "Method",
        "conclusion": "Conclusion",
        "results": "Results",
        "introduction": "Introduction",
        "discussion": "Discussion",
    }
    quote = (source.get("quote") or "").strip()
    tag = re.match(r"^\[(ABSTRACT|METHOD|CONCLUSION|RESULTS|INTRODUCTION|DISCUSSION)\]\s*", quote, re.I)
    if tag is None:
        return "Unknown"
    return display_names.get(tag.group(1).lower(), "Unknown")

# ---------- 2) 噪声过滤：去掉明显“元信息”句子 ----------
# Case-insensitive markers of publisher / front-matter text (not argument content).
# "https?" is required: r"\bhttp\b" alone never matches "https://..." because
# there is no word boundary between "p" and "s".
NOISE_PATTERNS = [
    r"\bISSN\b", r"\bdoi\b", r"\bhttps?\b", r"\bwww\b",
    r"author manuscript", r"public access", r"published in final edited form",
    r"\bcopyright\b", r"©", r"springer", r"wiley", r"elsevier",
    r"cochrane", r"table of contents", r"guidelines committee",
    r"all rights reserved", r"\blicense\b",
]
# Single alternation of all patterns, each wrapped in a non-capturing group.
NOISE_RE = re.compile("|".join(f"(?:{p})" for p in NOISE_PATTERNS), re.I)

def is_noise_text(s: str) -> bool:
    """Heuristically decide whether ``s`` is metadata rather than content.

    Returns True when the value is empty, shorter than 40 characters after
    stripping, matches ``NOISE_RE``, contains at least 6 commas, or contains
    at least 12 capitalised words together with at least 3 commas.
    """
    if not s:
        return True
    stripped = s.strip()
    commas = stripped.count(",")
    # Short fragments, publisher markers and comma-heavy lists are all noise.
    if len(stripped) < 40 or NOISE_RE.search(stripped) or commas >= 6:
        return True
    # Author-list shape: many Capitalised words plus several commas.
    capitalised = len(re.findall(r"\b[A-Z][a-z]+\b", stripped))
    return capitalised >= 12 and commas >= 3

# ---------- 3) walk_tree：一边扁平化，一边清洗 ----------
def walk_tree(node, paper_id, parent_id=None, nodes=None, edges=None):
    """Flatten an argument (sub)tree into node rows and edge rows.

    Parameters
    ----------
    node : dict
        Tree node; must contain ``"id"``. ``type``, ``text``, ``polarity``,
        ``source`` and ``children`` are optional.
    paper_id : str
        Identifier stored on every emitted row.
    parent_id : optional
        Id of the parent node; ``None`` for the root (no edge is emitted).
    nodes, edges : list, optional
        Accumulators shared across the recursion; created when omitted.

    Returns
    -------
    tuple
        ``(nodes, edges)``, the same list objects that were appended to.
    """
    if nodes is None:
        nodes = []
    if edges is None:
        edges = []

    nid = node["id"]
    ntype = node.get("type") or "unknown"
    text = (node.get("text") or "").strip()
    quote = ((node.get("source") or {}).get("quote") or "").strip()
    section = infer_section(node)

    # Placeholder texts carry no information; fall back to the quote. The set
    # includes "supports the claim" without the trailing period, matching the
    # placeholder list used by the TTL export cell.
    placeholder_texts = {"claim.", "supports the claim.", "supports the claim", "evidence"}
    if not text or text.lower() in placeholder_texts:
        # Drop the leading [SECTION] tag from the quote before reusing it.
        text2 = re.sub(r"^\[[A-Z]+\]\s*", "", quote).strip()
        text = text2[:220] if text2 else text

    # A node is only flagged when BOTH its text and its quote look like noise;
    # the flag is recorded here and acted on by the caller.
    node_is_noise = is_noise_text(text) and is_noise_text(quote)

    nodes.append({
        "paper_id": paper_id,
        "id": nid,
        "type": ntype,
        "text": text,
        "section": section,
        "quote": quote,
        "is_noise": int(node_is_noise),
    })

    if parent_id is not None:
        edges.append({
            "paper_id": paper_id,
            # Structural direction: parent -> child; the predicate is the
            # child's polarity (default "support").
            "subj": parent_id,
            "pred": node.get("polarity") or "support",
            "obj": nid
        })

    for ch in node.get("children", []) or []:
        walk_tree(ch, paper_id, parent_id=nid, nodes=nodes, edges=edges)

    return nodes, edges

# ---------- 4) 读取 JSONL ----------
all_nodes, all_edges = [], []

# One JSON tree per non-blank line; flatten each into node / edge records.
with OUT_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        tree = json.loads(line)
        paper_id = tree.get("paper_id", "PMID:UNKNOWN")
        n, e = walk_tree(tree["root"], paper_id)
        all_nodes.extend(n)
        all_edges.extend(e)

nodes_df = pd.DataFrame(all_nodes)
edges_df = pd.DataFrame(all_edges)

# With no records pandas builds column-less frames; give them the expected
# schema so the column lookups in the filtering step work and the TSVs still
# get a header row.
if nodes_df.empty:
    nodes_df = pd.DataFrame(columns=["paper_id", "id", "type", "text", "section", "quote", "is_noise"])
if edges_df.empty:
    edges_df = pd.DataFrame(columns=["paper_id", "subj", "pred", "obj"])

# Node ids are only unique within a paper, hence the composite key.
nodes_df = nodes_df.drop_duplicates(subset=["paper_id", "id"])

# ---------- 5) Drop noisy evidence nodes and every edge that touches them ----------
# Only evidence nodes are removed; a noisy claim is kept so that a paper never
# ends up with no nodes at all.
noise_evidence = nodes_df[(nodes_df["type"] == "evidence") & (nodes_df["is_noise"] == 1)]
bad_ids = set(zip(noise_evidence["paper_id"], noise_evidence["id"]))

if bad_ids:
    # Plain set-membership masks instead of a row-wise DataFrame.apply.
    keep_node = pd.Series(
        [key not in bad_ids for key in zip(nodes_df["paper_id"], nodes_df["id"])],
        index=nodes_df.index,
        dtype=bool,
    )
    nodes_df = nodes_df[keep_node]

    # An edge is dropped when EITHER endpoint was removed. Checking only "obj"
    # leaves edges whose "subj" no longer exists once trees are nested
    # (a removed evidence node that has children of its own).
    keep_edge = pd.Series(
        [
            (pid, subj) not in bad_ids and (pid, obj_id) not in bad_ids
            for pid, subj, obj_id in zip(edges_df["paper_id"], edges_df["subj"], edges_df["obj"])
        ],
        index=edges_df.index,
        dtype=bool,
    )
    edges_df = edges_df[keep_edge]

def ensure_min_evidence(df_nodes, df_edges):
    """Return the paper_ids that have no evidence node left in ``df_nodes``.

    This only reports; it does not restore or modify anything. ``df_edges`` is
    accepted for interface compatibility and is not used.
    """
    return [
        pid
        for pid, grp in df_nodes.groupby("paper_id")
        if len(grp[grp["type"] == "evidence"]) == 0
    ]

# Papers listed here need their extraction quality checked manually.
no_evidence_papers = ensure_min_evidence(nodes_df, edges_df)
if no_evidence_papers:
    print("⚠️ papers with 0 evidence after filtering:", no_evidence_papers[:5], "..." if len(no_evidence_papers)>5 else "")

# ---------- 6) Write the TSV files ----------
nodes_df.to_csv(OUT_NODES, sep="\t", index=False)
edges_df.to_csv(OUT_EDGES, sep="\t", index=False)

print("nodes:", len(nodes_df), "edges:", len(edges_df))
print("wrote:", OUT_NODES)
print("wrote:", OUT_EDGES)
print("noise evidence removed:", len(bad_ids))


nodes: 74 edges: 55
wrote: Annane_Sepsis_Corpus\out\argument_nodes.tsv
wrote: Annane_Sepsis_Corpus\out\argument_edges.tsv
noise evidence removed: 2


## Step6：导出 RDF Turtle（TTL），用于截图的 RDF 可视化工具

In [33]:
import json, re
from pathlib import Path

# MIN holds only rdf:type, rdfs:label and supports/attacks triples;
# FULL additionally stores paper_id, node_id, section, text and quote literals.
TTL_MIN = OUT_DIR / "argument_graph_min.ttl"
TTL_FULL = OUT_DIR / "argument_graph_full.ttl"

def esc(s):
    """Escape a value for use inside a double-quoted Turtle string literal.

    ``None`` becomes ``""``. Backslashes and double quotes are backslash-escaped,
    line feeds become the two characters ``\\n`` and carriage returns are dropped.
    """
    if s is None:
        return ""
    escaped = str(s)
    # Backslash must be handled first so later escapes are not doubled.
    for old, new in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "")):
        escaped = escaped.replace(old, new)
    return escaped

def safe_id(s: str):
    """Return ``str(s)`` with every character outside ``[A-Za-z0-9_]`` replaced by ``_``."""
    invalid_chars = re.compile(r"[^A-Za-z0-9_]")
    return invalid_chars.sub("_", str(s))

def node_uri(paper_id, nid):
    """Build the ``ex:``-prefixed name of a node from its paper id and node id.

    Colons are removed from ``paper_id`` first; both parts are then passed
    through ``safe_id`` and joined with an underscore.
    """
    paper_part = safe_id(paper_id.replace(":", ""))
    node_part = safe_id(nid)
    return "ex:" + paper_part + "_" + node_part

def infer_section_from_quote(q: str):
    """Map a leading ``[TAG]`` on a quote to its display section name.

    Returns "Unknown" for an empty quote or when the stripped quote does not
    start with one of the six recognised tags (matched case-insensitively).
    """
    tag = None
    if q:
        tag = re.match(r"^\[(ABSTRACT|METHOD|CONCLUSION|RESULTS|INTRODUCTION|DISCUSSION)\]\s*", q.strip(), re.I)
    if tag is None:
        return "Unknown"

    display_names = {
        "abstract": "Abstract",
        "method": "Method",
        "conclusion": "Conclusion",
        "results": "Results",
        "introduction": "Introduction",
        "discussion": "Discussion",
    }
    return display_names.get(tag.group(1).lower(), "Unknown")

def short_label(node, paper_id):
    """Short display label for graph drawing: paper id, node id and section.

    The quote text is never included. Claim nodes are always labelled ``C0``;
    every other node uses its own ``id``.
    """
    source = node.get("source") or {}
    section = (source.get("section") or "").strip()
    if section.lower() in ("", "unknown"):
        # No usable explicit section: derive it from the quote's [TAG] prefix.
        section = infer_section_from_quote((source.get("quote") or "").strip())

    is_claim = (node.get("type") or "").lower() == "claim"
    shown_id = "C0" if is_claim else node.get("id", "")
    return f"{paper_id} {shown_id} ({section})"

# --------- 核心：只画关系，不画属性节点 ---------
def walk_emit(node, paper_id, triples_min, triples_full, parent_uri=None):
    """Emit Turtle statements for ``node`` and, recursively, its children.

    ``triples_min`` receives the type, a short label and the relation to the
    parent. ``triples_full`` receives the type, the metadata literals
    (paper_id, node_id, section, text, quote) and the same relation. Relations
    point child -> parent: ``ex:attacks`` when the node's polarity is
    "attack", otherwise ``ex:supports``. Both lists are mutated in place.
    """
    nid = node.get("id")
    nuri = node_uri(paper_id, nid)

    is_claim = (node.get("type") or "evidence").lower() == "claim"
    rdf_type = "ex:Claim" if is_claim else "ex:Evidence"
    type_triple = f'{nuri} rdf:type {rdf_type} .'

    # MIN graph: type + label only (relations are added further down).
    triples_min.extend([
        type_triple,
        f'{nuri} rdfs:label "{esc(short_label(node, paper_id))}" .',
    ])

    source = node.get("source") or {}
    quote = (source.get("quote") or "").strip()
    section = (source.get("section") or "").strip()
    if section.lower() in ("", "unknown"):
        section = infer_section_from_quote(quote)
    text = (node.get("text") or "").strip()

    # FULL graph: keeps the metadata for later use.
    triples_full.extend([
        type_triple,
        f'{nuri} ex:paper_id "{esc(paper_id)}" .',
        f'{nuri} ex:node_id "{esc(nid)}" .',
        f'{nuri} ex:section "{esc(section)}" .',
        f'{nuri} ex:text "{esc(text)}" .',
        f'{nuri} ex:quote "{esc(quote)}" .',
    ])

    # Relation direction is child -> parent (evidence supports/attacks claim).
    if parent_uri is not None:
        polarity = (node.get("polarity") or "support").lower()
        predicate = "ex:attacks" if polarity == "attack" else "ex:supports"
        relation = f"{nuri} {predicate} {parent_uri} ."
        triples_min.append(relation)
        triples_full.append(relation)

    for child in node.get("children", []) or []:
        walk_emit(child, paper_id, triples_min, triples_full, parent_uri=nuri)

# --------- 写文件 ---------
# Turtle header shared by both files; the trailing "" yields a blank line
# after the prefix declarations once the lists are joined with "\n".
prefix = [
    '@prefix ex: <http://example.org/sepsis/> .',
    '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .',
    '@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .',
    ""
]
# Independent copies so the two outputs can grow separately.
triples_min = prefix.copy()
triples_full = prefix.copy()

# One JSON tree per non-blank line of the JSONL file.
with OUT_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        tree = json.loads(line)
        paper_id = tree.get("paper_id", "PMID:UNKNOWN")
        walk_emit(tree["root"], paper_id, triples_min, triples_full, parent_uri=None)
        # Blank line between papers.
        triples_min.append("")
        triples_full.append("")

TTL_MIN.write_text("\n".join(triples_min), encoding="utf-8")
TTL_FULL.write_text("\n".join(triples_full), encoding="utf-8")

print("Wrote MIN TTL (for visualization):", TTL_MIN)
print("Wrote FULL TTL (for storage/search):", TTL_FULL)


Wrote MIN TTL (for visualization): Annane_Sepsis_Corpus\out\argument_graph_min.ttl
Wrote FULL TTL (for storage/search): Annane_Sepsis_Corpus\out\argument_graph_full.ttl


In [32]:
import json, re
from pathlib import Path

# Single-file Turtle export written by this cell.
TTL_PATH = OUT_DIR / "argument_graph.ttl"

def esc(s):
    """Escape a value for a double-quoted Turtle string literal.

    ``None`` maps to ``""``; ``\\`` and ``"`` are backslash-escaped, a line feed
    becomes the two characters ``\\n`` and carriage returns are removed.
    """
    if s is None:
        return ""
    # Single-pass, per-character mapping (None deletes the character).
    replacements = {
        ord("\\"): "\\\\",
        ord('"'): '\\"',
        ord("\n"): "\\n",
        ord("\r"): None,
    }
    return str(s).translate(replacements)

def safe_id(s: str):
    """Sanitise a value for use in a Turtle local name.

    Only letters, digits and underscores are kept; every other character of
    ``str(s)`` is replaced by ``_``.
    """
    raw = str(s)
    return re.sub(r"[^A-Za-z0-9_]", "_", raw)

def node_uri(paper_id, nid):
    """Return ``ex:<paper>_<node>`` with both parts sanitised by ``safe_id``.

    Colons are stripped from ``paper_id`` before sanitising.
    """
    parts = (safe_id(paper_id.replace(":", "")), safe_id(nid))
    return "ex:" + "_".join(parts)

# --------- section 兜底：优先 source.section，否则从 quote 的 [ABSTRACT] 推断 ----------
def infer_section(node):
    """Resolve a node's section name.

    Prefers ``source.section`` unless it is empty or "unknown"
    (case-insensitive). Falls back to the ``[TAG]`` prefix of ``source.quote``,
    and finally to "Unknown".
    """
    src = node.get("source") or {}
    explicit = (src.get("section") or "").strip()
    if explicit.lower() not in ("", "unknown"):
        return explicit

    tagged = re.match(
        r"^\[(ABSTRACT|METHOD|CONCLUSION|RESULTS|INTRODUCTION|DISCUSSION)\]\s*",
        (src.get("quote") or "").strip(),
        re.I,
    )
    if not tagged:
        return "Unknown"
    names_by_tag = {
        "abstract": "Abstract",
        "method": "Method",
        "conclusion": "Conclusion",
        "results": "Results",
        "introduction": "Introduction",
        "discussion": "Discussion",
    }
    return names_by_tag.get(tagged.group(1).lower(), "Unknown")

# --------- 噪声过滤（和你 nodes/edges 的过滤逻辑保持一致） ----------
# Case-insensitive markers of publisher / front-matter text (not argument content).
# "https?" is required: r"\bhttp\b" alone never matches "https://..." because
# there is no word boundary between "p" and "s".
NOISE_PATTERNS = [
    r"\bISSN\b", r"\bdoi\b", r"\bhttps?\b", r"\bwww\b",
    r"author manuscript", r"public access", r"published in final edited form",
    r"\bcopyright\b", r"©", r"springer", r"wiley", r"elsevier",
    r"cochrane", r"table of contents", r"guidelines committee",
    r"all rights reserved", r"\blicense\b",
]
# Single alternation of all patterns, each wrapped in a non-capturing group.
NOISE_RE = re.compile("|".join(f"(?:{p})" for p in NOISE_PATTERNS), re.I)

def is_noise_text(s: str) -> bool:
    """Return True when ``s`` looks like metadata instead of sentence content.

    Noise means: empty, under 40 characters once stripped, a ``NOISE_RE`` hit,
    6 or more commas, or 12+ capitalised words combined with 3+ commas.
    """
    if not s:
        return True
    body = s.strip()
    if len(body) < 40:
        return True
    if NOISE_RE.search(body):
        return True
    n_commas = body.count(",")
    n_capitalised = len(re.findall(r"\b[A-Z][a-z]+\b", body))
    return n_commas >= 6 or (n_capitalised >= 12 and n_commas >= 3)

def node_is_noise(node):
    """Tell whether an evidence node is noise.

    Only nodes whose type is "evidence" (case-insensitive) can be flagged, and
    only when both their text and their quote are noise according to
    ``is_noise_text``. Every other node type returns False.
    """
    text = (node.get("text") or "").strip()
    quote = ((node.get("source") or {}).get("quote") or "").strip()
    # Claims are never filtered, so a paper always keeps its root.
    if (node.get("type") or "").lower() != "evidence":
        return False
    return is_noise_text(text) and is_noise_text(quote)

# --------- 正确语义：Evidence supports Claim（child -> parent） ----------
def walk_emit(node, paper_id, triples, parent_uri=None, parent_type=None):
    """Append Turtle statements for ``node`` and its descendants to ``triples``.

    Every node gets its type, paper_id, node_id, text, section and quote.
    Placeholder texts are replaced by the first 220 characters of the quote
    (without its ``[SECTION]`` tag) when the quote has content. A child -> parent
    ``ex:supports`` / ``ex:attacks`` statement is added unless
    ``node_is_noise`` flags the node. ``parent_type`` is passed down the
    recursion but not otherwise used.
    """
    nid = node.get("id")
    nuri = node_uri(paper_id, nid)

    ntype = (node.get("type") or "unknown").lower()
    kind = "ex:Claim" if ntype == "claim" else "ex:Evidence"
    triples.extend([
        f'{nuri} rdf:type {kind} .',
        f'{nuri} ex:paper_id "{esc(paper_id)}" .',
        f'{nuri} ex:node_id "{esc(nid)}" .',
    ])

    text = (node.get("text") or "").strip()
    source = node.get("source") or {}
    quote = (source.get("quote") or "").strip()

    # Template-like text carries no information: fall back to the quote.
    placeholders = {"claim.", "supports the claim.", "supports the claim", "evidence"}
    if not text or text.lower() in placeholders:
        from_quote = re.sub(r"^\[[A-Z]+\]\s*", "", quote).strip()
        text = from_quote[:220] or text

    triples.extend([
        f'{nuri} ex:text "{esc(text)}" .',
        f'{nuri} ex:section "{esc(infer_section(node))}" .',
        f'{nuri} ex:quote "{esc(quote)}" .',
    ])

    # Relation direction is child -> parent (evidence supports/attacks claim).
    if parent_uri is not None:
        polarity = (node.get("polarity") or "support").lower()
        # Noisy evidence keeps its node statements but gets no relation.
        if not node_is_noise(node):
            predicate = "ex:attacks" if polarity == "attack" else "ex:supports"
            triples.append(f'{nuri} {predicate} {parent_uri} .')

    for child in node.get("children", []) or []:
        walk_emit(child, paper_id, triples, parent_uri=nuri, parent_type=ntype)

# --------- 写 TTL ----------
# Turtle header; the trailing "" yields a blank line after the prefixes once
# the list is joined with "\n".
triples = [
    '@prefix ex: <http://example.org/sepsis/> .',
    '@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .',
    ""
]

# One JSON tree per non-blank line of the JSONL file.
# NOTE(review): this relies on the `walk_emit` defined in this cell (single
# `triples` list); another cell defines a `walk_emit` with a different
# signature, so cell execution order matters.
with OUT_JSONL.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        tree = json.loads(line)
        paper_id = tree.get("paper_id", "PMID:UNKNOWN")
        walk_emit(tree["root"], paper_id, triples, parent_uri=None)
        # Blank line between papers.
        triples.append("")

TTL_PATH.write_text("\n".join(triples), encoding="utf-8")
print("Wrote TTL:", TTL_PATH)


Wrote TTL: Annane_Sepsis_Corpus\out\argument_graph.ttl


## 下一步：你要的“嵌套 support/attack”
你本周的图二最终希望 evidence 下还有 support/attack evidence。
下面先给你一个“深度=2”的扩展骨架：对每条 evidence 再挑 2 条句子（支持/反驳）。你确认后我再帮你把 prompt 调到稳定。

In [28]:
def build_child_ids_prompt(paper_id, topic, evidence_quote, indexed_sents_text):
    """Build the prompt asking for one supporting and one attacking sentence id.

    The prompt embeds ``evidence_quote`` and the numbered sentence listing
    ``indexed_sents_text`` and requests a JSON object with ``support_id`` and
    ``attack_id``. ``paper_id`` and ``topic`` are accepted but not used in the
    text. The result is stripped of leading and trailing whitespace.
    """
    prompt_lines = [
        "你必须只输出严格合法 JSON，不要输出解释。",
        "现在我们要为下面这条 evidence 找：",
        "- 1条支持它的句子编号 support_id",
        "- 1条反驳它的句子编号 attack_id",
        "",
        "输出格式：",
        "{",
        '  "support_id": 1,',
        '  "attack_id": 2',
        "}",
        "",
        "evidence_quote:",
        f"{evidence_quote}",
        "",
        "编号句子：",
        f"{indexed_sents_text}",
    ]
    return "\n".join(prompt_lines).strip()

def _nested_evidence_node(node_id, polarity, sentence, paper_id):
    """Build one leaf evidence node that quotes ``sentence`` with the given polarity."""
    return {
        "id": node_id,
        "type": "evidence",
        "polarity": polarity,
        "text": sentence[:120],
        "source": {"paper_id": paper_id, "section": "Unknown", "quote": sentence},
        "children": []
    }


def add_nested_children_depth2(tree, text, max_sents=80):
    """Attach one supporting and one attacking child to every root-level evidence node.

    For each child of ``tree["root"]`` the model is asked (via
    ``build_child_ids_prompt`` / ``gemma_generate`` / ``extract_json_loose``)
    for a ``support_id`` and an ``attack_id``. These are 1-based indices into
    the sentences returned by ``build_indexed_sentences`` and are clamped to
    the valid range. Existing children of those evidence nodes are replaced.

    Parameters
    ----------
    tree : dict
        Argument tree with ``paper_id``, ``topic`` and ``root``; mutated in place.
    text : str
        Paper text that is split into candidate sentences.
    max_sents : int
        Forwarded to ``build_indexed_sentences``.

    Returns
    -------
    dict
        The same ``tree`` object. It is returned unchanged when no sentences
        are available or the root has no children.

    Raises
    ------
    Whatever the model call or the id parsing raises (for example ``KeyError``
    or ``ValueError`` on malformed model output); these are not caught here.
    """
    sents, indexed = build_indexed_sentences(text, max_sents=max_sents)
    if len(sents) == 0:
        # Nothing to pick from: clamping would still index sents[0] and fail.
        return tree

    def clamp(raw_id):
        # Model output is untrusted: force the id into 1..len(sents).
        return max(1, min(int(raw_id), len(sents)))

    # Tolerate a root whose "children" is missing or None.
    for ev in tree["root"].get("children") or []:
        ev_quote = (ev.get("source") or {}).get("quote","")
        prompt = build_child_ids_prompt(tree["paper_id"], tree["topic"], ev_quote, indexed)
        raw = gemma_generate(prompt, max_new_tokens=120, temperature=0.1)
        obj = extract_json_loose(raw)

        sid = clamp(obj["support_id"])
        aid = clamp(obj["attack_id"])

        ev["children"] = [
            _nested_evidence_node(f"{ev['id']}_S1", "support", sents[sid-1], tree["paper_id"]),
            _nested_evidence_node(f"{ev['id']}_A1", "attack", sents[aid-1], tree["paper_id"]),
        ]
    return tree
