Imports

In [1]:
import os
import re
import json
import csv
import io
import requests
import pandas as pd

from dataclasses import dataclass, field, asdict
from typing import Optional, TypedDict, Literal, List, Tuple, Dict, Any

from dotenv import load_dotenv
from pydantic import BaseModel, Field, field_validator
from __future__ import annotations

from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START, END
from tavily import TavilyClient

from datetime import datetime

Paths + Constants

In [2]:
# =========================
# 2) Paths + constants
# =========================

DATASET_PATH = "BRCA1-206.txt"   # your biomart TSV export (in your repo folder)
DEFAULT_TRANSCRIPT_ID = "ENST00000461798"
DEFAULT_GENE_SYMBOL = "BRCA1"

# Save location 
SAVE_DIR = r"C:\Users\justi\OneDrive\Desktop\CU-Anschutz\repos\davidsonlab\Splicing-Agent-Rotation\data\test_runs"

Load environment

In [3]:
# =========================
# 3) Load environment
# =========================

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("OPEN_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

if not OPENAI_API_KEY:
    raise RuntimeError("Missing OpenAI key. Set OPENAI_API_KEY (or OPEN_API_KEY) in .env")

if not TAVILY_API_KEY:
    raise RuntimeError("Missing Tavily key. Set TAVILY_API_KEY in .env like: TAVILY_API_KEY=tvly-xxxx")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print("✅ Environment loaded: OpenAI + Tavily keys present")

✅ Environment loaded: OpenAI + Tavily keys present


LLM Set-up

In [4]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    api_key=OPENAI_API_KEY)

print("✅ LLM initialized")

✅ LLM initialized


Canonical Schema

In [5]:
# =========================
# 5) Canonical schema
# =========================

@dataclass
class EventCall:
    event_id: str
    gene_symbol: str
    coordinate_system: str          # "genomic" | "transcript_rna" | "unknown"
    transcript_id: Optional[str]
    event_type: str
    notes: str

    chromosome: Optional[str] = None
    region_start: Optional[int] = None
    region_end: Optional[int] = None


@dataclass
class CDSImpact:
    ptc_predicted: bool
    stop_codon_cdna_pos: Optional[int]
    stop_codon_aa_index: Optional[int]
    notes: List[str]


@dataclass
class NMDImpact:
    nmd_likelihood: str   # "likely_triggering" | "likely_escaping" | "uncertain"
    confidence: str       # "high" | "medium" | "low"
    rationale: str
    notes: List[str]

In [6]:
def safe_int(x: Any) -> Optional[int]:
    if x is None:
        return None
    s = str(x).strip()
    if s == "" or s.lower() == "nan":
        return None
    try:
        return int(float(s))
    except Exception:
        return None


def json_default(o):
    """
    Makes payload JSON-serializable (handles numpy/pandas types cleanly).
    """
    # numpy / pandas numbers
    try:
        import numpy as np
        if isinstance(o, (np.integer,)):
            return int(o)
        if isinstance(o, (np.floating,)):
            return float(o)
        if isinstance(o, (np.ndarray,)):
            return o.tolist()
    except Exception:
        pass

    # pandas NA / Timestamp
    try:
        import pandas as pd
        if pd.isna(o):
            return None
        if isinstance(o, (pd.Timestamp,)):
            return o.isoformat()
    except Exception:
        pass

    # fallback
    return str(o)


def extract_transcript_id(text: str) -> Optional[str]:
    m = re.search(r"(ENST\d{11}(?:\.\d+)?)", text)
    if not m:
        return None
    return m.group(1).split(".")[0]  # strip version


def extract_gene_symbol(text: str) -> Optional[str]:
    """
    Minimal heuristic. If you give the gene explicitly (BRCA1), this catches it.
    """
    m = re.search(r"\b([A-Z0-9]{2,10})\b", text.strip())
    if not m:
        return None
    token = m.group(1)
    if token.startswith("ENST"):
        return None
    return token


def load_biomart_tsv(path: str) -> pd.DataFrame:
    return pd.read_csv(path, sep="\t", dtype=str)


def build_transcript_table(df: pd.DataFrame, transcript_id: str) -> pd.DataFrame:
    tdf = df[df["Transcript stable ID"].str.strip() == transcript_id].copy()
    if tdf.empty:
        raise ValueError(f"No rows found for transcript_id={transcript_id} in dataset.")

    int_cols = [
        "Gene start (bp)", "Gene end (bp)", "Transcript start (bp)", "Transcript end (bp)",
        "Strand", "Exon region start (bp)", "Exon region end (bp)", "Exon rank in transcript",
        "cDNA coding start", "cDNA coding end", "Genomic coding start", "Genomic coding end",
        "CDS Length"
    ]
    for c in int_cols:
        if c in tdf.columns:
            tdf[c] = tdf[c].apply(safe_int)

    tdf = tdf.sort_values("Exon rank in transcript").reset_index(drop=True)
    return tdf


# ---- Tavily (REQUIRED) ----

tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

def tavily_search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    resp = tavily_client.search(query=query, max_results=max_results)
    return resp.get("results", [])


def is_live_url(url: str, timeout: int = 8) -> bool:
    """
    Lightweight dead-link filter to reduce Tavily junk.
    """
    if not url:
        return False
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if r.status_code in (405, 403):  # some sites block HEAD; fallback GET
            r = requests.get(url, allow_redirects=True, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        return 200 <= r.status_code < 400
    except Exception:
        return False


def filter_tavily_live_results(results: List[Dict[str, Any]], max_keep: int = 5) -> List[Dict[str, Any]]:
    kept = []
    for r in results:
        url = (r.get("url") or "").strip()
        if is_live_url(url):
            kept.append(r)
        if len(kept) >= max_keep:
            break
    return kept


def summarize_tavily_results(results: List[Dict[str, Any]]) -> List[str]:
    notes = []
    for r in results:
        title = (r.get("title") or "").strip()
        url = (r.get("url") or "").strip()
        snippet = (r.get("content") or r.get("snippet") or "").strip()
        if title and url:
            notes.append(f"{title} — {url}")
        if snippet:
            notes.append(f"  ↳ {snippet[:200]}{'...' if len(snippet) > 200 else ''}")
    return notes

In [7]:
def compute_cdna_junction_positions(exon_table: pd.DataFrame) -> List[int]:
    """
    Returns cDNA positions (1-based) where each exon ends (junction after exon i),
    so length = (#exons - 1).
    """
    exon_lengths = []
    for _, row in exon_table.iterrows():
        s = row.get("Exon region start (bp)")
        e = row.get("Exon region end (bp)")
        if s is None or e is None:
            exon_lengths.append(0)
        else:
            exon_lengths.append(int(e - s + 1))

    cum = 0
    ends = []
    for L in exon_lengths:
        cum += L
        ends.append(cum)

    return ends[:-1]


def infer_stop_codon_position_from_cds(exon_table: pd.DataFrame) -> Tuple[Optional[int], Optional[int], List[str]]:
    """
    Uses dataset cDNA coding end:
      stop codon start (cDNA) = cDNA_coding_end - 2
      AA index (0-based) ≈ (stop_start - cDNA_coding_start) // 3
    """
    notes = []
    cds_starts = exon_table["cDNA coding start"].dropna().tolist() if "cDNA coding start" in exon_table.columns else []
    cds_ends = exon_table["cDNA coding end"].dropna().tolist() if "cDNA coding end" in exon_table.columns else []

    if not cds_starts or not cds_ends:
        notes.append("Missing cDNA coding start/end in dataset; cannot infer stop codon.")
        return None, None, notes

    cdna_coding_start = int(cds_starts[0])
    cdna_coding_end = int(cds_ends[0])

    if cdna_coding_end < 3:
        notes.append("cDNA coding end < 3; invalid.")
        return None, None, notes

    stop_start = cdna_coding_end - 2
    aa_index = (stop_start - cdna_coding_start) // 3

    notes.append(f"Using dataset cDNA coding start={cdna_coding_start}, end={cdna_coding_end}.")
    notes.append(f"Stop codon assumed at cDNA {stop_start}-{cdna_coding_end} (1-based).")
    notes.append(f"Stop codon AA index (0-based) ≈ {aa_index}.")

    return stop_start, aa_index, notes


def nmd_50nt_rule(stop_cdna_pos: int, junction_positions: List[int]) -> Tuple[str, str, str, List[str]]:
    notes = []
    if not junction_positions:
        return "likely_escaping", "medium", "No exon-exon junctions detected (single-exon transcript).", notes

    last_junction = junction_positions[-1]
    dist = last_junction - stop_cdna_pos

    notes.append(f"Last exon-exon junction at cDNA position {last_junction}.")
    notes.append(f"Distance (junction - stop_start) = {dist} nt.")

    if dist > 50:
        return "likely_triggering", "medium", f"Stop codon is {dist} nt upstream of last junction (>50 nt): consistent with NMD.", notes

    return "likely_escaping", "medium", f"Stop codon is {dist} nt upstream of last junction (≤50 nt): consistent with NMD escape.", notes


# =========================
# 8) Exon summary table
# =========================

def build_exon_summary_table(tx_table: pd.DataFrame) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Returns:
      - markdown table string
      - exon rows as dicts (for logging/eval)
    """
    rows = []

    # Some biomart exports include "Constitutive exon" as 0/1; keep as-is
    const_col = "Constitutive exon" if "Constitutive exon" in tx_table.columns else None

    for _, r in tx_table.iterrows():
        exon_rank = r.get("Exon rank in transcript")
        start = r.get("Exon region start (bp)")
        end = r.get("Exon region end (bp)")
        length = None
        if start is not None and end is not None:
            length = int(end - start + 1)

        rows.append({
            "exon_rank": exon_rank,
            "start": start,
            "end": end,
            "length_bp": length,
            "constitutive_exon": r.get(const_col) if const_col else None,
        })

    md = []
    md.append("| Exon rank | Exon (start–end, bp) | Length (bp) | Constitutive exon |")

    for x in rows:
        exon_label = f"{x['start']}–{x['end']}" if x["start"] is not None and x["end"] is not None else "NA"
        md.append(
            f"| {x['exon_rank'] if x['exon_rank'] is not None else 'NA'} "
            f"| {exon_label} "
            f"| {x['length_bp'] if x['length_bp'] is not None else 'NA'} "
            f"| {x['constitutive_exon'] if x['constitutive_exon'] is not None else 'NA'} |"
        )

    return "\n".join(md), rows

Agent State

In [8]:
# =========================
# 8) Agent state
# =========================

class SplicingAgentState(TypedDict, total=False):
    raw_input: str
    input_format: str

    dataset_df: Any
    transcript_table: Any
    selected_transcript_id: str
    selected_gene_symbol: str

    tavily_results: Any
    tavily_notes: List[str]

    event_call: EventCall
    cds_impact: CDSImpact
    nmd_impact: NMDImpact

    validation_warnings: List[str]

    # modular summary outputs
    user_summary_table: str
    user_summary_text: str

    user_summary: str

    # evaluation payload (failure-mode logging)
    eval_flags: Dict[str, Any]

    # optional extras for logging
    exon_summary_rows: Any


Nodes

In [9]:
# =========================
# 9) Nodes
# =========================

def detect_format(state: SplicingAgentState) -> SplicingAgentState:
    return {"input_format": "text"}


def tavily_websearch_first(state: SplicingAgentState) -> SplicingAgentState:
    """
    Do a web search FIRST to establish context.
    We also filter dead URLs to reduce noise.

    NOTE: Tavily is NOT deciding NMD. It just provides supporting supporting context + links of any missing information not provided by user.
    """
    raw = state["raw_input"]
    tx = extract_transcript_id(raw) or DEFAULT_TRANSCRIPT_ID
    gene = extract_gene_symbol(raw) or DEFAULT_GENE_SYMBOL

    # bias toward stable sources to reduce garbage
    query = (
        f"site:ensembl.org {gene} {tx} transcript summary OR "
        f"site:ncbi.nlm.nih.gov {gene} gene OR "
        f"nonsense mediated decay 50 nt rule exon junction"
    )

    results = tavily_search(query=query, max_results=8)
    results_live = filter_tavily_live_results(results, max_keep=5)
    notes = summarize_tavily_results(results_live)

    return {
        "selected_transcript_id": tx,
        "selected_gene_symbol": gene,
        "tavily_results": results_live,
        "tavily_notes": notes,
    }


def load_dataset_and_transcript(state: SplicingAgentState) -> SplicingAgentState:
    df = load_biomart_tsv(DATASET_PATH)
    tx = state["selected_transcript_id"]
    tx_table = build_transcript_table(df, tx)
    return {"dataset_df": df, "transcript_table": tx_table}


def normalize_event_call_from_dataset(state: SplicingAgentState) -> SplicingAgentState:
    tx_table = state["transcript_table"]
    tx = state["selected_transcript_id"]

    gene = (
        str(tx_table.iloc[0]["Gene name"]).strip()
        if "Gene name" in tx_table.columns and pd.notna(tx_table.iloc[0]["Gene name"])
        else state.get("selected_gene_symbol", DEFAULT_GENE_SYMBOL)
    )

    tstart = tx_table.iloc[0].get("Transcript start (bp)", None)
    tend = tx_table.iloc[0].get("Transcript end (bp)", None)

    event = EventCall(
        event_id=f"NMD_CASE_{gene}_{tx}",
        gene_symbol=gene,
        coordinate_system="genomic",
        transcript_id=tx,
        event_type="transcript_nmd_assessment",
        notes="Dataset-driven transcript assessment (Biomart exon/CDS columns).",
        chromosome="17",  # TODO: generalize from dataset when you add chr column
        region_start=min(tstart, tend) if tstart and tend else None,
        region_end=max(tstart, tend) if tstart and tend else None,
    )
    return {"event_call": event}


def validate_minimum_fields(state: SplicingAgentState) -> SplicingAgentState:
    warnings = []
    e = state["event_call"]

    if not e.gene_symbol:
        warnings.append("Missing gene_symbol.")
    if not e.transcript_id:
        warnings.append("Missing transcript_id.")
    if state.get("transcript_table") is None:
        warnings.append("Missing transcript_table (dataset load failed).")

    return {"validation_warnings": warnings}


def compute_deterministic_nmd(state: SplicingAgentState) -> SplicingAgentState:
    tx_table = state["transcript_table"]

    stop_cdna_pos, stop_aa_index, cds_notes = infer_stop_codon_position_from_cds(tx_table)
    cds_impact = CDSImpact(
        ptc_predicted=False,
        stop_codon_cdna_pos=stop_cdna_pos,
        stop_codon_aa_index=stop_aa_index,
        notes=cds_notes,
    )

    if stop_cdna_pos is None:
        nmd = NMDImpact(
            nmd_likelihood="uncertain",
            confidence="high",
            rationale="Cannot apply 50-nt rule because stop codon position could not be inferred from dataset.",
            notes=[],
        )
        return {"cds_impact": cds_impact, "nmd_impact": nmd}

    junctions = compute_cdna_junction_positions(tx_table)
    nmd_call, conf, rationale, nmd_notes = nmd_50nt_rule(stop_cdna_pos, junctions)

    nmd = NMDImpact(
        nmd_likelihood=nmd_call,
        confidence=conf,
        rationale=rationale,
        notes=nmd_notes,
    )

    return {"cds_impact": cds_impact, "nmd_impact": nmd}

# =========================
# 11) Report node (LLM formats; math stays deterministic)
# =========================

def splice_event_identifier(state: SplicingAgentState) -> SplicingAgentState:
    """
    Produce a clean scientific report + exon table.
    LLM is used ONLY for formatting + explanation of deterministic results.
    """
    e = state["event_call"]
    cds = state["cds_impact"]
    nmd = state["nmd_impact"]
    warnings = state.get("validation_warnings", [])
    tav = state.get("tavily_notes", [])

    exon_md, exon_rows = build_exon_summary_table(state["transcript_table"])

    # Save the table modularly too (your existing pattern)
    state_user_table = exon_md

    prompt = f"""
You are an aberrant splicing research assistant for scientists.
Write a structured markdown report with the following output:
- short, systematic, and clear
- uses section headers
- includes an exon table
- gives an NMD call + rationale clearly

Sections to include:
1) Transcript Context (1–2 sentences)
2) Exon Structure Summary (include the exon table verbatim)
3) Stop Codon / CDS Summary (bullets)
4) NMD Assessment (call + confidence + rationale; explain the 50-nt rule in one sentence)
5) Evidence Links (bullets; only use provided Tavily notes)
6) Warnings (only if any)

Hard rules:
- Do NOT invent exons, coordinates, or claims not in the provided data.
- Do NOT claim “novel transcript vs known” unless the dataset contains multiple transcripts to compare (if missing, say what’s missing).
- Use the provided Tavily notes only as links/context (do not fabricate extra citations).

Inputs:
Gene: {e.gene_symbol}
Transcript: {e.transcript_id}
Genomic span: chr{e.chromosome}:{e.region_start}-{e.region_end}

EXON TABLE (use verbatim):
{exon_md}

Stop codon (cDNA start): {cds.stop_codon_cdna_pos}
Stop codon (AA index): {cds.stop_codon_aa_index}
CDS notes: {cds.notes}

NMD call: {nmd.nmd_likelihood}
Confidence: {nmd.confidence}
Rationale: {nmd.rationale}
NMD notes: {nmd.notes}

Tavily notes:
{tav}

Warnings:
{warnings}
""".strip()

    msg = llm.invoke(prompt)
    report = msg.content.strip()

    return {
        "user_summary_table": state_user_table,
        "user_summary_text": report,
        "user_summary": report,          # ✅ final output key
        "exon_summary_rows": exon_rows,  # optional logging
    }


def minimal_eval_flags(state: SplicingAgentState) -> SplicingAgentState:
    """
    IntelliAgent-like: small structured flags for failure modes.
    """
    flags = {}

    e = state["event_call"]
    cds = state["cds_impact"]
    nmd = state["nmd_impact"]

    flags["has_transcript_id"] = bool(e.transcript_id)
    flags["has_gene_symbol"] = bool(e.gene_symbol)
    flags["has_stop_position"] = cds.stop_codon_cdna_pos is not None
    flags["has_junctions"] = len(compute_cdna_junction_positions(state["transcript_table"])) > 0
    flags["nmd_call"] = nmd.nmd_likelihood

    flags["warning_count"] = len(state.get("validation_warnings", []))
    flags["tavily_live_results"] = len(state.get("tavily_results", []))
    flags["report_generated"] = bool(state.get("user_summary"))

    return {"eval_flags": flags}

Build the Graph!

In [10]:
# =========================
# 10) Build graph
# =========================

builder = StateGraph(SplicingAgentState)

builder.add_node("detect_format", detect_format)
builder.add_node("tavily_websearch_first", tavily_websearch_first)
builder.add_node("load_dataset_and_transcript", load_dataset_and_transcript)
builder.add_node("normalize_event_call_from_dataset", normalize_event_call_from_dataset)
builder.add_node("validate_minimum_fields", validate_minimum_fields)
builder.add_node("compute_deterministic_nmd", compute_deterministic_nmd)

# ✅ Replace old summary pipeline with Natalie-style report node
builder.add_node("splice_event_identifier", splice_event_identifier)

builder.add_node("minimal_eval_flags", minimal_eval_flags)

builder.add_edge(START, "detect_format")
builder.add_edge("detect_format", "tavily_websearch_first")
builder.add_edge("tavily_websearch_first", "load_dataset_and_transcript")
builder.add_edge("load_dataset_and_transcript", "normalize_event_call_from_dataset")
builder.add_edge("normalize_event_call_from_dataset", "validate_minimum_fields")
builder.add_edge("validate_minimum_fields", "compute_deterministic_nmd")

# ✅ New output stage
builder.add_edge("compute_deterministic_nmd", "splice_event_identifier")
builder.add_edge("splice_event_identifier", "minimal_eval_flags")
builder.add_edge("minimal_eval_flags", END)

graph = builder.compile()
print("✅ Graph compiled successfully (Tavily-first → dataset → deterministic NMD → Natalie-style report).")

✅ Graph compiled successfully (Tavily-first → dataset → deterministic NMD → Natalie-style report).


Sanity Checks on BRCA1 events!

In [11]:
# =========================
# 9) Run sanity check (dataset-driven)
# =========================

raw_text = """
I would like to determine aberrant splicing for BRCA1.
Use transcript ENST00000461798 (BRCA1-206).
Assess NMD.
"""

state = {"raw_input": raw_text}
result = graph.invoke(state)

print("Input format:", result.get("input_format"))

print("\n--- Canonical EventCall ---")
print(result.get("event_call"))

print("\n--- CDSImpact ---")
print(result.get("cds_impact"))

print("\n--- NMDImpact ---")
print(result.get("nmd_impact"))

print("\n--- User summary ---")
print(result.get("user_summary"))

Input format: text

--- Canonical EventCall ---
EventCall(event_id='NMD_CASE_BRCA1_ENST00000461798', gene_symbol='BRCA1', coordinate_system='genomic', transcript_id='ENST00000461798', event_type='transcript_nmd_assessment', notes='Dataset-driven transcript assessment (Biomart exon/CDS columns).', chromosome='17', region_start=np.int64(43099831), region_end=np.int64(43125370))

--- CDSImpact ---
CDSImpact(ptc_predicted=False, stop_codon_cdna_pos=191, stop_codon_aa_index=25, notes=['Using dataset cDNA coding start=114, end=193.', 'Stop codon assumed at cDNA 191-193 (1-based).', 'Stop codon AA index (0-based) ≈ 25.'])

--- NMDImpact ---
NMDImpact(nmd_likelihood='likely_triggering', confidence='medium', rationale='Stop codon is 341 nt upstream of last junction (>50 nt): consistent with NMD.', notes=['Last exon-exon junction at cDNA position 532.', 'Distance (junction - stop_start) = 341 nt.'])

--- User summary ---
# Aberrant Splicing Report: BRCA1 (ENST00000461798)

## Transcript Context


In [12]:
# =========================
# 12) Save results 
# =========================

os.makedirs(SAVE_DIR, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
outfile = os.path.join(SAVE_DIR, f"run_{timestamp}.json")

payload = {
    "timestamp": timestamp,
    "raw_input": state["raw_input"],
    "input_format": result.get("input_format"),
    "event_call": asdict(result["event_call"]) if "event_call" in result else None,
    "cds_impact": asdict(result["cds_impact"]) if "cds_impact" in result else None,
    "nmd_impact": asdict(result["nmd_impact"]) if "nmd_impact" in result else None,
    "user_summary_table": result.get("user_summary_table"),
    "user_summary_text": result.get("user_summary_text"),
    "user_summary": result.get("user_summary"),
    "eval_flags": result.get("eval_flags"),
    "validation_warnings": result.get("validation_warnings", []),
    "tavily_notes": result.get("tavily_notes", []),
    "exon_summary_rows": result.get("exon_summary_rows"),
}

with open(outfile, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False, default=json_default)

print(f"✅ Saved run to:\n{outfile}")

✅ Saved run to:
C:\Users\justi\OneDrive\Desktop\CU-Anschutz\repos\davidsonlab\Splicing-Agent-Rotation\data\test_runs\run_20260105_104458.json


In [16]:
# =========================
# 8) Sanity check
# =========================

raw_brca1_text = """
BRCA1ΔE11q is an alternative BRCA1 isoform arising from use of an internal splice donor in exon 11, 
deleting a distal portion of exon 11 (r.788_4096del) but maintaining the open reading frame. 
It is overexpressed in ovarian cancer patients compared to healthy controls and is associated with 
resistance to PARP inhibitors and cisplatin.
"""

state = {"raw_input": raw_brca1_text}
result = graph.invoke(state)

print("Input format:", result["input_format"])
print("\n--- Canonical EventCall ---")
print(json.dumps(asdict(result["event_call"]), indent=2))

print("\n--- CDSImpact ---")
print(json.dumps(asdict(result.get("cds_impact", CDSImpact())), indent=2))

print("\n--- NMDImpact ---")
print(json.dumps(asdict(result.get("nmd_impact", NMDImpact("uncertain","low","missing"))), indent=2))

print("\n--- Enrichment notes ---")
print(result.get("enrichment_notes", []))

print("\n--- Validation warnings ---")
print(result.get("validation_warnings", []))

print("\n--- User summary ---")
print(result["user_summary"])

Input format: text

--- Canonical EventCall ---
{
  "event_id": "EVT_UNKNOWN_0001",
  "gene_symbol": "UNKNOWN",
  "coordinate_system": "unknown",
  "chromosome": null,
  "region_start": null,
  "region_end": null,
  "exon_number_start": null,
  "exon_number_end": null,
  "exon_genomic_start": null,
  "exon_genomic_end": null,
  "transcript_id": null,
  "hgvs_rna": null,
  "event_type": "other",
  "notes": "LLM extraction failed; provide structured input."
}

--- CDSImpact ---
{
  "frameshift": null,
  "in_frame_indel": null,
  "ptc_predicted": false,
  "stop_codon_cds_index": null,
  "notes": [
    "No HGVS RNA edit applied (hgvs_rna is None)."
  ]
}

--- NMDImpact ---
{
  "nmd_likelihood": "likely_escaping",
  "confidence": "low",
  "rationale": "No PTC inferred from HGVS deletion length (prototype). 50-nt rule requires exon junction mapping.",
  "notes": []
}

--- Enrichment notes ---
['Biotypes - Ensembl — http://www.ensembl.org/info/genome/genebuild/biotypes.html', 'Nonsense-Mediat

Convery .ipynb to .py for LangGraph