# CRC Screening Prototype (NextGen mixed sources)
This notebook demonstrates a **one-metric** prototype for **UDS Colorectal Cancer Screening** evidence extraction from:
- scanned colonoscopy reports (PDF)
- scanned FIT/iFOBT reports (PDF)
- optional structured exports (CSV)

It outputs a normalized `ScreeningEvent` table with **audit-ready evidence pointers** (file + page + snippet).


In [None]:

# Paths to sample files (uploaded)
from pathlib import Path

# Directory that holds the uploaded sample scans.
DATA_DIR = Path("/mnt/data")

# Sample file names: colonoscopy reports first, then stool-test reports.
_SAMPLE_FILENAMES = (
    "12969_colonoscopy.pdf",
    "110656_colonoscopy.pdf",
    "118450_colonoscopy.pdf",
    "166222_colonoscopy.pdf",
    "12969_fecal_ia.pdf",
    "46874_fit.pdf",
    "190147_ifobt.pdf",
)

# Full paths, in the same order as the names above.
PDFS = [DATA_DIR / filename for filename in _SAMPLE_FILENAMES]

# Notebook display: names of the sample files that are actually present on disk.
[pdf_path.name for pdf_path in PDFS if pdf_path.exists()]


In [None]:

# OCR + parsing helpers
import re
from dataclasses import dataclass
from typing import Optional, Literal, Dict, Any, List
from datetime import date
import dateutil.parser as dparser

from pdf2image import convert_from_path
import pytesseract

# Closed set of screening modalities this prototype can emit.
EventType = Literal[
    "FOBT",
    "FIT",
    "FIT_DNA",
    "SIGMOIDOSCOPY",
    "CT_COLONOGRAPHY",
    "COLONOSCOPY",
]


@dataclass
class ScreeningEvent:
    """One normalized screening event together with its evidence pointer."""

    # Optional patient identifier hint; the extraction loop currently passes None.
    patient_hint: Optional[str]
    # Screening modality, one of the EventType literals.
    event_type: EventType
    # Date of the event as an ISO yyyy-mm-dd string.
    event_date: str
    # Origin of the record; only scanned documents are modelled here.
    source: Literal["scanned_doc"]
    # Confidence score attached to the extraction.
    confidence: float
    # Evidence pointer with the keys {file, page, snippet}.
    evidence: Dict[str, Any]

def ocr_pdf_pages(pdf_path: str, first_page: int = 1, last_page: int = 1, dpi: int = 150) -> List[str]:
    """Rasterize a page range of a PDF and OCR each page.

    Args:
        pdf_path: Location of the PDF; it is converted with ``str()`` before use.
        first_page: First page to render, forwarded to ``convert_from_path``.
        last_page: Last page to render, forwarded to ``convert_from_path``.
        dpi: Rendering resolution, forwarded to ``convert_from_path``.

    Returns:
        One OCR text string per rendered page, in page order.
    """
    page_images = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        first_page=first_page,
        last_page=last_page,
    )
    # Tesseract flags: OCR engine mode 1, page segmentation mode 6.
    tesseract_flags = "--oem 1 --psm 6"
    page_texts = []
    for page_image in page_images:
        page_texts.append(pytesseract.image_to_string(page_image, config=tesseract_flags))
    return page_texts

# "fit" as a standalone token; a bare substring test would also fire on
# words such as "benefit", "fitness" or "profit".
_FIT_TOKEN = re.compile(r"(?<![a-z])fit(?![a-z])")


def guess_event_type(text: str) -> Optional[EventType]:
    """Guess the screening modality from OCR text using keyword heuristics.

    Rules are checked in a fixed order and the first match wins:
    colonoscopy, CT colonography, sigmoidoscopy, FIT-DNA, then the
    stool blood tests (FIT before guaiac FOBT).

    Args:
        text: OCR text of a report page; matching is case-insensitive.

    Returns:
        The matching event type, or ``None`` when no keyword is found.
    """
    t = text.lower()
    if "colonoscopy" in t:
        return "COLONOSCOPY"
    if "ct colonography" in t:
        return "CT_COLONOGRAPHY"
    if "sigmoidoscopy" in t:
        return "SIGMOIDOSCOPY"
    if "fit-dna" in t or "cologuard" in t or "sdna" in t:
        return "FIT_DNA"

    mentions_fit = _FIT_TOKEN.search(t) is not None
    if mentions_fit and (("occult" in t and "fecal" in t) or "immunochemical" in t):
        return "FIT"
    # iFOBT is the immunochemical FOBT, i.e. a FIT, even when the report
    # never spells out the word "FIT".
    if "ifobt" in t:
        return "FIT"
    if "fobt" in t or "fecal occult" in t:
        return "FIT" if mentions_fit else "FOBT"
    return None

def safe_parse_date(s: str) -> Optional[str]:
    """Parse a date string into ISO ``yyyy-mm-dd``, or return ``None``.

    Args:
        s: Raw date text captured from OCR output.

    Returns:
        The ISO date string, or ``None`` when the text looks like OCR
        garbage or dateutil cannot parse it.
    """
    # Reject obvious OCR garbage such as 01/0/1984 or 01-0-1984: a lone "0"
    # between two separators cannot be a real month or day. Both separators
    # are checked because the generic date scan accepts "/" and "-".
    if re.search(r'[/-]0[/-]', s):
        return None
    try:
        # fuzzy=True tolerates stray tokens around the date; dayfirst=False
        # reads ambiguous numeric dates as month/day/year.
        dt = dparser.parse(s, fuzzy=True, dayfirst=False)
    except (ValueError, OverflowError):
        # dateutil reports unparseable text as ParserError (a ValueError
        # subclass) and out-of-range values as OverflowError.
        return None
    return dt.date().isoformat()

# Capture group for a numeric month/day/year date separated by slashes.
_SLASH_MDY = r'([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})'
# Capture group for a "day MonthName year" date, e.g. "5 March 2024".
_DAY_MONTHNAME_YEAR = r'([0-9]{1,2}\s+[A-Za-z]+\s+[0-9]{4})'

# Labelled date patterns; group 1 of each pattern is the raw date text.
DATE_PATTERNS = [
    r'Procedure Date:\s*' + _SLASH_MDY,
    r'DATE OF OPERATION:\s*' + _SLASH_MDY,
    r'Result Date:\s*' + _DAY_MONTHNAME_YEAR,
    r'Collection Date:\s*' + _SLASH_MDY,
    r'Collected:\s*' + _SLASH_MDY,
]

def extract_dates(text: str):
    """Collect parseable dates from OCR text as ``(label, raw, iso)`` tuples.

    Labelled matches from ``DATE_PATTERNS`` come first (label is the pattern
    string), followed by a generic numeric date scan (label ``"generic_mdy"``).
    Candidates that ``safe_parse_date`` rejects are dropped, and only the
    first occurrence of each ISO date is kept.
    """

    def _raw_hits():
        # Labelled dates are yielded first so they win de-duplication.
        for pattern in DATE_PATTERNS:
            for match in re.finditer(pattern, text, flags=re.IGNORECASE):
                yield pattern, match.group(1).strip()
        for match in re.finditer(r'([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})', text):
            yield "generic_mdy", match.group(1)

    # Insertion-ordered mapping: ISO date -> first tuple that produced it.
    first_by_iso = {}
    for label, raw_text in _raw_hits():
        parsed = safe_parse_date(raw_text)
        if parsed and parsed not in first_by_iso:
            first_by_iso[parsed] = (label, raw_text, parsed)
    return list(first_by_iso.values())

def normalize_dates(dates):
    """Snap likely OCR year errors to the most common year on the page.

    A date whose year is 2 to 10 years away from the modal year (computed
    over years in 1940..2100) has its year replaced by the modal year and
    its raw text tagged with ``" (year_snapped)"``. All other dates pass
    through unchanged.

    Args:
        dates: List of ``(label, raw, iso)`` tuples with ISO yyyy-mm-dd dates.

    Returns:
        A new list of tuples in the same order, or ``dates`` itself when no
        year falls in the plausible range.
    """
    from collections import Counter

    plausible_years = [
        int(iso.split("-")[0])
        for _, _, iso in dates
        if 1940 <= int(iso.split("-")[0]) <= 2100
    ]
    if not plausible_years:
        return dates
    year_mode = Counter(plausible_years).most_common(1)[0][0]

    fixed = []
    for pat, raw, iso in dates:
        y, m, d = map(int, iso.split("-"))
        if y != year_mode and 2 <= abs(y - year_mode) <= 10:
            try:
                snapped = date(year_mode, m, d)
            except ValueError:
                # Feb 29 snapped onto a non-leap year is not a real date;
                # keep the original rather than emit an invalid ISO string
                # that would later crash date.fromisoformat().
                fixed.append((pat, raw, iso))
                continue
            fixed.append((pat, raw + " (year_snapped)", snapped.isoformat()))
        else:
            fixed.append((pat, raw, iso))
    return fixed

def choose_best_date(dates):
    """Pick the most credible event date from ``(label, raw, iso)`` tuples.

    Only dates in the modal year (computed over years in 1940..2100) are
    candidates. Among them, labels are preferred in this order: collection
    date, collected, procedure date, date of operation, result date. If no
    label matches, the first date in the modal year is used.

    Returns:
        The chosen ISO date string, or ``None`` when no year is plausible.
    """
    from collections import Counter

    def _year_of(iso_text):
        return int(iso_text.split('-')[0])

    plausible_years = [_year_of(iso) for _, _, iso in dates if 1940 <= _year_of(iso) <= 2100]
    if not plausible_years:
        return None
    modal_year = Counter(plausible_years).most_common(1)[0][0]

    # Candidates restricted to the modal year, in original order.
    in_modal_year = [(label, iso) for label, _raw, iso in dates if _year_of(iso) == modal_year]

    for wanted in ("Collection Date", "Collected", "Procedure Date", "DATE OF OPERATION", "Result Date"):
        needle = wanted.lower()
        for label, iso in in_modal_year:
            if needle in label.lower():
                return iso

    # Fallback: first date in the modal year.
    if in_modal_year:
        return in_modal_year[0][1]
    return dates[0][2] if dates else None


In [None]:

# Extract events from the PDFs (scanned docs)
# One ScreeningEvent per sample PDF whose first page yields both a
# recognizable event type and a usable date.
events = []
for pdf in PDFS:
    # Not every sample is guaranteed to be uploaded; skip missing files
    # instead of letting the PDF rasterizer abort the whole run.
    if not pdf.exists():
        print(f"skipping missing file: {pdf.name}")
        continue

    texts = ocr_pdf_pages(pdf, first_page=1, last_page=1, dpi=150)
    if not texts:
        # No page was rendered, so there is nothing to index into.
        print(f"skipping file with no OCR pages: {pdf.name}")
        continue
    text = texts[0]

    et = guess_event_type(text)
    lowered = text.lower()
    # Fallback: a fecal occult blood report with no other keyword is treated as FIT.
    if et is None and ("occult" in lowered and "fecal" in lowered):
        et = "FIT"
    dates = normalize_dates(extract_dates(text))
    best_date = choose_best_date(dates)

    if et and best_date:
        # Evidence snippet: first 500 OCR characters flattened onto one line.
        snippet = text[:500].replace("\n", " ")
        events.append(ScreeningEvent(
            patient_hint=None,
            event_type=et,
            event_date=best_date,
            source="scanned_doc",
            confidence=0.75,
            evidence={"file": pdf.name, "page": 1, "snippet": snippet},
        ))

events


In [None]:

# Convert to a review table (auditor-friendly)
import pandas as pd

# One review row per extracted event. The columns are named explicitly so
# that an empty `events` list still yields a table with the expected
# columns; otherwise sort_values below raises KeyError on a column-less frame.
df = pd.DataFrame(
    [{
        "file": e.evidence["file"],
        "event_type": e.event_type,
        "event_date": e.event_date,
        "page": e.evidence["page"],
        # Shortened preview of the evidence snippet for the review table.
        "snippet": e.evidence["snippet"][:200] + "...",
    } for e in events],
    columns=["file", "event_type", "event_date", "page", "snippet"],
)

# Notebook display only: grouped by event type, newest date first.
df.sort_values(["event_type", "event_date"], ascending=[True, False])


In [None]:

# CRC UDS-style counting logic for a measurement year (simplified)
from dateutil.relativedelta import relativedelta

# Whole years before the start of the measurement year during which an
# event of each type still counts (0 = measurement year only).
_LOOKBACK_YEARS = (
    ("FOBT", 0),
    ("FIT", 0),
    ("FIT_DNA", 2),
    ("SIGMOIDOSCOPY", 4),
    ("CT_COLONOGRAPHY", 4),
    ("COLONOSCOPY", 9),
)

# Event type -> relativedelta subtracted from January 1 of the measurement year.
LOOKBACK = {
    event_type: relativedelta(years=n_years)
    for event_type, n_years in _LOOKBACK_YEARS
}

def counts_for_crc(event_type: str, event_date_iso: str, year: int) -> bool:
    """Tell whether a screening event counts for the given measurement year.

    FOBT and FIT count only when dated inside the measurement year. Every
    other type counts from ``LOOKBACK[event_type]`` before January 1 of the
    year through December 31 of the year.

    Args:
        event_type: Screening modality; non-stool types must be keys of ``LOOKBACK``.
        event_date_iso: Event date as an ISO yyyy-mm-dd string.
        year: Measurement year.

    Returns:
        ``True`` when the event date falls inside the qualifying window.

    Raises:
        KeyError: If ``event_type`` is neither FOBT/FIT nor a ``LOOKBACK`` key.
        ValueError: If ``event_date_iso`` is not a valid ISO date.
    """
    period_start, period_end = date(year, 1, 1), date(year, 12, 31)
    event_day = date.fromisoformat(event_date_iso)

    if event_type in ("FOBT", "FIT"):
        earliest = period_start
    else:
        earliest = period_start - LOOKBACK[event_type]
    return earliest <= event_day <= period_end

# Measurement year to evaluate.
year = 2025
crc_flag_column = "counts_for_crc_" + str(year)


def _row_counts_for_crc(row):
    """Apply counts_for_crc to one review-table row for the chosen year."""
    return counts_for_crc(row["event_type"], row["event_date"], year)


# Add one boolean column flagging which events count for the measurement year.
df[crc_flag_column] = df.apply(_row_counts_for_crc, axis=1)
df
