# CRC UDS Prototype (2025)
This notebook ingests:
- Scanned CRC reports (PDF) via OCR (only the first page of each PDF is read)
- FOBT CSV extract

It outputs per-patient CRC numerator status for 2025 with an auditor-friendly evidence string.


In [None]:
import os, re, json, datetime as dt
import pandas as pd
from dateutil.relativedelta import relativedelta
import pytesseract
from pdf2image import convert_from_path


In [None]:
# Scanned report PDFs to OCR. The leading digits of each file name are used
# as the patient key when events are extracted.
PDF_PATHS = [
    "/mnt/data/12969_colonoscopy.pdf",
    "/mnt/data/12969_fecal_ia.pdf",
    "/mnt/data/46874_fit.pdf",
    "/mnt/data/110656_colonoscopy.pdf",
    "/mnt/data/118450_colonoscopy.pdf",
    "/mnt/data/166222_colonoscopy.pdf",
    "/mnt/data/190147_ifobt.pdf",
]
# Structured FOBT extract (CSV).
FOBT_CSV_PATH = r"/mnt/data/fobt.csv"
# Calendar year the numerator is evaluated for.
REPORTING_YEAR = 2025


In [None]:
# Numeric dates written month/day/year with "/" or "-" separators,
# e.g. 1/2/2025 or 01-02-25. Groups: month, day, year (2-4 digits).
date_pat = re.compile(r'(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})')

# Lower-cased English month name -> month number (1-12).
month_map = {
    name.lower(): number
    for number, name in enumerate(
        ["January", "February", "March", "April", "May", "June",
         "July", "August", "September", "October", "November", "December"],
        start=1,
    )
}

# Long-form dates written "day Month year", e.g. "2 January 2025"
# (case-insensitive). Groups: day, month name, 4-digit year.
long_pat = re.compile(r'(\d{1,2})\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})', re.I)


def _date_from_numeric_match(m):
    """Turn a ``date_pat`` match into a date, or None if it is not a real date."""
    mm, dd, yy = m.groups()
    yy = int(yy)
    # Two-digit years: 00-49 -> 2000-2049, 50-99 -> 1950-1999.
    if yy < 100:
        yy += 2000 if yy < 50 else 1900
    try:
        return dt.date(yy, int(mm), int(dd))
    except ValueError:
        # e.g. month 13 or day 45: the text matched the shape of a date only.
        return None


def _date_from_long_match(m):
    """Turn a ``long_pat`` match into a date, or None if it is not a real date."""
    dd, mon, yy = m.groups()
    try:
        return dt.date(int(yy), month_map[mon.lower()], int(dd))
    except ValueError:
        # e.g. "31 February 2025" or year 0000.
        return None


def parse_dates_from_line(line):
    """Return every valid calendar date found in ``line``.

    Numeric (month/day/year) dates come first, in order of appearance,
    followed by long-form ("2 January 2025") dates in order of appearance.
    Matches that do not form a real calendar date are skipped instead of
    raising, so one garbled date cannot abort processing of a document.

    Args:
        line: A single line of text.

    Returns:
        A list of ``datetime.date`` objects (possibly empty).
    """
    numeric = (_date_from_numeric_match(m) for m in date_pat.finditer(line))
    long_form = (_date_from_long_match(m) for m in long_pat.finditer(line))
    return [d for d in (*numeric, *long_form) if d is not None]


def extract_years(text):
    """Return every 19xx/20xx four-digit number in ``text``, in order.

    NOTE(review): the pattern is not anchored on digit boundaries, so it also
    matches inside longer digit runs (e.g. "2019" inside "12019345").
    """
    return [int(m.group(0)) for m in re.finditer(r'(19|20)\d{2}', text)]


def fix_future_year(d, doc_year):
    """Pull a date that lies implausibly far ahead back into ``doc_year``.

    If ``doc_year`` is truthy and ``d.year`` is more than one year past it,
    the year is replaced by ``doc_year`` while month and day are kept.
    Otherwise ``d`` is returned unchanged.

    Args:
        d: A ``datetime.date``.
        doc_year: The reference year, or None to disable the correction.

    Returns:
        A ``datetime.date``.
    """
    if doc_year and d.year > doc_year + 1:
        try:
            return dt.date(doc_year, d.month, d.day)
        except ValueError:
            # Only 29 February can fail here (doc_year is not a leap year);
            # clamp to 28 February instead of crashing.
            return dt.date(doc_year, 2, 28)
    return d


def first_date_in_line(line):
    """Return one date from ``line``, preferring the numeric format.

    The first numeric (month/day/year) match decides the result when one
    exists; otherwise the first long-form match does. Only that single match
    is examined: if it is not a real calendar date the result is None and
    later matches are not tried.

    Args:
        line: A single line of text.

    Returns:
        A ``datetime.date`` or None.
    """
    m = date_pat.search(line)
    if m:
        return _date_from_numeric_match(m)
    m = long_pat.search(line)
    if m:
        return _date_from_long_match(m)
    return None

def ocr_first_page(pdf_path, dpi=150):
    """OCR page 1 of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF to read.
        dpi: Resolution used when rendering the page to an image.

    Returns:
        The string produced by ``pytesseract.image_to_string`` (called with
        the ``--psm 6`` config) for the first page.
    """
    # Render only page 1; the result is a one-element list of images.
    rendered_pages = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=1,
        last_page=1,
    )
    first_page_image = rendered_pages[0]
    return pytesseract.image_to_string(first_page_image, config="--psm 6")

def classify_doc(text):
    """Classify a document by keywords found in its (case-folded) text.

    Rules are checked in order and the first hit wins, so e.g. a text that
    mentions both "sigmoidoscopy" and "colonoscopy" is a SIGMOIDOSCOPY.

    Args:
        text: Full text of the document.

    Returns:
        One of "FIT_DNA", "CT_COLONOGRAPHY", "SIGMOIDOSCOPY", "COLONOSCOPY",
        "FIT", "FOBT" or "UNKNOWN".
    """
    lowered = text.lower()

    def mentions(*keywords):
        # True when at least one keyword occurs anywhere in the text.
        return any(keyword in lowered for keyword in keywords)

    if mentions("fit-dna", "cologuard"):
        return "FIT_DNA"
    if mentions("ct colonography"):
        return "CT_COLONOGRAPHY"
    if mentions("sigmoidoscopy"):
        return "SIGMOIDOSCOPY"
    # "colonoscopy" alone is not enough; a procedure marker must also appear.
    if mentions("colonoscopy") and mentions("procedure date", "type: colonoscopy", "procedure:"):
        return "COLONOSCOPY"
    # "occult blood fecal" (immunoassay) and "ifobt" both map to FIT; this
    # must be tested before the plain "fobt" rule, which "ifobt" also matches.
    if mentions("occult blood fecal", "ifobt"):
        return "FIT"
    if mentions("fobt"):
        return "FOBT"
    return "UNKNOWN"

def best_date_for_colonoscopy(text):
    """Pick the most trustworthy date from a colonoscopy report.

    Labels are tried in priority order ("Procedure Date", then
    "Performed By", then "Result Date"); for each label the first line that
    mentions it (case-insensitive) and contains a parseable date wins. If no
    labelled line yields a date, the first date found in the first 120
    non-blank lines is used.

    Args:
        text: Full text of the document.

    Returns:
        ``(iso_date, line, label)`` where ``label`` is the matched label or
        "Fallback", or ``(None, None, None)`` when no date is found.
    """
    nonblank = [s for s in map(str.strip, text.splitlines()) if s]

    # Labelled lines, highest priority first.
    for label in ("Procedure Date", "Performed By", "Result Date"):
        for entry in nonblank:
            if not re.search(label, entry, re.I):
                continue
            found = parse_dates_from_line(entry)
            if found:
                return found[0].isoformat(), entry, label

    # fallback any date in first 120 lines
    for entry in nonblank[:120]:
        found = parse_dates_from_line(entry)
        if found:
            return found[0].isoformat(), entry, "Fallback"

    return None, None, None

def best_date_for_fit_fobt(text):
    """Pick the best event date from a stool-test style report.

    Every non-blank line that mentions one of the weighted labels below and
    yields a date (via ``first_date_in_line``) becomes a candidate. Candidate
    dates are passed through ``fix_future_year`` using the most frequent
    19xx/20xx year in the text. The candidate with the highest weight wins;
    ties on weight go to the latest date.

    Args:
        text: Full text of the document.

    Returns:
        ``(iso_date, line, label)`` for the winning candidate, or
        ``(None, None, None)`` when there are no candidates.
    """
    nonblank = [s for s in map(str.strip, text.splitlines()) if s]

    seen_years = extract_years(text)
    doc_year = max(set(seen_years), key=seen_years.count) if seen_years else None

    # (regex, weight, label) - larger weight means a more trusted label.
    weighted_labels = [
        (r'Collection Date', 1.0, "Collection Date"),
        (r'Collected', 1.0, "Collected"),
        (r'Order Date', 0.8, "Order Date"),
        (r'Received', 0.6, "Received"),
        (r'Report', 0.5, "Report"),
        (r'Result Date', 0.5, "Result Date"),
    ]

    candidates = []
    for entry in nonblank:
        for pattern, weight, label in weighted_labels:
            if not re.search(pattern, entry, re.I):
                continue
            found = first_date_in_line(entry)
            if not found:
                continue
            candidates.append((weight, fix_future_year(found, doc_year), entry, label))

    if not candidates:
        return None, None, None

    # highest priority label wins, even if report date is later
    _, chosen, entry, label = max(candidates, key=lambda c: (c[0], c[1]))
    return chosen.isoformat(), entry, label

def extract_event_from_pdf(pdf_path, text):
    """Build one screening-event record from a PDF's OCR text.

    Args:
        pdf_path: Path of the source PDF; the leading digits of its base name
            become the patient key (the whole base name if there are none).
        text: OCR text of the document.

    Returns:
        A dict with keys ``patient_key``, ``event_type``, ``event_date``
        (ISO string or None), ``source``, ``confidence`` and ``evidence``.
    """
    filename = os.path.basename(pdf_path)
    leading_digits = re.match(r'(\d+)', filename)
    patient_key = leading_digits.group(1) if leading_digits else filename

    event_type = classify_doc(text)

    # Choose the date finder plus the confidence to report when a date
    # is / is not found for this document type.
    if event_type == "COLONOSCOPY":
        finder, conf_dated, conf_undated = best_date_for_colonoscopy, 0.92, 0.6
    elif event_type in ("FIT", "FOBT", "FIT_DNA", "SIGMOIDOSCOPY", "CT_COLONOGRAPHY"):
        finder, conf_dated, conf_undated = best_date_for_fit_fobt, 0.88, 0.55
    else:
        finder = None

    if finder is None:
        # Unrecognised document: no date is searched for.
        event_date, line, confidence = None, None, 0.4
    else:
        event_date, line, _label = finder(text)
        confidence = conf_dated if event_date else conf_undated

    # The evidence string carries at most 140 characters of the source line.
    snippet = line[:140] if line else ''
    return {
        "patient_key": patient_key,
        "event_type": event_type,
        "event_date": event_date,
        "source": "scanned_pdf",
        "confidence": confidence,
        "evidence": f"PDF:{filename} p1 | {snippet}",
    }


In [None]:
# OCR PDFs (first page) and extract events: one event dict per configured PDF
pdf_events = [
    extract_event_from_pdf(path, ocr_first_page(path, dpi=150))
    for path in PDF_PATHS
]

df_pdf = pd.DataFrame(pdf_events)
df_pdf


In [None]:
# Parse FOBT CSV extract (it has a header row embedded in the file)
raw = pd.read_csv(FOBT_CSV_PATH)

# Look through at most the first 30 rows for the one holding a cell that is
# exactly "Person Nbr" (after stripping); that row is the real header.
header_row_idx = next(
    (
        idx
        for idx in range(min(30, len(raw)))
        if any(
            str(cell).strip() == "Person Nbr"
            for cell in raw.iloc[idx].dropna().astype(str).tolist()
        )
    ),
    5,  # fallback: known from sample
)

# Map the parsed column names to the labels in the header row, ignoring
# empty / missing header cells.
headers = raw.iloc[header_row_idx]
col_map = {
    col: str(val).strip()
    for col, val in headers.items()
    if pd.notna(val) and str(val).strip()
}

# Keep only the rows below the header, under their real column names.
data_rows = raw.iloc[header_row_idx + 1:]
df = data_rows.rename(columns=col_map)[list(col_map.values())]
df = df.dropna(subset=["Person Nbr"], how="all")

def parse_mdY(s):
    """Convert a month/day/year string to an ISO ``YYYY-MM-DD`` string.

    The value is stringified and stripped, then parsed as ``%m/%d/%Y``
    (four-digit year) and, failing that, as ``%m/%d/%y`` (two-digit year).

    Args:
        s: The raw cell value; any object is accepted and passed to ``str``.

    Returns:
        The ISO date string, or None when neither format matches.
    """
    text = str(s).strip()
    for fmt in ("%m/%d/%Y", "%m/%d/%y"):
        try:
            return dt.datetime.strptime(text, fmt).date().isoformat()
        except ValueError:
            # Only a format mismatch is expected; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            continue
    return None

# One FOBT event per CSV row, using the same columns as the PDF-derived
# events so the two frames can be concatenated.
df_struct = pd.DataFrame(dict(
    patient_key=df["Person Nbr"].astype(str),
    event_type="FOBT",
    event_date=df["Enc Date"].apply(parse_mdY),
    source="structured_csv",
    confidence=0.95,
    evidence=(
        "CSV:fobt.csv EncNbr=" + df["Enc Nbr"].astype(str)
        + " | Result=" + df["Result"].astype(str)
    ),
))
df_struct


In [None]:
# CRC 2025 rule engine
# Inclusive first and last day of the reporting year.
year_start = dt.date(year=REPORTING_YEAR, month=1, day=1)
year_end = dt.date(year=REPORTING_YEAR, month=12, day=31)

def crc_counts(event_type, event_date, reporting_year=None):
    """Decide whether an event falls inside its qualifying window.

    The window always ends on the last day of the reporting year and starts
    on 1 January of the reporting year minus a per-type look-back:

    * FOBT, FIT: 0 years (reporting year only)
    * FIT_DNA: 2 years
    * SIGMOIDOSCOPY, CT_COLONOGRAPHY: 4 years
    * COLONOSCOPY: 9 years

    Any other event type never counts.

    Args:
        event_type: Event type label, e.g. "FIT" or "COLONOSCOPY".
        event_date: ISO ``YYYY-MM-DD`` string. None, an empty string or any
            non-string value (such as the float NaN pandas uses for missing
            data) is treated as "no date".
        reporting_year: Year to evaluate. When None (the default) the
            module-level ``year_start`` / ``year_end`` are used.

    Returns:
        True if the event counts, otherwise False.

    Raises:
        ValueError: If ``event_date`` is a non-empty string that is not a
            valid ISO date.
    """
    if not isinstance(event_date, str) or not event_date:
        return False
    d = dt.date.fromisoformat(event_date)

    if reporting_year is None:
        window_start, window_end = year_start, year_end
    else:
        window_start = dt.date(reporting_year, 1, 1)
        window_end = dt.date(reporting_year, 12, 31)

    lookback_years = {
        "FOBT": 0,
        "FIT": 0,
        "FIT_DNA": 2,
        "SIGMOIDOSCOPY": 4,
        "CT_COLONOGRAPHY": 4,
        "COLONOSCOPY": 9,
    }.get(event_type)
    if lookback_years is None:
        return False

    # The window start is 1 January, so shifting only the year is always a
    # valid date.
    earliest = window_start.replace(year=window_start.year - lookback_years)
    return earliest <= d <= window_end

# Stack PDF and CSV events, flag the ones that count, then order the rows
# by patient and event date.
df_all = (
    pd.concat([df_pdf, df_struct], ignore_index=True)
    .assign(
        counts=lambda frame: frame.apply(
            lambda row: crc_counts(row["event_type"], row["event_date"]),
            axis=1,
        )
    )
    .sort_values(["patient_key", "event_date"])
)
df_all


In [None]:
# Pick best numerator evidence per patient (most recent qualifying event)
dfq = df_all.loc[df_all["counts"] == True].copy()
dfq["date"] = pd.to_datetime(dfq["event_date"])
# Per patient: newest event first; equal dates are ordered by confidence.
dfq = dfq.sort_values(
    by=["patient_key", "date", "confidence"],
    ascending=[True, False, False],
)

# The top row of each patient's group is that patient's best evidence.
best = (
    dfq.groupby("patient_key")
    .head(1)
    .rename(
        columns={
            "event_type": "best_event_type",
            "event_date": "best_event_date",
            "source": "best_source",
            "evidence": "best_evidence",
        }
    )
    .loc[:, ["patient_key", "best_event_type", "best_event_date", "best_source", "best_evidence"]]
)

# Every patient seen in any event gets a row, qualifying or not.
patients = pd.DataFrame({"patient_key": sorted(df_all["patient_key"].unique())})
result = pd.merge(patients, best, on="patient_key", how="left")
# A patient is in the numerator exactly when a best event was found.
result["numerator_2025"] = result["best_event_type"].notna()
result
