## ⚠️ POPPLER & TESSERACT REQUIRED FOR OCR

**These PDFs are scanned images** - pdfplumber can only extract headers. OCR is required to read the actual content.

### Install Poppler (Windows):
1. Download: https://github.com/oschwartz10612/poppler-windows/releases/latest
2. Extract to `C:\poppler` (or any location)
3. **Option A**: Add `C:\poppler\Library\bin` to your Windows PATH
4. **Option B**: Or specify the path in code (see POPPLER_PATH variable below)

### Install Tesseract OCR (Windows):
1. Download installer: https://github.com/UB-Mannheim/tesseract/wiki
2. Run the installer (default location: `C:\Program Files\Tesseract-OCR`)
3. Add `C:\Program Files\Tesseract-OCR` to your Windows PATH
4. Or set `pytesseract.pytesseract.tesseract_cmd` in code

After installation, restart this notebook kernel and re-run the cells.

# CRC UDS Prototype (2025)
This notebook ingests:
- Scanned CRC reports (PDF) via OCR
- FOBT CSV extract

It outputs per-patient CRC numerator status for 2025 with an auditor-friendly evidence string.


In [1]:
import os, re, json, datetime as dt
import pandas as pd
from dateutil.relativedelta import relativedelta
import pytesseract
from pdf2image import convert_from_path


In [2]:
import glob, os

# Auto-discover CRC-related PDFs in /mnt/data
PDF_GLOBS = [
    "pdf_data/*colonoscopy*.pdf",
    "pdf_data/*_fit*.pdf",
    "pdf_data/*_ifobt*.pdf",
    "pdf_data/*fecal_ia*.pdf",
    "pdf_data/*fobt*.pdf",
]
PDF_PATHS = sorted({p for g in PDF_GLOBS for p in glob.glob(g)})

FOBT_CSV_PATH = "csv_data/fobt.csv"
REPORTING_YEAR = 2025

print(f"Found {len(PDF_PATHS)} PDFs:")
for p in PDF_PATHS:
    print(" -", os.path.basename(p))
print("FOBT CSV:", os.path.basename(FOBT_CSV_PATH), "exists:", os.path.exists(FOBT_CSV_PATH))

Found 17 PDFs:
 - 110656_colonoscopy.pdf
 - 118450_colonoscopy.pdf
 - 12969_colonoscopy.pdf
 - 12969_fecal_ia.pdf
 - 130349_fobt.pdf
 - 151177_ifobt.pdf
 - 165505_fit.pdf
 - 166003_ifobt.pdf
 - 166222_colonoscopy.pdf
 - 166372_colonoscopy.pdf
 - 181256_ifobt.pdf
 - 183636_colonoscopy.pdf
 - 184224_colonoscopy.pdf
 - 190147_ifobt.pdf
 - 2073_fit.pdf
 - 46874_fit.pdf
 - MR_163038_colonoscopy.pdf
FOBT CSV: fobt.csv exists: True


In [10]:
import os, re, datetime as dt
from dataclasses import dataclass
from typing import Optional, Literal, Dict, Any, List, Tuple

DATE_PAT = re.compile(r'(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})')

def _to_iso(m:int,d:int,y:int) -> Optional[str]:
    if y < 100:
        y = 2000 + y if y < 50 else 1900 + y
    try:
        return dt.date(y,m,d).isoformat()
    except Exception:
        return None

def parse_dates(text: str) -> List[Tuple[str, str]]:
    out=[]
    for m,d,y in DATE_PAT.findall(text or ""):
        iso = _to_iso(int(m), int(d), int(y))
        if iso:
            out.append((iso, f"{m}/{d}/{y}"))
    return out

def looks_like_nonclinical_line(line: str) -> bool:
    l=(line or "").lower()
    if "electronically signed" in l or "signed by" in l:
        return True
    if "dob" in l or "date of birth" in l:
        return True
    if "data:text" in l or "base64" in l:
        return True
    return False

def filename_patient_id(path: str) -> str:
    base=os.path.basename(path)
    m=re.match(r'^(\d+)', base)
    return m.group(1) if m else base

import pdfplumber
from pdf2image import convert_from_path
import pytesseract

KEYWORDS = ["colonoscopy","occult","fecal","fit","fobt","ifobt","colofit","guaiac"]

# POPPLER PATH: Uncomment and set this if poppler is not in your system PATH
POPPLER_PATH = r"C:\poppler\Library\bin"
#POPPLER_PATH = None  # Set to None to use system PATH

def extract_first_page_text(pdf_path: str, ocr_dpi: int = 220, min_chars: int = 80) -> str:
    """Try PDF text extraction first; fall back to OCR if content looks like a scan header."""
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if len(pdf.pages) > 0:
                text = (pdf.pages[0].extract_text() or "")
    except Exception:
        text = ""

    # If extracted text is long but lacks clinical keywords (often just Name/DOB headers), OCR anyway
    text_l = (text or "").lower()
    keyword_hit = any(k in text_l for k in KEYWORDS)
    
    # If we have decent text with keywords, use it. Otherwise try OCR
    if text and len(text.strip()) >= min_chars and keyword_hit:
        return text

    # OCR fallback - CRITICAL for scanned PDFs
    try:
        pages = convert_from_path(
            pdf_path, 
            dpi=ocr_dpi, 
            first_page=1, 
            last_page=1,
            poppler_path=POPPLER_PATH
        )
        if pages:
            ocr_text = pytesseract.image_to_string(pages[0], timeout=12)
            if ocr_text and len(ocr_text.strip()) > len(text.strip()):
                return ocr_text
    except Exception as e:
        # OCR failed - likely poppler not installed
        pass
    
    # Return whatever we got, even if minimal
    return text or ""

EventType = Literal["FOBT","FIT","FIT_DNA","SIGMOIDOSCOPY","CT_COLONOGRAPHY","COLONOSCOPY"]

@dataclass
class ScreeningEvent:
    patient_id: str
    event_type: EventType
    event_date: Optional[str]
    source: Literal["pdf","structured_csv"]
    confidence: float
    needs_review: bool
    evidence: Dict[str, Any]

def classify_doc(text: str) -> Optional[EventType]:
    t=(text or "").lower()
    if "colonoscopy" in t:
        return "COLONOSCOPY"
    if "occult blood" in t and "fecal" in t:
        return "FIT"  # includes IA (FIT)
    if "colofit" in t:
        return "FIT"
    if "\bfit\b" in t and "fecal" in t:
        return "FIT"
    if "ifobt" in t or "fobt" in t or "guaiac" in t:
        return "FOBT"
    return None

def extract_labeled_dates(text: str) -> Dict[str, List[str]]:
    labels = {"procedure":[],"collection":[],"received":[],"result":[],"other":[]}
    for line in (text or "").splitlines():
        if not line.strip():
            continue
        if looks_like_nonclinical_line(line):
            continue
        ll=line.lower()
        label="other"
        if re.search(r'procedure\s*date|date\s*of\s*procedure|date\s*of\s*service|\bdos\b', ll):
            label="procedure"
        elif re.search(r'collection\s*date|date\s*of\s*collection|\bcollected\b', ll):
            label="collection"
        elif re.search(r'\breceived\b', ll):
            label="received"
        elif re.search(r'\bresult\b|\breported\b|\bfinal\b', ll):
            label="result"
        if re.search(r'\b(at|on)\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', ll):
            label="collection"
        for iso,_ in parse_dates(line):
            labels[label].append(iso)
    for k,v in labels.items():
        seen=set(); nv=[]
        for d in v:
            if d not in seen:
                nv.append(d); seen.add(d)
        labels[k]=nv
    return labels

def is_reasonable_procedure_date(date_iso: str) -> bool:
    """Filter out dates that are clearly not procedure dates (e.g., DOBs from 1950s)"""
    try:
        year = int(date_iso[:4])
        # CRC screening procedures should be in a reasonable range
        # Allow 1995-present (30 years back) to be safe
        return 1995 <= year <= (dt.date.today().year + 1)
    except:
        return False

def choose_best_date(event_type: EventType, labeled: Dict[str,List[str]], all_dates: List[str]) -> Tuple[Optional[str], float, bool, str]:
    if event_type=="COLONOSCOPY":
        pref="procedure"
    elif event_type in ["FIT","FOBT"]:
        pref="collection"
    else:
        pref="other"

    candidates = labeled.get(pref, []).copy()
    if not candidates:
        candidates = labeled.get("result",[]) + labeled.get("received",[]) + labeled.get("other",[])
    if not candidates and all_dates:
        candidates = list(dict.fromkeys(all_dates))

    if not candidates:
        return None, 0.2, True, "no date found"

    # FIXED: Apply stricter date filtering to exclude DOB-era dates
    filtered = [d for d in candidates if is_reasonable_procedure_date(d)]
    
    if not filtered:
        # All dates were filtered out (probably DOBs)
        return None, 0.1, True, f"dates found but all appear to be DOBs or invalid: {candidates}"
    
    candidates = filtered

    best = max(candidates)
    conf = 0.9 if (labeled.get(pref) and best in labeled[pref]) else 0.65
    needs_review = False
    rationale = f"picked {best} from {pref if conf>0.8 else 'fallback'}"

    distinct = sorted(set(sum(labeled.values(), [])))
    if conf < 0.8 and len(distinct) >= 2:
        needs_review = True
        conf = min(conf, 0.55)
        rationale += "; multiple conflicting dates"

    return best, conf, needs_review, rationale

def extract_event_from_pdf(pdf_path: str, text: str) -> ScreeningEvent:
    pid = filename_patient_id(pdf_path)
    filename = os.path.basename(pdf_path).lower()
    
    # FIXED: Use filename as hint if text extraction failed
    et = classify_doc(text)
    if et is None:
        # Fallback to filename-based classification
        if "colonoscopy" in filename:
            et = "COLONOSCOPY"
        elif "fit" in filename and "dna" not in filename:
            et = "FIT"
        elif "ifobt" in filename or "fobt" in filename:
            et = "FOBT"
        elif "fecal" in filename:
            et = "FIT"
        else:
            et = "FOBT"  # Last resort default
    
    labeled = extract_labeled_dates(text)
    all_dates = [d for d,_ in parse_dates(text)]
    best_date, conf, needs_review, rationale = choose_best_date(et, labeled, all_dates)

    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    key = "colonoscopy" if et=="COLONOSCOPY" else ("occult" if et=="FIT" else "fobt")
    snippet = " ".join([ln for ln in lines if key in ln.lower()][:6])[:500]

    # If text was minimal (likely failed OCR), mark for review and note filename-based classification
    text_minimal = len(text.strip()) < 100
    if text_minimal:
        needs_review = True
        conf = min(conf, 0.6)
        if best_date:
            rationale += f"; text extraction minimal ({len(text.strip())} chars), classified by filename"
        else:
            rationale = f"text extraction minimal ({len(text.strip())} chars), classified by filename, no valid date"

    # If we found FIT/FOBT but no confident collection/procedure date, flag for review
    if et in ["FIT","FOBT"] and (best_date is None or conf < 0.8):
        needs_review = True

    return ScreeningEvent(
        patient_id=pid,
        event_type=et,
        event_date=best_date,
        source="pdf",
        confidence=float(conf),
        needs_review=bool(needs_review),
        evidence={
            "file": os.path.basename(pdf_path),
            "rationale": rationale,
            "snippet": snippet,
            "dates_by_label": labeled,
            "text_length": len(text),
        }
    )

In [4]:
# Extract events from PDFs (fast text extraction with OCR fallback)
pdf_rows=[]
for pdf_path in PDF_PATHS:
    text = extract_first_page_text(pdf_path)
    ev = extract_event_from_pdf(pdf_path, text)
    pdf_rows.append(ev.__dict__)

df_pdf = pd.DataFrame(pdf_rows)
df_pdf

Unnamed: 0,patient_id,event_type,event_date,source,confidence,needs_review,evidence
0,110656,COLONOSCOPY,,pdf,0.1,True,"{'file': '110656_colonoscopy.pdf', 'rationale'..."
1,118450,COLONOSCOPY,2022-12-22,pdf,0.65,False,"{'file': '118450_colonoscopy.pdf', 'rationale'..."
2,12969,COLONOSCOPY,,pdf,0.1,True,"{'file': '12969_colonoscopy.pdf', 'rationale':..."
3,12969,FIT,2025-04-03,pdf,0.65,True,"{'file': '12969_fecal_ia.pdf', 'rationale': 'p..."
4,130349,FOBT,,pdf,0.1,True,"{'file': '130349_fobt.pdf', 'rationale': 'date..."
5,151177,FOBT,2025-01-31,pdf,0.65,True,"{'file': '151177_ifobt.pdf', 'rationale': 'pic..."
6,165505,FIT,,pdf,0.1,True,"{'file': '165505_fit.pdf', 'rationale': 'text ..."
7,166003,FIT,2025-04-17,pdf,0.65,True,"{'file': '166003_ifobt.pdf', 'rationale': 'pic..."
8,166222,COLONOSCOPY,2019-05-07,pdf,0.65,False,"{'file': '166222_colonoscopy.pdf', 'rationale'..."
9,166372,COLONOSCOPY,2017-04-17,pdf,0.65,False,"{'file': '166372_colonoscopy.pdf', 'rationale'..."


In [5]:
# Parse FOBT CSV extract and convert to ScreeningEvent rows
import pandas as pd
import numpy as np
import datetime as dt

def detect_header_row(csv_path: str, max_scan: int = 40) -> int:
    with open(csv_path, 'r', errors='ignore') as f:
        for i,line in enumerate(f):
            if i>max_scan:
                break
            l=line.lower()
            if 'person' in l and ('nbr' in l or '#' in l) and ('mrn' in l or 'enc' in l or 'date' in l):
                return i
    return 0

hdr = detect_header_row(FOBT_CSV_PATH)
df_csv = pd.read_csv(FOBT_CSV_PATH, header=hdr)
df_csv.columns = [c.strip() for c in df_csv.columns]

# Identify key columns
def find_col(cols, patterns):
    for p in patterns:
        for c in cols:
            if re.search(p, c, re.I):
                return c
    return None

pid_col = find_col(df_csv.columns, [r'person\s*nbr', r'person\s*#', r'patient'])
encdate_col = find_col(df_csv.columns, [r'enc\s*date', r'collection\s*date', r'collected', r'date'])
test_col = find_col(df_csv.columns, [r'test', r'order', r'procedure', r'description', r'name'])

def parse_any_date(x):
    if pd.isna(x): return None
    s=str(x).strip()
    # try mm/dd/yyyy
    m=DATE_PAT.search(s)
    if not m: 
        return None
    mm,dd,yy = map(int, m.groups())
    iso=_to_iso(mm,dd,yy)
    return iso

csv_events=[]
for idx,row in df_csv.iterrows():
    pid = str(row.get(pid_col, '')).strip()
    if not pid or pid.lower()=='nan':
        continue
    ev_date = parse_any_date(row.get(encdate_col, None))
    if not ev_date:
        continue
    test_name = str(row.get(test_col, '')).lower() if test_col else ''
    et = 'FIT' if 'fit' in test_name and 'dna' not in test_name else 'FOBT'
    csv_events.append({
        "patient_id": pid,
        "event_type": et,
        "event_date": ev_date,
        "source": "structured_csv",
        "confidence": 0.95,
        "needs_review": False,
        "evidence": {"source_file": os.path.basename(FOBT_CSV_PATH), "row_index": int(idx), "test_name": test_name}
    })

df_csv_events = pd.DataFrame(csv_events)
df_csv_events.head(), df_csv_events.shape

(  patient_id event_type  event_date          source  confidence  needs_review  \
 0   173232.0       FOBT  2025-04-28  structured_csv        0.95         False   
 1     9816.0       FOBT  2025-10-27  structured_csv        0.95         False   
 2    64114.0       FOBT  2025-07-17  structured_csv        0.95         False   
 3    49260.0       FOBT  2025-09-04  structured_csv        0.95         False   
 4   163237.0       FOBT  2025-07-31  structured_csv        0.95         False   
 
                                             evidence  
 0  {'source_file': 'fobt.csv', 'row_index': 0, 't...  
 1  {'source_file': 'fobt.csv', 'row_index': 1, 't...  
 2  {'source_file': 'fobt.csv', 'row_index': 2, 't...  
 3  {'source_file': 'fobt.csv', 'row_index': 3, 't...  
 4  {'source_file': 'fobt.csv', 'row_index': 4, 't...  ,
 (10, 7))

In [6]:
# CRC 2025 rule engine + merge, de-dup, and scoring
import datetime as dt
from dateutil.relativedelta import relativedelta

year_start = dt.date(REPORTING_YEAR,1,1)
year_end = dt.date(REPORTING_YEAR,12,31)

def counts_for_crc(event_type: str, event_date_iso: str) -> bool:
    if not event_date_iso:
        return False
    d = dt.date.fromisoformat(event_date_iso)
    if event_type in ["FOBT","FIT"]:
        return year_start <= d <= year_end
    if event_type == "FIT_DNA":
        return (year_start - relativedelta(years=2)) <= d <= year_end
    if event_type in ["SIGMOIDOSCOPY","CT_COLONOGRAPHY"]:
        return (year_start - relativedelta(years=4)) <= d <= year_end
    if event_type == "COLONOSCOPY":
        return (year_start - relativedelta(years=9)) <= d <= year_end
    return False

# Merge
df_all = pd.concat([
    df_pdf[["patient_id","event_type","event_date","source","confidence","needs_review","evidence"]],
    df_csv_events[["patient_id","event_type","event_date","source","confidence","needs_review","evidence"]],
], ignore_index=True)

# De-dup: keep highest confidence for same patient/type/date/source
df_all["dedup_key"] = df_all["patient_id"].astype(str)+"|"+df_all["event_type"].astype(str)+"|"+df_all["event_date"].astype(str)
df_all = (df_all
          .sort_values(["confidence","source"], ascending=[False, True])
          .drop_duplicates("dedup_key", keep="first")
          .drop(columns=["dedup_key"])
         )

df_all["counts_crc_2025"] = df_all.apply(lambda r: counts_for_crc(r["event_type"], r["event_date"]), axis=1)
df_all.sort_values(["patient_id","counts_crc_2025","event_date"], ascending=[True, False, False]).head(30)

Unnamed: 0,patient_id,event_type,event_date,source,confidence,needs_review,evidence,counts_crc_2025
24,103890.0,FOBT,2025-12-10,structured_csv,0.95,False,"{'source_file': 'fobt.csv', 'row_index': 7, 't...",True
0,110656,COLONOSCOPY,,pdf,0.1,True,"{'file': '110656_colonoscopy.pdf', 'rationale'...",False
26,116271.0,FOBT,2025-05-13,structured_csv,0.95,False,"{'source_file': 'fobt.csv', 'row_index': 9, 't...",True
1,118450,COLONOSCOPY,2022-12-22,pdf,0.65,False,"{'file': '118450_colonoscopy.pdf', 'rationale'...",True
3,12969,FIT,2025-04-03,pdf,0.65,True,"{'file': '12969_fecal_ia.pdf', 'rationale': 'p...",True
2,12969,COLONOSCOPY,,pdf,0.1,True,"{'file': '12969_colonoscopy.pdf', 'rationale':...",False
4,130349,FOBT,,pdf,0.1,True,"{'file': '130349_fobt.pdf', 'rationale': 'date...",False
23,130349.0,FOBT,2025-04-21,structured_csv,0.95,False,"{'source_file': 'fobt.csv', 'row_index': 6, 't...",True
5,151177,FOBT,2025-01-31,pdf,0.65,True,"{'file': '151177_ifobt.pdf', 'rationale': 'pic...",True
25,157903.0,FOBT,2025-11-17,structured_csv,0.95,False,"{'source_file': 'fobt.csv', 'row_index': 8, 't...",True


In [7]:
# Pick best numerator evidence per patient (most recent qualifying; tie-break by confidence)
def pick_best(df):
    q = df[df["counts_crc_2025"]==True].copy()
    if q.empty:
        # keep best available for review (most recent date regardless) if any
        df2 = df.copy()
        df2 = df2[df2["event_date"].notna()]
        if df2.empty:
            return pd.Series({
                "numerator_met": False,
                "best_event_type": None,
                "best_event_date": None,
                "best_source": None,
                "best_confidence": None,
                "needs_review": True,
                "evidence_summary": "no dated CRC evidence found",
            })
        df2 = df2.sort_values(["event_date","confidence"], ascending=[False,False]).head(1)
        row = df2.iloc[0]
        return pd.Series({
            "numerator_met": False,
            "best_event_type": row.event_type,
            "best_event_date": row.event_date,
            "best_source": row.source,
            "best_confidence": row.confidence,
            "needs_review": True,
            "evidence_summary": f"Best non-qualifying evidence: {row.event_type} {row.event_date} ({row.source}); {row.evidence.get('file', row.evidence.get('source_file',''))}",
        })
    q = q.sort_values(["event_date","confidence"], ascending=[False,False]).head(1)
    row = q.iloc[0]
    file_or_row = row.evidence.get('file', row.evidence.get('source_file',''))
    snippet = row.evidence.get('snippet','')
    summary = f"{row.event_type} on {row.event_date} via {row.source} ({file_or_row})"
    if snippet:
        summary += f" | snippet: {snippet[:160]}"
    return pd.Series({
        "numerator_met": True,
        "best_event_type": row.event_type,
        "best_event_date": row.event_date,
        "best_source": row.source,
        "best_confidence": row.confidence,
        "needs_review": bool(row.needs_review),
        "evidence_summary": summary,
    })

df_best = df_all.groupby("patient_id", as_index=False).apply(pick_best).reset_index(drop=True)
df_best = df_best.sort_values(["numerator_met","needs_review","best_event_date"], ascending=[False, True, False])

# Save auditor table
out_csv = "audit_data/crc_2025_audit_table_updated.csv"
df_best.to_csv(out_csv, index=False)
out_csv, df_best.head(50)

  df_best = df_all.groupby("patient_id", as_index=False).apply(pick_best).reset_index(drop=True)


('audit_data/crc_2025_audit_table_updated.csv',
                    patient_id  numerator_met best_event_type best_event_date  \
 24  MR_163038_colonoscopy.pdf           True     COLONOSCOPY      2025-12-31   
 0                    103890.0           True            FOBT      2025-12-10   
 9                    163237.0           True            FOBT      2025-12-01   
 8                    157903.0           True            FOBT      2025-11-17   
 23                     9816.0           True            FOBT      2025-10-27   
 21                    49260.0           True            FOBT      2025-09-04   
 22                    64114.0           True            FOBT      2025-07-17   
 2                    116271.0           True            FOBT      2025-05-13   
 14                   173232.0           True            FOBT      2025-04-28   
 17                     184224           True     COLONOSCOPY      2025-04-22   
 6                    130349.0           True            FOBT

In [11]:
# DEBUG: Check what pdfplumber extracts vs what's actually in the PDF
test_pdf = "pdf_data/110656_colonoscopy.pdf"

print("="*80)
print("PDFPLUMBER EXTRACTION:")
print("="*80)
try:
    with pdfplumber.open(test_pdf) as pdf:
        print(f"Total pages: {len(pdf.pages)}")
        for i, page in enumerate(pdf.pages[:3]):  # Check first 3 pages
            text = page.extract_text()
            print(f"\n--- Page {i+1} (length: {len(text) if text else 0}) ---")
            print(text[:1000] if text else "No text extracted")
            print()
except Exception as e:
    print(f"Error: {e}")

print("\n" + "="*80)
print(f"TRYING OCR with POPPLER_PATH={POPPLER_PATH}:")
print("="*80)
try:
    pages = convert_from_path(test_pdf, dpi=150, first_page=1, last_page=1, poppler_path=POPPLER_PATH)
    if pages:
        ocr_text = pytesseract.image_to_string(pages[0])
        print(f"✓ OCR SUCCESS! Extracted {len(ocr_text)} chars")
        print(f"OCR text (first 1000 chars):\n{ocr_text[:1000]}")
except Exception as e:
    print(f"✗ OCR failed: {e}")

PDFPLUMBER EXTRACTION:
Total pages: 5

--- Page 1 (length: 46) ---
Name: HERNANDEZ, ARMANDO DOB: 09/27/1957 Date:


--- Page 2 (length: 46) ---
Name: HERNANDEZ, ARMANDO DOB: 09/27/1957 Date:


--- Page 3 (length: 46) ---
Name: HERNANDEZ, ARMANDO DOB: 09/27/1957 Date:


TRYING OCR with POPPLER_PATH=C:\poppler\Library\bin:
✗ OCR failed: tesseract is not installed or it's not in your PATH. See README file for more information.
