# Breast Cancer Screening UDS Prototype (2025)

This notebook ingests:
- Scanned mammogram reports (PDF) via OCR
- Breast imaging CSV extract (mammograms, tomosynthesis)

It outputs per-patient breast cancer screening numerator status for 2025 with an auditor-friendly evidence string.

## UDS 2025 Breast Cancer Screening Rules:
- **Mammogram (including digital tomosynthesis)**: performed within the 2-year lookback window, i.e. calendar years 2024-2025 (2024-01-01 through 2025-12-31)
- Age range: Typically 50-74 years (verify with your UDS specifications)
- Includes: screening mammogram, diagnostic mammogram, digital breast tomosynthesis (DBT)

In [None]:
import os, re, json, datetime as dt
import pandas as pd
from dateutil.relativedelta import relativedelta
import pytesseract
from pdf2image import convert_from_path

In [None]:
import glob, os

# Locate breast cancer screening PDFs by filename pattern.
PDF_GLOBS = [
    "pdf_data/*mammo*.pdf",
    "pdf_data/*mammogram*.pdf",
    "pdf_data/*breast*.pdf",
    "pdf_data/*tomosynthesis*.pdf",
    "pdf_data/*dbt*.pdf",
]
# One file can match several patterns; the set union drops the duplicates and
# sorting gives a stable processing order.
PDF_PATHS = sorted(set().union(*map(glob.glob, PDF_GLOBS)))

# Structured extract and the reporting year used by the rule engine.
BREAST_CSV_PATH = "csv_data/breast_screening.csv"
REPORTING_YEAR = 2025

# Report what was discovered so a missing input is visible immediately.
print(f"Found {len(PDF_PATHS)} PDFs:")
for p in PDF_PATHS:
    print(" -", os.path.basename(p))
print("Breast screening CSV:", os.path.basename(BREAST_CSV_PATH), "exists:", os.path.exists(BREAST_CSV_PATH))

In [1]:
import os, re, datetime as dt
from dataclasses import dataclass
from typing import Optional, Literal, Dict, Any, List, Tuple

DATE_PAT = re.compile(r'(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})')
# Match dates with month names like "April 02, 2018" or "April 2, 2018"
MONTH_NAME_PAT = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\.?\s+(\d{1,2}),?\s+(\d{4})\b', re.IGNORECASE)

def _to_iso(m:int,d:int,y:int) -> Optional[str]:
    if y < 100:
        y = 2000 + y if y < 50 else 1900 + y
    try:
        return dt.date(y,m,d).isoformat()
    except Exception:
        return None

def parse_dates(text: str) -> List[Tuple[str, str]]:
    out=[]
    # Extract numeric dates like 4/2/2018
    for m,d,y in DATE_PAT.findall(text or ""):
        iso = _to_iso(int(m), int(d), int(y))
        if iso:
            out.append((iso, f"{m}/{d}/{y}"))
    
    # Extract dates with month names like "April 02, 2018"
    month_map = {
        'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3,
        'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6,
        'jul': 7, 'july': 7, 'aug': 8, 'august': 8, 'sep': 9, 'sept': 9, 'september': 9,
        'oct': 10, 'october': 10, 'nov': 11, 'november': 11, 'dec': 12, 'december': 12
    }
    for month_name, day, year in MONTH_NAME_PAT.findall(text or ""):
        month_num = month_map.get(month_name.lower())
        if month_num:
            iso = _to_iso(month_num, int(day), int(year))
            if iso:
                out.append((iso, f"{month_name} {day}, {year}"))
    
    return out

def looks_like_nonclinical_line(line: str) -> bool:
    """Return True when *line* carries a signature, birth-date or embedded-data marker.

    The check is a case-insensitive substring match, so a marker also matches
    inside a longer word.  None or empty input returns False.
    """
    markers = (
        "electronically signed", "signed by",   # signature lines
        "dob", "date of birth",                 # patient birth date
        "data:text", "base64",                  # embedded data payloads
    )
    lowered = (line or "").lower()
    return any(marker in lowered for marker in markers)

def filename_patient_id(path: str) -> str:
    """Return the leading digits of the file name, or the whole file name if it has none."""
    name = os.path.basename(path)
    leading_digits = re.match(r'^(\d+)', name)
    if leading_digits is None:
        return name
    return leading_digits.group(1)

import pdfplumber
from pdf2image import convert_from_path
import pytesseract

# Lower-case terms whose presence marks extracted text as breast-imaging content.
KEYWORDS = ["mammogram","mammography","breast","tomosynthesis","dbt","screening","diagnostic","birads","bi-rads"]

# POPPLER PATH: directory holding the poppler binaries used by pdf2image.
# Defaults to the original Windows install location.  Set the POPPLER_PATH
# environment variable to override it; set it to an empty string to use the
# poppler found on the system PATH (POPPLER_PATH then becomes None).
POPPLER_PATH = os.environ.get("POPPLER_PATH", r"C:\poppler\Library\bin") or None

# TESSERACT PATH: path to the tesseract executable.  Override with the
# TESSERACT_CMD environment variable; otherwise the original install
# location is used.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or r"C:\Users\jloya\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
)

def extract_first_page_text(pdf_path: str, ocr_dpi: int = 300, min_chars: int = 80, max_pages: int = 5) -> str:
    """Extract text from the first pages of a PDF, falling back to OCR for scans.

    The embedded text layer is read page by page with pdfplumber.  As soon as
    the accumulated text has at least *min_chars* characters and contains one
    of KEYWORDS it is returned.  Otherwise the pages are rendered and OCR'd
    one at a time under the same stopping rule.

    Args:
        pdf_path: Path of the PDF to read.
        ocr_dpi: Resolution used when rendering pages for OCR.
        min_chars: Minimum stripped length for text to count as usable.
        max_pages: Maximum number of pages to read.  The OCR pass is
            additionally capped at 14 pages.

    Returns:
        The first text that satisfies the stopping rule; otherwise whichever
        of the OCR text and the embedded text is longer (possibly "").
        Extraction errors are logged and never raised.
    """
    import logging

    log = logging.getLogger(__name__)

    def _is_usable(candidate: str) -> bool:
        # Long enough and mentions at least one breast-imaging keyword.
        lowered = candidate.lower()
        return len(candidate.strip()) >= min_chars and any(k in lowered for k in KEYWORDS)

    all_text = ""

    # Pass 1: embedded text layer.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages_to_check = min(max_pages, len(pdf.pages))
            for page_num in range(pages_to_check):
                all_text += (pdf.pages[page_num].extract_text() or "") + "\n"
                if _is_usable(all_text):
                    return all_text
    except Exception as exc:
        # Best effort: an unreadable text layer must not stop the OCR pass.
        log.warning("pdfplumber extraction failed for %s: %s", pdf_path, exc)

    if _is_usable(all_text):
        return all_text

    # Pass 2: OCR, one page at a time so we can stop at the first usable page.
    ocr_combined = ""
    for page_num in range(1, min(max_pages + 1, 15)):
        try:
            pages = convert_from_path(
                pdf_path,
                dpi=ocr_dpi,
                first_page=page_num,
                last_page=page_num,
                poppler_path=POPPLER_PATH
            )
            if not pages:
                continue
            ocr_combined += pytesseract.image_to_string(pages[0], timeout=12) + "\n"
        except Exception as exc:
            # Stop OCR on the first failure (missing poppler/tesseract,
            # timeout, page out of range) but keep the pages already read.
            log.warning("OCR failed on page %d of %s: %s", page_num, pdf_path, exc)
            break
        if _is_usable(ocr_combined):
            return ocr_combined

    # Nothing met the stopping rule: return the longer of the two attempts.
    if len(ocr_combined.strip()) > len(all_text.strip()):
        return ocr_combined
    return all_text

# Closed set of breast-imaging event categories used throughout this notebook.
EventType = Literal["SCREENING_MAMMOGRAM","DIAGNOSTIC_MAMMOGRAM","TOMOSYNTHESIS"]

@dataclass
class ScreeningEvent:
    """One breast-imaging event for a patient, extracted from a PDF or a CSV row."""
    # Patient identifier; for PDFs it is derived from the file name.
    patient_id: str
    # Category of imaging event (see EventType).
    event_type: EventType
    # Event date as an ISO 'YYYY-MM-DD' string, or None when no usable date was found.
    event_date: Optional[str]
    # Where the event came from.
    source: Literal["pdf","structured_csv"]
    # Heuristic trust score for the extracted date (e.g. 0.9 for a labeled
    # exam date from a PDF, 0.95 for a structured CSV row).
    confidence: float
    # True when a human should verify the event before relying on it.
    needs_review: bool
    # Supporting details for auditors: source file name, rationale, text
    # snippet, etc.  The keys differ between PDF and CSV events.
    evidence: Dict[str, Any]

def classify_doc(text: str) -> Optional["EventType"]:
    """Classify a breast-imaging document from its text.

    Args:
        text: Extracted document text; None is treated as empty.

    Returns:
        "TOMOSYNTHESIS", "SCREENING_MAMMOGRAM" or "DIAGNOSTIC_MAMMOGRAM", or
        None when the text does not look like breast imaging at all.
    """
    t=(text or "").lower()

    # Check for tomosynthesis/DBT first (most specific).  The "3d mammo"
    # alternative accepts any word ending, so "3D mammogram" and
    # "3D mammography" are recognised as well as the bare "3D mammo".
    if re.search(r'\b(tomosynthesis|dbt|3d\s*mammo\w*)\b', t):
        return "TOMOSYNTHESIS"

    # Check for mammogram type
    if "mammogram" in t or "mammography" in t:
        # Screening wording is checked first, so a document that mentions
        # both screening and diagnostic terms is classified as screening.
        if re.search(r'\b(screening|screen|routine|annual)\b', t):
            return "SCREENING_MAMMOGRAM"
        if re.search(r'\b(diagnostic|diagnosis|follow[\s-]?up|callback|additional)\b', t):
            return "DIAGNOSTIC_MAMMOGRAM"
        # Default to screening if type unclear
        return "SCREENING_MAMMOGRAM"

    # General breast imaging mention
    if "breast" in t and ("imaging" in t or "radiology" in t or "exam" in t):
        return "SCREENING_MAMMOGRAM"

    return None

def extract_labeled_dates(text: str) -> Dict[str, List[str]]:
    """Group the dates found in *text* by the label of the line they appear on.

    Blank lines and lines flagged by looks_like_nonclinical_line are skipped.
    Every remaining line gets exactly one label, and all dates on that line
    are filed under it.

    Returns:
        A dict with the keys "exam", "procedure", "study", "received",
        "result" and "other", each mapping to ISO dates in order of first
        appearance with duplicates removed.
    """
    # First matching rule wins; lines matching none are labeled "other".
    label_rules = (
        ("exam", r'exam\s*date|date\s*of\s*exam'),
        ("procedure", r'procedure\s*date|date\s*of\s*procedure|date\s*of\s*service|\bdos\b'),
        ("study", r'study\s*date|date\s*of\s*study'),
        ("received", r'\breceived\b'),
        ("result", r'\bresult\b|\breported\b|\bfinal\b'),
    )
    # Pattern for "performed MM/DD/YYYY" or "on MM/DD/YYYY".
    # NOTE(review): this override also fires on lines such as
    # "received on 01/02/2024", relabeling them as exam dates -- confirm
    # that this is intended.
    performed_on = r'\b(performed|on)\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'

    buckets = {"exam":[],"procedure":[],"study":[],"received":[],"result":[],"other":[]}
    for raw_line in (text or "").splitlines():
        if not raw_line.strip() or looks_like_nonclinical_line(raw_line):
            continue
        lowered = raw_line.lower()
        bucket = next(
            (name for name, pattern in label_rules if re.search(pattern, lowered)),
            "other",
        )
        if re.search(performed_on, lowered):
            bucket = "exam"
        buckets[bucket].extend(iso for iso, _ in parse_dates(raw_line))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return {name: list(dict.fromkeys(dates)) for name, dates in buckets.items()}

def is_reasonable_procedure_date(date_iso: str, min_year: int = 1995) -> bool:
    """Return True when the year of *date_iso* is plausible for a procedure.

    Used to filter out dates that are clearly not procedure dates, such as
    birth dates from the 1950s.

    Args:
        date_iso: Date string whose first four characters are the year.
        min_year: Earliest acceptable year (inclusive).

    Returns:
        True when min_year <= year <= next calendar year.  False otherwise,
        including when *date_iso* is None, empty or does not start with a
        number.
    """
    try:
        year = int(date_iso[:4])
    except (TypeError, ValueError):
        # None / non-string input, or a prefix that is not a number.
        return False
    # One year of slack past today tolerates reports dated just after a
    # year boundary.
    return min_year <= year <= (dt.date.today().year + 1)

def choose_best_date(event_type: EventType, labeled: Dict[str,List[str]], all_dates: List[str]) -> Tuple[Optional[str], float, bool, str]:
    """Pick the most likely procedure date for a document.

    Candidates come from the first non-empty source in this order: "exam",
    "procedure", "study" labeled dates; then "result" + "received" + "other"
    labeled dates; then *all_dates*.  Implausible years are filtered out and
    the latest remaining date is chosen.

    Args:
        event_type: Event category; currently not used in the decision.
        labeled: Dates grouped by line label.
        all_dates: Every date found in the document, used only when no
            labeled date exists.

    Returns:
        (date, confidence, needs_review, rationale).  *date* is None when no
        date was found or none was plausible.  Confidence is 0.9 for an
        exam/procedure/study date and 0.65 for a fallback date, lowered to
        0.55 and flagged for review when the fallback competes with other
        labeled dates.
    """
    preferred_labels = ("exam", "procedure", "study")

    # Remember which label supplied the candidates so the rationale is accurate.
    picked_from = "fallback"
    candidates: List[str] = []
    for label in preferred_labels:
        if labeled.get(label):
            picked_from = label
            candidates = list(labeled[label])
            break
    if not candidates:
        candidates = labeled.get("result",[]) + labeled.get("received",[]) + labeled.get("other",[])
    if not candidates and all_dates:
        candidates = list(dict.fromkeys(all_dates))

    if not candidates:
        return None, 0.2, True, "no date found"

    # Apply stricter date filtering to exclude DOB-era dates
    plausible = [d for d in candidates if is_reasonable_procedure_date(d)]
    if not plausible:
        # All dates were filtered out (probably DOBs)
        return None, 0.1, True, f"dates found but all appear to be DOBs or invalid: {candidates}"

    # ISO strings sort chronologically, so max() is the most recent date.
    best = max(plausible)
    conf = 0.9 if picked_from in preferred_labels else 0.65
    needs_review = False
    rationale = f"picked {best} from {picked_from}"

    distinct = {d for dates in labeled.values() for d in dates}
    if conf < 0.8 and len(distinct) >= 2:
        needs_review = True
        conf = min(conf, 0.55)
        rationale += "; multiple conflicting dates"

    return best, conf, needs_review, rationale

def extract_event_from_pdf(pdf_path: str, text: str) -> ScreeningEvent:
    """Build a ScreeningEvent from one PDF's path and extracted text.

    The event type comes from the text when classify_doc recognises it,
    otherwise from keywords in the file name (defaulting to a screening
    mammogram).  The date, confidence and rationale come from
    choose_best_date.

    Args:
        pdf_path: Path of the source PDF; the patient id is taken from its
            file name.
        text: Text extracted from the PDF; None is treated as empty.

    Returns:
        A ScreeningEvent with source "pdf".  It is flagged for review when
        the text is shorter than 100 characters, no date was chosen, or the
        confidence is below 0.8.
    """
    text = text or ""
    stripped_len = len(text.strip())
    pid = filename_patient_id(pdf_path)
    filename = os.path.basename(pdf_path).lower()

    # Try to classify from text content first
    et = classify_doc(text)
    classified_by = "text content"
    if et is None:
        # Fallback to filename-based classification
        classified_by = "filename"
        if "tomosynthesis" in filename or "dbt" in filename:
            et = "TOMOSYNTHESIS"
        elif "diagnostic" in filename:
            et = "DIAGNOSTIC_MAMMOGRAM"
        else:
            # "mammo"/"breast" file names and anything else: default assumption
            et = "SCREENING_MAMMOGRAM"

    labeled = extract_labeled_dates(text)
    all_dates = [d for d,_ in parse_dates(text)]
    best_date, conf, needs_review, rationale = choose_best_date(et, labeled, all_dates)

    # Evidence snippet: the first 8 lines that mention a clinical keyword,
    # or the start of the text when no line does.
    keys = ["mammogram", "mammography", "breast", "tomosynthesis", "screening", "diagnostic",
            "birads", "bi-rads", "impression", "findings", "exam", "study", "procedure"]
    snippet_lines = []
    for ln in (raw.strip() for raw in text.splitlines()):
        if ln and any(k in ln.lower() for k in keys):
            snippet_lines.append(ln)
            if len(snippet_lines) >= 8:
                break
    snippet = (" ".join(snippet_lines) or text[:600]).replace(chr(10), " ")

    # If text was minimal (likely failed OCR), mark for review and record how
    # the document was actually classified.
    if stripped_len < 100:
        needs_review = True
        conf = min(conf, 0.6)
        if best_date:
            rationale += f"; text extraction minimal ({stripped_len} chars), classified by {classified_by}"
        else:
            rationale = f"text extraction minimal ({stripped_len} chars), classified by {classified_by}, no valid date"

    # If no confident exam date, flag for review
    if best_date is None or conf < 0.8:
        needs_review = True

    return ScreeningEvent(
        patient_id=pid,
        event_type=et,
        event_date=best_date,
        source="pdf",
        confidence=float(conf),
        needs_review=bool(needs_review),
        evidence={
            "file": os.path.basename(pdf_path),
            "rationale": rationale,
            "snippet": snippet,
            "dates_by_label": labeled,
            "text_length": len(text),
        }
    )

In [2]:
# Extract events from PDFs (fast text extraction with OCR fallback)
pdf_rows=[]
for pdf_path in PDF_PATHS:
    text = extract_first_page_text(pdf_path)
    ev = extract_event_from_pdf(pdf_path, text)
    pdf_rows.append(ev.__dict__)

# Explicit columns keep the event schema even when no PDFs were found, so
# later cells can still select these columns from an empty frame.
df_pdf = pd.DataFrame(
    pdf_rows,
    columns=["patient_id","event_type","event_date","source","confidence","needs_review","evidence"],
)
df_pdf

NameError: name 'PDF_PATHS' is not defined

In [None]:
# Parse breast screening CSV extract and convert to ScreeningEvent rows
import pandas as pd
import numpy as np
import datetime as dt

def detect_header_row(csv_path: str, max_scan: int = 40) -> int:
    """Return the 0-based index of the first line that looks like the column header.

    A line qualifies when, case-insensitively, it contains "person", one of
    "nbr" / "#", and one of "mrn" / "enc" / "date".  Only lines with index up
    to and including *max_scan* are examined; 0 is returned when none
    qualifies.
    """
    with open(csv_path, 'r', errors='ignore') as handle:
        for line_no, raw in enumerate(handle):
            if line_no > max_scan:
                return 0
            lowered = raw.lower()
            has_person = 'person' in lowered
            has_id_marker = 'nbr' in lowered or '#' in lowered
            has_other_field = any(tok in lowered for tok in ('mrn', 'enc', 'date'))
            if has_person and has_id_marker and has_other_field:
                return line_no
    return 0

# Parse the breast screening CSV (when present) into ScreeningEvent-shaped rows.
csv_event_columns = ["patient_id","event_type","event_date","source","confidence","needs_review","evidence"]

def find_col(cols, patterns):
    """Return the first column matching the earliest pattern (case-insensitive), or None."""
    for p in patterns:
        for c in cols:
            if re.search(p, c, re.I):
                return c
    return None

def parse_any_date(x):
    """Return the first date in a cell as ISO 'YYYY-MM-DD', or None.

    A value starting with an ISO date (YYYY-MM-DD) is read as such; any
    other value is searched for a numeric month/day/year date.
    """
    if pd.isna(x):
        return None
    s = str(x).strip()
    # ISO first: otherwise the numeric pattern could latch onto the tail of
    # an ISO date and misread it.
    iso_match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)', s)
    if iso_match:
        yy, mm, dd = map(int, iso_match.groups())
        return _to_iso(mm, dd, yy)
    m = DATE_PAT.search(s)
    if not m:
        return None
    mm, dd, yy = map(int, m.groups())
    return _to_iso(mm, dd, yy)

csv_events = []
# Check if CSV exists before processing
if not os.path.exists(BREAST_CSV_PATH):
    print(f"⚠️ CSV file not found: {BREAST_CSV_PATH}")
    print("Creating empty dataframe for CSV events")
else:
    hdr = detect_header_row(BREAST_CSV_PATH)
    # skip_blank_lines=False makes pandas count physical lines, matching the
    # raw line index from detect_header_row.  dtype=str keeps identifiers
    # exactly as written (no "123.0", no lost leading zeros).
    df_csv = pd.read_csv(BREAST_CSV_PATH, header=hdr, skip_blank_lines=False, dtype=str)
    df_csv.columns = [str(c).strip() for c in df_csv.columns]

    # Identify key columns
    pid_col = find_col(df_csv.columns, [r'person\s*nbr', r'person\s*#', r'patient'])
    encdate_col = find_col(df_csv.columns, [r'enc\s*date', r'exam\s*date', r'study\s*date', r'procedure\s*date', r'date'])
    test_col = find_col(df_csv.columns, [r'test', r'order', r'procedure', r'description', r'name', r'exam'])

    if pid_col is None or encdate_col is None:
        # Without these columns no row can become an event; say so instead
        # of silently producing nothing.
        print(f"⚠️ Could not identify patient/date columns in {BREAST_CSV_PATH}: {list(df_csv.columns)}")
    else:
        for idx,row in df_csv.iterrows():
            pid = str(row.get(pid_col, '')).strip()
            if not pid or pid.lower()=='nan':
                continue
            ev_date = parse_any_date(row.get(encdate_col, None))
            if not ev_date:
                continue
            test_name = str(row.get(test_col, '')).lower() if test_col else ''

            # Classify based on test name
            if 'tomosynthesis' in test_name or 'dbt' in test_name or '3d' in test_name:
                et = 'TOMOSYNTHESIS'
            elif 'diagnostic' in test_name:
                et = 'DIAGNOSTIC_MAMMOGRAM'
            else:
                et = 'SCREENING_MAMMOGRAM'

            csv_events.append({
                "patient_id": pid,
                "event_type": et,
                "event_date": ev_date,
                "source": "structured_csv",
                "confidence": 0.95,
                "needs_review": False,
                "evidence": {"source_file": os.path.basename(BREAST_CSV_PATH), "row_index": int(idx), "test_name": test_name}
            })

# Explicit columns keep the schema even when no rows qualified.
df_csv_events = pd.DataFrame(csv_events, columns=csv_event_columns)

df_csv_events.head() if not df_csv_events.empty else df_csv_events, df_csv_events.shape

In [None]:
# Breast Cancer Screening 2025 rule engine + merge, de-dup, and scoring
import datetime as dt
from dateutil.relativedelta import relativedelta

# First and last day of the reporting year.
year_start = dt.date(REPORTING_YEAR,1,1)
year_end = dt.date(REPORTING_YEAR,12,31)

def counts_for_breast_screening(event_type: str, event_date_iso: str) -> bool:
    """Return True when the event date falls inside the screening lookback window.

    The window runs from January 1 of the year before the reporting year
    through December 31 of the reporting year (2024-01-01 through 2025-12-31
    for 2025).  All event types (screening, diagnostic, tomosynthesis) are
    treated alike; *event_type* is not used in the decision.

    Args:
        event_type: Event category (currently unused).
        event_date_iso: Event date as 'YYYY-MM-DD'.

    Returns:
        False for a missing, non-string or malformed date; otherwise whether
        the date lies in the window (inclusive at both ends).
    """
    # Missing dates can arrive as None or as NaN after frames are merged.
    if not isinstance(event_date_iso, str) or not event_date_iso:
        return False
    try:
        d = dt.date.fromisoformat(event_date_iso)
    except ValueError:
        return False

    # NOTE(review): an earlier comment here mentioned a 27-month window, but
    # the code applies a 24-month one -- confirm against the UDS specification.
    return (year_start - relativedelta(years=1)) <= d <= year_end

# Merge PDF and CSV events
event_columns = ["patient_id","event_type","event_date","source","confidence","needs_review","evidence"]
# reindex supplies any missing columns (an empty frame may have none), and
# empty frames are left out of the concat so they cannot disturb dtypes.
event_frames = [
    frame.reindex(columns=event_columns)
    for frame in (df_pdf, df_csv_events)
    if not frame.empty
]
df_all = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame(columns=event_columns)

# De-dup: keep highest confidence for same patient/type/date
df_all["dedup_key"] = df_all["patient_id"].astype(str)+"|"+df_all["event_type"].astype(str)+"|"+df_all["event_date"].astype(str)
df_all = (df_all
          .sort_values(["confidence","source"], ascending=[False, True])
          .drop_duplicates("dedup_key", keep="first")
          .drop(columns=["dedup_key"])
         )

# A plain list works for zero rows too, where a row-wise apply would return
# a DataFrame instead of a column.
df_all["counts_breast_2025"] = [
    counts_for_breast_screening(ev_type, ev_date)
    for ev_type, ev_date in zip(df_all["event_type"], df_all["event_date"])
]
df_all.sort_values(["patient_id","counts_breast_2025","event_date"], ascending=[True, False, False]).head(30)

In [None]:
# Pick best numerator evidence per patient (most recent qualifying; tie-break by confidence)
def pick_best(df):
    """Summarise one patient's events into a single numerator-status record.

    Preference order:
      1. the most recent qualifying event (ties broken by confidence);
      2. otherwise the most recent dated event, reported as non-qualifying
         and flagged for review;
      3. otherwise a placeholder stating that no dated evidence exists.

    Args:
        df: The events of one patient, including the "counts_breast_2025"
            column.

    Returns:
        A pandas Series with numerator_met, best_event_type, best_event_date,
        best_source, best_confidence, needs_review and evidence_summary.
    """
    def _origin(evidence):
        # PDF evidence carries 'file'; CSV evidence carries 'source_file'.
        return evidence.get('file', evidence.get('source_file',''))

    def _most_recent(frame):
        ordered = frame.sort_values(["event_date","confidence"], ascending=[False,False])
        return ordered.iloc[0]

    qualifying = df[df["counts_breast_2025"]==True]
    if not qualifying.empty:
        top = _most_recent(qualifying)
        summary = f"{top.event_type} on {top.event_date} via {top.source} ({_origin(top.evidence)})"
        snippet = top.evidence.get('snippet','')
        if snippet:
            summary += f" | snippet: {snippet[:160]}"
        return pd.Series({
            "numerator_met": True,
            "best_event_type": top.event_type,
            "best_event_date": top.event_date,
            "best_source": top.source,
            "best_confidence": top.confidence,
            "needs_review": bool(top.needs_review),
            "evidence_summary": summary,
        })

    # Nothing qualifies: surface the best available evidence for review.
    dated = df[df["event_date"].notna()]
    if dated.empty:
        return pd.Series({
            "numerator_met": False,
            "best_event_type": None,
            "best_event_date": None,
            "best_source": None,
            "best_confidence": None,
            "needs_review": True,
            "evidence_summary": "no dated breast screening evidence found",
        })

    top = _most_recent(dated)
    return pd.Series({
        "numerator_met": False,
        "best_event_type": top.event_type,
        "best_event_date": top.event_date,
        "best_source": top.source,
        "best_confidence": top.confidence,
        "needs_review": True,
        "evidence_summary": f"Best non-qualifying evidence: {top.event_type} {top.event_date} ({top.source}); {_origin(top.evidence)}",
    })

# Build one summary row per patient.  Iterating the groups explicitly
# guarantees a patient_id column and a fixed schema even when df_all is
# empty, independent of how groupby.apply treats the grouping column.
best_columns = ["patient_id","numerator_met","best_event_type","best_event_date",
                "best_source","best_confidence","needs_review","evidence_summary"]
best_rows = [
    {"patient_id": pid, **pick_best(patient_events).to_dict()}
    for pid, patient_events in df_all.groupby("patient_id", sort=True)
]
df_best = pd.DataFrame(best_rows, columns=best_columns)
df_best = df_best.sort_values(["numerator_met","needs_review","best_event_date"], ascending=[False, True, False])

# Save auditor table
os.makedirs("audit_data", exist_ok=True)
out_csv = "audit_data/breast_screening_2025_audit_table.csv"
# Evidence text may contain quotes; they are backslash-escaped rather than doubled.
df_best.to_csv(out_csv, index=False, escapechar='\\', doublequote=False)
print(f"✓ Saved audit table to: {out_csv}")
out_csv, df_best.head(50)

In [None]:
# Summary statistics
total_patients = len(df_best)
met_total = df_best['numerator_met'].sum()
review_total = df_best['needs_review'].sum()

def _pct_of_patients(count):
    """Return *count* as a percentage of all patients; 0.0 when there are none."""
    return count / total_patients * 100 if total_patients else 0.0

print("="*80)
print("BREAST CANCER SCREENING UDS 2025 SUMMARY")
print("="*80)
print(f"\nTotal unique patients: {total_patients}")
print(f"Numerator met: {met_total} ({_pct_of_patients(met_total):.1f}%)")
print(f"Needs review: {review_total} ({_pct_of_patients(review_total):.1f}%)")

# Only patients who met the numerator feed the breakdowns below.
qualifying = df_best[df_best['numerator_met']==True]

print("\n" + "-"*80)
print("By Event Type:")
print("-"*80)
event_counts = qualifying['best_event_type'].value_counts()
for event_type, count in event_counts.items():
    print(f"  {event_type}: {count} patients")

print("\n" + "-"*80)
print("By Source:")
print("-"*80)
source_counts = qualifying['best_source'].value_counts()
for source, count in source_counts.items():
    print(f"  {source}: {count} patients")

print("\n" + "-"*80)
print("Confidence Distribution (qualifying events):")
print("-"*80)
if not qualifying.empty:
    print(f"  Mean confidence: {qualifying['best_confidence'].mean():.2f}")
    print(f"  Min confidence: {qualifying['best_confidence'].min():.2f}")
    print(f"  Max confidence: {qualifying['best_confidence'].max():.2f}")
    print(f"  High confidence (≥0.85): {(qualifying['best_confidence']>=0.85).sum()} patients")
    print(f"  Medium confidence (0.65-0.84): {((qualifying['best_confidence']>=0.65) & (qualifying['best_confidence']<0.85)).sum()} patients")
    print(f"  Low confidence (<0.65): {(qualifying['best_confidence']<0.65).sum()} patients")

In [None]:
# DEBUG: Compare the pdfplumber text layer with OCR output for one sample PDF.
# Update this path to point to an actual mammogram PDF in your data
test_pdf = "pdf_data/sample_mammogram.pdf"

if os.path.exists(test_pdf):
    # Part 1: embedded text layer, first 3 pages, first 1000 characters each.
    print("="*80)
    print("PDFPLUMBER EXTRACTION:")
    print("="*80)
    try:
        with pdfplumber.open(test_pdf) as pdf:
            print(f"Total pages: {len(pdf.pages)}")
            for i, page in enumerate(pdf.pages[:3]):  # Check first 3 pages
                text = page.extract_text()
                print(f"\n--- Page {i+1} (length: {len(text) if text else 0}) ---")
                print(text[:1000] if text else "No text extracted")
                print()
    except Exception as e:
        # Diagnostic cell: report the failure and carry on to the OCR check.
        print(f"Error: {e}")

    # Part 2: OCR of page 1 only, rendered at 150 dpi and without a timeout.
    print("\n" + "="*80)
    print(f"TRYING OCR with POPPLER_PATH={POPPLER_PATH}:")
    print("="*80)
    try:
        pages = convert_from_path(test_pdf, dpi=150, first_page=1, last_page=1, poppler_path=POPPLER_PATH)
        if pages:
            ocr_text = pytesseract.image_to_string(pages[0])
            print(f"✓ OCR SUCCESS! Extracted {len(ocr_text)} chars")
            print(f"OCR text (first 1000 chars):\n{ocr_text[:1000]}")
    except Exception as e:
        # Any rendering or OCR error is printed rather than raised.
        print(f"✗ OCR failed: {e}")
else:
    # No sample file: tell the user how to point the cell at a real PDF.
    print(f"⚠️ Test PDF not found: {test_pdf}")
    print("Update the 'test_pdf' variable to point to an actual mammogram PDF in your pdf_data folder")