
# AI-Powered Document Processing — Prototype Notebook

This notebook is a **working prototype** of a production-grade document processing pipeline as described in the myOnsite Healthcare case study.  
It demonstrates an **end-to-end flow**: ingestion → OCR/text extraction → classification → entity extraction → validation & confidence → enrichment → routing → metrics/export.

> ⚠️ Notes  
> - Heavy/enterprise components are **stubbed** with clean interfaces so you can **hot-swap** engines later (Textract/Vision/Claude/GPT-4V, Kafka/Ray, etc.).  
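> - All code runs locally with minimal dependencies (pandas and NumPy are required). The optional packages — pdfplumber, Pillow, pytesseract, and the spaCy `en_core_web_sm` model — fall back gracefully when missing: text extraction returns an empty string and entity extraction uses the regex patterns only.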


In [None]:

# 1) Imports & Setup

import os, re, json, time, uuid, glob, random, math, datetime as dt
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import pandas as pd
import numpy as np

# Optional imports (safe fallbacks)
# Each optional dependency is rebound to None when its import fails, so the
# rest of the notebook can feature-test with a plain truthiness check.
try:
    import pdfplumber
except Exception as e:
    pdfplumber = None

try:
    from PIL import Image
except Exception as e:
    Image = None

try:
    import pytesseract
except Exception as e:
    pytesseract = None

# spaCy is optional. If not installed, we will use regex NER
# Loading the "en_core_web_sm" model happens inside the same try, so a
# missing model disables spaCy in exactly the same way as a missing package.
try:
    import spacy
    _NLP = spacy.load("en_core_web_sm")
except Exception as e:
    spacy = None
    _NLP = None

# Report which optional components were found in this environment.
print("pdfplumber:", bool(pdfplumber), "| PIL:", bool(Image), "| pytesseract:", bool(pytesseract), "| spaCy model:", bool(_NLP))


In [None]:

# 2) Config & Pluggable Engines

class Config:
    """Central settings for the prototype: data paths, thresholds, engine labels."""

    DATA_DIR = Path("../data")          # put sample PDFs/images here
    OUTPUT_DIR = Path("../outputs")
    # Executed while the class body is evaluated, so the output folder exists
    # before any results are written.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    
    # Thresholds
    MIN_CONFIDENCE_ROUTE = 0.88   # below → route to human
    # NOTE(review): MIN_FIELD_CONF is not read anywhere in the visible code.
    MIN_FIELD_CONF = 0.7
    
    # Active engines (can be switched)
    # NOTE(review): these labels are not consulted by the visible functions;
    # the fallbacks are currently decided by which optional imports succeeded.
    OCR_ENGINE = "pdfplumber|tesseract" # try pdfplumber text, else tesseract for images
    CLASSIFIER = "rule_based"           # or "sklearn" (placeholder)
    NER_ENGINE = "spacy|regex"          # try spaCy, else regex fallback

# Shared settings instance used by the pipeline functions below.
CONFIG = Config()


In [None]:

# 3) Utilities

def read_binary(path: Path) -> bytes:
    """Return the full contents of the file at ``path`` as raw bytes."""
    with open(path, mode="rb") as stream:
        payload = stream.read()
    return payload

def guess_mime(path: Path) -> str:
    """Map a file's extension (case-insensitive) to a coarse content kind.

    Returns "application/pdf", "image", "text" or "word" for the known
    extensions, and "unknown" for everything else.
    """
    kind_by_suffix = {
        ".pdf": "application/pdf",
        ".jpg": "image",
        ".jpeg": "image",
        ".png": "image",
        ".tif": "image",
        ".tiff": "image",
        ".txt": "text",
        ".doc": "word",
        ".docx": "word",
    }
    return kind_by_suffix.get(path.suffix.lower(), "unknown")

def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a "Z" suffix.

    The clock is read as a timezone-aware UTC value (``datetime.utcnow()`` is
    deprecated since Python 3.12). The tzinfo is dropped before formatting so
    the output keeps the "<date>T<time>Z" shape instead of ending in "+00:00".
    """
    current = dt.datetime.now(dt.timezone.utc)
    return current.replace(tzinfo=None).isoformat() + "Z"


### 4) OCR/Text Extraction

In [None]:

def extract_text_pdf(path: Path) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    Returns an empty string when pdfplumber is unavailable. Pages for which
    pdfplumber yields no text contribute an empty string.
    """
    if not pdfplumber:
        return ""
    with pdfplumber.open(str(path)) as document:
        page_texts = [
            page.extract_text(x_tolerance=1.5, y_tolerance=1.5) or ""
            for page in document.pages
        ]
    return "\n".join(page_texts)

def extract_tables_pdf(path: Path) -> List[pd.DataFrame]:
    """Collect every table pdfplumber finds in a PDF as a DataFrame.

    The first row of each table is used as the header. If that fails for any
    reason, the raw rows are wrapped in a DataFrame with default column labels.
    Returns an empty list when pdfplumber is unavailable.
    """
    if not pdfplumber:
        return []
    frames: List[pd.DataFrame] = []
    with pdfplumber.open(str(path)) as document:
        for page in document.pages:
            raw_tables = page.extract_tables() or []
            for rows in raw_tables:
                try:
                    header, *body = rows
                    frame = pd.DataFrame(body, columns=header)
                except Exception:
                    # Fall back to an unlabeled frame rather than losing the table.
                    frame = pd.DataFrame(rows)
                frames.append(frame)
    return frames

def extract_text_image(path: Path) -> str:
    """OCR an image file with Tesseract and return the recognized text.

    Returns an empty string when Pillow or pytesseract is unavailable.
    """
    if not (Image and pytesseract):
        return ""
    # Use the image as a context manager so the underlying file handle is
    # closed after OCR; otherwise a batch run leaves one open file per image.
    with Image.open(path) as img:
        return pytesseract.image_to_string(img)

def extract_text(path: Path) -> Tuple[str, List[pd.DataFrame]]:
    """Extract stripped text (and PDF tables) from a file based on its kind.

    Returns a ``(text, tables)`` pair. Tables are only produced for PDFs;
    unsupported file kinds yield ``("", [])``.
    """
    kind = guess_mime(path)
    if kind == "application/pdf":
        return extract_text_pdf(path).strip(), extract_tables_pdf(path)
    if kind == "image":
        return extract_text_image(path).strip(), []
    if kind == "text":
        # Undecodable bytes are dropped instead of failing the document.
        return path.read_text(encoding="utf-8", errors="ignore").strip(), []
    # unsupported in this prototype
    return "", []


### 5) Classification (Rule-based fallback)

In [None]:

# Lower-case keyword hints per document class; each hint found adds one point.
INVOICE_HINTS = ["invoice", "amount due", "bill to", "subtotal", "total", "tax"]
FORM_HINTS = ["form", "name:", "email", "phone", "address"]
CONTRACT_HINTS = ["agreement", "party", "contract", "effective date", "term"]

def _count_hint_hits(text: str, hints: List[str]) -> int:
    """Count the hints that occur in ``text`` starting at a word boundary."""
    # The look-behind rejects hits buried inside a longer word ("form" in
    # "information", "term" in "determine", "tax" in "syntax") while still
    # accepting suffixed forms such as "forms" or "terms".
    return sum(
        1 for hint in hints
        if re.search(r"(?<!\w)" + re.escape(hint), text)
    )

def classify_document(text: str) -> str:
    """Classify text as "invoice", "form", "contract" or "unknown".

    Each class scores one point per keyword hint present in the lower-cased
    text. The highest score wins; ties go to the earlier class in the order
    invoice, form, contract. "unknown" is returned when no hint matches.
    """
    t = text.lower()
    score = {
        "invoice": _count_hint_hits(t, INVOICE_HINTS),
        "form": _count_hint_hits(t, FORM_HINTS),
        "contract": _count_hint_hits(t, CONTRACT_HINTS),
    }
    # pick top class (max returns the first key among equal scores)
    cls = max(score, key=score.get)
    return cls if score[cls] > 0 else "unknown"


### 6) Entity Extraction (spaCy if available; regex fallback)

In [None]:

# Dollar amounts; the capture group holds only the number. Digits may be
# comma-grouped ("1,234.56") or ungrouped ("1234.56"). The ungrouped
# alternative is needed so "$1234.56" is not truncated to "123".
MONEY_RE = re.compile(r"(?:USD\s?)?\$\s?((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]{2})?)")
# Numeric dates (01/02/2024, 1-2-24, 1.2.2024) or "Jan 5, 2024"-style dates.
DATE_RE = re.compile(r"\b(?:\d{1,2}[/.-]){2}\d{2,4}|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s*\d{4}")
# Email addresses. Domain labels are matched one at a time so a sentence-ending
# "." is not swallowed, and "+" is accepted in the local part.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)*")
# 10-digit phone numbers with optional country code, parentheses and separators.
PHONE_RE = re.compile(r"(?:\+?\d{1,3}[\s-]?)?(?:\(\d{3}\)|\d{3})[\s-]?\d{3}[\s-]?\d{4}")

def extract_entities_regex(text: str) -> Dict[str, List[str]]:
    """Extract MONEY, DATE, EMAIL and PHONE strings from text using regexes.

    Every key is always present; its list is empty when nothing matched.
    MONEY values are the numeric part only (no "$"/"USD" prefix).
    """
    patterns = {
        "MONEY": MONEY_RE,
        "DATE": DATE_RE,
        "EMAIL": EMAIL_RE,
        "PHONE": PHONE_RE,
    }
    return {label: pattern.findall(text) for label, pattern in patterns.items()}

def extract_entities_spacy(text: str) -> Dict[str, List[str]]:
    """Extract entities with spaCy, then append the regex matches.

    When no spaCy model is loaded, the regex extractor's result is returned
    as-is. Otherwise entities are grouped by spaCy label, and the regex
    matches are appended under their own labels (after any spaCy entries
    that share the label).
    """
    if not _NLP:
        return extract_entities_regex(text)
    merged: Dict[str, List[str]] = {}
    for span in _NLP(text).ents:
        merged.setdefault(span.label_, []).append(span.text)
    # also add regex helpers
    for label, matches in extract_entities_regex(text).items():
        merged.setdefault(label, []).extend(matches)
    return merged


### 7) Schema Mapping, Validation & Confidence

In [None]:

# Fields an invoice must provide; consumed by validate_fields.
INVOICE_SCHEMA = {
    "company_name": {"required": True},
    "invoice_number": {"required": True},
    "invoice_date": {"required": True},
    "total_amount": {"required": True},
}

def _money_value(raw: Any) -> Optional[float]:
    """Parse a money string such as "1,234.56" into a float, or None if not numeric."""
    digits = re.sub(r"[^0-9.]", "", str(raw))
    try:
        return float(digits)
    except ValueError:
        return None

def map_invoice_fields(text: str, ents: Dict[str, List[str]]) -> Dict[str, Any]:
    """Map raw text and extracted entities onto the invoice fields.

    Args:
        text: Full document text.
        ents: Entity lists keyed by label; "DATE" and "MONEY" are used.

    Returns:
        A dict that always contains "invoice_date" and "total_amount"
        (``None`` when not found) and contains "company_name" /
        "invoice_number" only when their heuristics match.
    """
    # naive demo mappers
    fields: Dict[str, Any] = {}
    # company name heuristic; the captured name is limited to one line so it
    # cannot run on into the following line of the document
    m = re.search(r"(?i)(?:company|vendor|bill from)[:\s]+([A-Za-z0-9&.,'\- \t]{3,})", text)
    if m:
        fields["company_name"] = m.group(1).strip()
    # invoice number heuristic; also accepts the abbreviated "Invoice No. 123"
    m = re.search(r"(?i)invoice\s*(?:no|number|#)\.?[:\s]*([A-Za-z0-9-]+)", text)
    if m:
        fields["invoice_number"] = m.group(1).strip()
    # date: first date found; an empty or missing list yields None
    dates = ents.get("DATE") or []
    fields["invoice_date"] = dates[0] if dates else None
    # total amount: the numerically largest amount (string comparison would
    # rank "99.00" above "1,234.56"); the original string form is returned
    best_raw, best_value = None, None
    for raw in ents.get("MONEY") or []:
        value = _money_value(raw)
        if value is not None and (best_value is None or value > best_value):
            best_raw, best_value = raw, value
    fields["total_amount"] = best_raw
    return fields

def validate_fields(fields: Dict[str, Any], schema: Dict[str, Dict[str, Any]]) -> Tuple[bool, Dict[str, float]]:
    """Check fields against a schema and assign a heuristic confidence to each.

    Returns ``(valid, confidences)``. ``valid`` is False when any required
    schema key is missing or empty. Every schema key gets a confidence:
    0.0 when the value is absent/falsy, 0.9 for keys ending in "date",
    0.92 for keys ending in "amount", and 0.85 for any other present value.
    """
    scores: Dict[str, float] = {}
    all_required_present = True
    for name, rule in schema.items():
        value = fields.get(name)
        is_required = rule.get("required", False)
        is_missing = value in (None, "", [], {})
        if is_required and is_missing:
            all_required_present = False
            scores[name] = 0.0
            continue
        # trivial confidence: length/format heuristics
        if not value:
            scores[name] = 0.0
        elif name.endswith("date"):
            scores[name] = 0.9
        elif name.endswith("amount"):
            scores[name] = 0.92
        else:
            scores[name] = 0.85
    return all_required_present, scores

def aggregate_confidence(conf_map: Dict[str, float]) -> float:
    """Return the arithmetic mean of the per-field confidences (0.0 if empty)."""
    return float(np.mean(list(conf_map.values()))) if conf_map else 0.0


### 8) Enrichment (Knowledge Graph stub) & Routing

In [None]:

# Stand-in for a vendor knowledge graph, keyed by exact company name.
VENDOR_DB = {
    "myOnsite Healthcare LLC": {"vendor_id": "VEND-001", "domain": "healthcare"},
    "Healthcare Solutions Inc.": {"vendor_id": "VEND-002", "domain": "healthcare"},
}

def enrich(fields: Dict[str, Any]) -> Dict[str, Any]:
    """Merge vendor details into ``fields`` when the company name is known.

    The lookup is an exact match of ``fields["company_name"]`` against
    ``VENDOR_DB``. The input dict is updated in place and also returned.
    """
    vendor_name = fields.get("company_name")
    if not vendor_name:
        return fields
    vendor_record = VENDOR_DB.get(vendor_name)
    if vendor_record is not None:
        fields.update(vendor_record)
    return fields

def route_decision(doc_conf: float, business_critical: bool = False,
                   min_confidence: Optional[float] = None,
                   critical_confidence: float = 0.95) -> str:
    """Decide whether a document is auto-approved or sent to a human.

    Args:
        doc_conf: Aggregate document confidence.
        business_critical: When True, the stricter ``critical_confidence``
            bar also applies.
        min_confidence: Approval threshold; defaults to
            ``CONFIG.MIN_CONFIDENCE_ROUTE`` so changes made on the shared
            settings instance are honored.
        critical_confidence: Threshold for business-critical documents.

    Returns:
        "auto_approve" when ``doc_conf`` meets the applicable threshold,
        otherwise "human_review".
    """
    threshold = CONFIG.MIN_CONFIDENCE_ROUTE if min_confidence is None else min_confidence
    if business_critical:
        threshold = max(threshold, critical_confidence)
    # Written as "not >=" so a NaN confidence fails the check and goes to a
    # human instead of slipping through a "<" comparison.
    if not (doc_conf >= threshold):
        return "human_review"
    return "auto_approve"


### 9) End-to-End Pipeline

In [None]:

def process_document(path: Path) -> Dict[str, Any]:
    """Run one file through extraction, classification, mapping and routing.

    Invoices are mapped to the invoice schema and validated; any other class
    falls back to a 500-character raw excerpt with a fixed low confidence.

    Returns:
        A result dict with keys "doc_id", "path", "mime", "class",
        "entities", "fields", "field_confidence", "valid",
        "document_confidence", "route" and "processed_at".
    """
    doc_id = str(uuid.uuid4())
    mime = guess_mime(path)
    text, _tables = extract_text(path)  # tables are not used by the pipeline yet
    cls = classify_document(text) if text else "unknown"
    ents = extract_entities_spacy(text) if text else {}

    if cls == "invoice":
        mapped = map_invoice_fields(text, ents)
        valid, confmap = validate_fields(mapped, INVOICE_SCHEMA)
    else:
        # generic fallback
        mapped = {"raw_excerpt": text[:500] if text else ""}
        valid, confmap = True, {"raw_excerpt": 0.6 if text else 0.0}

    mapped = enrich(mapped)
    doc_conf = aggregate_confidence(confmap)
    route = route_decision(doc_conf, business_critical=False)
    if not valid:
        # A document missing required fields must never be auto-approved,
        # whatever the configured confidence threshold is.
        route = "human_review"

    return {
        "doc_id": doc_id,
        "path": str(path),
        "mime": mime,
        "class": cls,
        "entities": ents,
        "fields": mapped,
        "field_confidence": confmap,
        "valid": valid,
        "document_confidence": round(doc_conf, 3),
        "route": route,
        "processed_at": now_iso(),
    }

def batch_process(data_dir: Path = CONFIG.DATA_DIR) -> pd.DataFrame:
    """Process every supported file in ``data_dir`` and save the results as CSV.

    Files are selected by extension, case-insensitively (so "SCAN.PDF" is
    picked up just like "scan.pdf"), and processed in sorted path order. A
    file that raises during processing yields an error row instead of
    aborting the batch. A missing ``data_dir`` is treated as empty.

    Returns:
        A DataFrame with one row per file. The same data is written to
        ``CONFIG.OUTPUT_DIR / "results_<unix-seconds>.csv"``.
    """
    supported = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".txt"}
    root = Path(data_dir)
    files = []
    if root.is_dir():
        files = sorted(
            p for p in root.iterdir()
            if p.is_file() and p.suffix.lower() in supported
        )
    results = []
    for p in files:
        try:
            results.append(process_document(p))
        except Exception as e:
            # Batch boundary: record the failure and continue with the rest.
            results.append({
                "doc_id": str(uuid.uuid4()),
                "path": str(p),
                "error": str(e),
                "processed_at": now_iso(),
            })
    df = pd.DataFrame(results)
    out_csv = CONFIG.OUTPUT_DIR / f"results_{int(time.time())}.csv"
    df.to_csv(out_csv, index=False)
    print(f"Saved results -> {out_csv}")
    return df

# Quick smoke test on empty data dir (user should add sample files later)
# Runs the batch over CONFIG.DATA_DIR when this cell executes; a results CSV
# is written to CONFIG.OUTPUT_DIR even when no input files are found.
df = batch_process(CONFIG.DATA_DIR)
df.head(10)


### 10) (Optional) FastAPI stub to connect the HTML dashboard

In [None]:

# This cell shows how you'd expose the pipeline as an API (run separately).
# Save as: src/api.py and run: uvicorn src.api:app --reload
# The snippet is only printed here; nothing in this notebook executes it.
# Upload filenames are client-controlled, so the snippet keeps only the final
# path component (with a fallback name) when building the destination path.

FASTAPI_SNIPPET = r"""
from fastapi import FastAPI, UploadFile, File
from pathlib import Path
import uuid, os
from typing import List
import pandas as pd

from .pipeline import process_document  # you can move the functions into src/pipeline.py

app = FastAPI()

UPLOAD_DIR = Path("./uploads")
UPLOAD_DIR.mkdir(exist_ok=True)

@app.post("/process")
async def process_files(files: List[UploadFile] = File(...)):
    results = []
    for f in files:
        # The client-supplied filename is untrusted: keep only its base name.
        safe_name = Path(f.filename or "upload").name
        dest = UPLOAD_DIR / f"{uuid.uuid4()}_{safe_name}"
        with open(dest, "wb") as out:
            out.write(await f.read())
        results.append(process_document(dest))
    return {"results": results}
"""

print(FASTAPI_SNIPPET)
