In [8]:
import os
import re
import time
import math
import shutil
from pathlib import Path
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract
from pytesseract import Output

# =========================
# CONFIG
# =========================
JPEGS_DIR = Path(r".\JPEGS")
OUT_DIR = Path(r".\out")
OCR_TEXT_DIR = OUT_DIR / "ocr_text"

WORKERS = 6
PROGRESS_EVERY = 10

# If you installed tesseract but it's not on PATH, set it explicitly:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

OUT_DIR.mkdir(parents=True, exist_ok=True)
OCR_TEXT_DIR.mkdir(parents=True, exist_ok=True)

ACC_RE = r"\d{1,6}"
CAT_RE = r"\d{1,7}"

# =========================
# FILE LISTING (NO ZIP)
# =========================
def page_num_from_path(p: Path) -> int:
    m = re.search(r"(\d+)", p.stem)
    if not m:
        raise ValueError(f"Can't parse page number from: {p.name}")
    return int(m.group(1))

def list_jpgs_unique_by_page(folder: Path):
    # One glob that matches .jpg/.JPG/etc without double-counting on Windows
    files = list(folder.glob("*.[jJ][pP][gG]"))

    # Normalize + de-dupe physical paths (defensive)
    normed = []
    seen = set()
    for f in files:
        key = os.path.normcase(str(f.resolve()))
        if key not in seen:
            seen.add(key)
            normed.append(f)

    # Group by page number
    by_page = {}
    collisions = {}
    for f in normed:
        pg = page_num_from_path(f)
        by_page.setdefault(pg, []).append(f)

    # Keep the largest file per page (best scan), but retain alternates
    for pg, lst in by_page.items():
        lst_sorted = sorted(lst, key=lambda x: x.stat().st_size, reverse=True)
        by_page[pg] = lst_sorted
        if len(lst_sorted) > 1:
            collisions[pg] = lst_sorted

    pages = sorted(by_page.keys())
    if len(pages) != 102 or pages[0] != 1 or pages[-1] != 102:
        raise RuntimeError(f"Expected pages 1..102. Found {len(pages)} pages: {pages[:5]} ... {pages[-5:]}")

    if collisions:
        sample = ", ".join([f"{k}:{len(v)}" for k, v in list(collisions.items())[:12]])
        print("[Info] Duplicate files for some pages (multiple files map to same page number). Will auto-try alternates if needed.")
        print(f"       Examples: {sample} ...")
    else:
        print("[Info] Found exactly 102 pages with 1 image each (no duplicates). ✅")

    # Return primary path + alternates
    tasks = []
    for pg in range(1, 103):
        paths = by_page[pg]
        tasks.append((pg, paths[0], paths[1:]))  # (page, primary, alternates)
    return tasks

# =========================
# IMAGE PREPROCESSING
# =========================
def otsu_threshold(arr_uint8: np.ndarray) -> int:
    # Simple Otsu implementation (keeps dependencies minimal)
    hist = np.bincount(arr_uint8.ravel(), minlength=256).astype(np.float64)
    total = arr_uint8.size
    sum_total = np.dot(np.arange(256), hist)

    sum_b = 0.0
    w_b = 0.0
    max_var = -1.0
    thresh = 200

    for t in range(256):
        w_b += hist[t]
        if w_b == 0:
            continue
        w_f = total - w_b
        if w_f == 0:
            break
        sum_b += t * hist[t]
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            thresh = t
    return int(thresh)

def preprocess_bw(img: Image.Image, scale=2.2, use_otsu=True, fixed_thresh=205) -> Image.Image:
    im = img.convert("L")
    im = ImageOps.autocontrast(im)
    if scale != 1.0:
        im = im.resize((int(im.size[0]*scale), int(im.size[1]*scale)), Image.Resampling.BICUBIC)
    im = im.filter(ImageFilter.MedianFilter(size=3))

    arr = np.array(im)
    thr = otsu_threshold(arr) if use_otsu else fixed_thresh
    bw = (arr > thr).astype(np.uint8) * 255
    return Image.fromarray(bw)

# =========================
# OCR HELPERS
# =========================
def ocr_data(img_bw: Image.Image, psm: int) -> dict:
    # Numeric-focused whitelist to reduce garbage
    whitelist = "0123456789EDG"
    cfg = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1 -c tessedit_char_whitelist={whitelist}"
    return pytesseract.image_to_data(img_bw, config=cfg, output_type=Output.DICT)

def ocr_text_for_debug(img_bw: Image.Image, psm: int) -> str:
    whitelist = "0123456789EDG"
    cfg = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1 -c tessedit_char_whitelist={whitelist}"
    return pytesseract.image_to_string(img_bw, config=cfg)

def ocr_find_headers(img: Image.Image) -> dict:
    """
    Lightweight header detection for pages where section starts mid-page.
    Returns y-coordinates for found headers (in original image coords).
    """
    im = img.convert("L")
    im = ImageOps.autocontrast(im)

    cfg = "--oem 3 --psm 6"
    d = pytesseract.image_to_data(im, config=cfg, output_type=Output.DICT)

    headers = {"PICKLES": [], "SKELETONS": [], "SKINS": []}
    for txt, top, h in zip(d["text"], d["top"], d["height"]):
        if not txt:
            continue
        t = re.sub(r"[^A-Z]", "", txt.upper())
        if t in headers:
            # use center y
            headers[t].append(int(top) + int(h)//2)
    return headers

def clean_token(t: str) -> str:
    t = (t or "").strip().upper()
    t = re.sub(r"[^A-Z0-9]", "", t)
    return t

def split_subtokens(text: str):
    if not text:
        return []
    # 254E1396 -> 254, E, 1396
    m = re.fullmatch(rf"({ACC_RE})([EDG])({CAT_RE})", text)
    if m:
        return [m.group(1), m.group(2), m.group(3)]
    # 254E -> 254, E
    m = re.fullmatch(rf"({ACC_RE})([EDG])", text)
    if m:
        return [m.group(1), m.group(2)]
    # E1396 -> E, 1396
    m = re.fullmatch(rf"([EDG])({CAT_RE})", text)
    if m:
        return [m.group(1), m.group(2)]
    return [text]

def cluster_rows(tokens, row_tol):
    tokens = sorted(tokens, key=lambda d: d["y"])
    rows = []
    cur = []
    cur_y = None
    for tok in tokens:
        if cur_y is None:
            cur = [tok]
            cur_y = tok["y"]
            continue
        if abs(tok["y"] - cur_y) <= row_tol:
            cur.append(tok)
            cur_y = (cur_y * (len(cur)-1) + tok["y"]) / len(cur)
        else:
            rows.append(cur)
            cur = [tok]
            cur_y = tok["y"]
    if cur:
        rows.append(cur)
    return rows

def kmeans_1d(xs, k=3, iters=30):
    xs = np.array(xs, dtype=float)
    centers = np.percentile(xs, np.linspace(0, 100, k+2)[1:-1])
    for _ in range(iters):
        d = np.abs(xs[:, None] - centers[None, :])
        lab = d.argmin(axis=1)
        new = []
        for j in range(k):
            pts = xs[lab == j]
            new.append(centers[j] if len(pts) == 0 else pts.mean())
        new = np.array(new)
        if np.allclose(new, centers):
            break
        centers = new
    return np.sort(centers)

def extract_triplets(img_bw: Image.Image, psm=6, conf_floor=-1.0, kcols=1):
    """
    Now intended to be run on a SINGLE COLUMN crop (kcols=1).
    Improvements:
      - dynamic row_tol using median OCR word height (prevents row merges)
      - keep list (not set) so counts reflect rows; we separately track duplicates
      - infer missing CODE (as before) AND infer missing ACCESSION when column dominated
    """
    w, h = img_bw.size
    data = ocr_data(img_bw, psm=psm)

    tokens = []
    xs = []
    hs = []

    for raw, left, top, width, height, conf in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"], data["conf"]
    ):
        if not raw or raw.strip() == "":
            continue
        try:
            conf = float(conf)
        except:
            conf = -1.0
        if conf < conf_floor:
            continue

        t = clean_token(raw)
        if not t:
            continue

        x = int(left) + int(width) / 2.0
        y = int(top) + int(height) / 2.0
        hh = max(1, int(height))
        tokens.append({"text": t, "x": x, "y": y, "conf": conf, "h": hh})
        xs.append(x)
        hs.append(hh)

    if not tokens:
        return []

    # dynamic row tolerance: ~0.8 * median token height, clamped
    med_h = float(np.median(hs)) if hs else 18.0
    row_tol = max(8.0, min(0.8 * med_h, 30.0))

    centers = kmeans_1d(xs, k=kcols)
    rows = cluster_rows(tokens, row_tol)

    strict = []
    unresolved_rows = []  # store token sequences for 2nd-pass inference

    for row in rows:
        subs = []
        for tok in row:
            col = int(np.argmin(np.abs(centers - tok["x"]))) if kcols > 1 else 0
            for s in split_subtokens(tok["text"]):
                subs.append({"text": s, "x": tok["x"], "y": tok["y"], "conf": tok["conf"], "col": col})

        # single column stream (kcols=1 expected)
        cs = sorted([s for s in subs if s["col"] == 0], key=lambda d: d["x"])
        toks = [t["text"] for t in cs]

        # strict triplet search (take first match in this row)
        found = False
        for j in range(0, len(toks) - 2):
            a, c, b = toks[j], toks[j + 1], toks[j + 2]
            if re.fullmatch(ACC_RE, a) and c in ("E", "D", "G") and re.fullmatch(CAT_RE, b):
                strict.append((int(a), c, int(b)))
                found = True
                break

        if not found and toks:
            unresolved_rows.append(toks)

    if not strict:
        return []

    # dominant code/accession in THIS COLUMN
    codes = [t[1] for t in strict]
    accs = [t[0] for t in strict]
    mode_code, code_ct = Counter(codes).most_common(1)[0]
    mode_acc, acc_ct = Counter(accs).most_common(1)[0]
    code_share = code_ct / len(codes)
    acc_share = acc_ct / len(accs)

    out = list(strict)

    # second-pass inference on unresolved rows:
    # 1) ACC + CAT => fill CODE if strongly dominated
    if code_share >= 0.85:
        for toks in unresolved_rows:
            for j in range(len(toks) - 1):
                a, b = toks[j], toks[j + 1]
                if re.fullmatch(ACC_RE, a) and re.fullmatch(CAT_RE, b):
                    out.append((int(a), mode_code, int(b)))

    # 2) CODE + CAT => fill ACCESSION if strongly dominated
    if acc_share >= 0.70:
        for toks in unresolved_rows:
            for j in range(len(toks) - 1):
                c, b = toks[j], toks[j + 1]
                if c in ("E", "D", "G") and re.fullmatch(CAT_RE, b):
                    out.append((int(mode_acc), c, int(b)))

    # repair accession like 2542 -> 254 when 254 dominates
    if acc_share >= 0.50:
        mstr = str(mode_acc)
        repaired = []
        for (a, c, b) in out:
            astr = str(a)
            if a != mode_acc and astr.startswith(mstr) and len(astr) == len(mstr) + 1:
                repaired.append((mode_acc, c, b))
            else:
                repaired.append((a, c, b))
        out = repaired

    # DO NOT dedupe blindly; but compute duplicates so caller can penalize/flag
    return out
    
# =========================
# DESKEW + COLUMN SPLIT (NEW)
# =========================
def _binarize_for_proj(img: Image.Image, thr=200) -> np.ndarray:
    g = ImageOps.grayscale(img)
    g = ImageOps.autocontrast(g)
    arr = np.array(g)
    # text as 1s
    return (arr < thr).astype(np.uint8)

def estimate_skew_angle(img: Image.Image, search_deg=3.0, step=0.2, downsample=0.40) -> float:
    """
    Returns angle (degrees) to rotate (PIL rotate) to best deskew.
    Uses variance of horizontal projection (Radon-lite).
    """
    w, h = img.size
    small = img.resize((max(1, int(w * downsample)), max(1, int(h * downsample))))
    bw = _binarize_for_proj(small, thr=200)
    im = Image.fromarray((bw * 255).astype(np.uint8))

    angles = np.arange(-search_deg, search_deg + 1e-9, step)
    best_angle = 0.0
    best_score = -1.0

    for a in angles:
        rot = im.rotate(float(a), resample=Image.Resampling.NEAREST, expand=False, fillcolor=0)
        arr = (np.array(rot) > 0).astype(np.uint8)
        proj = arr.sum(axis=1)
        score = float(proj.var())
        if score > best_score:
            best_score = score
            best_angle = float(a)

    return best_angle

def deskew_image(img: Image.Image, angle_deg: float) -> Image.Image:
    if abs(angle_deg) < 0.05:
        return img

    # Ensure fillcolor matches mode
    mode = img.mode
    if mode in ("L", "1"):
        fill = 255
    elif mode == "RGBA":
        fill = (255, 255, 255, 255)
    else:
        # Convert odd modes like "P", "CMYK" safely to RGB
        if mode not in ("RGB", "RGBA"):
            img = img.convert("RGB")
        fill = (255, 255, 255)

    return img.rotate(
        angle_deg,
        resample=Image.Resampling.BICUBIC,
        expand=False,
        fillcolor=fill
    )

def find_column_bounds(img: Image.Image, downsample=0.40) -> tuple[int, int]:
    """
    Finds 2 x-splits between 3 columns using vertical projection valleys near 1/3 and 2/3.
    Returns (x_split1, x_split2) in ORIGINAL image coordinates.
    """
    w, h = img.size
    small = img.resize((max(1, int(w * downsample)), max(1, int(h * downsample))))
    bw = _binarize_for_proj(small, thr=200)

    proj = bw.sum(axis=0).astype(np.float64)
    # smooth
    win = 21
    kernel = np.ones(win) / win
    smooth = np.convolve(proj, kernel, mode="same")

    width = len(smooth)
    targets = [width / 3, 2 * width / 3]
    splits = []

    for t in targets:
        lo = int(max(0, t - 0.12 * width))
        hi = int(min(width, t + 0.12 * width))
        idx = lo + int(np.argmin(smooth[lo:hi]))
        splits.append(idx)

    splits = sorted(splits)
    scale = 1.0 / downsample
    return int(splits[0] * scale), int(splits[1] * scale)

def column_boxes(img: Image.Image, pad_px=25) -> list[tuple[int, int, int, int]]:
    w, h = img.size
    x1, x2 = find_column_bounds(img)
    # pad to avoid cutting characters at boundaries
    left  = (0, 0, min(w, x1 + pad_px), h)
    mid   = (max(0, x1 - pad_px), 0, min(w, x2 + pad_px), h)
    right = (max(0, x2 - pad_px), 0, w, h)
    return [left, mid, right]

# =========================
# TYPE / EXPECTED COUNTS
# =========================
def base_type_for_page(page: int) -> str:
    # This sets the "type before any mid-page header pivot"
    if page <= 99:
        return "SKINS"
    if page == 100:
        return "PICKLES"
    return "SKELETONS"

def expected_rows_for_page(page: int, headers_found: dict) -> int:
    # Known special pages
    if page == 1:
        return 141
    if page == 102:
        return 136

    exp = 162

    # Mid-page section header tends to consume ~2 rows worth of data slots in these docs
    # (your observation: page99 + page100)
    header_present = (len(headers_found.get("PICKLES", [])) > 0) or (len(headers_found.get("SKELETONS", [])) > 0)
    if header_present:
        exp -= 2

    return exp

def type_for_triplet_y(page: int, y_center: float, headers_found: dict) -> str:
    """
    Determine type per-row for the pivot pages.
    Uses base type for the page, then changes after the header line y.
    """
    t = base_type_for_page(page)

    # Page 99: SKINS then PICKLES after PICKLES header
    if page == 99 and headers_found["PICKLES"]:
        pivot = min(headers_found["PICKLES"])
        if y_center > pivot:
            return "PICKLES"
        return "SKINS"

    # Page 100: PICKLES then SKELETONS after SKELETONS header
    if page == 100 and headers_found["SKELETONS"]:
        pivot = min(headers_found["SKELETONS"])
        if y_center > pivot:
            return "SKELETONS"
        return "PICKLES"

    return t

# =========================
# PAGE PROCESSING
# =========================
def process_one(task):
    page, primary_path, alternates = task

    img0 = Image.open(primary_path)
    headers = ocr_find_headers(img0)
    expected = expected_rows_for_page(page, headers)

    attempts = [
        # (scale, use_otsu, fixed_thresh, psm)
        (2.2, True, 205, 6),
        (2.6, True, 205, 6),
        (2.2, False, 200, 6),
        (2.6, False, 200, 6),
        (2.2, True, 205, 4),
        (2.6, True, 205, 4),
    ]

    tried_paths = [primary_path] + list(alternates)

    best = None
    best_score = (10**9, 10**9, -10**9)  # (abs_count_error, duplicates, extracted_count)
    attempts_used = 0

    for path_try in tried_paths:
        img_try = Image.open(path_try)

        # deskew once per source image
        ang = estimate_skew_angle(img_try, search_deg=3.0, step=0.2, downsample=0.40)
        img_ds = deskew_image(img_try, ang)

        # compute 3 column boxes once
        boxes = column_boxes(img_ds, pad_px=25)

        for (scale, use_otsu, thr, psm) in attempts:
            attempts_used += 1

            all_triplets = []
            debug_parts = []

            for ci, box in enumerate(boxes, start=1):
                col = img_ds.crop(box)
                bw = preprocess_bw(col, scale=scale, use_otsu=use_otsu, fixed_thresh=thr)

                trip = extract_triplets(bw, psm=psm, conf_floor=-1.0, kcols=1)
                all_triplets.extend(trip)

                # debug OCR text per column
                debug_parts.append(f"\n===== PAGE {page:03d} COL {ci} | angle={ang:.2f} | scale={scale} | psm={psm} =====\n")
                debug_parts.append(ocr_text_for_debug(bw, psm=psm))

            # write debug text for this attempt (overwrites each attempt; last attempt stored)
            debug_path = OCR_TEXT_DIR / f"page_{page:03d}.txt"
            debug_path.write_text("".join(debug_parts), encoding="utf-8", errors="ignore")

            extracted = len(all_triplets)
            dupes = extracted - len(set(all_triplets))
            abs_err = abs(extracted - expected)

            # scoring: closest count, then fewer duplicates, then higher extracted
            score = (abs_err, dupes, extracted)

            if best is None or score < best_score:
                best_score = score
                best = {
                    "triplets": all_triplets,
                    "path": path_try,
                    "debug": debug_path,
                    "psm": psm,
                    "scale": scale,
                    "angle": ang,
                    "dupes": dupes,
                }

            # success condition: exact expected AND no duplicates
            if extracted == expected and dupes == 0:
                return {
                    "page": page,
                    "rows": all_triplets,
                    "rows_extracted": extracted,
                    "rows_expected": expected,
                    "error": "",
                    "attempts_used": attempts_used,
                    "image_file": str(path_try),
                    "debug_file": str(debug_path),
                    "headers": headers,
                    "dupes": dupes,
                    "deskew_angle": float(ang),
                }

    # mismatch: return best attempt
    got = len(best["triplets"]) if best else 0
    dupes = best["dupes"] if best else 0
    err_bits = []
    if got != expected:
        err_bits.append(f"COUNT_MISMATCH {got} != {expected}")
    if dupes > 0:
        err_bits.append(f"DUPLICATES {dupes}")
    err = " | ".join(err_bits) if err_bits else "UNKNOWN_ERROR"

    return {
        "page": page,
        "rows": best["triplets"] if best else [],
        "rows_extracted": got,
        "rows_expected": expected,
        "error": err,
        "attempts_used": attempts_used,
        "image_file": str(best["path"]) if best else str(primary_path),
        "debug_file": str(best["debug"]) if best else "",
        "headers": headers,
        "dupes": dupes,
        "deskew_angle": float(best["angle"]) if best else 0.0,
    }
    
# =========================
# RUN ALL + OUTPUTS
# =========================
def run_all():
    TARGET_TOTAL_ROWS = 16473  # your known final target

    tasks = list_jpgs_unique_by_page(JPEGS_DIR)

    print(f"[Start] Processing {len(tasks)} pages from {JPEGS_DIR} with {WORKERS} workers", flush=True)
    t0 = time.time()

    results = []
    done = 0

    # Map futures -> page so we can log failures and keep going
    future_to_page = {}

    with ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futures = []
        for t in tasks:
            fut = ex.submit(process_one, t)
            futures.append(fut)
            future_to_page[fut] = t[0]  # page number

        for fut in as_completed(futures):
            page = future_to_page[fut]
            try:
                res = fut.result()
            except Exception as e:
                # Don't crash the whole run; record this page as failed.
                # Expected rows: use your rule-based function where possible.
                # We don't have headers here, so default to 162 except special cases.
                expected = 141 if page == 1 else 136 if page == 102 else 162
                res = {
                    "page": page,
                    "rows": [],
                    "rows_extracted": 0,
                    "rows_expected": expected,
                    "error": f"EXCEPTION {type(e).__name__}: {e}",
                    "attempts_used": 0,
                    "image_file": "",
                    "debug_file": "",
                    "headers": {"PICKLES": [], "SKELETONS": [], "SKINS": []},
                    "dupes": 0,
                    "deskew_angle": 0.0,
                }

            results.append(res)
            done += 1

            # Progress output exactly like your original
            if done % PROGRESS_EVERY == 0 or done == len(tasks):
                rate = done / max(1e-9, (time.time() - t0))
                remaining = len(tasks) - done
                eta = remaining / max(1e-9, rate)
                print(f"  completed {done}/{len(tasks)} | {rate:.2f} pages/sec | ETA ~ {eta/60:.1f} min", flush=True)

    # Sort results by page
    results_sorted = sorted(results, key=lambda r: r["page"])

    page_counts = []
    needs_review = []

    all_rows = []
    complete_rows = []

    for r in results_sorted:
        page = r["page"]
        extracted = int(r.get("rows_extracted", 0))
        expected = int(r.get("rows_expected", 0))
        err = (r.get("error", "") or "").strip()
        is_complete = (err == "")
        dupes = int(r.get("dupes", 0))
        ang = float(r.get("deskew_angle", 0.0))

        pct = (extracted / expected * 100.0) if expected else 0.0

        rec = {
            "Page": page,
            "RowsExtracted": extracted,
            "RowsExpected": expected,
            "PctComplete": round(pct, 2),
            "Complete": bool(is_complete),
            "Error": err,
            "Duplicates": dupes,
            "DeskewAngleDeg": round(ang, 2),
            "AttemptsUsed": int(r.get("attempts_used", 0)),
            "ImageFile": r.get("image_file", ""),
            "OcrTextFile": r.get("debug_file", ""),
        }
        page_counts.append(rec)
        if not is_complete:
            needs_review.append(rec)

        base_type = base_type_for_page(page)

        for (acc, code, cat) in r.get("rows", []):
            row = {
                "Page": page,
                "PageComplete": bool(is_complete),
                "Type": base_type,
                "Accession": int(acc),
                "Code": str(code),
                "Catalog": int(cat),
            }
            all_rows.append(row)
            if is_complete:
                complete_rows.append(row)

    page_counts_df = pd.DataFrame(page_counts)
    needs_review_df = pd.DataFrame(needs_review)
    all_df = pd.DataFrame(all_rows)
    complete_df = pd.DataFrame(complete_rows)

    out_all_csv = OUT_DIR / "output_all_rows.csv"              # includes incomplete pages
    out_complete_csv = OUT_DIR / "output_complete_pages.csv"   # only pages that passed checks
    out_counts_csv = OUT_DIR / "page_counts.csv"
    out_review_csv = OUT_DIR / "needs_review.csv"
    out_summary_txt = OUT_DIR / "run_summary.txt"

    all_df.to_csv(out_all_csv, index=False)
    complete_df.to_csv(out_complete_csv, index=False)
    page_counts_df.to_csv(out_counts_csv, index=False)
    needs_review_df.to_csv(out_review_csv, index=False)

    expected_total = int(page_counts_df["RowsExpected"].sum()) if not page_counts_df.empty else 0
    extracted_total = int(page_counts_df["RowsExtracted"].sum()) if not page_counts_df.empty else 0

    missing_vs_expected = expected_total - extracted_total
    pct_vs_expected = (extracted_total / expected_total * 100.0) if expected_total else 0.0

    missing_vs_target = TARGET_TOTAL_ROWS - extracted_total
    pct_vs_target = (extracted_total / TARGET_TOTAL_ROWS * 100.0) if TARGET_TOTAL_ROWS else 0.0

    complete_pages = int(page_counts_df["Complete"].sum()) if not page_counts_df.empty else 0
    total_pages = int(len(page_counts_df))

    summary_lines = [
        "=== OCR RUN SUMMARY ===",
        "",
        f"Pages processed:          {total_pages}",
        f"Pages complete:           {complete_pages}/{total_pages} ({(complete_pages/total_pages*100.0 if total_pages else 0.0):.2f}%)",
        f"Pages flagged:            {total_pages - complete_pages}",
        "",
        "=== ROW COUNTS ===",
        f"Target rows (you expect): {TARGET_TOTAL_ROWS}",
        f"Target rows (rule-based): {expected_total}",
        "",
        f"Extracted rows:           {extracted_total}",
        "",
        f"Missing vs rule-based:    {missing_vs_expected}",
        f"Completion vs rule-based: {pct_vs_expected:.2f}%",
        "",
        f"Missing vs your target:   {missing_vs_target}",
        f"Completion vs your target:{pct_vs_target:.2f}%",
        "",
        f"Complete-page output:     {out_complete_csv}",
        f"All-rows output:          {out_all_csv}",
        f"Counts:                   {out_counts_csv}",
        f"Needs review:             {out_review_csv}",
        f"Debug dumps:              {OCR_TEXT_DIR}\\page_###.txt",
    ]
    out_summary_txt.write_text("\n".join(summary_lines), encoding="utf-8")

    print("\n[Done] Outputs:", flush=True)
    print(f"  - {out_all_csv}            rows={len(all_df)} (includes incomplete pages)", flush=True)
    print(f"  - {out_complete_csv}       rows={len(complete_df)} (only pages that passed checks)", flush=True)
    print(f"  - {out_counts_csv}", flush=True)
    print(f"  - {out_review_csv}         pages_flagged={len(needs_review_df)}", flush=True)
    print(f"  - {out_summary_txt}", flush=True)
    print(f"\n[Summary] Target={TARGET_TOTAL_ROWS} | Extracted={extracted_total} | Missing={missing_vs_target} | {pct_vs_target:.2f}%", flush=True)

    return page_counts_df, needs_review_df, complete_df, all_df

page_counts_df, needs_review_df, complete_df, all_df = run_all()

[Info] Found exactly 102 pages with 1 image each (no duplicates). ✅
[Start] Processing 102 pages from JPEGS with 6 workers
  completed 10/102 | 0.08 pages/sec | ETA ~ 18.1 min
  completed 20/102 | 0.09 pages/sec | ETA ~ 15.5 min
  completed 30/102 | 0.10 pages/sec | ETA ~ 11.6 min
  completed 40/102 | 0.10 pages/sec | ETA ~ 10.3 min
  completed 50/102 | 0.10 pages/sec | ETA ~ 8.9 min
  completed 60/102 | 0.11 pages/sec | ETA ~ 6.7 min
  completed 70/102 | 0.10 pages/sec | ETA ~ 5.2 min
  completed 80/102 | 0.10 pages/sec | ETA ~ 3.6 min
  completed 90/102 | 0.10 pages/sec | ETA ~ 1.9 min
  completed 100/102 | 0.11 pages/sec | ETA ~ 0.3 min
  completed 102/102 | 0.11 pages/sec | ETA ~ 0.0 min

[Done] Outputs:
  - out\output_all_rows.csv            rows=11263 (includes incomplete pages)
  - out\output_complete_pages.csv       rows=0 (only pages that passed checks)
  - out\page_counts.csv
  - out\needs_review.csv         pages_flagged=102
  - out\run_summary.txt

[Summary] Target=16473 | 