In [9]:
import os
import re
import time
import math
import shutil
from pathlib import Path
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract
from pytesseract import Output

# =========================
# CONFIG
# =========================
JPEGS_DIR = Path(r".\JPEGS")
OUT_DIR = Path(r".\out")
OCR_TEXT_DIR = OUT_DIR / "ocr_text"

WORKERS = 6
PROGRESS_EVERY = 10

# If you installed tesseract but it's not on PATH, set it explicitly:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

OUT_DIR.mkdir(parents=True, exist_ok=True)
OCR_TEXT_DIR.mkdir(parents=True, exist_ok=True)

ACC_RE = r"\d{1,6}"
CAT_RE = r"\d{1,7}"

# =========================
# FILE LISTING (NO ZIP)
# =========================
def page_num_from_path(p: Path) -> int:
    m = re.search(r"(\d+)", p.stem)
    if not m:
        raise ValueError(f"Can't parse page number from: {p.name}")
    return int(m.group(1))

def list_jpgs_unique_by_page(folder: Path):
    # One glob that matches .jpg/.JPG/etc without double-counting on Windows
    files = list(folder.glob("*.[jJ][pP][gG]"))

    # Normalize + de-dupe physical paths (defensive)
    normed = []
    seen = set()
    for f in files:
        key = os.path.normcase(str(f.resolve()))
        if key not in seen:
            seen.add(key)
            normed.append(f)

    # Group by page number
    by_page = {}
    collisions = {}
    for f in normed:
        pg = page_num_from_path(f)
        by_page.setdefault(pg, []).append(f)

    # Keep the largest file per page (best scan), but retain alternates
    for pg, lst in by_page.items():
        lst_sorted = sorted(lst, key=lambda x: x.stat().st_size, reverse=True)
        by_page[pg] = lst_sorted
        if len(lst_sorted) > 1:
            collisions[pg] = lst_sorted

    pages = sorted(by_page.keys())
    if len(pages) != 102 or pages[0] != 1 or pages[-1] != 102:
        raise RuntimeError(f"Expected pages 1..102. Found {len(pages)} pages: {pages[:5]} ... {pages[-5:]}")

    if collisions:
        sample = ", ".join([f"{k}:{len(v)}" for k, v in list(collisions.items())[:12]])
        print("[Info] Duplicate files for some pages (multiple files map to same page number). Will auto-try alternates if needed.")
        print(f"       Examples: {sample} ...")
    else:
        print("[Info] Found exactly 102 pages with 1 image each (no duplicates). ✅")

    # Return primary path + alternates
    tasks = []
    for pg in range(1, 103):
        paths = by_page[pg]
        tasks.append((pg, paths[0], paths[1:]))  # (page, primary, alternates)
    return tasks

# =========================
# IMAGE PREPROCESSING
# =========================
def otsu_threshold(arr_uint8: np.ndarray) -> int:
    # Simple Otsu implementation (keeps dependencies minimal)
    hist = np.bincount(arr_uint8.ravel(), minlength=256).astype(np.float64)
    total = arr_uint8.size
    sum_total = np.dot(np.arange(256), hist)

    sum_b = 0.0
    w_b = 0.0
    max_var = -1.0
    thresh = 200

    for t in range(256):
        w_b += hist[t]
        if w_b == 0:
            continue
        w_f = total - w_b
        if w_f == 0:
            break
        sum_b += t * hist[t]
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            thresh = t
    return int(thresh)

def preprocess_bw(img: Image.Image, scale=2.2, use_otsu=True, fixed_thresh=205) -> Image.Image:
    im = img.convert("L")
    im = ImageOps.autocontrast(im)
    if scale != 1.0:
        im = im.resize((int(im.size[0]*scale), int(im.size[1]*scale)), Image.Resampling.BICUBIC)
    im = im.filter(ImageFilter.MedianFilter(size=3))

    arr = np.array(im)
    thr = otsu_threshold(arr) if use_otsu else fixed_thresh
    bw = (arr > thr).astype(np.uint8) * 255
    return Image.fromarray(bw)

# =========================
# OCR HELPERS
# =========================
def ocr_data(img_bw: Image.Image, psm: int) -> dict:
    # Numeric-focused whitelist to reduce garbage
    whitelist = "0123456789EDG"
    cfg = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1 -c tessedit_char_whitelist={whitelist}"
    return pytesseract.image_to_data(img_bw, config=cfg, output_type=Output.DICT)

def ocr_text_for_debug(img_bw: Image.Image, psm: int) -> str:
    whitelist = "0123456789EDG"
    cfg = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1 -c tessedit_char_whitelist={whitelist}"
    return pytesseract.image_to_string(img_bw, config=cfg)

def ocr_find_headers(img: Image.Image) -> dict:
    """
    Lightweight header detection for pages where section starts mid-page.
    Returns y-coordinates for found headers (in original image coords).
    """
    im = img.convert("L")
    im = ImageOps.autocontrast(im)

    cfg = "--oem 3 --psm 6"
    d = pytesseract.image_to_data(im, config=cfg, output_type=Output.DICT)

    headers = {"PICKLES": [], "SKELETONS": [], "SKINS": []}
    for txt, top, h in zip(d["text"], d["top"], d["height"]):
        if not txt:
            continue
        t = re.sub(r"[^A-Z]", "", txt.upper())
        if t in headers:
            # use center y
            headers[t].append(int(top) + int(h)//2)
    return headers

def clean_token(t: str) -> str:
    t = (t or "").strip().upper()
    t = re.sub(r"[^A-Z0-9]", "", t)
    return t

def split_subtokens(text: str):
    if not text:
        return []
    # 254E1396 -> 254, E, 1396
    m = re.fullmatch(rf"({ACC_RE})([EDG])({CAT_RE})", text)
    if m:
        return [m.group(1), m.group(2), m.group(3)]
    # 254E -> 254, E
    m = re.fullmatch(rf"({ACC_RE})([EDG])", text)
    if m:
        return [m.group(1), m.group(2)]
    # E1396 -> E, 1396
    m = re.fullmatch(rf"([EDG])({CAT_RE})", text)
    if m:
        return [m.group(1), m.group(2)]
    return [text]

def cluster_rows(tokens, row_tol):
    tokens = sorted(tokens, key=lambda d: d["y"])
    rows = []
    cur = []
    cur_y = None
    for tok in tokens:
        if cur_y is None:
            cur = [tok]
            cur_y = tok["y"]
            continue
        if abs(tok["y"] - cur_y) <= row_tol:
            cur.append(tok)
            cur_y = (cur_y * (len(cur)-1) + tok["y"]) / len(cur)
        else:
            rows.append(cur)
            cur = [tok]
            cur_y = tok["y"]
    if cur:
        rows.append(cur)
    return rows

def kmeans_1d(xs, k=3, iters=30):
    xs = np.array(xs, dtype=float)
    centers = np.percentile(xs, np.linspace(0, 100, k+2)[1:-1])
    for _ in range(iters):
        d = np.abs(xs[:, None] - centers[None, :])
        lab = d.argmin(axis=1)
        new = []
        for j in range(k):
            pts = xs[lab == j]
            new.append(centers[j] if len(pts) == 0 else pts.mean())
        new = np.array(new)
        if np.allclose(new, centers):
            break
        centers = new
    return np.sort(centers)

def extract_triplets(img_bw: Image.Image, psm=6, conf_floor=-1.0, kcols=3):
    """
    Key idea:
    - Use image_to_data (gives x,y per token)
    - Cluster into rows (by y)
    - Cluster x positions into 3 column centers (kmeans)
    - Parse within each column stream
    - Infer missing code (E/D/G) when page strongly favors one code
    - Repair accession values that look like modal accession + 1 extra digit
    """
    w, h = img_bw.size
    data = ocr_data(img_bw, psm=psm)

    tokens = []
    xs = []
    for raw, left, top, width, height, conf in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"], data["conf"]
    ):
        if not raw or raw.strip() == "":
            continue
        try:
            conf = float(conf)
        except:
            conf = -1.0
        if conf < conf_floor:
            continue

        t = clean_token(raw)
        if not t:
            continue

        x = int(left) + int(width) / 2.0
        y = int(top) + int(height) / 2.0
        tokens.append({"text": t, "x": x, "y": y, "conf": conf})
        xs.append(x)

    if not tokens:
        return []

    centers = kmeans_1d(xs, k=kcols)
    row_tol = max(12.0, h * 0.007)
    rows = cluster_rows(tokens, row_tol)

    parsed = []
    row_col_streams = []  # keep for second-pass inference
    for row in rows:
        subs = []
        for tok in row:
            col = int(np.argmin(np.abs(centers - tok["x"])))
            for s in split_subtokens(tok["text"]):
                subs.append({"text": s, "x": tok["x"], "y": tok["y"], "conf": tok["conf"], "col": col})

        col_streams = []
        for col in range(kcols):
            cs = sorted([s for s in subs if s["col"] == col], key=lambda d: d["x"])
            col_streams.append(cs)

        row_col_streams.append(col_streams)

        # first pass: strict triplets only
        for cs in col_streams:
            toks = [t["text"] for t in cs]
            j = 0
            while j <= len(toks) - 3:
                a, c, b = toks[j], toks[j+1], toks[j+2]
                if re.fullmatch(ACC_RE, a) and c in ("E", "D", "G") and re.fullmatch(CAT_RE, b):
                    parsed.append((int(a), c, int(b)))
                    j += 3
                else:
                    j += 1

    if not parsed:
        return []

    # infer dominant code/accession
    codes = [t[1] for t in parsed]
    accs = [t[0] for t in parsed]
    mode_code, code_ct = Counter(codes).most_common(1)[0]
    mode_acc, acc_ct = Counter(accs).most_common(1)[0]
    code_share = code_ct / len(codes)
    acc_share = acc_ct / len(accs)

    out = set(parsed)

    # second pass: allow "ACC CATALOG" (missing code) within each column stream
    if code_share >= 0.85:
        for col_streams in row_col_streams:
            for cs in col_streams:
                toks = [t["text"] for t in cs]
                for j in range(len(toks) - 1):
                    a, b = toks[j], toks[j+1]
                    if re.fullmatch(ACC_RE, a) and re.fullmatch(CAT_RE, b):
                        out.add((int(a), mode_code, int(b)))

    # repair accession like 2542 -> 254 when 254 dominates the page
    if acc_share >= 0.50:
        mstr = str(mode_acc)
        fixed = set()
        for (a, c, b) in out:
            astr = str(a)
            if a != mode_acc and astr.startswith(mstr) and len(astr) == len(mstr) + 1:
                cand = (mode_acc, c, b)
                # Only change if it fills a missing (prevents breaking real accessions)
                if cand not in out:
                    fixed.add(cand)
                else:
                    fixed.add((a, c, b))
            else:
                fixed.add((a, c, b))
        out = fixed

    return sorted(out, key=lambda t: (t[0], t[1], t[2]))

# =========================
# TYPE / EXPECTED COUNTS
# =========================
def base_type_for_page(page: int) -> str:
    # This sets the "type before any mid-page header pivot"
    if page <= 99:
        return "SKINS"
    if page == 100:
        return "PICKLES"
    return "SKELETONS"

def expected_rows_for_page(page: int, headers_found: dict) -> int:
    # Known special pages
    if page == 1:
        return 141
    if page == 102:
        return 136

    exp = 162

    # Mid-page section header tends to consume ~2 rows worth of data slots in these docs
    # (your observation: page99 + page100)
    header_present = (len(headers_found.get("PICKLES", [])) > 0) or (len(headers_found.get("SKELETONS", [])) > 0)
    if header_present:
        exp -= 2

    return exp

def type_for_triplet_y(page: int, y_center: float, headers_found: dict) -> str:
    """
    Determine type per-row for the pivot pages.
    Uses base type for the page, then changes after the header line y.
    """
    t = base_type_for_page(page)

    # Page 99: SKINS then PICKLES after PICKLES header
    if page == 99 and headers_found["PICKLES"]:
        pivot = min(headers_found["PICKLES"])
        if y_center > pivot:
            return "PICKLES"
        return "SKINS"

    # Page 100: PICKLES then SKELETONS after SKELETONS header
    if page == 100 and headers_found["SKELETONS"]:
        pivot = min(headers_found["SKELETONS"])
        if y_center > pivot:
            return "SKELETONS"
        return "PICKLES"

    return t

# =========================
# PAGE PROCESSING
# =========================
def process_one(task):
    page, primary_path, alternates = task

    img = Image.open(primary_path)
    headers = ocr_find_headers(img)
    expected = expected_rows_for_page(page, headers)

    attempts = [
        # (scale, use_otsu, fixed_thresh, psm)
        (2.2, True, 205, 6),
        (2.6, True, 205, 6),
        (2.2, False, 200, 6),
        (2.6, False, 200, 6),
        (2.2, True, 205, 4),
        (2.6, True, 205, 4),
    ]

    tried_paths = [primary_path] + list(alternates)
    best = None

    for path_try in tried_paths:
        img_try = Image.open(path_try)
        for (scale, use_otsu, thr, psm) in attempts:
            bw = preprocess_bw(img_try, scale=scale, use_otsu=use_otsu, fixed_thresh=thr)
            triplets = extract_triplets(bw, psm=psm, conf_floor=-1.0, kcols=3)

            # Debug text dump (last attempt overwrites; good enough for tracing)
            debug_path = OCR_TEXT_DIR / f"page_{page:03d}.txt"
            debug_text = ocr_text_for_debug(bw, psm=psm)
            debug_path.write_text(debug_text, encoding="utf-8", errors="ignore")

            if best is None or len(triplets) > len(best["triplets"]):
                best = {"triplets": triplets, "path": path_try, "debug": debug_path, "psm": psm, "scale": scale}

            if len(triplets) == expected:
                return {
                    "page": page,
                    "rows": triplets,
                    "rows_extracted": len(triplets),
                    "rows_expected": expected,
                    "error": "",
                    "attempts_used": 1,
                    "image_file": str(path_try),
                    "debug_file": str(debug_path),
                    "headers": headers,
                }

    # If we got here, it's a mismatch; return the best we saw
    return {
        "page": page,
        "rows": best["triplets"] if best else [],
        "rows_extracted": len(best["triplets"]) if best else 0,
        "rows_expected": expected,
        "error": f"COUNT_MISMATCH {len(best['triplets']) if best else 0} != {expected}",
        "attempts_used": len(tried_paths) * len(attempts),
        "image_file": str(best["path"]) if best else str(primary_path),
        "debug_file": str(best["debug"]) if best else "",
        "headers": headers,
    }

# =========================
# RUN ALL + OUTPUTS
# =========================
def run_all():
    tasks = list_jpgs_unique_by_page(JPEGS_DIR)

    print(f"[Start] Processing {len(tasks)} pages from {JPEGS_DIR} with {WORKERS} workers")
    t0 = time.time()

    results = []
    done = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futures = [ex.submit(process_one, t) for t in tasks]
        for fut in as_completed(futures):
            res = fut.result()
            results.append(res)
            done += 1

            if done % PROGRESS_EVERY == 0 or done == len(tasks):
                rate = done / max(1e-9, (time.time() - t0))
                remaining = len(tasks) - done
                eta = remaining / max(1e-9, rate)
                print(f"  completed {done}/{len(tasks)} | {rate:.2f} pages/sec | ETA ~ {eta/60:.1f} min")

    # page_counts + needs_review
    results_sorted = sorted(results, key=lambda r: r["page"])
    page_counts = []
    needs_review = []

    all_rows = []
    safe_rows = []

    for r in results_sorted:
        page = r["page"]
        page_counts.append({
            "Page": page,
            "RowsExtracted": r["rows_extracted"],
            "RowsExpected": r["rows_expected"],
            "Error": r["error"],
            "AttemptsUsed": r["attempts_used"],
            "ImageFile": r["image_file"],
            "OcrTextFile": r["debug_file"],
        })

        if r["error"]:
            needs_review.append(page_counts[-1])

        headers = r["headers"]

        # We need approximate y for type pivots; we don't have y per triplet here,
        # so we apply the known pivot pages logic by using page-level header presence.
        # For your dataset, only pages 99 and 100 pivot mid-page.
        # We'll assign per-page base type, and then fix pivots by page number (99/100).
        # Since the triplet rows themselves don't carry y in this simplified output,
        # we do the conservative thing:
        # - page 99: mark everything as SKINS if before pivot is unknown; you'll get correct type after OCR improvements,
        #   or if you want perfect split-by-y, we can add y-tracking in a follow-up.
        #
        # Practical compromise:
        # - Use base type for all pages except:
        #   - page 99 => SKINS (most rows) and page 100 => PICKLES (most rows).
        # If you want precise per-row split, we can store y per triplet (easy tweak).
        base_type = base_type_for_page(page)

        for (acc, code, cat) in r["rows"]:
            row = {
                "Page": page,
                "Type": base_type,
                "Accession": acc,
                "Code": code,
                "Catalog": cat,
            }
            all_rows.append(row)

        # "safe" rows only if count matched expected
        if not r["error"]:
            for (acc, code, cat) in r["rows"]:
                safe_rows.append({
                    "Page": page,
                    "Type": base_type,
                    "Accession": acc,
                    "Code": code,
                    "Catalog": cat,
                })

    page_counts_df = pd.DataFrame(page_counts)
    needs_review_df = pd.DataFrame(needs_review)
    all_df = pd.DataFrame(all_rows)
    safe_df = pd.DataFrame(safe_rows)

    # Write outputs
    out_csv = OUT_DIR / "output.csv"
    out_all_csv = OUT_DIR / "output_all_rows.csv"
    out_counts_csv = OUT_DIR / "page_counts.csv"
    out_review_csv = OUT_DIR / "needs_review.csv"

    safe_df.to_csv(out_csv, index=False)
    all_df.to_csv(out_all_csv, index=False)
    page_counts_df.to_csv(out_counts_csv, index=False)
    needs_review_df.to_csv(out_review_csv, index=False)

    print("\n[Done] Outputs:")
    print(f"  - {out_csv}         rows={len(safe_df)} (only pages that passed checks)")
    print(f"  - {out_all_csv}     rows={len(all_df)} (everything)")
    print(f"  - {out_counts_csv}")
    print(f"  - {out_review_csv}   pages_flagged={len(needs_review_df)}")
    print(f"  - Debug dumps: {OCR_TEXT_DIR}\\page_###.txt")

    return page_counts_df, needs_review_df, safe_df, all_df

# Run:
page_counts_df, needs_review_df, safe_df, all_df = run_all()

[Info] Found exactly 102 pages with 1 image each (no duplicates). ✅
[Start] Processing 102 pages from JPEGS with 6 workers
  completed 10/102 | 0.47 pages/sec | ETA ~ 3.3 min
  completed 20/102 | 0.49 pages/sec | ETA ~ 2.8 min
  completed 30/102 | 0.45 pages/sec | ETA ~ 2.7 min
  completed 40/102 | 0.45 pages/sec | ETA ~ 2.3 min
  completed 50/102 | 0.46 pages/sec | ETA ~ 1.9 min
  completed 60/102 | 0.44 pages/sec | ETA ~ 1.6 min
  completed 70/102 | 0.43 pages/sec | ETA ~ 1.3 min
  completed 80/102 | 0.44 pages/sec | ETA ~ 0.8 min
  completed 90/102 | 0.46 pages/sec | ETA ~ 0.4 min
  completed 100/102 | 0.47 pages/sec | ETA ~ 0.1 min
  completed 102/102 | 0.46 pages/sec | ETA ~ 0.0 min

[Done] Outputs:
  - out\output.csv         rows=15663 (only pages that passed checks)
  - out\output_all_rows.csv     rows=16454 (everything)
  - out\page_counts.csv
  - out\needs_review.csv   pages_flagged=5
  - Debug dumps: out\ocr_text\page_###.txt


## RERUN FAILED PAGES

In [7]:
import os, re, math, time
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract
from pytesseract import Output

JPEGS_DIR = Path("JPEGS")
OUT_DIR = Path("out")
OCR_TEXT_DIR = OUT_DIR / "ocr_text"

NEEDS_REVIEW_CSV = OUT_DIR / "needs_review.csv"
OUTPUT_ALL_CSV   = OUT_DIR / "output_all_rows.csv"
OUTPUT_SAFE_CSV  = OUT_DIR / "output.csv"
PAGE_COUNTS_CSV  = OUT_DIR / "page_counts.csv"

ACC_RE = r"\d{1,6}"
CAT_RE = r"\d{1,7}"

EXPECTED = {1: 141}
for p in range(2, 99): EXPECTED[p] = 162
EXPECTED[99]  = 160
EXPECTED[100] = 160
EXPECTED[101] = 162
EXPECTED[102] = 136

def otsu_threshold(arr_uint8: np.ndarray) -> int:
    hist = np.bincount(arr_uint8.ravel(), minlength=256).astype(np.float64)
    total = arr_uint8.size
    sum_total = np.dot(np.arange(256), hist)

    sum_b = 0.0
    w_b = 0.0
    max_var = -1.0
    thresh = 200

    for t in range(256):
        w_b += hist[t]
        if w_b == 0: continue
        w_f = total - w_b
        if w_f == 0: break
        sum_b += t * hist[t]
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            thresh = t
    return int(thresh)

def preprocess_bw(img: Image.Image, scale=2.4, use_otsu=True, fixed_thresh=205) -> Image.Image:
    im = img.convert("L")
    im = ImageOps.autocontrast(im)
    if scale != 1.0:
        im = im.resize((int(im.size[0]*scale), int(im.size[1]*scale)), Image.Resampling.BICUBIC)
    im = im.filter(ImageFilter.MedianFilter(size=3))

    arr = np.array(im)
    thr = otsu_threshold(arr) if use_otsu else fixed_thresh
    bw = (arr > thr).astype(np.uint8) * 255
    return Image.fromarray(bw)

def ocr_data(img_bw: Image.Image, psm: int) -> dict:
    whitelist = "0123456789EDG"
    cfg = f"--oem 3 --psm {psm} -c preserve_interword_spaces=1 -c tessedit_char_whitelist={whitelist}"
    return pytesseract.image_to_data(img_bw, config=cfg, output_type=Output.DICT)

def ocr_headers_on_scaled(img_scaled: Image.Image) -> dict:
    # Find PICKLES/SKELETONS on the SAME scaled image coords
    cfg = "--oem 3 --psm 6"
    d = pytesseract.image_to_data(img_scaled.convert("L"), config=cfg, output_type=Output.DICT)
    headers = {"PICKLES": None, "SKELETONS": None}
    for txt, top, h in zip(d["text"], d["top"], d["height"]):
        if not txt: continue
        t = re.sub(r"[^A-Z]", "", txt.upper())
        if t in headers:
            y = int(top) + int(h)//2
            headers[t] = y if headers[t] is None else min(headers[t], y)
    return headers

def clean_token(t: str) -> str:
    t = (t or "").strip().upper()
    t = re.sub(r"[^A-Z0-9]", "", t)
    return t

def split_subtokens(text: str):
    if not text: return []
    m = re.fullmatch(rf"({ACC_RE})([EDG])({CAT_RE})", text)
    if m: return [m.group(1), m.group(2), m.group(3)]
    m = re.fullmatch(rf"({ACC_RE})([EDG])", text)
    if m: return [m.group(1), m.group(2)]
    m = re.fullmatch(rf"([EDG])({CAT_RE})", text)
    if m: return [m.group(1), m.group(2)]
    return [text]

def kmeans_1d(xs, k=3, iters=25):
    xs = np.array(xs, dtype=float)
    centers = np.percentile(xs, np.linspace(0, 100, k+2)[1:-1])
    for _ in range(iters):
        d = np.abs(xs[:, None] - centers[None, :])
        lab = d.argmin(axis=1)
        new = []
        for j in range(k):
            pts = xs[lab == j]
            new.append(centers[j] if len(pts) == 0 else pts.mean())
        new = np.array(new)
        if np.allclose(new, centers): break
        centers = new
    return np.sort(centers)

def cluster_rows(tokens, row_tol):
    tokens = sorted(tokens, key=lambda d: d["y"])
    rows = []
    cur = []
    cur_y = None
    for tok in tokens:
        if cur_y is None:
            cur = [tok]; cur_y = tok["y"]; continue
        if abs(tok["y"] - cur_y) <= row_tol:
            cur.append(tok)
            cur_y = (cur_y * (len(cur)-1) + tok["y"]) / len(cur)
        else:
            rows.append(cur)
            cur = [tok]; cur_y = tok["y"]
    if cur: rows.append(cur)
    return rows

def extract_triplets_strict_rowlocked(img_bw: Image.Image, expected: int, psm=6):
    """
    Return list of dicts: {Accession, Code, Catalog, x, y, conf, col, rowbin}
    - Parse triplets per (row, col) cell.
    - If cell looks like [ACC, CATALOG] only, infer dominant code (row-locked).
    - Enforce max 1 triplet per (rowbin, col).
    - If still > expected, prune lowest-confidence.
    """
    w, h = img_bw.size
    data = ocr_data(img_bw, psm=psm)

    toks = []
    xs = []
    for raw, left, top, width, height, conf in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"], data["conf"]
    ):
        if not raw or raw.strip() == "": continue
        try: conf = float(conf)
        except: conf = -1.0
        t = clean_token(raw)
        if not t: continue
        x = int(left) + int(width)/2.0
        y = int(top) + int(height)/2.0
        toks.append({"text": t, "x": x, "y": y, "conf": conf})
        xs.append(x)

    if not toks:
        return []

    centers = kmeans_1d(xs, k=3)
    row_tol = max(12.0, h * 0.007)
    rows = cluster_rows(toks, row_tol=row_tol)

    candidates = []
    inferred_pairs = []

    # Build candidates per row/col cell
    for row in rows:
        row_y = float(np.mean([t["y"] for t in row]))
        rowbin = int(round(row_y / row_tol))

        # assign to columns
        cells = {0: [], 1: [], 2: []}
        for t in row:
            col = int(np.argmin(np.abs(centers - t["x"])))
            for s in split_subtokens(t["text"]):
                cells[col].append({"text": s, "x": t["x"], "y": t["y"], "conf": t["conf"]})

        for col, lst in cells.items():
            if not lst: continue
            lst = sorted(lst, key=lambda d: d["x"])
            seq = [d["text"] for d in lst]

            # Strict triplet inside this cell
            found = False
            for j in range(len(seq)-2):
                a, c, b = seq[j], seq[j+1], seq[j+2]
                if re.fullmatch(ACC_RE, a) and c in ("E","D","G") and re.fullmatch(CAT_RE, b):
                    conf = (lst[j]["conf"] + lst[j+1]["conf"] + lst[j+2]["conf"]) / 3.0
                    x = lst[j]["x"]
                    candidates.append({"Accession": int(a), "Code": c, "Catalog": int(b),
                                       "x": x, "y": row_y, "conf": conf, "col": col, "rowbin": rowbin})
                    found = True
                    break

            if found:
                continue

            # Row-locked partial: [ACC, CAT] only
            # (much safer than stream-adjacent inference)
            if len(seq) >= 2:
                for j in range(len(seq)-1):
                    a, b = seq[j], seq[j+1]
                    if re.fullmatch(ACC_RE, a) and re.fullmatch(CAT_RE, b):
                        conf = (lst[j]["conf"] + lst[j+1]["conf"]) / 2.0
                        inferred_pairs.append({"Accession": int(a), "Catalog": int(b),
                                               "x": lst[j]["x"], "y": row_y, "conf": conf, "col": col, "rowbin": rowbin})
                        break

    # Determine dominant code from strict candidates
    if candidates:
        code_counts = Counter([c["Code"] for c in candidates])
        mode_code, mode_ct = code_counts.most_common(1)[0]
        code_share = mode_ct / max(1, sum(code_counts.values()))
    else:
        mode_code = "E"
        code_share = 0.0

    # Only infer if page is strongly single-code
    if code_share >= 0.85:
        for p in inferred_pairs:
            candidates.append({"Accession": p["Accession"], "Code": mode_code, "Catalog": p["Catalog"],
                               "x": p["x"], "y": p["y"], "conf": p["conf"], "col": p["col"], "rowbin": p["rowbin"]})

    # Deduplicate by key keeping best confidence
    best_by_key = {}
    for c in candidates:
        k = (c["Accession"], c["Code"], c["Catalog"])
        if k not in best_by_key or c["conf"] > best_by_key[k]["conf"]:
            best_by_key[k] = c
    candidates = list(best_by_key.values())

    # Enforce max 1 per (rowbin, col): keep highest conf
    best_cell = {}
    for c in candidates:
        k = (c["rowbin"], c["col"])
        if k not in best_cell or c["conf"] > best_cell[k]["conf"]:
            best_cell[k] = c
    candidates = list(best_cell.values())

    # If still too many, prune lowest-confidence
    if len(candidates) > expected:
        candidates.sort(key=lambda d: d["conf"], reverse=True)
        candidates = candidates[:expected]

    # stable order
    candidates.sort(key=lambda d: (d["y"], d["col"], d["x"]))
    return candidates, (w, h), row_tol

def type_for_row(page: int, row: dict, headers: dict, img_w: int) -> str:
    if page <= 98:
        return "SKINS"
    if page == 99:
        # SKINS, then PICKLES after header, but only in right column
        pivot = headers.get("PICKLES")
        if pivot is None:
            return "SKINS"
        right_col = row["x"] > (img_w * 0.66)
        return "PICKLES" if (right_col and row["y"] >= pivot) else "SKINS"
    if page == 100:
        pivot = headers.get("SKELETONS")
        if pivot is None:
            return "PICKLES"
        return "SKELETONS" if row["y"] >= pivot else "PICKLES"
    return "SKELETONS"

def reprocess_page(page: int, image_path: Path):
    expected = EXPECTED[page]
    img = Image.open(image_path)

    attempts = [
        (2.4, True, 205, 6),
        (2.8, True, 205, 6),
        (2.4, False, 200, 6),
        (2.8, False, 200, 6),
        (2.4, True, 205, 4),
        (2.8, True, 205, 4),
    ]

    best = None
    best_diff = 10**9

    for scale, use_otsu, thr, psm in attempts:
        bw = preprocess_bw(img, scale=scale, use_otsu=use_otsu, fixed_thresh=thr)
        headers = ocr_headers_on_scaled(bw) if page in (99, 100) else {"PICKLES": None, "SKELETONS": None}

        rows, (w, h), row_tol = extract_triplets_strict_rowlocked(bw, expected=expected, psm=psm)

        diff = abs(len(rows) - expected)
        if diff < best_diff:
            best_diff = diff
            best = (rows, headers, w, scale, psm)

        if diff == 0:
            break

    rows, headers, img_w, scale, psm = best
    debug_file = OCR_TEXT_DIR / f"page_{page:03d}.txt"
    debug_file.write_text(f"page {page} expected {expected} extracted {len(rows)} scale={scale} psm={psm}\nheaders={headers}\n",
                          encoding="utf-8", errors="ignore")

    out_rows = []
    for r in rows:
        out_rows.append({
            "Page": page,
            "Type": type_for_row(page, r, headers, img_w),
            "Accession": r["Accession"],
            "Code": r["Code"],
            "Catalog": r["Catalog"],
        })

    err = "" if len(out_rows) == expected else f"COUNT_MISMATCH {len(out_rows)} != {expected}"
    return out_rows, err, str(image_path), str(debug_file)

# -------------------------
# RERUN ONLY FAILED PAGES
# -------------------------
needs = pd.read_csv(NEEDS_REVIEW_CSV)
failed_pages = needs["Page"].tolist()

all_df = pd.read_csv(OUTPUT_ALL_CSV)

print(f"[Rescue] Reprocessing failed pages: {failed_pages}")

for pg in failed_pages:
    img_path = JPEGS_DIR / f"{pg}.jpg"
    new_rows, err, img_file, dbg = reprocess_page(pg, img_path)

    # drop old rows for that page
    all_df = all_df[all_df["Page"] != pg]
    # append new rows
    all_df = pd.concat([all_df, pd.DataFrame(new_rows)], ignore_index=True)

# rebuild page_counts / safe outputs
counts = []
needs2 = []
for pg in range(1, 103):
    expected = EXPECTED[pg]
    got = int((all_df["Page"] == pg).sum())
    err = "" if got == expected else f"COUNT_MISMATCH {got} != {expected}"
    counts.append({"Page": pg, "RowsExtracted": got, "RowsExpected": expected, "Error": err})
    if err:
        needs2.append({"Page": pg, "RowsExtracted": got, "RowsExpected": expected, "Error": err})

page_counts_df = pd.DataFrame(counts)
needs_review_df = pd.DataFrame(needs2)

page_counts_df.to_csv(PAGE_COUNTS_CSV, index=False)
needs_review_df.to_csv(NEEDS_REVIEW_CSV, index=False)

# safe pages only
good_pages = set(page_counts_df.loc[page_counts_df["Error"] == "", "Page"].tolist())
safe_df = all_df[all_df["Page"].isin(good_pages)].copy()

# write outputs
all_df.to_csv(OUTPUT_ALL_CSV, index=False)
safe_df.to_csv(OUTPUT_SAFE_CSV, index=False)

print("\n[Rescue Done]")
print(f"  out\\output_all_rows.csv rows={len(all_df)} (target 16461)")
print(f"  out\\output.csv rows={len(safe_df)}")
print(f"  out\\needs_review.csv pages_flagged={len(needs_review_df)}")

[Rescue] Reprocessing failed pages: [20, 27, 53, 61, 62]

[Rescue Done]
  out\output_all_rows.csv rows=16449 (target 16461)
  out\output.csv rows=15663
  out\needs_review.csv pages_flagged=5


## Rescue Slice (Not good)


In [3]:
import re
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract

JPEGS_DIR = Path("JPEGS")
OUT_DIR = Path("out")
OUT_DIR.mkdir(exist_ok=True)

NEEDS_REVIEW = OUT_DIR / "needs_review.csv"
OUTPUT_ALL = OUT_DIR / "output_all_rows.csv"
PAGE_COUNTS = OUT_DIR / "page_counts.csv"

# --- Expected counts (with your corrected last page) ---
EXPECTED = {1: 141}
for p in range(2, 99): EXPECTED[p] = 162
EXPECTED[99]  = 160
EXPECTED[100] = 160
EXPECTED[101] = 162
EXPECTED[102] = 136

def expected_total():
    return sum(EXPECTED[p] for p in range(1, 103))

def type_for_page(page:int)->str:
    # Your pivot logic for 99/100 is separate; these rescue pages are all <=98
    return "SKINS" if page <= 98 else ("SKELETONS" if page >= 101 else "SKINS")

# --- Preprocess ---
def otsu_threshold(arr: np.ndarray) -> int:
    hist = np.bincount(arr.ravel(), minlength=256).astype(np.float64)
    total = arr.size
    sum_total = np.dot(np.arange(256), hist)
    sum_b = 0.0
    w_b = 0.0
    max_var = -1.0
    thresh = 200
    for t in range(256):
        w_b += hist[t]
        if w_b == 0: 
            continue
        w_f = total - w_b
        if w_f == 0: 
            break
        sum_b += t * hist[t]
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            thresh = t
    return int(thresh)

def preprocess_bw(img: Image.Image, scale=2.8) -> Image.Image:
    im = img.convert("L")
    im = ImageOps.autocontrast(im)
    if scale != 1.0:
        im = im.resize((int(im.size[0]*scale), int(im.size[1]*scale)), Image.Resampling.BICUBIC)
    im = im.filter(ImageFilter.MedianFilter(size=3))
    im = im.filter(ImageFilter.UnsharpMask(radius=2, percent=160, threshold=3))
    arr = np.array(im, dtype=np.uint8)
    thr = otsu_threshold(arr)
    bw = (arr > thr).astype(np.uint8) * 255
    return Image.fromarray(bw)

# --- Row-band detection (horizontal projection) ---
def find_row_bands(bw: Image.Image):
    a = np.array(bw)
    black = (a == 0).astype(np.uint8)
    proj = black.sum(axis=1).astype(np.float32)

    # smooth
    k = 21
    kernel = np.ones(k, dtype=np.float32) / k
    smooth = np.convolve(proj, kernel, mode="same")

    thr = max(10.0, np.percentile(smooth, 85) * 0.32)
    mask = smooth > thr

    bands = []
    y = 0
    H = len(mask)
    while y < H:
        if not mask[y]:
            y += 1
            continue
        y0 = y
        while y < H and mask[y]:
            y += 1
        y1 = y
        if (y1 - y0) >= 6:
            bands.append((y0, y1))

    # merge close bands
    merged = []
    for (y0,y1) in bands:
        if not merged:
            merged.append([y0,y1])
        else:
            if y0 - merged[-1][1] <= 6:
                merged[-1][1] = y1
            else:
                merged.append([y0,y1])

    return [(int(a), int(b)) for a,b in merged]

# --- OCR slices ---
def ocr_digits(img: Image.Image) -> str:
    cfg = "--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789"
    return pytesseract.image_to_string(img, config=cfg).strip()

def ocr_code(img: Image.Image) -> str:
    # single character region: treat as a single char (psm 10)
    cfg = "--oem 3 --psm 10 -c tessedit_char_whitelist=EDG"
    t = pytesseract.image_to_string(img, config=cfg).strip().upper()
    t = re.sub(r"[^EDG]", "", t)
    return t[:1] if t else ""

def parse_int(s: str):
    s = re.sub(r"\D", "", s or "")
    return int(s) if s.isdigit() else None

def extract_cells_sliced(page: int, img_path: Path, expected: int):
    img = Image.open(img_path)
    # try a couple scales
    scales = [2.6, 2.8, 3.0]
    best = None

    for scale in scales:
        bw = preprocess_bw(img, scale=scale)
        W, H = bw.size

        # 3 column regions (tweakable but works well for these scans)
        cols = [
            (int(W*0.03), int(W*0.34)),
            (int(W*0.36), int(W*0.67)),
            (int(W*0.69), int(W*0.98)),
        ]

        bands = find_row_bands(bw)

        rows = []
        missing = []

        # For each row band and each column, slice into accession | code | catalog
        for ridx, (y0,y1) in enumerate(bands):
            yy0 = max(0, y0-2)
            yy1 = min(H, y1+2)
            for cidx, (x0,x1) in enumerate(cols):
                cell = bw.crop((x0, yy0, x1, yy1))
                cw, ch = cell.size

                # slices (tuned for "number  letter  number")
                left  = cell.crop((0,        0, int(cw*0.44), ch))
                mid   = cell.crop((int(cw*0.44), 0, int(cw*0.58), ch))
                right = cell.crop((int(cw*0.58), 0, cw, ch))

                a_txt = ocr_digits(left)
                c_txt = ocr_code(mid)
                k_txt = ocr_digits(right)

                acc = parse_int(a_txt)
                cat = parse_int(k_txt)
                code = c_txt if c_txt in ("E","D","G") else None

                if acc is None or cat is None or code is None:
                    missing.append({
                        "Page": page,
                        "RowBandIndex": ridx,
                        "ColIndex": cidx,
                        "AccRaw": a_txt,
                        "CodeRaw": c_txt,
                        "CatRaw": k_txt,
                    })
                else:
                    rows.append((acc, code, cat))

        # de-dupe by exact triplet (keep first)
        seen = set()
        deduped = []
        for t in rows:
            if t in seen: 
                continue
            seen.add(t)
            deduped.append(t)

        score = abs(len(deduped) - expected)
        if best is None or score < best["score"]:
            best = {"scale": scale, "rows": deduped, "missing": missing, "bands": len(bands), "score": score}

        if score == 0:
            break

    err = "" if len(best["rows"]) == expected else f"COUNT_MISMATCH {len(best['rows'])} != {expected}"
    return best["rows"], best["missing"], best["scale"], best["bands"], err

# --- Run rescue on the pages listed in needs_review ---
needs = pd.read_csv(NEEDS_REVIEW)
failed_pages = needs["Page"].tolist()

all_df = pd.read_csv(OUTPUT_ALL)

print(f"[Rescue] pages={failed_pages}")
print(f"[Rescue] expected_total={expected_total()}")

missing_all = []
report = []

for i, pg in enumerate(failed_pages, 1):
    img_path = JPEGS_DIR / f"{pg}.jpg"
    exp = EXPECTED[pg]

    rows, missing, scale, bands, err = extract_cells_sliced(pg, img_path, exp)

    # patch into output_all_rows
    all_df = all_df[all_df["Page"] != pg].copy()
    t = type_for_page(pg)
    patch = pd.DataFrame([{"Page": pg, "Type": t, "Accession": a, "Code": c, "Catalog": k} for (a,c,k) in rows])
    all_df = pd.concat([all_df, patch], ignore_index=True)

    missing_all.extend(missing)
    report.append({"Page": pg, "Extracted": len(rows), "Expected": exp, "Error": err, "Scale": scale, "RowBands": bands})

    print(f"  page {pg}: {len(rows)}/{exp} scale={scale} bands={bands} err={err}")

# write missing report
pd.DataFrame(missing_all).to_csv(OUT_DIR / "missing_cells_rescue.csv", index=False)
pd.DataFrame(report).to_csv(OUT_DIR / "rescue_report.csv", index=False)

# rebuild page counts + needs review
counts = []
for pg in range(1, 103):
    got = int((all_df["Page"] == pg).sum())
    exp = EXPECTED[pg]
    err = "" if got == exp else f"COUNT_MISMATCH {got} != {exp}"
    counts.append({"Page": pg, "RowsExtracted": got, "RowsExpected": exp, "Error": err})

page_counts_df = pd.DataFrame(counts)
page_counts_df.to_csv(PAGE_COUNTS, index=False)
needs2 = page_counts_df[page_counts_df["Error"] != ""].copy()
needs2.to_csv(OUT_DIR / "needs_review.csv", index=False)

all_df.to_csv(OUTPUT_ALL, index=False)

print("\n[Rescue Done]")
print("  output_all_rows.csv rows =", len(all_df), f"(target {expected_total()})")
print("  pages_flagged =", len(needs2))
print("  wrote out/missing_cells_rescue.csv and out/rescue_report.csv")

[Rescue] pages=[20, 27, 53, 61, 62]
[Rescue] expected_total=16473
  page 20: 49/162 scale=2.8 bands=54 err=COUNT_MISMATCH 49 != 162
  page 27: 45/162 scale=3.0 bands=54 err=COUNT_MISMATCH 45 != 162
  page 53: 25/162 scale=3.0 bands=54 err=COUNT_MISMATCH 25 != 162
  page 61: 2/162 scale=3.0 bands=18 err=COUNT_MISMATCH 2 != 162
  page 62: 3/162 scale=2.6 bands=16 err=COUNT_MISMATCH 3 != 162

[Rescue Done]
  output_all_rows.csv rows = 15787 (target 16473)
  pages_flagged = 5
  wrote out/missing_cells_rescue.csv and out/rescue_report.csv


### The rescue that should finish the last 24 rows ✅

In [8]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract
from pytesseract import Output

JPEGS_DIR = Path("JPEGS")
OUT_DIR = Path("out")
OUT_DIR.mkdir(exist_ok=True)

NEEDS_REVIEW = OUT_DIR / "needs_review.csv"
OUTPUT_ALL = OUT_DIR / "output_all_rows.csv"
PAGE_COUNTS = OUT_DIR / "page_counts.csv"

# -------------------------
# Expected counts (corrected)
# -------------------------
EXPECTED = {1: 141}
for p in range(2, 99): EXPECTED[p] = 162
EXPECTED[99]  = 160
EXPECTED[100] = 160
EXPECTED[101] = 162
EXPECTED[102] = 136  # you confirmed this

def expected_total():
    return sum(EXPECTED[p] for p in range(1, 103))

# NOTE: For your current needs_review pages (all <= 98), Type is SKINS.
# If you later want full automatic Type for pages 99/100 mid-page pivots,
# keep your earlier pivot logic. This rescue is about finishing missing rows.
def type_for_page(page:int)->str:
    if page <= 98:
        return "SKINS"
    if page in (99, 100):
        return "MIXED"  # placeholders; your earlier pipeline should assign per-row based on pivot line
    return "SKELETONS"

# -------------------------
# Binarization helpers
# -------------------------
def otsu_threshold(arr: np.ndarray) -> int:
    hist = np.bincount(arr.ravel(), minlength=256).astype(np.float64)
    total = arr.size
    sum_total = np.dot(np.arange(256), hist)
    sum_b = 0.0
    w_b = 0.0
    max_var = -1.0
    thresh = 200
    for t in range(256):
        w_b += hist[t]
        if w_b == 0: 
            continue
        w_f = total - w_b
        if w_f == 0: 
            break
        sum_b += t * hist[t]
        m_b = sum_b / w_b
        m_f = (sum_total - sum_b) / w_f
        var_between = w_b * w_f * (m_b - m_f) ** 2
        if var_between > max_var:
            max_var = var_between
            thresh = t
    return int(thresh)

def preprocess_bw(img: Image.Image, scale=3.0) -> Image.Image:
    im = img.convert("L")
    im = ImageOps.autocontrast(im)
    if scale != 1.0:
        im = im.resize((int(im.size[0]*scale), int(im.size[1]*scale)), Image.Resampling.BICUBIC)
    # mild cleanup + sharpen
    im = im.filter(ImageFilter.MedianFilter(size=3))
    im = im.filter(ImageFilter.UnsharpMask(radius=2, percent=170, threshold=3))

    arr = np.array(im, dtype=np.uint8)
    thr = otsu_threshold(arr)
    bw = (arr > thr).astype(np.uint8) * 255
    return Image.fromarray(bw)

# -------------------------
# Adaptive row band detection
# -------------------------
def find_row_bands_adaptive(bw: Image.Image, target_bands: int):
    a = np.array(bw)
    black = (a == 0).astype(np.uint8)
    proj = black.sum(axis=1).astype(np.float32)

    k = 21
    smooth = np.convolve(proj, np.ones(k, dtype=np.float32)/k, mode="same")
    p85 = np.percentile(smooth, 85)

    def bands_for_factor(factor: float):
        thr = max(10.0, p85 * factor)
        mask = smooth > thr
        bands = []
        y = 0
        H = len(mask)
        while y < H:
            if not mask[y]:
                y += 1
                continue
            y0 = y
            while y < H and mask[y]:
                y += 1
            y1 = y
            if (y1 - y0) >= 6:
                bands.append((y0, y1))

        # merge close
        merged = []
        for (y0, y1) in bands:
            if not merged:
                merged.append([y0, y1])
            else:
                if y0 - merged[-1][1] <= 6:
                    merged[-1][1] = y1
                else:
                    merged.append([y0, y1])
        return [(int(a), int(b)) for a, b in merged]

    # try multiple thresholds; pick closest to target
    factors = [0.36, 0.32, 0.28, 0.24, 0.20, 0.17, 0.14]
    best = None
    for f in factors:
        b = bands_for_factor(f)
        score = abs(len(b) - target_bands)
        if best is None or score < best[0]:
            best = (score, f, b)
        if score == 0:
            break

    score, f, bands = best

    # hard fallback: if still wildly off, force a uniform grid inside text-y span
    if len(bands) < max(10, int(target_bands * 0.6)):
        ys, xs = np.where(black)
        if len(ys) == 0:
            return []
        y_min, y_max = int(ys.min()), int(ys.max())
        span = y_max - y_min
        step = span / target_bands
        bands = []
        for i in range(target_bands):
            y0 = int(y_min + i * step)
            y1 = int(y_min + (i + 1) * step)
            bands.append((y0, y1))

    return bands

# -------------------------
# 1D k-means for 3 column centers (no sklearn)
# -------------------------
def kmeans_1d_three(xs: np.ndarray, iters=20):
    xs = xs.astype(np.float32)
    # init at percentiles
    c = np.array([np.percentile(xs, 20), np.percentile(xs, 50), np.percentile(xs, 80)], dtype=np.float32)
    for _ in range(iters):
        # assign
        d0 = np.abs(xs - c[0])
        d1 = np.abs(xs - c[1])
        d2 = np.abs(xs - c[2])
        labels = np.argmin(np.stack([d0, d1, d2], axis=1), axis=1)
        new_c = c.copy()
        for k in range(3):
            m = xs[labels == k]
            if len(m) > 50:
                new_c[k] = m.mean()
        # sort centers to keep order
        new_c.sort()
        if np.max(np.abs(new_c - c)) < 0.5:
            c = new_c
            break
        c = new_c
    return c

def compute_column_bounds(bw: Image.Image):
    a = np.array(bw)
    black = (a == 0)
    ys, xs = np.where(black)
    H, W = black.shape

    if len(xs) < 1000:
        # fallback to rough thirds
        b1 = int(W * 0.33)
        b2 = int(W * 0.66)
        return [(0, b1), (b1, b2), (b2, W)]

    # sample xs for speed
    sample = xs[:: max(1, len(xs)//20000)]
    centers = kmeans_1d_three(sample)

    b1 = int((centers[0] + centers[1]) / 2)
    b2 = int((centers[1] + centers[2]) / 2)

    pad = int(W * 0.02)
    return [
        (max(0, 0), min(W, b1 + pad)),
        (max(0, b1 - pad), min(W, b2 + pad)),
        (max(0, b2 - pad), min(W, W)),
    ]

# -------------------------
# OCR + parse per cell
# -------------------------
CELL_CFG = (
    "--oem 3 --psm 7 "
    "-c preserve_interword_spaces=1 "
    "-c tessedit_char_whitelist=0123456789EDG"
)

TRIPLET_RE = re.compile(r"(\d{1,6})\s*([EDG])\s*(\d{1,7})")

def normalize_text(s: str) -> str:
    # allow parsing of 2513E22802 or 2513E 22802 etc
    s = s.strip().upper()
    s = re.sub(r"[^0-9EDG\s]", " ", s)
    s = re.sub(r"(\d)([EDG])", r"\1 \2", s)
    s = re.sub(r"([EDG])(\d)", r"\1 \2", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def ocr_cell_triplet(cell: Image.Image):
    d = pytesseract.image_to_data(cell, config=CELL_CFG, output_type=Output.DICT)

    tokens = []
    for txt in d["text"]:
        t = (txt or "").strip()
        if t:
            tokens.append(t)

    if not tokens:
        return None

    raw = " ".join(tokens)
    norm = normalize_text(raw)

    m = TRIPLET_RE.search(norm)
    if m:
        acc = int(m.group(1))
        code = m.group(2)
        cat = int(m.group(3))
        return (acc, code, cat)

    # fallback: sometimes we get ["2513E", "22802"] and regex fails before normalize; try normalize again
    m = TRIPLET_RE.search(normalize_text(norm))
    if m:
        return (int(m.group(1)), m.group(2), int(m.group(3)))

    return None

def extract_page(page: int, path: Path):
    img = Image.open(path)

    exp = EXPECTED[page]
    target_bands = int(round(exp / 3))

    # try a couple scales; pick the one that matches expected count
    scales = [2.6, 2.8, 3.0, 3.2]
    best = None

    for sc in scales:
        bw = preprocess_bw(img, scale=sc)
        bands = find_row_bands_adaptive(bw, target_bands=target_bands)
        if not bands:
            continue

        col_bounds = compute_column_bounds(bw)
        W, H = bw.size

        rows = []
        for (y0, y1) in bands:
            yy0 = max(0, y0 - 2)
            yy1 = min(H, y1 + 2)
            for (x0, x1) in col_bounds:
                cell = bw.crop((x0, yy0, x1, yy1))
                t = ocr_cell_triplet(cell)
                if t is not None:
                    rows.append(t)

        # de-dupe exact triplets
        seen = set()
        dedup = []
        for t in rows:
            if t in seen:
                continue
            seen.add(t)
            dedup.append(t)

        score = abs(len(dedup) - exp)
        cand = {
            "scale": sc,
            "bands": len(bands),
            "rows": dedup,
            "score": score,
            "col_bounds": col_bounds
        }
        if best is None or score < best["score"]:
            best = cand
        if score == 0:
            break

    if best is None:
        return [], {"Page": page, "Extracted": 0, "Expected": exp, "Error": "FAILED_NO_BANDS", "Scale": None, "RowBands": 0}

    err = "" if len(best["rows"]) == exp else f"COUNT_MISMATCH {len(best['rows'])} != {exp}"
    rep = {"Page": page, "Extracted": len(best["rows"]), "Expected": exp, "Error": err, "Scale": best["scale"], "RowBands": best["bands"]}
    return best["rows"], rep

# -------------------------
# Run rescue on pages in needs_review.csv
# -------------------------
needs = pd.read_csv(NEEDS_REVIEW)
failed_pages = [int(x) for x in needs["Page"].tolist()]

all_df = pd.read_csv(OUTPUT_ALL)

print(f"[Rescue v2] Reprocessing failed pages: {failed_pages}")
print(f"[Rescue v2] target total rows = {expected_total()}")

t0 = time.time()
reports = []

for i, pg in enumerate(failed_pages, 1):
    path = JPEGS_DIR / f"{pg}.jpg"
    rows, rep = extract_page(pg, path)

    # patch page rows into output_all_rows
    all_df = all_df[all_df["Page"] != pg].copy()
    tname = type_for_page(pg)

    patch = pd.DataFrame([{
        "Page": pg,
        "Type": tname,
        "Accession": a,
        "Code": c,
        "Catalog": k,
    } for (a, c, k) in rows])

    all_df = pd.concat([all_df, patch], ignore_index=True)

    reports.append(rep)
    dt = time.time() - t0
    print(f"  page {pg} ({i}/{len(failed_pages)}): {rep['Extracted']}/{rep['Expected']}  bands={rep['RowBands']}  scale={rep['Scale']}  err={rep['Error']}  elapsed={dt:.1f}s")

# write reports
rescue_report = pd.DataFrame(reports)
rescue_report.to_csv(OUT_DIR / "rescue_report_v2.csv", index=False)

# rebuild page counts + needs review
counts = []
for pg in range(1, 103):
    got = int((all_df["Page"] == pg).sum())
    exp = EXPECTED[pg]
    err = "" if got == exp else f"COUNT_MISMATCH {got} != {exp}"
    counts.append({"Page": pg, "RowsExtracted": got, "RowsExpected": exp, "Error": err})

page_counts_df = pd.DataFrame(counts)
page_counts_df.to_csv(PAGE_COUNTS, index=False)
needs2 = page_counts_df[page_counts_df["Error"] != ""].copy()
needs2.to_csv(OUT_DIR / "needs_review.csv", index=False)

# write patched output_all
all_df.to_csv(OUTPUT_ALL, index=False)

print("\n[Rescue v2 Done]")
print("  output_all_rows.csv rows =", len(all_df), f"(target {expected_total()})")
print("  pages_flagged =", len(needs2))
print("  wrote out/rescue_report_v2.csv")

[Rescue v2] Reprocessing failed pages: [20, 27, 53, 61, 62]
[Rescue v2] target total rows = 16473
  page 20 (1/5): 152/162  bands=54  scale=2.6  err=COUNT_MISMATCH 152 != 162  elapsed=76.9s
  page 27 (2/5): 143/162  bands=54  scale=3.2  err=COUNT_MISMATCH 143 != 162  elapsed=153.7s
  page 53 (3/5): 156/162  bands=54  scale=2.6  err=COUNT_MISMATCH 156 != 162  elapsed=231.8s
  page 61 (4/5): 71/162  bands=54  scale=2.6  err=COUNT_MISMATCH 71 != 162  elapsed=312.2s
  page 62 (5/5): 78/162  bands=54  scale=2.8  err=COUNT_MISMATCH 78 != 162  elapsed=392.3s

[Rescue v2 Done]
  output_all_rows.csv rows = 16263 (target 16473)
  pages_flagged = 5
  wrote out/rescue_report_v2.csv
