In [11]:
##RESCAN 20,27,53,61,62
import os
import re
import time
import math
import shutil
from pathlib import Path
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from PIL import Image, ImageOps, ImageFilter
import pytesseract
from pytesseract import Output

# =========================
# CONFIG
# =========================
# Input folder holding the page scans, and output folders for the CSVs and
# per-page OCR debug text. Paths are relative to the current working directory.
JPEGS_DIR = Path(r".\JPEGS")
OUT_DIR = Path(r".\out")
OCR_TEXT_DIR = OUT_DIR / "ocr_text"

# WORKERS: thread-pool size used by run_all.
# PROGRESS_EVERY: print a progress line every N completed pages.
WORKERS = 6
PROGRESS_EVERY = 10

# If you installed tesseract but it's not on PATH, set it explicitly:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Create the output folders up front (no error if they already exist).
OUT_DIR.mkdir(parents=True, exist_ok=True)
OCR_TEXT_DIR.mkdir(parents=True, exist_ok=True)

# Regex fragments for the two numeric fields of a row:
# ACC_RE matches the accession number (1-6 digits),
# CAT_RE matches the catalog number (1-7 digits).
ACC_RE = r"\d{1,6}"
CAT_RE = r"\d{1,7}"

# =========================
# FILE LISTING (NO ZIP)
# =========================
def page_num_from_path(p: Path) -> int:
    """
    Return the page number encoded in a file name.

    The page number is the first run of digits found in the file stem
    (the name without its extension). Raises ValueError when the stem
    contains no digits at all.
    """
    digit_runs = re.findall(r"(\d+)", p.stem)
    if not digit_runs:
        raise ValueError(f"Can't parse page number from: {p.name}")
    return int(digit_runs[0])

def list_jpgs_unique_by_page(folder: Path, expected_pages: int = 102):
    """
    List the .jpg scans in `folder` and group them by page number.

    Args:
        folder: Directory that is globbed (non-recursively) for *.jpg files,
            matched case-insensitively on the extension.
        expected_pages: Number of pages the folder must contain. The page
            numbers found must be exactly 1..expected_pages.

    Returns:
        A list of (page, primary_path, alternates) tuples ordered by page.
        When several files map to the same page number, the largest file
        (by size on disk) is the primary and the rest are the alternates,
        largest first.

    Raises:
        ValueError: If a file name contains no digits (from page_num_from_path).
        RuntimeError: If the page numbers found are not exactly 1..expected_pages.
    """
    # One glob that matches .jpg/.JPG/etc without double-counting on Windows
    files = list(folder.glob("*.[jJ][pP][gG]"))

    # Normalize + de-dupe physical paths (defensive)
    unique_files = []
    seen = set()
    for f in files:
        key = os.path.normcase(str(f.resolve()))
        if key not in seen:
            seen.add(key)
            unique_files.append(f)

    # Group by page number
    by_page = {}
    for f in unique_files:
        by_page.setdefault(page_num_from_path(f), []).append(f)

    # Keep the largest file per page first (best scan), but retain alternates
    for pg in by_page:
        by_page[pg] = sorted(by_page[pg], key=lambda x: x.stat().st_size, reverse=True)
    collisions = {pg: lst for pg, lst in by_page.items() if len(lst) > 1}

    # The sorted, unique page numbers must be exactly 1..expected_pages.
    pages = sorted(by_page)
    if pages != list(range(1, expected_pages + 1)):
        raise RuntimeError(
            f"Expected pages 1..{expected_pages}. Found {len(pages)} pages: {pages[:5]} ... {pages[-5:]}"
        )

    if collisions:
        sample = ", ".join(f"{k}:{len(v)}" for k, v in list(collisions.items())[:12])
        print("[Info] Duplicate files for some pages (multiple files map to same page number). Will auto-try alternates if needed.")
        print(f"       Examples: {sample} ...")
    else:
        print(f"[Info] Found exactly {expected_pages} pages with 1 image each (no duplicates). ✅")

    # Return primary path + alternates
    return [(pg, by_page[pg][0], by_page[pg][1:]) for pg in pages]

# =========================
# IMAGE PREPROCESSING
# =========================
def otsu_threshold(arr_uint8: np.ndarray) -> int:
    """
    Pick a global binarization threshold with Otsu's method.

    Scans every grey level 0..255 and returns the level that maximizes the
    between-class variance of the "<= level" and "> level" pixel groups.
    Returns the fallback value 200 when no split is possible (for example
    when every pixel has the same value).
    """
    counts = np.bincount(arr_uint8.ravel(), minlength=256).astype(np.float64)
    n_pixels = arr_uint8.size
    weighted_total = np.dot(np.arange(256), counts)

    best_level = 200
    best_score = -1.0
    bg_weight = 0.0
    bg_sum = 0.0

    for level in range(256):
        bin_count = counts[level]
        bg_weight += bin_count
        if bg_weight == 0:
            # No pixels at or below this level yet.
            continue
        fg_weight = n_pixels - bg_weight
        if fg_weight == 0:
            # Every pixel is already in the background class; nothing left to split.
            break
        bg_sum += level * bin_count
        bg_mean = bg_sum / bg_weight
        fg_mean = (weighted_total - bg_sum) / fg_weight
        score = bg_weight * fg_weight * (bg_mean - fg_mean) ** 2
        if score > best_score:
            best_score, best_level = score, level
    return int(best_level)

def preprocess_bw(img: Image.Image, scale=2.2, use_otsu=True, fixed_thresh=205) -> Image.Image:
    """
    Turn a page scan into a black/white image for OCR.

    Steps: greyscale -> autocontrast -> optional bicubic upscale by `scale`
    -> 3x3 median filter -> threshold. The threshold comes from
    otsu_threshold when `use_otsu` is true, otherwise `fixed_thresh` is used.
    Pixels strictly above the threshold become 255, all others 0.
    """
    gray = ImageOps.autocontrast(img.convert("L"))
    if scale != 1.0:
        width, height = gray.size[0], gray.size[1]
        target_size = (int(width * scale), int(height * scale))
        gray = gray.resize(target_size, Image.Resampling.BICUBIC)
    gray = gray.filter(ImageFilter.MedianFilter(size=3))

    pixels = np.array(gray)
    if use_otsu:
        cutoff = otsu_threshold(pixels)
    else:
        cutoff = fixed_thresh
    binary = np.where(pixels > cutoff, 255, 0).astype(np.uint8)
    return Image.fromarray(binary)

# =========================
# OCR HELPERS
# =========================
def ocr_data(img_bw: Image.Image, psm: int) -> dict:
    """
    Run Tesseract on `img_bw` and return its per-token data as a dict.

    Recognition is restricted to digits plus the letters E, D and G
    (numeric-focused whitelist to reduce garbage); `psm` is passed through
    as Tesseract's page segmentation mode.
    """
    allowed_chars = "0123456789EDG"
    options = [
        "--oem 3",
        f"--psm {psm}",
        "-c preserve_interword_spaces=1",
        f"-c tessedit_char_whitelist={allowed_chars}",
    ]
    return pytesseract.image_to_data(img_bw, config=" ".join(options), output_type=Output.DICT)

def ocr_text_for_debug(img_bw: Image.Image, psm: int) -> str:
    """
    Return Tesseract's plain-text reading of `img_bw`.

    Uses the same options as ocr_data (digits plus E/D/G whitelist, given
    page segmentation mode) so the text matches what token extraction saw.
    """
    allowed_chars = "0123456789EDG"
    options = " ".join(
        (
            "--oem 3",
            f"--psm {psm}",
            "-c preserve_interword_spaces=1",
            f"-c tessedit_char_whitelist={allowed_chars}",
        )
    )
    return pytesseract.image_to_string(img_bw, config=options)

def ocr_find_headers(img: Image.Image) -> dict:
    """
    Lightweight header detection for pages where a section starts mid-page.

    OCRs the unscaled greyscale image (no character whitelist) and looks for
    words that, once reduced to their A-Z letters, are exactly PICKLES,
    SKELETONS or SKINS.

    Returns a dict with those three keys; each value is the list of vertical
    centers (top + height // 2, in the coordinates of `img`) of the matches.
    """
    gray = ImageOps.autocontrast(img.convert("L"))
    data = pytesseract.image_to_data(gray, config="--oem 3 --psm 6", output_type=Output.DICT)

    headers = {name: [] for name in ("PICKLES", "SKELETONS", "SKINS")}
    for word, top, height in zip(data["text"], data["top"], data["height"]):
        # Empty OCR cells reduce to "", which is never a header key.
        letters = re.sub(r"[^A-Z]", "", word.upper()) if word else ""
        if letters in headers:
            # use center y
            headers[letters].append(int(top) + int(height) // 2)
    return headers

def clean_token(t: str) -> str:
    """
    Normalize an OCR token: trim, upper-case, and keep only A-Z and 0-9.

    A falsy input (None or "") yields "".
    """
    upper = (t or "").strip().upper()
    return "".join(ch for ch in upper if "A" <= ch <= "Z" or "0" <= ch <= "9")

def split_subtokens(text: str):
    """
    Split a glued OCR token into its accession / code / catalog parts.

    Patterns are tried in this order, each against the whole token:
      254E1396 -> ["254", "E", "1396"]
      254E     -> ["254", "E"]
      E1396    -> ["E", "1396"]
    Anything else is returned unchanged as a one-element list; an empty
    token yields an empty list.
    """
    if not text:
        return []
    patterns = (
        rf"({ACC_RE})([EDG])({CAT_RE})",
        rf"({ACC_RE})([EDG])",
        rf"([EDG])({CAT_RE})",
    )
    for pattern in patterns:
        hit = re.fullmatch(pattern, text)
        if hit:
            return list(hit.groups())
    return [text]

def cluster_rows(tokens, row_tol):
    """
    Group tokens into text rows by their "y" value.

    Tokens are visited in ascending y order. A token joins the current row
    when its y is within `row_tol` of that row's running mean y; otherwise
    it starts a new row. Returns a list of rows (each a list of tokens).
    """
    rows = []
    mean_y = None  # running mean y of the row currently being built (rows[-1])
    for tok in sorted(tokens, key=lambda d: d["y"]):
        y = tok["y"]
        if rows and abs(y - mean_y) <= row_tol:
            rows[-1].append(tok)
            count = len(rows[-1])
            mean_y = (mean_y * (count - 1) + y) / count
        else:
            rows.append([tok])
            mean_y = y
    return rows

def kmeans_1d(xs, k=3, iters=30):
    """
    One-dimensional k-means; returns the k cluster centers in ascending order.

    Centers start at evenly spaced interior percentiles of `xs`. Each
    iteration assigns every point to its nearest center and moves each center
    to the mean of its points (a center with no points stays where it is).
    Stops after `iters` iterations or as soon as the centers stop moving.
    """
    points = np.array(xs, dtype=float)
    quantiles = np.linspace(0, 100, k + 2)[1:-1]
    centers = np.percentile(points, quantiles)
    for _ in range(iters):
        nearest = np.abs(points[:, None] - centers[None, :]).argmin(axis=1)
        updated = np.array(
            [
                points[nearest == j].mean() if np.any(nearest == j) else centers[j]
                for j in range(k)
            ]
        )
        if np.allclose(updated, centers):
            break
        centers = updated
    return np.sort(centers)

def extract_triplets(img_bw: Image.Image, psm=6, conf_floor=-1.0, kcols=3):
    """
    Extract (accession, code, catalog) rows from a binarized page image.

    Key idea:
    - Use image_to_data (gives x,y per token)
    - Cluster into rows (by y)
    - Cluster x positions into `kcols` column centers (kmeans)
    - Parse within each column stream
    - Infer missing code (E/D/G) when page strongly favors one code
    - Repair accession values that look like modal accession + 1 extra digit

    Args:
        img_bw: Preprocessed page image handed to ocr_data.
        psm: Tesseract page segmentation mode.
        conf_floor: Tokens whose OCR confidence is below this are dropped.
            Unparseable confidences count as -1.0.
        kcols: Number of column centers to fit on token x positions.

    Returns:
        A de-duplicated list of (accession: int, code: str, catalog: int)
        tuples sorted by accession, code, catalog. Empty when OCR yields no
        usable tokens or no strict triplet is found.
    """
    # Only the height is needed (row tolerance scales with it).
    _, h = img_bw.size
    data = ocr_data(img_bw, psm=psm)

    tokens = []
    xs = []
    for raw, left, top, width, height, conf in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"], data["conf"]
    ):
        if not raw or raw.strip() == "":
            continue
        try:
            conf = float(conf)
        except (TypeError, ValueError):
            # Confidence was not numeric; treat it as "unknown".
            conf = -1.0
        if conf < conf_floor:
            continue

        t = clean_token(raw)
        if not t:
            continue

        # Token position = center of its bounding box.
        x = int(left) + int(width) / 2.0
        y = int(top) + int(height) / 2.0
        tokens.append({"text": t, "x": x, "y": y, "conf": conf})
        xs.append(x)

    if not tokens:
        return []

    centers = kmeans_1d(xs, k=kcols)
    row_tol = max(12.0, h * 0.007)
    rows = cluster_rows(tokens, row_tol)

    parsed = []
    row_col_streams = []  # keep for second-pass inference
    for row in rows:
        subs = []
        for tok in row:
            # Assign the whole token to its nearest column center, then split
            # glued tokens; sub-tokens inherit the parent's position.
            col = int(np.argmin(np.abs(centers - tok["x"])))
            for s in split_subtokens(tok["text"]):
                subs.append({"text": s, "x": tok["x"], "y": tok["y"], "conf": tok["conf"], "col": col})

        col_streams = []
        for col in range(kcols):
            cs = sorted([s for s in subs if s["col"] == col], key=lambda d: d["x"])
            col_streams.append(cs)

        row_col_streams.append(col_streams)

        # first pass: strict triplets only
        for cs in col_streams:
            toks = [t["text"] for t in cs]
            j = 0
            while j <= len(toks) - 3:
                a, c, b = toks[j], toks[j+1], toks[j+2]
                if re.fullmatch(ACC_RE, a) and c in ("E", "D", "G") and re.fullmatch(CAT_RE, b):
                    parsed.append((int(a), c, int(b)))
                    j += 3
                else:
                    j += 1

    if not parsed:
        return []

    # infer dominant code/accession
    codes = [t[1] for t in parsed]
    accs = [t[0] for t in parsed]
    mode_code, code_ct = Counter(codes).most_common(1)[0]
    mode_acc, acc_ct = Counter(accs).most_common(1)[0]
    code_share = code_ct / len(codes)
    acc_share = acc_ct / len(accs)

    out = set(parsed)

    # second pass: allow "ACC CATALOG" (missing code) within each column stream
    if code_share >= 0.85:
        for col_streams in row_col_streams:
            for cs in col_streams:
                toks = [t["text"] for t in cs]
                for j in range(len(toks) - 1):
                    a, b = toks[j], toks[j+1]
                    if re.fullmatch(ACC_RE, a) and re.fullmatch(CAT_RE, b):
                        out.add((int(a), mode_code, int(b)))

    # repair accession like 2542 -> 254 when 254 dominates the page
    if acc_share >= 0.50:
        mstr = str(mode_acc)
        fixed = set()
        for (a, c, b) in out:
            astr = str(a)
            if a != mode_acc and astr.startswith(mstr) and len(astr) == len(mstr) + 1:
                cand = (mode_acc, c, b)
                # Only change if it fills a missing (prevents breaking real accessions)
                if cand not in out:
                    fixed.add(cand)
                else:
                    fixed.add((a, c, b))
            else:
                fixed.add((a, c, b))
        out = fixed

    return sorted(out, key=lambda t: (t[0], t[1], t[2]))

# =========================
# TYPE / EXPECTED COUNTS
# =========================
def base_type_for_page(page: int) -> str:
    """
    Return the specimen type a page starts with (before any mid-page header).

    Pages up to 99 are SKINS, page 100 is PICKLES, everything after is SKELETONS.
    """
    if page <= 99:
        return "SKINS"
    return "PICKLES" if page == 100 else "SKELETONS"

def expected_rows_for_page(page: int, headers_found: dict) -> int:
    """
    Return how many data rows a page is expected to contain.

    Pages 1 and 102 have fixed known counts (141 and 136). Every other page
    holds 162 rows, minus 2 when a PICKLES or SKELETONS section header was
    detected on it (a mid-page header tends to consume ~2 rows worth of
    data slots in these docs).
    """
    # Known special pages
    special_counts = {1: 141, 102: 136}
    if page in special_counts:
        return special_counts[page]

    full_page_rows = 162
    header_cost = 2
    has_section_header = any(
        len(headers_found.get(name, [])) > 0 for name in ("PICKLES", "SKELETONS")
    )
    if has_section_header:
        return full_page_rows - header_cost
    return full_page_rows

def type_for_triplet_y(page: int, y_center: float, headers_found: dict) -> str:
    """
    Determine the specimen type of a single row on the pivot pages.

    Page 99 switches from SKINS to PICKLES below the topmost PICKLES header;
    page 100 switches from PICKLES to SKELETONS below the topmost SKELETONS
    header. A row counts as "below" when its y_center is strictly greater
    than the header's y. On any other page, or when the relevant header was
    not found, the page's base type is returned.
    """
    default_type = base_type_for_page(page)

    # page -> (header that triggers the switch, type used above that header)
    pivots = {
        99: ("PICKLES", "SKINS"),
        100: ("SKELETONS", "PICKLES"),
    }
    if page in pivots:
        header_name, type_above = pivots[page]
        header_ys = headers_found[header_name]
        if header_ys:
            return header_name if y_center > min(header_ys) else type_above

    return default_type

# =========================
# PAGE PROCESSING
# =========================
def process_one(task):
    """
    OCR one page, retrying with different preprocessing until the row count matches.

    Args:
        task: (page, primary_path, alternates) as produced by
            list_jpgs_unique_by_page.

    Returns:
        A dict with keys "page", "rows" (list of triplets), "rows_extracted",
        "rows_expected", "error" ("" on success, otherwise a COUNT_MISMATCH
        message), "attempts_used" (number of OCR attempts actually run),
        "image_file", "debug_file" and "headers".

    Every attempt also rewrites OCR_TEXT_DIR/page_NNN.txt with that attempt's
    plain-text OCR, so the file always reflects the LAST attempt run, which
    is not necessarily the best one on a mismatch.
    """
    page, primary_path, alternates = task

    # Header detection runs on the untouched primary scan; close the file afterwards.
    with Image.open(primary_path) as img:
        headers = ocr_find_headers(img)
    expected = expected_rows_for_page(page, headers)

    attempts = [
        # (scale, use_otsu, fixed_thresh, psm)
        (2.2, True, 205, 6),
        (2.6, True, 205, 6),
        (2.2, False, 200, 6),
        (2.6, False, 200, 6),
        (2.2, True, 205, 4),
        (2.6, True, 205, 4),
    ]

    tried_paths = [primary_path] + list(alternates)
    debug_path = OCR_TEXT_DIR / f"page_{page:03d}.txt"
    best = None
    attempts_used = 0

    for path_try in tried_paths:
        # Context manager releases the file handle even on early return.
        with Image.open(path_try) as img_try:
            for (scale, use_otsu, thr, psm) in attempts:
                attempts_used += 1
                bw = preprocess_bw(img_try, scale=scale, use_otsu=use_otsu, fixed_thresh=thr)
                triplets = extract_triplets(bw, psm=psm, conf_floor=-1.0, kcols=3)

                # Debug text dump (last attempt overwrites; good enough for tracing)
                debug_text = ocr_text_for_debug(bw, psm=psm)
                debug_path.write_text(debug_text, encoding="utf-8", errors="ignore")

                if best is None or len(triplets) > len(best["triplets"]):
                    best = {"triplets": triplets, "path": path_try, "debug": debug_path, "psm": psm, "scale": scale}

                if len(triplets) == expected:
                    return {
                        "page": page,
                        "rows": triplets,
                        "rows_extracted": len(triplets),
                        "rows_expected": expected,
                        "error": "",
                        "attempts_used": attempts_used,
                        "image_file": str(path_try),
                        "debug_file": str(debug_path),
                        "headers": headers,
                    }

    # If we got here, it's a mismatch; return the best we saw
    best_rows = best["triplets"] if best else []
    return {
        "page": page,
        "rows": best_rows,
        "rows_extracted": len(best_rows),
        "rows_expected": expected,
        "error": f"COUNT_MISMATCH {len(best_rows)} != {expected}",
        "attempts_used": attempts_used,
        "image_file": str(best["path"]) if best else str(primary_path),
        "debug_file": str(best["debug"]) if best else "",
        "headers": headers,
    }

# =========================
# RUN ALL + OUTPUTS  (REVISED OUTPUT/CSVS ONLY)
# =========================
def run_all():
    """
    OCR every page in JPEGS_DIR in parallel and write the CSV/summary outputs.

    Pages are processed by process_one on a thread pool of WORKERS threads,
    with a progress line every PROGRESS_EVERY pages. An exception raised by
    any page propagates and aborts the run.

    Files written to OUT_DIR:
        output.csv                 all extracted rows, with a PageComplete flag
        output_complete_pages.csv  rows from pages whose count matched
        page_counts.csv            one record per page
        needs_review.csv           the page records that reported an error
        run_summary.txt            totals versus the target and rule-based counts

    Returns:
        (page_counts_df, needs_review_df, complete_df, all_df)

    NOTE(review): every row's "Type" is the page's base type from
    base_type_for_page; mid-page section pivots are not applied here because
    the extracted triplets carry no y position.
    """
    TARGET_TOTAL_ROWS = 16473  # your stated target end result

    # Explicit column lists: a DataFrame built from an empty list has no
    # columns, so without these an empty result (e.g. no flagged pages) would
    # be written as a CSV with no header row.
    page_columns = [
        "Page", "RowsExtracted", "RowsExpected", "PctComplete", "Complete",
        "Error", "AttemptsUsed", "ImageFile", "OcrTextFile",
    ]
    row_columns = ["Page", "PageComplete", "Type", "Accession", "Code", "Catalog"]

    tasks = list_jpgs_unique_by_page(JPEGS_DIR)

    print(f"[Start] Processing {len(tasks)} pages from {JPEGS_DIR} with {WORKERS} workers")
    t0 = time.time()

    results = []
    done = 0
    with ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futures = [ex.submit(process_one, t) for t in tasks]
        for fut in as_completed(futures):
            res = fut.result()
            results.append(res)
            done += 1

            if done % PROGRESS_EVERY == 0 or done == len(tasks):
                # max(1e-9, ...) guards the divisions against a zero elapsed time / rate.
                rate = done / max(1e-9, (time.time() - t0))
                remaining = len(tasks) - done
                eta = remaining / max(1e-9, rate)
                print(f"  completed {done}/{len(tasks)} | {rate:.2f} pages/sec | ETA ~ {eta/60:.1f} min")

    # Futures complete in arbitrary order; restore page order for the outputs.
    results_sorted = sorted(results, key=lambda r: r["page"])

    page_counts = []
    needs_review = []

    all_rows = []
    complete_rows = []

    for r in results_sorted:
        page = r["page"]
        extracted = int(r["rows_extracted"])
        expected = int(r["rows_expected"])
        err = (r["error"] or "").strip()
        is_complete = (err == "")

        pct = (extracted / expected * 100.0) if expected else 0.0

        rec = {
            "Page": page,
            "RowsExtracted": extracted,
            "RowsExpected": expected,
            "PctComplete": round(pct, 2),
            "Complete": bool(is_complete),
            "Error": err,
            "AttemptsUsed": int(r["attempts_used"]),
            "ImageFile": r["image_file"],
            "OcrTextFile": r["debug_file"],
        }
        page_counts.append(rec)

        if err:
            needs_review.append(rec)

        base_type = base_type_for_page(page)

        # Include ALL rows, but tag whether page was complete
        for (acc, code, cat) in r["rows"]:
            row = {
                "Page": page,
                "PageComplete": bool(is_complete),
                "Type": base_type,
                "Accession": acc,
                "Code": code,
                "Catalog": cat,
            }
            all_rows.append(row)
            if is_complete:
                complete_rows.append(row)

    page_counts_df = pd.DataFrame(page_counts, columns=page_columns)
    needs_review_df = pd.DataFrame(needs_review, columns=page_columns)
    all_df = pd.DataFrame(all_rows, columns=row_columns)
    complete_df = pd.DataFrame(complete_rows, columns=row_columns)

    # ---- Write outputs ----
    out_csv = OUT_DIR / "output.csv"  # NOW: includes incomplete pages + PageComplete flag
    out_complete_csv = OUT_DIR / "output_complete_pages.csv"
    out_counts_csv = OUT_DIR / "page_counts.csv"
    out_review_csv = OUT_DIR / "needs_review.csv"
    out_summary_txt = OUT_DIR / "run_summary.txt"

    all_df.to_csv(out_csv, index=False)
    complete_df.to_csv(out_complete_csv, index=False)
    page_counts_df.to_csv(out_counts_csv, index=False)
    needs_review_df.to_csv(out_review_csv, index=False)

    # ---- Summary (target vs extracted, % with 2 decimals) ----
    extracted_total = int(len(all_df))
    missing_total = int(TARGET_TOTAL_ROWS - extracted_total)
    pct_total = (extracted_total / TARGET_TOTAL_ROWS * 100.0) if TARGET_TOTAL_ROWS else 0.0

    # Also show rule-based expected sum, in case it's useful for sanity checking
    expected_total_rule = int(page_counts_df["RowsExpected"].sum()) if not page_counts_df.empty else 0
    missing_vs_rule = int(expected_total_rule - extracted_total)
    pct_vs_rule = (extracted_total / expected_total_rule * 100.0) if expected_total_rule else 0.0

    complete_pages = int(page_counts_df["Complete"].sum()) if not page_counts_df.empty else 0
    total_pages = int(len(page_counts_df))

    summary_lines = [
        "=== OCR RUN SUMMARY ===",
        "",
        f"Pages processed:            {total_pages}",
        f"Pages complete:             {complete_pages}/{total_pages} ({(complete_pages/total_pages*100.0 if total_pages else 0.0):.2f}%)",
        f"Pages flagged:              {total_pages - complete_pages}",
        "",
        "=== ROW COUNTS ===",
        f"Target rows (your target):  {TARGET_TOTAL_ROWS}",
        f"Extracted rows (actual):    {extracted_total}",
        f"Missing vs your target:     {missing_total}",
        f"Completion vs your target:  {pct_total:.2f}%",
        "",
        f"Rule-based expected total:  {expected_total_rule}",
        f"Missing vs rule-based:      {missing_vs_rule}",
        f"Completion vs rule-based:   {pct_vs_rule:.2f}%",
        "",
        "=== OUTPUT FILES ===",
        f"All rows (includes failed): {out_csv}",
        f"Complete pages only:        {out_complete_csv}",
        f"Page counts:                {out_counts_csv}",
        f"Needs review:               {out_review_csv}",
        f"Debug dumps:                {OCR_TEXT_DIR}\\page_###.txt",
    ]
    out_summary_txt.write_text("\n".join(summary_lines), encoding="utf-8")

    print("\n[Done] Outputs:")
    print(f"  - {out_csv}                 rows={len(all_df)} (includes incomplete pages; see PageComplete)")
    print(f"  - {out_complete_csv}        rows={len(complete_df)} (only pages that passed checks)")
    print(f"  - {out_counts_csv}")
    print(f"  - {out_review_csv}          pages_flagged={len(needs_review_df)}")
    print(f"  - {out_summary_txt}")
    print(f"\n[Summary] Target={TARGET_TOTAL_ROWS} | Extracted={extracted_total} | Missing={missing_total} | {pct_total:.2f}%")

    return page_counts_df, needs_review_df, complete_df, all_df

# Run the whole pipeline when this cell executes and keep the four resulting
# DataFrames in the notebook namespace for inspection.
page_counts_df, needs_review_df, complete_df, all_df = run_all()

[Info] Found exactly 102 pages with 1 image each (no duplicates). ✅
[Start] Processing 102 pages from JPEGS with 6 workers
  completed 10/102 | 0.46 pages/sec | ETA ~ 3.3 min
  completed 20/102 | 0.48 pages/sec | ETA ~ 2.8 min
  completed 30/102 | 0.51 pages/sec | ETA ~ 2.3 min
  completed 40/102 | 0.52 pages/sec | ETA ~ 2.0 min
  completed 50/102 | 0.53 pages/sec | ETA ~ 1.6 min
  completed 60/102 | 0.54 pages/sec | ETA ~ 1.3 min
  completed 70/102 | 0.57 pages/sec | ETA ~ 0.9 min
  completed 80/102 | 0.57 pages/sec | ETA ~ 0.6 min
  completed 90/102 | 0.56 pages/sec | ETA ~ 0.4 min
  completed 100/102 | 0.57 pages/sec | ETA ~ 0.1 min
  completed 102/102 | 0.57 pages/sec | ETA ~ 0.0 min

[Done] Outputs:
  - out\output.csv                 rows=16473 (includes incomplete pages; see PageComplete)
  - out\output_complete_pages.csv        rows=16473 (only pages that passed checks)
  - out\page_counts.csv
  - out\needs_review.csv          pages_flagged=0
  - out\run_summary.txt

[Summary] T