In [1]:
import subprocess, shutil
# Locate Poppler's `pdfimages`: prefer the copy found on PATH, otherwise fall
# back to a fixed Windows install location.
PDFIMAGES = shutil.which("pdfimages") or r"C:\Program Files\poppler-24.08.0\Library\bin\pdfimages.exe"
print("pdfimages:", PDFIMAGES)
try:
    _pdfimages_version = subprocess.run([PDFIMAGES, "-v"], capture_output=True, text=True)
except FileNotFoundError as exc:
    # The fallback path is only a guess for one machine; fail with a message
    # that says what to fix rather than a bare "file not found".
    raise FileNotFoundError(
        f"pdfimages executable not found at {PDFIMAGES!r}; "
        "install Poppler or put pdfimages on PATH"
    ) from exc
# The version banner is taken from stderr, not stdout.
print(_pdfimages_version.stderr)


pdfimages: C:\Users\Julian.Diaz\AppData\Local\anaconda3\envs\py311\Library\bin\pdfimages.EXE
pdfimages version 24.09.0
Copyright 2005-2024 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011, 2022 Glyph & Cog, LLC



In [2]:
# pip/conda: pymupdf
#   conda install -c conda-forge pymupdf   (or)   pip install pymupdf

import fitz  # PyMuPDF
import pathlib, csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- CONFIG ---
# PDF_DIR is searched recursively for input PDFs; OUT_DIR receives one
# sub-folder of PNGs per PDF plus the CSV index.
# NOTE(review): both are absolute, machine-specific paths — adjust before
# running this notebook anywhere else.
PDF_DIR      = pathlib.Path(r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\04_eXgen\B\__type___pdf")
OUT_DIR      = pathlib.Path(r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\04_eXgen\B\__type___pdf_images-ext")
# Render resolution; also recorded in the "dpi" column of the index.
DPI          = 300
# Number of worker threads used by the ThreadPoolExecutor below.
MAX_WORKERS  = 4

# Heuristics (tune as needed)
# A page is rendered when vector_ops >= VERY_HIGH_VECTOR_OPS, or when
# vector_ops >= MIN_VECTOR_OPS and text_chars <= TEXT_CHAR_MAX.
MIN_VECTOR_OPS       = 100    # select pages with >= this many vector ops
TEXT_CHAR_MAX        = 200    # de-prioritise "texty" pages (still select if vector is very high)
VERY_HIGH_VECTOR_OPS = 400    # always select if >= this (even if lots of text)

# ----------------
# Create the output root up front so worker threads only add sub-folders.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Scale factor relative to a 72-dpi baseline (assumed to be PyMuPDF's default
# render resolution — confirm against the PyMuPDF docs).
zoom = DPI / 72.0
# Same zoom on both axes; passed to page.get_pixmap() when rendering.
MAT  = fitz.Matrix(zoom, zoom)

def page_metrics(page: fitz.Page):
    """
    Measure how "vector-heavy" a single page is.

    Returns a dict with three integer counts:
      vector_ops  - number of primitive items, summed over every entry
                    returned by page.get_drawings()
      text_chars  - length of the page's plain-text extraction
      image_count - number of entries returned by page.get_images(full=True)
    """
    # Each drawing dict carries an 'items' list of primitives; count them all.
    primitive_total = 0
    for drawing in (page.get_drawings() or []):
        primitive_total += len(drawing.get("items", ()))

    plain_text = page.get_text("text") or ""
    images = page.get_images(full=True)

    return {
        "vector_ops": primitive_total,
        "text_chars": len(plain_text),
        "image_count": len(images),
    }

def should_render(metrics, min_vector_ops=None, text_char_max=None, very_high_vector_ops=None):
    """
    Decide whether a page should be rasterised, based on its metrics.

    Parameters
    ----------
    metrics : dict
        Must contain "vector_ops" and "text_chars" (the keys produced by
        page_metrics()).
    min_vector_ops, text_char_max, very_high_vector_ops : int, optional
        Per-call threshold overrides. When left as None the module-level
        MIN_VECTOR_OPS, TEXT_CHAR_MAX and VERY_HIGH_VECTOR_OPS are used, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    (bool, str)
        Whether to render, and a short human-readable reason that is stored
        in the index.
    """
    # Resolve thresholds at call time so later edits to the config constants
    # are picked up without redefining this function.
    if min_vector_ops is None:
        min_vector_ops = MIN_VECTOR_OPS
    if text_char_max is None:
        text_char_max = TEXT_CHAR_MAX
    if very_high_vector_ops is None:
        very_high_vector_ops = VERY_HIGH_VECTOR_OPS

    v = metrics["vector_ops"]
    t = metrics["text_chars"]
    # Always render if extremely vector-heavy, regardless of text volume.
    if v >= very_high_vector_ops:
        return True, f"vector_ops={v} (>= {very_high_vector_ops})"
    # Render if vector-heavy and not too texty.
    if v >= min_vector_ops and t <= text_char_max:
        return True, f"vector_ops={v} & text_chars={t}"
    # Otherwise skip.
    return False, f"skip (vector_ops={v}, text_chars={t})"

def render_selected_pages(pdf_path: pathlib.Path):
    """
    Render the vector-heavy pages of one PDF to PNG files.

    Every page is scored with page_metrics() and filtered with
    should_render(). Selected pages are rasterised with the module-level MAT
    matrix and saved as OUT_DIR/<pdf stem>/page-NNNN.png, where NNNN is the
    1-based page number.

    Parameters
    ----------
    pdf_path : pathlib.Path
        The PDF to process.

    Returns
    -------
    (status, rows)
        status : one-line summary starting with "OK", "SKIP" or "FAIL open".
        rows   : list of index dicts, one per page that was rendered or that
                 raised while being processed. Error rows have an empty
                 "image_path" and a "reason" starting with "ERROR:"; their
                 metric fields are empty strings when the failure happened
                 before the metrics were computed.
    """
    rows = []
    rendered = 0
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return f"FAIL open {pdf_path.name}: {e}", rows

    def _row(page_no, metrics, image_path, reason):
        # Single definition of the index schema, so success and error rows
        # always carry the same keys in the same order.
        return {
            "pdf_stem": pdf_path.stem,
            "pdf_path": str(pdf_path),
            "page": page_no,
            "dpi": DPI,
            "vector_ops": metrics["vector_ops"],
            "text_chars": metrics["text_chars"],
            "image_count": metrics["image_count"],
            "image_path": image_path,
            "reason": reason,
        }

    # Stand-in metrics for pages that fail before page_metrics() returns.
    unknown_metrics = {"vector_ops": "", "text_chars": "", "image_count": ""}

    try:
        # NOTE(review): the folder is keyed on the stem only, so two PDFs with
        # the same file name in different source folders share (and overwrite)
        # the same output files.
        out_sub = OUT_DIR / pdf_path.stem
        out_sub.mkdir(parents=True, exist_ok=True)

        for i in range(len(doc)):
            m = unknown_metrics
            try:
                # Loading and measuring sit inside the try so that one
                # damaged page is logged as an error row and does not abort
                # the whole document.
                page = doc.load_page(i)
                m = page_metrics(page)
                ok, reason = should_render(m)
                if not ok:
                    continue
                pix = page.get_pixmap(matrix=MAT)
                out_png = out_sub / f"page-{i+1:04d}.png"
                pix.save(out_png)
                rendered += 1
                rows.append(_row(i + 1, m, str(out_png), reason))
            except Exception as e:
                rows.append(_row(i + 1, m, "", f"ERROR: {e}"))
    finally:
        # Always release the document handle, even if a page loop step raised.
        doc.close()

    status = f"OK   {pdf_path.name}: {rendered} vector pages"
    if rendered == 0:
        status = f"SKIP {pdf_path.name}: no vector-heavy pages"
    return status, rows

# Collect every PDF under PDF_DIR (recursively); sorted so the submission
# order is stable between runs.
pdfs = sorted(PDF_DIR.rglob("*.pdf"))
print("Found PDFs:", len(pdfs))

all_rows = []
# NOTE(review): the PyMuPDF documentation states that the library does not
# support use from multiple threads. Each worker here opens its own document,
# but if crashes or corrupt renders appear, set MAX_WORKERS = 1.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(render_selected_pages, pdf): pdf for pdf in pdfs}
    for fut in as_completed(futures):
        try:
            status, rows = fut.result()
        except Exception as e:
            # One failing PDF must not discard the results of all the others.
            status, rows = f"FAIL {futures[fut].name}: {e}", []
        print(status)
        all_rows.extend(rows)

# Futures complete in arbitrary order; sort so the index is reproducible.
all_rows.sort(key=lambda r: (r["pdf_path"], r["page"]))

# Write index
INDEX_CSV = OUT_DIR / "page_index_vector.csv"
if all_rows:
    with INDEX_CSV.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=list(all_rows[0].keys()))
        w.writeheader()
        w.writerows(all_rows)
    # Rows with an empty image_path are per-page errors, not rendered pages.
    rendered_count = sum(1 for r in all_rows if r["image_path"])
    error_count = len(all_rows) - rendered_count
    error_note = f" ({error_count} page errors also logged)" if error_count else ""
    print(f"\nIndexed {rendered_count} rendered pages{error_note} → {INDEX_CSV}")
else:
    # Lower vector thresholds or a higher text allowance select more pages.
    print("\nNo pages rendered. Consider lowering MIN_VECTOR_OPS / VERY_HIGH_VECTOR_OPS or raising TEXT_CHAR_MAX.")


Found PDFs: 242
SKIP APPENDIX_1_-_ALTERATION_LEGEND.pdf: no vector-heavy pages
SKIP 2004_ANNUAL_REPORT.pdf: no vector-heavy pages
OK   ANNUAL_REPORT_ATTACHMENT.pdf: 1 vector pages
SKIP APPENDIX_1_-_LITHOLOGY_LEGEND.pdf: no vector-heavy pages
SKIP APPENDIX_1_-_REPROCESSED_REGIONAL_MAGNETIC_IMAGERY__1.pdf: no vector-heavy pages
SKIP APPENDIX_1_-_REPROCESSED_REGIONAL_MAGNETIC_IMAGERY.pdf: no vector-heavy pages
SKIP APPENDIX_1_-_REPROCESSED_REGIONAL_MAGNETIC_IMAGERY__2.pdf: no vector-heavy pages
OK   APPENDIX_1_-_PHASE_ONE_GRAVITY_SURVEY_REPORT.pdf: 1 vector pages
SKIP APPENDIX_1_-_REPROCESSED_REGIONAL_MAGNETIC_IMAGERY__3.pdf: no vector-heavy pages
OK   APPENDIX_2_-_ANALYSIS_SPECIFICATIONS.pdf: 1 vector pages
SKIP APPENDIX_2_-_PHASE_TWO_GRAVITY_SURVEY_REPORT.pdf: no vector-heavy pages
SKIP APPENDIX_2_-_REPROCESSED_GRAVITY_IMAGERY.pdf: no vector-heavy pages
OK   APPENDIX_2_-_GEOPHYSICS_-_EM.pdf: 4 vector pages
SKIP APPENDIX_2_-_REPROCESSED_REGIONAL_GRAVITY_IMAGERY.pdf: no vector-heavy pages

In [4]:
import pathlib, shutil

# --- CONFIG ---
SRC_DIR = pathlib.Path(r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\04_eXgen\B\__type___pdf_images-ext")  # parent folder with many subfolders
DST_DIR = pathlib.Path(r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\04_eXgen\B\__type___pdf_images-ext\Summary")            # single destination folder
# --------------

# Flatten <SRC_DIR>/<subfolder>/<name>.png into <DST_DIR>/<subfolder>_<name>.png.
# NOTE(review): files are moved, not copied, so the image_path values already
# written to page_index_vector.csv no longer point at the PNGs afterwards.
DST_DIR.mkdir(parents=True, exist_ok=True)

count = 0
# Snapshot both listings before moving anything, so the directory contents
# are not changing underneath the iteration.
for sub in sorted(SRC_DIR.iterdir()):
    if not sub.is_dir():
        continue
    # DST_DIR lives inside SRC_DIR; without this guard a re-run would treat it
    # as a source folder and rename every collected file to "Summary_<name>".
    if sub.samefile(DST_DIR):
        continue
    stem = sub.name
    for img in sorted(sub.glob("*.png")):
        new_name = f"{stem}_{img.name}"
        dst = DST_DIR / new_name
        shutil.move(str(img), dst)   # move instead of copy
        count += 1

print(f"Moved {count} PNGs → {DST_DIR}")


Moved 1003 PNGs → C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\04_eXgen\B\__type___pdf_images-ext\Summary
