# Workflow to Organise Data Downloaded from GSQ
## Step 0: Select the source folder

In [1]:
from pathlib import Path
import os
import shutil
import zipfile

BASE_FOLDER = Path(
    r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2"
).expanduser().resolve()

TYPE_DIR_PREFIX = "__type__"
RAW_FOLDER_NAME = "__raw__"
NOTEBOOKLM_FOLDER_NAME = "___For_NotebookLM"
SHP_FOLDER_NAME = "__shp__"

RAW_FOLDER = BASE_FOLDER / RAW_FOLDER_NAME
NOTEBOOKLM_ROOT = BASE_FOLDER / NOTEBOOKLM_FOLDER_NAME
SHP_FOLDER = BASE_FOLDER / SHP_FOLDER_NAME

# Hard limit for all merged outputs
MAX_BATCH_MB = 190
MAX_BATCH_BYTES = MAX_BATCH_MB * 1024 * 1024

# Hard limit for merged text outputs (NotebookLM char limit)
MAX_BATCH_CHARS = 490_000



## Step 1: Extract ZIP Files

In [2]:
def run_step0_unzip_all(root: Path):
    """
    Step 0:
      - Walk the tree under 'root'
      - For every .zip file, extract it into <stem>__unzipped next to it
      - If that folder already exists and is non-empty, skip

    Run this BEFORE Step 2 on a new dataset.
    Safe to re-run: it will just skip already-extracted zips.
    """
    root = Path(root)
    if not root.exists() or not root.is_dir():
        print(f"Invalid root for unzip step: {root}")
        return

    total_zips = 0
    extracted = 0

    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        dirpath = Path(dirpath)

        # Do not recurse into our special folders
        dirnames[:] = [
            d for d in dirnames
            if not d.startswith(TYPE_DIR_PREFIX)
            and d not in {RAW_FOLDER_NAME, SHP_FOLDER_NAME, NOTEBOOKLM_FOLDER_NAME}
        ]

        for name in filenames:
            if not name.lower().endswith(".zip"):
                continue

            total_zips += 1
            zip_path = dirpath / name
            extract_dir = dirpath / f"{zip_path.stem}__unzipped"

            # If already extracted, skip
            if extract_dir.exists() and any(extract_dir.iterdir()):
                print(f"Skipping already-extracted: {zip_path} -> {extract_dir}")
                continue

            print(f"Extracting: {zip_path} -> {extract_dir}")
            extract_dir.mkdir(parents=True, exist_ok=True)
            try:
                with zipfile.ZipFile(zip_path, "r") as zf:
                    zf.extractall(extract_dir)
                extracted += 1
            except Exception as e:
                print(f"  ERROR extracting {zip_path}: {e}")

    print(f"Step 0 complete â€“ found {total_zips} zip(s), extracted {extracted} new folder(s).")


## Step 1.1: collect shapefile families into __shp__ (single folder)

In [3]:
# Common shapefile-related extensions we want to keep together
SHP_RELATED_EXTS = {
    ".shp", ".dbf", ".shx", ".prj", ".cpg", ".qpj",
    ".sbn", ".sbx", ".shp.xml", ".dbf.xml"
}

def run_step1_collect_shapefiles(root: Path):
    """
    Step 1:
      - Walk the tree under 'root'
      - For each .shp found (outside our special folders),
        move it and its companion files (.dbf/.shx/.prj/...) into BASE_FOLDER/__shp__

    Result: all shapefile sets live together in one folder, easy to drag into QGIS.
    Safe to re-run: files already in __shp__ are skipped.
    """
    root = Path(root)
    if not root.exists() or not root.is_dir():
        print(f"Invalid root for shapefile step: {root}")
        return

    SHP_FOLDER.mkdir(parents=True, exist_ok=True)
    moved_sets = 0

    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        dirpath = Path(dirpath)

        # Donâ€™t walk into our special folders
        dirnames[:] = [
            d for d in dirnames
            if not d.startswith(TYPE_DIR_PREFIX)
            and d not in {RAW_FOLDER_NAME, SHP_FOLDER_NAME, NOTEBOOKLM_FOLDER_NAME}
        ]

        # Look for .shp files in this directory
        shp_files = [f for f in filenames if f.lower().endswith(".shp")]
        if not shp_files:
            continue

        for shp_name in shp_files:
            stem = Path(shp_name).stem
            print(f"Collecting shapefile set: {stem} (from {dirpath})")
            moved_sets += 1

            for ext in SHP_RELATED_EXTS:
                candidate = dirpath / f"{stem}{ext}"
                if not candidate.exists():
                    continue

                dest = SHP_FOLDER / candidate.name

                # Avoid overwriting if the same-named file already exists in __shp__
                if dest.exists():
                    base_stem = candidate.stem
                    suffix = candidate.suffix
                    counter = 1
                    while True:
                        new_name = f"{base_stem}__{counter}{suffix}"
                        new_dest = SHP_FOLDER / new_name
                        if not new_dest.exists():
                            dest = new_dest
                            break
                        counter += 1

                shutil.move(str(candidate), str(dest))

    print(f"Step 1 complete â€“ collected {moved_sets} shapefile set(s) into {SHP_FOLDER}")


## Step 2: Separate the files into file-type by folder

### 2.1 Backup originals into "__raw__"

In [4]:
def scan_file_types(root: Path):
    """
    Walk through 'root' and collect:
      - ext_counts: {ext_key: count}
      - files_by_ext: {ext_key: [Path, Path, ...]}
    ext_key is like 'pdf', 'txt', or 'noext' for files with no extension.
    """
    ext_counts = {}
    files_by_ext = {}

    # Walk top-down so we can modify `dirnames` to skip our own type folders and __raw__
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        dirpath = Path(dirpath)

        # ðŸ‘‰ THIS is the line we were talking about
        # Skip __type__* folders and the __raw__ backup folder
                # Skip __type__* folders, the __raw__ backup folder, __shp__, and ___For_NotebookLM
        dirnames[:] = [
            d for d in dirnames
            if not d.startswith(TYPE_DIR_PREFIX)
            and d != RAW_FOLDER_NAME
            and d != SHP_FOLDER_NAME
            and d != NOTEBOOKLM_FOLDER_NAME
        ]


        for name in filenames:
            file_path = dirpath / name

            # Get extension (last part, e.g. ".pdf"), normalised to lowercase
            ext = file_path.suffix.lower()
            if ext:
                ext_key = ext.lstrip(".")  # ".pdf" -> "pdf"
            else:
                ext_key = "noext"

            ext_counts[ext_key] = ext_counts.get(ext_key, 0) + 1
            files_by_ext.setdefault(ext_key, []).append(file_path)

    return ext_counts, files_by_ext


def print_summary(root: Path, ext_counts: dict, files_by_ext: dict, max_examples: int = 5):
    """
    Print a human-friendly summary of what was found.
    """
    total_files = sum(ext_counts.values())
    print("\n=== Scan summary ===")
    print(f"Root folder: {root}")
    print(f"Total files found (excluding {TYPE_DIR_PREFIX} folders): {total_files}\n")

    if not ext_counts:
        print("No files found.")
        return

    print("File types detected (sorted by count):")
    for ext_key, count in sorted(ext_counts.items(), key=lambda x: (-x[1], x[0])):
        if ext_key == "noext":
            label = "(no extension)"
        else:
            label = f".{ext_key}"
        print(f"  {label:15} -> {count} file(s)")



def safe_move_file(src: Path, dest_folder: Path):
    """
    Move 'src' into 'dest_folder'. If a file with the same name
    already exists, append __1, __2, ... before the extension.
    """
    dest_folder.mkdir(parents=True, exist_ok=True)

    target = dest_folder / src.name
    if not target.exists():
        shutil.move(str(src), str(target))
        return

    stem = src.stem
    suffix = src.suffix
    counter = 1
    while True:
        new_name = f"{stem}__{counter}{suffix}"
        candidate = dest_folder / new_name
        if not candidate.exists():
            shutil.move(str(src), str(candidate))
            break
        counter += 1


def group_files_by_type(root: Path, files_by_ext: dict):
    """
    Move files into __type___<ext> folders inside 'root'.
    """
    for ext_key, file_list in files_by_ext.items():
        if ext_key == "noext":
            folder_name = f"{TYPE_DIR_PREFIX}_noext"
        else:
            folder_name = f"{TYPE_DIR_PREFIX}_{ext_key}"

        dest_dir = root / folder_name

        for src in file_list:
            safe_move_file(src, dest_dir)



In [5]:
import fitz  # PyMuPDF
import pathlib, csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- CONFIG (hooked to BASE_FOLDER) ---
PDF_DIR = BASE_FOLDER / "__type___pdf"
OUT_DIR = BASE_FOLDER / "__type___pdf_images-ext"
DPI = 300
MAX_WORKERS = 4

# Heuristics (tune as needed)
MIN_VECTOR_OPS       = 100   # select pages with >= this many vector ops
TEXT_CHAR_MAX        = 200   # de-prioritise "texty" pages (still select if vector is very high)
VERY_HIGH_VECTOR_OPS = 400   # always select if >= this (even if lots of text)

OUT_DIR.mkdir(parents=True, exist_ok=True)
zoom = DPI / 72.0
MAT  = fitz.Matrix(zoom, zoom)


def page_metrics(page: fitz.Page):
    """
    Return a dict with vector_ops, text_chars, image_count.
    vector_ops ~ total number of low-level drawing items on the page.
    """
    drawings = page.get_drawings() or []
    vector_ops = sum(len(d.get("items", ())) for d in drawings)
    text = page.get_text("text") or ""
    text_chars = len(text)
    image_count = len(page.get_images(full=True))
    return {"vector_ops": vector_ops, "text_chars": text_chars, "image_count": image_count}


def should_render(metrics):
    v = metrics["vector_ops"]
    t = metrics["text_chars"]
    # Always render if extremely vector-heavy
    if v >= VERY_HIGH_VECTOR_OPS:
        return True, f"vector_ops={v} (>= {VERY_HIGH_VECTOR_OPS})"
    # Render if vector-heavy and not too texty
    if v >= MIN_VECTOR_OPS and t <= TEXT_CHAR_MAX:
        return True, f"vector_ops={v} & text_chars={t}"
    # Otherwise skip
    return False, f"skip (vector_ops={v}, text_chars={t})"


def render_selected_pages(pdf_path: pathlib.Path):
    rows = []
    rendered = 0
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return f"FAIL open {pdf_path.name}: {e}", rows

    out_sub = OUT_DIR / pdf_path.stem
    out_sub.mkdir(parents=True, exist_ok=True)

    for i in range(len(doc)):
        page = doc.load_page(i)
        m = page_metrics(page)
        ok, reason = should_render(m)
        if not ok:
            continue
        try:
            pix = page.get_pixmap(matrix=MAT)
            out_png = out_sub / f"page-{i+1:04d}.png"
            pix.save(out_png)
            rendered += 1
            rows.append({
                "pdf_stem": pdf_path.stem,
                "pdf_path": str(pdf_path),
                "page": i + 1,
                "dpi": DPI,
                "vector_ops": m["vector_ops"],
                "text_chars": m["text_chars"],
                "image_count": m["image_count"],
                "image_path": str(out_png),
                "reason": reason,
            })
        except Exception as e:
            rows.append({
                "pdf_stem": pdf_path.stem,
                "pdf_path": str(pdf_path),
                "page": i + 1,
                "dpi": DPI,
                "vector_ops": m["vector_ops"],
                "text_chars": m["text_chars"],
                "image_count": m["image_count"],
                "image_path": "",
                "reason": f"ERROR: {e}",
            })

    status = f"OK   {pdf_path.name}: {rendered} vector pages"
    if rendered == 0:
        status = f"SKIP {pdf_path.name}: no vector-heavy pages"
    return status, rows


def extract_vector_images(pdf_dir: pathlib.Path = PDF_DIR):
    """
    Run vector-heavy page detection + rendering over all PDFs in pdf_dir.
    Writes PNGs into OUT_DIR/<pdf_stem>/ and an index CSV.
    """
    pdfs = list(pdf_dir.rglob("*.pdf"))
    print("Found PDFs:", len(pdfs))

    all_rows = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(render_selected_pages, pdf): pdf for pdf in pdfs}
        for fut in as_completed(futures):
            status, rows = fut.result()
            print(status)
            all_rows.extend(rows)

    # Write index
    index_csv = OUT_DIR / "page_index_vector.csv"
    if all_rows:
        with index_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=list(all_rows[0].keys()))
            w.writeheader()
            w.writerows(all_rows)
        print(f"\nIndexed {len(all_rows)} rendered pages â†’ {index_csv}")
    else:
        print("\nNo pages rendered. Consider lowering MIN_VECTOR_OPS or raising VERY_HIGH_VECTOR_OPS.")


In [6]:
def backup_originals_to_raw(root: Path, files_by_ext: dict):
    """
    Copy all files from 'root' into ROOT/__raw__/... preserving the
    original folder structure. Skips any __raw__ or __type__* folders
    (but __raw__ doesn't exist yet on first run anyway).
    """
    RAW_FOLDER.mkdir(parents=True, exist_ok=True)

    for file_list in files_by_ext.values():
        for src in file_list:
            rel = src.relative_to(root)
            dst = RAW_FOLDER / rel
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst)



### 2.2 Clean up now-empty original folders (outside __raw__ and __type__*)

In [7]:
def delete_empty_original_folders(root: Path):
    """
    Delete original folders under 'root' that are NOT:
      - __raw__
      - __type__* folders
      - __shp__
      - ___For_NotebookLM

    Assumes files have already been backed up to __raw__ and moved into __type__*.
    """
    root = Path(root)

    # 1) Remove non-special top-level folders wholesale
    for child in root.iterdir():
        if not child.is_dir():
            continue
        name = child.name

        # keep our special folders
        if (
            name == RAW_FOLDER_NAME
            or name == SHP_FOLDER_NAME
            or name == NOTEBOOKLM_FOLDER_NAME
            or name.startswith(TYPE_DIR_PREFIX)
        ):
            continue

        # Remove entire folder tree (original structure)
        shutil.rmtree(child, ignore_errors=True)

    # 2) Sweep for any remaining empty directories deeper down
    for dirpath, dirnames, filenames in os.walk(root, topdown=False):
        dirpath = Path(dirpath)
        name = dirpath.name

        # never delete the root or our special folders
        if dirpath == root:
            continue
        if (
            name == RAW_FOLDER_NAME
            or name == SHP_FOLDER_NAME
            or name == NOTEBOOKLM_FOLDER_NAME
            or name.startswith(TYPE_DIR_PREFIX)
        ):
            continue

        # if directory is empty now, remove it
        try:
            if not any(dirpath.iterdir()):
                dirpath.rmdir()
        except OSError:
            # not empty or cannot remove -> ignore
            pass



### 2.3 Orchestrator: no confirmation, just do it

In [8]:
def run_step2_separate_by_type(root: Path):
    """
    Step 2:
      - scan files
      - print summary
      - backup originals to __raw__
      - move files into __type___<ext> folders
      - delete original top-level folders (outside __raw__ and __type__*)
    """
    if not root.exists() or not root.is_dir():
        print(f"Invalid root: {root}")
        return {}, {}

    ext_counts, files_by_ext = scan_file_types(root)
    print_summary(root, ext_counts, files_by_ext)

    # If there are files, do the backup + grouping
    if ext_counts:
        backup_originals_to_raw(root, files_by_ext)
        group_files_by_type(root, files_by_ext)

    # ðŸ”¥ Always run cleanup, even if ext_counts is empty
    delete_empty_original_folders(root)

    return ext_counts, files_by_ext




In [9]:
ext_counts, files_by_ext = run_step2_separate_by_type(BASE_FOLDER)



=== Scan summary ===
Root folder: C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2
Total files found (excluding __type__ folders): 0

No files found.


## Step 3: Run OCR on TIF files and merge into single file 

### 3.1 OCR function (quiet, per-file)

In [10]:
from PIL import Image
import fitz  # PyMuPDF
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

TIF_FOLDER      = BASE_FOLDER / "__type___tif"
TIF_OCR_FOLDER  = BASE_FOLDER / "__type___tif__OCR___"
TIF_OCR_FOLDER.mkdir(parents=True, exist_ok=True)

def ocr_tif_to_text_single(tif_path: Path, out_txt: Path, lang="eng"):
    """
    Open a TIF that actually contains PDF bytes and run OCR on each page.
    Writes all text to out_txt. No per-page prints.
    Downscales very large page images so Tesseract doesn't choke.
    """
    with open(tif_path, "rb") as f:
        pdf_bytes = f.read()

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    all_text_parts = []

    # Base rendering DPI
    zoom = 300 / 72
    mat = fitz.Matrix(zoom, zoom)

    # Hard limit on number of pixels per page image
    MAX_PIXELS = 10_000_000  # ~10 MP

    for page_number in range(len(doc)):
        page = doc[page_number]
        pix = page.get_pixmap(matrix=mat)
        mode = "RGBA" if pix.alpha else "RGB"
        img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
        gray = img.convert("L")

        # ðŸ”§ Downscale extremely large images before OCR
        w, h = gray.size
        pixels = w * h
        if pixels > MAX_PIXELS:
            scale = (MAX_PIXELS / pixels) ** 0.5
            new_w = max(1, int(w * scale))
            new_h = max(1, int(h * scale))
            # Optional: you can print once per page if you want:
            # print(f"  Downscaling page {page_number+1} from {w}x{h} to {new_w}x{new_h}")
            gray = gray.resize((new_w, new_h), Image.Resampling.LANCZOS)

        try:
            text = pytesseract.image_to_string(gray, lang=lang)
        except pytesseract.TesseractError as e:
            # Fallback: shrink further and try once more
            # (rarely needed, but avoids a hard crash)
            print(f"  Tesseract error on page {page_number+1} of {tif_path.name}: {e}. Retrying at smaller size...")
            smaller = gray.resize(
                (max(1, gray.size[0] // 2), max(1, gray.size[1] // 2)),
                Image.Resampling.LANCZOS,
            )
            text = pytesseract.image_to_string(smaller, lang=lang)

        all_text_parts.append(f"===== PAGE {page_number+1} =====\n{text}\n")

    out_txt.parent.mkdir(parents=True, exist_ok=True)
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text_parts))


### 3.2 Run OCR on all .tif with per-file progress

In [11]:
def run_step3_ocr_tifs():
    """
    Step 3:
      - find all .tif in __type___tif
      - OCR each one -> __type___tif__OCR___/<name>.txt
      - delete the .tif working copy afterwards
    """
    if not TIF_FOLDER.exists():
        print(f"No TIF folder found: {TIF_FOLDER}")
        return

    tif_files = sorted(TIF_FOLDER.glob("*.tif"))
    n = len(tif_files)
    if n == 0:
        print(f"No .tif files found in {TIF_FOLDER}")
        return

    print(f"Found {n} TIF files in {TIF_FOLDER}")
    print(f"OCR text will be saved in: {TIF_OCR_FOLDER}\n")

    for i, tif in enumerate(tif_files, start=1):
        print(f"[{i}/{n}] OCR {tif.name}")
        out_txt = TIF_OCR_FOLDER / f"{tif.stem}_ocr.txt"
        ocr_tif_to_text_single(tif, out_txt, lang="eng")

        # remove working tif (original preserved in __raw__)
        try:
            tif.unlink()
        except Exception as e:
            print(f"  Could not delete {tif}: {e}")

    print("\nStep 3 complete â€“ OCR TIFs â†’ .txt\n")

run_step3_ocr_tifs()

No .tif files found in C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___tif


### Generic text batching helper (used in Steps 4, 7, 8)

In [19]:
def batch_merge_text_files(
    source_folder: Path,
    batch_prefix: str,
    dest_root: Path = NOTEBOOKLM_ROOT,
    max_bytes: int = MAX_BATCH_BYTES,  # unused, kept for compatibility
):
    """
    Merge .txt files from source_folder into batches with a hard limit
    on total characters (MAX_BATCH_CHARS).

    Each source file is wrapped with a header marking the boundaries.
    Outputs go to dest_root/<batch_prefix>_batch_XXX.txt
    """
    source_folder = Path(source_folder)
    dest_root.mkdir(parents=True, exist_ok=True)

    txt_files = sorted(source_folder.glob("*.txt"))
    if not txt_files:
        print(f"No .txt files found in {source_folder}")
        return

    # Continue numbering from existing batches, if any
    existing = list(dest_root.glob(f"{batch_prefix}_batch_*.txt"))
    if existing:
        indices = []
        for p in existing:
            stem = p.stem  # e.g. "txt_batch_003"
            parts = stem.split("_")
            try:
                idx = int(parts[-1])
                indices.append(idx)
            except ValueError:
                continue
        batch_index = (max(indices) + 1) if indices else 1
    else:
        batch_index = 1

    print(f"Merging {len(txt_files)} text files from {source_folder}")
    current_chars = 0
    out_path = dest_root / f"{batch_prefix}_batch_{batch_index:03d}.txt"
    fout = open(out_path, "w", encoding="utf-8")
    print(f"  -> Creating {out_path.name}")

    for txt in txt_files:
        # Header marking the start of each file
        header = (
            "\n\n" + "="*80 +
            f"\nFILE: {txt.name}\n" +
            "="*80 + "\n\n"
        )

        # Try UTF-8, fall back to Latin-1
        try:
            with open(txt, "r", encoding="utf-8") as fin:
                content = fin.read()
        except UnicodeDecodeError:
            with open(txt, "r", encoding="latin-1", errors="replace") as fin:
                content = fin.read()

        needed = len(header) + len(content)

        # If this file would overflow the current batch, start a new one
        if current_chars > 0 and current_chars + needed > MAX_BATCH_CHARS:
            fout.close()
            batch_index += 1
            current_chars = 0
            out_path = dest_root / f"{batch_prefix}_batch_{batch_index:03d}.txt"
            fout = open(out_path, "w", encoding="utf-8")
            print(f"  -> Creating {out_path.name}")

        # If even a single file + header is larger than the limit, warn but still write it
        if needed > MAX_BATCH_CHARS and current_chars == 0:
            print(
                f"WARNING: single file {txt.name} (with header) has "
                f"{needed} characters (> {MAX_BATCH_CHARS}). Writing alone in this batch."
            )

        fout.write(header)
        fout.write(content)
        current_chars += needed

    fout.close()
    print(f"Done. Created {batch_index} batch file(s) for prefix '{batch_prefix}'.\n")


## Step 4: Merge OCR in 200Mb batches

In [13]:
def run_step4_merge_tif_ocr_batches():
    """
    Step 4:
      - Merge all .txt in __type___tif__OCR___ into
        ___For_NotebookLM/tif_ocr_batch_XXX.txt
    """
    if not TIF_OCR_FOLDER.exists():
        print(f"No OCR folder found: {TIF_OCR_FOLDER}")
        return

    batch_merge_text_files(
        source_folder=TIF_OCR_FOLDER,
        batch_prefix="tif_ocr",
        dest_root=NOTEBOOKLM_ROOT,
        max_bytes=MAX_BATCH_BYTES,
    )

run_step4_merge_tif_ocr_batches()


Merging 7 text files from C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___tif__OCR___
  -> Creating tif_ocr_batch_001.txt
Done. Created 1 batch file(s) for prefix 'tif_ocr'.



## Step 5: Extract Images from PDFs

In [14]:
import fitz
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed

PDF_FOLDER        = BASE_FOLDER / "__type___pdf"
PDF_IMAGES_FOLDER = BASE_FOLDER / "__type___pdf___images"
PDF_IMAGES_FOLDER.mkdir(parents=True, exist_ok=True)

DPI = 300
MAX_WORKERS = 4
MIN_VECTOR_OPS       = 100
TEXT_CHAR_MAX        = 200
VERY_HIGH_VECTOR_OPS = 400

zoom = DPI / 72.0
MAT  = fitz.Matrix(zoom, zoom)

def page_metrics(page: fitz.Page):
    drawings = page.get_drawings() or []
    vector_ops = sum(len(d.get("items", ())) for d in drawings)
    text = page.get_text("text") or ""
    text_chars = len(text)
    image_count = len(page.get_images(full=True))
    return {"vector_ops": vector_ops, "text_chars": text_chars, "image_count": image_count}

def should_render(metrics):
    v, t = metrics["vector_ops"], metrics["text_chars"]
    if v >= VERY_HIGH_VECTOR_OPS:
        return True, f"vector_ops={v} (>= {VERY_HIGH_VECTOR_OPS})"
    if v >= MIN_VECTOR_OPS and t <= TEXT_CHAR_MAX:
        return True, f"vector_ops={v} & text_chars={t}"
    return False, f"skip (vector_ops={v}, text_chars={t})"

def render_selected_pages(pdf_path: Path):
    rows = []
    rendered = 0
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return f"FAIL open {pdf_path.name}: {e}", rows

    out_sub = PDF_IMAGES_FOLDER / pdf_path.stem
    out_sub.mkdir(parents=True, exist_ok=True)

    for i in range(len(doc)):
        page = doc.load_page(i)
        m = page_metrics(page)
        ok, reason = should_render(m)
        if not ok:
            continue
        try:
            pix = page.get_pixmap(matrix=MAT)
            out_png = out_sub / f"page-{i+1:04d}.png"
            pix.save(out_png)
            rendered += 1
            rows.append({
                "pdf_stem": pdf_path.stem,
                "pdf_path": str(pdf_path),
                "page": i + 1,
                "dpi": DPI,
                "vector_ops": m["vector_ops"],
                "text_chars": m["text_chars"],
                "image_count": m["image_count"],
                "image_path": str(out_png),
                "reason": reason,
            })
        except Exception as e:
            rows.append({
                "pdf_stem": pdf_path.stem,
                "pdf_path": str(pdf_path),
                "page": i + 1,
                "dpi": DPI,
                "vector_ops": m["vector_ops"],
                "text_chars": m["text_chars"],
                "image_count": m["image_count"],
                "image_path": "",
                "reason": f"ERROR: {e}",
            })

    status = f"OK   {pdf_path.name}: {rendered} vector pages"
    if rendered == 0:
        status = f"SKIP {pdf_path.name}: no vector-heavy pages"
    return status, rows

def run_step5_extract_pdf_images():
    pdfs = list(PDF_FOLDER.glob("*.pdf"))
    print("Found PDFs:", len(pdfs))

    all_rows = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(render_selected_pages, pdf): pdf for pdf in pdfs}
        for fut in as_completed(futures):
            status, rows = fut.result()
            print(status)
            all_rows.extend(rows)

    index_csv = PDF_IMAGES_FOLDER / "page_index_vector.csv"
    if all_rows:
        with index_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=list(all_rows[0].keys()))
            w.writeheader()
            w.writerows(all_rows)
        print(f"\nIndexed {len(all_rows)} rendered pages â†’ {index_csv}")
    else:
        print("\nNo pages rendered. Consider tuning thresholds.")

run_step5_extract_pdf_images()


Found PDFs: 5595
SKIP 01_1990_annual_report_MCC205_208_MCC983_990.pdf: no vector-heavy pages
SKIP 01_9934Y2A1.pdf: no vector-heavy pages
SKIP 01_8104Y7AR.pdf: no vector-heavy pages
SKIP 01_25490_Wonarah.pdf: no vector-heavy pages
SKIP 01_Annual_report.pdf: no vector-heavy pages
SKIP 01_Annual_reports_for_MCC316_MCC317_MCC338_MCC339_and_MCC342.pdf: no vector-heavy pages
SKIP 01_A900_Logistics_Report.pdf: no vector-heavy pages
SKIP 01_Annual_report_EL8773.pdf: no vector-heavy pages
SKIP 01_Annual_report_EL9221.pdf: no vector-heavy pages
OK   01_Annual_Report_EL23726.pdf: 2 vector pages
SKIP 01_Annual_report_MCC308.pdf: no vector-heavy pages
SKIP 01_Annual_report_MLC599.pdf: no vector-heavy pages
OK   01_Annual_Report_EL9978_9-09-04.pdf: 1 vector pages
SKIP 01_AP1782_1967_Reports.pdf: no vector-heavy pages
OK   01_Aro-el23400%2cA04.pdf: 4 vector pages
OK   01_Aro-el23409%2cA04.pdf: 4 vector pages
OK   01_Combined_Final_Report_2000.pdf: 1 vector pages
SKIP 01_ComstockGroup_2012_GA_01.pdf: 

## Step 6: Merge PDFs

In [16]:
from PyPDF2 import PdfMerger
from PyPDF2.errors import DependencyError


PDF_HEAVY_FOLDER = BASE_FOLDER / "__type___pdf__heavy"
PDF_HEAVY_FOLDER.mkdir(parents=True, exist_ok=True)

def get_pdf_files_with_sizes(folder: Path):
    pdfs = []
    for name in os.listdir(folder):
        if name.lower().endswith(".pdf"):
            full_path = folder / name
            if full_path.is_file():
                pdfs.append((full_path, full_path.stat().st_size))
    pdfs.sort(key=lambda x: x[0].name.lower())
    return pdfs

def make_pdf_batches(pdfs, max_bytes: int):
    batches = []
    current_batch, current_size = [], 0
    for path, size in pdfs:
        if current_batch and current_size + size > max_bytes:
            batches.append(current_batch)
            current_batch, current_size = [], 0
        current_batch.append((path, size))
        current_size += size
    if current_batch:
        batches.append(current_batch)
    return batches

def merge_pdf_batch(batch, output_path: Path):
    """
    Merge one batch of PDFs into a single output PDF.

    - Uses strict=False so PyPDF2 is more tolerant of bad structure
    - Disables outline/bookmark import (import_outline=False)
    - Skips encrypted or otherwise problematic PDFs instead of crashing
    """
    merger = PdfMerger(strict=False)
    try:
        for path, _ in batch:
            try:
                # Do NOT import outlines/bookmarks â€“ they are often broken
                merger.append(str(path), import_outline=False)
            except DependencyError as e:
                # AES-encrypted PDF but pycryptodome not installed
                print(
                    f"  Skipping encrypted PDF (needs PyCryptodome): "
                    f"{path.name} ({e})"
                )
            except Exception as e:
                # Any other weirdness, e.g. broken destinations / outlines
                print(
                    f"  Skipping problematic PDF during merge: "
                    f"{path.name} ({e})"
                )

        with open(output_path, "wb") as f_out:
            merger.write(f_out)
    finally:
        merger.close()



def run_step6_merge_pdfs():
    """
    Step 6:
      - Move PDFs > MAX_BATCH_BYTES to __type___pdf__heavy
      - Batch-merge the rest into <~199 MB PDFs in ___For_NotebookLM
      - Delete the processed PDFs from __type___pdf
    """
    pdfs = get_pdf_files_with_sizes(PDF_FOLDER)
    if not pdfs:
        print("No PDFs found in __type___pdf")
        return

    normal, heavy = [], []
    for path, size in pdfs:
        if size > MAX_BATCH_BYTES:
            heavy.append((path, size))
        else:
            normal.append((path, size))

    # Move heavy ones
    for path, size in heavy:
        dest = PDF_HEAVY_FOLDER / path.name
        print(f"Heavy PDF (>199 MB): {path.name} -> {dest}")
        shutil.move(str(path), str(dest))

    print(f"\nHeavy PDFs moved: {len(heavy)}")
    print(f"Normal PDFs to batch: {len(normal)}")

    if not normal:
        print("No normal PDFs to merge.")
        return

    batches = make_pdf_batches(normal, MAX_BATCH_BYTES)
    print(f"Planned {len(batches)} PDF batch(es).\n")

    for i, batch in enumerate(batches, start=1):
        out_name = f"pdf_batch_{i:03d}.pdf"
        out_path = NOTEBOOKLM_ROOT / out_name
        total_mb = sum(size for _, size in batch) / (1024**2)
        print(f"  Creating {out_name}: {len(batch)} files, sum input â‰ˆ {total_mb:.2f} MB")
        merge_pdf_batch(batch, out_path)

    # Delete processed normal PDFs from __type___pdf
    for path, _ in normal:
        if path.exists():
            path.unlink()

    print("\nStep 6 complete â€“ PDFs batched and heavy ones isolated.\n")

run_step6_merge_pdfs()



Heavy PDFs moved: 0
Normal PDFs to batch: 5595
Planned 73 PDF batch(es).

  Creating pdf_batch_001.pdf: 60 files, sum input â‰ˆ 186.86 MB
  Creating pdf_batch_002.pdf: 40 files, sum input â‰ˆ 184.10 MB
  Creating pdf_batch_003.pdf: 30 files, sum input â‰ˆ 187.78 MB
  Creating pdf_batch_004.pdf: 28 files, sum input â‰ˆ 187.62 MB
  Creating pdf_batch_005.pdf: 26 files, sum input â‰ˆ 184.47 MB
  Creating pdf_batch_006.pdf: 33 files, sum input â‰ˆ 188.94 MB
  Creating pdf_batch_007.pdf: 33 files, sum input â‰ˆ 179.22 MB
  Creating pdf_batch_008.pdf: 38 files, sum input â‰ˆ 184.98 MB
  Creating pdf_batch_009.pdf: 38 files, sum input â‰ˆ 167.68 MB
  Creating pdf_batch_010.pdf: 34 files, sum input â‰ˆ 187.65 MB
  Creating pdf_batch_011.pdf: 42 files, sum input â‰ˆ 188.49 MB
  Creating pdf_batch_012.pdf: 63 files, sum input â‰ˆ 188.19 MB
  Creating pdf_batch_013.pdf: 110 files, sum input â‰ˆ 188.16 MB
  Creating pdf_batch_014.pdf: 119 files, sum input â‰ˆ 179.50 MB
  Creating pdf_batch_015.pd

Invalid stream (index 0) within object 537 0: Stream has ended unexpectedly
Invalid stream (index 0) within object 538 0: Stream has ended unexpectedly
Invalid stream (index 0) within object 539 0: Stream has ended unexpectedly
Invalid stream (index 0) within object 540 0: Stream has ended unexpectedly
Invalid stream (index 0) within object 541 0: Stream has ended unexpectedly


  Creating pdf_batch_058.pdf: 34 files, sum input â‰ˆ 187.15 MB
  Creating pdf_batch_059.pdf: 124 files, sum input â‰ˆ 188.04 MB
  Creating pdf_batch_060.pdf: 119 files, sum input â‰ˆ 174.24 MB


Invalid stream (index 9) within object 25 0: Stream has ended unexpectedly


  Creating pdf_batch_061.pdf: 157 files, sum input â‰ˆ 184.59 MB


Invalid stream (index 2) within object 17 0: Stream has ended unexpectedly
Invalid stream (index 2) within object 20 0: Stream has ended unexpectedly


  Creating pdf_batch_062.pdf: 65 files, sum input â‰ˆ 184.79 MB


incorrect startxref pointer(3)


  Creating pdf_batch_063.pdf: 28 files, sum input â‰ˆ 178.77 MB
  Creating pdf_batch_064.pdf: 61 files, sum input â‰ˆ 189.99 MB
  Creating pdf_batch_065.pdf: 122 files, sum input â‰ˆ 180.36 MB
  Creating pdf_batch_066.pdf: 74 files, sum input â‰ˆ 174.32 MB


Object ID 4,0 ref repaired
Object ID 4,0 ref repaired


  Creating pdf_batch_067.pdf: 92 files, sum input â‰ˆ 156.77 MB
  Creating pdf_batch_068.pdf: 58 files, sum input â‰ˆ 189.36 MB
  Creating pdf_batch_069.pdf: 77 files, sum input â‰ˆ 186.87 MB


Object ID 4,0 ref repaired


  Creating pdf_batch_070.pdf: 70 files, sum input â‰ˆ 189.54 MB


Object ID 4,0 ref repaired


  Creating pdf_batch_071.pdf: 56 files, sum input â‰ˆ 186.81 MB
  Creating pdf_batch_072.pdf: 102 files, sum input â‰ˆ 187.16 MB
  Creating pdf_batch_073.pdf: 54 files, sum input â‰ˆ 87.98 MB

Step 6 complete â€“ PDFs batched and heavy ones isolated.



## Step 7: Merge DOCs

In [20]:
from docx import Document

DOCX_FOLDER      = BASE_FOLDER / "__type___docx"
DOCX_TXT_FOLDER  = BASE_FOLDER / "__type___docx__TXT__"
DOCX_TXT_FOLDER.mkdir(parents=True, exist_ok=True)

def docx_to_txt_single(docx_path: Path, out_txt: Path):
    doc = Document(str(docx_path))
    lines = []
    for para in doc.paragraphs:
        lines.append(para.text)
    out_txt.parent.mkdir(parents=True, exist_ok=True)
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))



def run_step7_docx_to_txt_and_merge():
    """
    Step 7:
      - Convert all .docx in __type___docx â†’ .txt in __type___docx__TXT__
      - Delete the working .docx
      - Batch-merge those txt into ___For_NotebookLM/docx_batch_XXX.txt

    If no .docx remain (e.g. from a previous run), it will still merge any
    existing .txt files in __type___docx__TXT__ into batches.
    """
    # --- 1. Convert any remaining DOCX to TXT ---
    docx_files = []
    if DOCX_FOLDER.exists():
        docx_files = sorted(DOCX_FOLDER.glob("*.docx"))

    n = len(docx_files)
    if n > 0:
        print(f"Found {n} DOCX files in {DOCX_FOLDER}")

        for i, docx in enumerate(docx_files, start=1):
            print(f"[{i}/{n}] Converting {docx.name} -> txt")
            out_txt = DOCX_TXT_FOLDER / f"{docx.stem}.txt"
            docx_to_txt_single(docx, out_txt)
            try:
                docx.unlink()
            except Exception as e:
                print(f"  Could not delete {docx}: {e}")
    else:
        print(f"No .docx files found in {DOCX_FOLDER} â€“ skipping conversion step.")

    # --- 2. Always try to merge any TXT in DOCX_TXT_FOLDER ---
    txt_files = list(DOCX_TXT_FOLDER.glob("*.txt"))
    if not txt_files:
        print(f"No .txt files found in {DOCX_TXT_FOLDER}, nothing to merge.")
        return

    batch_merge_text_files(
        source_folder=DOCX_TXT_FOLDER,
        batch_prefix="docx",
        dest_root=NOTEBOOKLM_ROOT,
        max_bytes=MAX_BATCH_BYTES,
    )

    print("Step 7 complete â€“ DOCX â†’ TXT â†’ batches.\n")

run_step7_docx_to_txt_and_merge()


No .docx files found in C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___docx â€“ skipping conversion step.
No .txt files found in C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___docx__TXT__, nothing to merge.


## Step 8 â€“ Merge raw .txt into batches

In [21]:
TXT_FOLDER = BASE_FOLDER / "__type___txt"

def run_step8_merge_txt_batches():
    """
    Step 8:
      - Batch-merge all .txt in __type___txt into ___For_NotebookLM/txt_batch_XXX.txt
      - Delete the processed .txt files
    """
    if not TXT_FOLDER.exists():
        print(f"No TXT folder found: {TXT_FOLDER}")
        return

    txt_files = list(TXT_FOLDER.glob("*.txt"))
    if not txt_files:
        print(f"No .txt files found in {TXT_FOLDER}")
        return

    batch_merge_text_files(
        source_folder=TXT_FOLDER,
        batch_prefix="txt",
        dest_root=NOTEBOOKLM_ROOT,
        max_bytes=MAX_BATCH_BYTES,
    )

    # Delete processed txt files
    for path in txt_files:
        if path.exists():
            path.unlink()

    print("Step 8 complete â€“ raw TXT batched.\n")

run_step8_merge_txt_batches()


Merging 1050 text files from C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___txt
  -> Creating txt_batch_003.txt
  -> Creating txt_batch_004.txt
  -> Creating txt_batch_005.txt
  -> Creating txt_batch_006.txt
  -> Creating txt_batch_007.txt
  -> Creating txt_batch_008.txt
  -> Creating txt_batch_009.txt
  -> Creating txt_batch_010.txt
  -> Creating txt_batch_011.txt
  -> Creating txt_batch_012.txt
  -> Creating txt_batch_013.txt
  -> Creating txt_batch_014.txt
  -> Creating txt_batch_015.txt
  -> Creating txt_batch_016.txt
  -> Creating txt_batch_017.txt
  -> Creating txt_batch_018.txt
  -> Creating txt_batch_019.txt
  -> Creating txt_batch_020.txt
  -> Creating txt_batch_021.txt
  -> Creating txt_batch_022.txt
  -> Creating txt_batch_023.txt
  -> Creating txt_batch_024.txt
  -> Creating txt_batch_025.txt
  -> Creating txt_batch_026.txt
  -> Creating txt_batch_027.txt
  -> Creating txt_batch_028.txt
  -> C

## Step 9: Final cleanup â€“ delete empty __type___docx/txt/tif/pdf folders

In [22]:
def cleanup_empty_type_folders():
    for name in ["__type___docx", "__type___txt", "__type___tif", "__type___pdf"]:
        folder = BASE_FOLDER / name
        if folder.exists():
            if any(folder.iterdir()):
                print(f"Not empty (skipping delete): {folder}")
            else:
                try:
                    folder.rmdir()
                    print(f"Deleted empty folder: {folder}")
                except OSError as e:
                    print(f"Could not delete {folder}: {e}")

cleanup_empty_type_folders()


Could not delete C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___txt: [WinError 5] Access is denied: 'C:\\Users\\Julian.Diaz\\OneDrive - XENITH CONSULTING PTY LTD\\Documents\\00_Projects\\96_NT-Gold-CaseStudy\\Reports-Tenements2\\__type___txt'
Could not delete C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___tif: [WinError 5] Access is denied: 'C:\\Users\\Julian.Diaz\\OneDrive - XENITH CONSULTING PTY LTD\\Documents\\00_Projects\\96_NT-Gold-CaseStudy\\Reports-Tenements2\\__type___tif'
Could not delete C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\__type___pdf: [WinError 5] Access is denied: 'C:\\Users\\Julian.Diaz\\OneDrive - XENITH CONSULTING PTY LTD\\Documents\\00_Projects\\96_NT-Gold-CaseStudy\\Reports-Tenements2\\__type___pdf'


## Step 10: chunk large NotebookLM text files

In [23]:
def run_step9_chunk_large_notebooklm_txt_files():
    """
    Step 9:
      - Scan ___For_NotebookLM for .txt files
      - For any file with more than MAX_BATCH_CHARS characters,
        split into smaller parts (<= MAX_BATCH_CHARS) and delete the original.

    This cleans up cases where a single source file is larger than the limit
    and ended up alone in one batch.
    """
    CHUNK_CHARS = MAX_BATCH_CHARS  # currently 490_000
    txt_files = sorted(NOTEBOOKLM_ROOT.glob("*.txt"))
    if not txt_files:
        print(f"No .txt files found in {NOTEBOOKLM_ROOT}")
        return

    print(f"Checking {len(txt_files)} NotebookLM .txt files for oversize chunks...")
    chunked_count = 0

    for path in txt_files:
        # We wrote these ourselves as UTF-8 in batch_merge_text_files
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Extremely unlikely here, but be defensive
            text = path.read_text(encoding="latin-1", errors="replace")

        length = len(text)
        if length <= CHUNK_CHARS:
            continue

        # Need to split this file into smaller pieces
        base = path.stem        # e.g. "txt_batch_002"
        suffix = path.suffix    # ".txt"

        n_parts = (length + CHUNK_CHARS - 1) // CHUNK_CHARS
        print(f"  Chunking {path.name} ({length} chars) into {n_parts} part(s).")

        for i in range(n_parts):
            start = i * CHUNK_CHARS
            end = min((i + 1) * CHUNK_CHARS, length)
            chunk = text[start:end]

            part_name = f"{base}_part{i+1:02d}{suffix}"
            part_path = path.with_name(part_name)
            part_path.write_text(chunk, encoding="utf-8")

        # Remove the oversized original
        path.unlink()
        chunked_count += 1

    if chunked_count == 0:
        print("No oversized NotebookLM .txt files needed chunking.")
    else:
        print(f"Step 9 complete â€“ chunked {chunked_count} oversized .txt file(s).")


In [24]:
run_step9_chunk_large_notebooklm_txt_files()


Checking 82 NotebookLM .txt files for oversize chunks...
  Chunking txt_batch_001.txt (1393464 chars) into 3 part(s).
  Chunking txt_batch_003.txt (1393464 chars) into 3 part(s).
  Chunking txt_batch_031.txt (1336682 chars) into 3 part(s).
  Chunking txt_batch_045.txt (1028523 chars) into 3 part(s).
  Chunking txt_batch_047.txt (2136882 chars) into 5 part(s).
  Chunking txt_batch_052.txt (2608008 chars) into 6 part(s).
  Chunking txt_batch_056.txt (1887786 chars) into 4 part(s).
  Chunking txt_batch_060.txt (1656104 chars) into 4 part(s).
  Chunking txt_batch_062.txt (1205814 chars) into 3 part(s).
  Chunking txt_batch_066.txt (549889 chars) into 2 part(s).
  Chunking txt_batch_071.txt (717913 chars) into 2 part(s).
  Chunking txt_batch_073.txt (1075368 chars) into 3 part(s).
  Chunking txt_batch_078.txt (717474 chars) into 2 part(s).
Step 9 complete â€“ chunked 13 oversized .txt file(s).
