In [3]:
import os
from PIL import Image
import fitz  # PyMuPDF
import pytesseract
from PyPDF2 import PdfMerger

# Point pytesseract to your Tesseract installation.
# The override is applied only when the Windows installer's default location
# actually exists; otherwise pytesseract keeps its own default command, so a
# Tesseract binary that is reachable via PATH keeps working on other machines.
_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXE


def ocr_pdf_to_text(pdf_path, txt_path=None, lang="eng", dpi=300):
    """
    Render each page of a PDF (even if named .tif) to an image using PyMuPDF,
    run OCR with Tesseract, and write all text to a .txt file.

    Args:
        pdf_path: Path to the input file. Its bytes are parsed as a PDF
            regardless of the file extension.
        txt_path: Destination text file. When omitted, the last extension of
            ``pdf_path`` is replaced by ``_ocr.txt``.
        lang: Tesseract language code(s) forwarded to pytesseract.
        dpi: Resolution used to rasterise each page before OCR (default 300).

    Returns:
        None. The recognised text is written to ``txt_path`` as UTF-8, with a
        ``===== PAGE n =====`` header in front of each page's text.
    """
    if txt_path is None:
        base, _ = os.path.splitext(pdf_path)
        txt_path = base + "_ocr.txt"  # e.g. cr_25613_1.pdf_ocr.txt

    # Open the file as a PDF via its bytes, ignoring extension
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # 72 dpi is the default render resolution, so dpi / 72 is the zoom factor.
    # The matrix is identical for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    all_text = []

    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        total_pages = len(doc)
        for page_number, page in enumerate(doc, start=1):
            print(f"OCR text: page {page_number} of {total_pages}")

            pix = page.get_pixmap(matrix=mat)

            # Pixmap samples are raw bytes; the mode must match the presence
            # of an alpha channel or PIL will misread the buffer.
            mode = "RGBA" if pix.alpha else "RGB"
            img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)

            # Convert to grayscale for OCR
            gray = img.convert("L")

            text = pytesseract.image_to_string(gray, lang=lang)
            all_text.append(f"===== PAGE {page_number} =====\n{text}\n")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(all_text))

    print(f"Saved OCR text to: {txt_path}")


def ocr_pdf_to_searchable_pdf(pdf_path, out_pdf_path=None, lang="eng", dpi=300):
    """
    Create a new PDF where each page is the rendered image plus
    an invisible, searchable text layer underneath (via Tesseract).

    Each page is rendered, OCR'd into a one-page PDF that is written to a
    temporary file next to the output, and the one-page PDFs are then merged
    in page order. Temporary files are removed even when processing fails.

    Args:
        pdf_path: Path to the input file. Its bytes are parsed as a PDF
            regardless of the file extension.
        out_pdf_path: Destination PDF. When omitted, the last extension of
            ``pdf_path`` is replaced by ``_searchable.pdf``.
        lang: Tesseract language code(s) forwarded to pytesseract.
        dpi: Resolution used to rasterise each page before OCR (default 300).

    Returns:
        None. The merged PDF is written to ``out_pdf_path``.
    """
    if out_pdf_path is None:
        base, _ = os.path.splitext(pdf_path)
        out_pdf_path = base + "_searchable.pdf"  # e.g. cr_25613_1.pdf_searchable.pdf

    # Open as PDF via bytes, ignoring extension
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # 72 dpi is the default render resolution, so dpi / 72 is the zoom factor.
    # The matrix is identical for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    temp_pdfs = []
    merger = PdfMerger()

    try:
        # The context manager closes the document even if OCR fails part-way.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            total_pages = len(doc)
            for page_number, page in enumerate(doc, start=1):
                print(f"OCR (PDF): page {page_number} of {total_pages}")

                pix = page.get_pixmap(matrix=mat)

                mode = "RGBA" if pix.alpha else "RGB"
                img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
                gray = img.convert("L")

                # Get a one-page searchable PDF from Tesseract
                pdf_bytes_page = pytesseract.image_to_pdf_or_hocr(
                    gray,
                    extension="pdf",
                    lang=lang,
                )

                temp_page_path = f"{out_pdf_path}.page{page_number}.tmp.pdf"
                # Register the path before writing so that a partially
                # written file is still cleaned up if the write fails.
                temp_pdfs.append(temp_page_path)
                with open(temp_page_path, "wb") as f:
                    f.write(pdf_bytes_page)

        # Merge all temporary one-page PDFs
        for p in temp_pdfs:
            merger.append(p)
        with open(out_pdf_path, "wb") as f_out:
            merger.write(f_out)
    finally:
        # Close the merger first: it may still hold the temporary files open,
        # which would block their removal.
        merger.close()

        # Clean up temporary files
        for p in temp_pdfs:
            if os.path.exists(p):
                os.remove(p)

    print(f"Saved searchable PDF to: {out_pdf_path}")


# ==== RUN HERE ====
# Guarded so that importing this file does not kick off a long OCR run; when
# executed as a script or notebook cell, __name__ is "__main__" and both steps
# run exactly as before.
if __name__ == "__main__":
    # The input carries a .tif suffix but is opened as a PDF by both functions.
    pdf_path = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\02_SHP\2025_11\NW-QLD-Available-Exercise\HistoricReports\cr_25613_1.pdf.tif"

    # 1) Extract text into a .txt file
    ocr_pdf_to_text(pdf_path)

    # 2) Create a searchable PDF
    ocr_pdf_to_searchable_pdf(pdf_path)


OCR text: page 1 of 31
OCR text: page 2 of 31
OCR text: page 3 of 31
OCR text: page 4 of 31
OCR text: page 5 of 31
OCR text: page 6 of 31
OCR text: page 7 of 31
OCR text: page 8 of 31
OCR text: page 9 of 31
OCR text: page 10 of 31
OCR text: page 11 of 31
OCR text: page 12 of 31
OCR text: page 13 of 31
OCR text: page 14 of 31
OCR text: page 15 of 31
OCR text: page 16 of 31
OCR text: page 17 of 31
OCR text: page 18 of 31
OCR text: page 19 of 31
OCR text: page 20 of 31
OCR text: page 21 of 31
OCR text: page 22 of 31
OCR text: page 23 of 31
OCR text: page 24 of 31
OCR text: page 25 of 31
OCR text: page 26 of 31
OCR text: page 27 of 31
OCR text: page 28 of 31
OCR text: page 29 of 31
OCR text: page 30 of 31
OCR text: page 31 of 31
Saved OCR text to: C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\05_Geodatabases\02_SHP\2025_11\NW-QLD-Available-Exercise\HistoricReports\cr_25613_1.pdf_ocr.txt
OCR (PDF): page 1 of 31
OCR (PDF): page 2 of 31
OCR (PDF): page 3 of 31
OCR (PDF):