In [1]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw
import os

####################################################
# 3. Main visualizer
####################################################

def visualize_layout_debug(
    pdf_path,
    page_num=0,
    dpi=150,
    save_dir="Projects/exam_paper_parser_V1.0/data/raw_papers/pdf_layout",
    show_labels=True
):
    """
    Collect and print the text and image blocks of a single PDF page.

    The page is read with ``page.get_text("dict")``. Every block whose
    ``type`` is 0 (or missing) is recorded as a text block with ids
    ``T0``, ``T1``, ...; every block whose ``type`` is 1 is recorded as an
    image block with ids ``I0``, ``I1``, ...; blocks of any other type are
    ignored. Both lists are printed to stdout and returned.

    NOTE: this version does not render or save any debug image. ``save_dir``
    is still created (as before) but nothing is written into it, and ``dpi``
    and ``show_labels`` are accepted only so existing callers keep working.

    Args:
      pdf_path: path of the PDF to open.
      page_num: index of the page to inspect, passed straight to ``doc[...]``.
      dpi: currently unused.
      save_dir: directory that is created if missing; currently unused otherwise.
      show_labels: currently unused.

    Returns:
      ``{"text_blocks": [...], "image_blocks": [...]}`` where every entry is a
      dict with the keys ``id``, ``bbox_pdf`` (the block bbox as a tuple) and
      ``text`` (all span texts of the block joined with single spaces and
      stripped; empty when the block has no ``lines``).

    Raises:
      Whatever ``fitz.open`` or the page lookup raises (e.g. an out-of-range
      ``page_num``). The document is closed in either case.
    """

    os.makedirs(save_dir, exist_ok=True)

    # -----------------
    # Open the PDF and pull the layout dictionary
    # -----------------
    doc = fitz.open(pdf_path)
    try:
        page = doc[page_num]
        # The result is plain Python data, so it stays usable after close().
        page_dict = page.get_text("dict")
    finally:
        # Release the file handle even when the page lookup fails.
        doc.close()

    # -----------------
    # 1. Extract text/image blocks
    # -----------------
    text_blocks = []
    image_blocks = []
    for block in page_dict["blocks"]:
        bbox_pdf = tuple(block["bbox"])
        btype = block.get("type", 0)

        # Collect the text of every span; blocks without "lines" yield "".
        spans_text = [
            span["text"]
            for line in block.get("lines", [])
            for span in line["spans"]
        ]
        joined_text = " ".join(spans_text).strip()

        if btype == 0:
            text_blocks.append({
                "id": f"T{len(text_blocks)}",
                "bbox_pdf": bbox_pdf,
                "text": joined_text,
            })
        elif btype == 1:
            image_blocks.append({
                "id": f"I{len(image_blocks)}",
                "bbox_pdf": bbox_pdf,
                "text": joined_text,
            })
        # Any other block type is deliberately skipped.

    # -----------------
    # 2. Console debug info
    # -----------------
    print("\n=== TEXT BLOCKS (red) ===")
    for tb in text_blocks:
        print(tb["id"])
        print(" pdf bbox:", tb["bbox_pdf"])
        # Cap the preview at 200 characters and keep it on one line.
        print(" text:", tb["text"][:200].replace("\n", " "))
        print()

    print("=== IMAGE BLOCKS (green) ===")
    for ib in image_blocks:
        print(ib["id"])
        print(" pdf bbox:", ib["bbox_pdf"])
        print(" note (usually empty):", ib["text"][:80])
        print()

    return {
        "text_blocks": text_blocks,
        "image_blocks": image_blocks
    }


####################################################
# 4. Example usage
####################################################

if __name__ == "__main__":
    # Demo run: dump the layout blocks of one page of a sample exam paper.
    pdf_path = "../data/raw_papers/June 2018 QP - Paper 1 (H) Edexcel Physics GCSE.pdf"
    page_num = 2  # sonar graph page in your screenshots
    save_dir_tmp = "../data/raw_papers/pdf_layout"

    visualize_layout_debug(
        pdf_path=pdf_path,
        page_num=page_num,
        dpi=150,
        save_dir=save_dir_tmp,
        show_labels=True,
    )




=== TEXT BLOCKS (red) ===
T0
 pdf bbox: (198.9763946533203, 767.2429809570312, 541.0152587890625, 799.2008666992188)
 text: 3 *P60466A0332* Turn over

T1
 pdf bbox: (4.050839900970459, 69.09228515625, 15.002639770507812, 710.6641845703125)
 text: DO NOT WRITE IN THIS AREA  DO NOT WRITE IN THIS AREA  DO NOT WRITE IN THIS AREA

T2
 pdf bbox: (36.738258361816406, 120.83814239501953, 47.690059661865234, 762.4212036132812)
 text: DO NOT WRITE IN THIS AREA  DO NOT WRITE IN THIS AREA  DO NOT WRITE IN THIS AREA

T3
 pdf bbox: (64.67610168457031, 93.0523681640625, 469.5926208496094, 116.78492736816406)
 text: (c) Figure 1 shows the depth of the sea, measured using sonar, at different distances  from the shore.

T4
 pdf bbox: (282.56451416015625, 129.16949462890625, 413.88201904296875, 140.1212921142578)
 text: distance from the shore in m

T5
 pdf bbox: (192.30972290039062, 151.16070556640625, 472.26068115234375, 162.1125030517578)
 text: 140 120 100 80 60 40 20 0

T6
 pdf bbox: (174.185531616

In [2]:
import os, json, io
import fitz  # PyMuPDF

def ensure_dir(path):
    """Create directory `path` including missing parents; no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def save_bytes_to_png(b, out_path):
    """
    Write the bytes `b` verbatim to `out_path` (binary mode, overwriting).

    Despite the name, no conversion happens: the bytes are stored exactly as
    given, whatever image format they represent.
    """
    sink = open(out_path, "wb")
    try:
        sink.write(b)
    finally:
        sink.close()

def extract_images_and_drawings(pdf_path, out_dir="extracted_artifacts", zoom=2.0, pad=2.0):
    """
    Extract raster images and vector drawings from every page of a PDF.

    For each page this:
      - saves the content of every embedded image once per XREF under
        ``<out_dir>/images_unique/xref_<xref>.<ext>`` and records every
        placement rect that ``page.get_image_rects`` reports for it;
      - renders the (padded, page-clipped) bounding rect of every item
        returned by ``page.get_drawings()`` to
        ``<out_dir>/page_NNNN/drawing_NNN.png``. The clip contains whatever
        is visible in that rect, not only the drawing itself.

    A JSON manifest describing all artifacts is written to
    ``<out_dir>/artifacts_manifest.json``. Rects are stored exactly as PyMuPDF
    reports them (its page space: top-left origin, y grows downwards).

    Args:
      pdf_path: path to input PDF
      out_dir: base folder to save per-page images/drawings
      zoom: scale factor for higher-resolution PNGs
      pad: padding (points) to expand around drawing rects before clipping

    Returns:
      Path of the written manifest file.

    Raises:
      Whatever PyMuPDF or the file system raises; the document is closed in
      every case.
    """
    doc = fitz.open(pdf_path)
    try:
        ensure_dir(out_dir)

        manifest = {
            "pdf": os.path.abspath(pdf_path),
            "pages": []
        }

        # xref -> saved file path, so each image's bytes are stored only once
        saved_images = {}
        images_dir = os.path.join(out_dir, "images_unique")

        # The render matrix depends only on `zoom`, so build it once.
        mat = fitz.Matrix(zoom, zoom)

        for pno in range(doc.page_count):
            page = doc[pno]
            page_dir = os.path.join(out_dir, f"page_{pno+1:04d}")
            ensure_dir(page_dir)

            page_entry = {
                "page_number": pno + 1,
                "page_size": {"width": page.rect.width, "height": page.rect.height},
                "raster_images": [],
                "drawings": []
            }

            # --------- RASTER IMAGES ----------
            # get_images() can list the same xref more than once on a page,
            # while get_image_rects(xref) already returns ALL placements of
            # that xref. Handle each xref once per page so placements are not
            # recorded twice.
            seen_on_page = set()
            for img in page.get_images(full=True):  # [(xref, smask, ...), ...]
                xref = img[0]
                if xref in seen_on_page:
                    continue
                seen_on_page.add(xref)

                # 1) Save image content once per XREF (across all pages)
                if xref not in saved_images:
                    img_info = doc.extract_image(xref)
                    ext = img_info.get("ext", "png")
                    ensure_dir(images_dir)
                    img_path = os.path.join(images_dir, f"xref_{xref}.{ext}")
                    # Bytes are written as-is; `ext` tells the real format.
                    save_bytes_to_png(img_info["image"], img_path)
                    saved_images[xref] = img_path

                # 2) Record ALL placements of this image on THIS page (rects)
                for idx, r in enumerate(page.get_image_rects(xref)):
                    page_entry["raster_images"].append({
                        "type": "image",
                        "xref": xref,
                        "instance_index": idx,  # if same image appears multiple times
                        "file": saved_images[xref],
                        "rect": {"x0": r.x0, "y0": r.y0, "x1": r.x1, "y1": r.y1},
                    })

            # --------- VECTOR DRAWINGS ----------
            # Fast method: clip-render each drawing's bounding rect.
            for didx, d in enumerate(page.get_drawings()):
                r = fitz.Rect(d["rect"])
                # pad a little to avoid hard crops on thin lines, then keep
                # the clip inside the page
                r = fitz.Rect(r.x0 - pad, r.y0 - pad, r.x1 + pad, r.y1 + pad).intersect(page.rect)
                if r.is_empty:
                    continue

                # render at higher resolution; no transparency, white bg
                pm = page.get_pixmap(matrix=mat, clip=r, alpha=False)
                draw_path = os.path.join(page_dir, f"drawing_{didx+1:03d}.png")
                pm.save(draw_path)

                page_entry["drawings"].append({
                    "type": "drawing",
                    "index": didx,
                    "file": draw_path,
                    "rect": {"x0": r.x0, "y0": r.y0, "x1": r.x1, "y1": r.y1}
                })

            manifest["pages"].append(page_entry)
    finally:
        # Always release the PDF, also when extraction fails half-way.
        doc.close()

    # Save the manifest so you can later bind drawings/images to questions by coords
    manifest_path = os.path.join(out_dir, "artifacts_manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)

    return manifest_path

# Example: extract every image/drawing of the sample paper and report the manifest path.
pdf_path = "../data/raw_papers/June 2018 QP - Paper 1 (H) Edexcel Physics GCSE.pdf"
page_num = 2  # sonar graph page in your screenshots
save_dir_tmp = "../data/raw_papers/pdf_layout"
manifest_json = extract_images_and_drawings(
    pdf_path,
    out_dir="../data/raw_papers/paper_artifacts",
    zoom=2.0,
    pad=3.0,
)
print("Wrote manifest:", manifest_json)


Wrote manifest: ../data/raw_papers/paper_artifacts/artifacts_manifest.json


**Stitch the drawings back onto a PDF**

In [4]:
import os
import json
import fitz  # PyMuPDF

def load_manifest(manifest_path):
    """Read the JSON file at `manifest_path` and return the decoded object."""
    with open(manifest_path, "r") as handle:
        raw = handle.read()
    return json.loads(raw)

def as_rect(rdict):
    """Build a ``fitz.Rect`` from a mapping with the keys x0, y0, x1 and y1."""
    corners = [rdict[key] for key in ("x0", "y0", "x1", "y1")]
    return fitz.Rect(*corners)

def _place_image(page, image_path, rect, try_opacity=None):
    """
    Insert the image file `image_path` into `page`, stretched to `rect`.

    A missing file only produces a warning. When `try_opacity` is given, the
    insert is first attempted with an ``opacity`` keyword and repeated
    without it if the installed PyMuPDF rejects that keyword.
    NOTE(review): confirm that the installed ``Page.insert_image`` supports
    ``opacity`` at all; if it does not, overlays are always pasted opaque.
    """
    if not os.path.exists(image_path):
        print(f"[WARN] Missing image: {image_path}")
        return

    base_kwargs = dict(filename=image_path, rect=rect, keep_proportion=False, overlay=True)

    if try_opacity is None:
        # Plain insert (no transparency requested)
        page.insert_image(**base_kwargs)
        return

    try:
        page.insert_image(**{**base_kwargs, "opacity": try_opacity})
    except (TypeError, ValueError):
        # Unknown keyword: a keyword-only signature raises TypeError;
        # presumably some releases validate kwargs and raise ValueError.
        # Retrying is safe: a genuine argument problem raises again.
        page.insert_image(**base_kwargs)


def reconstruct_from_manifest(
    manifest_path,
    original_pdf_path=None,
    out_overlay_path="reconstructed_overlay.pdf",
    out_reconstructed_only_path="reconstructed_only.pdf",
    opacity=0.65,              # 0..1 (only affects overlay)
    draw_inspection_boxes=True, # draw a thin border around placed rects in overlay
    border_rgb=(1, 0, 0),
    border_width=0.6
):
    """
    Reads artifacts_manifest.json and places each artifact image back at its recorded rect.
    - If original_pdf_path is provided, creates an overlay PDF (original + pasted artifacts).
    - Always creates a 'reconstructed_only' PDF with blank pages and the pasted artifacts.

    Notes:
    - The reconstructed-only PDF gets one blank page per manifest entry, in
      manifest order, sized from the entry's ``page_size`` (595 x 842 when
      missing). Artifacts of entry i go onto page i of that document.
    - In the overlay, artifacts go onto page ``page_number - 1`` of the original.
    - Entries without ``page_number`` keep their blank page but get no artifacts.
    - Images are stretched to the recorded rect (aspect ratio is not kept).
    - Missing artifact files are reported with a warning and skipped.
    - Parent directories of both output paths are created if needed.

    Returns: (overlay_path or None, reconstructed_only_path), both absolute.

    Raises:
      ValueError: if the manifest has no ``pages`` entries.
    """
    mf = load_manifest(manifest_path)
    pages = mf.get("pages", [])
    if not pages:
        raise ValueError("Manifest has no 'pages' entries.")

    overlay_doc = None
    reconstructed_only_doc = None
    try:
        # If original provided -> open for overlay. Otherwise skip overlay.
        if original_pdf_path:
            overlay_doc = fitz.open(original_pdf_path)

        # One blank page per manifest entry, sized from the manifest.
        reconstructed_only_doc = fitz.open()
        for p in pages:
            ps = p.get("page_size", {})
            reconstructed_only_doc.new_page(
                width=ps.get("width", 595.0),    # default A4-ish if missing
                height=ps.get("height", 842.0),
            )

        for idx, p in enumerate(pages):
            pno = p.get("page_number")  # 1-based in our manifest
            if pno is None:
                continue

            # Compare against None: a document's truth value follows its
            # page count, not its existence.
            overlay_page = overlay_doc[pno - 1] if overlay_doc is not None else None
            # Blank pages were created in manifest order, so index by position
            # (indexing by page_number breaks for partial manifests).
            recon_page = reconstructed_only_doc[idx]

            # Raster images first, then vector drawings (saved as PNG clips).
            for key in ("raster_images", "drawings"):
                for inst in p.get(key, []):
                    rect = as_rect(inst["rect"])
                    img_path = inst["file"]

                    if overlay_page is not None:
                        _place_image(overlay_page, img_path, rect, try_opacity=opacity)
                        if draw_inspection_boxes:
                            overlay_page.draw_rect(rect, color=border_rgb, width=border_width)

                    _place_image(recon_page, img_path, rect, try_opacity=None)

        # Save outputs, creating target folders on demand.
        overlay_out = None
        if overlay_doc is not None:
            os.makedirs(os.path.dirname(os.path.abspath(out_overlay_path)), exist_ok=True)
            overlay_doc.save(out_overlay_path)
            overlay_out = os.path.abspath(out_overlay_path)

        os.makedirs(os.path.dirname(os.path.abspath(out_reconstructed_only_path)), exist_ok=True)
        reconstructed_only_doc.save(out_reconstructed_only_path)
    finally:
        # Release both documents even if placing or saving failed.
        if reconstructed_only_doc is not None:
            reconstructed_only_doc.close()
        if overlay_doc is not None:
            overlay_doc.close()

    return overlay_out, os.path.abspath(out_reconstructed_only_path)


# ---------- Example usage ----------
# Paste the extracted artifacts back onto the sample paper and print both output paths.
manifest = "../data/raw_papers/paper_artifacts/artifacts_manifest.json"
original = "../data/raw_papers/June 2018 QP - Paper 1 (H) Edexcel Physics GCSE.pdf"
overlay_pdf, recon_only_pdf = reconstruct_from_manifest(
    manifest,
    original,
    "../data/raw_papers/paper_artifacts/tally/tally_overlay.pdf",
    "../data/raw_papers/paper_artifacts/tally/tally_reconstructed_only.pdf",
    opacity=0.5,
    draw_inspection_boxes=True,
)
print("Overlay PDF:", overlay_pdf)
print("Reconstructed-only PDF:", recon_only_pdf)


[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0002/drawing_001.png
[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0002/drawing_001.png
[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0003/drawing_001.png
[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0003/drawing_001.png
[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0004/drawing_001.png
[WARN] Missing image: ../data/raw_papers/paper_artifacts/page_0004/drawing_001.png
Overlay PDF: /Users/garimajaiswal/Learning/Python/Projects/exam_paper_parser_V1.0/data/raw_papers/paper_artifacts/tally/tally_overlay.pdf
Reconstructed-only PDF: /Users/garimajaiswal/Learning/Python/Projects/exam_paper_parser_V1.0/data/raw_papers/paper_artifacts/tally/tally_reconstructed_only.pdf
