In [2]:
import fitz

def get_pdf_objects_tree(pdf_path):
    doc = fitz.open(pdf_path)
    pdf_tree = {"pages": []}

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        page_info = {
            "page_number": page_num + 1,
            "objects": []
        }

        # Text blocks
        blocks = page.get_text("blocks")
        for block in blocks:
            # PyMuPDF doesn't expose object ID for text blocks directly,
            # so we list text blocks as type "text" without ID
            page_info["objects"].append({
                "type": "text_block",
                "content": block[4].strip()
            })

        # Images
        for img in page.get_images(full=True):
            xref = img[0]
            # Get basic image metadata
            img_info = doc.extract_image(xref)
            page_info["objects"].append({
                "type": "image",
                "object_id": xref,
                "width": img_info.get("width"),
                "height": img_info.get("height"),
                "colorspace": img_info.get("colorspace")
            })

        # Annotations
        for annot in page.annots() or []:
            page_info["objects"].append({
                "type": "annotation",
                "object_id": annot.xref,
                "subtype": annot.type[1],
                "content": annot.info.get("content", "")
            })

        pdf_tree["pages"].append(page_info)

    return pdf_tree


def fingerprint_obj(obj):
    """Create a simple fingerprint string for an object for comparison."""
    if obj["type"] == "text_block":
        return f"text:{obj['content'].strip()}"
    elif obj["type"] == "image":
        return f"image:{obj.get('width')}_{obj.get('height')}_{obj.get('colorspace')}"
    elif obj["type"] == "annotation":
        subtype = obj.get('subtype', 'unknown')
        content = obj.get('content', '').strip()
        position = ""
        if 'rect' in obj and obj['rect']:
            rect = obj['rect']
            if isinstance(rect, (list, tuple)) and len(rect) >= 4:
                position = f"{round(rect[0])},{round(rect[1])},{round(rect[2])},{round(rect[3])}"
        return f"annot:{subtype}_{content}_{position}"
    else:
        return str(obj)

def compare_pdf_trees(tree1, tree2):
    """Compare two PDF object trees and return differences including object data and fingerprints."""
    diffs = {"only_in_pdf1": [], "only_in_pdf2": []}
    pages1 = {p["page_number"]: p for p in tree1["pages"]}
    pages2 = {p["page_number"]: p for p in tree2["pages"]}
    all_pages = set(pages1.keys()).union(pages2.keys())

    for page_num in sorted(all_pages):
        objs1 = pages1.get(page_num, {}).get("objects", [])
        objs2 = pages2.get(page_num, {}).get("objects", [])

        fp_map1 = {fingerprint_obj(o): o for o in objs1}
        fp_map2 = {fingerprint_obj(o): o for o in objs2}

        fps1 = set(fp_map1.keys())
        fps2 = set(fp_map2.keys())

        only1 = fps1 - fps2
        only2 = fps2 - fps1

        if only1:
            diffs["only_in_pdf1"].append({
                "page": page_num,
                "objects": [
                    {**fp_map1[fp], "fingerprint": fp} for fp in only1
                ]
            })
        if only2:
            diffs["only_in_pdf2"].append({
                "page": page_num,
                "objects": [
                    {**fp_map2[fp], "fingerprint": fp} for fp in only2
                ]
            })

    return diffs

# Placeholder for PDF parsing function
def get_pdf_objects_tree(filename):
    """Mock function: Replace with real PDF parsing logic to return a tree structure."""
    raise NotImplementedError("Replace with actual PDF parsing function.")

if __name__ == "__main__":
    import pprint

    # Replace with actual calls to your PDF parser
    tree1 = get_pdf_objects_tree("pdf1.pdf")
    tree2 = get_pdf_objects_tree("pdf2.pdf")

    diff_tree = compare_pdf_trees(tree1, tree2)
    pprint.pprint(diff_tree)


NotImplementedError: Replace with actual PDF parsing function.