In [None]:
# Install required packages (run once)
#!pip install pymupdf pandas --quiet

!pip install pymupdf pandas --quiet

import fitz  # PyMuPDF
import pandas as pd

def extract_annotations(pdf_path):
    doc = fitz.open(pdf_path)
    annotations = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        annots = page.annots()
        if annots:
            for annot in annots:
                info = {
                    "page": page_num + 1,
                    "type": annot.type[1],  # annotation type name
                    "content": annot.info.get("content", ""),
                    "rect": annot.rect,
                }
                annotations.append(info)
    return annotations

pdf1_path = "pdf1.pdf"
pdf2_path = "pdf2.pdf"

pdf1_annotations = extract_annotations(pdf1_path)
pdf2_annotations = extract_annotations(pdf2_path)

df1 = pd.DataFrame(pdf1_annotations)
df2 = pd.DataFrame(pdf2_annotations)

print("Annotations in PDF 1:")
display(df1)

print("Annotations in PDF 2:")
display(df2)

diff = pd.merge(
    df1.assign(source="PDF 1"),
    df2.assign(source="PDF 2"),
    how='outer',
    on=["page", "type", "content", "rect"],
    indicator=True
)



diff_only_in_pdf1 = diff[diff['_merge'] == 'left_only']
diff_only_in_pdf2 = diff[diff['_merge'] == 'right_only']

print("Annotations only in PDF 1:")
display(diff_only_in_pdf1)

print("Annotations only in PDF 2:")
display(diff_only_in_pdf2)


In [7]:
import fitz  # PyMuPDF
import pandas as pd

def extract_annotations(pdf_path):
    doc = fitz.open(pdf_path)
    annotations = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        annot = page.first_annot
        while annot:
            annot_info = {
                "page": page_num + 1,
                "type": annot.type[1],
                "content": annot.info.get("content", ""),
                "rect": str(annot.rect)  # Convert Rect to string for comparison
            }
            annotations.append(annot_info)
            annot = annot.next
    return pd.DataFrame(annotations)

# Extract annotations from both PDFs
df1 = extract_annotations("pdf1.pdf")
df2 = extract_annotations("pdf2.pdf")

# Add a source column
df1["source"] = "PDF 1"
df2["source"] = "PDF 2"

# Merge with indicator to detect differences
diff = pd.merge(
    df1,
    df2,
    how='outer',
    on=["page", "type", "content", "rect"],
    indicator=True
)

# Extract differences
only_in_pdf1 = diff[diff['_merge'] == 'left_only']
only_in_pdf2 = diff[diff['_merge'] == 'right_only']

# Print results
print("Only in PDF 1:")
print(only_in_pdf1[["page", "type", "content", "rect"]])

print("\nOnly in PDF 2:")
print(only_in_pdf2[["page", "type", "content", "rect"]])


Only in PDF 1:
Empty DataFrame
Columns: [page, type, content, rect]
Index: []

Only in PDF 2:
    page      type                 content  \
0      1  FreeText                     thi   
1      1       Ink                           
2      1       Ink                           
73     1    Square           Wiring change   
74     1    Square                     thc   
75     1      Text  This should be changed   

                                                 rect  
0   Rect(1390.5350341796875, 792.7939453125, 1548....  
1   Rect(1228.864990234375, 589.8170166015625, 156...  
2   Rect(1320.675048828125, 570.928955078125, 1551...  
73  Rect(851.5889892578125, 91.300048828125, 1084....  
74  Rect(1410.9940185546875, 167.264892578125, 156...  
75  Rect(1418.81298828125, 406.41796875, 1440.8129...  


In [9]:
#visual diff
import fitz  # PyMuPDF
from PIL import Image, ImageChops

def pdf_to_images(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    return images

# Load images from two PDFs
imgs1 = pdf_to_images("pdf1.pdf")
imgs2 = pdf_to_images("pdf2.pdf")

# Compare images page by page
diffs = []
for i, (im1, im2) in enumerate(zip(imgs1, imgs2)):
    diff = ImageChops.difference(im1, im2)
    if diff.getbbox():  # Non-empty difference
        diffs.append((i + 1, diff))

# Show or save differences
for page_num, diff_img in diffs:
    print(f"Difference found on page {page_num}")
    diff_img.show()  # or diff_img.save(f"diff_page_{page_num}.png")


Difference found on page 1


In [None]:
#pdf as tree
import fitz  # PyMuPDF
import hashlib

def hash_image(xref, doc):
    """Get a hash for an image XObject by its raw bytes."""
    base_image = doc.extract_image(xref)
    img_bytes = base_image["image"]
    return hashlib.md5(img_bytes).hexdigest()

def extract_pdf_object_tree(pdf_path):
    doc = fitz.open(pdf_path)
    pdf_tree = {
        "pages": [],
        "metadata": doc.metadata
    }
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        page_dict = {
            "page_number": page_num + 1,
            "text_blocks": [],
            "images": [],
            "annotations": [],
            "vector_drawings": []
        }
        
        # Extract text blocks
        # Each block: (x0, y0, x1, y1, "text", block_no, block_type)
        for block in page.get_text("blocks"):
            x0, y0, x1, y1, text, block_no, block_type = block
            page_dict["text_blocks"].append({
                "bbox": [x0, y0, x1, y1],
                "text": text.strip(),
                "block_no": block_no,
                "block_type": block_type
            })
        
        # Extract images
        # Returns list of tuples with image info including xref
        for img_info in page.get_images(full=True):
            xref = img_info[0]
            img_hash = hash_image(xref, doc)
            page_dict["images"].append({
                "xref": xref,
                "hash": img_hash,
                "width": img_info[2],
                "height": img_info[3],
                "bpc": img_info[4],  # bits per component
                "colorspace": img_info[5]
            })
        
        # Extract annotations
        annots = page.annots()
        if annots:
            for annot in annots:
                page_dict["annotations"].append({
                    "type": annot.type[1],
                    "contents": annot.info.get("content", ""),
                    "rect": list(annot.rect)
                })
        
        # Extract vector drawings (lines, curves, fills)
        # Extract vector drawings (lines, curves, fills)
        drawings = page.get_drawings()
        for drawing in drawings:
            bbox = drawing.get("bbox", None)
            items = drawing.get("items", [])
            page_dict["vector_drawings"].append({
                "items": items,
                "bbox": bbox
            })

        pdf_tree["pages"].append(page_dict)
    
    return pdf_tree

if __name__ == "__main__":
    tree = extract_pdf_object_tree("pdf1.pdf")
    import pprint
    pprint.pprint(tree)


In [46]:
import json

if __name__ == "__main__":
    tree = extract_pdf_object_tree("pdf1.pdf")
    
    # Save to JSON file
    with open("pdf_tree.json", "w", encoding="utf-8") as f:
        json.dump(tree, f, indent=2, ensure_ascii=False)
    
    print("PDF tree saved to pdf_tree.json")


TypeError: Object of type Point is not JSON serializable

In [None]:
import fitz

def get_pdf_objects_tree(pdf_path):
    doc = fitz.open(pdf_path)
    pdf_tree = {"pages": []}

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        page_info = {
            "page_number": page_num + 1,
            "objects": []
        }

        # Text blocks
        blocks = page.get_text("blocks")
        for block in blocks:
            # PyMuPDF doesn't expose object ID for text blocks directly,
            # so we list text blocks as type "text" without ID
            page_info["objects"].append({
                "type": "text_block",
                "content": block[4].strip()
            })

        # Images
        for img in page.get_images(full=True):
            xref = img[0]
            # Get basic image metadata
            img_info = doc.extract_image(xref)
            page_info["objects"].append({
                "type": "image",
                "object_id": xref,
                "width": img_info.get("width"),
                "height": img_info.get("height"),
                "colorspace": img_info.get("colorspace")
            })

        # Annotations
        for annot in page.annots() or []:
            page_info["objects"].append({
                "type": "annotation",
                "object_id": annot.xref,
                "subtype": annot.type[1],
                "content": annot.info.get("content", "")
            })

        pdf_tree["pages"].append(page_info)

    return pdf_tree

if __name__ == "__main__":
    import pprint
    tree = get_pdf_objects_tree("pdf2.pdf")
    pprint.pprint(tree)


In [None]:
#not accurate, a lot of noise

def fingerprint_obj(obj):
    """Create a simple fingerprint string for an object for comparison."""
    if obj["type"] == "text_block":
        return f"text:{obj['content'].strip()}"
    elif obj["type"] == "image":
        # Use object_id + dimensions + colorspace as fingerprint
        return f"image:{obj['object_id']}_{obj.get('width')}_{obj.get('height')}_{obj.get('colorspace')}"
    elif obj["type"] == "annotation":
        return f"annot:{obj['object_id']}_{obj.get('subtype')}_{obj.get('content')}"
    else:
        # fallback fingerprint
        return str(obj)

def compare_pdf_trees(tree1, tree2):

    
    diffs = {"only_in_pdf1": [], "only_in_pdf2": []}

    # Build dict by page number for quick lookup
    pages1 = {p["page_number"]: p for p in tree1["pages"]}
    pages2 = {p["page_number"]: p for p in tree2["pages"]}

    all_pages = set(pages1.keys()).union(pages2.keys())

    for page_num in sorted(all_pages):
        objs1 = pages1.get(page_num, {}).get("objects", [])
        objs2 = pages2.get(page_num, {}).get("objects", [])

        # Fingerprint sets for quick diff
        fps1 = {fingerprint_obj(o) for o in objs1}
        fps2 = {fingerprint_obj(o) for o in objs2}

        only1 = fps1 - fps2
        only2 = fps2 - fps1

        if only1:
            diffs["only_in_pdf1"].append({"page": page_num, "objects": list(only1)})
        if only2:
            diffs["only_in_pdf2"].append({"page": page_num, "objects": list(only2)})

    return diffs

if __name__ == "__main__":
    import pprint

    tree1 = get_pdf_objects_tree("pdf1.pdf")
    tree2 = get_pdf_objects_tree("pdf2.pdf")

    diff_tree = compare_pdf_trees(tree1, tree2)
    pprint.pprint(diff_tree)



In [44]:
def fingerprint_obj(obj):
    """Create a simple fingerprint string for an object for comparison."""
    if obj["type"] == "text_block":
        return f"text:{obj['content'].strip()}"
    elif obj["type"] == "image":
        # Use dimensions + colorspace instead of object_id for images too
        return f"image:{obj.get('width')}_{obj.get('height')}_{obj.get('colorspace')}"
    elif obj["type"] == "annotation":
        # Don't use object_id for annotations - use content + position + type
        subtype = obj.get('subtype', 'unknown')
        content = obj.get('content', '').strip()
        
        # Get position info if available
        position = ""
        if 'rect' in obj and obj['rect']:
            rect = obj['rect']
            if isinstance(rect, (list, tuple)) and len(rect) >= 4:
                # Round coordinates to handle minor differences
                position = f"{round(rect[0])},{round(rect[1])},{round(rect[2])},{round(rect[3])}"
        
        # Create fingerprint without object_id
        return f"annot:{subtype}_{content}_{position}"
    else:
        # fallback fingerprint
        return str(obj)

def compare_pdf_trees(tree1, tree2):
    diffs = {"only_in_pdf1": [], "only_in_pdf2": []}
    # Build dict by page number for quick lookup
    pages1 = {p["page_number"]: p for p in tree1["pages"]}
    pages2 = {p["page_number"]: p for p in tree2["pages"]}
    all_pages = set(pages1.keys()).union(pages2.keys())
    
    for page_num in sorted(all_pages):
        objs1 = pages1.get(page_num, {}).get("objects", [])
        objs2 = pages2.get(page_num, {}).get("objects", [])
        
        # Fingerprint sets for quick diff
        fps1 = {fingerprint_obj(o) for o in objs1}
        fps2 = {fingerprint_obj(o) for o in objs2}
        
        only1 = fps1 - fps2
        only2 = fps2 - fps1
        
        if only1:
            diffs["only_in_pdf1"].append({"page": page_num, "objects": list(only1)})
        if only2:
            diffs["only_in_pdf2"].append({"page": page_num, "objects": list(only2)})
    
    return diffs

if __name__ == "__main__":
    import pprint
    tree1 = get_pdf_objects_tree("pdf1.pdf")
    tree2 = get_pdf_objects_tree("pdf2.pdf")
    diff_tree = compare_pdf_trees(tree1, tree2)
    print("new")
    pprint.pprint(diff_tree)

new
{'only_in_pdf1': [],
 'only_in_pdf2': [{'objects': ['annot:Square_thc_',
                               'text:thi',
                               'annot:FreeText_thi_',
                               'annot:Ink__',
                               'annot:Square_Wiring change_',
                               'annot:Text_This should be changed_'],
                   'page': 1}]}


In [42]:
# Extract annotations from PDF tree
def extract_annotations(pdf_tree):
    annotations = []
    for page in pdf_tree["pages"]:
        for obj in page["objects"]:
            if obj["type"] == "annotation":
                annotations.append({
                    "page": page["page_number"],
                    "data": obj
                })
    return annotations

# Store annotations (example with simple dict storage)
def store_annotations(pdf_id, annotations):
    # In real app: database insert
    storage = {"pdf_id": pdf_id, "annotations": annotations}
    return storage

# Combine annotations back into PDF tree
def combine_annotations(pdf_tree, stored_annotations):
    # Create lookup by page
    annot_by_page = {}
    for annot in stored_annotations["annotations"]:
        page = annot["page"]
        if page not in annot_by_page:
            annot_by_page[page] = []
        annot_by_page[page].append(annot["data"])
    
    # Add back to tree
    for page in pdf_tree["pages"]:
        page_num = page["page_number"]
        # Remove existing annotations
        page["objects"] = [obj for obj in page["objects"] if obj["type"] != "annotation"]
        # Add stored annotations
        if page_num in annot_by_page:
            page["objects"].extend(annot_by_page[page_num])
    
    return pdf_tree

# Usage
tree = get_pdf_objects_tree("pdf2.pdf")
annotations = extract_annotations(tree)
stored = store_annotations("pdf3.pdf", annotations)
combined_tree = combine_annotations(tree, stored)

In [43]:
def compare_pdf_trees(tree1, tree2):
    diffs = {"only_in_pdf1": [], "only_in_pdf2": []}
    pages1 = {p["page_number"]: p for p in tree1["pages"]}
    pages2 = {p["page_number"]: p for p in tree2["pages"]}
    all_pages = set(pages1.keys()).union(pages2.keys())

    for page_num in sorted(all_pages):
        objs1 = pages1.get(page_num, {}).get("objects", [])
        objs2 = pages2.get(page_num, {}).get("objects", [])

        # Map fingerprint → object
        fp_map1 = {fingerprint_obj(o): o for o in objs1}
        fp_map2 = {fingerprint_obj(o): o for o in objs2}

        fps1 = set(fp_map1.keys())
        fps2 = set(fp_map2.keys())

        only1 = fps1 - fps2
        only2 = fps2 - fps1

        if only1:
            diffs["only_in_pdf1"].append({
                "page": page_num,
                "objects": [{**fp_map1[fp], "fingerprint": fp} for fp in only1]
            })
        if only2:
            diffs["only_in_pdf2"].append({
                "page": page_num,
                "objects": [{**fp_map2[fp], "fingerprint": fp} for fp in only2]
            })

    return diffs

# Usage



In [25]:
print(annotations)

[{'page': 1, 'data': {'type': 'annotation', 'object_id': 76, 'subtype': 'Square', 'content': '39.30'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 77, 'subtype': 'Square', 'content': '39.38'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 78, 'subtype': 'Square', 'content': '39.52'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 79, 'subtype': 'Square', 'content': 'CL 39.61'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 80, 'subtype': 'Square', 'content': 'CL 39.49'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 81, 'subtype': 'Square', 'content': '39.29'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 82, 'subtype': 'Square', 'content': '39.43'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 83, 'subtype': 'Square', 'content': '39.24'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 84, 'subtype': 'Square', 'content': '39.43'}}, {'page': 1, 'data': {'type': 'annotation', 'object_id': 85, 'subty

In [33]:

doc = fitz.open("pdf3.pdf")

for annot_data in stored["annotations"]:
    page_num = annot_data["page"] - 1
    page = doc[page_num]
    data = annot_data["data"]
    
    # Add annotation
    annot = page.add_rect_annot(fitz.Rect(data.get("x", 0), data.get("y", 0), 
                                         data.get("x", 0) + 50, data.get("y", 0) + 20))
    if "content" in data:
        annot.set_info(content=data["content"])

doc.save("pdf3_with_annotations.pdf")
doc.close()
The method is set_info() with content as a parameter, not set_content().RetryClaude does not have the ability to run the code it generates yet.Claude can make mistakes. Please double-check responses.You’re almost out of usage - your limits will reset at 6:00 PM

SyntaxError: invalid character '’' (U+2019) (3015733762.py, line 16)