In [17]:
import fitz

def get_pdf_objects_tree(pdf_path):
    """Build a per-page inventory of text blocks, images and annotations in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        dict: ``{"pages": [{"page_number": int, "objects": [dict, ...]}, ...]}``
        with 1-based page numbers. Every object dict carries a ``"type"`` of
        ``"text_block"``, ``"image"`` or ``"annotation"``. Images and
        annotations also expose their xref as ``"object_id"``; annotations
        additionally carry ``"rect"`` as ``[x0, y0, x1, y1]``.
    """
    pdf_tree = {"pages": []}

    # The context manager releases the file handle even if extraction fails
    # part-way through (the document used to be left open).
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_info = {
                "page_number": page_num + 1,
                "objects": []
            }

            # Text blocks: PyMuPDF doesn't expose an object ID for text blocks
            # directly, so they are listed by content only.
            for block in page.get_text("blocks"):
                page_info["objects"].append({
                    "type": "text_block",
                    "content": block[4].strip()
                })

            # Images: the first item of each entry is the image xref.
            for img in page.get_images(full=True):
                xref = img[0]
                img_info = doc.extract_image(xref)
                page_info["objects"].append({
                    "type": "image",
                    "object_id": xref,
                    "width": img_info.get("width"),
                    "height": img_info.get("height"),
                    "colorspace": img_info.get("colorspace")
                })

            # Annotations: include the rectangle so that a position-aware
            # fingerprint can tell apart annotations that share subtype and
            # content but sit at different places on the page.
            for annot in page.annots() or []:
                page_info["objects"].append({
                    "type": "annotation",
                    "object_id": annot.xref,
                    "subtype": annot.type[1],
                    "content": annot.info.get("content", ""),
                    "rect": list(annot.rect)
                })

            pdf_tree["pages"].append(page_info)

    return pdf_tree


def fingerprint_obj(obj):
    """Return a comparison key (string) for one extracted PDF object.

    Text blocks are keyed by their stripped content, images by
    width/height/colorspace, and annotations by subtype, stripped content and
    the rounded rectangle (when a 4+ element list/tuple ``rect`` is present).
    Any other object type falls back to ``str(obj)``.
    """
    kind = obj["type"]

    if kind == "text_block":
        return f"text:{obj['content'].strip()}"

    if kind == "image":
        return "image:{}_{}_{}".format(
            obj.get('width'), obj.get('height'), obj.get('colorspace')
        )

    if kind != "annotation":
        return str(obj)

    subtype = obj.get('subtype', 'unknown')
    content = obj.get('content', '').strip()

    # Position stays empty unless a usable rectangle is available.
    position = ""
    rect = obj['rect'] if 'rect' in obj else None
    if rect and isinstance(rect, (list, tuple)) and len(rect) >= 4:
        position = ",".join(str(round(coord)) for coord in rect[:4])

    return f"annot:{subtype}_{content}_{position}"

def compare_pdf_trees(tree1, tree2):
    """Compare two PDF object trees page by page.

    Objects are matched by ``fingerprint_obj``. Matching is count-aware: if a
    page holds three objects with the same fingerprint in one tree and only
    one in the other, the two surplus objects are reported (previously the
    duplicates collapsed into a single dict entry and the difference was
    hidden). Reported objects keep the order in which they appear in the
    tree, so the result is deterministic across runs.

    Args:
        tree1: Tree of the form ``{"pages": [{"page_number": ..., "objects": [...]}]}``.
        tree2: Second tree of the same form.

    Returns:
        dict: ``{"only_in_pdf1": [...], "only_in_pdf2": [...]}`` where each
        list item is ``{"page": page_number, "objects": [...]}`` and each
        reported object is a copy of the original dict plus a
        ``"fingerprint"`` key. Pages without differences are omitted.
    """
    from collections import Counter

    def _unmatched(fingerprinted, other_counts):
        # Consume one "credit" from the other side per match; whatever cannot
        # be paired is surplus on this side.
        budget = Counter(other_counts)
        leftovers = []
        for fp, obj in fingerprinted:
            if budget[fp] > 0:
                budget[fp] -= 1
            else:
                leftovers.append({**obj, "fingerprint": fp})
        return leftovers

    diffs = {"only_in_pdf1": [], "only_in_pdf2": []}
    pages1 = {p["page_number"]: p for p in tree1["pages"]}
    pages2 = {p["page_number"]: p for p in tree2["pages"]}

    for page_num in sorted(set(pages1) | set(pages2)):
        # A page missing from one tree is treated as having no objects.
        objs1 = pages1.get(page_num, {}).get("objects", [])
        objs2 = pages2.get(page_num, {}).get("objects", [])

        fps1 = [(fingerprint_obj(o), o) for o in objs1]
        fps2 = [(fingerprint_obj(o), o) for o in objs2]

        only1 = _unmatched(fps1, Counter(fp for fp, _ in fps2))
        only2 = _unmatched(fps2, Counter(fp for fp, _ in fps1))

        if only1:
            diffs["only_in_pdf1"].append({"page": page_num, "objects": only1})
        if only2:
            diffs["only_in_pdf2"].append({"page": page_num, "objects": only2})

    return diffs



# Cell driver: build the object tree of both PDFs and print their differences.
# In a notebook this guard is always true, so tree1, tree2 and diff_tree end
# up as kernel-level variables.
if __name__ == "__main__":
    import pprint

    # Input file names are hard-coded; point them at the two PDFs to compare
    tree1 = get_pdf_objects_tree("pdf1.pdf")
    tree2 = get_pdf_objects_tree("merged_annotations_enhanced.pdf")

    # Empty lists under both keys mean no fingerprint-level difference was found
    diff_tree = compare_pdf_trees(tree1, tree2)
    pprint.pprint(diff_tree)


MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

{'only_in_pdf1': [], 'only_in_pdf2': []}


In [5]:
import fitz  # PyMuPDF

# Objects to keep on page 1 (filter_pdf_keep_only only processes doc[0]).
# Annotation entries are matched against the PDF through 'object_id' (compared
# with annot.xref); the remaining keys describe how the object is redrawn.
# NOTE(review): the rect/position/colour values below are placeholders, not
# values read from the PDF — replace them before relying on the output.
keep_objects = [
    {'content': 'This should be changed',
     'object_id': 163,
     'subtype': 'Text',
     'type': 'annotation',
     'rect': [100, 100, 300, 140],  # example rect - replace with the actual one
     'font': 'helv',
     'font_size': 12,
     'text_color': [0, 0, 0],
     'fill_color': [1, 1, 1],
     'stroke_color': [0, 0, 0]
    },
    {'content': 'Wiring change',
     'object_id': 146,
     'subtype': 'Square',
     'type': 'annotation',
     'rect': [120, 150, 280, 200],
     'stroke_color': [1, 0, 0]
    },
    {'content': '',
     'object_id': 170,
     'subtype': 'Ink',
     'type': 'annotation',
     # No rect or path data: the Ink branch of filter_pdf_keep_only draws nothing
    },
    {'content': 'thi',
     'object_id': 151,
     'subtype': 'FreeText',
     'type': 'annotation',
     'rect': [50, 50, 100, 80],
     'font': 'helv',
     'font_size': 12,
     'text_color': [0, 0, 0],
     'fill_color': [1, 1, 1],
     'stroke_color': [0, 0, 0]
    },
    {'content': 'thc',
     'object_id': 158,
     'subtype': 'Square',
     'type': 'annotation',
     'rect': [200, 200, 250, 250],
     'stroke_color': [0, 0, 1]
    },
    # text_block entries have no object_id; they are only (re)inserted as text
    {'content': 'thi',
     'type': 'text_block',
     'position': (72, 72),  # example insertion point for the text
     'font': 'helv',
     'font_size': 12,
     'color': [0, 0, 0]
    }
]

def rgb_to_fitz(color):
    """Return ``color`` as a tuple, or black ``(0, 0, 0)`` when it is missing/empty."""
    return tuple(color) if color else (0, 0, 0)

def filter_pdf_keep_only(input_path, output_path, keep_objects):
    """Strip unwanted annotations from the first page and redraw the kept objects.

    Only ``doc[0]`` is processed. Annotations whose xref is not listed as an
    ``object_id`` in ``keep_objects`` are deleted; kept annotations stay in
    place and are additionally drawn as page content (rectangles / text).

    Args:
        input_path: PDF to read.
        output_path: Where the modified PDF is saved.
        keep_objects: List of dicts with a ``'type'`` of ``'annotation'``
            (needs ``'subtype'``; optional ``'rect'``, colours, font keys) or
            ``'text_block'`` (optional ``'position'``, font keys, ``'color'``).

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        line is printed.
    """
    # The context manager closes the document even when drawing or saving fails.
    with fitz.open(input_path) as doc:
        page = doc[0]  # assuming only one page for this example

        # Xrefs of the annotations that must survive
        keep_ids = {obj["object_id"] for obj in keep_objects if "object_id" in obj}

        # Snapshot the annotations first: deleting while the generator is
        # still being consumed is not safe.
        for annot in list(page.annots() or []):
            if annot.xref not in keep_ids:
                page.delete_annot(annot)

        # NOTE: clean_contents() only normalises/consolidates the page's
        # content streams; it does NOT remove existing text, images or
        # drawings. Existing page content therefore remains in the output.
        page.clean_contents()

        # Redraw the objects described in keep_objects
        for obj in keep_objects:
            if obj['type'] == 'annotation':
                rect = fitz.Rect(obj.get('rect', [50, 50, 150, 80]))
                subtype = obj['subtype']
                if subtype in ('Text', 'FreeText'):
                    # A missing/empty fill means "no fill". Defaulting it to
                    # black (as before) painted a black box that hid the
                    # default black text.
                    fill_color = obj.get('fill_color')
                    fill = tuple(fill_color) if fill_color else None
                    stroke = rgb_to_fitz(obj.get('stroke_color'))
                    page.draw_rect(rect, fill=fill, color=stroke)

                    # Insert text inside rect
                    page.insert_textbox(
                        rect,
                        obj.get('content', ''),
                        fontname=obj.get('font', 'helv'),
                        fontsize=obj.get('font_size', 12),
                        color=rgb_to_fitz(obj.get('text_color')),
                        align=0
                    )
                elif subtype == 'Square':
                    stroke = rgb_to_fitz(obj.get('stroke_color'))
                    page.draw_rect(rect, color=stroke, width=1)
                elif subtype == 'Ink':
                    # No path data is available here, so nothing is drawn
                    pass
            elif obj['type'] == 'text_block':
                pos = obj.get('position', (72, 72))
                page.insert_text(
                    pos,
                    obj.get('content', ''),
                    fontsize=obj.get('font_size', 12),
                    fontname=obj.get('font', 'helv'),
                    color=rgb_to_fitz(obj.get('color'))
                )

        doc.save(output_path)

    print(f"Filtered PDF saved to {output_path}")

# Cell driver: filter pdf2.pdf down to the objects listed in keep_objects.
if __name__ == "__main__":
    filter_pdf_keep_only(
        input_path="pdf2.pdf",
        output_path="pdf2_filtered_only_keep.pdf",
        keep_objects=keep_objects,
    )


Filtered PDF saved to pdf2_filtered_only_keep.pdf


In [6]:
import fitz  # PyMuPDF

def rgb_to_list(color):
    """Convert a colour value to a plain list.

    ``None`` stays ``None``; lists and tuples are copied into a list; any
    other iterable is converted with ``list()``; values that cannot be
    iterated (e.g. an int) yield ``None``.
    """
    if color is None:
        return None

    if not isinstance(color, (list, tuple)):
        # Best-effort conversion for other colour representations
        try:
            return list(color)
        except Exception:
            return None

    return list(color)

def extract_annotations_by_ids(pdf_path, keep_ids):
    """Collect a description of every annotation whose xref is in ``keep_ids``.

    Args:
        pdf_path: PDF to read; all pages are scanned.
        keep_ids: Iterable of annotation xref numbers to extract.

    Returns:
        list[dict]: One dict per matching annotation, in page order, with the
        keys ``object_id``, ``type`` (always ``'annotation'``), ``subtype``,
        ``rect`` (``[x0, y0, x1, y1]``), ``content``, ``stroke_color``,
        ``fill_color`` and ``text_color``. ``Text``/``FreeText`` entries also
        get ``font`` and ``font_size``, which are fixed defaults rather than
        values read from the PDF.
    """
    wanted = set(keep_ids)  # O(1) membership whatever container was passed
    extracted = []

    # The context manager closes the document once extraction is finished
    # (it used to be left open).
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc[page_num]
            for annot in list(page.annots() or []):
                if annot.xref not in wanted:
                    continue

                info = annot.info
                # Read the colour dict once instead of once per channel
                colors = (annot.colors if hasattr(annot, 'colors') else None) or {}

                annot_dict = {}
                annot_dict['object_id'] = annot.xref
                annot_dict['type'] = 'annotation'
                # Falls back to the type name when info has no "subtype" entry
                annot_dict['subtype'] = info.get("subtype", "") or annot.type[1]
                annot_dict['rect'] = list(annot.rect)  # [x0, y0, x1, y1]

                # NOTE(review): when "content" is empty this falls back to
                # info["title"], which PyMuPDF conventionally uses for the
                # author name — confirm that this fallback is intended.
                annot_dict['content'] = info.get("content", "") or info.get("title", "") or ""

                # Colours (None when the channel is not reported)
                annot_dict['stroke_color'] = rgb_to_list(colors.get('stroke', None))
                annot_dict['fill_color'] = rgb_to_list(colors.get('fill', None))
                annot_dict['text_color'] = rgb_to_list(colors.get('text', None))

                # Font and font size - only for text / freetext types.
                # These are hard-coded defaults, not extracted values.
                if annot_dict['subtype'] in ('Text', 'FreeText'):
                    annot_dict['font'] = 'helv'  # default Helvetica
                    annot_dict['font_size'] = 12  # default size

                extracted.append(annot_dict)

    return extracted


# Cell driver: pull the annotations to keep out of pdf2.pdf and print them.
# keep_ids and annots_data become kernel-level variables; annots_data is
# consumed later by the add_annotations_to_pdf(...) call.
if __name__ == "__main__":
    # Example: annotation xrefs to keep
    keep_ids = {146, 151, 158, 163, 170}
    annots_data = extract_annotations_by_ids("pdf2.pdf", keep_ids)
    # One dict per line
    for a in annots_data:
        print(a)


{'object_id': 146, 'type': 'annotation', 'subtype': 'Square', 'rect': [851.5889892578125, 91.300048828125, 1084.175048828125, 666.6429443359375], 'content': 'Wiring change', 'stroke_color': [1.0, 0.8313725590705872, 0.0], 'fill_color': [], 'text_color': None}
{'object_id': 151, 'type': 'annotation', 'subtype': 'FreeText', 'rect': [1390.5350341796875, 792.7939453125, 1548.5350341796875, 974.7939453125], 'content': 'thi', 'stroke_color': [], 'fill_color': [], 'text_color': None, 'font': 'helv', 'font_size': 12}
{'object_id': 158, 'type': 'annotation', 'subtype': 'Square', 'rect': [1410.9940185546875, 167.264892578125, 1560.9549560546875, 455.6519775390625], 'content': 'thc', 'stroke_color': [1.0, 0.4000000059604645, 0.4000000059604645], 'fill_color': [], 'text_color': None}
{'object_id': 163, 'type': 'annotation', 'subtype': 'Text', 'rect': [1418.81298828125, 406.41796875, 1440.81298828125, 428.41796875], 'content': 'This should be changed', 'stroke_color': [1.0, 0.8313725590705872, 0.0]

In [12]:
from pdfrw import PdfReader, PdfWriter, PdfDict, PdfArray, PdfName, PdfString

def color_to_pdfarray(color):
    """Wrap a non-empty, all-numeric colour sequence in a ``PdfArray``.

    Returns ``None`` when ``color`` is missing/empty or contains any element
    that is not an ``int`` or ``float``.
    """
    if not color:
        return None

    for component in color:
        if not isinstance(component, (int, float)):
            return None

    return PdfArray(color)

def create_square_annot(obj):
    """Build a ``/Square`` annotation dictionary from an annotation description.

    Args:
        obj: Dict with a required ``'rect'`` and optional ``'content'``,
            ``'stroke_color'`` and ``'fill_color'``.

    Returns:
        PdfDict: The annotation, with ``/C`` set from the stroke colour and
        ``/IC`` from the fill colour when those are usable numeric sequences.

    Raises:
        KeyError: If ``obj`` has no ``'rect'``.
    """
    annot = PdfDict(
        Type=PdfName.Annot,
        Subtype=PdfName.Square,
        Rect=PdfArray(obj['rect']),
        # PdfString(text) is emitted verbatim, i.e. without the PDF string
        # delimiters/escaping; encode() produces a valid "(...)" literal.
        Contents=PdfString.encode(obj.get('content', '')),
        F=4,  # Print flag
        Border=PdfArray([0, 0, 2])  # border width 2
    )
    stroke = color_to_pdfarray(obj.get('stroke_color'))
    if stroke:
        annot.C = stroke
    fill = color_to_pdfarray(obj.get('fill_color'))
    if fill:
        annot.IC = fill  # interior (fill) colour
    return annot

def create_freetext_annot(obj):
    """Build a ``/FreeText`` annotation dictionary from an annotation description.

    The default-appearance string (``/DA``) is assembled from ``'font'``
    (default ``'Helv'``), ``'font_size'`` (default 12) and the text colour,
    falling back to the stroke colour and then to black.

    Args:
        obj: Dict with a required ``'rect'`` and optional ``'content'``,
            ``'font'``, ``'font_size'``, ``'text_color'``, ``'stroke_color'``
            and ``'fill_color'``.

    Returns:
        PdfDict: The annotation dictionary.

    Raises:
        KeyError: If ``obj`` has no ``'rect'``.
        ValueError: If the chosen colour has 2 components (cannot be unpacked
            as RGB).
    """
    color = obj.get('text_color') or obj.get('stroke_color') or [0, 0, 0]
    # Pick the colour operator that matches the component count so that gray
    # (1 component) no longer fails on unpacking and CMYK (4 components) is
    # no longer truncated to three values.
    if len(color) == 1:
        color_ops = f"{color[0]} g"
    elif len(color) == 4:
        c, m, y, k = color
        color_ops = f"{c} {m} {y} {k} k"
    else:
        r, g, b = color[:3]
        color_ops = f"{r} {g} {b} rg"
    da = f"/{obj.get('font', 'Helv')} {obj.get('font_size', 12)} Tf {color_ops}"

    annot = PdfDict(
        Type=PdfName.Annot,
        Subtype=PdfName.FreeText,
        Rect=PdfArray(obj['rect']),
        # PdfString(text) is emitted verbatim, i.e. without the PDF string
        # delimiters/escaping; encode() produces a valid "(...)" literal.
        Contents=PdfString.encode(obj.get('content', '')),
        DA=PdfString.encode(da),
        F=4,  # Print flag
        Border=PdfArray([0, 0, 1])
    )
    # Optional colors
    stroke = color_to_pdfarray(obj.get('stroke_color'))
    if stroke:
        annot.C = stroke
    fill = color_to_pdfarray(obj.get('fill_color'))
    if fill:
        annot.IC = fill
    return annot

def create_text_annot(obj):
    """Build a ``/Text`` ("sticky note") annotation dictionary.

    Args:
        obj: Dict with a required ``'rect'`` and optional ``'content'`` and
            ``'stroke_color'``.

    Returns:
        PdfDict: The annotation, using the ``/Note`` icon name; ``/C`` is set
        when the stroke colour is a usable numeric sequence.

    Raises:
        KeyError: If ``obj`` has no ``'rect'``.
    """
    # Text annotations are typically "sticky notes" with icons, small rect
    annot = PdfDict(
        Type=PdfName.Annot,
        Subtype=PdfName.Text,
        Rect=PdfArray(obj['rect']),
        # PdfString(text) is emitted verbatim, i.e. without the PDF string
        # delimiters/escaping; encode() produces a valid "(...)" literal.
        Contents=PdfString.encode(obj.get('content', '')),
        Name=PdfName.Note,
        F=4,  # Print flag
    )
    stroke = color_to_pdfarray(obj.get('stroke_color'))
    if stroke:
        annot.C = stroke
    return annot

def create_ink_annot(obj):
    """Build an ``/Ink`` annotation dictionary with a placeholder stroke.

    No real path data is available in ``obj``, so the ink list contains a
    single straight stroke from ``(rect[0], rect[1])`` to
    ``(rect[2], rect[3])``, i.e. the rectangle's diagonal.

    Args:
        obj: Dict with a required ``'rect'`` and optional ``'content'`` and
            ``'stroke_color'``.

    Returns:
        PdfDict: The annotation; ``/C`` is set when the stroke colour is a
        usable numeric sequence.

    Raises:
        KeyError: If ``obj`` has no ``'rect'``.
    """
    rect = obj['rect']
    # InkList is an array of strokes; each stroke is a flat array of
    # alternating x, y values (x0 y0 x1 y1 ...).
    inklist = PdfArray([
        PdfArray([
            rect[0], rect[1],
            rect[2], rect[3]
        ])
    ])
    annot = PdfDict(
        Type=PdfName.Annot,
        Subtype=PdfName.Ink,
        Rect=PdfArray(rect),
        # PdfString(text) is emitted verbatim, i.e. without the PDF string
        # delimiters/escaping; encode() produces a valid "(...)" literal.
        Contents=PdfString.encode(obj.get('content', '')),
        InkList=inklist,
        F=4,  # Print flag
        Border=PdfArray([0, 0, 2])
    )
    stroke = color_to_pdfarray(obj.get('stroke_color'))
    if stroke:
        annot.C = stroke
    return annot

def add_annotations_to_pdf(input_pdf_path, output_pdf_path, annotations, page_index=0):
    """Append annotations to one page of a PDF and write the result.

    Args:
        input_pdf_path: PDF to read.
        output_pdf_path: Where the annotated PDF is written.
        annotations: Iterable of dicts. Entries whose ``'type'`` is not
            ``'annotation'`` are ignored; entries with an unsupported
            ``'subtype'`` are reported on stdout and skipped.
        page_index: Zero-based index of the page that receives the
            annotations. Defaults to 0 (the first page), which matches the
            previous hard-coded behaviour.

    Returns:
        None. A confirmation line is printed after writing.

    Raises:
        IndexError: If ``page_index`` is outside the document's page range.
    """
    # One builder per supported subtype
    builders = {
        'Square': create_square_annot,
        'FreeText': create_freetext_annot,
        'Text': create_text_annot,
        'Ink': create_ink_annot,
    }

    pdf = PdfReader(input_pdf_path)
    page = pdf.pages[page_index]

    # Make sure the page has an annotation array to append to
    if getattr(page, 'Annots', None) is None:
        page.Annots = PdfArray()

    for obj in annotations:
        if obj.get('type') != 'annotation':
            continue

        subtype = obj.get('subtype')
        build = builders.get(subtype)
        if build is None:
            print(f"Unsupported subtype: {subtype}")
            continue

        page.Annots.append(build(obj))

    PdfWriter(output_pdf_path, trailer=pdf).write()
    print(f"Saved annotated PDF as {output_pdf_path}")




# Requires annots_data from the extract_annotations_by_ids cell to be in the
# kernel namespace (run that cell first).
add_annotations_to_pdf(
    input_pdf_path="pdf1.pdf",
    output_pdf_path="pdf1_with_annotations.pdf",
    annotations=annots_data,
)


Saved annotated PDF as pdf1_with_annotations.pdf
