In [2]:
import copy
import io
import json
from typing import Dict, List, Any, Optional

import pypdf
from pypdf.generic import (
    ArrayObject,
    BooleanObject,
    DictionaryObject,
    FloatObject,
    NameObject,
    NumberObject,
    PdfObject,
    TextStringObject,
)

def extract_annotations_pypdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Collect the annotations whose PDF object number is in ``keep_ids``.

    The file is read into memory before it is parsed. The annotation objects
    returned here are used after this function returns, and they may still
    contain indirect references that pypdf only resolves on demand from the
    reader's stream, so that stream must stay readable once the file handle
    is closed.

    Args:
        pdf_path: Path of the PDF to scan.
        keep_ids: Object numbers (``idnum`` of the entries in ``/Annots``) of
            the annotations to keep. Entries without an ``idnum`` attribute
            get an id of ``None``.

    Returns:
        One dict per matching annotation, in page order, with the keys
        ``source_page`` (0-based), ``object_id``, ``annotation_dict`` (a plain
        ``dict`` snapshot of the annotation), ``annotation_ref``,
        ``raw_annotation`` and ``summary`` (``subtype``, ``rect``, ``content``
        and the 1-based ``page``).

    Raises:
        OSError: If ``pdf_path`` cannot be opened or read. Errors raised while
            pypdf opens the document or walks its pages also propagate; only
            failures on an individual annotation are reported and skipped.
    """
    print(f"Extracting annotations from {pdf_path}")
    extracted_annotations = []

    # Parse from an in-memory copy so the returned objects do not depend on a
    # file handle that is already closed when the caller uses them.
    with open(pdf_path, 'rb') as file:
        pdf_bytes = io.BytesIO(file.read())
    reader = pypdf.PdfReader(pdf_bytes)

    for page_num, page in enumerate(reader.pages):
        if '/Annots' not in page:
            continue

        for annot_ref in page['/Annots']:
            try:
                annot = annot_ref.get_object()
                obj_id = getattr(annot_ref, 'idnum', None)
                if obj_id not in keep_ids:
                    continue

                # Key information used for logging and the JSON summary
                subtype = str(annot.get('/Subtype', 'Unknown'))
                rect = annot.get('/Rect', [0, 0, 0, 0])
                content = str(annot.get('/Contents', ''))

                extracted_annotations.append({
                    'source_page': page_num,
                    'object_id': obj_id,
                    'annotation_dict': dict(annot),  # Complete annotation dictionary
                    'annotation_ref': annot_ref,     # Reference for cloning
                    'raw_annotation': annot,         # Raw annotation object
                    'summary': {
                        'subtype': subtype,
                        'rect': [float(x) for x in rect] if rect else None,
                        'content': content,
                        'page': page_num + 1,
                    },
                })
                print(f"  ✓ Found annotation ID {obj_id}: {subtype} on page {page_num + 1}")

            except Exception as e:
                # Best effort: one unreadable annotation must not abort the scan.
                print(f"  ✗ Error processing annotation: {e}")

    print(f"Extracted {len(extracted_annotations)} annotations")
    return extracted_annotations

def _as_pdf_object(value):
    """Return ``value`` in a form that can be stored in a pypdf dictionary.

    * Lists, tuples and arrays are rebuilt recursively as ``ArrayObject``.
    * Plain mappings and plain ``DictionaryObject`` instances are rebuilt
      recursively as ``DictionaryObject``; specialised subclasses are passed
      through untouched because rebuilding them would drop their extra state.
    * Every other pypdf object is returned unchanged, so names stay names and
      integers stay integers.
    * Plain Python ``bool``/``int``/``float``/``str`` values are wrapped in
      the matching pypdf type.
    * Anything else is returned as-is; storing it in a ``DictionaryObject``
      is then left to fail in the caller, which reports the field.
    """
    if isinstance(value, (list, tuple, ArrayObject)):
        new_array = ArrayObject()
        for item in value:
            new_array.append(_as_pdf_object(item))
        return new_array

    is_pdf_object = isinstance(value, PdfObject)
    if isinstance(value, dict) and (not is_pdf_object or type(value) is DictionaryObject):
        new_dict = DictionaryObject()
        for sub_key, sub_value in value.items():
            new_key = sub_key if isinstance(sub_key, PdfObject) else NameObject(sub_key)
            new_dict[new_key] = _as_pdf_object(sub_value)
        return new_dict

    # Must come before the str/int/float checks: pypdf's name, text and number
    # objects subclass the Python built-ins, and re-wrapping them would change
    # their PDF type (a name such as /Square would be written as a string).
    if is_pdf_object:
        return value

    if isinstance(value, bool):  # bool is an int subclass, so test it first
        return BooleanObject(value)
    if isinstance(value, int):
        return NumberObject(value)
    if isinstance(value, float):
        return FloatObject(value)
    if isinstance(value, str):
        return TextStringObject(value)
    return value


def create_annotation_copy(original_annot: DictionaryObject, writer: pypdf.PdfWriter) -> DictionaryObject:
    """Build a new annotation dictionary with the same entries as the original.

    Containers (arrays and plain dictionaries) are rebuilt at every nesting
    level; leaf values that are already pypdf objects keep their exact type.
    Indirect references are copied as references, not resolved.

    Args:
        original_annot: Annotation dictionary to copy.
        writer: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A new ``DictionaryObject``. A field that cannot be converted is
        reported on stdout and left out, except ``/Rect``, for which a plain
        float conversion is attempted as a fallback.
    """
    new_annot = DictionaryObject()

    for key, value in original_annot.items():
        new_key = key if isinstance(key, PdfObject) else NameObject(key)
        try:
            new_annot[new_key] = _as_pdf_object(value)
        except (TypeError, ValueError) as e:
            print(f"    Warning: Could not copy field {key}: {e}")
            # /Rect is the one field worth a second attempt: coerce every
            # coordinate to a float.
            if key == '/Rect' and isinstance(value, (list, ArrayObject)):
                try:
                    rect_array = ArrayObject()
                    for coord in value:
                        rect_array.append(FloatObject(float(coord)))
                    new_annot[new_key] = rect_array
                    print(f"    ✓ Manually fixed /Rect field")
                except (TypeError, ValueError):
                    print(f"    ✗ Failed to fix /Rect field")

    return new_annot

def merge_annotations_to_pdf(source_pdf: str, target_pdf: str, output_pdf: str,
                           extracted_annotations: List[Dict], target_page: int = 0) -> bool:
    """Copy the extracted annotations onto one page of the target PDF.

    All pages of ``target_pdf`` are copied into a new writer, each annotation
    is copied with ``create_annotation_copy``, attached to the chosen page and
    the result is written to ``output_pdf``.

    Args:
        source_pdf: Not used by this function; kept for existing callers.
        target_pdf: Path of the PDF that receives the annotations.
        output_pdf: Path the merged document is written to.
        extracted_annotations: Items as produced by
            ``extract_annotations_pypdf``; the keys ``raw_annotation``,
            ``summary`` and ``object_id`` are read.
        target_page: 0-based index of the page the annotations are added to.

    Returns:
        True when the output file was written. Annotations that fail to merge
        are reported and skipped without affecting the return value. False
        when the target page does not exist or any other error occurs.
    """
    print(f"\nMerging annotations into {target_pdf}")

    try:
        # Read target PDF
        with open(target_pdf, 'rb') as file:
            reader = pypdf.PdfReader(file)
            writer = pypdf.PdfWriter()

            # Copy all pages from target PDF
            for page in reader.pages:
                writer.add_page(page)

            # Get the target page for annotations
            if target_page >= len(writer.pages):
                print(f"Error: Target page {target_page} doesn't exist. PDF has {len(writer.pages)} pages.")
                return False

            target_page_obj = writer.pages[target_page]
            # /P has to be a reference to the page. Assigning the page object
            # itself would serialise the whole page dictionary inline inside
            # every annotation.
            target_page_ref = getattr(target_page_obj, 'indirect_reference', None)

            # Initialize annotations array if it doesn't exist
            if '/Annots' not in target_page_obj:
                target_page_obj[NameObject('/Annots')] = ArrayObject()

            # Add each extracted annotation
            for i, annot_data in enumerate(extracted_annotations):
                try:
                    # Create a copy of the annotation with proper PDF object types
                    original_annot = annot_data['raw_annotation']
                    new_annot = create_annotation_copy(original_annot, writer)

                    # Ensure the annotation has essential fields
                    if '/Rect' not in new_annot and '/Rect' in original_annot:
                        # Force add the rectangle
                        try:
                            rect_array = ArrayObject()
                            for coord in original_annot['/Rect']:
                                rect_array.append(FloatObject(float(coord)))
                            new_annot[NameObject('/Rect')] = rect_array
                            print(f"    ✓ Force-added /Rect to annotation {i+1}")
                        except Exception as e:
                            print(f"    ✗ Could not force-add /Rect: {e}")

                    # Point the parent page entry at the target page
                    if target_page_ref is not None:
                        new_annot[NameObject('/P')] = target_page_ref
                    else:
                        # No reference available: drop the entry rather than
                        # keep one that still points at the source page.
                        new_annot.pop('/P', None)

                    # Add the annotation to the writer and get its reference
                    annot_ref = writer._add_object(new_annot)

                    # Add the annotation reference to the target page
                    target_page_obj['/Annots'].append(annot_ref)

                    summary = annot_data['summary']
                    print(f"  ✓ Merged annotation {i+1}: {summary['subtype']} "
                          f"(ID: {annot_data['object_id']}) to page {target_page + 1}")

                    # Verify critical fields were preserved
                    critical_fields = ['/Rect', '/Subtype', '/Contents']
                    preserved_fields = [field for field in critical_fields if field in new_annot]
                    print(f"    Fields preserved: {', '.join(preserved_fields)}")

                except Exception as e:
                    # Best effort: report the annotation and carry on.
                    print(f"  ✗ Failed to merge annotation {i+1}: {e}")
                    continue

            # Save the merged PDF while the target file is still open
            with open(output_pdf, 'wb') as output_file:
                writer.write(output_file)

            print(f"\n✓ Successfully saved merged PDF as: {output_pdf}")
            return True

    except Exception as e:
        print(f"✗ Error during merge: {e}")
        return False

def verify_merged_annotations(pdf_path: str) -> bool:
    """Report the annotations stored in a PDF and tell whether any exist.

    For every page that has an ``/Annots`` entry the annotation count is
    printed, followed by subtype, rectangle and a 30-character content
    preview for the first three annotations of that page.

    Returns:
        True when at least one annotation was counted; False when none were
        found or when opening or reading the file raised an exception.
    """
    print(f"\nVerifying annotations in {pdf_path}...")

    try:
        with open(pdf_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            total_annotations = 0

            for page_num, page in enumerate(reader.pages):
                # Pages without annotations contribute nothing to the report
                if '/Annots' not in page:
                    continue

                annots = page['/Annots']
                page_annot_count = len(annots)
                total_annotations += page_annot_count
                print(f"  Page {page_num + 1}: {page_annot_count} annotations")

                # Only the first three annotations are inspected in detail
                for i, annot_ref in enumerate(annots[:3]):
                    try:
                        annot = annot_ref.get_object()
                        subtype = annot.get('/Subtype', 'Unknown')
                        rect = annot.get('/Rect', None)
                        content = annot.get('/Contents', '')

                        if rect:
                            rect_str = f"[{', '.join(f'{float(x):.1f}' for x in rect)}]"
                        else:
                            rect_str = "Missing"
                        print(f"    Ann {i+1}: {subtype}, Rect: {rect_str}, Content: '{str(content)[:30]}...'")
                    except Exception as e:
                        print(f"    Ann {i+1}: Error reading - {e}")

            print(f"  Total annotations found: {total_annotations}")
            return total_annotations > 0

    except Exception as e:
        print(f"  ✗ Verification failed: {e}")
        return False

def save_extraction_summary(annotations: List[Dict], filename: str = "extraction_summary.json"):
    """Write a JSON overview of the extracted annotations to ``filename``.

    Each entry records the annotation's object id, its 1-based source page,
    its ``summary`` dict and the names of all fields in its
    ``annotation_dict``. Values JSON cannot encode are written via ``str``.
    """
    overview = [
        {
            'object_id': entry['object_id'],
            'source_page': entry['source_page'] + 1,  # stored 0-based, reported 1-based
            'summary': entry['summary'],
            'all_fields': list(entry['annotation_dict'].keys()),
        }
        for entry in annotations
    ]

    with open(filename, 'w') as out:
        json.dump(overview, out, indent=2, default=str)
    print(f"✓ Extraction summary saved to: {filename}")

def main():
    """Run the default workflow with the hard-coded configuration below.

    Extracts the configured annotation ids from the source PDF, saves a JSON
    summary, merges the annotations onto the target page of the target PDF
    and prints a report. Failures are reported on stdout; nothing is raised.
    """
    # Configuration
    source_pdf = "pdf2.pdf"                  # PDF to extract annotations from
    target_pdf = "pdf1.pdf"                  # PDF to merge annotations into
    output_pdf = "merged_annotations.pdf"    # Output file
    keep_ids = {146, 151, 158, 163, 170}     # Annotation IDs to extract
    target_page = 0                          # Page to add annotations to (0-based)

    banner = "=" * 60

    print(banner)
    print("PYPDF ANNOTATION EXTRACTION AND MERGER")
    print(banner)
    print(f"Source PDF: {source_pdf}")
    print(f"Target PDF: {target_pdf}")
    print(f"Output PDF: {output_pdf}")
    print(f"Annotation IDs to extract: {sorted(keep_ids)}")
    print(f"Target page for annotations: {target_page + 1}")

    # Step 1: pull the requested annotations out of the source PDF
    try:
        found = extract_annotations_pypdf(source_pdf, keep_ids)

        if not found:
            print("✗ No annotations found with the specified IDs")
            return

        # Keep a reviewable record of what was extracted
        save_extraction_summary(found)

    except Exception as e:
        print(f"✗ Failed to extract annotations: {e}")
        return

    # Step 2: merge them into the target PDF and report the outcome
    try:
        merged_ok = merge_annotations_to_pdf(
            source_pdf, target_pdf, output_pdf,
            found, target_page
        )

        if not merged_ok:
            print("✗ Merge operation failed")
            return

        print("\n" + banner)
        print("SUCCESS!")
        print(banner)
        print(f"✓ Extracted {len(found)} annotations from {source_pdf}")
        print(f"✓ Merged them into {target_pdf}")
        print(f"✓ Saved result as {output_pdf}")
        print(f"✓ All annotations placed on page {target_page + 1}")

        # Re-read the output to confirm the annotations are there
        if verify_merged_annotations(output_pdf):
            print("✓ Annotations are visible in the merged PDF")
        else:
            print("⚠ Warning: Annotations may not be visible - check the PDF manually")

        # List what was merged
        print(f"\nMerged annotations:")
        for position, annot in enumerate(found, start=1):
            summary = annot['summary']
            print(f"  {position}. {summary['subtype']} (ID: {annot['object_id']}) - {summary['content'][:50]}...")

    except Exception as e:
        print(f"✗ Merge failed: {e}")

def merge_to_specific_page():
    """Interactive variant: ask for the target page, then extract and merge.

    Uses the same hard-coded source, target and annotation ids as ``main``
    but writes to ``merged_specific_page.pdf``. The page is entered 1-based;
    values below 1 are clamped to the first page, and input that is not an
    integer (or a closed stdin) falls back to page 1.
    """
    source_pdf = "pdf2.pdf"
    target_pdf = "pdf1.pdf"
    output_pdf = "merged_specific_page.pdf"
    keep_ids = {146, 151, 158, 163, 170}

    # Ask user for target page
    try:
        target_page = int(input("Enter target page number (1-based): ")) - 1
        if target_page < 0:
            target_page = 0
    except (ValueError, EOFError):
        # Only bad or missing input falls back to the default. A bare except
        # would also swallow Ctrl-C (KeyboardInterrupt) and SystemExit.
        target_page = 0
        print("Using default page 1")

    # Extract and merge
    extracted_annotations = extract_annotations_pypdf(source_pdf, keep_ids)
    if extracted_annotations:
        success = merge_annotations_to_pdf(
            source_pdf, target_pdf, output_pdf,
            extracted_annotations, target_page
        )
        if success:
            print(f"✓ Annotations merged to page {target_page + 1} in {output_pdf}")

# Script entry point: run the default extract-and-merge workflow using the
# configuration hard-coded in main().
if __name__ == "__main__":
    main()
    
    # Uncomment the line below if you want to specify a different target page interactively
    # merge_to_specific_page()

PYPDF ANNOTATION EXTRACTION AND MERGER
Source PDF: pdf2.pdf
Target PDF: pdf1.pdf
Output PDF: merged_annotations.pdf
Annotation IDs to extract: [146, 151, 158, 163, 170]
Target page for annotations: 1
Extracting annotations from pdf2.pdf
  ✓ Found annotation ID 146: /Square on page 1
  ✓ Found annotation ID 151: /FreeText on page 1
  ✓ Found annotation ID 158: /Square on page 1
  ✓ Found annotation ID 163: /Text on page 1
  ✓ Found annotation ID 170: /Ink on page 1
Extracted 5 annotations
✓ Extraction summary saved to: extraction_summary.json

Merging annotations into pdf1.pdf
  ✓ Merged annotation 1: /Square (ID: 146) to page 1
    Fields preserved: /Rect, /Subtype, /Contents
  ✓ Merged annotation 2: /FreeText (ID: 151) to page 1
    Fields preserved: /Rect, /Subtype, /Contents
  ✓ Merged annotation 3: /Square (ID: 158) to page 1
    Fields preserved: /Rect, /Subtype, /Contents
  ✓ Merged annotation 4: /Text (ID: 163) to page 1
    Fields preserved: /Rect, /Subtype, /Contents
  ✓ Merge