In [4]:
import pypdf
from pypdf.generic import DictionaryObject, ArrayObject, FloatObject, NameObject, TextStringObject, IndirectObject
import json
from typing import Dict, List, Any, Optional
import copy

def extract_annotations_pypdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Extract specific annotations using pypdf with full data preservation"""
    print(f"Extracting annotations from {pdf_path}")
    extracted_annotations = []
    
    with open(pdf_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        
        for page_num, page in enumerate(reader.pages):
            if '/Annots' in page:
                annots = page['/Annots']
                
                for annot_ref in annots:
                    try:
                        annot = annot_ref.get_object()
                        obj_id = getattr(annot_ref, 'idnum', None)
                        
                        if obj_id in keep_ids:
                            # Store the complete annotation data
                            annotation_data = {
                                'source_page': page_num,
                                'object_id': obj_id,
                                'annotation_dict': dict(annot),
                                'annotation_ref': annot_ref,
                                'raw_annotation': annot
                            }
                            
                            # Extract key information for logging
                            subtype = str(annot.get('/Subtype', 'Unknown'))
                            rect = annot.get('/Rect', [0, 0, 0, 0])
                            content = str(annot.get('/Contents', ''))
                            
                            annotation_data['summary'] = {
                                'subtype': subtype,
                                'rect': [float(x) for x in rect] if rect else None,
                                'content': content,
                                'page': page_num + 1
                            }
                            
                            extracted_annotations.append(annotation_data)
                            print(f"  ✓ Found annotation ID {obj_id}: {subtype} on page {page_num + 1}")
                            
                    except Exception as e:
                        print(f"  ✗ Error processing annotation: {e}")
    
    print(f"Extracted {len(extracted_annotations)} annotations")
    return extracted_annotations

def ensure_annotation_visibility(annot: DictionaryObject) -> DictionaryObject:
    """Ensure annotation has all fields needed for visibility"""
    
    # Ensure /F (flags) field for visibility
    if NameObject('/F') not in annot:
        # Set flags: Print (bit 2) = 4, NoView (bit 5) = 32
        # We want Print=True, NoView=False, so flags = 4
        annot[NameObject('/F')] = FloatObject(4)
    
    # Ensure proper appearance for certain annotation types
    subtype = annot.get('/Subtype', '')
    
    if subtype == NameObject('/Square') or subtype == '/Square':
        # Ensure border style exists
        if NameObject('/BS') not in annot:
            border_style = DictionaryObject()
            border_style[NameObject('/W')] = FloatObject(1.0)  # Width
            border_style[NameObject('/S')] = NameObject('/S')  # Solid
            annot[NameObject('/BS')] = border_style
        
        # Ensure color exists
        if NameObject('/C') not in annot:
            # Default to red color [1, 0, 0]
            color_array = ArrayObject()
            color_array.extend([FloatObject(1.0), FloatObject(0.0), FloatObject(0.0)])
            annot[NameObject('/C')] = color_array
    
    elif subtype == NameObject('/FreeText') or subtype == '/FreeText':
        # Ensure text appearance
        if NameObject('/DA') not in annot:
            # Default appearance string
            annot[NameObject('/DA')] = TextStringObject('0 0 0 rg /Helv 12 Tf')
        
        # Ensure alignment
        if NameObject('/Q') not in annot:
            annot[NameObject('/Q')] = FloatObject(0)  # Left aligned
    
    elif subtype == NameObject('/Text') or subtype == '/Text':
        # Ensure icon name
        if NameObject('/Name') not in annot:
            annot[NameObject('/Name')] = NameObject('/Note')
        
        # Ensure state
        if NameObject('/State') not in annot:
            annot[NameObject('/State')] = TextStringObject('Unmarked')
    
    elif subtype == NameObject('/Ink') or subtype == '/Ink':
        # Ensure border style for ink annotations
        if NameObject('/BS') not in annot:
            border_style = DictionaryObject()
            border_style[NameObject('/W')] = FloatObject(1.0)
            annot[NameObject('/BS')] = border_style
        
        # Ensure color
        if NameObject('/C') not in annot:
            color_array = ArrayObject()
            color_array.extend([FloatObject(0.0), FloatObject(0.0), FloatObject(1.0)])  # Blue
            annot[NameObject('/C')] = color_array
    
    return annot

def create_annotation_copy(original_annot: DictionaryObject, writer: pypdf.PdfWriter) -> DictionaryObject:
    """Create a proper copy of an annotation dictionary for merging"""
    new_annot = DictionaryObject()
    
    # Copy all fields from original annotation with proper type conversion
    for key, value in original_annot.items():
        try:
            if isinstance(value, (list, ArrayObject)):
                # Handle arrays (like /Rect, /C, /Vertices, /InkList, etc.)
                new_array = ArrayObject()
                for item in value:
                    if isinstance(item, (int, float)):
                        new_array.append(FloatObject(float(item)))
                    elif isinstance(item, (list, ArrayObject)):
                        # Handle nested arrays (like in /InkList)
                        nested_array = ArrayObject()
                        for nested_item in item:
                            nested_array.append(FloatObject(float(nested_item)))
                        new_array.append(nested_array)
                    else:
                        new_array.append(item)
                new_annot[key] = new_array
                
            elif isinstance(value, (dict, DictionaryObject)):
                # Handle dictionaries (like /AP, /BS, etc.)
                new_dict = DictionaryObject()
                for sub_key, sub_value in value.items():
                    if isinstance(sub_value, (int, float)):
                        new_dict[sub_key] = FloatObject(float(sub_value))
                    elif isinstance(sub_value, str):
                        new_dict[sub_key] = TextStringObject(sub_value)
                    else:
                        new_dict[sub_key] = sub_value
                new_annot[key] = new_dict
                
            elif isinstance(value, (int, float)):
                # Convert numbers to proper PDF objects
                new_annot[key] = FloatObject(float(value))
                
            elif isinstance(value, str):
                # Convert strings to proper PDF objects
                new_annot[key] = TextStringObject(value)
                
            else:
                # Copy other values directly
                new_annot[key] = value
                
        except Exception as e:
            print(f"    Warning: Could not copy field {key}: {e}")
            # For critical fields, try alternative approaches
            if key == NameObject('/Rect') and isinstance(value, (list, ArrayObject)):
                try:
                    # Manually create rect array
                    rect_array = ArrayObject()
                    for coord in value:
                        rect_array.append(FloatObject(float(coord)))
                    new_annot[key] = rect_array
                    print(f"    ✓ Manually fixed /Rect field")
                except:
                    print(f"    ✗ Failed to fix /Rect field")
            continue
    
    # Enhance annotation for visibility
    new_annot = ensure_annotation_visibility(new_annot)
    
    return new_annot

def merge_annotations_to_pdf(source_pdf: str, target_pdf: str, output_pdf: str,
                              extracted_annotations: List[Dict], target_page: int = 0) -> bool:
    """Merge extracted annotations into target PDF with enhanced visibility"""
    print(f"\nMerging annotations into {target_pdf}")
    
    try:
        # Read the target PDF
        with open(target_pdf, 'rb') as file:
            reader = pypdf.PdfReader(file)
            writer = pypdf.PdfWriter()

            # Copy pages to writer
            for page in reader.pages:
                writer.add_page(page)

            if target_page >= len(writer.pages):
                print(f"Error: Target page {target_page} doesn't exist.")
                return False

            page_obj = writer.pages[target_page]

            # Ensure /Annots array exists
            if "/Annots" not in page_obj:
                page_obj[NameObject("/Annots")] = ArrayObject()

            # Process and add each annotation
            for i, annot_data in enumerate(extracted_annotations):
                original_annot = annot_data['raw_annotation']

                # Deep copy and convert all necessary fields
                new_annot = create_annotation_copy(original_annot, writer)

                # Ensure /Rect exists
                if NameObject("/Rect") not in new_annot:
                    rect = original_annot.get("/Rect", [100, 100, 200, 200])  # default box
                    new_annot[NameObject("/Rect")] = ArrayObject([FloatObject(float(x)) for x in rect])

                # Remove /P if present (avoid broken references)
                if NameObject("/P") in new_annot:
                    del new_annot[NameObject("/P")]

                # Add annotation as indirect object
                annot_ref = writer._add_object(new_annot)
                page_obj["/Annots"].append(annot_ref)

                summary = annot_data.get('summary', {})
                print(f"  ✓ Added annotation {i+1}: {summary.get('subtype')} on page {target_page + 1}")

        # Write the output
        with open(output_pdf, 'wb') as out_file:
            writer.write(out_file)

        print(f"✓ Finished writing {output_pdf}")
        return True

    except Exception as e:
        print(f"✗ Merge failed: {e}")
        return False


def create_test_annotations_pdf(output_pdf: str = "test_annotations.pdf"):
    """Create a test PDF with visible annotations for comparison"""
    print(f"\nCreating test PDF with visible annotations: {output_pdf}")
    
    try:
        writer = pypdf.PdfWriter()
        
        # Create a blank page
        from pypdf.pdf import PageObject
        page = PageObject.create_blank_page(width=612, height=792)  # Letter size
        
        # Create test annotations
        annotations = []
        
        # 1. Square annotation (visible rectangle)
        square_annot = DictionaryObject()
        square_annot[NameObject('/Type')] = NameObject('/Annot')
        square_annot[NameObject('/Subtype')] = NameObject('/Square')
        square_annot[NameObject('/Rect')] = ArrayObject([
            FloatObject(100), FloatObject(600), FloatObject(200), FloatObject(700)
        ])
        square_annot[NameObject('/C')] = ArrayObject([
            FloatObject(1.0), FloatObject(0.0), FloatObject(0.0)  # Red
        ])
        square_annot[NameObject('/BS')] = DictionaryObject({
            NameObject('/W'): FloatObject(2.0),
            NameObject('/S'): NameObject('/S')
        })
        square_annot[NameObject('/Contents')] = TextStringObject('Test Square')
        square_annot[NameObject('/F')] = FloatObject(4)  # Print flag
        annotations.append(square_annot)
        
        # 2. FreeText annotation
        text_annot = DictionaryObject()
        text_annot[NameObject('/Type')] = NameObject('/Annot')
        text_annot[NameObject('/Subtype')] = NameObject('/FreeText')
        text_annot[NameObject('/Rect')] = ArrayObject([
            FloatObject(250), FloatObject(600), FloatObject(400), FloatObject(650)
        ])
        text_annot[NameObject('/Contents')] = TextStringObject('Test FreeText')
        text_annot[NameObject('/DA')] = TextStringObject('1 0 0 rg /Helv 12 Tf')  # Red text
        text_annot[NameObject('/Q')] = FloatObject(0)  # Left aligned
        text_annot[NameObject('/F')] = FloatObject(4)
        annotations.append(text_annot)
        
        # 3. Text (Note) annotation
        note_annot = DictionaryObject()
        note_annot[NameObject('/Type')] = NameObject('/Annot')
        note_annot[NameObject('/Subtype')] = NameObject('/Text')
        note_annot[NameObject('/Rect')] = ArrayObject([
            FloatObject(450), FloatObject(600), FloatObject(470), FloatObject(620)
        ])
        note_annot[NameObject('/Contents')] = TextStringObject('Test Note')
        note_annot[NameObject('/Name')] = NameObject('/Note')
        note_annot[NameObject('/F')] = FloatObject(4)
        annotations.append(note_annot)
        
        # Add annotations to page
        annot_array = ArrayObject()
        for annot in annotations:
            annot_ref = writer._add_object(annot)
            annot_array.append(annot_ref)
        
        page[NameObject('/Annots')] = annot_array
        writer.add_page(page)
        
        # Save test PDF
        with open(output_pdf, 'wb') as f:
            writer.write(f)
        
        print(f"✓ Test PDF created: {output_pdf}")
        print("  - Red square at top-left")
        print("  - Red text in middle")
        print("  - Note icon on right")
        print("  Open this PDF to verify annotations are visible in your viewer")
        
    except Exception as e:
        print(f"✗ Failed to create test PDF: {e}")

def verify_merged_annotations(pdf_path: str) -> bool:
    """Verify that annotations are visible in the merged PDF"""
    print(f"\nVerifying annotations in {pdf_path}...")
    
    try:
        with open(pdf_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            
            total_annotations = 0
            for page_num, page in enumerate(reader.pages):
                if '/Annots' in page:
                    annots = page['/Annots']
                    page_annot_count = len(annots)
                    total_annotations += page_annot_count
                    
                    print(f"  Page {page_num + 1}: {page_annot_count} annotations")
                    
                    # Check first few annotations for essential data
                    for i, annot_ref in enumerate(annots[-5:]):  # Check last 5 (our merged ones)
                        try:
                            annot = annot_ref.get_object()
                            subtype = annot.get('/Subtype', 'Unknown')
                            rect = annot.get('/Rect', None)
                            content = annot.get('/Contents', '')
                            flags = annot.get('/F', 'None')
                            color = annot.get('/C', 'None')
                            
                            rect_str = f"[{', '.join(f'{float(x):.1f}' for x in rect)}]" if rect else "Missing"
                            print(f"    Ann {i+1}: {subtype}, Rect: {rect_str}")
                            print(f"           Content: '{str(content)[:30]}...', Flags: {flags}, Color: {color}")
                            
                        except Exception as e:
                            print(f"    Ann {i+1}: Error reading - {e}")
            
            print(f"  Total annotations found: {total_annotations}")
            return total_annotations > 0
            
    except Exception as e:
        print(f"  ✗ Verification failed: {e}")
        return False

def save_extraction_summary(annotations: List[Dict], filename: str = "extraction_summary.json"):
    """Save a summary of extracted annotations for review"""
    summary = []
    for annot in annotations:
        summary.append({
            'object_id': annot['object_id'],
            'source_page': annot['source_page'] + 1,
            'summary': annot['summary'],
            'all_fields': list(annot['annotation_dict'].keys())
        })
    
    with open(filename, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"✓ Extraction summary saved to: {filename}")

def main():
    # Configuration
    source_pdf = "pdf2.pdf"
    target_pdf = "pdf1.pdf"
    output_pdf = "merged_annotations_enhanced.pdf"
    keep_ids = {146, 151, 158, 163, 170}
    target_page = 0
    
    print("="*60)
    print("ENHANCED PDF ANNOTATION EXTRACTION AND MERGER")
    print("="*60)
    print(f"Source PDF: {source_pdf}")
    print(f"Target PDF: {target_pdf}")
    print(f"Output PDF: {output_pdf}")
    print(f"Annotation IDs to extract: {sorted(keep_ids)}")
    print(f"Target page for annotations: {target_page + 1}")
    
    # Create test PDF first for comparison
    create_test_annotations_pdf()
    
    # Step 1: Extract annotations from source PDF
    try:
        extracted_annotations = extract_annotations_pypdf(source_pdf, keep_ids)
        
        if not extracted_annotations:
            print("✗ No annotations found with the specified IDs")
            return
        
        # Save extraction summary
        save_extraction_summary(extracted_annotations)
        
    except Exception as e:
        print(f"✗ Failed to extract annotations: {e}")
        return
    
    # Step 2: Merge annotations into target PDF
    try:
        success = merge_annotations_to_pdf(
            source_pdf, target_pdf, output_pdf, 
            extracted_annotations, target_page
        )
        
        if success:
            print("\n" + "="*60)
            print("SUCCESS!")
            print("="*60)
            print(f"✓ Extracted {len(extracted_annotations)} annotations from {source_pdf}")
            print(f"✓ Merged them into {target_pdf}")
            print(f"✓ Saved result as {output_pdf}")
            print(f"✓ All annotations placed on page {target_page + 1}")
            
            # Verify the merge worked
            if verify_merged_annotations(output_pdf):
                print("✓ Annotations are present in the merged PDF")
                print("\nTROUBLESHOoting TIPS:")
                print("1. Open 'test_annotations.pdf' first to verify your PDF viewer shows annotations")
                print("2. If test annotations are visible but merged ones aren't, try:")
                print("   - Different PDF viewer (Adobe Reader, Foxit, etc.)")
                print("   - Enable 'Show Comments' or 'Show Annotations' in viewer")
                print("   - Check if annotations are on correct page")
                print("3. Some annotation types may appear differently in different viewers")
            else:
                print("⚠ Warning: Annotations may not be visible - check the PDF manually")
            
            # Show what was merged
            print(f"\nMerged annotations:")
            for i, annot in enumerate(extracted_annotations):
                summary = annot['summary']
                print(f"  {i+1}. {summary['subtype']} (ID: {annot['object_id']}) - {summary['content'][:50]}...")
        else:
            print("✗ Merge operation failed")
            
    except Exception as e:
        print(f"✗ Merge failed: {e}")

if __name__ == "__main__":
    main()

ENHANCED PDF ANNOTATION EXTRACTION AND MERGER
Source PDF: pdf2.pdf
Target PDF: pdf1.pdf
Output PDF: merged_annotations_enhanced.pdf
Annotation IDs to extract: [146, 151, 158, 163, 170]
Target page for annotations: 1

Creating test PDF with visible annotations: test_annotations.pdf
✗ Failed to create test PDF: No module named 'pypdf.pdf'
Extracting annotations from pdf2.pdf
  ✓ Found annotation ID 146: /Square on page 1
  ✓ Found annotation ID 151: /FreeText on page 1
  ✓ Found annotation ID 158: /Square on page 1
  ✓ Found annotation ID 163: /Text on page 1
  ✓ Found annotation ID 170: /Ink on page 1
Extracted 5 annotations
✓ Extraction summary saved to: extraction_summary.json

Merging annotations into pdf1.pdf
  ✓ Added annotation 1: /Square on page 1
  ✓ Added annotation 2: /FreeText on page 1
  ✓ Added annotation 3: /Square on page 1
  ✓ Added annotation 4: /Text on page 1
  ✓ Added annotation 5: /Ink on page 1
✓ Finished writing merged_annotations_enhanced.pdf

SUCCESS!
✓ Extracte