In [1]:
!pip install pypdf pdfminer

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
     ---------------------------------------- 0.0/4.2 MB ? eta -:--:--
     ---------------------------------------- 4.2/4.2 MB 35.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py): started
  Building wheel for pdfminer (setup.py): finished with status 'done'
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140761 sha256=9cddf8d4f0be44d991f3798e732701edc315cf7cc459b7c00f0f7e7641547183
  Stored in directory: c:\users\i7714\appdata\local\pip\cache\wheels\90\7b\26\62139fb7c8c5c242c492e02ce8613ca4c3df4cd86afb8e6264
Successfully built pdfminer
Installing collected packages: pdfminer
Successfully installed pdfminer-20191125



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import fitz  # PyMuPDF
import pypdf
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextBox, LTTextLine, LTChar, LTAnno
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
import json
from typing import Dict, List, Any
import traceback

def rgb_to_list(color):
    """Return a fitz color as a plain list of components.

    ``None`` is passed through unchanged; any value that cannot be
    converted with ``list()`` also yields ``None``.
    """
    if color is None:
        return None
    try:
        components = list(color)
    except Exception:
        components = None
    return components

def extract_with_pymupdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Collect details of selected annotations using PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
        keep_ids: xref numbers of the annotations to report; annotations
            whose ``xref`` is not in this set are skipped.

    Returns:
        One dict per matching annotation, in page order. Every dict has
        ``library``, ``page_number`` (1-based), ``object_id``, ``type``,
        ``subtype``, ``rect``, ``content`` and ``has_appearance_stream``.
        The remaining keys are added only when the corresponding lookup
        succeeds and yields a value.

    Raises:
        Whatever ``fitz.open`` or the page/annotation iteration raises.
        The document is closed before such an exception propagates.
    """
    print("=== PyMuPDF (fitz) Extraction ===")
    doc = fitz.open(pdf_path)
    extracted = []

    # try/finally so the document handle is released even when a page or
    # annotation lookup raises part-way through the scan.
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            for annot in list(page.annots() or []):
                if annot.xref not in keep_ids:
                    continue

                annot_dict = {
                    'library': 'PyMuPDF',
                    'page_number': page_num + 1,
                    'object_id': annot.xref,
                    'type': 'annotation',
                    'subtype': annot.info.get("subtype", "") or annot.type[1],
                    'rect': list(annot.rect),
                }

                # First non-empty candidate wins.
                # NOTE(review): this falls back to the info "title" entry when
                # "content" is empty - confirm that is the intended behaviour.
                content_sources = [
                    annot.info.get("content", ""),
                    annot.info.get("title", ""),
                    getattr(annot, 'content', ''),
                ]
                annot_dict['content'] = next((c for c in content_sources if c), "")

                # Text lookup is optional; a failure just leaves the key out.
                try:
                    text_content = annot.get_text()
                    if text_content:
                        annot_dict['extracted_text'] = text_content
                except Exception:
                    pass

                if hasattr(annot, 'colors'):
                    colors = annot.colors
                    annot_dict['stroke_color'] = rgb_to_list(colors.get('stroke'))
                    annot_dict['fill_color'] = rgb_to_list(colors.get('fill'))
                    annot_dict['text_color'] = rgb_to_list(colors.get('text'))

                # NOTE(review): `fontname`, `fontsize`, `get_annot_dict` and
                # `get_ap` are probed defensively; verify that the installed
                # PyMuPDF version actually exposes them on annotations.
                try:
                    if hasattr(annot, 'fontname'):
                        annot_dict['font'] = annot.fontname
                    if hasattr(annot, 'fontsize'):
                        annot_dict['font_size'] = annot.fontsize
                except Exception:
                    annot_dict['font'] = 'default'
                    annot_dict['font_size'] = 12

                # Geometric data for non-rectangular annotations.
                try:
                    if hasattr(annot, 'vertices') and annot.vertices:
                        annot_dict['vertices'] = [list(v) for v in annot.vertices]
                except Exception:
                    pass

                try:
                    border = annot.border
                    if border:
                        annot_dict['border'] = {
                            'width': border.get('width', 0),
                            'style': border.get('style', ''),
                            'dashes': border.get('dashes', [])
                        }
                except Exception:
                    pass

                # Additional metadata: copied only when present and not None.
                for attr in ['author', 'created', 'modified', 'opacity', 'flags']:
                    try:
                        val = getattr(annot, attr, None)
                        if val is not None:
                            annot_dict[attr] = val
                    except Exception:
                        pass

                try:
                    raw_dict = annot.get_annot_dict()
                    annot_dict['raw_dict_keys'] = list(raw_dict.keys()) if raw_dict else []
                except Exception:
                    pass

                # Always record a boolean so every record has the same shape,
                # whether the lookup returned nothing or was unavailable.
                try:
                    annot_dict['has_appearance_stream'] = bool(annot.get_ap())
                except Exception:
                    annot_dict['has_appearance_stream'] = False

                extracted.append(annot_dict)
    finally:
        doc.close()

    return extracted

def extract_with_pypdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Collect raw annotation dictionaries for selected objects using pypdf.

    Args:
        pdf_path: Path of the PDF opened in binary mode for ``pypdf.PdfReader``.
        keep_ids: Object numbers (``idnum`` of the annotation reference) to
            report; other annotations are skipped.

    Returns:
        One dict per matching annotation. Known keys are converted to plain
        Python values (``subtype``, ``rect``, ``content``, ``title``,
        ``color``, ``opacity``, ``border``, ``border_style``, ``vertices``,
        ``ink_list``, ``has_appearance``). Every other key is stored as
        ``raw_<name>`` with its string form, and a per-key conversion
        failure is stored as ``error_<key>``. ``all_keys`` lists every key
        of the annotation dictionary. Failures are printed rather than
        raised, so the result may be partial or empty.
    """
    print("=== pypdf Extraction ===")
    extracted = []

    # Keys whose values are parsed structurally and therefore must be
    # dereferenced first; values obtained from .items() may still be
    # indirect references rather than the referenced object.
    structured_keys = {
        '/Subtype', '/Rect', '/Contents', '/T', '/C', '/CA',
        '/Border', '/BS', '/Vertices', '/InkList',
    }

    def _resolve(obj):
        """Return the referenced object when `obj` offers get_object()."""
        return obj.get_object() if hasattr(obj, 'get_object') else obj

    def _floats(seq):
        """Convert a sequence of (possibly indirect) numbers to floats."""
        return [float(_resolve(x)) for x in seq]

    try:
        with open(pdf_path, 'rb') as file:
            reader = pypdf.PdfReader(file)

            for page_num, page in enumerate(reader.pages):
                if '/Annots' not in page:
                    continue

                for annot_ref in page['/Annots']:
                    try:
                        annot = annot_ref.get_object()

                        # Object number of the indirect reference; None for
                        # references that do not carry one.
                        obj_id = getattr(annot_ref, 'idnum', None)
                        if obj_id not in keep_ids:
                            continue

                        annot_dict = {
                            'library': 'pypdf',
                            'page_number': page_num + 1,
                            'object_id': obj_id,
                            'type': 'annotation',
                        }

                        for key, raw_value in annot.items():
                            try:
                                value = _resolve(raw_value) if key in structured_keys else raw_value
                                if key == '/Subtype':
                                    annot_dict['subtype'] = str(value)
                                elif key == '/Rect':
                                    annot_dict['rect'] = _floats(value)
                                elif key == '/Contents':
                                    annot_dict['content'] = str(value)
                                elif key == '/T':  # Title
                                    annot_dict['title'] = str(value)
                                elif key == '/C':  # Color
                                    annot_dict['color'] = _floats(value) if value else None
                                elif key == '/CA':  # Constant opacity
                                    annot_dict['opacity'] = float(value)
                                elif key == '/Border':
                                    annot_dict['border'] = _floats(value) if value else None
                                elif key == '/BS':  # Border style
                                    if isinstance(value, dict):
                                        annot_dict['border_style'] = dict(value.items())
                                elif key == '/Vertices':
                                    annot_dict['vertices'] = _floats(value) if value else None
                                elif key == '/InkList':
                                    annot_dict['ink_list'] = (
                                        [_floats(_resolve(stroke)) for stroke in value]
                                        if value else None
                                    )
                                elif key == '/AP':  # Appearance dictionary
                                    annot_dict['has_appearance'] = True
                                else:
                                    # Keep the unresolved value so references are
                                    # recorded as references, not expanded objects.
                                    annot_dict[f'raw_{key[1:]}'] = str(value)
                            except Exception as e:
                                annot_dict[f'error_{key}'] = str(e)

                        # All raw keys, for comparison across libraries.
                        annot_dict['all_keys'] = [str(k) for k in annot.keys()]

                        extracted.append(annot_dict)

                    except Exception as e:
                        print(f"Error processing annotation: {e}")

    except Exception as e:
        print(f"pypdf extraction failed: {e}")

    return extracted

def extract_with_pdfminer(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Summarise page layout with pdfminer; annotations are not read directly.

    Args:
        pdf_path: Path of the PDF to analyse.
        keep_ids: Unused here; accepted so the signature matches the other
            extractors.

    Returns:
        On success, one dict per page with ``library``, ``page_number``
        (1-based), ``page_width``, ``page_height``, ``note``,
        ``text_elements_count`` and ``sample_elements`` (the first five
        layout elements that have a ``bbox``). On any failure, a single
        dict with ``library``, ``error`` and ``note``; the error is also
        printed.
    """
    print("=== pdfminer Extraction ===")
    extracted = []

    try:
        page_summaries = []
        with open(pdf_path, 'rb') as file:
            # Hand the already-open handle to pdfminer instead of letting it
            # open the path a second time, and consume pages one at a time so
            # only the current page's layout tree is held.
            for page_num, page in enumerate(extract_pages(file), start=1):
                text_elements = [
                    {
                        'type': type(element).__name__,
                        'bbox': element.bbox,
                        'text': element.get_text() if hasattr(element, 'get_text') else '',
                    }
                    for element in page
                    if hasattr(element, 'bbox')
                ]

                page_summaries.append({
                    'library': 'pdfminer',
                    'page_number': page_num,
                    'page_width': page.width,
                    'page_height': page.height,
                    'note': 'pdfminer primarily extracts text layout, not annotations directly',
                    'text_elements_count': len(text_elements),
                    'sample_elements': text_elements[:5],  # First 5 elements as sample
                })

        # Published only after the whole document parsed, so a failure
        # yields just the error record below.
        extracted.extend(page_summaries)

    except Exception as e:
        print(f"pdfminer extraction failed: {e}")
        extracted.append({
            'library': 'pdfminer',
            'error': str(e),
            'note': 'pdfminer is not designed for annotation extraction'
        })

    return extracted

def compare_results(results_dict: Dict[str, List[Dict]]):
    """Print a side-by-side report of what each library extracted.

    Args:
        results_dict: Maps a library name to its list of result dicts.

    The report has three parts: every item of every library (list/dict
    values whose string form exceeds 100 characters are shown as a
    type-and-length placeholder), the sorted union of all field names, and
    the average number of fields per item for each non-empty library.
    Nothing is returned.
    """
    rule = "="*60
    print("\n" + rule)
    print("COMPARISON ANALYSIS")
    print(rule)

    for lib_name, records in results_dict.items():
        print(f"\n{lib_name} Results:")
        print(f"  Found {len(records)} items")

        for position, record in enumerate(records, start=1):
            print(f"  Item {position}:")
            for field, payload in record.items():
                if isinstance(payload, (list, dict)) and len(str(payload)) > 100:
                    # Bulky containers are summarised instead of dumped.
                    shown = f"[{type(payload).__name__} with {len(payload)} items]"
                else:
                    shown = payload
                print(f"    {field}: {shown}")
            print()

    # Union of field names across every library and item.
    all_fields = set().union(
        *(record.keys() for records in results_dict.values() for record in records)
    )
    print(f"\nAll unique fields found: {sorted(all_fields)}")

    print(f"\nData Richness Comparison:")
    for lib_name, records in results_dict.items():
        if not records:
            continue
        avg_fields = sum(map(len, records)) / len(records)
        print(f"  {lib_name}: Average {avg_fields:.1f} fields per annotation")

def main(pdf_path: str = "pdf2.pdf", keep_ids=None,
         output_path: str = 'annotation_comparison.json') -> Dict[str, List[Dict[str, Any]]]:
    """Run all three extractors on one PDF, compare them and save the results.

    Args:
        pdf_path: PDF to analyse. Defaults to the notebook's sample file.
        keep_ids: Iterable of annotation object ids to extract. ``None``
            selects the notebook's original set {146, 151, 158, 163, 170}.
        output_path: Destination of the JSON dump of the results.

    Returns:
        Dict mapping 'PyMuPDF', 'pypdf' and 'pdfminer' to the list each
        extractor returned. An extractor that raises contributes an empty
        list, and the failure is printed. A failure to write the JSON file
        is printed too and does not affect the return value.
    """
    keep_ids = {146, 151, 158, 163, 170} if keep_ids is None else set(keep_ids)

    print(f"Extracting annotations with IDs: {keep_ids}")
    print(f"From file: {pdf_path}")
    print("\n" + "="*60)

    # Looked up at call time so a later cell redefining an extractor is honoured.
    extractors = (
        ('PyMuPDF', extract_with_pymupdf),
        ('pypdf', extract_with_pypdf),
        ('pdfminer', extract_with_pdfminer),
    )

    results = {}
    for label, extractor in extractors:
        try:
            results[label] = extractor(pdf_path, keep_ids)
        except Exception as e:
            # One library failing must not prevent comparing the others.
            print(f"{label} failed: {e}")
            results[label] = []

    compare_results(results)

    # Save detailed results to JSON; default=str covers non-JSON-native values.
    try:
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2, default=str)
        print(f"\nDetailed results saved to '{output_path}'")
    except Exception as e:
        print(f"Failed to save JSON: {e}")

    return results

if __name__ == "__main__":
    # Run the three-library comparison; the result is bound at module level.
    results = main()

    # Closing summary, emitted one entry per print call.
    for _summary_line in (
        "\n" + "="*60,
        "SUMMARY",
        "="*60,
        "1. PyMuPDF: Best for general annotation extraction, handles most annotation types",
        "2. pypdf: Provides raw PDF dictionary access, good for detailed annotation data",
        "3. pdfminer: Primarily for text layout, not designed for annotation extraction",
        "\nFor non-square annotations, pypdf's raw dictionary access may provide",
        "the most complete data, while PyMuPDF offers the most convenient API.",
    ):
        print(_summary_line)

ImportError: cannot import name 'HOCRConverter' from 'pdfminer.converter' (C:\python\Lib\site-packages\pdfminer\converter.py)

In [3]:
import fitz  # PyMuPDF
import pypdf
import json
from typing import Dict, List, Any

def rgb_to_list(color):
    """Turn a fitz color value into a list of its components.

    Returns ``None`` when `color` is ``None`` or cannot be converted
    with ``list()``.
    """
    if color is not None:
        try:
            return list(color)
        except Exception:
            pass
    return None

def extract_with_pymupdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Collect details of selected annotations using PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF passed to ``fitz.open``.
        keep_ids: xref numbers of the annotations to report; annotations
            whose ``xref`` is not in this set are skipped.

    Returns:
        One dict per matching annotation, in page order. Every dict has
        ``library``, ``page_number`` (1-based), ``object_id``, ``type``,
        ``subtype``, ``rect``, ``content``, ``font``, ``font_size`` and
        ``has_appearance_stream``. The remaining keys are added only when
        the corresponding lookup succeeds and yields a value. Metadata
        values (``opacity``, ``flags``, ...) are stored as strings.

    Raises:
        Whatever ``fitz.open`` or the page/annotation iteration raises.
        The document is closed before such an exception propagates.
    """
    print("=== PyMuPDF (fitz) Extraction ===")
    doc = fitz.open(pdf_path)
    extracted = []

    # try/finally so the document handle is released even when a page or
    # annotation lookup raises part-way through the scan.
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            for annot in list(page.annots() or []):
                if annot.xref not in keep_ids:
                    continue

                annot_dict = {
                    'library': 'PyMuPDF',
                    'page_number': page_num + 1,
                    'object_id': annot.xref,
                    'type': 'annotation',
                    'subtype': annot.info.get("subtype", "") or annot.type[1],
                    'rect': list(annot.rect),
                }

                # First non-empty candidate wins.
                # NOTE(review): this falls back to the info "title" entry when
                # "content" is empty - confirm that is the intended behaviour.
                content_sources = [
                    annot.info.get("content", ""),
                    annot.info.get("title", ""),
                    getattr(annot, 'content', ''),
                ]
                annot_dict['content'] = next((c for c in content_sources if c), "")

                # Text lookup is optional; a failure just leaves the key out.
                try:
                    text_content = annot.get_text()
                    if text_content:
                        annot_dict['extracted_text'] = text_content
                except Exception:
                    pass

                if hasattr(annot, 'colors'):
                    colors = annot.colors
                    annot_dict['stroke_color'] = rgb_to_list(colors.get('stroke'))
                    annot_dict['fill_color'] = rgb_to_list(colors.get('fill'))
                    annot_dict['text_color'] = rgb_to_list(colors.get('text'))

                # Each font field falls back on its own, so a missing size no
                # longer overwrites a font name that is available.
                # NOTE(review): `fontname`, `fontsize`, `get_annot_dict` and
                # `get_ap` are probed defensively; verify that the installed
                # PyMuPDF version actually exposes them on annotations.
                try:
                    annot_dict['font'] = getattr(annot, 'fontname', 'default')
                    annot_dict['font_size'] = getattr(annot, 'fontsize', 12)
                except Exception:
                    annot_dict['font'] = 'default'
                    annot_dict['font_size'] = 12

                # Geometric data for non-rectangular annotations.
                try:
                    if hasattr(annot, 'vertices') and annot.vertices:
                        annot_dict['vertices'] = [list(v) for v in annot.vertices]
                        annot_dict['vertices_count'] = len(annot.vertices)
                except Exception:
                    pass

                try:
                    border = annot.border
                    if border:
                        annot_dict['border'] = {
                            'width': border.get('width', 0),
                            'style': border.get('style', ''),
                            'dashes': border.get('dashes', [])
                        }
                except Exception:
                    pass

                # Additional metadata: stringified, and only when not None.
                metadata_attrs = ['author', 'created', 'modified', 'opacity', 'flags']
                for attr in metadata_attrs:
                    try:
                        val = getattr(annot, attr, None)
                        if val is not None:
                            annot_dict[attr] = str(val)
                    except Exception:
                        pass

                # Raw annotation dictionary keys plus a few selected entries.
                try:
                    raw_dict = annot.get_annot_dict()
                    if raw_dict:
                        annot_dict['raw_dict_keys'] = list(raw_dict.keys())
                        for key in ['/AP', '/AS', '/CA', '/F', '/RC']:
                            if key in raw_dict:
                                annot_dict[f'raw_{key[1:]}'] = str(raw_dict[key])
                except Exception:
                    pass

                try:
                    annot_dict['has_appearance_stream'] = annot.get_ap() is not None
                except Exception:
                    annot_dict['has_appearance_stream'] = False

                extracted.append(annot_dict)
    finally:
        doc.close()

    return extracted

def extract_with_pypdf(pdf_path: str, keep_ids: set) -> List[Dict[str, Any]]:
    """Collect raw annotation data for selected objects using pypdf.

    Args:
        pdf_path: Path of the PDF opened in binary mode for ``pypdf.PdfReader``.
        keep_ids: Object numbers (``idnum`` of the annotation reference) to
            report; other annotations are skipped.

    Returns:
        One dict per matching annotation with ``library``, ``page_number``
        (1-based), ``object_id``, ``type``, ``all_pdf_keys`` and
        ``total_fields``, plus whichever of the mapped standard fields,
        geometric fields, border data, appearance keys and default-appearance
        font size are present. ``opacity`` is a float and ``flags`` an int;
        zero values are kept rather than reported as ``None``. A per-field
        conversion failure is stored under ``<field>_error``. Failures are
        printed rather than raised, so the result may be partial or empty.
    """
    print("=== pypdf Extraction ===")
    extracted = []

    # PDF key -> output key for the standard annotation entries.
    field_mapping = {
        '/Subtype': 'subtype',
        '/Rect': 'rect',
        '/Contents': 'content',
        '/T': 'title',
        '/C': 'color',
        '/CA': 'opacity',
        '/F': 'flags',
        '/M': 'modified',
        '/CreationDate': 'created',
        '/Popup': 'popup',
        '/P': 'parent_page'
    }
    geometric_fields = ['/Vertices', '/InkList', '/L', '/CL', '/QuadPoints']

    try:
        with open(pdf_path, 'rb') as file:
            reader = pypdf.PdfReader(file)

            for page_num, page in enumerate(reader.pages):
                if '/Annots' not in page:
                    continue

                for annot_ref in page['/Annots']:
                    try:
                        annot = annot_ref.get_object()

                        obj_id = getattr(annot_ref, 'idnum', None)
                        if obj_id not in keep_ids:
                            continue

                        annot_dict = {
                            'library': 'pypdf',
                            'page_number': page_num + 1,
                            'object_id': obj_id,
                            'type': 'annotation',
                        }

                        # Standard annotation fields.
                        for pdf_key, dict_key in field_mapping.items():
                            if pdf_key not in annot:
                                continue
                            try:
                                value = annot[pdf_key]
                                if pdf_key in ('/Rect', '/C'):
                                    annot_dict[dict_key] = [float(x) for x in value] if value else None
                                elif pdf_key == '/CA':
                                    # 0 is a valid opacity, so test for None
                                    # instead of truthiness.
                                    annot_dict[dict_key] = None if value is None else float(value)
                                elif pdf_key == '/F':
                                    # Flags are an integer bit field; 0 is valid.
                                    annot_dict[dict_key] = None if value is None else int(value)
                                else:
                                    annot_dict[dict_key] = str(value) if value else None
                            except Exception as e:
                                annot_dict[f'{dict_key}_error'] = str(e)

                        # Geometric data for complex annotations.
                        for field in geometric_fields:
                            if field not in annot:
                                continue
                            try:
                                value = annot[field]
                                if value:
                                    if field == '/InkList':
                                        annot_dict['ink_list'] = [[float(x) for x in stroke] for stroke in value]
                                    else:
                                        annot_dict[field[1:].lower()] = [float(x) for x in value]
                            except Exception as e:
                                annot_dict[f'{field[1:]}_error'] = str(e)

                        # Border information; optional, so failures are skipped.
                        if '/Border' in annot:
                            try:
                                border = annot['/Border']
                                annot_dict['border'] = [float(x) for x in border] if border else None
                            except Exception:
                                pass

                        if '/BS' in annot:
                            try:
                                bs = annot['/BS']
                                if isinstance(bs, dict):
                                    annot_dict['border_style'] = {str(k): str(v) for k, v in bs.items()}
                            except Exception:
                                pass

                        # Appearance dictionary.
                        if '/AP' in annot:
                            annot_dict['has_appearance'] = True
                            try:
                                ap = annot['/AP']
                                if isinstance(ap, dict):
                                    annot_dict['appearance_keys'] = [str(k) for k in ap.keys()]
                            except Exception:
                                pass

                        # Default Appearance string: the token before each
                        # "Tf" operator is taken as the font size (last wins).
                        if '/DA' in annot:
                            try:
                                da = str(annot['/DA'])
                                annot_dict['default_appearance'] = da
                                if 'Tf' in da:
                                    parts = da.split()
                                    for i, part in enumerate(parts):
                                        if part == 'Tf' and i > 0:
                                            try:
                                                annot_dict['font_size_from_da'] = float(parts[i-1])
                                            except ValueError:
                                                pass
                            except Exception:
                                pass

                        # Store all keys for analysis.
                        annot_dict['all_pdf_keys'] = [str(k) for k in annot.keys()]
                        annot_dict['total_fields'] = len(annot.keys())

                        extracted.append(annot_dict)

                    except Exception as e:
                        print(f"Error processing annotation: {e}")

    except Exception as e:
        print(f"pypdf extraction failed: {e}")

    return extracted

def compare_results(results_dict: Dict[str, List[Dict]]):
    """Print a comparison report for the per-library extraction results.

    Args:
        results_dict: Maps a library name to its list of annotation dicts.

    Three sections are printed: the first annotation of each library
    (fields sorted; list/dict values longer than 80 characters are cut),
    a field-count summary per non-empty library (keys ending in ``_error``
    are not counted), and the object ids found under the 'PyMuPDF' and
    'pypdf' entries, including ids present in only one of them. Nothing
    is returned.
    """
    wide_rule = "="*60
    print("\n" + wide_rule)
    print("COMPARISON ANALYSIS")
    print(wide_rule)

    for lib_name, records in results_dict.items():
        print(f"\n{lib_name} Results:")
        print(f"  Found {len(records)} annotations")

        if not records:
            continue

        # Only the first annotation is shown in detail.
        print(f"  Sample annotation fields:")
        for field, payload in sorted(records[0].items()):
            if isinstance(payload, (list, dict)) and len(str(payload)) > 80:
                print(f"    {field}: [{type(payload).__name__}] {str(payload)[:80]}...")
            else:
                print(f"    {field}: {payload}")

        remaining = len(records) - 1
        if remaining > 0:
            print(f"  ... and {remaining} more annotations")

    narrow_rule = "="*40
    print(f"\n" + narrow_rule)
    print("DATA RICHNESS COMPARISON")
    print(narrow_rule)

    for lib_name, records in results_dict.items():
        if not records:
            continue

        counted_fields = 0
        seen_keys = set()
        for record in records:
            counted_fields += sum(1 for k in record.keys() if not k.endswith('_error'))
            seen_keys.update(record.keys())

        avg_fields = counted_fields / len(records)
        print(f"{lib_name:12}: {len(records)} annotations, avg {avg_fields:.1f} fields each")

        # Everything except the bookkeeping keys.
        unique_keys = sorted(
            k for k in seen_keys
            if not (k.startswith('library') or k.startswith('page_number'))
        )
        print(f"             Key fields: {', '.join(unique_keys[:10])}")
        if len(unique_keys) > 10:
            print(f"             ... and {len(unique_keys)-10} more")

    def _found_ids(label):
        return {r.get('object_id') for r in results_dict.get(label, [])}

    pymupdf_ids = _found_ids('PyMuPDF')
    pypdf_ids = _found_ids('pypdf')

    print(f"\nAnnotation ID Coverage:")
    print(f"  PyMuPDF found IDs: {sorted(pymupdf_ids) if pymupdf_ids else 'None'}")
    print(f"  pypdf found IDs:   {sorted(pypdf_ids) if pypdf_ids else 'None'}")

    # Both differences are empty when the two id sets are equal.
    only_pymupdf = pymupdf_ids - pypdf_ids
    only_pypdf = pypdf_ids - pymupdf_ids
    if only_pymupdf:
        print(f"  Only PyMuPDF: {sorted(only_pymupdf)}")
    if only_pypdf:
        print(f"  Only pypdf:   {sorted(only_pypdf)}")

def main(pdf_path: str = "pdf2.pdf", keep_ids=None,
         output_path: str = 'annotation_comparison.json') -> Dict[str, List[Dict[str, Any]]]:
    """Run the PyMuPDF and pypdf extractors on one PDF, compare and save them.

    Args:
        pdf_path: PDF to analyse. Defaults to the notebook's sample file.
        keep_ids: Iterable of annotation object ids to extract. ``None``
            selects the notebook's original set {146, 151, 158, 163, 170}.
        output_path: Destination of the JSON dump of the results.

    Returns:
        Dict mapping 'PyMuPDF' and 'pypdf' to the list each extractor
        returned. An extractor that raises contributes an empty list, and
        the failure is printed. A failure to write the JSON file is printed
        too and does not affect the return value.
    """
    keep_ids = {146, 151, 158, 163, 170} if keep_ids is None else set(keep_ids)

    print(f"Comparing annotation extraction libraries")
    print(f"Target annotation IDs: {sorted(keep_ids)}")
    print(f"PDF file: {pdf_path}")

    # Looked up at call time so a later cell redefining an extractor is honoured.
    extractors = (
        ('PyMuPDF', extract_with_pymupdf),
        ('pypdf', extract_with_pypdf),
    )

    results = {}
    print(f"\n" + "="*60)
    for label, extractor in extractors:
        try:
            results[label] = extractor(pdf_path, keep_ids)
            print(f"✓ {label} extraction completed")
        except Exception as e:
            # One library failing must not prevent reporting on the other.
            print(f"✗ {label} failed: {e}")
            results[label] = []

    compare_results(results)

    # Save results; default=str covers values that are not JSON-native.
    try:
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2, default=str)
        print(f"\n✓ Detailed results saved to '{output_path}'")
    except Exception as e:
        print(f"✗ Failed to save JSON: {e}")

    return results

if __name__ == "__main__":
    # Run the two-library comparison; the result is bound at module level.
    results = main()

    # Closing recommendations, emitted one entry per print call.
    for _summary_line in (
        "\n" + "="*60,
        "SUMMARY & RECOMMENDATIONS",
        "="*60,
        "PyMuPDF: Easy to use, good for basic annotation data",
        "pypdf:   Raw PDF access, better for complex geometric data",
        "\nFor non-square annotations:",
        "- Check 'vertices', 'ink_list', 'quadpoints' fields in pypdf results",
        "- pypdf's raw PDF keys may reveal data PyMuPDF doesn't expose",
        "- Compare 'total_fields' count to see which extracts more data",
    ):
        print(_summary_line)

Comparing annotation extraction libraries
Target annotation IDs: [146, 151, 158, 163, 170]
PDF file: pdf2.pdf

=== PyMuPDF (fitz) Extraction ===
✓ PyMuPDF extraction completed
=== pypdf Extraction ===
✓ pypdf extraction completed

COMPARISON ANALYSIS

PyMuPDF Results:
  Found 5 annotations
  Sample annotation fields:
    border: {'width': 2.0, 'style': None, 'dashes': ()}
    content: Wiring change
    fill_color: []
    flags: 4
    font: default
    font_size: 12
    has_appearance_stream: False
    library: PyMuPDF
    object_id: 146
    opacity: -1
    page_number: 1
    rect: [851.5889892578125, 91.300048828125, 1084.175048828125, 666.6429443359375]
    stroke_color: [1.0, 0.8313725590705872, 0.0]
    subtype: Square
    text_color: None
    type: annotation
  ... and 4 more annotations

pypdf Results:
  Found 5 annotations
  Sample annotation fields:
    all_pdf_keys: [list] ['/Type', '/Subtype', '/Rect', '/BS', '/C', '/M', '/T', '/Contents', '/NM', '/Zo...
    appearance_keys: [