# Page Parser Pipeline Showcase

This notebook demonstrates the page-parser pipeline functionality, including document processing, layout detection, text extraction, and image annotation capabilities.

## Setup and Imports

In [None]:
import sys
import json
from pathlib import Path
import cv2
import numpy as np
from typing import List, Dict, Any
import matplotlib.pyplot as plt
from PIL import Image

# Add src to path for imports
sys.path.append('src')

from src.pipeline import pipeline
from src.pipeline.document import pdf_to_png, Detector
from src.pipeline.image import annotate_image
from src.config.settings import get_settings
from src.common.logging import setup_logging, get_logger

## Initialize Logging and Settings

In [None]:
# Configure logging for the notebook and obtain a named logger for it.
setup_logging(log_level="INFO")
logger = get_logger("pipeline_showcase")

# Read the project configuration and echo the values this showcase relies on.
settings = get_settings(Path("settings.toml"))

vision_provider = settings.vision.provider
print(f"Vision provider: {vision_provider.name}")
print(f"Model: {vision_provider.model}")

print(f"Chart labels: {settings.filtration.chart_labels}")

processing_cfg = settings.processing
print(f"OCR language: {processing_cfg.ocr_lang}")
print(f"Zoom factor: {processing_cfg.zoom_factor}")

## PDF to PNG Conversion

First, let's convert a PDF document to PNG images for processing.

In [None]:
# Source document and scratch directory for the generated page images.
pdf_path = "data/2507.21509v1.pdf"
temp_output_dir = "temp_images"
Path(temp_output_dir).mkdir(exist_ok=True)

logger.info(f"Converting PDF: {pdf_path}")
# The same configured zoom factor is passed for both axes.
image_paths = pdf_to_png(
    pdf_path=pdf_path,
    output_dir=temp_output_dir,
    zoom_x=settings.processing.zoom_factor,
    zoom_y=settings.processing.zoom_factor,
)

# List only the first three pages so long documents do not flood the output.
page_total = len(image_paths)
print(f"Generated {page_total} page images:")
for page_no, page_image in enumerate(image_paths[:3], start=1):
    print(f"  Page {page_no}: {page_image}")
if page_total > 3:
    print(f"  ... and {page_total - 3} more")

## Display Sample Page

Let's display the first page to see what we're working with.

In [None]:
# Display the first page.
# Fail with a clear message instead of an opaque IndexError when the
# conversion step produced no images.
if not image_paths:
    raise ValueError("No page images are available; run the PDF conversion cell first.")
first_page_path = image_paths[0]

# Open the page inside a context manager so the underlying file handle is
# released once the figure has been drawn and the size has been reported.
with Image.open(first_page_path) as img:
    fig, ax = plt.subplots(figsize=(12, 16))
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(f"Sample Page: {Path(first_page_path).name}")
    fig.tight_layout()
    plt.show()

    print(f"Image dimensions: {img.size}")

## Layout Detection

Now let's use the YOLO detector to identify different elements in the document.

In [None]:
# Initialize detector
model_path = "models/yolov12l-doclaynet.pt"
detector = Detector(model_path=model_path)

print(f"Detector initialized with model: {model_path}")
print(f"Model class names: {detector.model.names}")

In [None]:
# Run layout detection on the first page image.
detections = detector.parse_layout(Path(first_page_path))

# One line per detection: label, confidence, and box size derived from the bbox.
print(f"Found {len(detections)} elements:")
for idx, det in enumerate(detections, start=1):
    x_start, y_start, x_end, y_end = det.bbox
    box_w, box_h = x_end - x_start, y_end - y_start
    print(f"  {idx}. {det.label_name} (confidence: {det.confidence:.3f}) - Size: {box_w}x{box_h}")

## Image Annotation

Let's annotate the image with the detected bounding boxes and labels.

In [None]:
# Annotate the image with detections
annotated_img = annotate_image(
    image_path=str(first_page_path),
    detections=detections,
    line_width=3,
    font_size=12,
)

# Display annotated image
plt.figure(figsize=(12, 16))
# Convert BGR to RGB for matplotlib
annotated_img_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
plt.imshow(annotated_img_rgb)
plt.axis('off')
plt.title("Annotated Page with Layout Detection")
plt.tight_layout()
plt.show()

## Filter Chart Elements

Let's filter the detections to focus on chart/figure elements.

In [None]:
from src.pipeline.utils import filter_detections

# Narrow the page detections using the chart labels from the settings file.
chart_labels = settings.filtration.chart_labels
chart_detections = filter_detections(detections, chart_labels)

print(f"Chart labels to filter: {chart_labels}")
print(f"Found {len(chart_detections)} chart/figure elements:")
# Same per-detection report as for the full detection list.
for idx, det in enumerate(chart_detections, start=1):
    x_start, y_start, x_end, y_end = det.bbox
    box_w, box_h = x_end - x_start, y_end - y_start
    print(f"  {idx}. {det.label_name} (confidence: {det.confidence:.3f}) - Size: {box_w}x{box_h}")

In [None]:
# Render the page again with only the chart/figure detections drawn on it.
if not chart_detections:
    print("No chart elements found in this page.")
else:
    chart_annotated_img = annotate_image(
        image_path=str(first_page_path),
        detections=chart_detections,
        line_width=4,
        font_size=14,
    )

    # Convert BGR to RGB channel order before handing the array to matplotlib.
    fig, ax = plt.subplots(figsize=(12, 16))
    chart_annotated_img_rgb = cv2.cvtColor(chart_annotated_img, cv2.COLOR_BGR2RGB)
    ax.imshow(chart_annotated_img_rgb)
    ax.axis('off')
    ax.set_title("Chart/Figure Elements Only")
    fig.tight_layout()
    plt.show()

## Full Pipeline Execution

Now let's run the complete pipeline on a document to see the full workflow in action.

In [None]:
# Run the full pipeline on the sample document.
output_dir = "showcase_output"
Path(output_dir).mkdir(exist_ok=True)

# Reset before the attempt so that a failed re-run cannot leave a previous
# run's results in the kernel for the cells below to report as current.
results = None

logger.info("Running full pipeline...")
try:
    results = pipeline(
        document_path=pdf_path,
        model_path=model_path,
        output_dir=output_dir,
        settings_file=Path("settings.toml"),
    )

    print("\nPipeline completed successfully!")
    print(f"Processed {len(results)} pages")

    # Summary statistics: count elements overall and per element type
    # without building throwaway lists.
    total_elements = sum(len(page['elements']) for page in results)
    chart_elements = sum(
        1 for page in results for element in page['elements'] if element['type'] == 'chart'
    )
    text_elements = sum(
        1 for page in results for element in page['elements'] if element['type'] == 'text'
    )

    print(f"Total elements extracted: {total_elements}")
    print(f"  - Chart elements: {chart_elements}")
    print(f"  - Text elements: {text_elements}")

except Exception as e:
    # Deliberately best-effort: report and log the failure with its traceback
    # so the remaining cells of the showcase can still be run.
    print(f"Pipeline failed: {e}")
    logger.error(f"Pipeline error: {e}", exc_info=True)

## Examine Pipeline Results

Let's look at the detailed results from the pipeline execution.

In [None]:
# Walk through the first page of the pipeline output, if there is any.
if not ('results' in locals() and results):
    print("No pipeline results available.")
else:
    first_page_result = results[0]

    print(f"Page {first_page_result['page_number']} Results:")
    page_elements = first_page_result['elements']
    print(f"Number of elements: {len(page_elements)}")
    print("\nElement breakdown:")

    for position, element in enumerate(page_elements, start=1):
        print(f"\nElement {position}:")
        kind = element['type']
        print(f"  Type: {kind}")

        if kind == 'chart':
            print(f"  Label: {element['label']}")
            print(f"  Bbox: {element['bbox']}")
            # Long summaries are cut to 200 characters and marked with an ellipsis.
            summary = element['summary']
            if len(summary) > 200:
                print(f"  Summary: {summary[:200]}...")
            else:
                print(f"  Summary: {summary}")
        elif kind == 'text':
            print(f"  Bbox: {element['bbox']}")
            # Long text gets a 200-character preview; short text is shown in full.
            text = element['text']
            if len(text) > 200:
                print(f"  Text preview: {text[:200]}...")
            else:
                print(f"  Text: {text}")

## Advanced Annotation Examples

Let's demonstrate more advanced annotation features with different styling options.

In [None]:
# Create multiple annotation styles
from src.pipeline.image.annotate import Fonts

if detections:
    # Style 1: Thin lines, small font
    style1_img = annotate_image(
        image_path=str(first_page_path),
        detections=detections,
        line_width=1,
        font=Fonts.Arial,
        font_size=8,
    )
    
    # Style 2: Thick lines, large font
    style2_img = annotate_image(
        image_path=str(first_page_path),
        detections=detections,
        line_width=5,
        font=Fonts.Consolas,
        font_size=16,
    )
    
    # Display both styles side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 16))
    
    ax1.imshow(cv2.cvtColor(style1_img, cv2.COLOR_BGR2RGB))
    ax1.set_title("Style 1: Thin Lines, Small Font")
    ax1.axis('off')
    
    ax2.imshow(cv2.cvtColor(style2_img, cv2.COLOR_BGR2RGB))
    ax2.set_title("Style 2: Thick Lines, Large Font")
    ax2.axis('off')
    
    plt.tight_layout()
    plt.show()
else:
    print("No detections available for styling demonstration.")

## Save Annotated Images

Let's save some annotated images to the output directory.

In [None]:
# Save annotated versions of the first page under the output directory.
save_dir = Path(output_dir) / "annotated_images"
# parents=True also creates the output directory itself if it is missing,
# so this cell does not fail when that directory has been removed.
save_dir.mkdir(parents=True, exist_ok=True)

if detections:
    # Full annotation: every detection on the page.
    full_annotated = annotate_image(
        image_path=str(first_page_path),
        detections=detections,
        line_width=3,
        font_size=12,
        autosave=True,
        save_path=str(save_dir / "full_annotated.png")
    )

    # Chart-only annotation, written only when chart detections exist.
    if chart_detections:
        chart_only_annotated = annotate_image(
            image_path=str(first_page_path),
            detections=chart_detections,
            line_width=4,
            font_size=14,
            autosave=True,
            save_path=str(save_dir / "charts_only.png")
        )

    print(f"Annotated images saved to: {save_dir}")
    print("Files created:")
    # Sorted so the listing is stable across runs and file systems.
    for file in sorted(save_dir.glob("*.png")):
        print(f"  - {file.name}")
else:
    print("No detections to save.")

## Pipeline Performance Summary

Let's summarize what this run produced: page and element counts, the capabilities demonstrated, and the output files that were written.

In [None]:
# Print a recap of the run: inputs, counts, capabilities and output files.
banner = "=" * 50
print(banner)
print("PAGE PARSER PIPELINE SHOWCASE SUMMARY")
print(banner)

print(f"\nInput Document: {Path(pdf_path).name}")
print(f"Model Used: {Path(model_path).name}")
print(f"Pages Converted: {len(image_paths)}")
print(f"Elements Detected (Page 1): {len(detections)}")

# Full-pipeline statistics are shown only when a run produced results.
if 'results' in locals() and results:
    total_pages = len(results)
    total_elements = sum(len(page['elements']) for page in results)
    total_charts = sum(
        1 for page in results for element in page['elements'] if element['type'] == 'chart'
    )
    total_text = sum(
        1 for page in results for element in page['elements'] if element['type'] == 'text'
    )

    print("\nFull Pipeline Results:")
    print(f"  Total Pages Processed: {total_pages}")
    print(f"  Total Elements Extracted: {total_elements}")
    print(f"  Chart Elements: {total_charts}")
    print(f"  Text Elements: {total_text}")

print("\nCapabilities Demonstrated:")
for capability in (
    "PDF to PNG conversion",
    "Layout detection with YOLO",
    "Element filtering and classification",
    "Image annotation with bounding boxes",
    "Multiple annotation styles",
    "Chart summarization (if configured)",
    "Text extraction with OCR",
    "JSON output generation",
):
    print(f"  ✓ {capability}")

print("\nOutput Files Generated:")
output_path = Path(output_dir)
if output_path.exists():
    # rglob("*") rather than "*.*" so files without an extension are listed
    # too; sorted so the listing is stable across runs.
    for file in sorted(output_path.rglob("*")):
        if file.is_file():
            print(f"  - {file.relative_to(output_path)}")

print("\n" + banner)
print("Showcase completed successfully!")
print(banner)

## Cleanup

Clean up temporary files created during the showcase.

In [None]:
import shutil

# Remove the scratch directory that held the converted page images.
# The showcase output directory is not touched here.
scratch_dir = Path(temp_output_dir)
if scratch_dir.exists():
    shutil.rmtree(scratch_dir)
    print(f"Cleaned up temporary directory: {scratch_dir}")

print("\nShowcase completed! Check the output directory for generated files.")
final_output_dir = Path(output_dir).absolute()
print(f"Output directory: {final_output_dir}")