# Two-Tier Document Parser Visualization

This notebook demonstrates the capabilities of both parsing services:
- **Fast Parser**: Ultra-fast text extraction using PyMuPDF4LLM
- **Accurate Parser**: High-quality multimodal extraction using MinerU

## Setup
Make sure both services are running:
```bash
docker-compose up --build
```

In [None]:
# Import required libraries
import base64
import json
import os
import time
from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import requests
from IPython.display import display, Markdown, Image, HTML

# Service endpoints.
# The defaults target localhost:8004 (fast) and localhost:8005 (accurate).
# Set the FAST_PARSER_URL / ACCURATE_PARSER_URL environment variables before
# starting the kernel to point the notebook at services running elsewhere.
# A trailing slash is stripped so that f"{URL}/health" never contains "//".
FAST_PARSER_URL = os.environ.get("FAST_PARSER_URL", "http://localhost:8004").rstrip("/")
ACCURATE_PARSER_URL = os.environ.get("ACCURATE_PARSER_URL", "http://localhost:8005").rstrip("/")

print("‚úÖ Libraries imported successfully")

## 1. Health Check
Verify both services are running

In [None]:
def check_health(service_name, url):
    """Check the health endpoint of a parser service and print a short report.

    Sends ``GET {url}/health`` with a 5-second timeout. On HTTP 200 the fields
    of the JSON payload are printed. Fields that are absent are reported as
    ``unknown`` (``no_gil`` and ``gpu_available`` are simply skipped when
    absent) so that a partial payload never raises ``KeyError``.

    Args:
        service_name: Human-readable name used in the printed messages.
        url: Base URL of the service, without the ``/health`` suffix.

    Returns:
        bool: True when the service answered with HTTP 200 and a JSON body;
        False for any other status code, an unparsable body, or a request
        error (connection refused, timeout, ...).
    """
    # Only the network call lives in this try block so that unrelated errors
    # are never reported as "not reachable".
    try:
        response = requests.get(f"{url}/health", timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"\n‚ùå {service_name} is not reachable: {e}")
        return False

    if response.status_code != 200:
        print(f"\n‚ùå {service_name} returned status code {response.status_code}")
        return False

    try:
        data = response.json()
    except ValueError as e:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"\n‚ùå {service_name} returned an invalid health payload: {e}")
        return False
    if not isinstance(data, dict):
        # A JSON body that is not an object carries no named fields to report.
        data = {}

    print(f"\n‚úÖ {service_name} is healthy")
    # (label, payload key, always shown?) in the order they are printed.
    report_fields = (
        ("Status", "status", True),
        ("Workers", "workers", True),
        ("No-GIL Mode", "no_gil", False),
        ("GPU Available", "gpu_available", False),
        ("Parser", "parser", True),
        ("Version", "version", True),
    )
    for label, key, always_shown in report_fields:
        if key in data:
            print(f"   {label}: {data[key]}")
        elif always_shown:
            print(f"   {label}: unknown")
    return True

# Probe each service in turn, then summarise the outcome
fast_healthy = check_health("Fast Parser", FAST_PARSER_URL)
accurate_healthy = check_health("Accurate Parser", ACCURATE_PARSER_URL)

if not (fast_healthy and accurate_healthy):
    print("\n‚ö†Ô∏è Some services are not available. Make sure Docker Compose is running.")
else:
    print("\nüéâ Both services are ready!")

## 2. Upload PDF File
Set `PDF_FILE_PATH` in the next cell to a PDF file on your machine; the file is sent to each parser service in the sections that follow.

In [None]:
# Location of the document to parse
PDF_FILE_PATH = "/path/to/your/document.pdf"  # Change this to your PDF file path

# Report whether the configured path points at an existing .pdf file
pdf_path = Path(PDF_FILE_PATH)
if pdf_path.exists():
    if pdf_path.suffix.lower() != '.pdf':
        print(f"‚ùå File is not a PDF: {PDF_FILE_PATH}")
    else:
        print(f"‚úÖ PDF file found: {pdf_path.name}")
        print(f"   Size: {pdf_path.stat().st_size / 1024:.2f} KB")
else:
    print(f"‚ùå File not found: {PDF_FILE_PATH}")
    print("Please update PDF_FILE_PATH with a valid PDF file path")

## 3. Parse with Fast Parser (PyMuPDF4LLM)
Ultra-fast text extraction

In [None]:
def parse_with_fast_parser(file_path):
    """Send a PDF to the fast parser service and return its JSON response.

    The file is uploaded as multipart form data to ``{FAST_PARSER_URL}/parse``
    with a 30-second timeout.

    Args:
        file_path: Path to the PDF, as a ``pathlib.Path`` or a plain string.

    Returns:
        The decoded JSON response on HTTP 200, or ``None`` when the request
        fails (timeout, connection error), the service answers with another
        status code, or the body is not valid JSON. The reason is printed.

    Raises:
        OSError: If the file cannot be opened.
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    print(f"\n‚è≥ Parsing with Fast Parser...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    try:
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f, 'application/pdf')}
            response = requests.post(f"{FAST_PARSER_URL}/parse", files=files, timeout=30)
    except requests.exceptions.RequestException as e:
        # Keep the "None on failure" contract instead of aborting the notebook.
        print(f"‚ùå Fast Parser request failed: {e}")
        return None

    elapsed = time.perf_counter() - start_time

    if response.status_code != 200:
        print(f"‚ùå Fast Parser failed: {response.status_code}")
        print(response.text)
        return None

    try:
        result = response.json()
    except ValueError as e:
        print(f"‚ùå Fast Parser returned invalid JSON: {e}")
        return None

    print(f"‚úÖ Fast Parser completed in {elapsed:.2f}s")
    return result

# Run the fast parser only when the configured PDF is present
fast_result = None
if not pdf_path.exists():
    print("‚ö†Ô∏è Skipping fast parser - file not found")
else:
    fast_result = parse_with_fast_parser(pdf_path)

### 3.1 Display Fast Parser Metadata

In [None]:
if fast_result:
    metadata = fast_result['metadata']

    print("\nüìä Fast Parser Metadata:")
    # (label, metadata key, suffix) in display order
    for label, key, suffix in (
        ("Parser", 'parser', ""),
        ("Version", 'version', ""),
        ("Pages", 'pages', ""),
        ("Processing Time", 'processing_time_ms', " ms"),
        ("Filename", 'filename', ""),
        ("License", 'license', ""),
        ("Source Code", 'source_code', ""),
    ):
        print(f"   {label}: {metadata[key]}{suffix}")

    # Same metadata as a one-column table: one row per field
    df_fast_meta = pd.DataFrame([metadata]).T
    df_fast_meta.columns = ['Value']
    display(df_fast_meta)

### 3.2 Display Fast Parser Markdown Output

In [None]:
if fast_result:
    divider = "=" * 80
    print("\nüìÑ Fast Parser Markdown Output:")
    print(divider)
    # Render the parser output as rich markdown between two divider lines
    fast_markdown = fast_result['markdown']
    display(Markdown(fast_markdown))
    print(divider)
    print(f"\nMarkdown length: {len(fast_markdown)} characters")

## 4. Parse with Accurate Parser (MinerU)
High-quality multimodal extraction (takes longer)

In [None]:
def parse_with_accurate_parser(file_path):
    """Send a PDF to the accurate parser service and return its JSON response.

    The file is uploaded as multipart form data to
    ``{ACCURATE_PARSER_URL}/parse`` with a 300-second timeout.

    Args:
        file_path: Path to the PDF, as a ``pathlib.Path`` or a plain string.

    Returns:
        The decoded JSON response on HTTP 200, or ``None`` when the request
        fails (timeout, connection error), the service answers with another
        status code, or the body is not valid JSON. The reason is printed.

    Raises:
        OSError: If the file cannot be opened.
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    print(f"\n‚è≥ Parsing with Accurate Parser (this may take 1-3 minutes)...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    try:
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f, 'application/pdf')}
            response = requests.post(f"{ACCURATE_PARSER_URL}/parse", files=files, timeout=300)
    except requests.exceptions.RequestException as e:
        # Keep the "None on failure" contract instead of aborting the notebook.
        print(f"‚ùå Accurate Parser request failed: {e}")
        return None

    elapsed = time.perf_counter() - start_time

    if response.status_code != 200:
        print(f"‚ùå Accurate Parser failed: {response.status_code}")
        print(response.text)
        return None

    try:
        result = response.json()
    except ValueError as e:
        print(f"‚ùå Accurate Parser returned invalid JSON: {e}")
        return None

    print(f"‚úÖ Accurate Parser completed in {elapsed:.2f}s")
    return result

# Run the accurate parser only when the configured PDF is present
accurate_result = None
if not pdf_path.exists():
    print("‚ö†Ô∏è Skipping accurate parser - file not found")
else:
    accurate_result = parse_with_accurate_parser(pdf_path)

### 4.1 Display Accurate Parser Metadata

In [None]:
if accurate_result:
    metadata = accurate_result['metadata']

    print("\nüìä Accurate Parser Metadata:")
    # Fields read straight from the metadata dict: (label, key, suffix)
    for label, key, suffix in (
        ("Parser", 'parser', ""),
        ("Version", 'version', ""),
        ("Pages", 'pages', ""),
        ("Processing Time", 'processing_time_ms', " ms"),
        ("Filename", 'filename', ""),
    ):
        print(f"   {label}: {metadata[key]}{suffix}")
    # Counts of the multimodal items returned next to the markdown
    print(f"   Images Extracted: {len(accurate_result['images'])}")
    print(f"   Tables Extracted: {len(accurate_result['tables'])}")
    print(f"   Formulas Extracted: {len(accurate_result['formulas'])}")
    for label, key in (("License", 'license'), ("Source Code", 'source_code')):
        print(f"   {label}: {metadata[key]}")

    # Same metadata as a one-column table: one row per field
    df_accurate_meta = pd.DataFrame([metadata]).T
    df_accurate_meta.columns = ['Value']
    display(df_accurate_meta)

### 4.2 Display Accurate Parser Markdown Output

In [None]:
if accurate_result:
    divider = "=" * 80
    print("\nüìÑ Accurate Parser Markdown Output:")
    print(divider)
    # Render the parser output as rich markdown between two divider lines
    accurate_markdown = accurate_result['markdown']
    display(Markdown(accurate_markdown))
    print(divider)
    print(f"\nMarkdown length: {len(accurate_markdown)} characters")

### 4.3 Display Extracted Images

In [None]:
# Falls through to the "no images" message when parsing was skipped or failed
extracted_images = accurate_result['images'] if accurate_result else None
if extracted_images:
    image_total = len(extracted_images)
    print(f"\nüñºÔ∏è Displaying {image_total} extracted images:\n")

    for position, img_data in enumerate(extracted_images, start=1):
        print(f"\nImage {position}/{image_total}")
        print(f"  ID: {img_data['image_id']}")
        print(f"  Page: {img_data['page']}")
        bbox = img_data.get('bbox')
        if bbox:
            print(f"  Bounding Box: {bbox}")

        # Images are base64-encoded in the response; decode before rendering
        display(Image(data=base64.b64decode(img_data['image_base64'])))
        print("\n" + "-" * 80)
else:
    print("\nüì≠ No images extracted")

### 4.4 Display Extracted Tables

In [None]:
# Falls through to the "no tables" message when parsing was skipped or failed
extracted_tables = accurate_result['tables'] if accurate_result else None
if extracted_tables:
    table_total = len(extracted_tables)
    print(f"\nüìä Displaying {table_total} extracted tables:\n")

    for position, table_data in enumerate(extracted_tables, start=1):
        print(f"\nTable {position}/{table_total}")
        print(f"  ID: {table_data['table_id']}")
        print(f"  Page: {table_data['page']}")
        bbox = table_data.get('bbox')
        if bbox:
            print(f"  Bounding Box: {bbox}")

        # Each table carries its own markdown rendering
        print("\nTable Content:")
        display(Markdown(table_data['markdown']))
        print("\n" + "-" * 80)
else:
    print("\nüì≠ No tables extracted (tables may be embedded in markdown)")

### 4.5 Display Extracted Formulas

In [None]:
# Falls through to the "no formulas" message when parsing was skipped or failed
extracted_formulas = accurate_result['formulas'] if accurate_result else None
if extracted_formulas:
    formula_total = len(extracted_formulas)
    print(f"\nüßÆ Displaying {formula_total} extracted formulas:\n")

    for position, formula_data in enumerate(extracted_formulas, start=1):
        print(f"\nFormula {position}/{formula_total}")
        print(f"  ID: {formula_data['formula_id']}")
        print(f"  Page: {formula_data['page']}")
        bbox = formula_data.get('bbox')
        if bbox:
            print(f"  Bounding Box: {bbox}")

        # Wrap the LaTeX source in $$...$$ so it renders as display math
        print("\nLaTeX:")
        latex_source = formula_data['latex']
        display(Markdown(f"$${latex_source}$$"))
        print("\n" + "-" * 80)
else:
    print("\nüì≠ No formulas extracted (formulas may be embedded in markdown)")

## 5. Comparison: Fast vs Accurate Parser

In [None]:
# The degenerate cases come first; the full comparison needs both results.
if not (fast_result or accurate_result):
    print("\n‚ùå No parsing results available for comparison")
elif not accurate_result:
    print("\n‚ö†Ô∏è Only fast parser results available")
elif not fast_result:
    print("\n‚ö†Ô∏è Only accurate parser results available")
else:
    print("\n‚öñÔ∏è Parser Comparison:\n")

    fast_meta = fast_result['metadata']
    accurate_meta = accurate_result['metadata']

    # Multimodal item counts, shared by the table and the second chart
    categories = ['Images', 'Tables', 'Formulas']
    accurate_counts = [
        len(accurate_result['images']),
        len(accurate_result['tables']),
        len(accurate_result['formulas']),
    ]

    # Side-by-side table: one row per metric, one column per parser
    metric_names = [
        'Parser',
        'Processing Time (ms)',
        'Pages',
        'Markdown Length',
        'Images Extracted',
        'Tables Extracted',
        'Formulas Extracted',
    ]
    fast_column = [
        fast_meta['parser'],
        fast_meta['processing_time_ms'],
        fast_meta['pages'],
        len(fast_result['markdown']),
        'N/A',
        'N/A',
        'N/A',
    ]
    accurate_column = [
        accurate_meta['parser'],
        accurate_meta['processing_time_ms'],
        accurate_meta['pages'],
        len(accurate_result['markdown']),
        *accurate_counts,
    ]
    df_comparison = pd.DataFrame({
        'Metric': metric_names,
        'Fast Parser': fast_column,
        'Accurate Parser': accurate_column,
    })
    display(df_comparison)

    # Two panels: processing time on the left, extraction counts on the right
    fig, (time_ax, count_ax) = plt.subplots(1, 2, figsize=(14, 5))

    time_ax.bar(
        ['Fast Parser', 'Accurate Parser'],
        [fast_meta['processing_time_ms'], accurate_meta['processing_time_ms']],
        color=['#3498db', '#e74c3c'],
    )
    time_ax.set_ylabel('Processing Time (ms)')
    time_ax.set_title('Processing Time Comparison')

    count_ax.bar(categories, accurate_counts, color='#e74c3c')
    count_ax.set_ylabel('Count')
    count_ax.set_title('Accurate Parser Multimodal Extraction')

    for panel in (time_ax, count_ax):
        panel.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n‚úÖ Comparison complete!")

## 6. Export Results
Save parsing results to JSON files

In [None]:
import json
from pathlib import Path

# Destination folder for every exported file (created on first run)
output_dir = Path('parser_results')
output_dir.mkdir(exist_ok=True)

# Full JSON responses, fast parser first
if fast_result:
    fast_output = output_dir / 'fast_parser_result.json'
    fast_output.write_text(
        json.dumps(fast_result, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print(f"‚úÖ Fast parser results saved to: {fast_output}")

if accurate_result:
    accurate_output = output_dir / 'accurate_parser_result.json'
    accurate_output.write_text(
        json.dumps(accurate_result, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print(f"‚úÖ Accurate parser results saved to: {accurate_output}")

# Markdown bodies on their own, for easy side-by-side reading
if fast_result:
    fast_md = output_dir / 'fast_parser_output.md'
    fast_md.write_text(fast_result['markdown'], encoding='utf-8')
    print(f"‚úÖ Fast parser markdown saved to: {fast_md}")

if accurate_result:
    accurate_md = output_dir / 'accurate_parser_output.md'
    accurate_md.write_text(accurate_result['markdown'], encoding='utf-8')
    print(f"‚úÖ Accurate parser markdown saved to: {accurate_md}")

print(f"\nüìÅ All results saved to: {output_dir.absolute()}")

## Summary

This notebook demonstrated:
- ✅ Health checks for both parser services
- ✅ Fast parser (PyMuPDF4LLM) for ultra-fast text extraction
- ✅ Accurate parser (MinerU) for high-quality multimodal extraction
- ✅ Visualization of markdown, images, tables, and formulas
- ✅ Performance comparison between parsers
- ✅ Export results to JSON and markdown files

### Key Takeaways:
- **Fast Parser**: Best for quick text extraction, ~100-500ms per document
- **Accurate Parser**: Best for documents with images/tables/formulas, ~1-3 minutes per document

### License
Both parsers are licensed under AGPL-3.0. Source code: https://github.com/daddal001/two_tier_document_parser