PDF to Markdown Pipeline - Usage Example Notebook
================================================

This notebook demonstrates how to use the high-fidelity PDF to Markdown pipeline
with LangChain Ollama integration.

Prerequisites:
- Ollama server running with a vision model (e.g., llama3.2-vision:11b)
- Required Python packages: PyMuPDF, langchain-ollama, Pillow (imported as `PIL`), etc.

In [1]:
# Install required packages (run once)
%pip install PyMuPDF langchain-ollama pillow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import the pipeline components
import sys
sys.path.append('.')  # Adjust path as needed

from src.pipeline import (
    PDFToMarkdownPipeline, 
    PipelineConfig,
    convert_pdf_to_markdown
)
from pathlib import Path

In [3]:
# Notebook-wide settings: edit these to match your environment.
OLLAMA_MODEL = "qwen2.5vl:3b-q4_K_M"        # Change to your preferred model
OLLAMA_BASE_URL = "http://localhost:11434"  # Your Ollama server URL
PDF_PATH = "sample_document.pdf"            # Path to your PDF file
OUTPUT_DIR = "./output"                     # Passed to the converter as output_dir

# Echo the active model and server so they are recorded in the cell output.
print("Using model:", OLLAMA_MODEL)
print("Ollama server:", OLLAMA_BASE_URL)

Using model: qwen2.5vl:3b-q4_K_M
Ollama server: http://localhost:11434


### Method 1: Simple conversion (recommended for most users)

In [4]:
print("=== Simple Conversion ===")

# Arguments for the one-call helper, gathered in one place for readability.
conversion_args = {
    "pdf_path": PDF_PATH,
    "ollama_model": OLLAMA_MODEL,
    "ollama_base_url": OLLAMA_BASE_URL,
    "output_dir": OUTPUT_DIR,
}
result = convert_pdf_to_markdown(**conversion_args)

# Report failure details first; otherwise summarize what was produced.
if not result.success:
    print("❌ Conversion failed:")
    for error in result.errors:
        print(f"   - {error}")
else:
    page_total = len(result.pages)
    print(f"✅ Successfully converted {page_total} pages")
    print(f"📁 Output saved to: {OUTPUT_DIR}")

=== Simple Conversion ===
Processing PDF with 1 pages...
Processing page 1/1
  Analyzing page structure...
  Strategy: vision_only (confidence: 0.80)
  Processing with vision model...
    Describing diagrams...
  Integrating content...
  Generating markdown...
Conversion completed! Files saved to ./output
  - output\page_001.md
  - output\combined_document.md
  - output\conversion_metadata.json
✅ Successfully converted 1 pages
📁 Output saved to: ./output


### Method 2: Advanced usage with custom configuration

In [5]:
print("\n=== Advanced Configuration ===")

# Start from the default configuration and override selected settings.
config = PipelineConfig()
config_overrides = {
    "dpi": 400,                        # Higher resolution for better OCR
    "vision_model_temp": 0.1,          # Lower temperature for consistent output
    "text_extraction_priority": True,  # Prefer text extraction when possible
    "preserve_formatting": True,       # Maintain original formatting
    "image_embed_mode": "base64",      # Embed images as base64
}
for option_name, option_value in config_overrides.items():
    setattr(config, option_name, option_value)

# Build the pipeline around the customized configuration.
pipeline = PDFToMarkdownPipeline(
    ollama_model=OLLAMA_MODEL,
    ollama_base_url=OLLAMA_BASE_URL,
    config=config,
)

# List each entry of the "components" mapping reported by the pipeline.
print("Pipeline Configuration:")
info = pipeline.get_pipeline_info()
for component, component_name in info["components"].items():
    print(f"  {component}: {component_name}")


=== Advanced Configuration ===
Pipeline Configuration:
  analyzer: PDFAnalyzer
  text_extractor: TextExtractor
  vision_processor: VisionProcessor
  integrator: ContentIntegrator
  markdown_generator: MarkdownGenerator


### Method 3: Process single page for testing

In [None]:
print("\n=== Single Page Processing ===")

import fitz  # PyMuPDF; later cells in this notebook also use `fitz`

# Open the PDF and process the first page only. The `with` block closes the
# document even if analysis or conversion raises part-way through (the
# original trailing `doc.close()` was skipped in that case).
with fitz.open(PDF_PATH) as doc:
    if doc.page_count > 0:
        first_page = doc[0]

        print("Analyzing first page...")
        analysis = pipeline.analyzer.analyze_page_content(first_page)

        print("Page Analysis:")
        print(f"  - Has extractable text: {analysis.has_extractable_text}")
        print(f"  - Text coverage: {analysis.text_coverage:.2f}")
        print(f"  - Has images: {analysis.has_images}")
        print(f"  - Has tables: {analysis.has_tables}")
        print(f"  - Has formulas: {analysis.has_formulas}")
        print(f"  - Layout complexity: {analysis.layout_complexity:.2f}")
        print(f"  - Recommended strategy: {analysis.strategy.value}")
        print(f"  - Confidence: {analysis.confidence:.2f}")

        # Process the page
        print("\nProcessing page...")
        page_markdown = pipeline.convert_page(first_page)

        # Show at most the first 500 characters, marking truncation with "..."
        if len(page_markdown) > 500:
            preview = page_markdown[:500] + "..."
        else:
            preview = page_markdown

        print(f"\nGenerated markdown ({len(page_markdown)} characters):")
        print("=" * 50)
        print(preview)
        print("=" * 50)


=== Single Page Processing ===
Analyzing first page...
Page Analysis:
  - Has extractable text: True
  - Text coverage: 0.41
  - Has images: True
  - Has tables: False
  - Has formulas: False
  - Layout complexity: 0.90
  - Recommended strategy: vision_only
  - Confidence: 0.80

Processing page...
  Analyzing page structure...
  Strategy: vision_only (confidence: 0.80)
  Processing with vision model...
    Describing diagrams...


### Method 4: Batch processing with custom content handling

In [None]:
print("\n=== Batch Processing with Content Analysis ===")

def analyze_pdf_structure(pdf_path: str) -> dict:
    """Analyze every page of a PDF before processing.

    Runs ``pipeline.analyzer.analyze_page_content`` on each page and prints a
    one-line summary (strategy, confidence, layout complexity) per page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        Dict mapping the 0-based page index to the analysis object returned
        for that page.
    """
    analyses = {}

    # The context manager closes the document even when a page analysis
    # raises, which a trailing doc.close() call would not do.
    with fitz.open(pdf_path) as doc:
        print(f"Analyzing PDF structure ({doc.page_count} pages)...")

        for page_num in range(doc.page_count):
            analysis = pipeline.analyzer.analyze_page_content(doc[page_num])
            analyses[page_num] = analysis

            print(f"Page {page_num + 1}: {analysis.strategy.value} "
                  f"(conf: {analysis.confidence:.2f}, "
                  f"complex: {analysis.layout_complexity:.2f})")

    return analyses

# Analyze structure first
if Path(PDF_PATH).exists():
    pdf_analyses = analyze_pdf_structure(PDF_PATH)

    if pdf_analyses:
        # Tally pages per strategy in a single pass. Keys appear in first-seen
        # page order, so the printed distribution is stable between runs.
        strategy_counts = {}
        for page_analysis in pdf_analyses.values():
            strategy_name = page_analysis.strategy.value
            strategy_counts[strategy_name] = strategy_counts.get(strategy_name, 0) + 1

        print("\nStrategy Distribution:")
        for strategy, count in strategy_counts.items():
            print(f"  {strategy}: {count} pages")

        avg_complexity = sum(a.layout_complexity for a in pdf_analyses.values()) / len(pdf_analyses)
        print(f"\nAverage layout complexity: {avg_complexity:.2f}")
    else:
        # No analyses: skip the average, which would divide by zero.
        print("\nNo pages were analyzed; nothing to summarize.")
else:
    print(f"PDF not found: {PDF_PATH}")

### Method 5: Testing different vision models

In [None]:
print("\n=== Model Comparison ===")

# List of models to test (uncomment available models)
test_models = [
    "llama3.2-vision:11b",
    # "llava:13b",
    # "bakllava",
]

def test_model_performance(models: list, test_pdf: str):
    """Test different models on the same page"""
    if not Path(test_pdf).exists():
        print(f"Test PDF not found: {test_pdf}")
        return
    
    doc = fitz.open(test_pdf)
    test_page = doc[0]  # Use first page for testing
    
    results = {}
    
    for model in models:
        try:
            print(f"\nTesting model: {model}")
            
            # Create pipeline with this model
            test_pipeline = PDFToMarkdownPipeline(model, OLLAMA_BASE_URL)
            
            # Process page
            markdown = test_pipeline.convert_page(test_page)
            
            results[model] = {
                "success": True,
                "length": len(markdown),
                "preview": markdown[:200] + "..." if len(markdown) > 200 else markdown
            }
            
            print(f"  ✅ Success - {len(markdown)} chars")
            
        except Exception as e:
            results[model] = {
                "success": False,
                "error": str(e)
            }
            print(f"  ❌ Failed: {e}")
    
    doc.close()
    return results

# Run model comparison (only if you have multiple models)
if len(test_models) > 1:
    # Treat a missing result (None or empty) as "nothing to report" instead of
    # failing on `.items()` below.
    model_results = test_model_performance(test_models, PDF_PATH) or {}

    print("\n=== Model Comparison Results ===")
    # `model_result` (not `result`) so the notebook-level `result` from the
    # simple-conversion cell is not overwritten.
    for model, model_result in model_results.items():
        if model_result["success"]:
            print(f"{model}: {model_result['length']} characters")
        else:
            print(f"{model}: FAILED - {model_result['error']}")

### Method 6: Content-specific extraction

In [None]:
print("\n=== Content-Specific Extraction ===")

def extract_specific_content(pdf_path: str, content_types: list,
                             max_pages: int = 3, dpi: int = 300) -> dict:
    """Extract only specific types of content from the first pages of a PDF.

    Each page is rendered to an image and passed to the vision-processor
    method for each requested content type. Results whose ``content`` is not
    blank are collected.

    Args:
        pdf_path: Path of the PDF file to read.
        content_types: Content types to look for. "tables", "formulas" and
            "diagrams" are handled; any other value is skipped.
        max_pages: Maximum number of leading pages to inspect (default 3).
        dpi: Value passed to ``Utils.extract_page_image`` (default 300).

    Returns:
        Dict mapping every requested content type to a list of
        ``{"page", "content", "confidence"}`` dicts ("page" is 1-based).
        When the PDF does not exist every list is empty, so callers can
        always iterate over the result.
    """
    extracted_content = {content_type: [] for content_type in content_types}

    if not Path(pdf_path).exists():
        print(f"PDF not found: {pdf_path}")
        return extracted_content

    # Imported once, after the existence check, instead of on every page.
    # NOTE(review): other cells import from `src.pipeline`; confirm that
    # `utils` is importable from this notebook's sys.path.
    from utils import Utils

    # Content type -> name of the vision-processor method that handles it.
    # The attribute lookup happens inside the try below so that a lookup
    # failure is reported per content type, like any other extraction error.
    extractor_methods = {
        "tables": "extract_table_data",
        "formulas": "extract_formulas",
        "diagrams": "describe_diagrams",
    }

    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as doc:
        for page_num in range(min(max_pages, doc.page_count)):
            page = doc[page_num]

            # Convert page to image for vision processing
            page_image_b64 = Utils.extract_page_image(page, dpi)

            print(f"\nProcessing page {page_num + 1} for specific content...")

            for content_type in content_types:
                method_name = extractor_methods.get(content_type)
                if method_name is None:
                    continue  # unsupported content type: its list stays empty

                try:
                    extractor = getattr(pipeline.vision_processor, method_name)
                    result = extractor(page_image_b64)

                    if result.content.strip():
                        extracted_content[content_type].append({
                            "page": page_num + 1,
                            "content": result.content,
                            "confidence": result.confidence
                        })
                        print(f"  ✅ Found {content_type}")

                except Exception as e:
                    # Best effort: report and move on to the next content type.
                    print(f"  ❌ Error extracting {content_type}: {e}")

    return extracted_content

# Extract specific content types
content_types = ["tables", "formulas", "diagrams"]
# Treat a missing result (None or empty) as "nothing extracted" so the
# summary loop below never fails on `.items()`.
specific_content = extract_specific_content(PDF_PATH, content_types) or {}

print("\n=== Extraction Summary ===")
for content_type, items in specific_content.items():
    print(f"{content_type.title()}: {len(items)} found")
    for item in items[:2]:  # Show first 2 items
        content = item["content"]
        # Preview is capped at 100 characters, with "..." marking truncation
        preview = content[:100] + "..." if len(content) > 100 else content
        print(f"  Page {item['page']}: {preview}")

### Method 7: Error handling and debugging

In [None]:
print("\n=== Error Handling Examples ===")

# Case 1: a PDF path that does not exist
print("Testing with non-existent file...")
bad_result = convert_pdf_to_markdown("nonexistent.pdf", OLLAMA_MODEL, OLLAMA_BASE_URL)
failed_as_expected = not bad_result.success
print(f"Expected failure: {failed_as_expected}")

# Case 2: a wrong Ollama URL
print("\nTesting with wrong Ollama URL...")
try:
    bad_pipeline = PDFToMarkdownPipeline(OLLAMA_MODEL, "http://localhost:99999")
except Exception as e:
    print(f"Connection error: {e}")
else:
    # Construction raised nothing. Per the original note, the failure is
    # expected only once the vision processor is actually used.
    print("Pipeline created (will fail on actual processing)")

# %%
# Final summary
print("\n" + "="*60)
print("PDF to Markdown Pipeline Demo Complete!")
print("="*60)

output_root = Path(OUTPUT_DIR)
if output_root.exists():
    # glob() does not guarantee an order; sort so the listing is stable
    # across runs and platforms.
    output_files = sorted(output_root.glob("*"))
    print(f"\nGenerated files in {OUTPUT_DIR}:")
    for entry in output_files:
        # Anything that is not a regular file is reported as 0 bytes.
        size = entry.stat().st_size if entry.is_file() else 0
        print(f"  📄 {entry.name} ({size:,} bytes)")

# Plain strings: these messages have no placeholders, so no f-prefix is needed.
print("\nPipeline ready for production use!")
print("💡 Tip: Adjust PipelineConfig settings for your specific needs")
print("🔧 Remember to tune vision model temperature and DPI settings")

: 