## Setup

In [None]:
# Install the SDK if needed
# %pip install docu-devs-api-client pydantic

In [None]:
import os
import json
from pydantic import BaseModel, Field
from docudevs import DocuDevsClient

# Take the API token from the environment; the placeholder string is used when the variable is unset
API_KEY = os.environ.get("DOCUDEVS_API_KEY", "your-api-key-here")
# Single client instance shared by every cell below
client = DocuDevsClient(token=API_KEY)

## First: Process a Document

Operations work on completed jobs, so let's process something first.

In [None]:
# Read the sample invoice from disk as raw bytes; later cells reuse `invoice_bytes`
with open("docs/invoice.pdf", mode="rb") as f:
    invoice_bytes = f.read()

# Target shape for the extracted invoice data. The JSON Schema generated from this
# model (model_json_schema()) is what gets passed as `schema` to the extraction request.
class Invoice(BaseModel):
    invoice_number: str
    date: str  # plain string; this model applies no date parsing
    vendor: str
    total: float
    # Line items are untyped dicts; default_factory gives each instance its own empty list
    line_items: list[dict] = Field(default_factory=list)

job_id = await client.submit_and_process_document(
    document=invoice_bytes,
    document_mime_type="application/pdf",
    schema=json.dumps(Invoice.model_json_schema()),
    prompt="Extract all invoice details."
)

result = await client.wait_until_ready(job_id, result_format="json")
print(f"Job completed: {job_id}")
print(f"Extracted: {json.dumps(result, indent=2)[:500]}...")

## Error Analysis

Was the extraction accurate? Error analysis examines the results and flags potential issues.

In [None]:
# Run error analysis on the completed job
analysis = await client.submit_and_wait_for_error_analysis(
    job_guid=job_id,
    timeout=120
)

print("=== Error Analysis ===")

# Parse the result JSON
if hasattr(analysis, 'result') and analysis.result:
    result_data = json.loads(analysis.result) if isinstance(analysis.result, str) else analysis.result
    
    # Display overall quality
    quality = result_data.get('extraction_quality', 'unknown')
    confidence = result_data.get('overall_confidence', 0)
    print(f"Quality: {quality}")
    print(f"Overall confidence: {confidence:.0%}")
    
    # Display field analysis
    fields = result_data.get('field_analysis', [])
    if fields:
        print(f"\nField Analysis ({len(fields)} fields):")
        for field in fields:
            name = field.get('field_name', 'unknown')
            conf = field.get('confidence', 0)
            issues = field.get('issues', [])
            status = "‚ö†Ô∏è" if issues else "‚úì"
            print(f"  {status} {name}: {conf:.0%} confidence")
            for issue in issues:
                print(f"      Issue: {issue}")
            for suggestion in field.get('suggestions', []):
                print(f"      Suggestion: {suggestion}")
    
    # Display OCR issues if any
    ocr_issues = result_data.get('ocr_issues', [])
    if ocr_issues:
        print(f"\nOCR Issues ({len(ocr_issues)}):")
        for issue in ocr_issues:
            print(f"  - {issue.get('type')}: {issue.get('description')}")
else:
    print("No analysis result available")

## Generative Tasks: Ask Questions About Documents

Generative tasks let you ask questions about document content. They work on **OCR jobs**
(not extraction jobs) since they need the raw OCR text to reason about.

Let's OCR the same invoice and then ask questions about it.

In [None]:
# First, OCR the document (generative tasks need OCR jobs, not extraction jobs)
ocr_job_id = await client.submit_and_ocr_document(
    document=invoice_bytes,
    document_mime_type="application/pdf",
    ocr="DEFAULT",
    ocr_format="markdown"
)
await client.wait_until_ready(ocr_job_id)
print(f"OCR job completed: {ocr_job_id}")

# Now ask a question
question_result = await client.submit_and_wait_for_generative_task(
    parent_job_id=ocr_job_id,
    prompt="What payment terms are mentioned in this invoice? If there's a due date, when is it?",
    timeout=120
)

if hasattr(question_result, 'result'):
    result_data = json.loads(question_result.result) if isinstance(question_result.result, str) else question_result.result
    print("\nAnswer:")
    print(result_data.get('generated_text', 'No response'))

In [None]:
# Generate a summary (uses the same OCR job)
summary_result = await client.submit_and_wait_for_generative_task(
    parent_job_id=ocr_job_id,
    prompt="Provide a one-paragraph summary of this invoice suitable for an expense report.",
    temperature=0.3,  # Lower = more focused/deterministic
    max_tokens=200
)

if hasattr(summary_result, 'result'):
    result_data = json.loads(summary_result.result) if isinstance(summary_result.result, str) else summary_result.result
    print("Summary:")
    print(result_data.get('generated_text', 'No response'))

## Multiple Questions on One Document

In [None]:
questions = [
    "What company issued this invoice?",
    "What is the total amount due?",
    "How many line items are there?",
    "What is the most expensive item?"
]

print("=== Document Q&A ===")
for question in questions:
    answer = await client.submit_and_wait_for_generative_task(
        parent_job_id=ocr_job_id,
        prompt=f"Based on this invoice, answer briefly: {question}"
    )
    
    if hasattr(answer, 'result'):
        result_data = json.loads(answer.result) if isinstance(answer.result, str) else answer.result
        print(f"\nQ: {question}")
        print(f"A: {result_data.get('generated_text', 'No response')}")

## Check Operation Status

For longer operations, you can check status and see all operations run on a job.

In [None]:
# Get all operations for the OCR job (which has generative tasks)
status = await client.get_operation_status(job_guid=ocr_job_id)

print(f"Operations on OCR job {ocr_job_id}:")
if hasattr(status, 'operations'):
    for op in status.operations:
        print(f"  - {op.operation_type}: {op.status}")

## Complete Workflow Example

Here's a workflow that combines extraction (for structured data), error analysis, 
and generative tasks (for summaries). Note that extraction and generative tasks 
use different job types.

In [None]:
def _result_as_dict(response) -> dict:
    """Return the ``result`` payload of an operation response as a dict.

    The payload may be a JSON string or an already-decoded dict. An empty
    dict is returned when the attribute is missing, is None, or does not
    decode to a dict.

    Raises:
        json.JSONDecodeError: if the payload is a string that is not valid JSON.
    """
    payload = getattr(response, 'result', None)
    if isinstance(payload, str):
        payload = json.loads(payload)
    return payload if isinstance(payload, dict) else {}


async def process_and_analyze(document_bytes: bytes, mime_type: str, schema: str) -> dict:
    """Process a document with extraction, error analysis, and summary.

    Uses the module-level ``client``. Extraction and the generative summary
    run as two separate jobs (an extraction job and an OCR job).

    Args:
        document_bytes: Raw content of the document to process.
        mime_type: MIME type of the document, e.g. "application/pdf".
        schema: JSON Schema, serialized as a string, describing the data to extract.

    Returns:
        A dict with the keys "extraction_job_id", "ocr_job_id", "extraction",
        "confidence" and "summary". "confidence" is 'N/A' when the analysis
        result carries no overall confidence and "unknown" when the error
        analysis step failed; "summary" is '' when no text was generated.
    """

    # Step 1: Extract structured data
    print("Step 1: Extracting structured data...")
    extraction_job_id = await client.submit_and_process_document(
        document=document_bytes,
        document_mime_type=mime_type,
        schema=schema,
        prompt="Extract all information according to the schema."
    )
    extraction = await client.wait_until_ready(extraction_job_id, result_format="json")
    print("  ✓ Extracted data")

    # Step 2: Error analysis (works on extraction jobs).
    # Deliberately best-effort: a failure here is reported and the workflow continues.
    print("Step 2: Running error analysis...")
    try:
        analysis = await client.submit_and_wait_for_error_analysis(job_guid=extraction_job_id, timeout=60)
        # Accept both a JSON-string and an already-decoded result, so a dict
        # result is no longer discarded
        analysis_data = _result_as_dict(analysis)
        confidence = analysis_data.get('overall_confidence', 'N/A')
        quality = analysis_data.get('extraction_quality', 'unknown')
        print(f"  ✓ Quality: {quality}, Confidence: {confidence}")
    except Exception as e:
        print(f"  ⚠ Error analysis skipped: {e}")
        confidence = "unknown"

    # Step 3: OCR the document for generative tasks
    print("Step 3: OCR for generative task...")
    ocr_job_id = await client.submit_and_ocr_document(
        document=document_bytes,
        document_mime_type=mime_type,
        ocr="DEFAULT",
        ocr_format="markdown"
    )
    await client.wait_until_ready(ocr_job_id)
    print("  ✓ OCR complete")

    # Step 4: Generate summary (requires OCR job)
    print("Step 4: Generating summary...")
    summary = await client.submit_and_wait_for_generative_task(
        parent_job_id=ocr_job_id,
        prompt="Summarize this document in 2-3 sentences.",
        timeout=60
    )
    # A missing or None result yields an empty summary instead of an AttributeError
    summary_text = _result_as_dict(summary).get('generated_text', '')
    print("  ✓ Summary generated")

    return {
        "extraction_job_id": extraction_job_id,
        "ocr_job_id": ocr_job_id,
        "extraction": extraction,
        "confidence": confidence,
        "summary": summary_text
    }

# Run the workflow
schema = json.dumps(Invoice.model_json_schema())
result = await process_and_analyze(invoice_bytes, "application/pdf", schema)

print("\n=== Results ===")
print(f"Extraction Job: {result['extraction_job_id']}")
print(f"OCR Job: {result['ocr_job_id']}")
print(f"Confidence: {result['confidence']}")
print(f"Summary: {result['summary']}")

## Tips for Operations

1. **Error analysis works on extraction jobs**: Use it to validate structured extraction results

2. **Generative tasks require OCR jobs**: If you need both extraction and generative tasks, 
   you'll need two jobs (or use OCR-only and do extraction via generative prompts)

3. **Be specific with generative prompts**: "Summarize in 2 sentences" beats "summarize"

4. **Use temperature for control**: Lower values (0.1-0.3) = more deterministic; 
   higher values (0.7-0.9) = more creative

## Wrap Up

You've now seen the main features of DocuDevs:

- **[Basic Extraction](01-basic-extraction.ipynb)**: Extract structured data with Pydantic schemas
- **[Map-Reduce](02-map-reduce.ipynb)**: Handle long documents by chunking
- **[Knowledge Search](03-knowledge-search.ipynb)**: Enrich extractions with your reference data
- **Operations** (this notebook): Error analysis and follow-up questions

Happy extracting! 🚀