Skip to content

SmolDocling is not able to do the OCR. #1217

@simjak

Description

@simjak

I tried SmolDocling on the JFK documents https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10003-10041.pdf
and it failed, while a fast Gemini model easily converted them to markdown.
Outputs:

Predicted page in DOCTAGS:
Assistant: <doctag><page_header><loc_10><loc_2><loc_63><loc_8>104-10003-100401</page_header>
<page_header><loc_85><loc_3><loc_457><loc_8>2025 RELEASE UNDER THE PRESIDENT JOHN F. KENNEDY ASSASSINATION RECORDS ACT OF 1992</page_header>
<picture><loc_39><loc_19><loc_131><loc_38><logo></picture>
<picture><loc_39><loc_121><loc_131><loc_146><logo></picture>
<picture><loc_39><loc_153><loc_131><loc_178><logo></picture>
<picture><loc_39><loc_185><loc_131><loc_209><logo></picture>
<picture><loc_39><loc_216><loc_131><loc_232><logo></picture>
<picture><loc_39><loc_239><loc_131><loc_264><logo></picture>
<picture><loc_39><loc_271><loc_131><loc_287><logo></picture>
.....
Remote function called successfully
Saving results to output/104-10003-10041.pdf.md...
104-10003-100401

2025 RELEASE UNDER THE PRESIDENT JOHN F. KENNEDY ASSASSINATION RECORDS ACT OF 1992

<!-- image -->

<!-- image -->

<!-- image -->

Pipeline code for Modal.com:

import os

import modal

# Modal application handle; all remote functions below are registered on it.
app = modal.App("docling")

# Set up persistent volume to store models
# (shared between the one-off download function and the GPU worker).
models_vol = modal.Volume.from_name("docling-models", create_if_missing=True)
MODEL_DIR = "/models"

# Define container image with all required dependencies
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install("packaging")  # Install packaging first
    .pip_install("torch")  # Install torch before flash-attn
    .pip_install(
        "docling[vlm]",
        "docling_core",
        "huggingface_hub[hf_transfer]==0.26.2",
        "requests",  # For URL downloading
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)


@app.function(
    image=image,
    volumes={MODEL_DIR: models_vol},
)
def download_smoldocling():
    """Download the SmolDocling model into the persistent Modal volume.

    The snapshot is materialized under ``MODEL_DIR/smoldocling`` and exposed
    at the ``org--repo`` style path docling looks for via a symlink.
    Errors are printed rather than raised so the volume commit still runs
    (best-effort, matching the original behavior).
    """
    print("Downloading SmolDocling model...")

    try:
        import huggingface_hub

        model_name = "ds4sd/SmolDocling-256M-preview"

        print(f"Downloading model {model_name} using huggingface_hub...")
        cache_dir = os.path.join(MODEL_DIR, "smoldocling")
        os.makedirs(cache_dir, exist_ok=True)

        # BUG FIX: passing both cache_dir and local_dir made huggingface_hub
        # keep two copies of the snapshot (the cache tree plus the local_dir
        # copy). local_dir alone places the real files exactly where we want.
        huggingface_hub.snapshot_download(
            repo_id=model_name,
            local_dir=cache_dir,
        )

        print(f"SmolDocling model downloaded to {cache_dir}")

        # docling resolves models by an "org--repo" directory name; expose
        # the downloaded snapshot there via a symlink.
        docling_model_dir = os.path.join(MODEL_DIR, "ds4sd--SmolDocling-256M-preview")
        if not os.path.exists(docling_model_dir):
            os.makedirs(os.path.dirname(docling_model_dir), exist_ok=True)
            try:
                os.symlink(cache_dir, docling_model_dir)
                print(f"Created symlink from {cache_dir} to {docling_model_dir}")
            except FileExistsError:
                # Another run created it between the exists-check and here.
                print(f"Symlink from {cache_dir} to {docling_model_dir} already exists")

        # List contents (os.listdir instead of shelling out to `ls`).
        print("Model directory contents:")
        for entry in sorted(os.listdir(cache_dir)):
            print(f"  {entry}")

    except Exception as e:
        print(f"Error downloading SmolDocling model: {e}")
        import traceback

        traceback.print_exc()

    # Commit the changes to the volume so other functions see the files.
    models_vol.commit()
    print("Changes committed to volume")


@app.function(
    image=image,
    volumes={MODEL_DIR: models_vol},
    gpu="L4",
)
def process_document(
    # Input source - one of these must be provided
    image_files=None,
    pdf_url=None,
):
    """Convert a PDF (by URL) or image files with the SmolDocling VLM pipeline.

    Args:
        image_files: List of image file paths to process (used when
            ``pdf_url`` is None).
        pdf_url: URL of a PDF document to download and process.

    Returns:
        Dict with keys ``status``, ``file_count`` and ``results``; each entry
        of ``results`` is ``{"filename": <stem>.md, "content": <markdown>}``.
    """
    import time
    from pathlib import Path

    import requests
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        AcceleratorDevice,
        VlmPipelineOptions,
        smoldocling_vlm_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline
    from docling_core.types.doc import DocItemLabel, ImageRefMode
    from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

    ## Use experimental VlmPipeline
    pipeline_options = VlmPipelineOptions()
    # If force_backend_text = True, text from backend would be used instead
    # of the VLM-generated text.
    pipeline_options.force_backend_text = False

    ## On GPU systems, but without flash_attention2 (dependency removed):
    pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    pipeline_options.accelerator_options.cuda_use_flash_attention2 = False

    ## Pick a VLM model. We choose SmolDocling-256M by default.
    ## Alternatives: smoldocling_vlm_mlx_conversion_options (Apple Silicon),
    ## granite_vision_vlm_conversion_options.
    pipeline_options.vlm_options = smoldocling_vlm_conversion_options

    ## Set up pipeline for PDF or image inputs
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)

    # Download the PDF if a URL was given; otherwise process the image files.
    if pdf_url:
        response = requests.get(pdf_url)
        response.raise_for_status()  # fail fast on a bad download
        pdf_path = out_path / "input.pdf"
        pdf_path.write_bytes(response.content)
        sources = [pdf_path]
    else:
        sources = image_files or []

    results = []
    for source in sources:
        start_time = time.time()
        print("================================================")
        print("Processing... {}".format(source))
        print("================================================")
        print("")

        res = converter.convert(source)

        print("")
        print(res.document.export_to_markdown())

        for page in res.pages:
            print("")
            print("Predicted page in DOCTAGS:")
            print(page.predictions.vlm_response.text)

        stem = res.input.file.stem

        res.document.save_as_html(
            filename=out_path / f"{stem}.html",
            image_mode=ImageRefMode.REFERENCED,
            labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
        )

        # BUG FIX: the JSON export previously targeted the ".md" filename and
        # was then silently overwritten by save_as_markdown below. Write the
        # JSON to its own ".json" file (this also replaces the redundant
        # manual json.dumps(export_to_dict()) copy of the same data).
        res.document.save_as_json(
            out_path / f"{stem}.json",
            image_mode=ImageRefMode.PLACEHOLDER,
        )

        res.document.save_as_markdown(
            out_path / f"{stem}.md",
            image_mode=ImageRefMode.PLACEHOLDER,
        )

        # Print markdown content
        with open(out_path / f"{stem}.md", "r") as f:
            markdown_content = f.read()
        print(f"Markdown content for {stem}:")
        print(markdown_content)

        pg_num = res.document.num_pages()
        print("")
        inference_time = time.time() - start_time
        print(
            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
        )

        print("================================================")
        print("done!")
        print("================================================")

        # BUG FIX: the result dict was previously wrapped in a one-element
        # list before appending, so main() crashed on res["content"].
        # Append the dict itself.
        results.append({"filename": f"{stem}.md", "content": markdown_content})

    return {"status": "success", "file_count": len(results), "results": results}


@app.local_entrypoint()
def main(
    pdf_url: str = "https://arxiv.org/pdf/2501.12948",
    prompt_text: str | None = None,
):
    """Local entrypoint: convert ``pdf_url`` remotely, save markdown locally.

    Args:
        pdf_url: PDF to convert; the output file is named after its last
            path segment, under ``output/``.
        prompt_text: NOTE(review): currently unused — ``process_document``
            does not accept a prompt parameter. Kept only so existing CLI
            invocations that pass it keep working; wire it through or remove
            once confirmed.
    """
    os.makedirs("output", exist_ok=True)
    output_file = os.path.join("output", pdf_url.split("/")[-1] + ".md")

    # Call the remote function
    print("Calling remote function...")
    result = process_document.remote(
        pdf_url=pdf_url,
    )
    print("Remote function called successfully")

    # Save results locally
    print(f"Saving results to {output_file}...")

    # Accept every result shape: a top-level {"content": ...}, a list of
    # {"filename", "content"} dicts, or (historical bug in process_document)
    # a list of one-element lists of such dicts.
    if "content" in result:
        content = result["content"]
    else:
        content = ""
        for entry in result.get("results", []):
            items = entry if isinstance(entry, list) else [entry]
            for item in items:
                content += item["content"] + "\n\n"

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"Results saved to {output_file}")

Run the script:

uv init

uv add modal

uv run modal token new

uv run modal deploy docling_modal.py

# Download SmolDocling Model
uv run modal run docling_modal.py::download_smoldocling

uv run modal run docling_modal.py --pdf-url="https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10003-10041.pdf"

Metadata

Metadata

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions