-
Notifications
You must be signed in to change notification settings - Fork 3.7k
Open
Labels
Description
I tried SmolDocling on the JFK documents https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10003-10041.pdf
and it failed, while Gemini's fast model easily converted it to Markdown.
Outputs:
Predicted page in DOCTAGS:
Assistant: <doctag><page_header><loc_10><loc_2><loc_63><loc_8>104-10003-100401</page_header>
<page_header><loc_85><loc_3><loc_457><loc_8>2025 RELEASE UNDER THE PRESIDENT JOHN F. KENNEDY ASSASSINATION RECORDS ACT OF 1992</page_header>
<picture><loc_39><loc_19><loc_131><loc_38><logo></picture>
<picture><loc_39><loc_121><loc_131><loc_146><logo></picture>
<picture><loc_39><loc_153><loc_131><loc_178><logo></picture>
<picture><loc_39><loc_185><loc_131><loc_209><logo></picture>
<picture><loc_39><loc_216><loc_131><loc_232><logo></picture>
<picture><loc_39><loc_239><loc_131><loc_264><logo></picture>
<picture><loc_39><loc_271><loc_131><loc_287><logo></picture>
.....Remote function called successfully
Saving results to output/104-10003-10041.pdf.md...
104-10003-100401
2025 RELEASE UNDER THE PRESIDENT JOHN F. KENNEDY ASSASSINATION RECORDS ACT OF 1992
<!-- image -->
<!-- image -->
<!-- image -->

Pipeline code for Modal.com:
# Modal pipeline for converting PDFs/images to Markdown with docling's
# SmolDocling VLM pipeline. Module level defines the Modal app, a persistent
# model volume, and the container image.
import os
import modal
app = modal.App("docling")
# Set up persistent volume to store models
models_vol = modal.Volume.from_name("docling-models", create_if_missing=True)
# Mount point of the model volume inside the container.
MODEL_DIR = "/models"
# Define container image with all required dependencies.
# NOTE: pip_install order is deliberate — packaging first, then torch,
# then the rest — do not reorder.
image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install("packaging") # Install packaging first
.pip_install("torch") # Install torch before flash-attn
.pip_install(
"docling[vlm]",
"docling_core",
"huggingface_hub[hf_transfer]==0.26.2",
"requests", # For URL downloading
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
)
@app.function(
    image=image,
    volumes={MODEL_DIR: models_vol},
)
def download_smoldocling():
    """Download the SmolDocling model into the persistent Modal volume.

    Best-effort: download errors are logged with a traceback rather than
    raised, and the volume is committed regardless so that any partial
    progress persists for other functions.
    """
    print("Downloading SmolDocling model...")
    try:
        import huggingface_hub

        model_name = "ds4sd/SmolDocling-256M-preview"
        print(f"Downloading model {model_name} using huggingface_hub...")
        cache_dir = os.path.join(MODEL_DIR, "smoldocling")
        os.makedirs(cache_dir, exist_ok=True)
        # local_dir alone determines where the files land; the original also
        # passed cache_dir, a redundant/deprecated combination.
        huggingface_hub.snapshot_download(
            repo_id=model_name,
            local_dir=cache_dir,
        )
        print(f"SmolDocling model downloaded to {cache_dir}")
        # Symlink under the hub-style "org--model" name so docling can find it.
        docling_model_dir = os.path.join(MODEL_DIR, "ds4sd--SmolDocling-256M-preview")
        if not os.path.exists(docling_model_dir):
            os.makedirs(os.path.dirname(docling_model_dir), exist_ok=True)
            try:
                os.symlink(cache_dir, docling_model_dir)
                print(f"Created symlink from {cache_dir} to {docling_model_dir}")
            except FileExistsError:
                # Another worker may have created it between the check and here.
                print(f"Symlink from {cache_dir} to {docling_model_dir} already exists")
        # List contents without shelling out to `ls`.
        print("Model directory contents:")
        for entry in sorted(os.listdir(cache_dir)):
            print(entry)
    except Exception as e:
        print(f"Error downloading SmolDocling model: {e}")
        import traceback

        traceback.print_exc()
    # Commit the changes to the volume so other containers see the files.
    models_vol.commit()
    print("Changes committed to volume")
@app.function(
    image=image,
    volumes={MODEL_DIR: models_vol},
    gpu="L4",
)
def process_document(
    # Input source - exactly one of these should be provided
    image_files=None,
    pdf_url=None,
):
    """Convert a PDF (by URL) or a list of image files to Markdown.

    Uses docling's experimental VLM pipeline with SmolDocling-256M and
    writes HTML/JSON/Markdown artifacts under ./scratch in the container.

    Args:
        image_files: Optional list of image file paths to process.
        pdf_url: Optional URL of a PDF document to download and process.

    Returns:
        Dict with keys ``status``, ``file_count`` and ``results``, where
        ``results`` is a flat list of ``{"filename": ..., "content": ...}``
        dicts, one per converted source.
    """
    import time
    from pathlib import Path

    import requests
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        AcceleratorDevice,
        VlmPipelineOptions,
        smoldocling_vlm_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline
    from docling_core.types.doc import DocItemLabel, ImageRefMode
    from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

    # Experimental VLM pipeline: page images are fed to a vision-language
    # model whose DOCTAGS output is parsed into a DoclingDocument.
    pipeline_options = VlmPipelineOptions()
    # If True, text from the PDF backend replaces the generated text.
    pipeline_options.force_backend_text = False
    # GPU execution without flash_attention2 (dependency removed from image).
    pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    pipeline_options.accelerator_options.cuda_use_flash_attention2 = False
    # Default VLM: SmolDocling-256M. Alternatives (MLX build, granite vision)
    # can be substituted here.
    pipeline_options.vlm_options = smoldocling_vlm_conversion_options

    # Same VLM pipeline for both PDF and image inputs.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    out_path = Path("scratch")
    out_path.mkdir(parents=True, exist_ok=True)

    if pdf_url:
        # Download the PDF; fail loudly on HTTP errors rather than feeding
        # an HTML error page to the converter.
        response = requests.get(pdf_url, timeout=120)
        response.raise_for_status()
        pdf_path = out_path / "input.pdf"
        pdf_path.write_bytes(response.content)
        sources = [pdf_path]
    else:
        sources = image_files or []

    results = []
    for source in sources:
        start_time = time.time()
        print("================================================")
        print(f"Processing... {source}")
        print("================================================")
        print("")
        res = converter.convert(source)
        print("")
        print(res.document.export_to_markdown())
        for page in res.pages:
            print("")
            print("Predicted page in DOCTAGS:")
            print(page.predictions.vlm_response.text)

        stem = res.input.file.stem
        res.document.save_as_html(
            filename=out_path / f"{stem}.html",
            image_mode=ImageRefMode.REFERENCED,
            labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
        )
        # BUGFIX: save_as_json previously wrote to the ".md" path, which
        # save_as_markdown then silently overwrote; JSON now gets its own
        # ".json" file (this also replaces the redundant manual json.dumps).
        res.document.save_as_json(
            out_path / f"{stem}.json",
            image_mode=ImageRefMode.PLACEHOLDER,
        )
        res.document.save_as_markdown(
            out_path / f"{stem}.md",
            image_mode=ImageRefMode.PLACEHOLDER,
        )

        markdown_content = (out_path / f"{stem}.md").read_text()
        print(f"Markdown content for {stem}:")
        print(markdown_content)

        pg_num = res.document.num_pages()
        print("")
        inference_time = time.time() - start_time
        print(
            f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
        )
        print("================================================")
        print("done!")
        print("================================================")
        # BUGFIX: previously a one-element *list* was appended, producing a
        # list of lists that broke the caller's `res["content"]` lookup.
        results.append({"filename": f"{stem}.md", "content": markdown_content})

    return {"status": "success", "file_count": len(results), "results": results}
@app.local_entrypoint()
def main(
    pdf_url: str = "https://arxiv.org/pdf/2501.12948",
    prompt_text: str | None = None,
):
    """Run the remote conversion and save the combined Markdown locally.

    Args:
        pdf_url: URL of the PDF to convert.
        prompt_text: Accepted for CLI compatibility. NOTE(review): it is not
            forwarded to the remote function, which uses its model default —
            confirm whether this parameter should be wired through.
    """
    os.makedirs("output", exist_ok=True)
    output_file = os.path.join("output", pdf_url.split("/")[-1] + ".md")

    # Call the remote function
    print("Calling remote function...")
    result = process_document.remote(
        pdf_url=pdf_url,
    )
    print("Remote function called successfully")

    # Save results locally
    print(f"Saving results to {output_file}...")
    if "content" in result:
        # Legacy single-document shape: {"content": ...}.
        content = result["content"]
    else:
        # Combine per-source results. BUGFIX: tolerate both a flat list of
        # dicts and the older list-of-lists shape that process_document
        # returned, which made `res["content"]` raise TypeError here.
        pieces = []
        for item in result["results"]:
            entries = item if isinstance(item, list) else [item]
            pieces.extend(entry["content"] for entry in entries)
        content = "".join(piece + "\n\n" for piece in pieces)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Results saved to {output_file}")

Run the script:
uv init
uv add modal
uv run modal token new
uv run modal deploy docling_modal.py
# Download SmolDocling Model
uv run modal run docling_modal.py::download_smoldocling
uv run modal run docling_modal.py --pdf-url="https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10003-10041.pdf"

Reactions are currently unavailable