# DeepSeek OCR Job Pipeline

This notebook runs a three-stage OCR pipeline on Hugging Face Jobs:

1. **Extract** – Run DeepSeek OCR over a dataset, save Markdown and crop detected figures
2. **Describe** – Generate captions for extracted figures  
3. **Assemble** – Enrich Markdown with figure captions

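All stages read from and write to a single HF dataset repository (`DATASET_REPO`); the job code itself is uploaded to a separate repository (`CODE_REPO`). Each stage loads the dataset, processes it, and pushes updates back.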


In [14]:
import os
import shutil
import tempfile
import time
from pathlib import Path

from huggingface_hub import HfApi, create_repo, fetch_job_logs, get_token, inspect_job, run_uv_job, whoami
from huggingface_hub._jobs_api import JobInfo, JobStage

In [15]:
# Configuration: every value a reader might want to tune lives in this cell.

# Job runtime: container image, hardware flavor and job timeout.
HUB_IMAGE = "vllm/vllm-openai:v0.12.0"
HARDWARE = "a100-large"
TIMEOUT = "3h"

# Source dataset and the sample cap forwarded to the extract stage.
SOURCE_DATASET = "HuggingFaceM4/FineVision"
SOURCE_CONFIG = "olmOCR-mix-0225-documents"
MAX_SAMPLES = 50

# Both Hub repositories are namespaced under the authenticated user.
USERNAME = whoami()["name"]
CODE_REPO = f"{USERNAME}/deepseek-ocr-job-code"
DATASET_REPO = f"{USERNAME}/deepseek-ocr-dataset"

# Echo the resolved settings so the run is self-describing.
print(
    f"Code: {CODE_REPO} | Dataset: {DATASET_REPO}",
    f"Source: {SOURCE_DATASET}/{SOURCE_CONFIG} ({MAX_SAMPLES} samples)",
    sep="\n",
)

Code: florentgbelidji/deepseek-ocr-job-code | Dataset: florentgbelidji/deepseek-ocr-dataset
Source: HuggingFaceM4/FineVision/olmOCR-mix-0225-documents (50 samples)


In [None]:
# Base environment for all stages. `launch` merges the stage-specific
# variables on top of this dict before starting a job.
BASE_ENV = {
    # vLLM
    "MODEL_ID": "deepseek-ai/DeepSeek-OCR",
    "SERVED_MODEL_NAME": "deepseek-ocr",
    "HOST": "0.0.0.0",
    "PORT": "8000",
    "MAX_MODEL_LEN": "8192",
    "GPU_MEMORY_UTILIZATION": "0.90",
    "TENSOR_PARALLEL_SIZE": "1",
    # Code
    "JOB_CODE_REPO": CODE_REPO,
    "JOB_CODE_REVISION": "main",
    "JOB_CODE_LOCAL_DIR": "/tmp/deepseek-ocr-job-code",
    # Auth: get_token() honours the HF_TOKEN environment variable and falls
    # back to the token cached by a previous login, so the jobs still receive
    # credentials when the variable is not exported in the notebook kernel
    # (whoami() above succeeds in that situation too).
    "HF_TOKEN": get_token() or "",
    # Prompts
    "DOC_PROMPT": "<image>\n<|grounding|>Convert this document to Markdown.",
    "DOC_MAX_TOKENS": "4096",
    "DOC_TEMPERATURE": "0.1",
    "FIGURE_PROMPT": "<image>\nDescribe this image in detail.",
    "FIGURE_MAX_TOKENS": "512",
    "FIGURE_TEMPERATURE": "0.6",
}

In [17]:
# Upload code to HF Hub
CODE_PATHS = [
    Path("ds-batch-ocr.py"),
    Path("hf_job_runner.py"),
    Path("ds_batch_ocr"),
]

api = HfApi()
# Both repositories are dataset-type repos; the code repo is later addressed
# through a /datasets/ URL.
create_repo(repo_id=CODE_REPO, repo_type="dataset", exist_ok=True)
create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)

# Stage the files in a temporary directory that is deleted as soon as the
# upload finishes; mkdtemp() would leave one directory behind per run.
with tempfile.TemporaryDirectory(prefix="job-code-") as tmp_dir:
    bundle_dir = Path(tmp_dir)
    for path in CODE_PATHS:
        # Relative entries are resolved against the notebook's working directory.
        src = path if path.is_absolute() else Path.cwd() / path
        if src.is_dir():
            # Keep local bytecode caches out of the uploaded bundle.
            shutil.copytree(
                src,
                bundle_dir / path.name,
                dirs_exist_ok=True,
                ignore=shutil.ignore_patterns("__pycache__", "*.pyc"),
            )
        else:
            shutil.copy2(src, bundle_dir / path.name)

    api.upload_folder(folder_path=str(bundle_dir), repo_id=CODE_REPO, repo_type="dataset")

print(f"Uploaded code to {CODE_REPO}")


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded code to florentgbelidji/deepseek-ocr-job-code


In [18]:
# Helper functions

# URL of the runner script handed to run_uv_job. The revision comes from
# BASE_ENV so it stays in sync with the JOB_CODE_REVISION value sent to the jobs.
CODE_URL = (
    f"https://huggingface.co/datasets/{CODE_REPO}"
    f"/resolve/{BASE_ENV['JOB_CODE_REVISION']}/hf_job_runner.py"
)

def launch(stage: str, flavor: str, env: dict) -> JobInfo:
    """Start one pipeline stage as a Hugging Face uv job.

    Args:
        stage: Stage name; passed to the job as ``PIPELINE_STAGE``.
        flavor: Hardware flavor the job runs on.
        env: Stage-specific environment variables. They override entries of
            ``BASE_ENV`` with the same key.

    Returns:
        The ``JobInfo`` returned by ``run_uv_job``.
    """
    full_env = {**BASE_ENV, **env, "PIPELINE_STAGE": stage}
    # Pass the token through `secrets` instead of the plain job environment so
    # it is not stored with the job's regular variables. An empty token is
    # dropped rather than forwarded.
    token = full_env.pop("HF_TOKEN", "")
    secrets = {"HF_TOKEN": token} if token else None
    job = run_uv_job(
        CODE_URL,
        image=HUB_IMAGE,
        flavor=flavor,
        env=full_env,
        secrets=secrets,
        timeout=TIMEOUT,
    )
    print(f"Launched {stage}: {job.url}")
    return job

def wait(job: JobInfo, poll: int = 60) -> JobInfo:
    """Poll a job until it reaches a terminal stage.

    The current stage is printed on every poll. The function returns for any
    terminal stage, not only success, so callers should inspect
    ``status.stage`` on the result.

    Args:
        job: The job to watch (only ``job.id`` is used).
        poll: Seconds to sleep between two status checks.

    Returns:
        The ``JobInfo`` from the last ``inspect_job`` call.
    """
    # Stop only on stages that can no longer change. Testing for
    # "not RUNNING" would also return for a job that has not started yet.
    terminal_stages = ("COMPLETED", "CANCELED", "ERROR", "DELETED")
    while True:
        info = inspect_job(job_id=job.id)
        stage = info.status.stage
        print(f"  {job.id}: {stage}")
        if stage in terminal_stages:
            return info
        time.sleep(poll)

def logs(job: JobInfo, tail: int = 100):
    """Print the last ``tail`` log lines of a job, one per output line.

    Args:
        job: The job whose logs are fetched (uses ``job.id`` and
            ``job.owner.name``).
        tail: Maximum number of trailing lines to print; values <= 0 print
            nothing.
    """
    if tail <= 0:
        # lines[-0:] would be the whole list, not an empty one.
        return
    lines = list(fetch_job_logs(job_id=job.id, namespace=job.owner.name))
    for line in lines[-tail:]:
        # Normalise line endings so entries are not glued together on one line.
        print(line.rstrip("\r\n"))

In [19]:
# Stage 1: Extract
# Stage-specific settings; launch() layers them on top of BASE_ENV.
extract_env = dict(
    DATASET_NAME=SOURCE_DATASET,
    DATASET_CONFIG=SOURCE_CONFIG,
    DATASET_SPLIT="train",
    MAX_SAMPLES=str(MAX_SAMPLES),
    OUTPUT_DIR="./outputs/extract",
    EXTRACT_BATCH_SIZE="256",
    EXTRACT_MAX_CONCURRENCY="8",
    HF_REPO_ID=DATASET_REPO,
    HF_PATH_IN_REPO="outputs/extract",
)
stage1 = launch("extract", HARDWARE, extract_env)


Launched extract: https://huggingface.co/jobs/florentgbelidji/693ac3ce1a39f67af5a41bdb


In [20]:
stage1_done = wait(stage1)
# wait() returns the job info without checking the outcome, so confirm success
# before announcing it. Raising here also stops "Run All" from launching the
# next stage on top of a failed extract.
if stage1_done.status.stage != "COMPLETED":
    raise RuntimeError(
        f"Extract job {stage1.id} is {stage1_done.status.stage}, expected COMPLETED; "
        "inspect it with logs(stage1_done)"
    )
print(f"Extract complete: {DATASET_REPO}")


  693ac3ce1a39f67af5a41bdb: RUNNING
  693ac3ce1a39f67af5a41bdb: RUNNING
  693ac3ce1a39f67af5a41bdb: RUNNING
  693ac3ce1a39f67af5a41bdb: RUNNING
  693ac3ce1a39f67af5a41bdb: COMPLETED
Extract complete: florentgbelidji/deepseek-ocr-dataset


In [22]:
# Show the trailing 50 log lines of the extract job.
logs(job=stage1_done, tail=50)




In [23]:
# Stage 2: Describe
# Reads from and writes back to the same dataset repository.
describe_env = dict(
    OUTPUT_DIR="./outputs/describe",
    DESCRIBE_BATCH_SIZE="8",
    DESCRIBE_MAX_CONCURRENCY="4",
    SOURCE_REPO_ID=DATASET_REPO,
    HF_REPO_ID=DATASET_REPO,
)
stage2 = launch("describe", HARDWARE, describe_env)


Launched describe: https://huggingface.co/jobs/florentgbelidji/693ac5871a39f67af5a41be0


In [24]:
stage2_done = wait(stage2)
# wait() returns the job info without checking the outcome, so confirm success
# before announcing it. Raising here also stops "Run All" from launching the
# assemble stage on top of a failed describe.
if stage2_done.status.stage != "COMPLETED":
    raise RuntimeError(
        f"Describe job {stage2.id} is {stage2_done.status.stage}, expected COMPLETED; "
        "inspect it with logs(stage2_done)"
    )
print(f"Describe complete: {DATASET_REPO}")


  693ac5871a39f67af5a41be0: RUNNING
  693ac5871a39f67af5a41be0: RUNNING
  693ac5871a39f67af5a41be0: RUNNING
  693ac5871a39f67af5a41be0: RUNNING
  693ac5871a39f67af5a41be0: COMPLETED
Describe complete: florentgbelidji/deepseek-ocr-dataset


In [25]:
# Show the trailing 50 log lines of the describe job.
logs(job=stage2_done, tail=50)




In [27]:
# Stage 3: Assemble
# This stage is launched on a CPU flavor instead of HARDWARE.
assemble_env = dict(
    SOURCE_REPO_ID=DATASET_REPO,
    HF_REPO_ID=DATASET_REPO,
    HF_COMMIT_MESSAGE="Add assembled documents with figure captions",
)
stage3 = launch("assemble", "cpu-upgrade", assemble_env)


Launched assemble: https://huggingface.co/jobs/florentgbelidji/693ac781c67c9f186cfe22bc




In [28]:
stage3_done = wait(stage3)
# wait() returns the job info without checking the outcome, so only declare the
# pipeline complete when the assemble job actually succeeded.
if stage3_done.status.stage != "COMPLETED":
    raise RuntimeError(
        f"Assemble job {stage3.id} is {stage3_done.status.stage}, expected COMPLETED; "
        "inspect it with logs(stage3_done)"
    )
print(f"Pipeline complete! Dataset: https://huggingface.co/datasets/{DATASET_REPO}")


  693ac781c67c9f186cfe22bc: RUNNING
  693ac781c67c9f186cfe22bc: RUNNING
  693ac781c67c9f186cfe22bc: RUNNING
  693ac781c67c9f186cfe22bc: RUNNING
  693ac781c67c9f186cfe22bc: COMPLETED
Pipeline complete! Dataset: https://huggingface.co/datasets/florentgbelidji/deepseek-ocr-dataset


In [29]:
# Show the trailing 50 log lines of the assemble job.
logs(job=stage3_done, tail=50)

Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.52s/ shards]2025-12-11 05:34:08,844 | INFO | httpx | HTTP Request: GET https://huggingface.co/api/datasets/florentgbelidji/deepseek-ocr-dataset "HTTP/1.1 200 OK"2025-12-11 05:34:08,999 | INFO | httpx | HTTP Request: GET https://huggingface.co/api/datasets/florentgbelidji/deepseek-ocr-dataset/tree/7c711a21ad466896107bdd1732321a2192b1291b?recursive=true&expand=false "HTTP/1.1 200 OK"2025-12-11 05:34:09,011 | INFO | httpx | HTTP Request: POST https://huggingface.co/api/validate-yaml "HTTP/1.1 200 OK"2025-12-11 05:34:09,247 | INFO | httpx | HTTP Request: POST https://huggingface.co/api/datasets/florentgbelidji/deepseek-ocr-dataset/preupload/main "HTTP/1.1 200 OK"2025-12-11 05:34:11,010 | INFO | httpx | HTTP Request: POST https://huggingface.co/api/datasets/florentgbelidji/deepseek-ocr-dataset/commit/main "HTTP/1.1 200 OK"2025-12-11 05:34:11,010 | INFO | ds_batch_ocr.stages | Assemble complete/deepseek-ocr-dataset.py "HTTP/

In [None]:
# Done!
