# DeepSeek OCR Job Pipeline

This notebook submits a three-stage pipeline to Hugging Face Jobs using the `ds-batch-ocr.py` driver:

1. **extract** – run DeepSeek OCR over a dataset batch, save Markdown, and crop detected figures.
2. **describe** – generate captions for the extracted figure crops.
3. **assemble** – combine documents and figure descriptions into polished Markdown bundles.

Each stage runs as an independent Job so you can mix hardware, retry specific steps, or parallelize work. The helper utilities below keep track of job IDs, and stages share artifacts through Hugging Face Hub repositories (the configured artifact repository, or the job's own repository when none is configured).


In [None]:
import os
from pathlib import Path
import time
from typing import Dict, Optional

from huggingface_hub import fetch_job_logs, inspect_job, run_uv_job, whoami
from huggingface_hub._jobs_api import JobInfo, JobStage



In [None]:
# ---- Pipeline configuration ----
# Runtime settings passed to every submitted job: container image, hardware flavor, timeout.
HUB_IMAGE = "vllm/vllm-openai:nightly-758ea2e980a1eeacec6097bfd98bd0a7c8fb864a"
HARDWARE_FLAVOR = "a100-large"
JOB_TIMEOUT = "3h"

# One output directory per stage, all rooted at PIPELINE_ROOT.
PIPELINE_ROOT = "/workspace/pipeline"
STAGE1_DIR, STAGE2_DIR, STAGE3_DIR = (
    f"{PIPELINE_ROOT}/stage{number}" for number in (1, 2, 3)
)

# Name of the authenticated Hub account; the repo IDs below are namespaced under it.
USERNAME = whoami()["name"]

# Hub repository that receives the job code, and the local directory name given to the job.
CODE_REPO_TYPE = "dataset"
CODE_REPO_REVISION = "main"
CODE_REPO_ID = "{}/deepseek-ocr-job-code-final".format(USERNAME)
CODE_LOCAL_DIR = "/tmp/deepseek-ocr-job-code"

# Environment variables included in every stage (see launch_stage).
BASE_ENV: Dict[str, str] = dict(
    MODEL_ID="deepseek-ai/DeepSeek-OCR",
    SERVED_MODEL_NAME="deepseek-ocr",
    HOST="0.0.0.0",
    PORT="8000",
    TENSOR_PARALLEL_SIZE="1",
    MAX_MODEL_LEN="8192",
    GPU_MEMORY_UTILIZATION="0.90",
    JOB_CODE_REPO=CODE_REPO_ID,
    JOB_CODE_REVISION=CODE_REPO_REVISION,
    JOB_CODE_LOCAL_DIR=CODE_LOCAL_DIR,
    PIPELINE_BATCH_SIZE="4",
    PIPELINE_MAX_CONCURRENCY="4",
    PIPELINE_TOKEN_MARGIN="512",
    PIPELINE_ARTIFACT_STRATEGY="hf-hub",
    # Empty string when HF_TOKEN is not set in the notebook's environment.
    HF_TOKEN=os.getenv("HF_TOKEN", ""),
)

# Input dataset selection (merged into the stage 1 environment).
DATASET_ENV: Dict[str, str] = dict(
    DATASET_NAME="HuggingFaceM4/FineVision",
    DATASET_CONFIG="olmOCR-mix-0225-documents",
    DATASET_SPLIT="train",
    MAX_SAMPLES="10",
)

# Prompts and sampling settings for document conversion and figure description.
PROMPT_ENV: Dict[str, str] = dict(
    DOC_PROMPT="<image>\n<|grounding|>Convert this document to Markdown.",
    DOC_MAX_TOKENS="4096",
    DOC_TEMPERATURE="0.4",
    FIGURE_PROMPT="<image>\nDescribe this image in detail. If it's a table or a graph, don't parse it, just describe it in simple words or return  `graph`",
    FIGURE_MAX_TOKENS="512",
    FIGURE_TEMPERATURE="0.",
)

# All three stages use the same artifact repository and branch, each under its own path.
ARTIFACT_REPO_ID = "{}/deepseek-ocr-artifacts-test-Z".format(USERNAME)
ARTIFACT_REPO_BRANCH = None

STAGE1_UPLOAD_REPO = STAGE2_UPLOAD_REPO = ASSEMBLED_DATASET_REPO = ARTIFACT_REPO_ID
STAGE1_UPLOAD_BRANCH = STAGE2_UPLOAD_BRANCH = ASSEMBLED_DATASET_BRANCH = ARTIFACT_REPO_BRANCH

STAGE1_UPLOAD_PATH_IN_REPO = "stage1"
STAGE2_UPLOAD_PATH_IN_REPO = "stage2"
ASSEMBLED_DATASET_PATH_IN_REPO = "stage3"

STAGE1_UPLOAD_COMMIT_MESSAGE = None
STAGE2_UPLOAD_COMMIT_MESSAGE = None
ASSEMBLED_DATASET_COMMIT_MESSAGE = "Add assembled DeepSeek OCR batch"

# Echo the resolved account name as the cell output.
USERNAME


In [None]:
# Show the base environment for a quick sanity check. The Hub token is masked so
# the secret is not written into the notebook's saved cell output.
{key: ("***" if key == "HF_TOKEN" and value else value) for key, value in BASE_ENV.items()}

In [None]:
import shutil
import tempfile
from huggingface_hub import HfApi, create_repo

# Local script files and the package directory that together form the job code bundle.
CODE_PATHS = [
    Path("/home/ec2-user/aws-llm-lab") / entry
    for entry in ("ds-batch-ocr.py", "hf_job_runner.py", "ds_batch_ocr")
]

api = HfApi()
# Make sure both the code repo and the artifact repo exist (no-op when already present).
create_repo(repo_id=CODE_REPO_ID, repo_type=CODE_REPO_TYPE, exist_ok=True)
create_repo(repo_id=ARTIFACT_REPO_ID, repo_type="dataset", exist_ok=True)

# Stage everything in a fresh temporary directory so it can be uploaded in one commit.
bundle_dir = Path(tempfile.mkdtemp(prefix="job-code-"))
for path in CODE_PATHS:
    target = bundle_dir / path.name
    if not path.is_dir():
        # Plain file: copy it together with its metadata.
        shutil.copy2(path, target)
        continue
    # Directory: copy the whole tree under the same name.
    shutil.copytree(path, target, dirs_exist_ok=True)

api.upload_folder(
    repo_id=CODE_REPO_ID,
    repo_type=CODE_REPO_TYPE,
    folder_path=str(bundle_dir),
    commit_message="Sync DeepSeek OCR HF job code",
)

print(f"Uploaded code to {CODE_REPO_ID} ({CODE_REPO_TYPE})")
# Echo the staging directory so it can be inspected after the upload.
bundle_dir


In [None]:
from huggingface_hub import hf_hub_url

# Entry-point script that each job downloads and runs.
SCRIPT_PATH = Path("/home/ec2-user/aws-llm-lab/hf_job_runner.py")
if not SCRIPT_PATH.exists():
    raise FileNotFoundError(f"Job runner script not found at {SCRIPT_PATH}")

# Build the download URL from the configured repo type instead of hard-coding
# "/datasets/", so the URL stays correct if CODE_REPO_TYPE is changed.
CODE_RUN_URL = hf_hub_url(
    repo_id=CODE_REPO_ID,
    filename=SCRIPT_PATH.name,
    repo_type=CODE_REPO_TYPE,
    revision=CODE_REPO_REVISION or "main",
)
CODE_RUN_URL


In [None]:
def merged_env(*layers: Optional[Dict[str, str]]) -> Dict[str, str]:
    """Flatten several environment layers into a single string-valued dict.

    Layers are applied left to right, so a later layer overrides an earlier one
    for the same key. Empty or ``None`` layers are ignored. Entries whose value
    is ``None`` are skipped and do not remove a value set by an earlier layer.
    Every kept value is converted with ``str``.
    """
    return {
        name: str(setting)
        for mapping in layers
        if mapping
        for name, setting in mapping.items()
        if setting is not None
    }


def launch_stage(stage: str, env_overrides: Optional[Dict[str, str]] = None) -> JobInfo:
    """Submit one pipeline stage as a Hugging Face job.

    The job environment is ``BASE_ENV`` overlaid with ``PROMPT_ENV`` and then
    ``env_overrides``, plus ``PIPELINE_STAGE`` set to ``stage``.

    Args:
        stage: Stage name exported to the job as ``PIPELINE_STAGE``.
        env_overrides: Extra environment variables that take precedence over
            the base and prompt settings.

    Returns:
        The ``JobInfo`` returned by ``run_uv_job``.
    """
    env = merged_env(BASE_ENV, PROMPT_ENV, env_overrides)
    env["PIPELINE_STAGE"] = stage

    # Pass the Hub token as a job secret rather than a plain environment
    # variable, so it is not stored in the job's visible configuration.
    # An unset token (empty string) is dropped instead of being forwarded.
    token = env.pop("HF_TOKEN", "")
    secrets = {"HF_TOKEN": token} if token else None

    job = run_uv_job(
        CODE_RUN_URL,
        image=HUB_IMAGE,
        flavor=HARDWARE_FLAVOR,
        env=env,
        secrets=secrets,
        timeout=JOB_TIMEOUT,
    )
    print(f"Submitted stage '{stage}' job: {job.url}")
    return job


def wait_for_completion(job: JobInfo, poll_interval: int = 60) -> JobInfo:
    """Poll a submitted job until it reaches a terminal stage.

    Args:
        job: Handle of the job to monitor.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The last ``JobInfo`` fetched with ``inspect_job``. The caller should
        check its stage, since a terminal stage may also be an error or a
        cancellation.
    """
    # Wait for an explicit terminal stage instead of "anything that is not
    # RUNNING": a job that has not started yet would otherwise be reported as
    # finished on the first poll.
    # NOTE(review): a terminal stage missing from this set would poll forever;
    # confirm the list against the JobStage values of the installed version.
    terminal_stages = {"COMPLETED", "CANCELED", "ERROR", "DELETED"}

    print(f"Monitoring {job.url}")
    while True:
        info = inspect_job(job_id=job.id, namespace=job.owner.name)
        stage = info.status.stage
        # The stage may be a JobStage member or a raw string; compare by value.
        stage_value = getattr(stage, "value", stage)
        print(f"  -> {stage_value}", flush=True)
        if stage_value in terminal_stages:
            print(f"Job finished in state: {stage_value}")
            return info
        time.sleep(poll_interval)


def job_repo_id(job: JobInfo) -> str:
    """Return the per-job repository ID, formatted as ``jobs/<owner name>/<job id>``."""
    return "jobs/{}/{}".format(job.owner.name, job.id)


def stream_job_logs(job: JobInfo, tail: Optional[int] = None) -> None:
    """Print the logs of a job, optionally only the last ``tail`` lines.

    Args:
        job: Job whose logs are fetched with ``fetch_job_logs``.
        tail: Number of trailing lines to print. ``None`` prints every line as
            it arrives; ``0`` prints nothing.

    Raises:
        ValueError: If ``tail`` is negative.
    """
    from collections import deque

    if tail is not None and tail < 0:
        raise ValueError(f"tail must be non-negative, got {tail}")

    log_lines = fetch_job_logs(job_id=job.id, namespace=job.owner.name)
    # A bounded deque keeps only the last `tail` entries. Unlike `lines[-tail:]`,
    # it gives an empty result for tail=0 rather than the whole log, and it
    # never holds the full log in memory.
    selected = log_lines if tail is None else deque(log_lines, maxlen=tail)
    for line in selected:
        # Strip any trailing newline and let print() add exactly one, so
        # entries are neither run together nor double-spaced.
        print(line.rstrip("\n"))


In [None]:
# Stage 1 (extract): dataset selection plus output directory, batching and upload target.
stage1_env = merged_env(
    DATASET_ENV,
    {
        "STAGE1_OUTPUT_DIR": STAGE1_DIR,
        "EXTRACT_BATCH_SIZE": "64",
        "EXTRACT_MAX_CONCURRENCY": "4",
        "STAGE1_UPLOAD_REPO": STAGE1_UPLOAD_REPO,
        "STAGE1_UPLOAD_PATH_IN_REPO": STAGE1_UPLOAD_PATH_IN_REPO,
        "STAGE1_UPLOAD_BRANCH": STAGE1_UPLOAD_BRANCH,
    },
)

# Stage 2 (describe): directories, batching and upload target.
stage2_base_env = {
    "STAGE1_DIR": STAGE1_DIR,
    "STAGE2_OUTPUT_DIR": STAGE2_DIR,
    "DESCRIBE_BATCH_SIZE": "8",
    "DESCRIBE_MAX_CONCURRENCY": "4",
    # Mirror stage 1 and pass the upload repo too: later cells assume stage 2
    # artifacts are in STAGE2_UPLOAD_REPO.
    # NOTE(review): variable name assumed by analogy with STAGE1_UPLOAD_REPO —
    # confirm the driver reads it.
    "STAGE2_UPLOAD_REPO": STAGE2_UPLOAD_REPO,
    "STAGE2_UPLOAD_PATH_IN_REPO": STAGE2_UPLOAD_PATH_IN_REPO,
    "STAGE2_UPLOAD_BRANCH": STAGE2_UPLOAD_BRANCH,
}

# Stage 3 (assemble): directories of the two previous stages and its own output directory.
stage3_base_env = {
    "STAGE1_DIR": STAGE1_DIR,
    "STAGE2_DIR": STAGE2_DIR,
    "STAGE3_OUTPUT_DIR": STAGE3_DIR,
}

stage1_env


In [None]:
# Submit the "extract" stage with the stage 1 environment and show the returned job handle.
stage1_job = launch_stage(stage="extract", env_overrides=stage1_env)
stage1_job


In [None]:
# Wait for the stage 1 job, then record its ID, owner and the repo holding its artifacts.
stage1_info = wait_for_completion(stage1_job)
stage1_job_id, stage1_job_owner = stage1_info.id, stage1_info.owner.name
# Use the configured upload repo when set; otherwise fall back to the job's own repo.
stage1_repo = STAGE1_UPLOAD_REPO if STAGE1_UPLOAD_REPO else job_repo_id(stage1_info)
print("Stage 1 artifacts repo:", stage1_repo)


In [None]:
# Optional: inspect the tail of the stage 1 job logs (tail=200 limits output to the last 200 lines).
stream_job_logs(stage1_info, tail=200)


In [None]:
# Resume point: set the artifact repos without re-running the earlier stages.
# Use the configured upload repos rather than a hard-coded personal repo, so
# the later stages read from the repo the earlier stages were told to write to.
# If no upload repo is configured, keep any value already set in this session.
stage1_repo = STAGE1_UPLOAD_REPO or globals().get("stage1_repo")
stage2_repo = STAGE2_UPLOAD_REPO or globals().get("stage2_repo")

In [None]:
# Stage 2 environment: the base settings plus the stage 1 repo, artifact
# strategy and manifest name.
stage2_env = merged_env(
    stage2_base_env,
    dict(
        STAGE1_REPO_ID=stage1_repo,
        STAGE1_ARTIFACT_STRATEGY="hf-hub",
        STAGE1_MANIFEST_NAME="{}/manifest.json".format(STAGE1_UPLOAD_PATH_IN_REPO),
    ),
)

# Submit the "describe" stage and show the returned job handle.
stage2_job = launch_stage(stage="describe", env_overrides=stage2_env)
stage2_job


In [None]:
# Wait for the stage 2 job, then record its ID, owner and the repo holding its artifacts.
stage2_info = wait_for_completion(stage2_job)
stage2_job_id, stage2_job_owner = stage2_info.id, stage2_info.owner.name
# Use the configured upload repo when set; otherwise fall back to the job's own repo.
stage2_repo = STAGE2_UPLOAD_REPO if STAGE2_UPLOAD_REPO else job_repo_id(stage2_info)
print("Stage 2 artifacts repo:", stage2_repo)


In [None]:
# Optional: inspect the tail of the stage 2 job logs (tail=200 limits output to the last 200 lines).
stream_job_logs(stage2_info, tail=200)


In [None]:
# Stage 3 settings: the repos and manifests of stages 1 and 2, and the
# destination for the assembled dataset.
stage3_overrides = dict(
    STAGE1_REPO_ID=stage1_repo,
    STAGE1_ARTIFACT_STRATEGY="hf-hub",
    STAGE1_MANIFEST_NAME="{}/manifest.json".format(STAGE1_UPLOAD_PATH_IN_REPO),
    STAGE2_REPO_ID=stage2_repo,
    STAGE2_ARTIFACT_STRATEGY="hf-hub",
    STAGE2_MANIFEST_NAME="{}/figure_descriptions.json".format(STAGE2_UPLOAD_PATH_IN_REPO),
    ASSEMBLED_DATASET_REPO=ASSEMBLED_DATASET_REPO,
    ASSEMBLED_DATASET_PATH_IN_REPO=ASSEMBLED_DATASET_PATH_IN_REPO,
    ASSEMBLED_DATASET_COMMIT_MESSAGE=ASSEMBLED_DATASET_COMMIT_MESSAGE,
    ASSEMBLED_DATASET_BRANCH=ASSEMBLED_DATASET_BRANCH,
)

# merged_env drops None entries (such as an unset branch) and converts the rest to strings.
stage3_env = merged_env(stage3_base_env, stage3_overrides)

# Submit the "assemble" stage and show the returned job handle.
stage3_job = launch_stage(stage="assemble", env_overrides=stage3_env)
stage3_job


In [None]:
# Wait for the stage 3 job and report the repo holding the assembled bundle.
stage3_info = wait_for_completion(stage3_job)
# Use the configured dataset repo when set; otherwise fall back to the job's own repo.
final_repo = ASSEMBLED_DATASET_REPO if ASSEMBLED_DATASET_REPO else job_repo_id(stage3_info)
print("Final bundle repo:", final_repo)


In [None]:
# Optional: inspect the tail of the stage 3 job logs (tail=200 limits output to the last 200 lines).
stream_job_logs(stage3_info, tail=200)


In [None]:
# Summary of the repo used for each stage's output, shown as the cell result.
dict(stage1_repo=stage1_repo, stage2_repo=stage2_repo, stage3_repo=final_repo)
