Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
117 commits
Select commit Hold shift + click to select a range
00e8a2c
chore: remove GitHub workflows
fffoivos Nov 30, 2025
269cabf
docs: remove PyPI badge
fffoivos Nov 30, 2025
b82d04e
fix chunk merging
fffoivos Jan 25, 2026
6bcde8b
Fix editable install by switching root build backend to setuptools
fffoivos Mar 4, 2026
ab87731
Simplify OCR stack around DeepSeek
fffoivos Mar 9, 2026
1adac28
Merge remote-tracking branch 'origin/development' into development
fffoivos Mar 9, 2026
83f7bf2
Add GitHub Pages docs workflow
fffoivos Mar 9, 2026
1bf4261
Fix docs links for Pages build
fffoivos Mar 9, 2026
79cc99c
docs: map pipeline concepts to implementation
fffoivos Mar 9, 2026
379b8f0
Handle HTML download interstitials
fffoivos Mar 17, 2026
aca4dbb
Add browser-gated download mode
fffoivos Mar 17, 2026
96241f9
docs: document pipeline artifact contract and runtime outputs
adidev001 Mar 20, 2026
c900b42
Merge pull request #93 from adidev001/docs/pipeline-artifact-contract
fffoivos Mar 23, 2026
00aed53
Upgrade Docling and simplify OCR runtime
fffoivos Mar 24, 2026
efd1698
add multi-worker deepseek gpu sharding
fffoivos Mar 29, 2026
8ed469b
add deepseek throughput tuning controls
fffoivos Mar 29, 2026
b749225
fallback to eager when deepseek sdpa is unsupported
fffoivos Mar 29, 2026
864b0ea
fix deepseek plain ocr crop defaults
fffoivos Mar 29, 2026
b319ae5
add deepseek max token cap control
fffoivos Mar 29, 2026
2635c0c
add deepseek generation guards and page metrics
fffoivos Mar 29, 2026
4536e0e
Add DeepSeek OCR speed controls and sharding
fffoivos Mar 30, 2026
0ebabe7
Update DeepSeek runtime to Torch 2.9.1 cu130
fffoivos Mar 30, 2026
502f8bc
Add vLLM DeepSeek OCR runtime
fffoivos Mar 30, 2026
cbeb638
Add DeepSeek markdown repair pipeline
fffoivos Mar 30, 2026
5ad8620
Add DeepSeek pipeline benchmark harness
fffoivos Mar 30, 2026
41b983e
Document DeepSeek pipeline benchmark results
fffoivos Mar 30, 2026
9179062
Merge branch 'codex/docling-281' into development
fffoivos Mar 30, 2026
0a86323
Harden DeepSeek repair classification
fffoivos Mar 30, 2026
f5af409
Merge branch 'codex/docling-281' into development
fffoivos Mar 30, 2026
3038fa8
Update DeepSeek benchmark note
fffoivos Mar 30, 2026
ec61ddb
Merge remote-tracking branch 'origin/codex/docling-281' into merge-co…
fffoivos Mar 30, 2026
6ab1e49
Improve DeepSeek scheduling and standardize defaults
fffoivos Mar 31, 2026
94e6ee2
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
f59bd96
Add OpenArchives OCR rollout plan
fffoivos Mar 31, 2026
2f846a7
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
39fe1c1
Add OpenArchives OCR shard and merge tooling
fffoivos Mar 31, 2026
fd7ce89
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
1371040
Record OpenArchives parquet recovery result
fffoivos Mar 31, 2026
c08ad49
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
151105c
Add OpenArchives OCR enrichment manifest tooling
fffoivos Mar 31, 2026
170d398
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
4e506d1
Add OpenArchives OCR node runner
fffoivos Mar 31, 2026
f056abf
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
b4086f2
Align DeepSeek runtime pins with validated OCR nodes
fffoivos Mar 31, 2026
372de21
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
03c2823
Update DeepSeek extra to validated vLLM stack
fffoivos Mar 31, 2026
fda3c63
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
5d2a386
Add host-aware OpenArchives download policy overrides
fffoivos Mar 31, 2026
c5d178e
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
fde2967
Add OpenArchives download probe and policy
fffoivos Mar 31, 2026
c769448
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
36b7666
Tighten OpenArchives download scheduling defaults
fffoivos Mar 31, 2026
97a55e1
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
9514618
Add OpenArchives download freeze runner
fffoivos Mar 31, 2026
fbaf892
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
5f9dbd5
Add OpenArchives HF shard refresh tool
fffoivos Mar 31, 2026
31df75b
Add resumable OpenArchives PDF staging puller
fffoivos Mar 31, 2026
8caaafc
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
7363ab2
Merge remote-tracking branch 'origin/development' into tmp-transfer-m…
fffoivos Mar 31, 2026
8208898
Fix OpenArchives HF refresh card totals
fffoivos Mar 31, 2026
ee40e85
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
b966086
Prioritize staged OA pulls from dynamic unreachable lists
fffoivos Mar 31, 2026
2e053ca
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
e23d4f1
Add rsync transport for staged OA PDF pulls
fffoivos Mar 31, 2026
5528f61
Merge branch 'codex/docling-281' into development
fffoivos Mar 31, 2026
93f5d12
Add cutoff-based OpenArchives OCR sharding
fffoivos Mar 31, 2026
dbe14b7
Add priority-only mode for staged OA pulls
fffoivos Apr 1, 2026
27bac46
Merge branch 'codex/docling-281' into development
fffoivos Apr 1, 2026
e38b189
Document OA single-machine download and supplement runbook
fffoivos Apr 1, 2026
94ec617
Revert "Document OA single-machine download and supplement runbook"
fffoivos Apr 1, 2026
63a9a67
Harden OpenArchives PDF download validation
fffoivos Apr 1, 2026
d143e60
Merge branch 'codex/docling-281' into development
fffoivos Apr 1, 2026
489698e
deepseek reliability hardening
fffoivos Apr 2, 2026
f584ab1
Pack repair batches and preserve OCR text continuity
fffoivos Apr 2, 2026
a1f0eba
Auto-wire CUDA wheel libs for DeepSeek workers
fffoivos Apr 3, 2026
c84769a
Fix DeepSeek CUDA wheel lib discovery for venv symlinks
fffoivos Apr 3, 2026
c320790
Add DeepSeek setup report and OCR shard unification
fffoivos Apr 3, 2026
d565554
Keep runtime report inside the venv
fffoivos Apr 3, 2026
e1745ec
Harden fresh-node GlossAPI setup
fffoivos Apr 3, 2026
02f9e93
Harden OCR stage continuity and Docling tuning
fffoivos Apr 3, 2026
85da5db
Add extraction checkpoint benchmark harness
fffoivos Apr 3, 2026
5de066e
Fallback PDF page counting in benchmark harness
fffoivos Apr 3, 2026
4d46285
Use pypdfium2 for benchmark page counts
fffoivos Apr 3, 2026
4e49582
Pack multi-GPU extraction work by page budget
fffoivos Apr 3, 2026
442fe46
Document extraction queue tuning knobs
fffoivos Apr 3, 2026
91769e8
Tune Docling page batching for extraction
fffoivos Apr 3, 2026
9c30ab8
Add explicit extract benchmark tuning flags
fffoivos Apr 3, 2026
1f8204a
Respect extraction queue batch boundaries
fffoivos Apr 3, 2026
f29bb44
Apply Docling queue policy before worker startup
fffoivos Apr 3, 2026
bc16454
Add full pipeline sample checkpoint runner
fffoivos Apr 3, 2026
0ba41c7
Prefer repo DeepSeek runtime for OCR workers
fffoivos Apr 3, 2026
e9f73c2
Prefer validated DeepSeek runtimes
fffoivos Apr 3, 2026
3a3a401
Allow full pipeline checkpoints to resume
fffoivos Apr 3, 2026
a8d2b93
Normalize OCR targets and expose repair packing
fffoivos Apr 3, 2026
dbea9d1
Harden full-pipeline export retries
fffoivos Apr 3, 2026
6f29a28
Fix MkDocs navigation for OCR docs
fffoivos Apr 3, 2026
2a9ac30
refactor: split corpus OCR orchestration from runtime
fffoivos Apr 14, 2026
c13ba21
unify OCR cleaner and speed up shared repeats
fffoivos Apr 10, 2026
586a543
reuse filtered page views in OCR analyzer
fffoivos Apr 10, 2026
bbc643b
speed up hybrid OCR matching in Rust
fffoivos Apr 10, 2026
880fbe2
move shared OCR text normalization into Rust
fffoivos Apr 10, 2026
8b052f8
parallelize combined OCR document rendering
fffoivos Apr 10, 2026
ad55b88
speed up OCR document rendering with process pools
fffoivos Apr 10, 2026
f405195
reduce OCR process-pool overhead and tune defaults
fffoivos Apr 10, 2026
d00af17
speed up OCR table handling and page fast paths
fffoivos Apr 10, 2026
d5fafeb
skip irrelevant OCR passes on marker-free pages
fffoivos Apr 10, 2026
7cebe12
avoid redundant Rust prebuilds on OCR startup
fffoivos Apr 10, 2026
c15f4b3
refactor OCR table policy and document cleaner runtime
fffoivos Apr 10, 2026
bc51980
widen same-type OCR span merge gap
fffoivos Apr 11, 2026
51db020
add latex short atom block matching
fffoivos Apr 11, 2026
ba80f1e
extend latex short atom tails
fffoivos Apr 11, 2026
2b78f45
expand latex structural repeat coverage
fffoivos Apr 11, 2026
f208ae1
add OCR clean+debug match indexing
fffoivos Apr 11, 2026
fae3963
sanitize OCR spans against page bounds
fffoivos Apr 11, 2026
5d7a0ba
ocr: refresh cleaner metrics after reruns
fffoivos Apr 14, 2026
f6e711b
test: cover OCR clean and export handoff
fffoivos Apr 14, 2026
0874623
chore: bump version to 0.1.4
fffoivos Apr 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name: Build and Deploy Docs

on:
push:
branches:
- development
- main
- master
workflow_dispatch:

permissions:
contents: write

jobs:
docs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.x"

- name: Install MkDocs
run: |
python -m pip install --upgrade pip
pip install 'mkdocs<2' 'mkdocs-material<10'

- name: Build site
run: mkdocs build --strict

- name: Deploy to gh-pages
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./site
publish_branch: gh-pages
force_orphan: true
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,13 @@ htmlcov/
# OCR test outputs
test_ocr_*_output/
*_demo_output/
artifacts/

# OCR model weights (if downloaded locally)
nanonets/
ocr_models/
deepseek-ocr-2-model/
models/

# Noise analysis reports
glossapi_noise_analysis_report.md
Expand All @@ -78,4 +81,4 @@ dependency_setup/.venvs/
deepseek-ocr/DeepSeek-OCR-empty/
# Local DeepSeek checkout and repro scripts (keep out of master)
deepseek-ocr/
repro_rapidocr_onnx/
deepseek-ocr-2/
20 changes: 20 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Contributing to GlossAPI

## Working branches and PR flow
- Open PRs are pushed against the `development` branch.
- The `development` branch is merged into `master` when (a) everything has been used effectively a few times and (b) we reach a clear checkpoint.

## Some design principles
- Corpus methods should be easy to use and descriptive.
- Python files should be readable and well organized (check folder structure).
- Metadata should be written to two distinct parquet files, depending on whether it is relevant to the end user ("metadata") or to debugging during pipeline runs. The principle of reading from and writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline.

## Pipeline awareness and folder layout
- Tie any pipeline change to the artifacts it produces. Common touchpoints:
- `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`).
- `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders.
- `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`.
- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable.

## Keep changes small
- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting.
118 changes: 95 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss.

## Why GlossAPI
- Handles download → extraction → cleaning → sectioning in one pipeline.
- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR.
- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation.
- Rust-powered cleaner/noise metrics keep Markdown quality predictable.
- Greek-first metadata and section classification tuned for academic corpora.
- Modular Corpus API lets you resume from any stage or plug into existing flows.
Expand Down Expand Up @@ -40,56 +40,128 @@ PY

## Automated Environment Profiles

Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes:
Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime:

```bash
# Vanilla pipeline (no GPU OCR extras)
./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests
# Docling / main GlossAPI environment
./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests

# Docling + RapidOCR mode
./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests

# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR)
./dependency_setup/setup_glossapi.sh \
--mode deepseek \
# DeepSeek OCR runtime (uv-managed)
./dependency_setup/setup_deepseek_uv.sh \
--venv dependency_setup/.venvs/deepseek \
--weights-dir /path/to/deepseek-ocr \
--model-root /path/to/deepseek-ocr-2-model \
--download-model \
--run-tests --smoke-test
```

Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately.
`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`.
The uv-managed DeepSeek runtime is OCR-only on purpose: it installs `glossapi[deepseek]` and does not carry the Docling layout stack.

If you want a guided install that asks which phases you plan to use, run:

```bash
python install_glossapi.py
```

That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them.

## Browser-Gated Download Mode

`Corpus.download(...)` now supports three high-level routes for file acquisition:

- `download_mode="standard"`: direct HTTP downloader only
- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial
- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints

Use `browser_mode=True` as a legacy alias for `download_mode="browser"`.

### Policy-driven routing

If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL:

```yaml
default:
downloader: standard

rules:
- match:
domains: [eur-lex.europa.eu]
downloader: browser

- match:
url_regex: "https://example.org/protected/.*"
downloader: auto
```

```python
from glossapi import Corpus

corpus = Corpus(input_dir="out", output_dir="out")
corpus.download(
input_parquet="input_urls.parquet",
download_policy_file="download_policy.yml",
)
```

### Operational notes

- Browser mode is for browser-gated file endpoints, not viewer-only sources.
- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files.
- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory.
- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files.

### Regression strategy

The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs.

For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite.

**DeepSeek runtime checklist**
- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub.
- Export these to force the real CLI and avoid silent stub output:
- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR.
- Export these to force the real runtime and avoid silent stub output:
- `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`
- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`
- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py`
- `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python`
- `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR`
- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib`
- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`.
- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`.
- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`).
- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`.
- `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python`
- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`
- `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2`
- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`.
- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise.

## Choose Your Install Path

| Scenario | Commands | Notes |
| --- | --- | --- |
| Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. |
| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. |
| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. |
| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. |
| Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. |
| Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. |

See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips.

## Repo Landmarks
- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files.
- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs.
- `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown.
- `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic.
- `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus).
- `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides.

## Pipeline map

Use this as the shortest path from a documentation concept to the public call that implements it.

| Stage | Main call | Important parameters | Writes |
| --- | --- | --- | --- |
| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"\|"auto"\|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` |
| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `use_gpus`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` |
| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags |
| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` |
| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` |
| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` |
| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns |
| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL |

## Contributing
- Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check.
- Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together.
Expand Down
33 changes: 15 additions & 18 deletions dependency_setup/deepseek_gpu_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
Minimal DeepSeek OCR integration smoke test.

This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and
verifies that real Markdown output is produced. It requires the DeepSeek-OCR
weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to
the repository root (override via ``DEEPSEEK_MODEL_DIR``).
verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2
weights to be available under ``deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the
repository root (override via ``DEEPSEEK_MODEL_DIR`` or ``GLOSSAPI_DEEPSEEK_MODEL_DIR``).
"""
from __future__ import annotations

Expand All @@ -20,15 +20,16 @@

REPO_ROOT = Path(__file__).resolve().parents[1]
SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs"
DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." / "deepseek-ocr").resolve()
DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve()


def ensure_model_available(model_root: Path) -> None:
expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors"
direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
expected = direct_root / "model-00001-of-000001.safetensors"
if not expected.exists() or expected.stat().st_size < 1_000_000:
raise FileNotFoundError(
f"Expected DeepSeek-OCR weights at {expected}. "
"Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) "
f"Expected DeepSeek-OCR-2 weights at {expected}. "
"Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) "
"or set DEEPSEEK_MODEL_DIR to the directory that contains them."
)

Expand All @@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None:
from glossapi import Corpus

ensure_model_available(model_root)
sample_pdf = SAMPLES_DIR / "sample01_plain.pdf"
model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2")
sample_pdf = SAMPLES_DIR / "alpha.pdf"
if not sample_pdf.exists():
raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}")

Expand Down Expand Up @@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None:
parquet_path = dl_dir / "download_results.parquet"
df.to_parquet(parquet_path, index=False)

os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1")
os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0")
os.environ.setdefault(
"GLOSSAPI_DEEPSEEK_VLLM_SCRIPT",
str(model_root / "run_pdf_ocr_vllm.py"),
"GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT",
str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"),
)
os.environ.setdefault(
"GLOSSAPI_DEEPSEEK_PYTHON",
sys.executable,
)
ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str(
model_root / "libjpeg-turbo" / "lib"
)
os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra
os.environ["LD_LIBRARY_PATH"] = (
f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":")
)
os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir))

corpus = Corpus(input_dir=input_dir, output_dir=output_dir)
corpus.ocr(
Expand All @@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None:


def main() -> None:
model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR")
model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR")
if model_dir_env:
model_root = Path(model_dir_env).expanduser().resolve()
else:
Expand Down
28 changes: 28 additions & 0 deletions dependency_setup/deepseek_uv/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[project]
name = "glossapi-deepseek-runtime"
version = "0.1.0"
description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution"
requires-python = ">=3.11,<3.13"
dependencies = [
"glossapi[deepseek]",
"torch==2.10.0",
"torchvision==0.25.0",
"torchaudio==2.10.0",
]

[dependency-groups]
test = [
"pytest",
"fpdf2",
]

[tool.uv.sources]
glossapi = { path = "../..", editable = true }
torch = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
torchaudio = { index = "pytorch-cu130" }

[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true
Loading
Loading