From 00e8a2c449c808394751602a56c6461c2869781f Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 30 Nov 2025 10:04:35 +0200 Subject: [PATCH 01/93] chore: remove GitHub workflows --- .github/workflows/docs-selfhost.yml | 51 ---------------------------- .github/workflows/docs.yml | 33 ------------------ .github/workflows/python-publish.yml | 40 ---------------------- 3 files changed, 124 deletions(-) delete mode 100644 .github/workflows/docs-selfhost.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/docs-selfhost.yml b/.github/workflows/docs-selfhost.yml deleted file mode 100644 index 57c67b2..0000000 --- a/.github/workflows/docs-selfhost.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Build Docs (Self-Host Deploy) - -on: - workflow_dispatch: - -jobs: - build-and-deploy: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Install MkDocs - run: | - python -m pip install --upgrade pip - pip install mkdocs mkdocs-material - - - name: Build site - run: | - mkdocs build --strict - - - name: Install rsync and ssh - run: sudo apt-get update -y && sudo apt-get install -y rsync openssh-client - - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.SSH_KEY }} - - - name: Add known hosts - run: | - mkdir -p ~/.ssh - echo "${{ secrets.SSH_KNOWN_HOSTS }}" >> ~/.ssh/known_hosts - chmod 644 ~/.ssh/known_hosts - - - name: Deploy via rsync - env: - SSH_USER: ${{ secrets.SSH_USER }} - SSH_HOST: ${{ secrets.SSH_HOST }} - SSH_TARGET: ${{ secrets.SSH_TARGET }} - run: | - if [ -z "$SSH_USER" ] || [ -z "$SSH_HOST" ] || [ -z "$SSH_TARGET" ]; then - echo "Missing SSH_USER/SSH_HOST/SSH_TARGET secrets." 
&& exit 1 - fi - rsync -az --delete site/ "$SSH_USER@$SSH_HOST:$SSH_TARGET" - diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 30bbcd8..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Build and Deploy Docs - -on: - push: - branches: [ main, master ] - workflow_dispatch: - -jobs: - docs: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Install MkDocs - run: | - python -m pip install --upgrade pip - pip install mkdocs mkdocs-material - - - name: Build site - run: | - mkdocs build --strict - - - name: Deploy to GitHub Pages - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./site diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml deleted file mode 100644 index 757eb51..0000000 --- a/.github/workflows/python-publish.yml +++ /dev/null @@ -1,40 +0,0 @@ -# This workflow will upload a Python Package using GitHub Actions when a release is created -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-repositories - -name: Upload Python Package - -on: - workflow_dispatch: - release: - types: [published] - -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - # IMPORTANT: this permission is mandatory for trusted publishing - id-token: write - contents: read - - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - name: Copy README to pipeline directory - run: | - cp README.md pipeline/ - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: | - cd pipeline - python -m build - - name: Publish package - uses: 
pypa/gh-action-pypi-publish@release/v1 - with: - packages-dir: pipeline/dist/ - password: ${{ secrets.PYPI_API_TOKEN }} From 269cabf8c2f72f36c0238dfa16ca173b94bc593d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 30 Nov 2025 10:17:45 +0200 Subject: [PATCH 02/93] docs: remove PyPI badge --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index d37c347..ebc6baf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # GlossAPI -[![PyPI Status](https://img.shields.io/pypi/v/glossapi?logo=pypi)](https://pypi.org/project/glossapi/) - GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss.eu/) that turns academic PDFs into structured Markdown, cleans noisy text with Rust extensions, and optionally enriches math/code content. ## Why GlossAPI From b82d04e4da77193e7bcb6df4b9772fa897499111 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Sun, 25 Jan 2026 12:50:56 +0200 Subject: [PATCH 03/93] fix chunk merging --- CONTRIBUTING.md | 20 +++++++++++++++++ src/glossapi/corpus/phase_export.py | 14 +++--------- src/glossapi/gloss_extract.py | 21 +++++++++++------- tests/test_jsonl_export.py | 34 +++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..979e757 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing to GlossAPI + +## Working branches and PR flow +- Open PRs are pushed against the `development` branch. +- Development is merged with master when a) everything has been effectively used a few times and b) we reach a clear checkpoint. + +## Some design principles +- Corpus methods should be easy to use and descriptive. +- Python files should be readable and well organized (check folder structure). +- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. 
The principle of reading/ writing to these parquet files should be maintained through out. Rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline. + +## Pipeline awareness and folder layout +- Tie any pipeline change to the artifacts it produces. Common touchpoints: + - `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`). + - `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders. + - `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`. +- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable. + +## Keep changes small +- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting. 
diff --git a/src/glossapi/corpus/phase_export.py b/src/glossapi/corpus/phase_export.py index 26a6a82..4bcc6a8 100644 --- a/src/glossapi/corpus/phase_export.py +++ b/src/glossapi/corpus/phase_export.py @@ -471,8 +471,6 @@ def _normalize_value(value: Any) -> Any: chunk_paths: List[Path] = entry.get("chunk_paths", []) or [] base_path: Optional[Path] = entry.get("base_path") representative_path: Optional[Path] = base_path - if representative_path is None and chunk_paths: - representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0] base_metadata = metadata_by_stem.get(stem) chunk_metadata = metadata_chunks_by_stem.get(stem, []) if base_metadata is None and not chunk_metadata: @@ -480,17 +478,11 @@ def _normalize_value(value: Any) -> Any: metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata) metadata = {k: _normalize_value(v) for k, v in metadata.items()} original_filename_value = metadata.get("filename") - if chunk_paths: - ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key) - parts: List[str] = [] - for path in ordered_chunks: - parts.append(path.read_text(encoding="utf-8")) - document_text = "\n".join(parts) - elif representative_path is not None: - document_text = representative_path.read_text(encoding="utf-8") - else: + if base_path is None or not base_path.exists(): continue + document_text = base_path.read_text(encoding="utf-8") + filetype = metadata.get("filetype") or metadata.get("file_ext") if not filetype: filename_candidate = original_filename_value or metadata.get("filename") diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..44dbcfa 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -914,6 +914,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: except Exception as e: self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}") + # Always attempt to assemble whatever chunks succeeded (best-effort) + out_md_path = 
output_dir / f"{stem}.md" + final_md_written = False + if all_segments: + try: + final_md = "\n\n".join(all_segments) + out_md_path.write_text(final_md, encoding="utf-8") + final_md_written = True + except Exception as e: + self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not completed: # Record failure/timeout provenance in parquet try: @@ -928,6 +939,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: chunk_size=self.chunk_size, chunk_count=len(manifest.get("entries", [])), chunk_manifest_path=manifest_path, + no_partial_output=not final_md_written, ) except Exception as e: self._log.warning(f"Failed to record chunked extraction metadata for {file_path.name}: {e}") @@ -939,14 +951,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}") return False - # Assemble final markdown - try: - final_md = "\n\n".join(all_segments) - out_md_path = output_dir / f"{stem}.md" - with out_md_path.open("w", encoding="utf-8") as fp: - fp.write(final_md) - except Exception as e: - self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not final_md_written: return False # Record success provenance in parquet try: diff --git a/tests/test_jsonl_export.py b/tests/test_jsonl_export.py index e05caa0..aecd7a3 100644 --- a/tests/test_jsonl_export.py +++ b/tests/test_jsonl_export.py @@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path): assert len(seen_doc_ids) == len(texts) +def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path): + corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks") + + base_text = "## Base Title\n\nMerged body from extraction." 
+ base_path = corpus.cleaned_markdown_dir / "chunked.md" + base_path.parent.mkdir(parents=True, exist_ok=True) + base_path.write_text(base_text, encoding="utf-8") + + chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked" + chunk_dir.mkdir(parents=True, exist_ok=True) + (chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8") + (chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8") + + _write_download_results( + corpus.output_dir / "download_results" / "download_results.parquet", + [ + { + "filename": "chunked.pdf", + "filter": "ok", + "needs_ocr": False, + "is_empty": False, + "char_count_no_comments": 10, + } + ], + ) + + out_path = corpus.output_dir / "chunked.jsonl" + corpus.jsonl(out_path) + + record = json.loads(out_path.read_text(encoding="utf-8").strip()) + assert record["document"] == base_text + + @pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed") def test_hf_streaming_loader_example(tmp_path): corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7") @@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path): table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019)) assert set(table.column("doc_id").to_pylist()) == {"a"} + def _expected_doc_id(filename: str) -> str: return hashlib.sha256(filename.encode("utf-8")).hexdigest() From 6bcde8b283ab8e517d01787e8c4ba48768656432 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Wed, 4 Mar 2026 21:29:44 +0200 Subject: [PATCH 04/93] Fix editable install by switching root build backend to setuptools --- pyproject.toml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e6672c..c7cf7c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -requires = ["maturin>=1.5,<2.0"] -build-backend = "maturin" +requires = ["setuptools>=69", "wheel"] +build-backend = 
"setuptools.build_meta" [project] name = "glossapi" @@ -34,7 +34,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Rust", - "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)", ] [project.optional-dependencies] @@ -66,17 +65,16 @@ docs = [ "mkdocs-material>=9.5", ] -[tool.maturin] -bindings = "pyo3" -# The crate is located under rust/glossapi_rs_noise -module-name = "glossapi_rs_noise" -python-source = "src" +[tool.setuptools] +package-dir = {"" = "src"} +include-package-data = true -manifest-path = "rust/glossapi_rs_noise/Cargo.toml" -include = ["src/**"] -python-packages = [ - "glossapi" -] +[tool.setuptools.packages.find] +where = ["src"] +include = ["glossapi", "glossapi.*"] + +[tool.setuptools.package-data] +glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ From ab87731241d266e37235c738be36f6a0d2fc0737 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 00:26:16 +0000 Subject: [PATCH 05/93] Simplify OCR stack around DeepSeek --- .gitignore | 4 + README.md | 41 +- dependency_setup/deepseek_gpu_smoke.py | 33 +- dependency_setup/deepseek_uv/pyproject.toml | 28 + dependency_setup/deepseek_uv/uv.lock | 2605 +++++++++++++++++ dependency_setup/dependency_notes.md | 66 +- .../requirements-glossapi-deepseek.txt | 29 +- .../requirements-glossapi-docling.txt | 38 + .../requirements-glossapi-rapidocr.txt | 4 - dependency_setup/setup_deepseek_uv.sh | 138 + dependency_setup/setup_glossapi.sh | 115 +- dependency_setup/setup_glossapi_deepseek.sh | 2 +- .../deepseek_only_upgrade_roadmap.md | 262 ++ docs/architecture/index.md | 2 +- docs/configuration.md | 26 +- docs/getting_started.md | 58 +- docs/index.md | 4 +- docs/math_enrichment_runtime.md | 5 +- docs/ocr_and_math_enhancement.md | 39 +- docs/quickstart.md | 18 +- docs/stages/ocr.md | 9 +- docs/testing/compatibility_matrix.md | 276 ++ docs/troubleshooting.md | 10 +- mkdocs.yml | 4 +- pyproject.toml | 25 +- 
src/glossapi/__init__.py | 51 +- src/glossapi/_pipeline.py | 4 +- src/glossapi/corpus/phase_clean.py | 2 + src/glossapi/corpus/phase_extract.py | 36 +- src/glossapi/corpus/phase_ocr_math.py | 40 +- src/glossapi/gloss_extract.py | 221 +- src/glossapi/ocr/__init__.py | 7 +- src/glossapi/ocr/deepseek/__init__.py | 2 +- src/glossapi/ocr/deepseek/preflight.py | 70 +- .../ocr/deepseek/run_pdf_ocr_transformers.py | 188 ++ src/glossapi/ocr/deepseek/runner.py | 185 +- src/glossapi/ocr/docling/__init__.py | 5 + src/glossapi/ocr/docling/pipeline.py | 95 + src/glossapi/ocr/docling_pipeline.py | 82 + src/glossapi/ocr/rapidocr/__init__.py | 26 - src/glossapi/ocr/rapidocr/__init__.py.backup | 6 - src/glossapi/ocr/rapidocr/_paths.py | 114 - src/glossapi/ocr/rapidocr/dispatch.py | 33 - src/glossapi/ocr/rapidocr/docling_pipeline.py | 501 ---- .../ocr/rapidocr/docling_pipeline.py.backup | 501 ---- src/glossapi/ocr/rapidocr/onnx.py | 105 - src/glossapi/ocr/rapidocr/pipeline.py | 229 -- src/glossapi/ocr/rapidocr/pool.py | 72 - src/glossapi/ocr/rapidocr/safe.py | 301 -- tests/test_corpus_guards.py | 19 +- tests/test_deepseek_preflight.py | 30 +- tests/test_deepseek_runner_contract.py | 62 + tests/test_deepseek_runner_stub.py | 59 - tests/test_ocr_backends_smoke.py | 8 +- tests/test_ocr_dispatch_backends.py | 28 +- tests/test_ocr_imports.py | 13 - tests/test_pipeline_smoke.py | 98 +- tests/test_rapidocr_patch.py | 368 --- 58 files changed, 4241 insertions(+), 3161 deletions(-) create mode 100644 dependency_setup/deepseek_uv/pyproject.toml create mode 100644 dependency_setup/deepseek_uv/uv.lock create mode 100644 dependency_setup/requirements-glossapi-docling.txt delete mode 100644 dependency_setup/requirements-glossapi-rapidocr.txt create mode 100755 dependency_setup/setup_deepseek_uv.sh create mode 100644 docs/architecture/deepseek_only_upgrade_roadmap.md create mode 100644 docs/testing/compatibility_matrix.md create mode 100644 src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py 
create mode 100644 src/glossapi/ocr/docling/__init__.py create mode 100644 src/glossapi/ocr/docling/pipeline.py create mode 100644 src/glossapi/ocr/docling_pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/__init__.py delete mode 100644 src/glossapi/ocr/rapidocr/__init__.py.backup delete mode 100644 src/glossapi/ocr/rapidocr/_paths.py delete mode 100644 src/glossapi/ocr/rapidocr/dispatch.py delete mode 100644 src/glossapi/ocr/rapidocr/docling_pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/docling_pipeline.py.backup delete mode 100644 src/glossapi/ocr/rapidocr/onnx.py delete mode 100644 src/glossapi/ocr/rapidocr/pipeline.py delete mode 100644 src/glossapi/ocr/rapidocr/pool.py delete mode 100644 src/glossapi/ocr/rapidocr/safe.py create mode 100644 tests/test_deepseek_runner_contract.py delete mode 100644 tests/test_deepseek_runner_stub.py delete mode 100644 tests/test_rapidocr_patch.py diff --git a/.gitignore b/.gitignore index 8c98a88..929a8c5 100644 --- a/.gitignore +++ b/.gitignore @@ -58,10 +58,13 @@ htmlcov/ # OCR test outputs test_ocr_*_output/ *_demo_output/ +artifacts/ # OCR model weights (if downloaded locally) nanonets/ ocr_models/ +deepseek-ocr-2-model/ +models/ # Noise analysis reports glossapi_noise_analysis_report.md @@ -78,4 +81,5 @@ dependency_setup/.venvs/ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ +deepseek-ocr-2/ repro_rapidocr_onnx/ diff --git a/README.md b/README.md index ebc6baf..e581361 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss. ## Why GlossAPI - Handles download → extraction → cleaning → sectioning in one pipeline. -- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR. +- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation. 
- Rust-powered cleaner/noise metrics keep Markdown quality predictable. - Greek-first metadata and section classification tuned for academic corpora. - Modular Corpus API lets you resume from any stage or plug into existing flows. @@ -40,45 +40,40 @@ PY ## Automated Environment Profiles -Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes: +Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime: ```bash -# Vanilla pipeline (no GPU OCR extras) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Docling / main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# Docling + RapidOCR mode -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR runtime (uv-managed) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. 
`setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub. -- Export these to force the real CLI and avoid silent stub output: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. +- Export these to force the real runtime and avoid silent stub output: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`. -- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`). -- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`. +- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise. 
## Choose Your Install Path | Scenario | Commands | Notes | | --- | --- | --- | | Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. | -| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. | +| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. | +| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. | | Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. | | Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. | diff --git a/dependency_setup/deepseek_gpu_smoke.py b/dependency_setup/deepseek_gpu_smoke.py index e85d202..ddfb314 100644 --- a/dependency_setup/deepseek_gpu_smoke.py +++ b/dependency_setup/deepseek_gpu_smoke.py @@ -3,9 +3,9 @@ Minimal DeepSeek OCR integration smoke test. This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and -verifies that real Markdown output is produced. It requires the DeepSeek-OCR -weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to -the repository root (override via ``DEEPSEEK_MODEL_DIR``). +verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2 +weights to be available under ``../deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the +repository root (override via ``DEEPSEEK_MODEL_DIR``). """ from __future__ import annotations @@ -20,15 +20,16 @@ REPO_ROOT = Path(__file__).resolve().parents[1] SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs" -DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." 
/ "deepseek-ocr").resolve() +DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve() def ensure_model_available(model_root: Path) -> None: - expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors" + direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + expected = direct_root / "model-00001-of-000001.safetensors" if not expected.exists() or expected.stat().st_size < 1_000_000: raise FileNotFoundError( - f"Expected DeepSeek-OCR weights at {expected}. " - "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) " + f"Expected DeepSeek-OCR-2 weights at {expected}. " + "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) " "or set DEEPSEEK_MODEL_DIR to the directory that contains them." ) @@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None: from glossapi import Corpus ensure_model_available(model_root) - sample_pdf = SAMPLES_DIR / "sample01_plain.pdf" + model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + sample_pdf = SAMPLES_DIR / "alpha.pdf" if not sample_pdf.exists(): raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}") @@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None: parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") os.environ.setdefault( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - str(model_root / "run_pdf_ocr_vllm.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"), ) os.environ.setdefault( "GLOSSAPI_DEEPSEEK_PYTHON", sys.executable, ) - ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str( - model_root / "libjpeg-turbo" / "lib" - ) - os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra - os.environ["LD_LIBRARY_PATH"] = ( - 
f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":") - ) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) corpus = Corpus(input_dir=input_dir, output_dir=output_dir) corpus.ocr( @@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None: def main() -> None: - model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") + model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR") if model_dir_env: model_root = Path(model_dir_env).expanduser().resolve() else: diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml new file mode 100644 index 0000000..809b499 --- /dev/null +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" +requires-python = ">=3.11,<3.13" +dependencies = [ + "glossapi[docling,deepseek]", + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", +] + +[dependency-groups] +test = [ + "pytest", + "fpdf2", +] + +[tool.uv.sources] +glossapi = { path = "../..", editable = true } +torch = { index = "pytorch-cu118" } +torchvision = { index = "pytorch-cu118" } +torchaudio = { index = "pytorch-cu118" } + +[[tool.uv.index]] +name = "pytorch-cu118" +url = "https://download.pytorch.org/whl/cu118" +explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock new file mode 100644 index 0000000..f5eefaa --- /dev/null +++ b/dependency_setup/deepseek_uv/uv.lock @@ -0,0 +1,2605 @@ +version = 1 +revision = 3 +requires-python = ">=3.11, <3.13" +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' 
and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] + +[[package]] +name = "accelerate" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" }, +] + +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186, upload-time = "2020-11-21T16:21:31.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832, upload-time = 
"2020-11-21T16:21:29.588Z" }, +] + +[[package]] +name = "aiofiles" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size 
= 7844556, upload-time = "2026-01-03T17:33:05.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, + { url = "https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, + { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, + { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, + { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, + { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, + { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 1804590, upload-time = "2026-01-03T17:30:07.135Z" }, + { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, + { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, + { url 
= "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, + { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, + { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, + { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 
1871020, upload-time = "2026-01-03T17:30:26Z" }, + { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, + { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = 
"2026-01-03T17:30:36.864Z" }, + { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = "2026-01-03T17:30:41.081Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, 
upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, +] + +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/35/02daf95b9cd686320bb622eb148792655c9412dbb9b67abb5694e5910a24/charset_normalizer-3.4.5.tar.gz", hash = "sha256:95adae7b6c42a6c5b5b559b1a99149f090a57128155daeea91732c8d970d8644", size = 134804, upload-time = "2026-03-06T06:03:19.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/9e/bcec3b22c64ecec47d39bf5167c2613efd41898c019dccd4183f6aa5d6a7/charset_normalizer-3.4.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:610f72c0ee565dfb8ae1241b666119582fdbfe7c0975c175be719f940e110694", size = 279531, upload-time = "2026-03-06T06:00:52.252Z" }, + { url = "https://files.pythonhosted.org/packages/58/12/81fd25f7e7078ab5d1eedbb0fac44be4904ae3370a3bf4533c8f2d159acd/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:60d68e820af339df4ae8358c7a2e7596badeb61e544438e489035f9fbf3246a5", size = 188006, upload-time = "2026-03-06T06:00:53.8Z" }, + { url = "https://files.pythonhosted.org/packages/ae/6e/f2d30e8c27c1b0736a6520311982cf5286cfc7f6cac77d7bc1325e3a23f2/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b473fc8dca1c3ad8559985794815f06ca3fc71942c969129070f2c3cdf7281", size = 205085, upload-time = "2026-03-06T06:00:55.311Z" }, + { url = "https://files.pythonhosted.org/packages/d0/90/d12cefcb53b5931e2cf792a33718d7126efb116a320eaa0742c7059a95e4/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d4eb8ac7469b2a5d64b5b8c04f84d8bf3ad340f4514b98523805cbf46e3b3923", size = 200545, upload-time = "2026-03-06T06:00:56.532Z" }, + { url = "https://files.pythonhosted.org/packages/03/f4/44d3b830a20e89ff82a3134912d9a1cf6084d64f3b95dcad40f74449a654/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bcb3227c3d9aaf73eaaab1db7ccd80a8995c509ee9941e2aae060ca6e4e5d81", size = 193863, upload-time = "2026-03-06T06:00:57.823Z" }, + { url = "https://files.pythonhosted.org/packages/25/4b/f212119c18a6320a9d4a730d1b4057875cdeabf21b3614f76549042ef8a8/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:75ee9c1cce2911581a70a3c0919d8bccf5b1cbc9b0e5171400ec736b4b569497", size = 181827, upload-time = "2026-03-06T06:00:59.323Z" }, + { url = "https://files.pythonhosted.org/packages/74/00/b26158e48b425a202a92965f8069e8a63d9af1481dfa206825d7f74d2a3c/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d1401945cb77787dbd3af2446ff2d75912327c4c3a1526ab7955ecf8600687c", size = 191085, upload-time = "2026-03-06T06:01:00.546Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/c2/1c1737bf6fd40335fe53d28fe49afd99ee4143cc57a845e99635ce0b9b6d/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a45e504f5e1be0bd385935a8e1507c442349ca36f511a47057a71c9d1d6ea9e", size = 190688, upload-time = "2026-03-06T06:01:02.479Z" }, + { url = "https://files.pythonhosted.org/packages/5a/3d/abb5c22dc2ef493cd56522f811246a63c5427c08f3e3e50ab663de27fcf4/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e09f671a54ce70b79a1fc1dc6da3072b7ef7251fadb894ed92d9aa8218465a5f", size = 183077, upload-time = "2026-03-06T06:01:04.231Z" }, + { url = "https://files.pythonhosted.org/packages/44/33/5298ad4d419a58e25b3508e87f2758d1442ff00c2471f8e0403dab8edad5/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d01de5e768328646e6a3fa9e562706f8f6641708c115c62588aef2b941a4f88e", size = 206706, upload-time = "2026-03-06T06:01:05.773Z" }, + { url = "https://files.pythonhosted.org/packages/7b/17/51e7895ac0f87c3b91d276a449ef09f5532a7529818f59646d7a55089432/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:131716d6786ad5e3dc542f5cc6f397ba3339dc0fb87f87ac30e550e8987756af", size = 191665, upload-time = "2026-03-06T06:01:07.473Z" }, + { url = "https://files.pythonhosted.org/packages/90/8f/cce9adf1883e98906dbae380d769b4852bb0fa0004bc7d7a2243418d3ea8/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a374cc0b88aa710e8865dc1bd6edb3743c59f27830f0293ab101e4cf3ce9f85", size = 201950, upload-time = "2026-03-06T06:01:08.973Z" }, + { url = "https://files.pythonhosted.org/packages/08/ca/bce99cd5c397a52919e2769d126723f27a4c037130374c051c00470bcd38/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d31f0d1671e1534e395f9eb84a68e0fb670e1edb1fe819a9d7f564ae3bc4e53f", size = 195830, upload-time = "2026-03-06T06:01:10.155Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/4f/2e3d023a06911f1281f97b8f036edc9872167036ca6f55cc874a0be6c12c/charset_normalizer-3.4.5-cp311-cp311-win32.whl", hash = "sha256:cace89841c0599d736d3d74a27bc5821288bb47c5441923277afc6059d7fbcb4", size = 132029, upload-time = "2026-03-06T06:01:11.706Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1f/a853b73d386521fd44b7f67ded6b17b7b2367067d9106a5c4b44f9a34274/charset_normalizer-3.4.5-cp311-cp311-win_amd64.whl", hash = "sha256:f8102ae93c0bc863b1d41ea0f4499c20a83229f52ed870850892df555187154a", size = 142404, upload-time = "2026-03-06T06:01:12.865Z" }, + { url = "https://files.pythonhosted.org/packages/b4/10/dba36f76b71c38e9d391abe0fd8a5b818790e053c431adecfc98c35cd2a9/charset_normalizer-3.4.5-cp311-cp311-win_arm64.whl", hash = "sha256:ed98364e1c262cf5f9363c3eca8c2df37024f52a8fa1180a3610014f26eac51c", size = 132796, upload-time = "2026-03-06T06:01:14.106Z" }, + { url = "https://files.pythonhosted.org/packages/9c/b6/9ee9c1a608916ca5feae81a344dffbaa53b26b90be58cc2159e3332d44ec/charset_normalizer-3.4.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed97c282ee4f994ef814042423a529df9497e3c666dca19be1d4cd1129dc7ade", size = 280976, upload-time = "2026-03-06T06:01:15.276Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d8/a54f7c0b96f1df3563e9190f04daf981e365a9b397eedfdfb5dbef7e5c6c/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0294916d6ccf2d069727d65973c3a1ca477d68708db25fd758dd28b0827cff54", size = 189356, upload-time = "2026-03-06T06:01:16.511Z" }, + { url = "https://files.pythonhosted.org/packages/42/69/2bf7f76ce1446759a5787cb87d38f6a61eb47dbbdf035cfebf6347292a65/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dc57a0baa3eeedd99fafaef7511b5a6ef4581494e8168ee086031744e2679467", size = 206369, upload-time = "2026-03-06T06:01:17.853Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/9c/949d1a46dab56b959d9a87272482195f1840b515a3380e39986989a893ae/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ed1a9a204f317ef879b32f9af507d47e49cd5e7f8e8d5d96358c98373314fc60", size = 203285, upload-time = "2026-03-06T06:01:19.473Z" }, + { url = "https://files.pythonhosted.org/packages/67/5c/ae30362a88b4da237d71ea214a8c7eb915db3eec941adda511729ac25fa2/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad83b8f9379176c841f8865884f3514d905bcd2a9a3b210eaa446e7d2223e4d", size = 196274, upload-time = "2026-03-06T06:01:20.728Z" }, + { url = "https://files.pythonhosted.org/packages/b2/07/c9f2cb0e46cb6d64fdcc4f95953747b843bb2181bda678dc4e699b8f0f9a/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:a118e2e0b5ae6b0120d5efa5f866e58f2bb826067a646431da4d6a2bdae7950e", size = 184715, upload-time = "2026-03-06T06:01:22.194Z" }, + { url = "https://files.pythonhosted.org/packages/36/64/6b0ca95c44fddf692cd06d642b28f63009d0ce325fad6e9b2b4d0ef86a52/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:754f96058e61a5e22e91483f823e07df16416ce76afa4ebf306f8e1d1296d43f", size = 193426, upload-time = "2026-03-06T06:01:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/50/bc/a730690d726403743795ca3f5bb2baf67838c5fea78236098f324b965e40/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0c300cefd9b0970381a46394902cd18eaf2aa00163f999590ace991989dcd0fc", size = 191780, upload-time = "2026-03-06T06:01:25.053Z" }, + { url = "https://files.pythonhosted.org/packages/97/4f/6c0bc9af68222b22951552d73df4532b5be6447cee32d58e7e8c74ecbb7b/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c108f8619e504140569ee7de3f97d234f0fbae338a7f9f360455071ef9855a95", size = 185805, 
upload-time = "2026-03-06T06:01:26.294Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b9/a523fb9b0ee90814b503452b2600e4cbc118cd68714d57041564886e7325/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d1028de43596a315e2720a9849ee79007ab742c06ad8b45a50db8cdb7ed4a82a", size = 208342, upload-time = "2026-03-06T06:01:27.55Z" }, + { url = "https://files.pythonhosted.org/packages/4d/61/c59e761dee4464050713e50e27b58266cc8e209e518c0b378c1580c959ba/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:19092dde50335accf365cce21998a1c6dd8eafd42c7b226eb54b2747cdce2fac", size = 193661, upload-time = "2026-03-06T06:01:29.051Z" }, + { url = "https://files.pythonhosted.org/packages/1c/43/729fa30aad69783f755c5ad8649da17ee095311ca42024742701e202dc59/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4354e401eb6dab9aed3c7b4030514328a6c748d05e1c3e19175008ca7de84fb1", size = 204819, upload-time = "2026-03-06T06:01:30.298Z" }, + { url = "https://files.pythonhosted.org/packages/87/33/d9b442ce5a91b96fc0840455a9e49a611bbadae6122778d0a6a79683dd31/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a68766a3c58fde7f9aaa22b3786276f62ab2f594efb02d0a1421b6282e852e98", size = 198080, upload-time = "2026-03-06T06:01:31.478Z" }, + { url = "https://files.pythonhosted.org/packages/56/5a/b8b5a23134978ee9885cee2d6995f4c27cc41f9baded0a9685eabc5338f0/charset_normalizer-3.4.5-cp312-cp312-win32.whl", hash = "sha256:1827734a5b308b65ac54e86a618de66f935a4f63a8a462ff1e19a6788d6c2262", size = 132630, upload-time = "2026-03-06T06:01:33.056Z" }, + { url = "https://files.pythonhosted.org/packages/70/53/e44a4c07e8904500aec95865dc3f6464dc3586a039ef0df606eb3ac38e35/charset_normalizer-3.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:728c6a963dfab66ef865f49286e45239384249672cd598576765acc2a640a636", size = 142856, upload-time = "2026-03-06T06:01:34.489Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/aa/c5628f7cad591b1cf45790b7a61483c3e36cf41349c98af7813c483fd6e8/charset_normalizer-3.4.5-cp312-cp312-win_arm64.whl", hash = "sha256:75dfd1afe0b1647449e852f4fb428195a7ed0588947218f7ba929f6538487f02", size = 132982, upload-time = "2026-03-06T06:01:35.641Z" }, + { url = "https://files.pythonhosted.org/packages/c5/60/3a621758945513adfd4db86827a5bafcc615f913dbd0b4c2ed64a65731be/charset_normalizer-3.4.5-py3-none-any.whl", hash = "sha256:9db5e3fcdcee89a78c04dffb3fe33c79f77bd741a624946db2591c81b2fc85b0", size = 55455, upload-time = "2026-03-06T06:03:17.827Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = 
"sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "dask" +version = "2026.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "cloudpickle" }, + { name = "fsspec" }, + { name = "importlib-metadata", marker = "python_full_version < '3.12'" }, + { name = "packaging" }, + { name = "partd" }, + { name = "pyyaml" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/52/b0f9172b22778def907db1ff173249e4eb41f054b46a9c83b1528aaf811f/dask-2026.1.2.tar.gz", hash = "sha256:1136683de2750d98ea792670f7434e6c1cfce90cab2cc2f2495a9e60fd25a4fc", size = 10997838, upload-time = "2026-01-30T21:04:20.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/23/d39ccc4ed76222db31530b0a7d38876fdb7673e23f838e8d8f0ed4651a4f/dask-2026.1.2-py3-none-any.whl", hash = "sha256:46a0cf3b8d87f78a3d2e6b145aea4418a6d6d606fe6a16c79bd8ca2bb862bc91", size = 1482084, upload-time = "2026-01-30T21:04:18.363Z" }, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + +[[package]] +name = "dill" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = 
"2026-01-19T02:36:55.663Z" }, +] + +[[package]] +name = "docling" +version = "2.48.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "beautifulsoup4" }, + { name = "certifi" }, + { name = "docling-core", extra = ["chunking"] }, + { name = "docling-ibm-models" }, + { name = "docling-parse" }, + { name = "easyocr" }, + { name = "filetype" }, + { name = "huggingface-hub" }, + { name = "lxml" }, + { name = "marko" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pillow" }, + { name = "pluggy" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "pylatexenc" }, + { name = "pypdfium2" }, + { name = "python-docx" }, + { name = "python-pptx" }, + { name = "requests" }, + { name = "rtree" }, + { name = "scipy" }, + { name = "tqdm" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/be/32/e117cb0dcc76c93828d2cd9b45c3f8ccf6c86314a60e9c65f16067d3df26/docling-2.48.0.tar.gz", hash = "sha256:e94a5f75c544ec1bbb9169d2f4da72e1f497fd2fcda57cfacc454c93b1c92a8e", size = 189422, upload-time = "2025-08-26T05:31:02.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/32/a9c6677c66178a397b89b5b6fe1e7b3d3de98ddc2b331fbcd7440419b9f0/docling-2.48.0-py3-none-any.whl", hash = "sha256:8a1c1dfd5ed84cadb0f81fcb1464e5d501c4bfaa121e15306e09e3c0c983cc3e", size = 212266, upload-time = "2025-08-26T05:31:00.779Z" }, +] + +[[package]] +name = "docling-core" +version = "2.68.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "jsonref" }, + { name = "jsonschema" }, + { name = "latex2mathml" }, + { name = "pandas" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "tabulate" }, + { name = "typer" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/5e/b7/95e329d143528decd8f6af5d4db6c2d6bc3dc40f9d53ee5b7d5b901dfe11/docling_core-2.68.0.tar.gz", hash = "sha256:261ecb6281d45fcf0559640297eda728f8f7dd4fe8c8bf7ced42dbf9b4e46223", size = 267551, upload-time = "2026-03-07T12:20:24.523Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/66/d8bbe25dec2bb91d9090b939349b1c9b94c307edceada46c5bc6f213a569/docling_core-2.68.0-py3-none-any.whl", hash = "sha256:175145398c005399819a7cfe7b634257caaaecfbb4451840b8ddb31fc2f5ac12", size = 247092, upload-time = "2026-03-07T12:20:23.172Z" }, +] + +[package.optional-dependencies] +chunking = [ + { name = "semchunk" }, + { name = "transformers" }, + { name = "tree-sitter" }, + { name = "tree-sitter-c" }, + { name = "tree-sitter-javascript" }, + { name = "tree-sitter-python" }, + { name = "tree-sitter-typescript" }, +] + +[[package]] +name = "docling-ibm-models" +version = "3.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "docling-core" }, + { name = "huggingface-hub" }, + { name = "jsonlines" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "rtree" }, + { name = "safetensors", extra = ["torch"] }, + { name = "torch" }, + { name = "torchvision" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/91/f883e0a2b3466e1126dfd4463f386c70f5b90d271c27b6f5a97d2f8312e6/docling_ibm_models-3.11.0.tar.gz", hash = "sha256:454401563a8e79cb33b718bc559d9bacca8a0183583e48f8e616c9184c1f5eb1", size = 87721, upload-time = "2026-01-23T12:29:35.384Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/5d/97e9c2e10fbd3ee1723ac82c335f8211a9633c0397cc11ed057c3ba4006e/docling_ibm_models-3.11.0-py3-none-any.whl", hash = "sha256:68f7961069d643bfdab21b1c9ef24a979db293496f4c2283d95b1025a9ac5347", size = 87352, upload-time = "2026-01-23T12:29:34.045Z" }, +] + +[[package]] +name = 
"docling-parse" +version = "4.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docling-core" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "tabulate" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bb/7a/653c3b11920113217724fab9b4740f9f8964864f92a2a27590accecec5ac/docling_parse-4.7.3.tar.gz", hash = "sha256:5936e6bcb7969c2a13f38ecc75cada3b0919422dc845e96da4b0b7b3bbc394ce", size = 67646746, upload-time = "2026-01-14T14:18:19.376Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/81/dd317e0bce475153dc08a60a9a8615b1a04d4d3c9803175e6cb7b7e9b49b/docling_parse-4.7.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66896bbe925073e4d48f18ec29dcd611a390d6b2378fae72125e77b020cd5664", size = 14615974, upload-time = "2026-01-14T14:17:30.246Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b5/088590e0b32fd0a393ca419c644d1435a1c99fa6b2a87888eef4d0fdea33/docling_parse-4.7.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:281347b3e937c1a5ffa6f8774ee603b64a0899fe8a6885573dec7eb48a3421d8", size = 14981051, upload-time = "2026-01-14T14:17:32.426Z" }, + { url = "https://files.pythonhosted.org/packages/b7/63/2b6c9127924487573d5419d58ec77955f0b7c0a923c8232ad461d71039aa/docling_parse-4.7.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3d86c51f9ce35a1b40b2f410f7271d9bd5fc58e7240f4cae7fdd2cef757e671", size = 15092586, upload-time = "2026-01-14T14:17:34.634Z" }, + { url = "https://files.pythonhosted.org/packages/af/89/ed27a83eb113bdf0b0f82f3c30a0db3c005df58b236f6487b232dacdb57a/docling_parse-4.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:3b04459cc97a8a4929622e341b9981e23987a63af07db599afc5e1c4d389060b", size = 16144866, upload-time = "2026-01-14T14:17:36.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/26/9d86ae12699a25b7233f76ce062253e9c14e57781e00166b792b3a9d56db/docling_parse-4.7.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d89231aa4fba3e38b80c11beb8edc07569e934c1f3935b51f57904fefe958ba5", size = 14616739, upload-time = "2026-01-14T14:17:38.567Z" }, + { url = "https://files.pythonhosted.org/packages/f2/fd/1aebb8a7f15d658f3be858ddbbc4ef7206089d540a7df0dcd4b846b99901/docling_parse-4.7.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dffd19ed373b0da5cea124606b183489a8686c3d18643e94485be1bdda5713ea", size = 14980782, upload-time = "2026-01-14T14:17:40.659Z" }, + { url = "https://files.pythonhosted.org/packages/3e/47/a722527c9f89c65f69f8a463be4f12ad73bae18132f29d8de8b2d9f6f082/docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc32b6f25a673e41b9a8112b6b841284f60dbac9427b7848a03b435460f74aee", size = 15092450, upload-time = "2026-01-14T14:17:42.838Z" }, + { url = "https://files.pythonhosted.org/packages/91/c7/316373a92ba42c2aeaee128fc77a34333449fe3e820b9d524e0ee396ea35/docling_parse-4.7.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef691045623863624f2cb7347572d0262a53cb84940ef7dd851d9f13a2eb8833", size = 16147359, upload-time = "2026-01-14T14:17:44.906Z" }, +] + +[[package]] +name = "easydict" +version = "1.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/9f/d18d6b5e19244788a6d09c14a8406376b4f4bfcc008e6d17a4f4c15362e8/easydict-1.13.tar.gz", hash = "sha256:b1135dedbc41c8010e2bc1f77ec9744c7faa42bce1a1c87416791449d6c87780", size = 6809, upload-time = "2024-03-04T12:04:41.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = "2024-03-04T12:04:39.508Z" }, 
+] + +[[package]] +name = "easyocr" +version = "1.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ninja" }, + { name = "numpy" }, + { name = "opencv-python-headless" }, + { name = "pillow" }, + { name = "pyclipper" }, + { name = "python-bidi" }, + { name = "pyyaml" }, + { name = "scikit-image" }, + { name = "scipy" }, + { name = "shapely" }, + { name = "torch" }, + { name = "torchvision" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, +] + +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", 
hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + +[[package]] +name = "filetype" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, +] + +[[package]] +name = "fonttools" +version = "4.61.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, + { url = "https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, + { url = "https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, + { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, + { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, + { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, + { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, + { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, + { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, + { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, +] + +[[package]] +name = "fpdf2" +version = "2.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "fonttools" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/f2/72feae0b2827ed38013e4307b14f95bf0b3d124adfef4d38a7d57533f7be/fpdf2-2.8.7.tar.gz", hash = "sha256:7060ccee5a9c7ab0a271fb765a36a23639f83ef8996c34e3d46af0a17ede57f9", size = 362351, upload-time = 
"2026-02-28T05:39:16.456Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/0a/cf50ecffa1e3747ed9380a3adfc829259f1f86b3fdbd9e505af789003141/fpdf2-2.8.7-py3-none-any.whl", hash = "sha256:d391fc508a3ce02fc43a577c830cda4fe6f37646f2d143d489839940932fbc19", size = 327056, upload-time = "2026-02-28T05:39:14.619Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" }, + { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" }, + { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, 
upload-time = "2025-10-06T05:35:49.97Z" }, + { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" }, + { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" }, + { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" }, + { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = 
"sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" }, + { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, + { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" }, + { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, + { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = 
"2025-10-06T05:36:12.598Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, + { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = 
"2025-10-06T05:36:20.763Z" }, + { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, + { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" }, + { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + +[[package]] +name = "glossapi" +version = "0.1.3" +source = { editable = "../../" } +dependencies = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "dask" }, + { name = "ftfy" }, + { name = "joblib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pypdfium2" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "zstandard" }, +] + +[package.optional-dependencies] +deepseek = [ + { name = "accelerate" }, + { name = "addict" }, + { name = "easydict" }, + { name = "einops" }, + { name = "img2pdf" }, + { name = "pillow" }, + { name = "pymupdf" }, + { name = "tokenizers" }, + { name = "transformers" }, +] +docling = [ + { name = "docling" }, +] + +[package.metadata] +requires-dist = [ + { name = "accelerate", marker = "extra == 'deepseek'", specifier = ">=1.2.1,<2" }, + { name = "addict", marker = "extra == 'deepseek'" }, + { name = "aiofiles", specifier = 
">=23.0.0" }, + { name = "aiohttp", specifier = ">=3.8.0" }, + { name = "dask", specifier = ">=2022.1.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.48.0" }, + { name = "easydict", marker = "extra == 'deepseek'" }, + { name = "einops", marker = "extra == 'deepseek'" }, + { name = "ftfy", specifier = ">=6.0.0" }, + { name = "img2pdf", marker = "extra == 'deepseek'", specifier = ">=0.5.1" }, + { name = "joblib", specifier = ">=1.0.0" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, + { name = "numpy", specifier = "<2" }, + { name = "pandas", specifier = ">=1.3.0" }, + { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "pyarrow", specifier = ">=7.0.0" }, + { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, + { name = "pypdfium2", specifier = ">=4.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = "==1.6.1" }, + { name = "tenacity", specifier = ">=8.0.0" }, + { name = "tokenizers", marker = "extra == 'deepseek'", specifier = "==0.20.3" }, + { name = "torch", marker = "extra == 'cuda'", specifier = "==2.5.1" }, + { name = "torchvision", marker = "extra == 'cuda'", specifier = "==0.20.1" }, + { name = "tqdm", specifier = ">=4.67.0" }, + { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, + { name = "zstandard", specifier = ">=0.22.0" }, +] +provides-extras = ["docling", "cuda", "deepseek", "docs"] + +[[package]] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "glossapi", extra = ["deepseek", "docling"] }, + { name = "torch" }, + { name = "torchaudio" }, + { name = "torchvision" }, +] + +[package.dev-dependencies] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "glossapi", extras = ["docling", "deepseek"], editable = "../../" }, + { name = "torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, +] + +[package.metadata.requires-dev] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[[package]] +name = "hf-xet" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, + { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, + { url = "https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = "2026-02-27T17:26:13.542Z" }, + { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 
'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "imageio" +version = "2.37.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/6f/606be632e37bf8d05b253e8626c2291d74c691ddc7bcdf7d6aaf33b32f6a/imageio-2.37.2.tar.gz", hash = "sha256:0212ef2727ac9caa5ca4b2c75ae89454312f440a756fcfc8ef1993e718f50f8a", size = 389600, upload-time = "2025-11-04T14:29:39.898Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/fe/301e0936b79bcab4cacc7548bf2853fc28dced0a578bab1f7ef53c9aa75b/imageio-2.37.2-py3-none-any.whl", hash = "sha256:ad9adfb20335d718c03de457358ed69f141021a333c40a53e57273d8a5bd0b9b", size = 317646, upload-time = "2025-11-04T14:29:37.948Z" }, +] + +[[package]] +name = "img2pdf" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pikepdf" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/97/ca44c467131b93fda82d2a2f21b738c8bcf63b5259e3b8250e928b8dd52a/img2pdf-0.6.3.tar.gz", hash = "sha256:219518020f5bd242bdc46493941ea3f756f664c2e86f2454721e74353f58cd95", size = 120350, upload-time = "2025-11-05T20:51:57.558Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/dc/91e3a4a11c25ae183bd5a71b84ecb298db76405ff70013f76b10877bdfe3/img2pdf-0.6.3-py3-none-any.whl", hash = "sha256:44d12d235752edd17c43c04ff39952cdc5dd4c6aba90569c4902bd445085266b", size = 49701, upload-time = "2025-11-05T20:51:55.469Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = 
"2025-12-15T08:41:44.973Z" }, +] + +[[package]] +name = "jsonlines" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359, upload-time = "2023-09-01T12:34:44.187Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701, upload-time = "2023-09-01T12:34:42.563Z" }, +] + +[[package]] +name = "jsonref" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, +] + +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = 
"2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + +[[package]] +name = "latex2mathml" +version = "3.78.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/26/57b1034c08922d0aefea79430a5e0006ffaee4f0ec59d566613f667ab2f7/latex2mathml-3.78.1.tar.gz", hash = "sha256:f941db80bf41db33f31df87b304e8b588f8166b813b0257c11c98f7a9d0aac71", size = 74030, upload-time = "2025-08-29T23:34:23.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3e/76/d661ea2e529c3d464f9efd73f9ac31626b45279eb4306e684054ea20e3d4/latex2mathml-3.78.1-py3-none-any.whl", hash = "sha256:f089b6d75e85b937f99693c93e8c16c0804008672c3dd2a3d25affd36f238100", size = 73892, upload-time = "2025-08-29T23:34:21.98Z" }, +] + +[[package]] +name = "lazy-loader" +version = "0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, +] + +[[package]] +name = "locket" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2f/83/97b29fe05cb6ae28d2dbd30b81e2e402a3eed5f460c26e9eaa5895ceacf5/locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632", size = 4350, upload-time = "2022-04-20T22:04:44.312Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/bc/83e112abc66cd466c6b83f99118035867cecd41802f8d044638aa78a106e/locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3", size = 4398, upload-time = "2022-04-20T22:04:42.23Z" }, +] + +[[package]] +name = "lxml" +version = "5.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/2d/67693cc8a605a12e5975380d7ff83020dcc759351b5a066e1cced04f797b/lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9", size = 8083240, upload-time = 
"2025-04-23T01:45:18.566Z" }, + { url = "https://files.pythonhosted.org/packages/73/53/b5a05ab300a808b72e848efd152fe9c022c0181b0a70b8bca1199f1bed26/lxml-5.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7", size = 4387685, upload-time = "2025-04-23T01:45:21.387Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/1a3879c5f512bdcd32995c301886fe082b2edd83c87d41b6d42d89b4ea4d/lxml-5.4.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa", size = 4991164, upload-time = "2025-04-23T01:45:23.849Z" }, + { url = "https://files.pythonhosted.org/packages/f9/94/bbc66e42559f9d04857071e3b3d0c9abd88579367fd2588a4042f641f57e/lxml-5.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df", size = 4746206, upload-time = "2025-04-23T01:45:26.361Z" }, + { url = "https://files.pythonhosted.org/packages/66/95/34b0679bee435da2d7cae895731700e519a8dfcab499c21662ebe671603e/lxml-5.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e", size = 5342144, upload-time = "2025-04-23T01:45:28.939Z" }, + { url = "https://files.pythonhosted.org/packages/e0/5d/abfcc6ab2fa0be72b2ba938abdae1f7cad4c632f8d552683ea295d55adfb/lxml-5.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44", size = 4825124, upload-time = "2025-04-23T01:45:31.361Z" }, + { url = "https://files.pythonhosted.org/packages/5a/78/6bd33186c8863b36e084f294fc0a5e5eefe77af95f0663ef33809cc1c8aa/lxml-5.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba", size = 4876520, 
upload-time = "2025-04-23T01:45:34.191Z" }, + { url = "https://files.pythonhosted.org/packages/3b/74/4d7ad4839bd0fc64e3d12da74fc9a193febb0fae0ba6ebd5149d4c23176a/lxml-5.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba", size = 4765016, upload-time = "2025-04-23T01:45:36.7Z" }, + { url = "https://files.pythonhosted.org/packages/24/0d/0a98ed1f2471911dadfc541003ac6dd6879fc87b15e1143743ca20f3e973/lxml-5.4.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c", size = 5362884, upload-time = "2025-04-23T01:45:39.291Z" }, + { url = "https://files.pythonhosted.org/packages/48/de/d4f7e4c39740a6610f0f6959052b547478107967362e8424e1163ec37ae8/lxml-5.4.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8", size = 4902690, upload-time = "2025-04-23T01:45:42.386Z" }, + { url = "https://files.pythonhosted.org/packages/07/8c/61763abd242af84f355ca4ef1ee096d3c1b7514819564cce70fd18c22e9a/lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86", size = 4944418, upload-time = "2025-04-23T01:45:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/6d7e3b63e7e282619193961a570c0a4c8a57fe820f07ca3fe2f6bd86608a/lxml-5.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056", size = 4827092, upload-time = "2025-04-23T01:45:48.943Z" }, + { url = "https://files.pythonhosted.org/packages/71/4a/e60a306df54680b103348545706a98a7514a42c8b4fbfdcaa608567bb065/lxml-5.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7", size = 5418231, upload-time = "2025-04-23T01:45:51.481Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/f2/9754aacd6016c930875854f08ac4b192a47fe19565f776a64004aa167521/lxml-5.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd", size = 5261798, upload-time = "2025-04-23T01:45:54.146Z" }, + { url = "https://files.pythonhosted.org/packages/38/a2/0c49ec6941428b1bd4f280650d7b11a0f91ace9db7de32eb7aa23bcb39ff/lxml-5.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751", size = 4988195, upload-time = "2025-04-23T01:45:56.685Z" }, + { url = "https://files.pythonhosted.org/packages/7a/75/87a3963a08eafc46a86c1131c6e28a4de103ba30b5ae903114177352a3d7/lxml-5.4.0-cp311-cp311-win32.whl", hash = "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4", size = 3474243, upload-time = "2025-04-23T01:45:58.863Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f9/1f0964c4f6c2be861c50db380c554fb8befbea98c6404744ce243a3c87ef/lxml-5.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539", size = 3815197, upload-time = "2025-04-23T01:46:01.096Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4c/d101ace719ca6a4ec043eb516fcfcb1b396a9fccc4fcd9ef593df34ba0d5/lxml-5.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4", size = 8127392, upload-time = "2025-04-23T01:46:04.09Z" }, + { url = "https://files.pythonhosted.org/packages/11/84/beddae0cec4dd9ddf46abf156f0af451c13019a0fa25d7445b655ba5ccb7/lxml-5.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d", size = 4415103, upload-time = "2025-04-23T01:46:07.227Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/25/d0d93a4e763f0462cccd2b8a665bf1e4343dd788c76dcfefa289d46a38a9/lxml-5.4.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779", size = 5024224, upload-time = "2025-04-23T01:46:10.237Z" }, + { url = "https://files.pythonhosted.org/packages/31/ce/1df18fb8f7946e7f3388af378b1f34fcf253b94b9feedb2cec5969da8012/lxml-5.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e", size = 4769913, upload-time = "2025-04-23T01:46:12.757Z" }, + { url = "https://files.pythonhosted.org/packages/4e/62/f4a6c60ae7c40d43657f552f3045df05118636be1165b906d3423790447f/lxml-5.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9", size = 5290441, upload-time = "2025-04-23T01:46:16.037Z" }, + { url = "https://files.pythonhosted.org/packages/9e/aa/04f00009e1e3a77838c7fc948f161b5d2d5de1136b2b81c712a263829ea4/lxml-5.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5", size = 4820165, upload-time = "2025-04-23T01:46:19.137Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/e0b2f61fa2404bf0f1fdf1898377e5bd1b74cc9b2cf2c6ba8509b8f27990/lxml-5.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5", size = 4932580, upload-time = "2025-04-23T01:46:21.963Z" }, + { url = "https://files.pythonhosted.org/packages/24/a2/8263f351b4ffe0ed3e32ea7b7830f845c795349034f912f490180d88a877/lxml-5.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4", size = 4759493, upload-time = "2025-04-23T01:46:24.316Z" }, 
+ { url = "https://files.pythonhosted.org/packages/05/00/41db052f279995c0e35c79d0f0fc9f8122d5b5e9630139c592a0b58c71b4/lxml-5.4.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e", size = 5324679, upload-time = "2025-04-23T01:46:27.097Z" }, + { url = "https://files.pythonhosted.org/packages/1d/be/ee99e6314cdef4587617d3b3b745f9356d9b7dd12a9663c5f3b5734b64ba/lxml-5.4.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7", size = 4890691, upload-time = "2025-04-23T01:46:30.009Z" }, + { url = "https://files.pythonhosted.org/packages/ad/36/239820114bf1d71f38f12208b9c58dec033cbcf80101cde006b9bde5cffd/lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079", size = 4955075, upload-time = "2025-04-23T01:46:32.33Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e1/1b795cc0b174efc9e13dbd078a9ff79a58728a033142bc6d70a1ee8fc34d/lxml-5.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20", size = 4838680, upload-time = "2025-04-23T01:46:34.852Z" }, + { url = "https://files.pythonhosted.org/packages/72/48/3c198455ca108cec5ae3662ae8acd7fd99476812fd712bb17f1b39a0b589/lxml-5.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8", size = 5391253, upload-time = "2025-04-23T01:46:37.608Z" }, + { url = "https://files.pythonhosted.org/packages/d6/10/5bf51858971c51ec96cfc13e800a9951f3fd501686f4c18d7d84fe2d6352/lxml-5.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f", size = 5261651, upload-time = "2025-04-23T01:46:40.183Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/11/06710dd809205377da380546f91d2ac94bad9ff735a72b64ec029f706c85/lxml-5.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc", size = 5024315, upload-time = "2025-04-23T01:46:43.333Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b0/15b6217834b5e3a59ebf7f53125e08e318030e8cc0d7310355e6edac98ef/lxml-5.4.0-cp312-cp312-win32.whl", hash = "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f", size = 3486149, upload-time = "2025-04-23T01:46:45.684Z" }, + { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "marko" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/2f/050b6d485f052ddf17d76a41f9334d6fb2a8a85df35347a12d97ed3bc5c1/marko-2.2.2.tar.gz", hash = "sha256:6940308e655f63733ca518c47a68ec9510279dbb916c83616e4c4b5829f052e8", size = 
143641, upload-time = "2026-01-05T11:04:41.935Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/f8/36d79bac5701e6786f9880c61bbe57574760a13c1af84ab71e5ed21faecc/marko-2.2.2-py3-none-any.whl", hash = "sha256:f064ae8c10416285ad1d96048dc11e98ef04e662d3342ae416f662b70aa7959e", size = 42701, upload-time = "2026-01-05T11:04:40.75Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, + { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, + { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, + { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = 
"2025-09-27T18:36:29.025Z" }, + { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, + { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, + { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = 
"2025-09-27T18:36:35.099Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "mpire" +version = "2.10.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270, upload-time = "2024-05-07T14:00:31.815Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/14/1db1729ad6db4999c3a16c47937d601fcb909aaa4224f5eca5a2f145a605/mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb", size = 272756, upload-time = "2024-05-07T14:00:29.633Z" }, +] + +[package.optional-dependencies] +dill = [ + { name = "multiprocess" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, + { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, + { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, + { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, + { url = 
"https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, + { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, + { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, + { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, + { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" }, + { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, 
+ { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = "2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, 
+ { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "multiprocess" +version = "0.70.19" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = 
"2026-01-19T06:47:26.636Z" }, + { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" }, + { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" }, + { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" }, + { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" }, + { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "ninja" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" }, + { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, + { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, + { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" }, + { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" }, + { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" }, + { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" }, + { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" }, + { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" }, + { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, + { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, + { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, + { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, + { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = 
"sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, + { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, + { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, + { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, + { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, + { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, +] + +[[package]] +name = "nvidia-cublas-cu11" +version = "11.11.3.6" +source = { registry = 
"https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/be/c222e33e60d28ecd496a46fc4d78ccae0ee28e1fd7dc705b6288b4cad27e/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl", hash = "sha256:39fb40e8f486dd8a2ddb8fdeefe1d5b28f5b99df01c87ab3676f057a74a5a6f3", size = 417870452, upload-time = "2022-10-18T21:17:48.638Z" }, + { url = "https://files.pythonhosted.org/packages/ea/2e/9d99c60771d275ecf6c914a612e9a577f740a615bc826bec132368e1d3ae/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:60252822adea5d0b10cd990a7dc7bedf7435f30ae40083c7a624a85a43225abc", size = 417870460, upload-time = "2024-08-17T00:00:26.889Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu11" +version = "11.8.87" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/c9/b4b15f709a694ea9f84871c6c4fbeeb54bab225962d852665a2c6f77f90d/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl", hash = "sha256:0e50c707df56c75a2c0703dc6b886f3c97a22f37d6f63839f75b7418ba672a8d", size = 13093657, upload-time = "2022-10-03T21:46:12.544Z" }, + { url = "https://files.pythonhosted.org/packages/74/42/9f5c5cc084ce6f3073048c4f6806f45ba4c8c73f227c9587215d9c372e05/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux2014_x86_64.whl", hash = "sha256:4191a17913a706b5098681280cd089cd7d8d3df209a6f5cb79384974a96d24f2", size = 13093662, upload-time = "2024-08-16T23:56:38.082Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu11" +version = "11.8.89" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/08/a9833e4e9f9165bedb7f36033b47aa399b053b9cb2eaf7b84d1e28705cf7/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:1f27d67b0f72902e9065ae568b4f6268dfe49ba3ed269c9a3da99bb86d1d2008", size = 23173264, upload-time = "2022-10-03T21:47:00.705Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/44/202e027c224c26e15a53f01c5c7604c7f6b4fd368882d3164ea08fead207/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a8d02f3cba345be56b1ffc3e74d8f61f02bb758dd31b0f20e12277a5a244f756", size = 23173745, upload-time = "2024-08-16T23:58:16.539Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu11" +version = "11.8.89" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/3e/84db02be49fe6d6df6e42f69fd64501c22d0f9ada9c9877f885612085d20/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:f587bd726eb2f7612cf77ce38a2c1e65cf23251ff49437f6161ce0d647f64f7c", size = 875585, upload-time = "2022-10-03T21:46:03.05Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ec/a540f28b31de7bc1ed49eecc72035d4cb77db88ead1d42f7bfa5ae407ac6/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:92d04069a987e1fbc9213f8376d265df0f7bb42617d44f5eda1f496acea7f2d1", size = 875592, upload-time = "2024-08-16T23:56:18.774Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu11" +version = "9.1.0.70" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/3b/0b776f04e364cd99e4cf152c2a9eadb5934c67c9a91429da55169a9447fd/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e6135ac63fe9d5b0b89cfb35c3fc1c1349f2b995becadf2e9dc21bca89d9633d", size = 663919573, upload-time = "2024-04-22T15:20:24.839Z" }, +] + +[[package]] +name = "nvidia-cufft-cu11" +version = "10.9.0.58" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/74/79/b912a77e38e41f15a0581a59f5c3548d1ddfdda3225936fb67c342719e7a/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl", hash = "sha256:222f9da70c80384632fd6035e4c3f16762d64ea7a843829cb278f98b3cb7dd81", size = 168405414, upload-time = "2022-10-03T23:29:47.505Z" }, + { url = "https://files.pythonhosted.org/packages/64/c8/133717b43182ba063803e983e7680a94826a9f4ff5734af0ca315803f1b3/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e21037259995243cc370dd63c430d77ae9280bedb68d5b5a18226bfc92e5d748", size = 168405419, upload-time = "2024-08-17T00:02:03.562Z" }, +] + +[[package]] +name = "nvidia-curand-cu11" +version = "10.3.0.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/28/c47f8e2439ddbcbeae3cf74d43ed572b651d630ea72863d5357f3759eb66/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:ac439548c88580269a1eb6aeb602a5aed32f0dbb20809a31d9ed7d01d77f6bf5", size = 58124493, upload-time = "2022-10-03T23:30:05.413Z" }, + { url = "https://files.pythonhosted.org/packages/58/e5/ce5806afc48a6e4e0dddd25316ac60b6fa94fd1791bdbf4ca17bf52696ea/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:cd4cffbf78bb06580206b4814d5dc696d1161c902aae37b2bba00056832379e6", size = 58124497, upload-time = "2024-08-17T00:03:01.833Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu11" +version = "11.4.1.48" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/ee/939ff0104991dd7bdabb4c9767994c612ba0e1c9a55672a1ddd42f5e5b16/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:ca538f545645b7e6629140786d3127fe067b3d5a085bd794cde5bfe877c8926f", size = 128240842, upload-time = "2022-10-03T23:30:24.348Z" }, + { url = "https://files.pythonhosted.org/packages/52/fe/866e87e6e6a1b0a5fcf8524a058042656702f2057e22bfdb8899a7c38e10/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea9fb1ad8c644ca9ed55af13cc39af3b7ba4c3eb5aef18471fe1fe77d94383cb", size = 128246438, upload-time = "2024-08-17T00:03:52.432Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu11" +version = "11.7.5.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/e0/21b829c535d569831835a4ca5d049a19ba00d3e91f3e12ab4ad27bd7385f/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:4ae709fe78d3f23f60acaba8c54b8ad556cf16ca486e0cc1aa92dca7555d2d2b", size = 204126221, upload-time = "2022-10-18T21:19:28.04Z" }, + { url = "https://files.pythonhosted.org/packages/ed/5c/b0333b07c51ced77397c2fb0d9826072cea0da9d421aa7e792aa0f8ecc72/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8d7cf1628fd8d462b5d2ba6678fae34733a48ecb80495b9c68672ec6a6dde5ef", size = 204126227, upload-time = "2024-08-17T00:05:20.798Z" }, +] + +[[package]] +name = "nvidia-nccl-cu11" +version = "2.21.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/9a/8b6a28b3b87d5fddab0e92cd835339eb8fbddaa71ae67518c8c1b3d05bae/nvidia_nccl_cu11-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:49d8350629c7888701d1fd200934942671cb5c728f49acc5a0b3a768820bed29", size = 147811630, upload-time = "2024-04-03T15:33:12.879Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu11" +version = "11.8.86" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d5/a2/23214c23118784dc2189ac2d2e48190df3e4206e2f73eb17d47140797a2b/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:890656d8bd9b4e280231c832e1f0d03459200ba4824ddda3dcb59b1e1989b9f5", size = 99125, upload-time = "2022-10-03T21:47:19.565Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, +] + +[[package]] +name = "opencv-python-headless" +version = "4.11.0.86" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/2f/5b2b3ba52c864848885ba988f24b7f105052f68da9ab0e693cc7c25b0b30/opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798", size = 95177929, upload-time = "2025-01-16T13:53:40.22Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/53/2c50afa0b1e05ecdb4603818e85f7d174e683d874ef63a6abe3ac92220c8/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca", size = 37326460, upload-time = "2025-01-16T13:52:57.015Z" }, + { url = "https://files.pythonhosted.org/packages/3b/43/68555327df94bb9b59a1fd645f63fafb0762515344d2046698762fc19d58/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81", size = 56723330, upload-time = "2025-01-16T13:55:45.731Z" }, + { url = "https://files.pythonhosted.org/packages/45/be/1438ce43ebe65317344a87e4b150865c5585f4c0db880a34cdae5ac46881/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb", size = 29487060, upload-time = "2025-01-16T13:51:59.625Z" }, + { url = "https://files.pythonhosted.org/packages/dd/5c/c139a7876099916879609372bfa513b7f1257f7f1a908b0bdc1c2328241b/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b", size = 49969856, upload-time = "2025-01-16T13:53:29.654Z" }, + { url = "https://files.pythonhosted.org/packages/95/dd/ed1191c9dc91abcc9f752b499b7928aacabf10567bb2c2535944d848af18/opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b", size = 29324425, upload-time = "2025-01-16T13:52:49.048Z" }, + { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, +] + +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url 
= "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", 
size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = 
"https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, +] + +[[package]] +name = "partd" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "locket" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/3a/3f06f34820a31257ddcabdfafc2672c5816be79c7e353b02c1f318daa7d4/partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c", size = 
21029, upload-time = "2024-05-06T19:51:41.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, +] + +[[package]] +name = "pikepdf" +version = "10.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "lxml" }, + { name = "packaging" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/ba/7635a5f4259a2a91ed4f094e358dec3068ecedc891d70b8e76a02904ca0c/pikepdf-10.3.0.tar.gz", hash = "sha256:e2a64a5f1ebf8c411193126b9eeff7faf5739a40bce7441e579531422469fbb1", size = 4575749, upload-time = "2026-01-30T07:33:53.317Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a9/0d2107a3c796ab2fa7d379ee801190c95c4132f0bb5cfc1fd8d2e3ac74af/pikepdf-10.3.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:99fb21d20dc02f9828d477d2c549ee3f6e191801f84a2a2505d21baacb731745", size = 4753016, upload-time = "2026-01-30T07:32:51.999Z" }, + { url = "https://files.pythonhosted.org/packages/a9/2b/f634a0956aa15074db6c62309ec3d08bd158ddbdea8bd2081cea8b6eb3ed/pikepdf-10.3.0-cp311-cp311-macosx_15_0_x86_64.whl", hash = "sha256:c8a4b6862d7e0e69dd3f57efd362826966d1f341e0d052f7f23f0fe3a2375a36", size = 5063869, upload-time = "2026-01-30T07:32:54.418Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/d5ba1febacde805e7ec75a3df0888e53212f8e5f82fa1fc09c0fa981c7f9/pikepdf-10.3.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b86d42e66004ffaf5284aae0d9814bb3d19f048a45943479db5ca3d02d46bfb", size = 2445530, upload-time = "2026-01-30T07:32:56.117Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/ba/196351a049a7a9d255140a414f586779b3ad77f0d09091e639d9f85c4131/pikepdf-10.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7021b31eddd5aa611f6941a2c171b7ce321c7763263ff658368f5f40bda1d4", size = 2673622, upload-time = "2026-01-30T07:32:57.85Z" }, + { url = "https://files.pythonhosted.org/packages/7c/cf/1315759de9dc66f769f84067da2127046e46489100f6e2be614fcb6c8394/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b653b1d0c5f17efb080ef68b65d3fcc8909f22128b75e0479775a35cd8d9fe6e", size = 3644910, upload-time = "2026-01-30T07:33:00.182Z" }, + { url = "https://files.pythonhosted.org/packages/80/6f/578ee7b53d06267f6c489fb7734792f6fa670a3a7d0b55db20b084e0957d/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fa3e4b32a2c1d15bb57e91ee3896c19b3c8145d46c26fbac8747efe7cb5ce3bd", size = 3835871, upload-time = "2026-01-30T07:33:02.804Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0f/980dbfb5ab9231d30e44d9285e8a7509f0871fc6fe438559e1eed16e683d/pikepdf-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:3233da668d665d301a4a4fd1481867e688336fdb410e9bc9d4e5b0cd62e334eb", size = 3756976, upload-time = "2026-01-30T07:33:05.596Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/d6ca7f6066d7f3b61b56bffeca1069c0ded635ba316aa1df54fcc0e2104f/pikepdf-10.3.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d1a6646def3fc47f763eab0dcb11341a7205cef1b7dc5c62f1dee435a89472b9", size = 4762039, upload-time = "2026-01-30T07:33:08.626Z" }, + { url = "https://files.pythonhosted.org/packages/9c/dc/d0db713a34a493eedf4eded566668762aee5acfad958bdf374a450df931c/pikepdf-10.3.0-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:e968e4e81d6c05d8e4b24594b27a64cb9be3c7a4371bf0635f6b669559171e6b", size = 5078640, upload-time = "2026-01-30T07:33:10.478Z" }, + { url = 
"https://files.pythonhosted.org/packages/21/c0/e0a1f1afb99ecac5f7f21313b47c174178f85df0f1ec7080e0d431324099/pikepdf-10.3.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfad0e4e6bc268ca041d639b232d76c25c9ad7023b7189d14869ef4446cabda2", size = 2450284, upload-time = "2026-01-30T07:33:12.215Z" }, + { url = "https://files.pythonhosted.org/packages/db/3a/2f0e8bd70cf57896a85b1d7f7ca3ce79d91a17222e1b23b607860ea52a5d/pikepdf-10.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cf7ab25f1e9063de320d2edecb2cd2960329cc25bac645c7938390f6538d9bf", size = 2699411, upload-time = "2026-01-30T07:33:13.878Z" }, + { url = "https://files.pythonhosted.org/packages/fd/10/da5f244aa14b845cd835f34b6a7a217493952f2532d2e00957ed3bd79aea/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3904353137e5b0cb2a316d84057e1e5301a65e6b1810d4763348ae8919ba20f4", size = 3649524, upload-time = "2026-01-30T07:33:15.641Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ef/3efb78a16d9c702dfd64fdeaee6a1ac6af95c41d4ec60b784e9171f20753/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4335ec70a659b5be1dfc7094a67db7f9c017c9c1cf9049b56d0e35ad24a46ff0", size = 3861320, upload-time = "2026-01-30T07:33:17.466Z" }, + { url = "https://files.pythonhosted.org/packages/8d/63/b0243fe62cf5d4d9da49010a15e0177b9629b8183092b3bd804f59a1529a/pikepdf-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac5befc1e991e28b16be104c219bdb1f6cf62a8371f4019ce7bab64ec5ec5745", size = 3763570, upload-time = "2026-01-30T07:33:19.863Z" }, +] + +[[package]] +name = "pillow" +version = "10.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/62/c9449f9c3043c37f73e7487ec4ef0c03eb9c9afc91a92b977a67b3c0bbc5/pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c", size = 3509265, upload-time = "2024-07-01T09:45:49.812Z" }, + { url = "https://files.pythonhosted.org/packages/f4/5f/491dafc7bbf5a3cc1845dc0430872e8096eb9e2b6f8161509d124594ec2d/pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be", size = 3375655, upload-time = "2024-07-01T09:45:52.462Z" }, + { url = "https://files.pythonhosted.org/packages/73/d5/c4011a76f4207a3c151134cd22a1415741e42fa5ddecec7c0182887deb3d/pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3", size = 4340304, upload-time = "2024-07-01T09:45:55.006Z" }, + { url = "https://files.pythonhosted.org/packages/ac/10/c67e20445a707f7a610699bba4fe050583b688d8cd2d202572b257f46600/pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6", size = 4452804, upload-time = "2024-07-01T09:45:58.437Z" }, + { url = "https://files.pythonhosted.org/packages/a9/83/6523837906d1da2b269dee787e31df3b0acb12e3d08f024965a3e7f64665/pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe", size = 4365126, upload-time = "2024-07-01T09:46:00.713Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e5/8c68ff608a4203085158cff5cc2a3c534ec384536d9438c405ed6370d080/pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319", size = 4533541, upload-time = "2024-07-01T09:46:03.235Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/7c/01b8dbdca5bc6785573f4cee96e2358b0918b7b2c7b60d8b6f3abf87a070/pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d", size = 4471616, upload-time = "2024-07-01T09:46:05.356Z" }, + { url = "https://files.pythonhosted.org/packages/c8/57/2899b82394a35a0fbfd352e290945440e3b3785655a03365c0ca8279f351/pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696", size = 4600802, upload-time = "2024-07-01T09:46:08.145Z" }, + { url = "https://files.pythonhosted.org/packages/4d/d7/a44f193d4c26e58ee5d2d9db3d4854b2cfb5b5e08d360a5e03fe987c0086/pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496", size = 2235213, upload-time = "2024-07-01T09:46:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/c1/d0/5866318eec2b801cdb8c82abf190c8343d8a1cd8bf5a0c17444a6f268291/pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91", size = 2554498, upload-time = "2024-07-01T09:46:12.685Z" }, + { url = "https://files.pythonhosted.org/packages/d4/c8/310ac16ac2b97e902d9eb438688de0d961660a87703ad1561fd3dfbd2aa0/pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22", size = 2243219, upload-time = "2024-07-01T09:46:14.83Z" }, + { url = "https://files.pythonhosted.org/packages/05/cb/0353013dc30c02a8be34eb91d25e4e4cf594b59e5a55ea1128fde1e5f8ea/pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94", size = 3509350, upload-time = "2024-07-01T09:46:17.177Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5c558a0f247e0bf9cec92bff9b46ae6474dd736f6d906315e60e4075f737/pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", 
hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597", size = 3374980, upload-time = "2024-07-01T09:46:19.169Z" }, + { url = "https://files.pythonhosted.org/packages/84/48/6e394b86369a4eb68b8a1382c78dc092245af517385c086c5094e3b34428/pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80", size = 4343799, upload-time = "2024-07-01T09:46:21.883Z" }, + { url = "https://files.pythonhosted.org/packages/3b/f3/a8c6c11fa84b59b9df0cd5694492da8c039a24cd159f0f6918690105c3be/pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca", size = 4459973, upload-time = "2024-07-01T09:46:24.321Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1b/c14b4197b80150fb64453585247e6fb2e1d93761fa0fa9cf63b102fde822/pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef", size = 4370054, upload-time = "2024-07-01T09:46:26.825Z" }, + { url = "https://files.pythonhosted.org/packages/55/77/40daddf677897a923d5d33329acd52a2144d54a9644f2a5422c028c6bf2d/pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a", size = 4539484, upload-time = "2024-07-01T09:46:29.355Z" }, + { url = "https://files.pythonhosted.org/packages/40/54/90de3e4256b1207300fb2b1d7168dd912a2fb4b2401e439ba23c2b2cabde/pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b", size = 4477375, upload-time = "2024-07-01T09:46:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/13/24/1bfba52f44193860918ff7c93d03d95e3f8748ca1de3ceaf11157a14cf16/pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9", size = 4608773, upload-time = "2024-07-01T09:46:33.73Z" }, + { url = "https://files.pythonhosted.org/packages/55/04/5e6de6e6120451ec0c24516c41dbaf80cce1b6451f96561235ef2429da2e/pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42", size = 2235690, upload-time = "2024-07-01T09:46:36.587Z" }, + { url = "https://files.pythonhosted.org/packages/74/0a/d4ce3c44bca8635bd29a2eab5aa181b654a734a29b263ca8efe013beea98/pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a", size = 2554951, upload-time = "2024-07-01T09:46:38.777Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/184349ee40f2e92439be9b3502ae6cfc43ac4b50bc4fc6b3de7957563894/pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9", size = 2243427, upload-time = "2024-07-01T09:46:43.15Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = 
"sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" }, + { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" }, + { url = "https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" }, + { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size = 202064, upload-time = "2025-10-08T19:46:36.993Z" }, + { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" }, + { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" }, + { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" }, + { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" }, + { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = "2025-10-08T19:46:53.208Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 
216736, upload-time = "2025-10-08T19:46:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = 
"https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, +] + +[[package]] +name = "pyclipper" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/21/3c06205bb407e1f79b73b7b4dfb3950bd9537c4f625a68ab5cc41177f5bc/pyclipper-1.4.0.tar.gz", hash = "sha256:9882bd889f27da78add4dd6f881d25697efc740bf840274e749988d25496c8e1", size = 54489, upload-time = "2025-12-01T13:15:35.015Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/e3/64cf7794319b088c288706087141e53ac259c7959728303276d18adc665d/pyclipper-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:adcb7ca33c5bdc33cd775e8b3eadad54873c802a6d909067a57348bcb96e7a2d", size = 264281, upload-time = "2025-12-01T13:14:55.47Z" }, + { url = "https://files.pythonhosted.org/packages/34/cd/44ec0da0306fa4231e76f1c2cb1fa394d7bde8db490a2b24d55b39865f69/pyclipper-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fd24849d2b94ec749ceac7c34c9f01010d23b6e9d9216cf2238b8481160e703d", size = 139426, upload-time = "2025-12-01T13:14:56.683Z" }, + { url = "https://files.pythonhosted.org/packages/ad/88/d8f6c6763ea622fe35e19c75d8b39ed6c55191ddc82d65e06bc46b26cb8e/pyclipper-1.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b6c8d75ba20c6433c9ea8f1a0feb7e4d3ac06a09ad1fd6d571afc1ddf89b869", size = 989649, upload-time = "2025-12-01T13:14:58.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/e9/ea7d68c8c4af3842d6515bedcf06418610ad75f111e64c92c1d4785a1513/pyclipper-1.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e29d7443d7cc0e83ee9daf43927730386629786d00c63b04fe3b53ac01462c", size = 962842, upload-time = "2025-12-01T13:15:00.044Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b7/0b4a272d8726e51ab05e2b933d8cc47f29757fb8212e38b619e170e6015c/pyclipper-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a8d2b5fb75ebe57e21ce61e79a9131edec2622ff23cc665e4d1d1f201bc1a801", size = 95098, upload-time = "2025-12-01T13:15:01.359Z" }, + { url = "https://files.pythonhosted.org/packages/3a/76/4901de2919198bb2bd3d989f86d4a1dff363962425bb2d63e24e6c990042/pyclipper-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:e9b973467d9c5fa9bc30bb6ac95f9f4d7c3d9fc25f6cf2d1cc972088e5955c01", size = 104362, upload-time = "2025-12-01T13:15:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/90/1b/7a07b68e0842324d46c03e512d8eefa9cb92ba2a792b3b4ebf939dafcac3/pyclipper-1.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:222ac96c8b8281b53d695b9c4fedc674f56d6d4320ad23f1bdbd168f4e316140", size = 265676, upload-time = "2025-12-01T13:15:04.15Z" }, + { url = "https://files.pythonhosted.org/packages/6b/dd/8bd622521c05d04963420ae6664093f154343ed044c53ea260a310c8bb4d/pyclipper-1.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f3672dbafbb458f1b96e1ee3e610d174acb5ace5bd2ed5d1252603bb797f2fc6", size = 140458, upload-time = "2025-12-01T13:15:05.76Z" }, + { url = "https://files.pythonhosted.org/packages/7a/06/6e3e241882bf7d6ab23d9c69ba4e85f1ec47397cbbeee948a16cf75e21ed/pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d1f807e2b4760a8e5c6d6b4e8c1d71ef52b7fe1946ff088f4fa41e16a881a5ca", size = 978235, upload-time = "2025-12-01T13:15:06.993Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/f4/3418c1cd5eea640a9fa2501d4bc0b3655fa8d40145d1a4f484b987990a75/pyclipper-1.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce1f83c9a4e10ea3de1959f0ae79e9a5bd41346dff648fee6228ba9eaf8b3872", size = 961388, upload-time = "2025-12-01T13:15:08.467Z" }, + { url = "https://files.pythonhosted.org/packages/ac/94/c85401d24be634af529c962dd5d781f3cb62a67cd769534df2cb3feee97a/pyclipper-1.4.0-cp312-cp312-win32.whl", hash = "sha256:3ef44b64666ebf1cb521a08a60c3e639d21b8c50bfbe846ba7c52a0415e936f4", size = 95169, upload-time = "2025-12-01T13:15:10.098Z" }, + { url = "https://files.pythonhosted.org/packages/97/77/dfea08e3b230b82ee22543c30c35d33d42f846a77f96caf7c504dd54fab1/pyclipper-1.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:d1e5498d883b706a4ce636247f0d830c6eb34a25b843a1b78e2c969754ca9037", size = 104619, upload-time = "2025-12-01T13:15:11.592Z" }, + { url = "https://files.pythonhosted.org/packages/18/59/81050abdc9e5b90ffc2c765738c5e40e9abd8e44864aaa737b600f16c562/pyclipper-1.4.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98b2a40f98e1fc1b29e8a6094072e7e0c7dfe901e573bf6cfc6eb7ce84a7ae87", size = 126495, upload-time = "2025-12-01T13:15:33.743Z" }, +] + +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = 
"sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, + { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, + { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, + { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, + { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, 
upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, + { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = 
"sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, + { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, + { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, +] + +[[package]] +name = "pydantic-settings" +version = "2.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = "2026-02-19T13:45:08.055Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pylatexenc" +version = "2.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" } + +[[package]] +name = "pymupdf" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pymupdfb" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/83/57/da06ca4886afc71a624e4b463d05f45c8a822596ede939957295e229eb4e/PyMuPDF-1.24.10.tar.gz", hash = "sha256:bd3ebd6d3fb8a845582098362f885bfb0a31ae4272587efc2c55c5e29fe7327a", size = 46988085, upload-time = "2024-09-02T16:28:45.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/35/6af0bb4bafe9d54893a04d9639f73b1b754efe0235997052d75fb6b7edc1/PyMuPDF-1.24.10-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:5fbd67cce759fc0126902137409cf9da6313b776c4d5ff0d5200f336350f86a3", size = 3194012, upload-time = "2024-09-02T16:27:14.019Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2b/c254cf49dfcf2469a674407a680f5b2b174b866e84d322f5767baf4d3ad3/PyMuPDF-1.24.10-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:2b14dbdf7c415bb0fa849527abbe7b4f1f55ae23b9355d132951f634438c59ac", size = 2974781, upload-time = "2024-09-02T16:27:17.213Z" }, + { url = "https://files.pythonhosted.org/packages/1c/77/78800d3a711f92060f8e338a5df9330ffb5950f4fb3beeba01e15c03c4c6/PyMuPDF-1.24.10-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:1a87440a6cbc0d5ad513425baa0f4747841898fca6e37350ca3e6b29e5f40c01", size = 3210393, upload-time = "2024-09-02T22:17:05.788Z" }, + { url = "https://files.pythonhosted.org/packages/c5/39/3aaa1e8822c55c71bb37911b5b1c3157ef38d731581224b29a682d80a17b/PyMuPDF-1.24.10-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:c0d1ccdc062ea9961063790831e838bc43fcf9a8436a8b9f55898addf97c0f86", size = 3482650, upload-time = "2024-09-02T16:27:21.101Z" }, + { url = "https://files.pythonhosted.org/packages/5b/73/6b5c2dc59539b79cb9430ff946d7dff308af146f7c8bc7b96c963e12970d/PyMuPDF-1.24.10-cp311-none-musllinux_1_2_x86_64.whl", hash = "sha256:f68671363be5a2ba104ab7d3bad821d2994cbe3f3408538bbc27d32e6dc9f923", size = 3600588, upload-time = "2024-09-02T16:27:25.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/e9/d3bf062325b4821726a2f9ce9d75b63f594ae24bc38c31f55b4285f1f5e1/PyMuPDF-1.24.10-cp311-none-win32.whl", hash = "sha256:49f83556cd1a7d05b36a54ccc01fce324da8a4e6854e36cc5cd94d321e428565", size = 2694768, upload-time = "2024-09-02T16:27:33.318Z" }, + { url = "https://files.pythonhosted.org/packages/30/3f/356a70c105d4410c29529f1ca8c53b5d176b448a4409238b4dcd133507a4/PyMuPDF-1.24.10-cp311-none-win_amd64.whl", hash = "sha256:05b8d360766b87f4abd186eba16a56b92bae513b2361b13f633fe6256329292e", size = 3214889, upload-time = "2024-09-02T16:27:28.174Z" }, + { url = "https://files.pythonhosted.org/packages/75/84/7231344d98355a40fb57c4025391dfb4116e2c3e9d98d5cc83f80c5ea942/PyMuPDF-1.24.10-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f323aa7bb55e0214e632bfe24fa140bd5dcfeac2d3977bdce46e760385140513", size = 3230169, upload-time = "2024-09-02T16:27:37.842Z" }, + { url = "https://files.pythonhosted.org/packages/b2/bc/975b4fe4400b00c912dad1874c43d31486150e6f39d7dae758751c27e2dd/PyMuPDF-1.24.10-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:50d2972558d25ce46a8634b58787b28dbeff9b3fe4299530fc9c8c9921061e83", size = 2980118, upload-time = "2024-09-02T16:27:41.534Z" }, + { url = "https://files.pythonhosted.org/packages/5b/dc/0f22c77ac4f8e6b8316072519513d5f0111fffe96d357051db0ddf043032/PyMuPDF-1.24.10-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:0e3969c2fdff682b3b2c6a2b463adde068d6d8e20e2133ef6c8503469259646a", size = 3216830, upload-time = "2024-09-02T22:17:09.193Z" }, + { url = "https://files.pythonhosted.org/packages/a3/1b/1b41b27aab571b835f8d983492b80ed64548e3b5c4d169e23c639727d43b/PyMuPDF-1.24.10-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:cd78ee1ebefdfe72bc36fd4b731cc8c694eb8ef5337d8ea956b0e94cd88751fc", size = 3491118, upload-time = "2024-09-02T16:27:50.098Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/3c/f1ffbc6e13ab37900c2aa71e434bbba922770091242e2b059acdb14f779e/PyMuPDF-1.24.10-cp312-none-musllinux_1_2_x86_64.whl", hash = "sha256:696eed91d2ee44e76277dfeb6bd904c84ae005378588949df6ed9be9e03b9817", size = 3612589, upload-time = "2024-09-02T16:27:54.185Z" }, + { url = "https://files.pythonhosted.org/packages/53/fb/158909af75c84968ea7e6659a75fd67bd462103c599033b23ffd6bc173be/PyMuPDF-1.24.10-cp312-none-win32.whl", hash = "sha256:1e5413e1aeab2f18e1ca1b3ff17057a4a7c5cbf4ff14abc93203da88fc1a1dd8", size = 2701190, upload-time = "2024-09-02T16:27:57.74Z" }, + { url = "https://files.pythonhosted.org/packages/91/4a/4a54d3f6a779ac5eed92e82fe3c1bb426bc40f9ea57c8656839198944a82/PyMuPDF-1.24.10-cp312-none-win_amd64.whl", hash = "sha256:227a4473fce8fa32b9268da68781048795503b67dc045867fc201e1334204bf1", size = 3228084, upload-time = "2024-09-02T16:27:45.749Z" }, +] + +[[package]] +name = "pymupdfb" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/ff/ecfcb41414b51976974d74c8e35fef0a0e5b47c7046a11c860553f5dccf0/PyMuPDFb-1.24.10.tar.gz", hash = "sha256:007b91fa9b528c5c0eecea2e49c486ac02e878274f9e31522bdd948adc5f8327", size = 37502, upload-time = "2024-09-02T16:28:48.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/94/b217dc987b4ac0e3793984427112d6032563b741e27763f7761c2231d022/PyMuPDFb-1.24.10-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:cd6b24630d90dce9ab3e59d06c5e616686f8d7ec626be1311721fcb062aa0078", size = 15536229, upload-time = "2024-09-02T16:25:19.4Z" }, + { url = "https://files.pythonhosted.org/packages/16/7a/f634c76d8331cb8dedcfaced17424cc469ee20b7f53cf29c9ef17a01b461/PyMuPDFb-1.24.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fda2c34b206f724b1b5685b67188e2a57bcaa5c99bc40a0a5bc62057514c5cdf", size = 15149482, upload-time = "2024-09-02T16:25:34.352Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/97/67b5da2edd034e66dadd0ec530e277afb14fe866a3b3b01d9fad154bc6f8/PyMuPDFb-1.24.10-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4f50a7472f9bb10cbc7a1cd589ee4626ca030b8a4a02749f9a29eb6f00c0e0db", size = 15711338, upload-time = "2024-09-02T22:17:01.592Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/ad3f076e86328880797fe7e98c43b2879df56cf6cb75ac3058da06d6e6cb/PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:409f1270ef2e70d845e80149ff3db9cfed578274042316cba55cc3e3882421ea", size = 15921939, upload-time = "2024-09-02T16:26:00.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/e7/02160ea905a7ba16d6e1ca51759ae1c1045785ebebae57ba30e82617f934/PyMuPDFb-1.24.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:aca96b6e9ee3096a26810592f4d899f4d3cf3cf0c902ae7e8cca09bce4d946c4", size = 17076991, upload-time = "2024-09-02T16:25:46.703Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c0/e1ed840440131f71b068cdb3b620a69ec27543b1012a6bd855d8d05f1629/PyMuPDFb-1.24.10-py3-none-win32.whl", hash = "sha256:2d231b42fe3bf79837df235e7fbdf7ff8b46bf4ca1346d0f0124fb1cdd343ce8", size = 11731706, upload-time = "2024-09-02T16:26:19.131Z" }, + { url = "https://files.pythonhosted.org/packages/70/cb/8459d6c179befd7c6eee555334f054e9a6dcdd9f8671891e1da19e0ce526/PyMuPDFb-1.24.10-py3-none-win_amd64.whl", hash = "sha256:27ea65c701608b6b7632703339ca33ea6d513843b26dbe9bdefb2f56f7b9b196", size = 13186168, upload-time = "2024-09-02T16:26:10.503Z" }, +] + +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform 
== 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "python-bidi" +version = "0.6.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ed/e3/c0c8bf6fca79ac946a28d57f116e3b9e5b10a4469b6f70bf73f3744c49bf/python_bidi-0.6.7.tar.gz", hash = "sha256:c10065081c0e137975de5d9ba2ff2306286dbf5e0c586d4d5aec87c856239b41", size = 45503, upload-time = "2025-10-22T09:52:49.624Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/de/c30a13ad95239507af472a5fc2cadd2e5e172055068f12ac39b37922c7f8/python_bidi-0.6.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a8892a7da0f617135fe9c92dc7070d13a0f96ab3081f9db7ff5b172a3905bd78", size = 274420, upload-time = "2025-10-22T09:51:58.262Z" }, + { url = "https://files.pythonhosted.org/packages/ad/9f/be5efef7eea5f1e2a6415c4052a988f594dcf5a11a15103f2718d324a35b/python_bidi-0.6.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:06650a164e63e94dc8a291cc9d415b4027cb1cce125bc9b02dac0f34d535ed47", size = 264586, upload-time = "2025-10-22T09:51:49.255Z" }, + { url = "https://files.pythonhosted.org/packages/87/ec/2c374b6de35870817ffb3512c0666ea8c3794ef923b5586c69451e0e5395/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6df7be07af867ec1d121c92ea827efad4d77b25457c06eeab477b601e82b2340", size = 293672, upload-time = "2025-10-22T09:50:58.504Z" }, + { url = "https://files.pythonhosted.org/packages/29/1a/722d7d7128bdc9a530351a0d2fdf2ff5f4af66a865a6bca925f99832e2cc/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:73a88dc333efc42281bd800d5182c8625c6e11d109fc183fe3d7a11d48ab1150", size = 302643, upload-time = "2025-10-22T09:51:06.419Z" }, + { url = "https://files.pythonhosted.org/packages/24/d7/5b9b593dd58fc745233d8476e9f4e0edd437547c78c58340619868470349/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f24189dc3aea3a0a94391a047076e1014306b39ba17d7a38ebab510553cd1a97", size = 441692, upload-time = "2025-10-22T09:51:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/08/b9/16e7a1db5f022da6654e89875d231ec2e044d42ef7b635feeff61cee564c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a507fe6928a27a308e04ebf2065719b7850d1bf9ff1924f4e601ef77758812bd", size = 326933, upload-time = "2025-10-22T09:51:23.631Z" }, + { url = "https://files.pythonhosted.org/packages/e0/a6/45aaec301292c6a07a9cc3168f5d1a92c8adc2ef36a3cd1f227b9caa980c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbbffb948a32f9783d1a28bc0c53616f0a76736ed1e7c1d62e3e99a8dfaab869", size = 302034, upload-time = "2025-10-22T09:51:41.347Z" }, + { url = "https://files.pythonhosted.org/packages/71/a3/7e42cce6e153c21b4e5cc96d429a5910909823f6fedd174b64ff67bc76a7/python_bidi-0.6.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7e507e1e798ebca77ddc9774fd405107833315ad802cfdaa1ab07b6d9154fc8", size = 315738, upload-time = "2025-10-22T09:51:33.409Z" }, + { url = "https://files.pythonhosted.org/packages/43/7c/a5e4c0acc8e6ca61953b4add0576f0483f63b809b5389154e5da13927b0b/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:849a57d39feaf897955d0b19bbf4796bea53d1bcdf83b82e0a7b059167eb2049", size = 473968, upload-time = "2025-10-22T09:52:07.624Z" }, + { url = "https://files.pythonhosted.org/packages/b1/aa/a18bc3cbab7a0e598cbe7b89f2c0913aedcc66dcafce9a4c357465c87859/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5ebc19f24e65a1f5c472e26d88e78b9d316e293bc6f205f32de4c4e99276336e", size = 567038, upload-time = "2025-10-22T09:52:18.594Z" }, + { url = "https://files.pythonhosted.org/packages/92/46/fc6c54a8b5bfbee50e650f885ddef4f8c4f92880467ea0bc2bf133747048/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:24388c77cb00b8aa0f9c84beb7e3e523a3dac4f786ece64a1d8175a07b24da72", size = 493970, upload-time = "2025-10-22T09:52:29.815Z" }, + { url = "https://files.pythonhosted.org/packages/e3/f1/2c15f5b938b2e087e4e950cc14dcead5bedbaabfc6c576dac15739bc0c91/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:19737d217088ef27014f98eac1827c5913e6fb1dea96332ed84ede61791070d9", size = 465161, upload-time = "2025-10-22T09:52:40.517Z" }, + { url = "https://files.pythonhosted.org/packages/56/d7/73a70a1fb819152485521b8dfe627e14ba9d3d5a65213244ab099adf3600/python_bidi-0.6.7-cp311-cp311-win32.whl", hash = "sha256:95c9de7ebc55ffb777548f2ecaf4b96b0fa0c92f42bf4d897b9f4cd164ec7394", size = 157033, upload-time = "2025-10-22T09:52:59.228Z" }, + { url = "https://files.pythonhosted.org/packages/68/84/06999dc54ea047fe33209af7150df4202ab7ad52deeb66b2c2040ac07884/python_bidi-0.6.7-cp311-cp311-win_amd64.whl", hash = "sha256:898db0ea3e4aaa95b7fecba02a7560dfbf368f9d85053f2875f6d610c4d4ec2c", size = 161282, upload-time = "2025-10-22T09:52:51.467Z" }, + { url = "https://files.pythonhosted.org/packages/e5/03/5b2f3e73501d0f41ebc2b075b49473047c6cdfc3465cf890263fc69e3915/python_bidi-0.6.7-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:11c51579e01f768446a7e13a0059fea1530936a707abcbeaad9467a55cb16073", size = 272536, upload-time = "2025-10-22T09:51:59.721Z" 
}, + { url = "https://files.pythonhosted.org/packages/31/77/c6048e938a73e5a7c6fa3d5e3627a5961109daa728c2e7d050567cecdc26/python_bidi-0.6.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47deaada8949af3a790f2cd73b613f9bfa153b4c9450f91c44a60c3109a81f73", size = 263258, upload-time = "2025-10-22T09:51:50.328Z" }, + { url = "https://files.pythonhosted.org/packages/57/56/ed4dc501cab7de70ce35cd435c86278e4eb1caf238c80bc72297767c9219/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b38ddfab41d10e780edb431edc30aec89bee4ce43d718e3896e99f33dae5c1d3", size = 292700, upload-time = "2025-10-22T09:50:59.628Z" }, + { url = "https://files.pythonhosted.org/packages/77/6a/1bf06d7544c940ffddd97cd0e02c55348a92163c5495fa18e34217dfbebe/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a93b0394cc684d64356b0475858c116f1e335ffbaba388db93bf47307deadfa", size = 300881, upload-time = "2025-10-22T09:51:07.507Z" }, + { url = "https://files.pythonhosted.org/packages/22/1d/ce7577a8f50291c06e94f651ac5de0d1678fc2642af26a5dad9901a0244f/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec1694134961b71ac05241ac989b49ccf08e232b5834d5fc46f8a7c3bb1c13a9", size = 439125, upload-time = "2025-10-22T09:51:16.559Z" }, + { url = "https://files.pythonhosted.org/packages/a3/87/4cf6dcd58e22f0fd904e7a161c6b73a5f9d17d4d49073fcb089ba62f1469/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8047c33b85f7790474a1f488bef95689f049976a4e1c6f213a8d075d180a93e4", size = 325816, upload-time = "2025-10-22T09:51:25.12Z" }, + { url = "https://files.pythonhosted.org/packages/2a/0a/4028a088e29ce8f1673e85ec9f64204fc368355c3207e6a71619c2b4579a/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d9de35eb5987da27dd81e371c52142dd8e924bd61c1006003071ea05a735587", size = 300550, upload-time = 
"2025-10-22T09:51:42.739Z" }, + { url = "https://files.pythonhosted.org/packages/1f/05/cac15eba462d5a2407ac4ef1c792c45a948652b00c6bd81eaab3834a62d2/python_bidi-0.6.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a99d898ad1a399d9c8cab5561b3667fd24f4385820ac90c3340aa637aa5adfc9", size = 313017, upload-time = "2025-10-22T09:51:34.905Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b1/3ba91b9ea60fa54a9aa730a5fe432bd73095d55be371244584fc6818eae1/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5debaab33562fdfc79ffdbd8d9c51cf07b8529de0e889d8cd145d78137aab21e", size = 472798, upload-time = "2025-10-22T09:52:09.079Z" }, + { url = "https://files.pythonhosted.org/packages/50/40/4bf5fb7255e35c218174f322a4d4c80b63b2604d73adc6e32f843e700824/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c11c62a3cdb9d1426b1536de9e3446cb09c7d025bd4df125275cae221f214899", size = 565234, upload-time = "2025-10-22T09:52:19.703Z" }, + { url = "https://files.pythonhosted.org/packages/bd/81/ad23fb85bff69d0a25729cd3834254b87c3c7caa93d657c8f8edcbed08f6/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6c051f2d28ca542092d01da8b5fe110fb6191ff58d298a54a93dc183bece63bf", size = 491844, upload-time = "2025-10-22T09:52:31.216Z" }, + { url = "https://files.pythonhosted.org/packages/65/85/103baaf142b2838f583b71904a2454fa31bd2a912ff505c25874f45d6c3e/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95867a07c5dee0ea2340fe1d0e4f6d9f5c5687d473193b6ee6f86fa44aac45d1", size = 463753, upload-time = "2025-10-22T09:52:41.943Z" }, + { url = "https://files.pythonhosted.org/packages/54/c3/6a5c3b9f42a6b188430c83a7e70a76bc7c0db3354302fce7c8ed94a0c062/python_bidi-0.6.7-cp312-cp312-win32.whl", hash = "sha256:4c73cd980d45bb967799c7f0fc98ea93ae3d65b21ef2ba6abef6a057720bf483", size = 155820, upload-time = "2025-10-22T09:53:00.254Z" }, + { url = 
"https://files.pythonhosted.org/packages/45/c4/683216398ee3abf6b9bb0f26ae15c696fabbe36468ba26d5271f0c11b343/python_bidi-0.6.7-cp312-cp312-win_amd64.whl", hash = "sha256:d524a4ba765bae9b950706472a77a887a525ed21144fe4b41f6190f6e57caa2c", size = 159966, upload-time = "2025-10-22T09:52:52.547Z" }, + { url = "https://files.pythonhosted.org/packages/b8/4e/6135798d84b62eea70c0f9435301c2a4ba854e87be93a3fcd1d935266d24/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c9a679b24f5c6f366a0dec75745e1abeae2f597f033d0d54c74cbe62e7e6ae28", size = 276275, upload-time = "2025-10-22T09:52:05.078Z" }, + { url = "https://files.pythonhosted.org/packages/74/83/2123596d43e552af9e2806e361646fa579f34a1d1e9e2c1707a0ab6a02dd/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fe5971110013610f0db40505d0b204edc756e92eafac1372a464f8b9162b11", size = 266951, upload-time = "2025-10-22T09:51:56.216Z" }, + { url = "https://files.pythonhosted.org/packages/5c/8c/8d1e1501717227a6d52fc7b9c47a3de61486b024fbdd4821bfad724c0699/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17572944e6d8fb616d111fc702c759da2bf7cedab85a3e4fa2af0c9eb95ed438", size = 295745, upload-time = "2025-10-22T09:51:04.438Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ff/ef04e7f9067c2c5d862b9f8d9a192486c500c8aa295f0fb756c25ab47fc8/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3b63d19f3f56ff7f99bce5ca9ef8c811dbf0f509d8e84c1bc06105ed26a49528", size = 304123, upload-time = "2025-10-22T09:51:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/be/72/b973895e257a7d4cc8365ab094612f6ee885df863a4964d8865b9f534b67/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1350033431d75be749273236dcfc808e54404cd6ece6204cdb1bc4ccc163455", size = 442484, upload-time = "2025-10-22T09:51:21.575Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/1a/68ca9d10bc309828e8cdb2d57a30dd7e5753ac8520c8d7a0322daeb9eef7/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c5fb99f774748de283fadf915106f130b74be1bade934b7f73a7a8488b95da1", size = 329149, upload-time = "2025-10-22T09:51:31.232Z" }, + { url = "https://files.pythonhosted.org/packages/03/40/ab450c06167a7de596d99b1ba5cee2c605b3ff184baccf08210ede706b1b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d28e2bdcadf5b6161bb4ee9313ce41eac746ba57e744168bf723a415a11af05", size = 303529, upload-time = "2025-10-22T09:51:46.997Z" }, + { url = "https://files.pythonhosted.org/packages/ec/c5/585b5c413e3b77a32500fb877ea30aa23c45a6064dbd7fe77d87b72cd90b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3777ae3e088e94df854fbcbd8d59f9239b74aac036cb6bbd19f8035c8e42478", size = 317753, upload-time = "2025-10-22T09:51:39.272Z" }, + { url = "https://files.pythonhosted.org/packages/f9/05/b7b4b447890d614ccb40633f4d65f334bcf9fe3ad13be33aaa54dcbc34f3/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:77bb4cbadf4121db395189065c58c9dd5d1950257cc1983004e6df4a3e2f97ad", size = 476054, upload-time = "2025-10-22T09:52:15.856Z" }, + { url = "https://files.pythonhosted.org/packages/ca/94/64f6d2c09c4426918345b54ca8902f94b663eadd744c9dd89070f546c9bc/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:f1fe71c203f66bc169a393964d5702f9251cfd4d70279cb6453fdd42bd2e675f", size = 568365, upload-time = "2025-10-22T09:52:27.556Z" }, + { url = "https://files.pythonhosted.org/packages/fc/d2/c39a6b82aa0fcedac7cbe6078b78bb9089b43d903f8e00859e42b504bb8e/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:d87ed09e5c9b6d2648e8856a4e556147b9d3cd4d63905fa664dd6706bc414256", size = 495292, upload-time = "2025-10-22T09:52:38.306Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/8d/a80f37ab92118e305d7b574306553599f81534c50b4eb23ef34ebe09c09c/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:766d5f5a686eb99b53168a7bdfb338035931a609bdbbcb537cef9e050a86f359", size = 467159, upload-time = "2025-10-22T09:52:48.603Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-docx" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, +] + +[[package]] +name = "python-pptx" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "pillow" }, + { name = "typing-extensions" }, + { name = "xlsxwriter" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, +] + +[[package]] +name = "pytz" +version = "2026.1.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, +] + +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" }, + { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" }, + { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" }, + { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = 
"2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, +] + +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + +[[package]] +name = "regex" +version = "2026.2.28" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = 
"sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/db/8cbfd0ba3f302f2d09dd0019a9fcab74b63fee77a76c937d0e33161fb8c1/regex-2026.2.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e621fb7c8dc147419b28e1702f58a0177ff8308a76fa295c71f3e7827849f5d9", size = 488462, upload-time = "2026-02-28T02:16:22.616Z" }, + { url = "https://files.pythonhosted.org/packages/5d/10/ccc22c52802223f2368731964ddd117799e1390ffc39dbb31634a83022ee/regex-2026.2.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d5bef2031cbf38757a0b0bc4298bb4824b6332d28edc16b39247228fbdbad97", size = 290774, upload-time = "2026-02-28T02:16:23.993Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/6796b3bf3101e64117201aaa3a5a030ec677ecf34b3cd6141b5d5c6c67d5/regex-2026.2.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcb399ed84eabf4282587ba151f2732ad8168e66f1d3f85b1d038868fe547703", size = 288724, upload-time = "2026-02-28T02:16:25.403Z" }, + { url = "https://files.pythonhosted.org/packages/9c/02/291c0ae3f3a10cea941d0f5366da1843d8d1fa8a25b0671e20a0e454bb38/regex-2026.2.28-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1b34dfa72f826f535b20712afa9bb3ba580020e834f3c69866c5bddbf10098", size = 791924, upload-time = "2026-02-28T02:16:26.863Z" }, + { url = "https://files.pythonhosted.org/packages/0f/57/f0235cc520d9672742196c5c15098f8f703f2758d48d5a7465a56333e496/regex-2026.2.28-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:851fa70df44325e1e4cdb79c5e676e91a78147b1b543db2aec8734d2add30ec2", size = 860095, upload-time = "2026-02-28T02:16:28.772Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/7c/393c94cbedda79a0f5f2435ebd01644aba0b338d327eb24b4aa5b8d6c07f/regex-2026.2.28-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:516604edd17b1c2c3e579cf4e9b25a53bf8fa6e7cedddf1127804d3e0140ca64", size = 906583, upload-time = "2026-02-28T02:16:30.977Z" }, + { url = "https://files.pythonhosted.org/packages/2c/73/a72820f47ca5abf2b5d911d0407ba5178fc52cf9780191ed3a54f5f419a2/regex-2026.2.28-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7ce83654d1ab701cb619285a18a8e5a889c1216d746ddc710c914ca5fd71022", size = 800234, upload-time = "2026-02-28T02:16:32.55Z" }, + { url = "https://files.pythonhosted.org/packages/34/b3/6e6a4b7b31fa998c4cf159a12cbeaf356386fbd1a8be743b1e80a3da51e4/regex-2026.2.28-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2791948f7c70bb9335a9102df45e93d428f4b8128020d85920223925d73b9e1", size = 772803, upload-time = "2026-02-28T02:16:34.029Z" }, + { url = "https://files.pythonhosted.org/packages/10/e7/5da0280c765d5a92af5e1cd324b3fe8464303189cbaa449de9a71910e273/regex-2026.2.28-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a83cc26aa2acda6b8b9dfe748cf9e84cbd390c424a1de34fdcef58961a297a", size = 781117, upload-time = "2026-02-28T02:16:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/76/39/0b8d7efb256ae34e1b8157acc1afd8758048a1cf0196e1aec2e71fd99f4b/regex-2026.2.28-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ec6f5674c5dc836994f50f1186dd1fafde4be0666aae201ae2fcc3d29d8adf27", size = 854224, upload-time = "2026-02-28T02:16:38.119Z" }, + { url = "https://files.pythonhosted.org/packages/21/ff/a96d483ebe8fe6d1c67907729202313895d8de8495569ec319c6f29d0438/regex-2026.2.28-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:50c2fc924749543e0eacc93ada6aeeb3ea5f6715825624baa0dccaec771668ae", size = 761898, upload-time = "2026-02-28T02:16:40.333Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/bd/d4f2e75cb4a54b484e796017e37c0d09d8a0a837de43d17e238adf163f4e/regex-2026.2.28-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ba55c50f408fb5c346a3a02d2ce0ebc839784e24f7c9684fde328ff063c3cdea", size = 844832, upload-time = "2026-02-28T02:16:41.875Z" }, + { url = "https://files.pythonhosted.org/packages/8a/a7/428a135cf5e15e4e11d1e696eb2bf968362f8ea8a5f237122e96bc2ae950/regex-2026.2.28-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edb1b1b3a5576c56f08ac46f108c40333f222ebfd5cf63afdfa3aab0791ebe5b", size = 788347, upload-time = "2026-02-28T02:16:43.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/59/68691428851cf9c9c3707217ab1d9b47cfeec9d153a49919e6c368b9e926/regex-2026.2.28-cp311-cp311-win32.whl", hash = "sha256:948c12ef30ecedb128903c2c2678b339746eb7c689c5c21957c4a23950c96d15", size = 266033, upload-time = "2026-02-28T02:16:45.094Z" }, + { url = "https://files.pythonhosted.org/packages/42/8b/1483de1c57024e89296cbcceb9cccb3f625d416ddb46e570be185c9b05a9/regex-2026.2.28-cp311-cp311-win_amd64.whl", hash = "sha256:fd63453f10d29097cc3dc62d070746523973fb5aa1c66d25f8558bebd47fed61", size = 277978, upload-time = "2026-02-28T02:16:46.75Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/abec45dc6e7252e3dbc797120496e43bb5730a7abf0d9cb69340696a2f2d/regex-2026.2.28-cp311-cp311-win_arm64.whl", hash = "sha256:00f2b8d9615aa165fdff0a13f1a92049bfad555ee91e20d246a51aa0b556c60a", size = 270340, upload-time = "2026-02-28T02:16:48.626Z" }, + { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" }, + { url = "https://files.pythonhosted.org/packages/9e/06/3ef1ac6910dc3295ebd71b1f9bfa737e82cfead211a18b319d45f85ddd09/regex-2026.2.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b65d33a17101569f86d9c5966a8b1d7fbf8afdda5a8aa219301b0a80f58cf7d", size = 289200, upload-time = "2026-02-28T02:16:54.08Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c9/8cc8d850b35ab5650ff6756a1cb85286e2000b66c97520b29c1587455344/regex-2026.2.28-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e71dcecaa113eebcc96622c17692672c2d104b1d71ddf7adeda90da7ddeb26fc", size = 796765, upload-time = "2026-02-28T02:16:55.905Z" }, + { url = "https://files.pythonhosted.org/packages/e9/5d/57702597627fc23278ebf36fbb497ac91c0ce7fec89ac6c81e420ca3e38c/regex-2026.2.28-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:481df4623fa4969c8b11f3433ed7d5e3dc9cec0f008356c3212b3933fb77e3d8", size = 863093, upload-time = "2026-02-28T02:16:58.094Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/f3ecad537ca2811b4d26b54ca848cf70e04fcfc138667c146a9f3157779c/regex-2026.2.28-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64e7c6ad614573e0640f271e811a408d79a9e1fe62a46adb602f598df42a818d", size = 909455, upload-time = "2026-02-28T02:17:00.918Z" }, + { url = "https://files.pythonhosted.org/packages/9e/40/bb226f203caa22c1043c1ca79b36340156eca0f6a6742b46c3bb222a3a57/regex-2026.2.28-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6b08a06976ff4fb0d83077022fde3eca06c55432bb997d8c0495b9a4e9872f4", size = 
802037, upload-time = "2026-02-28T02:17:02.842Z" }, + { url = "https://files.pythonhosted.org/packages/44/7c/c6d91d8911ac6803b45ca968e8e500c46934e58c0903cbc6d760ee817a0a/regex-2026.2.28-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:864cdd1a2ef5716b0ab468af40139e62ede1b3a53386b375ec0786bb6783fc05", size = 775113, upload-time = "2026-02-28T02:17:04.506Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8d/4a9368d168d47abd4158580b8c848709667b1cd293ff0c0c277279543bd0/regex-2026.2.28-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:511f7419f7afab475fd4d639d4aedfc54205bcb0800066753ef68a59f0f330b5", size = 784194, upload-time = "2026-02-28T02:17:06.888Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bf/2c72ab5d8b7be462cb1651b5cc333da1d0068740342f350fcca3bca31947/regex-2026.2.28-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b42f7466e32bf15a961cf09f35fa6323cc72e64d3d2c990b10de1274a5da0a59", size = 856846, upload-time = "2026-02-28T02:17:09.11Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f4/6b65c979bb6d09f51bb2d2a7bc85de73c01ec73335d7ddd202dcb8cd1c8f/regex-2026.2.28-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8710d61737b0c0ce6836b1da7109f20d495e49b3809f30e27e9560be67a257bf", size = 763516, upload-time = "2026-02-28T02:17:11.004Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/29ea5e27400ee86d2cc2b4e80aa059df04eaf78b4f0c18576ae077aeff68/regex-2026.2.28-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4390c365fd2d45278f45afd4673cb90f7285f5701607e3ad4274df08e36140ae", size = 849278, upload-time = "2026-02-28T02:17:12.693Z" }, + { url = "https://files.pythonhosted.org/packages/1d/91/3233d03b5f865111cd517e1c95ee8b43e8b428d61fa73764a80c9bb6f537/regex-2026.2.28-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb3b1db8ff6c7b8bf838ab05583ea15230cb2f678e569ab0e3a24d1e8320940b", size = 790068, upload-time = "2026-02-28T02:17:14.9Z" }, + { url = 
"https://files.pythonhosted.org/packages/76/92/abc706c1fb03b4580a09645b206a3fc032f5a9f457bc1a8038ac555658ab/regex-2026.2.28-cp312-cp312-win32.whl", hash = "sha256:f8ed9a5d4612df9d4de15878f0bc6aa7a268afbe5af21a3fdd97fa19516e978c", size = 266416, upload-time = "2026-02-28T02:17:17.15Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/2a6f7dff190e5fa9df9fb4acf2fdf17a1aa0f7f54596cba8de608db56b3a/regex-2026.2.28-cp312-cp312-win_amd64.whl", hash = "sha256:01d65fd24206c8e1e97e2e31b286c59009636c022eb5d003f52760b0f42155d4", size = 277297, upload-time = "2026-02-28T02:17:18.723Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f0/58a2484851fadf284458fdbd728f580d55c1abac059ae9f048c63b92f427/regex-2026.2.28-cp312-cp312-win_arm64.whl", hash = "sha256:c0b5ccbb8ffb433939d248707d4a8b31993cb76ab1a0187ca886bf50e96df952", size = 270408, upload-time = "2026-02-28T02:17:20.328Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" }, + { url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" }, + { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" }, + { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" }, + { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" }, + { url = "https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" }, + { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" }, + { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" }, + { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" }, + { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" }, + { url = "https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" }, + { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, + { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, + { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" }, + { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" }, + { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" }, + { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" }, + { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" }, + { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" }, + { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" }, + { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = 
"2025-11-30T20:24:31.22Z" }, + { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" }, + { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, +] + +[[package]] +name = "rtree" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/09/7302695875a019514de9a5dd17b8320e7a19d6e7bc8f85dcfb79a4ce2da3/rtree-1.4.1.tar.gz", hash = "sha256:c6b1b3550881e57ebe530cc6cffefc87cd9bf49c30b37b894065a9f810875e46", size = 52425, upload-time = "2025-08-13T19:32:01.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/d9/108cd989a4c0954e60b3cdc86fd2826407702b5375f6dfdab2802e5fed98/rtree-1.4.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d672184298527522d4914d8ae53bf76982b86ca420b0acde9298a7a87d81d4a4", size = 468484, upload-time = "2025-08-13T19:31:50.593Z" }, + { url = "https://files.pythonhosted.org/packages/f3/cf/2710b6fd6b07ea0aef317b29f335790ba6adf06a28ac236078ed9bd8a91d/rtree-1.4.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d", size = 436325, upload-time = "2025-08-13T19:31:52.367Z" }, + 
{ url = "https://files.pythonhosted.org/packages/55/e1/4d075268a46e68db3cac51846eb6a3ab96ed481c585c5a1ad411b3c23aad/rtree-1.4.1-py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efa8c4496e31e9ad58ff6c7df89abceac7022d906cb64a3e18e4fceae6b77f65", size = 459789, upload-time = "2025-08-13T19:31:53.926Z" }, + { url = "https://files.pythonhosted.org/packages/d1/75/e5d44be90525cd28503e7f836d077ae6663ec0687a13ba7810b4114b3668/rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12de4578f1b3381a93a655846900be4e3d5f4cd5e306b8b00aa77c1121dc7e8c", size = 507644, upload-time = "2025-08-13T19:31:55.164Z" }, + { url = "https://files.pythonhosted.org/packages/fd/85/b8684f769a142163b52859a38a486493b05bafb4f2fb71d4f945de28ebf9/rtree-1.4.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b558edda52eca3e6d1ee629042192c65e6b7f2c150d6d6cd207ce82f85be3967", size = 1454478, upload-time = "2025-08-13T19:31:56.808Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a4/c2292b95246b9165cc43a0c3757e80995d58bc9b43da5cb47ad6e3535213/rtree-1.4.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f155bc8d6bac9dcd383481dee8c130947a4866db1d16cb6dff442329a038a0dc", size = 1555140, upload-time = "2025-08-13T19:31:58.031Z" }, + { url = "https://files.pythonhosted.org/packages/74/25/5282c8270bfcd620d3e73beb35b40ac4ab00f0a898d98ebeb41ef0989ec8/rtree-1.4.1-py3-none-win_amd64.whl", hash = "sha256:efe125f416fd27150197ab8521158662943a40f87acab8028a1aac4ad667a489", size = 389358, upload-time = "2025-08-13T19:31:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" }, +] + +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + +[package.optional-dependencies] +torch = [ + { name = "numpy" }, + { name = "packaging" }, + { name = "torch" }, +] + +[[package]] +name = "scikit-image" +version = "0.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "imageio" }, + { name = "lazy-loader" }, + { name = "networkx" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "scipy" }, + { name = "tifffile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/b4/2528bb43c67d48053a7a649a9666432dc307d66ba02e3a6d5c40f46655df/scikit_image-0.26.0.tar.gz", hash = "sha256:f5f970ab04efad85c24714321fcc91613fcb64ef2a892a13167df2f3e59199fa", size = 22729739, upload-time = "2025-12-20T17:12:21.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/16/8a407688b607f86f81f8c649bf0d68a2a6d67375f18c2d660aba20f5b648/scikit_image-0.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b1ede33a0fb3731457eaf53af6361e73dd510f449dac437ab54573b26788baf0", size = 12355510, upload-time = "2025-12-20T17:10:31.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/6b/f9/7efc088ececb6f6868fd4475e16cfafc11f242ce9ab5fc3557d78b5da0d4/scikit_image-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7af7aa331c6846bd03fa28b164c18d0c3fd419dbb888fb05e958ac4257a78fdd", size = 12056334, upload-time = "2025-12-20T17:10:34.559Z" }, + { url = "https://files.pythonhosted.org/packages/9f/1e/bc7fb91fb5ff65ef42346c8b7ee8b09b04eabf89235ab7dbfdfd96cbd1ea/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ea6207d9e9d21c3f464efe733121c0504e494dbdc7728649ff3e23c3c5a4953", size = 13297768, upload-time = "2025-12-20T17:10:37.733Z" }, + { url = "https://files.pythonhosted.org/packages/a5/2a/e71c1a7d90e70da67b88ccc609bd6ae54798d5847369b15d3a8052232f9d/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74aa5518ccea28121f57a95374581d3b979839adc25bb03f289b1bc9b99c58af", size = 13711217, upload-time = "2025-12-20T17:10:40.935Z" }, + { url = "https://files.pythonhosted.org/packages/d4/59/9637ee12c23726266b91296791465218973ce1ad3e4c56fc81e4d8e7d6e1/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d5c244656de905e195a904e36dbc18585e06ecf67d90f0482cbde63d7f9ad59d", size = 14337782, upload-time = "2025-12-20T17:10:43.452Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5c/a3e1e0860f9294663f540c117e4bf83d55e5b47c281d475cc06227e88411/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21a818ee6ca2f2131b9e04d8eb7637b5c18773ebe7b399ad23dcc5afaa226d2d", size = 14805997, upload-time = "2025-12-20T17:10:45.93Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c6/2eeacf173da041a9e388975f54e5c49df750757fcfc3ee293cdbbae1ea0a/scikit_image-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:9490360c8d3f9a7e85c8de87daf7c0c66507960cf4947bb9610d1751928721c7", size = 11878486, upload-time = "2025-12-20T17:10:48.246Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/a4/a852c4949b9058d585e762a66bf7e9a2cd3be4795cd940413dfbfbb0ce79/scikit_image-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:0baa0108d2d027f34d748e84e592b78acc23e965a5de0e4bb03cf371de5c0581", size = 11346518, upload-time = "2025-12-20T17:10:50.575Z" }, + { url = "https://files.pythonhosted.org/packages/99/e8/e13757982264b33a1621628f86b587e9a73a13f5256dad49b19ba7dc9083/scikit_image-0.26.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d454b93a6fa770ac5ae2d33570f8e7a321bb80d29511ce4b6b78058ebe176e8c", size = 12376452, upload-time = "2025-12-20T17:10:52.796Z" }, + { url = "https://files.pythonhosted.org/packages/e3/be/f8dd17d0510f9911f9f17ba301f7455328bf13dae416560126d428de9568/scikit_image-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3409e89d66eff5734cd2b672d1c48d2759360057e714e1d92a11df82c87cba37", size = 12061567, upload-time = "2025-12-20T17:10:55.207Z" }, + { url = "https://files.pythonhosted.org/packages/b3/2b/c70120a6880579fb42b91567ad79feb4772f7be72e8d52fec403a3dde0c6/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c717490cec9e276afb0438dd165b7c3072d6c416709cc0f9f5a4c1070d23a44", size = 13084214, upload-time = "2025-12-20T17:10:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/f4/a2/70401a107d6d7466d64b466927e6b96fcefa99d57494b972608e2f8be50f/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df650e79031634ac90b11e64a9eedaf5a5e06fcd09bcd03a34be01745744466", size = 13561683, upload-time = "2025-12-20T17:10:59.49Z" }, + { url = "https://files.pythonhosted.org/packages/13/a5/48bdfd92794c5002d664e0910a349d0a1504671ef5ad358150f21643c79a/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cefd85033e66d4ea35b525bb0937d7f42d4cdcfed2d1888e1570d5ce450d3932", size = 14112147, upload-time = "2025-12-20T17:11:02.083Z" }, + { url = 
"https://files.pythonhosted.org/packages/ee/b5/ac71694da92f5def5953ca99f18a10fe98eac2dd0a34079389b70b4d0394/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3f5bf622d7c0435884e1e141ebbe4b2804e16b2dd23ae4c6183e2ea99233be70", size = 14661625, upload-time = "2025-12-20T17:11:04.528Z" }, + { url = "https://files.pythonhosted.org/packages/23/4d/a3cc1e96f080e253dad2251bfae7587cf2b7912bcd76fd43fd366ff35a87/scikit_image-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:abed017474593cd3056ae0fe948d07d0747b27a085e92df5474f4955dd65aec0", size = 11911059, upload-time = "2025-12-20T17:11:06.61Z" }, + { url = "https://files.pythonhosted.org/packages/35/8a/d1b8055f584acc937478abf4550d122936f420352422a1a625eef2c605d8/scikit_image-0.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:4d57e39ef67a95d26860c8caf9b14b8fb130f83b34c6656a77f191fa6d1d04d8", size = 11348740, upload-time = "2025-12-20T17:11:09.118Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312, upload-time = "2025-01-10T08:07:55.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/2a/e291c29670795406a824567d1dfc91db7b699799a002fdaa452bceea8f6e/scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33", size = 12102620, upload-time = "2025-01-10T08:06:16.675Z" }, + { url = "https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = 
"sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d", size = 11116234, upload-time = "2025-01-10T08:06:21.83Z" }, + { url = "https://files.pythonhosted.org/packages/30/cd/ed4399485ef364bb25f388ab438e3724e60dc218c547a407b6e90ccccaef/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2", size = 12592155, upload-time = "2025-01-10T08:06:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/62fc9a5a659bb58a03cdd7e258956a5824bdc9b4bb3c5d932f55880be569/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8", size = 13497069, upload-time = "2025-01-10T08:06:32.515Z" }, + { url = "https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415", size = 11139809, upload-time = "2025-01-10T08:06:35.514Z" }, + { url = "https://files.pythonhosted.org/packages/0a/18/c797c9b8c10380d05616db3bfb48e2a3358c767affd0857d56c2eb501caa/scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b", size = 12104516, upload-time = "2025-01-10T08:06:40.009Z" }, + { url = "https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2", size = 11167837, upload-time = "2025-01-10T08:06:43.305Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f6/ff7beaeb644bcad72bcfd5a03ff36d32ee4e53a8b29a639f11bcb65d06cd/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f", size = 12253728, upload-time = "2025-01-10T08:06:47.618Z" }, + { url = "https://files.pythonhosted.org/packages/29/7a/8bce8968883e9465de20be15542f4c7e221952441727c4dad24d534c6d99/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86", size = 13147700, upload-time = "2025-01-10T08:06:50.888Z" }, + { url = "https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52", size = 11110613, upload-time = "2025-01-10T08:06:54.115Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" }, + { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" }, + { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" }, + { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" }, + { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" }, + { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" }, + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" }, + { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017, upload-time = "2026-02-23T00:18:21.502Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" }, + { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, +] + +[[package]] +name = "semchunk" +version = "2.2.2" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "mpire", extra = ["dill"] }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/62/96/c418c322730b385e81d4ab462e68dd48bb2dbda4d8efa17cad2ca468d9ac/semchunk-2.2.2.tar.gz", hash = "sha256:940e89896e64eeb01de97ba60f51c8c7b96c6a3951dfcf574f25ce2146752f52", size = 12271, upload-time = "2024-12-17T22:54:30.332Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/84/94ca7896c7df20032bcb09973e9a4d14c222507c0aadf22e89fa76bb0a04/semchunk-2.2.2-py3-none-any.whl", hash = "sha256:94ca19020c013c073abdfd06d79a7c13637b91738335f3b8cdb5655ee7cc94d2", size = 10271, upload-time = "2024-12-17T22:54:27.689Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, +] + +[[package]] +name = "shapely" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/bc/0989043118a27cccb4e906a46b7565ce36ca7b57f5a18b78f4f1b0f72d9d/shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9", size = 315489, upload-time = "2025-09-24T13:51:41.432Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8f/8d/1ff672dea9ec6a7b5d422eb6d095ed886e2e523733329f75fdcb14ee1149/shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618", size = 1820038, upload-time = "2025-09-24T13:50:15.628Z" }, + { url = "https://files.pythonhosted.org/packages/4f/ce/28fab8c772ce5db23a0d86bf0adaee0c4c79d5ad1db766055fa3dab442e2/shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d", size = 1626039, upload-time = "2025-09-24T13:50:16.881Z" }, + { url = "https://files.pythonhosted.org/packages/70/8b/868b7e3f4982f5006e9395c1e12343c66a8155c0374fdc07c0e6a1ab547d/shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09", size = 3001519, upload-time = "2025-09-24T13:50:18.606Z" }, + { url = "https://files.pythonhosted.org/packages/13/02/58b0b8d9c17c93ab6340edd8b7308c0c5a5b81f94ce65705819b7416dba5/shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26", size = 3110842, upload-time = "2025-09-24T13:50:21.77Z" }, + { url = "https://files.pythonhosted.org/packages/af/61/8e389c97994d5f331dcffb25e2fa761aeedfb52b3ad9bcdd7b8671f4810a/shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7", size = 4021316, upload-time = "2025-09-24T13:50:23.626Z" }, + { url = "https://files.pythonhosted.org/packages/d3/d4/9b2a9fe6039f9e42ccf2cb3e84f219fd8364b0c3b8e7bbc857b5fbe9c14c/shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2", size = 4178586, upload-time = "2025-09-24T13:50:25.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/f6/9840f6963ed4decf76b08fd6d7fed14f8779fb7a62cb45c5617fa8ac6eab/shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6", size = 1543961, upload-time = "2025-09-24T13:50:26.968Z" }, + { url = "https://files.pythonhosted.org/packages/38/1e/3f8ea46353c2a33c1669eb7327f9665103aa3a8dfe7f2e4ef714c210b2c2/shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc", size = 1722856, upload-time = "2025-09-24T13:50:28.497Z" }, + { url = "https://files.pythonhosted.org/packages/24/c0/f3b6453cf2dfa99adc0ba6675f9aaff9e526d2224cbd7ff9c1a879238693/shapely-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fe2533caae6a91a543dec62e8360fe86ffcdc42a7c55f9dfd0128a977a896b94", size = 1833550, upload-time = "2025-09-24T13:50:30.019Z" }, + { url = "https://files.pythonhosted.org/packages/86/07/59dee0bc4b913b7ab59ab1086225baca5b8f19865e6101db9ebb7243e132/shapely-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba4d1333cc0bc94381d6d4308d2e4e008e0bd128bdcff5573199742ee3634359", size = 1643556, upload-time = "2025-09-24T13:50:32.291Z" }, + { url = "https://files.pythonhosted.org/packages/26/29/a5397e75b435b9895cd53e165083faed5d12fd9626eadec15a83a2411f0f/shapely-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bd308103340030feef6c111d3eb98d50dc13feea33affc8a6f9fa549e9458a3", size = 2988308, upload-time = "2025-09-24T13:50:33.862Z" }, + { url = "https://files.pythonhosted.org/packages/b9/37/e781683abac55dde9771e086b790e554811a71ed0b2b8a1e789b7430dd44/shapely-2.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1e7d4d7ad262a48bb44277ca12c7c78cb1b0f56b32c10734ec9a1d30c0b0c54b", size = 3099844, upload-time = "2025-09-24T13:50:35.459Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/f3/9876b64d4a5a321b9dc482c92bb6f061f2fa42131cba643c699f39317cb9/shapely-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e9eddfe513096a71896441a7c37db72da0687b34752c4e193577a145c71736fc", size = 3988842, upload-time = "2025-09-24T13:50:37.478Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a0/704c7292f7014c7e74ec84eddb7b109e1fbae74a16deae9c1504b1d15565/shapely-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:980c777c612514c0cf99bc8a9de6d286f5e186dcaf9091252fcd444e5638193d", size = 4152714, upload-time = "2025-09-24T13:50:39.9Z" }, + { url = "https://files.pythonhosted.org/packages/53/46/319c9dc788884ad0785242543cdffac0e6530e4d0deb6c4862bc4143dcf3/shapely-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9111274b88e4d7b54a95218e243282709b330ef52b7b86bc6aaf4f805306f454", size = 1542745, upload-time = "2025-09-24T13:50:41.414Z" }, + { url = "https://files.pythonhosted.org/packages/ec/bf/cb6c1c505cb31e818e900b9312d514f381fbfa5c4363edfce0fcc4f8c1a4/shapely-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:743044b4cfb34f9a67205cee9279feaf60ba7d02e69febc2afc609047cb49179", size = 1722861, upload-time = "2025-09-24T13:50:43.35Z" }, +] + +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, +] + +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", 
size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, +] + +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tifffile" +version = "2026.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c5/cb/2f6d79c7576e22c116352a801f4c3c8ace5957e9aced862012430b62e14f/tifffile-2026.3.3.tar.gz", hash = "sha256:d9a1266bed6f2ee1dd0abde2018a38b4f8b2935cb843df381d70ac4eac5458b7", size = 388745, upload-time = "2026-03-03T19:14:38.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.20.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/25/b1681c1c30ea3ea6e584ae3fffd552430b12faa599b558c4c4783f56d7ff/tokenizers-0.20.3.tar.gz", hash = "sha256:2278b34c5d0dd78e087e1ca7f9b1dcbf129d80211afa645f214bd6e051037539", size = 340513, upload-time = "2024-11-05T17:34:10.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/93/6742ef9206409d5ce1fdf44d5ca1687cdc3847ba0485424e2c731e6bcf67/tokenizers-0.20.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:585b51e06ca1f4839ce7759941e66766d7b060dccfdc57c4ca1e5b9a33013a90", size = 2674224, upload-time = "2024-11-05T17:30:49.972Z" }, + { url = "https://files.pythonhosted.org/packages/aa/14/e75ece72e99f6ef9ae07777ca9fdd78608f69466a5cecf636e9bd2f25d5c/tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl", hash 
= "sha256:61cbf11954f3b481d08723ebd048ba4b11e582986f9be74d2c3bdd9293a4538d", size = 2558991, upload-time = "2024-11-05T17:30:51.666Z" }, + { url = "https://files.pythonhosted.org/packages/46/54/033b5b2ba0c3ae01e026c6f7ced147d41a2fa1c573d00a66cb97f6d7f9b3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef820880d5e4e8484e2fa54ff8d297bb32519eaa7815694dc835ace9130a3eea", size = 2892476, upload-time = "2024-11-05T17:30:53.505Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/cc369fb3297d61f3311cab523d16d48c869dc2f0ba32985dbf03ff811041/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:67ef4dcb8841a4988cd00dd288fb95dfc8e22ed021f01f37348fd51c2b055ba9", size = 2802775, upload-time = "2024-11-05T17:30:55.229Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/62ad983e8ea6a63e04ed9c5be0b605056bf8aac2f0125f9b5e0b3e2b89fa/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb", size = 3086138, upload-time = "2024-11-05T17:30:57.332Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ac/4637ba619db25094998523f9e6f5b456e1db1f8faa770a3d925d436db0c3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:444d188186eab3148baf0615b522461b41b1f0cd58cd57b862ec94b6ac9780f1", size = 3098076, upload-time = "2024-11-05T17:30:59.455Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/9793f2dc2ce529369807c9c74e42722b05034af411d60f5730b720388c7d/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37c04c032c1442740b2c2d925f1857885c07619224a533123ac7ea71ca5713da", size = 3379650, upload-time = "2024-11-05T17:31:01.264Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/f6/2841de926bc4118af996eaf0bdf0ea5b012245044766ffc0347e6c968e63/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453c7769d22231960ee0e883d1005c93c68015025a5e4ae56275406d94a3c907", size = 2994005, upload-time = "2024-11-05T17:31:02.985Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b2/00915c4fed08e9505d37cf6eaab45b12b4bff8f6719d459abcb9ead86a4b/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4bb31f7b2847e439766aaa9cc7bccf7ac7088052deccdb2275c952d96f691c6a", size = 8977488, upload-time = "2024-11-05T17:31:04.424Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ac/1c069e7808181ff57bcf2d39e9b6fbee9133a55410e6ebdaa89f67c32e83/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:843729bf0f991b29655a069a2ff58a4c24375a553c70955e15e37a90dd4e045c", size = 9294935, upload-time = "2024-11-05T17:31:06.882Z" }, + { url = "https://files.pythonhosted.org/packages/50/47/722feb70ee68d1c4412b12d0ea4acc2713179fd63f054913990f9e259492/tokenizers-0.20.3-cp311-none-win32.whl", hash = "sha256:efcce3a927b1e20ca694ba13f7a68c59b0bd859ef71e441db68ee42cf20c2442", size = 2197175, upload-time = "2024-11-05T17:31:09.385Z" }, + { url = "https://files.pythonhosted.org/packages/75/68/1b4f928b15a36ed278332ac75d66d7eb65d865bf344d049c452c18447bf9/tokenizers-0.20.3-cp311-none-win_amd64.whl", hash = "sha256:88301aa0801f225725b6df5dea3d77c80365ff2362ca7e252583f2b4809c4cc0", size = 2381616, upload-time = "2024-11-05T17:31:10.685Z" }, + { url = "https://files.pythonhosted.org/packages/07/00/92a08af2a6b0c88c50f1ab47d7189e695722ad9714b0ee78ea5e1e2e1def/tokenizers-0.20.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:49d12a32e190fad0e79e5bdb788d05da2f20d8e006b13a70859ac47fecf6ab2f", size = 2667951, upload-time = "2024-11-05T17:31:12.356Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9a/e17a352f0bffbf415cf7d73756f5c73a3219225fc5957bc2f39d52c61684/tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:282848cacfb9c06d5e51489f38ec5aa0b3cd1e247a023061945f71f41d949d73", size = 2555167, upload-time = "2024-11-05T17:31:13.839Z" }, + { url = "https://files.pythonhosted.org/packages/27/37/d108df55daf4f0fcf1f58554692ff71687c273d870a34693066f0847be96/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe4e08c7d0cd6154c795deb5bf81d2122f36daf075e0c12a8b050d824ef0a64", size = 2898389, upload-time = "2024-11-05T17:31:15.12Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/32f29da16d28f59472fa7fb38e7782069748c7e9ab9854522db20341624c/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca94fc1b73b3883c98f0c88c77700b13d55b49f1071dfd57df2b06f3ff7afd64", size = 2795866, upload-time = "2024-11-05T17:31:16.857Z" }, + { url = "https://files.pythonhosted.org/packages/29/4e/8a9a3c89e128c4a40f247b501c10279d2d7ade685953407c4d94c8c0f7a7/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef279c7e239f95c8bdd6ff319d9870f30f0d24915b04895f55b1adcf96d6c60d", size = 3085446, upload-time = "2024-11-05T17:31:18.392Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3b/a2a7962c496ebcd95860ca99e423254f760f382cd4bd376f8895783afaf5/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16384073973f6ccbde9852157a4fdfe632bb65208139c9d0c0bd0176a71fd67f", size = 3094378, upload-time = "2024-11-05T17:31:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f4/a8a33f0192a1629a3bd0afcad17d4d221bbf9276da4b95d226364208d5eb/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:312d522caeb8a1a42ebdec87118d99b22667782b67898a76c963c058a7e41d4f", size = 3385755, upload-time = "2024-11-05T17:31:21.778Z" }, + { 
url = "https://files.pythonhosted.org/packages/9e/65/c83cb3545a65a9eaa2e13b22c93d5e00bd7624b354a44adbdc93d5d9bd91/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b7cb962564785a83dafbba0144ecb7f579f1d57d8c406cdaa7f32fe32f18ad", size = 2997679, upload-time = "2024-11-05T17:31:23.134Z" }, + { url = "https://files.pythonhosted.org/packages/55/e9/a80d4e592307688a67c7c59ab77e03687b6a8bd92eb5db763a2c80f93f57/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:124c5882ebb88dadae1fc788a582299fcd3a8bd84fc3e260b9918cf28b8751f5", size = 8989296, upload-time = "2024-11-05T17:31:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/60c957af8d2244321124e893828f1a4817cde1a2d08d09d423b73f19bd2f/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2b6e54e71f84c4202111a489879005cb14b92616a87417f6c102c833af961ea2", size = 9303621, upload-time = "2024-11-05T17:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/be/a9/96172310ee141009646d63a1ca267c099c462d747fe5ef7e33f74e27a683/tokenizers-0.20.3-cp312-none-win32.whl", hash = "sha256:83d9bfbe9af86f2d9df4833c22e94d94750f1d0cd9bfb22a7bb90a86f61cdb1c", size = 2188979, upload-time = "2024-11-05T17:31:29.483Z" }, + { url = "https://files.pythonhosted.org/packages/bd/68/61d85ae7ae96dde7d0974ff3538db75d5cdc29be2e4329cd7fc51a283e22/tokenizers-0.20.3-cp312-none-win_amd64.whl", hash = "sha256:44def74cee574d609a36e17c8914311d1b5dbcfe37c55fd29369d42591b91cf2", size = 2380725, upload-time = "2024-11-05T17:31:31.315Z" }, +] + +[[package]] +name = "toolz" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/d6/114b492226588d6ff54579d95847662fc69196bdeec318eb45393b24c192/toolz-1.1.0.tar.gz", hash = "sha256:27a5c770d068c110d9ed9323f24f1543e83b2f300a687b7891c1a6d56b697b5b", size = 52613, upload-time = "2025-10-17T04:03:21.661Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fb/12/5911ae3eeec47800503a238d971e51722ccea5feb8569b735184d5fcdbc0/toolz-1.1.0-py3-none-any.whl", hash = "sha256:15ccc861ac51c53696de0a5d6d4607f99c210739caf987b5d2054f3efed429d8", size = 58093, upload-time = "2025-10-17T04:03:20.435Z" }, +] + +[[package]] +name = "torch" +version = "2.6.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = 
"https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:3e73419aab6dbcd888a3cc6a00d1f52f5950d918d7289ea6aeae751346613edc" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:6ab0417ce9b78ab0a34721a99734b5fd4cc3d7b62ff1c068a7d636fd829772db" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:9f7d170d6c78726945d95fcc3a3d7601f36aed0e6e0dc9ca377a64d6a8fd7b3a" }, + { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:6c040e4181c5dae73b965b61394ec431c93b2018165e2be8f15fc68d44444cb3" }, +] + +[[package]] +name = "torchaudio" +version = "2.6.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "torch" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, + { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, +] + +[[package]] +name = "torchvision" +version = "0.21.0+cu118" +source = { registry = "https://download.pytorch.org/whl/cu118" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, + { name = "torch" }, +] +wheels = [ + { url = 
"https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, + { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] +name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944, upload-time = "2024-11-18T22:13:01.012Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, +] + +[[package]] +name = "tree-sitter" +version = "0.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, + { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, + { url = "https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, 
+ { url = "https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, + { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, + { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, + { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, + { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, + { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, + { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size 
= 228014, upload-time = "2025-05-24T17:32:58.384Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, + { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, + { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, 
upload-time = "2025-05-24T17:32:56.084Z" }, + { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, + { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, + { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, + { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = "2025-09-01T07:13:40.866Z" }, + { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, + { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time = "2025-09-01T07:13:42.828Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, +] + +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = 
"sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, + { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, + { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, + { url = "https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, + { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/fc/bb52958f7e399250aee093751e9373a6311cadbe76b6e0d109b853757f35/tree_sitter_typescript-0.23.2.tar.gz", hash = "sha256:7b167b5827c882261cb7a50dfa0fb567975f9b315e87ed87ad0a0a3aedb3834d", size = 773053, upload-time = "2024-11-11T02:36:11.396Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/95/4c00680866280e008e81dd621fd4d3f54aa3dad1b76b857a19da1b2cc426/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3cd752d70d8e5371fdac6a9a4df9d8924b63b6998d268586f7d374c9fba2a478", size = 286677, upload-time = "2024-11-11T02:35:58.839Z" }, + { url = "https://files.pythonhosted.org/packages/8f/2f/1f36fda564518d84593f2740d5905ac127d590baf5c5753cef2a88a89c15/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c7cc1b0ff5d91bac863b0e38b1578d5505e718156c9db577c8baea2557f66de8", size = 302008, upload-time = 
"2024-11-11T02:36:00.733Z" }, + { url = "https://files.pythonhosted.org/packages/96/2d/975c2dad292aa9994f982eb0b69cc6fda0223e4b6c4ea714550477d8ec3a/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1eed5b0b3a8134e86126b00b743d667ec27c63fc9de1b7bb23168803879e31", size = 351987, upload-time = "2024-11-11T02:36:02.669Z" }, + { url = "https://files.pythonhosted.org/packages/49/d1/a71c36da6e2b8a4ed5e2970819b86ef13ba77ac40d9e333cb17df6a2c5db/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e96d36b85bcacdeb8ff5c2618d75593ef12ebaf1b4eace3477e2bdb2abb1752c", size = 344960, upload-time = "2024-11-11T02:36:04.443Z" }, + { url = "https://files.pythonhosted.org/packages/7f/cb/f57b149d7beed1a85b8266d0c60ebe4c46e79c9ba56bc17b898e17daf88e/tree_sitter_typescript-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8d4f0f9bcb61ad7b7509d49a1565ff2cc363863644a234e1e0fe10960e55aea0", size = 340245, upload-time = "2024-11-11T02:36:06.473Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ab/dd84f0e2337296a5f09749f7b5483215d75c8fa9e33738522e5ed81f7254/tree_sitter_typescript-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:3f730b66396bc3e11811e4465c41ee45d9e9edd6de355a58bbbc49fa770da8f9", size = 278015, upload-time = "2024-11-11T02:36:07.631Z" }, + { url = "https://files.pythonhosted.org/packages/9f/e4/81f9a935789233cf412a0ed5fe04c883841d2c8fb0b7e075958a35c65032/tree_sitter_typescript-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:05db58f70b95ef0ea126db5560f3775692f609589ed6f8dd0af84b7f19f1cbb7", size = 274052, upload-time = "2024-11-11T02:36:09.514Z" }, +] + +[[package]] +name = "triton" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636, upload-time = "2025-01-22T19:12:51.322Z" }, + { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, +] + +[[package]] +name = "typer" +version = "0.16.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] + +[[package]] +name = "wrapt" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/81/60c4471fce95afa5922ca09b88a25f03c93343f759aae0f31fb4412a85c7/wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb", size = 60666, upload-time = "2026-03-06T02:52:58.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/6b/be/80e80e39e7cb90b006a0eaf11c73ac3a62bbfb3068469aec15cc0bc795de/wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d", size = 61601, upload-time = "2026-03-06T02:53:00.487Z" }, + { url = "https://files.pythonhosted.org/packages/b0/be/d7c88cd9293c859fc74b232abdc65a229bb953997995d6912fc85af18323/wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894", size = 114057, upload-time = "2026-03-06T02:52:44.08Z" }, + { url = "https://files.pythonhosted.org/packages/ea/25/36c04602831a4d685d45a93b3abea61eca7fe35dab6c842d6f5d570ef94a/wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842", size = 116099, upload-time = "2026-03-06T02:54:56.74Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4e/98a6eb417ef551dc277bec1253d5246b25003cf36fdf3913b65cb7657a56/wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8", size = 112457, upload-time = "2026-03-06T02:53:52.842Z" }, + { url = "https://files.pythonhosted.org/packages/cb/a6/a6f7186a5297cad8ec53fd7578533b28f795fdf5372368c74bd7e6e9841c/wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6", size = 115351, upload-time = "2026-03-06T02:53:32.684Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/06e66189e721dbebd5cf20e138acc4d1150288ce118462f2fcbff92d38db/wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9", size = 111748, upload-time = "2026-03-06T02:53:08.455Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/43/4808b86f499a51370fbdbdfa6cb91e9b9169e762716456471b619fca7a70/wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15", size = 113783, upload-time = "2026-03-06T02:53:02.02Z" }, + { url = "https://files.pythonhosted.org/packages/91/2c/a3f28b8fa7ac2cefa01cfcaca3471f9b0460608d012b693998cd61ef43df/wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b", size = 57977, upload-time = "2026-03-06T02:53:27.844Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c3/2b1c7bd07a27b1db885a2fab469b707bdd35bddf30a113b4917a7e2139d2/wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1", size = 60336, upload-time = "2026-03-06T02:54:28.104Z" }, + { url = "https://files.pythonhosted.org/packages/ec/5c/76ece7b401b088daa6503d6264dd80f9a727df3e6042802de9a223084ea2/wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a", size = 58756, upload-time = "2026-03-06T02:53:16.319Z" }, + { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255, upload-time = "2026-03-06T02:52:45.663Z" }, + { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848, upload-time = "2026-03-06T02:53:48.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433, upload-time = "2026-03-06T02:54:40.328Z" }, + { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013, upload-time = "2026-03-06T02:53:26.58Z" }, + { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326, upload-time = "2026-03-06T02:53:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444, upload-time = "2026-03-06T02:54:09.5Z" }, + { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237, upload-time = "2026-03-06T02:54:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563, upload-time = "2026-03-06T02:53:20.412Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198, upload-time = "2026-03-06T02:53:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441, upload-time = "2026-03-06T02:52:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836, upload-time = "2026-03-06T02:53:22.053Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, +] + +[[package]] +name = "xlsxwriter" +version = "3.2.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, +] + +[[package]] +name = "yarl" +version = "1.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" }, + { url = "https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" }, + { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" }, + { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = 
"2026-03-01T22:04:48.925Z" }, + { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = "2026-03-01T22:04:52.499Z" }, + { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" }, + { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" }, + { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 
93859, upload-time = "2026-03-01T22:05:00.268Z" }, + { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" }, + { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" }, + { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" }, + { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, + { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, + { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, + { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, + { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = 
"2026-03-01T22:05:28.738Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, + { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, + { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, + { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, + { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" }, + { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" }, + { url = "https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" }, + { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = 
"2025-09-14T22:16:37.141Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" }, + { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" }, + { url = "https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" }, + { url = "https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" }, + { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" }, + { url = 
"https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, 
upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, +] diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index b85460e..c7a5b58 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -1,67 +1,57 @@ # GlossAPI Dependency Profiles & Test Notes ## Environment Profiles -- **Vanilla** – core GlossAPI pipeline without GPU OCR add-ons. Uses `requirements-glossapi-vanilla.txt`. -- **RapidOCR** – Docling + RapidOCR GPU stack. Builds on vanilla requirements and adds ONNX runtime (`requirements-glossapi-rapidocr.txt`). -- **DeepSeek** – GPU OCR via DeepSeek/vLLM. Extends vanilla requirements with torch/cu128, nightly vLLM and supporting CUDA libs (`requirements-glossapi-deepseek.txt`). `xformers` was dropped because the published wheels still pin Torch 2.8; the rest of the stack now installs cleanly on Torch 2.9. +- **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. 
Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml`. -Each profile is installed through `dependency_setup/setup_glossapi.sh`: +Recommended installation commands: ```bash -# Examples (venv path optional) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests -./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests ``` Key flags: -- `--download-deepseek` optionally fetches DeepSeek weights (skipped by default; set `--weights-dir` if they live elsewhere). +- `--download-model` optionally fetches DeepSeek weights (set `--model-root` if they live elsewhere). - `--smoke-test` (DeepSeek only) runs `dependency_setup/deepseek_gpu_smoke.py`. ## Test Segmentation Pytest markers were added so suites can be run per profile: -- `rapidocr` – GPU Docling/RapidOCR integration tests. - `deepseek` – DeepSeek execution paths. -- Unmarked tests cover the vanilla footprint. +- Unmarked tests cover the Docling/core footprint. 
-`setup_glossapi.sh` now chooses marker expressions automatically: +Suggested commands: -| Mode | Command run by script | -|-----------|---------------------------------------------------------| -| vanilla | `pytest -q -m "not rapidocr and not deepseek" tests` | -| rapidocr | `pytest -q -m "not deepseek" tests` | -| deepseek | `pytest -q -m "not rapidocr" tests` | +| Profile | Command | +|-----------|---------| +| Docling | `pytest -q -m "not deepseek" tests` | +| DeepSeek | `pytest -q -m "deepseek" tests` | -Heavy GPU tests in `tests/test_pipeline_smoke.py` were guarded with `pytest.importorskip("onnxruntime")` so vanilla installs skip them cleanly. Helper PDFs now embed DejaVuSans with Unicode support and insert spacing to keep OCR-friendly glyphs. +## Validation Runs (2026-03-08) +- `./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --smoke-test` -## Validation Runs (2025-10-30) -- `./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests` - -All three completed successfully after the following adjustments: -1. **Rust extensions** – switched to `pip install -e rust/glossapi_rs_{cleaner,noise}` because `maturin develop` left the wheel unregistered. -2. **Parquet locking** – `_parquet_lock` now creates parent directories before attempting the file lock (fixes `FileNotFoundError` in concurrent metadata tests). -3. 
**RapidOCR pipeline** – fixed `GlossExtract.create_extractor()` to build the Docling converter regardless of import path and added UTF-8 PDF generation improvements; smoke tests now pass on CUDA. -4. **DeepSeek stack** – updated nightly vLLM pin (`0.11.1rc5.dev58+g60f76baa6.cu129`) and removed `xformers` to resolve Torch 2.9 dependency conflicts. +These completed successfully after the following adjustments: +1. **Rust extensions** – use editable installs for `rust/glossapi_rs_{cleaner,noise}` so local changes are picked up immediately. +2. **DeepSeek stack** – moved to a uv-managed runtime pinned to the `transformers`-based OCR-2 path. +3. **Attention fallback** – the DeepSeek runner falls back to `eager` attention if `flash-attn` is unavailable. ## Known Follow-ups -- **DeepSeek weights** – installer warns if weights are absent. Set `--download-deepseek` or populate `${DEEPSEEK_ROOT}/DeepSeek-OCR` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). -- **xformers kernels** – removed pending compatible Torch 2.9 wheels. Reintroduce once upstream publishes matching builds. +- **DeepSeek weights** – installer warns if weights are absent. Set `--download-model` or populate `${MODEL_ROOT}/DeepSeek-OCR-2` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). +- **flash-attn** – optional. Reintroduce into the pinned flow once wheel availability is stable across target hosts. - **Patchelf warnings** – maturin emits rpath hints if `patchelf` is missing; they are benign but install `patchelf` if cleaner logs are desired. -- **Deprecation noise** – Docling emits future warnings (Pydantic) and RapidOCR font deprecation notices; currently harmless but worth tracking for future upgrades. +- **Deprecation noise** – Docling and Transformers emit some warnings on current pins; currently harmless but worth tracking for future upgrades. 
## Quick Reference -- Activate an environment: `source dependency_setup/.venvs/<mode>/bin/activate` +- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` - Re-run tests manually: - - Vanilla: `pytest -m "not rapidocr and not deepseek" tests` - - RapidOCR: `pytest -m "not deepseek" tests` - - DeepSeek: `pytest -m "not rapidocr" tests` + - Docling: `pytest -m "not deepseek" tests` + - DeepSeek: `pytest -m "deepseek" tests` - DeepSeek runtime exports: ```bash export GLOSSAPI_DEEPSEEK_PYTHON="dependency_setup/.venvs/deepseek/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="/mnt/data/glossAPI/deepseek-ocr/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="/mnt/data/glossAPI/deepseek-ocr/libjpeg-turbo/lib" - export LD_LIBRARY_PATH="$GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH:${LD_LIBRARY_PATH:-}" + export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT="/mnt/data/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py" + export GLOSSAPI_DEEPSEEK_MODEL_DIR="/mnt/data/glossAPI/deepseek-ocr-2-model/DeepSeek-OCR-2" ``` These notes capture the current dependency state, the rationale behind constraint changes, and the validation steps used to exercise each profile. 
diff --git a/dependency_setup/requirements-glossapi-deepseek.txt b/dependency_setup/requirements-glossapi-deepseek.txt index 5cc685a..8185d9c 100644 --- a/dependency_setup/requirements-glossapi-deepseek.txt +++ b/dependency_setup/requirements-glossapi-deepseek.txt @@ -1,16 +1,13 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 ---extra-index-url https://wheels.vllm.ai/nightly --r requirements-glossapi-vanilla.txt -# CUDA Torch stack aligned with NVIDIA L4 (CUDA 12.8 wheels) -torch==2.9.0+cu128 -torchvision==0.24.0+cu128 -torchaudio==2.9.0+cu128 -# DeepSeek via nightly vLLM -vllm==0.11.1rc5.dev58+g60f76baa6.cu129 -flashinfer-python==0.4.1 -compressed-tensors==0.12.2 -depyf==0.20.0 -# Auxiliary CUDA libs -nvidia-nvshmem-cu12==3.3.20 -nvidia-nccl-cu12==2.27.5 -triton==3.5.0 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r requirements-glossapi-docling.txt +torch==2.6.0 +torchvision==0.21.0 +torchaudio==2.6.0 +transformers==4.46.3 +tokenizers==0.20.3 +accelerate>=1.2.1,<2 +pymupdf==1.24.10 +Pillow==10.4.0 +img2pdf>=0.5.1 +easydict +addict diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt new file mode 100644 index 0000000..402261a --- /dev/null +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -0,0 +1,38 @@ +# Core GlossAPI runtime (Docling extraction/layout) +maturin>=1.5,<2.0 +numpy<2 +pandas>=1.3.0 +python-dateutil>=2.8.2 +pytz>=2021.1 +scikit-learn==1.6.1 +joblib>=1.0.0 +dask>=2022.1.0 +pyarrow>=7.0.0 +aiohttp>=3.8.0 +aiofiles>=23.0.0 +ftfy>=6.0.0 +tenacity>=8.0.0 +tqdm>=4.67.0 +pyyaml>=6.0 +pypdfium2>=4.0.0 +zstandard>=0.22.0 +docling==2.48.0 +docling-core==2.47.0 +docling-parse==4.4.0 +docling-ibm-models==3.9.1 +msgspec>=0.18.6 +fpdf2>=2.7.0 +cachetools +cbor2 +einops +tiktoken +diskcache==5.6.3 +lark==1.2.2 +numba==0.61.2 +# Tooling / tests +pytest>=8.0 +pytest-mock>=3.14 +psutil>=5.9 +rich>=14.0 +safetensors>=0.4 +huggingface-hub>=0.22 diff --git 
a/dependency_setup/requirements-glossapi-rapidocr.txt b/dependency_setup/requirements-glossapi-rapidocr.txt deleted file mode 100644 index f5c5839..0000000 --- a/dependency_setup/requirements-glossapi-rapidocr.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements-glossapi-vanilla.txt -rapidocr>=3.3.0 -opencv-python-headless>=4.8.0 -onnxruntime-gpu==1.18.1 diff --git a/dependency_setup/setup_deepseek_uv.sh b/dependency_setup/setup_deepseek_uv.sh new file mode 100755 index 0000000..04a21ba --- /dev/null +++ b/dependency_setup/setup_deepseek_uv.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +PROJECT_DIR="${SCRIPT_DIR}/deepseek_uv" + +PYTHON_BIN="${PYTHON:-python3}" +VENV_PATH="${GLOSSAPI_DEEPSEEK_VENV:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" +MODEL_ROOT="${DEEPSEEK_ROOT:-${REPO_ROOT}/deepseek-ocr-2-model}" +DOWNLOAD_MODEL=0 +RUN_SMOKE=0 +RUN_TESTS=0 + +info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + +SYNC_ARGS=(--no-dev) + +usage() { + cat <<'EOF' +Usage: setup_deepseek_uv.sh [options] + +Options: + --venv PATH Target virtual environment path + --python PATH Python executable to use for uv venv + --model-root PATH Destination root for the DeepSeek-OCR-2 model + --download-model Download DeepSeek-OCR-2 via huggingface_hub + --run-tests Run the DeepSeek pytest subset after installation + --smoke-test Run dependency_setup/deepseek_gpu_smoke.py + --help Show this help message +EOF +} + +while (( "$#" )); do + case "$1" in + --venv) + shift || { echo "--venv requires a path" >&2; exit 1; } + VENV_PATH="${1:-}" + ;; + --python) + shift || { echo "--python requires a path" >&2; exit 1; } + PYTHON_BIN="${1:-}" + ;; + --model-root|--weights-dir) + shift || { echo "--model-root requires a path" >&2; exit 1; } + 
MODEL_ROOT="${1:-}" + ;; + --download-model|--download-deepseek) + DOWNLOAD_MODEL=1 + ;; + --run-tests) + RUN_TESTS=1 + ;; + --smoke-test) + RUN_SMOKE=1 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift || true +done + +command -v uv >/dev/null 2>&1 || error "uv is required. Install it first, e.g. 'python3 -m pip install --user uv'." + +MODEL_DIR="${MODEL_ROOT}/DeepSeek-OCR-2" + +if [[ -x "${VENV_PATH}/bin/python" ]]; then + info "Reusing uv environment at ${VENV_PATH}" +else + info "Creating uv environment at ${VENV_PATH}" + uv venv --python "${PYTHON_BIN}" "${VENV_PATH}" +fi + +if [[ "${RUN_TESTS}" -eq 1 ]]; then + SYNC_ARGS+=(--group test) +fi + +info "Syncing DeepSeek runtime from ${PROJECT_DIR}" +UV_PROJECT_ENVIRONMENT="${VENV_PATH}" uv sync --project "${PROJECT_DIR}" --python "${VENV_PATH}/bin/python" "${SYNC_ARGS[@]}" + +info "Installing Rust extensions in editable mode" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_noise" + +if [[ "${DOWNLOAD_MODEL}" -eq 1 ]]; then + info "Downloading DeepSeek-OCR-2 model to ${MODEL_DIR}" + HUGGINGFACE_HUB_TOKEN="${HUGGINGFACE_HUB_TOKEN:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-${HUGGINGFACE_TOKEN:-}}}}" \ + "${VENV_PATH}/bin/python" - <\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + usage() { cat <<'EOF' Usage: setup_glossapi.sh [options] Options: - --mode MODE Environment profile: vanilla, rapidocr, deepseek (default: vanilla) + --mode MODE Environment profile: docling or deepseek (default: docling) --venv PATH Target virtual environment path --python PATH Python executable to use when creating the venv - --download-deepseek Fetch DeepSeek-OCR weights (only meaningful for --mode deepseek) - --weights-dir PATH Destination 
directory for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr) + --download-deepseek Fetch DeepSeek-OCR-2 weights (DeepSeek mode only) + --weights-dir PATH Destination directory root for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr-2-model) --run-tests Run pytest -q after installation --smoke-test Run dependency_setup/deepseek_gpu_smoke.py (deepseek mode only) --help Show this help message @@ -69,13 +73,30 @@ while (( "$#" )); do done case "${MODE}" in - vanilla|rapidocr|deepseek) ;; + vanilla) + warn "Mode 'vanilla' is deprecated; using 'docling' instead." + MODE="docling" + ;; + rapidocr) + error "RapidOCR setup has been removed. Use --mode docling or --mode deepseek." + ;; + docling|deepseek) ;; *) - echo "Invalid mode '${MODE}'. Expected vanilla, rapidocr, or deepseek." >&2 + echo "Invalid mode '${MODE}'. Expected docling or deepseek." >&2 exit 1 ;; esac +if [[ "${MODE}" == "deepseek" ]]; then + exec "${SCRIPT_DIR}/setup_deepseek_uv.sh" \ + --python "${PYTHON_BIN}" \ + --venv "${VENV_PATH:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" \ + --model-root "${DEEPSEEK_ROOT}" \ + $([[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]] && printf '%s' "--download-model") \ + $([[ "${RUN_TESTS}" -eq 1 ]] && printf '%s' "--run-tests") \ + $([[ "${RUN_SMOKE}" -eq 1 ]] && printf '%s' "--smoke-test") +fi + if [[ -z "${VENV_PATH}" ]]; then VENV_PATH="${REPO_ROOT}/.venv_glossapi_${MODE}" fi @@ -86,10 +107,6 @@ if [[ ! -f "${REQUIREMENTS_FILE}" ]]; then exit 1 fi -info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } -warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } -error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } - ensure_venv() { if [[ ! 
-d "${VENV_PATH}" ]]; then info "Creating virtual environment at ${VENV_PATH}" @@ -107,44 +124,6 @@ python_run() { "${VENV_PATH}/bin/python" "$@" } -download_deepseek_weights() { - local root="$1" - local target="${root}/DeepSeek-OCR" - - if [[ -d "${target}" ]]; then - info "DeepSeek-OCR weights already present at ${target}" - return 0 - fi - - mkdir -p "${root}" - if command -v huggingface-cli >/dev/null 2>&1; then - info "Downloading DeepSeek weights with huggingface-cli (this may take a while)" - huggingface-cli download deepseek-ai/DeepSeek-OCR \ - --repo-type model \ - --include "DeepSeek-OCR/*" \ - --local-dir "${target}" \ - --local-dir-use-symlinks False || warn "huggingface-cli download failed; falling back to git-lfs" - fi - - if [[ ! -d "${target}" ]]; then - if command -v git >/dev/null 2>&1; then - if ! command -v git-lfs >/dev/null 2>&1; then - warn "git-lfs not available; install git-lfs to clone DeepSeek weights via git." - else - info "Cloning DeepSeek weights via git-lfs" - git lfs install --skip-repo >/dev/null 2>&1 || true - git clone https://huggingface.co/deepseek-ai/DeepSeek-OCR "${target}" - fi - else - warn "Neither huggingface-cli nor git found; skipping DeepSeek weight download." - fi - fi - - if [[ ! -d "${target}" ]]; then - warn "DeepSeek weights were not downloaded. Set DEEPSEEK_ROOT manually once acquired." 
- fi -} - ensure_venv info "Upgrading pip tooling" pip_run install --upgrade pip wheel setuptools @@ -159,43 +138,18 @@ info "Building Rust extensions via editable installs" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_noise" -if [[ "${MODE}" == "deepseek" ]]; then - export GLOSSAPI_DEEPSEEK_PYTHON="${VENV_PATH}/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="${DEEPSEEK_ROOT}/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="${DEEPSEEK_ROOT}/libjpeg-turbo/lib" - export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 - export LD_LIBRARY_PATH="${GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH:-}" - - if [[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]]; then - download_deepseek_weights "${DEEPSEEK_ROOT}" - else - warn "DeepSeek weights not downloaded (use --download-deepseek to fetch automatically)." - fi -fi - if [[ "${RUN_TESTS}" -eq 1 ]]; then pytest_args=("-q") case "${MODE}" in - vanilla) - pytest_args+=("-m" "not rapidocr and not deepseek") - ;; - rapidocr) + docling) pytest_args+=("-m" "not deepseek") ;; - deepseek) - pytest_args+=("-m" "not rapidocr") - ;; esac info "Running pytest ${pytest_args[*]} tests" python_run -m pytest "${pytest_args[@]}" tests fi -if [[ "${MODE}" == "deepseek" && "${RUN_SMOKE}" -eq 1 ]]; then - info "Running DeepSeek smoke test" - python_run "${SCRIPT_DIR}/deepseek_gpu_smoke.py" -fi - cat < clean/evaluate -> OCR -> section validation has been run on capped Pergamos samples +- OCR progress artifacts were moved out of the canonical `markdown/` tree so downstream stages no longer treat them as real documents + +The following work is intentionally not part of the completed set yet: + +- Docling dependency upgrades +- page-level OCR reevaluation experiments +- broader corpus-level comparative benchmarking beyond the capped validation runs + +## Remaining TODO to wrap up the implemented changes + +These are the remaining tasks for closing out the already-implemented migration work: + 
+1. review and curate the final commit contents +2. keep only source, docs, and test changes that belong in the `development` branch +3. exclude local artifacts, downloaded models, disposable environments, and ad hoc validation output from the commit +4. optionally run one more small real-PDF compatibility slice if an extra release-confidence check is desired +5. create or switch to the `development` branch and push the finalized change set there + +This means the migration implementation itself is effectively done; what remains is mainly release hygiene and branch preparation. + +## Target architecture + +The target shape is: + +1. `download()` +2. `extract()` via safe backend or Docling +3. `clean()` and compute Greek-quality routing +4. `ocr()` via DeepSeek only for documents that need remediation +5. `section()` +6. `annotate()` +7. `export()` + +Important boundary: + +- keep `Docling` for extraction, layout, Markdown, JSON artifacts, and optional formula/code enrichment +- remove `RapidOCR` from the OCR path and installation surface +- enforce `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` for production and release validation + +This is a simplification, not a redesign of the entire pipeline contract. 
+ +## Why this direction + +The current mixed OCR surface adds complexity in three places: + +- dependency installation and CUDA compatibility +- runtime branching and operational support burden +- validation burden when one OCR path succeeds and another fails differently + +The simplified design still preserves the important current properties: + +- selective OCR after Greek-quality validation +- Docling-generated layout and JSON artifacts for downstream stages +- explicit operational metadata and rerun semantics + +## Stage 1: DeepSeek-only OCR + +Goal: + +- make DeepSeek the only OCR remediation backend +- remove silent stub fallback from production paths + +Changes: + +- remove `rapidocr` as a supported OCR backend +- route `Corpus.ocr()` to DeepSeek only +- fail hard when DeepSeek runtime, weights, or CLI are unavailable +- keep the current document-level `needs_ocr` selection model + +Do not change in this stage: + +- Docling extraction contract +- sectioning and annotation behavior +- page-level routing policy +- formula/code enrichment policy + +Why this stage exists: + +- it gives the desired simplification without changing the rest of the pipeline contract at the same time +- it isolates OCR-engine risk from Docling-upgrade risk + +Success criteria: + +- no remaining production path imports or dispatches RapidOCR +- no final validation run succeeds via stub output +- documents flagged `needs_ocr=True` can still be remediated through DeepSeek + +Status: + +- completed + +## Stage 2: Installation simplification + +Goal: + +- reduce the environment surface to what the simplified pipeline actually needs + +Changes: + +- remove the `rapidocr` install profile and `onnxruntime-gpu` +- simplify setup profiles around: + - Docling extraction/runtime + - DeepSeek OCR runtime +- remove unused requirement baggage where it is not imported by GlossAPI itself +- make Python version constraints match current upstream reality + +Current constraint to fix: + +- GlossAPI 
currently declares `requires-python = ">=3.8"` while current Docling requires Python `>=3.10` + +Do not change in this stage: + +- pipeline behavior +- artifact layout +- OCR routing logic + +Why this stage exists: + +- environment simplification should follow architectural simplification +- it is easier to reason about required packages once RapidOCR is gone + +Success criteria: + +- setup documentation exposes only the supported environments +- install instructions no longer mention removed OCR components +- Python floor and dependency pins are internally consistent + +Status: + +- completed for the currently supported DeepSeek-only flow +- final branch hygiene and commit curation still remain + +## Stage 3: Docling upgrade + +Goal: + +- upgrade Docling after the OCR surface has already been simplified + +Changes: + +- update `docling` +- update `docling-core` +- update `docling-parse` +- update `docling-ibm-models` +- adapt any compatibility shims required by changed public APIs + +Do not change in this stage: + +- DeepSeek-only OCR decision +- page-level experiment +- formula/code enrichment policy unless explicitly validated + +Why this stage exists: + +- upgrading Docling before removing RapidOCR combines two unrelated breakage sources +- after Stage 1 the Docling integration surface is smaller and easier to validate + +Success criteria: + +- Phase-1 extraction still produces the documented canonical artifacts +- downstream sectioning, annotation, and export still consume the outputs +- metadata and resumability behavior do not regress + +Status: + +- deferred + +## Stage 4: Re-evaluate retained Docling capabilities + +Goal: + +- decide which Docling-powered features remain justified after the simplification + +Features to evaluate: + +- formula enrichment +- code enrichment +- table structure extraction +- any extra model/artifact prefetch currently required for non-default functionality + +Why this stage exists: + +- some capabilities may still be valuable 
for technical corpora +- some may only be increasing runtime and failure surface + +Rule: + +- do not remove formula/code enrichment just because it simplifies the stack +- remove it only if real-corpus evaluation shows little or no value + +Success criteria: + +- every retained capability has a measurable purpose +- every removed capability has an explicit evaluation-based justification + +Status: + +- pending + +## Stage 5: Page-level reevaluation experiment + +Goal: + +- test whether whole-document OCR reruns should be replaced or complemented by page-level escalation + +Experiment shape: + +- baseline branch: current document-level `needs_ocr` routing +- experiment branch: page-level or ROI-level routing + +What stays fixed: + +- DeepSeek remains the only OCR backend +- Docling remains the structured extraction/layout path + +Why this is separate: + +- it is an architectural experiment, not a prerequisite for the OCR simplification +- it should be compared against the stabilized DeepSeek-only baseline + +Primary evaluation questions: + +- does page-level escalation improve quality on long PDFs +- does it reduce OCR runtime and GPU cost +- does it preserve downstream sectioning and annotation quality + +Status: + +- pending + +## Non-goals for the first pass + +These are intentionally out of scope for the initial migration: + +- replacing Docling JSON/layout artifacts with DeepSeek-native structured artifacts +- merging all runtime concerns into one universal environment regardless of ecosystem constraints +- changing artifact layout at the same time as OCR simplification +- treating synthetic, mocked, or stubbed tests as sufficient release validation + +## Release sequence + +The intended order is: + +1. DeepSeek-only OCR and no-stub enforcement +2. installation simplification +3. Docling upgrade +4. retained-capability review +5. page-level experiment + +This order keeps one major architectural assumption changing at a time. 
diff --git a/docs/architecture/index.md b/docs/architecture/index.md index a8d8621..f6e1c85 100644 --- a/docs/architecture/index.md +++ b/docs/architecture/index.md @@ -103,7 +103,7 @@ Purpose: Important characteristics: -- can use RapidOCR via Docling or DeepSeek OCR +- uses DeepSeek OCR for remediation while keeping Docling in the surrounding extraction/layout flow - reads metadata to find OCR candidates - skiplist-aware - designed as a corrective stage, not the default for every document diff --git a/docs/configuration.md b/docs/configuration.md index 659d65c..0810530 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -20,28 +20,22 @@ Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread ### DeepSeek optional dependencies -Install DeepSeek backend extras to enable the DeepSeek OCR path (imports remain lazy, so the package is optional). Use the CUDA 12.1 wheels for both vLLM and Torch: +Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: ```bash -pip install '.[deepseek]' - -# Install Torch CUDA 12.1 wheels (required by the DeepSeek script) -pip install --extra-index-url https://download.pytorch.org/whl/cu121 \ - 'torch==2.5.1+cu121' 'torchvision==0.20.1+cu121' - -# Alternatively, use the requirements file (edit to uncomment torch lines): -pip install -r deepseek-ocr/requirements-deepseek.txt +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. ### DeepSeek runtime controls -- `GLOSSAPI_DEEPSEEK_ALLOW_STUB` (`1` by default): allow the builtin stub runner for tests and lightweight environments. -- `GLOSSAPI_DEEPSEEK_ALLOW_CLI` (`0` by default): flip to `1` to force the real vLLM CLI even when the stub is allowed. 
-- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs `run_pdf_ocr_vllm.py` (defaults to the current interpreter). -- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT`: override path to the DeepSeek CLI script (defaults to `deepseek-ocr/run_pdf_ocr_vllm.py` under the repo). -- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths (e.g., for `libjpeg-turbo`) when launching the CLI. +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB`: must remain `0`; stub execution is rejected. +- `GLOSSAPI_DEEPSEEK_ALLOW_CLI`: keep at `1` to require the real runtime. +- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. +- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`: override path to the OCR runner script (defaults to `src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`). +- `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. +- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. ## Math Enrichment (Phase‑2) @@ -71,10 +65,6 @@ All LaTeX policy knobs are loaded via `glossapi.text_sanitize.load_latex_policy( - `GLOSSAPI_WORKER_LOG_DIR`: override the directory used for per-worker logs and `gpu.current` markers (defaults to `logs/ocr_workers/` or `logs/math_workers/` under the output directory). - `GLOSSAPI_WORKER_LOG_VERBOSE` = `1|0` (default `1`): emit (or suppress) the GPU binding banner each worker prints on startup. -## RapidOCR Model Paths - -- `GLOSSAPI_RAPIDOCR_ONNX_DIR`: directory containing `det/rec/cls` ONNX models and keys. 
- ## Triage & Parquet - Triage always writes both: diff --git a/docs/getting_started.md b/docs/getting_started.md index f6bf4ce..94a2325 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -4,46 +4,39 @@ This guide gets a new GlossAPI contributor from clone → first extraction with ## Checklist -- Python 3.8+ (3.10 recommended) +- Python 3.10+ (`3.12` recommended for the DeepSeek runtime) - Recent `pip` (or `uv`) and a C/C++ toolchain for Rust wheels -- Optional: NVIDIA GPU with CUDA 12.x drivers for Docling/RapidOCR acceleration +- Optional: NVIDIA GPU with CUDA drivers for Docling/DeepSeek acceleration ## Install GlossAPI -### Recommended — mode-aware setup script +### Recommended setup -Use `dependency_setup/setup_glossapi.sh` to build an isolated virtualenv with the correct dependency set for vanilla, RapidOCR, or DeepSeek runs. Examples: +Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `dependency_setup/setup_deepseek_uv.sh` for the OCR runtime. 
Examples: ```bash -# Vanilla pipeline (CPU-only OCR) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# RapidOCR GPU stack -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR on GPU (expects weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR on GPU (uv-managed, downloads DeepSeek-OCR-2 if requested) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Add `--download-deepseek` if you need the script to fetch weights via Hugging Face; otherwise it searches `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Inspect `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation runs. The script installs GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the CLI can run (env vars, model dir, flashinfer, cc1plus, libjpeg). -- Force the real CLI and avoid stub fallback by setting: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. 
+- Force the real runtime and avoid stub fallback by setting: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- Install a CUDA toolkit with `nvcc` and set `CUDA_HOME` / prepend `$CUDA_HOME/bin` to `PATH` (FlashInfer/vLLM JIT expects it). -- If FlashInfer is unstable on your stack, disable it with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- Avoid FP8 KV cache issues by exporting `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1`; tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. -- Keep `LD_LIBRARY_PATH` pointing at the toolkit lib64 (e.g. `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`). + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- `flash-attn` is optional. The runner uses it when available and otherwise falls back to the Transformers `eager` attention implementation. ### Option 1 — pip (evaluate quickly) @@ -74,30 +67,19 @@ chmod +x scripts/setup_conda.sh conda activate glossapi ``` -The helper script provisions Python 3.10, installs Rust + `maturin`, performs an editable install, and applies the Docling RapidOCR patch automatically. +The helper script provisions Python 3.10, installs Rust + `maturin`, and performs an editable install. ## GPU prerequisites (optional but recommended) -`setup_glossapi.sh` pulls the right CUDA/Torch/ONNX wheels for the RapidOCR and DeepSeek profiles. 
If you are curating dependencies manually, make sure you: +`setup_glossapi.sh` and `setup_deepseek_uv.sh` pull the required Torch wheels for the supported Docling and DeepSeek flows. If you are curating dependencies manually, make sure you: -- Install the GPU build of ONNX Runtime (`onnxruntime-gpu`) and uninstall the CPU wheel. -- Select the PyTorch build that matches your driver/toolkit (the repository currently targets CUDA 12.8 for DeepSeek). +- Select the PyTorch build that matches your driver/toolkit. - Verify the providers with: ```bash - python -c "import onnxruntime as ort; print(ort.get_available_providers())" python -c "import torch; print(torch.cuda.is_available())" ``` -## RapidOCR models & keys - -GlossAPI ships the required ONNX models and Greek keys under `glossapi/models/rapidocr/{onnx,keys}`. To override them, set `GLOSSAPI_RAPIDOCR_ONNX_DIR` to a directory containing: - -- `det/inference.onnx` -- `rec/inference.onnx` -- `cls/ch_ppocr_mobile_v2.0_cls_infer.onnx` -- `greek_ppocrv5_keys.txt` - ## First run (lightweight corpus) ```bash diff --git a/docs/index.md b/docs/index.md index d696c8d..d8ec279 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,10 +16,11 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. - [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. - [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. +- [DeepSeek-Only Upgrade Roadmap](architecture/deepseek_only_upgrade_roadmap.md) — the staged simplification plan for OCR and dependency upgrades. 
## Learn the pipeline - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. -- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers Docling + RapidOCR usage. +- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. - [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. @@ -27,6 +28,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Configuration](configuration.md) lists all environment knobs. - [Troubleshooting](troubleshooting.md) captures the most common pitfalls. - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. +- [Compatibility And Regression Matrix](testing/compatibility_matrix.md) defines the release-validation gates for the migration and upgrades. ## Reference - [Corpus API](api/corpus.md) details public methods and parameters. diff --git a/docs/math_enrichment_runtime.md b/docs/math_enrichment_runtime.md index 21d8617..096209c 100644 --- a/docs/math_enrichment_runtime.md +++ b/docs/math_enrichment_runtime.md @@ -68,9 +68,8 @@ c.ocr(math_targets=targets, math_batch_size=4) ## OCR/Model Constraints (recap) -- ORT GPU only: uninstall `onnxruntime` CPU; use `onnxruntime-gpu`. -- RapidOCR keys: Docling 2.48.0 needs `Rec.rec_keys_path` patch (see README). -- Model discovery: set `GLOSSAPI_RAPIDOCR_ONNX_DIR` or package models under `glossapi/models/rapidocr/`. +- DeepSeek OCR runs in its own pinned runtime; set `GLOSSAPI_DEEPSEEK_PYTHON`, `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`, and `GLOSSAPI_DEEPSEEK_MODEL_DIR`. +- Keep `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` and `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`. - Optional Torch CUDA: needed for GPU layout/enrichment; see README for the CUDA wheels. 
## Multi‑GPU diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 197bb0a..f401829 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -1,15 +1,14 @@ # GPU OCR and Math Enrichment -This document summarizes how GlossAPI uses the GPU for OCR and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. +This document summarizes how GlossAPI uses the GPU for OCR remediation and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. ## Overview -- Phase‑1 (Extract): PDF → Markdown via Docling; optional GPU OCR via RapidOCR (ONNXRuntime). Optionally emit JSON + formula index for Phase‑2. +- Phase‑1 (Extract): PDF → Markdown via Docling or the safe backend. Optionally emit JSON + formula index for Phase‑2. - Phase‑2 (Enrich): From Docling JSON, decode math/code on the GPU (CodeFormula) and re‑emit enriched Markdown. Backends -- `backend='rapidocr'` (default): Docling + RapidOCR; Phase‑2 math runs from Docling JSON. -- `backend='deepseek'`: DeepSeek‑OCR; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. +- `backend='deepseek'`: DeepSeek-OCR-2; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. Policy: never OCR and math on the same file - If a file needs OCR, GlossAPI runs OCR only (no Phase‑2 on that file in the same pass). 
@@ -18,24 +17,20 @@ Policy: never OCR and math on the same file ### Python API layout - DeepSeek entry point: `glossapi.ocr.deepseek.runner.run_for_files(...)` -- RapidOCR dispatcher: `glossapi.ocr.rapidocr.dispatch.run_via_extract(...)` - Math enrichment: `glossapi.ocr.math.enrich.enrich_from_docling_json(...)` - Utility helpers (Docling JSON / cleaning): `glossapi.ocr.utils.*` ## Prerequisites -- RapidOCR/Docling stack: `pip install '.[rapidocr]'` -- DeepSeek CLI stack (in a dedicated venv recommended): `pip install '.[deepseek]'` -- ONNXRuntime GPU installed (no CPU ORT): `onnxruntime-gpu==1.18.1` -- Torch CUDA installed: e.g., `torch==2.5.1+cu121` -- Packaged RapidOCR models/keys found under `glossapi/models/rapidocr/{onnx,keys}` or via `GLOSSAPI_RAPIDOCR_ONNX_DIR`. +- Main GlossAPI stack: `./dependency_setup/setup_glossapi.sh --mode docling` +- DeepSeek runtime: `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek` +- Torch CUDA installed in the DeepSeek env (the uv setup pins the tested stack). - Optional helpers for Phase‑2 JSON: `pypdfium2`, `zstandard`. 
Verify GPU readiness before forcing OCR or math: ```bash python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())" # expects True, >=1 -python -c "import onnxruntime as ort; print(ort.get_available_providers())" # must include CUDAExecutionProvider ``` ## Running Phase‑1 (Extract) @@ -44,17 +39,14 @@ python -c "import onnxruntime as ort; print(ort.get_available_providers())" from glossapi import Corpus c = Corpus('IN','OUT') -# GPU OCR on PDFs; emit JSON + formula index for Phase‑2 +# Emit JSON + formula index for Phase‑2 c.extract( input_format='pdf', - accel_type='CUDA', # or use_gpus='multi' for multi‑GPU - force_ocr=True, # OCR always on for PDFs + accel_type='CUDA', emit_formula_index=True, # request json/<stem>.formula_index.jsonl alongside the default JSON ) ``` -When `force_ocr=True` (or when math/code enrichment is enabled), GlossAPI automatically switches to the Docling backend and aborts if CUDA‑enabled torch/ONNXRuntime providers are not available. - Outputs: - `markdown/<stem>.md` - `json/<stem>.docling.json(.zst)` and `json/<stem>.formula_index.jsonl` @@ -88,12 +80,7 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → runs OCR only for bad files; equations are included inline; Phase‑2 is skipped ``` -If you need Phase‑2 math on files that do not require OCR, use RapidOCR/Docling and math‑only (expects Docling JSON from Phase‑1): - -```python -c.ocr(backend='rapidocr', fix_bad=False, math_enhance=True, mode='math_only') -# → runs Phase‑2 on non‑OCR files only (requires Docling JSON) -``` +If you need Phase‑2 math on files that do not require OCR, run `math_only` after Docling extraction with JSON enabled. ## Multi‑GPU @@ -101,7 +88,7 @@ Phase‑1 (extract): ```python c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) ``` -Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. OCR uses ORT GPU under the same process. 
+Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. Phase‑2 (enrich): ```python @@ -119,7 +106,7 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning - Batch sizes - - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula docling side throughput. + - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. - Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. @@ -159,11 +146,7 @@ OUT/ ## Troubleshooting -- Missing CUDAExecutionProvider - - Ensure `onnxruntime-gpu` is installed and `onnxruntime` CPU is uninstalled. - Torch reports no CUDA - Check `nvidia-smi` and match Torch CUDA build to your driver. -- OCR is slow or falls back to CPU - - Confirm ORT providers include CUDAExecutionProvider and that `accel_type='CUDA'` is used. - Out of memory - Lower `batch_size` for Phase‑2, reduce `GLOSSAPI_IMAGES_SCALE`, or split inputs. diff --git a/docs/quickstart.md b/docs/quickstart.md index 4b10685..a498725 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -38,14 +38,13 @@ Workers report per-batch summaries and extraction progress is persisted into `download_results/download_results.parquet`, so you can restart multi-GPU runs without losing progress (no extra checkpoint files required). 
-## GPU OCR (opt-in) +## OCR remediation (opt-in) ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', accel_type='CUDA', force_ocr=True) -# or reuse multi-GPU batching -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.clean() +c.ocr(backend='deepseek', fix_bad=True, math_enhance=False) ``` ## Phase‑2 Math Enrichment (from JSON) @@ -76,7 +75,7 @@ c.section() # to parquet c.annotate() # classify/annotate sections ``` -See ocr_and_math_enhancement.md for GPU details, batch sizes, and artifact locations. +See `ocr_and_math_enhancement.md` for OCR runtime details, batch sizes, and artifact locations. ### DeepSeek OCR @@ -89,12 +88,11 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → OCR only for bad files; math is included inline in the Markdown ``` -To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the CLI bits are reachable: +To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the runtime is reachable: ```bash -export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py -export GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek-venv/bin/python -export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR -export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib +export GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek-venv/bin/python +export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2 python -m glossapi.ocr.deepseek.preflight # optional: validates env without running OCR ``` diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 3bf8815..3a7e57c 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -25,12 +25,9 @@ The OCR stage repairs documents whose extracted text is considered 
unreliable, a ## Backend choices -The pipeline supports at least two OCR-oriented modes: - -- RapidOCR through the Docling path -- DeepSeek OCR for environments configured for that backend - -These are operationally different and should not be treated as interchangeable implementation details. +The supported OCR remediation backend is DeepSeek OCR. Docling remains part of +the surrounding extraction and layout flow, but OCR reruns themselves are now +expected to use the DeepSeek runtime. ## Selection model diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md new file mode 100644 index 0000000..0c00d59 --- /dev/null +++ b/docs/testing/compatibility_matrix.md @@ -0,0 +1,276 @@ +# Compatibility And Regression Matrix + +This document defines the release-validation matrix for the DeepSeek-only migration and subsequent Docling upgrades. + +It is not a generic unit-test list. It is a contract-based validation plan tied to the documented pipeline behavior. + +## Scope + +This matrix applies to changes in: + +- DeepSeek-only OCR migration +- no-stub enforcement +- installation simplification +- Docling dependency upgrades +- page-level reevaluation experiments + +## Validation policy + +Release validation for this migration must use: + +- real PDFs +- real Docling +- real DeepSeek +- real GPUs where the code path requires them +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` + +Developer-only tests may still use mocks or lightweight stubs for fast iteration, but those do not satisfy release gates for this migration. 
+ +## Test levels + +### L0: Install and import sanity + +Purpose: + +- prove the supported environments install cleanly and that removed components are truly gone + +Typical inputs: + +- fresh venv +- supported Python version + +### L1: Lightweight smoke corpus + +Purpose: + +- prove the baseline end-to-end flow still works on the small repo corpus + +Typical inputs: + +- `samples/lightweight_pdf_corpus/` + +### L2: Real-PDF contract validation + +Purpose: + +- prove the documented artifacts and metadata contracts still hold on real documents + +Typical inputs: + +- real PDFs from a representative sample + +### L3: Multi-GPU and operational recovery + +Purpose: + +- prove the runtime behavior remains correct under parallel execution and rerun conditions + +Typical inputs: + +- multiple real PDFs +- at least two visible GPUs + +### L4: Comparative corpus evaluation + +Purpose: + +- compare baseline and changed behavior on a real evaluation slice + +Typical inputs: + +- real corpus slice such as the Pergamos sample + +## Mandatory invariants + +The following must remain true unless a change explicitly revises the contract and updates the docs: + +- canonical Markdown is written to `markdown/<stem>.md` +- Docling JSON artifacts are emitted when requested +- cleaner output still drives `needs_ocr` +- OCR remains selective rather than defaulting to all documents +- metadata parquet remains the durable operational record +- reruns skip completed work unless forced +- skiplist semantics remain explicit and stable +- no production path silently falls back to stub OCR + +## Release-gate matrix + +| ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | +| --- | --- | --- | --- | --- | --- | --- | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed RapidOCR profile | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import 
`glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no runtime import of removed RapidOCR modules | +| `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | +| `EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | +| `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | +| `OCR-001` | L2 | DeepSeek-only remediation | docs with `needs_ocr=True` | `ocr(backend="deepseek", fix_bad=True)` | recovered docs updated, metadata marks `ocr_success=True` | no stub output, no silent success | +| `OCR-002` | L2 | No-stub enforcement | broken/missing DeepSeek runtime | run OCR with `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` | run fails explicitly | failure must not produce placeholder success artifacts | +| `MTH-001` | L2 | Formula/code enrichment compatibility | math-heavy real PDF | Docling extract plus Phase-2 enrichment | enriched outputs and metadata remain coherent | no schema drift breaking enrichment | +| `SEC-001` | L2 | Sectioning contract | usable real docs | `section()` | `sections/sections_for_annotation.parquet` produced | no empty-output regression caused by upstream changes | +| `ANN-001` | L2 | Annotation contract | section parquet | `annotate()` | classified outputs produced | model integration must not break on changed upstream text/layout | +| `EXP-001` | L2 | Export contract | processed docs | `jsonl()` / `jsonl_sharded()` | JSONL and metadata outputs match documented layout | no dropped metadata fields without explicit design change | +| `RES-001` | L3 | Resumability | interrupted or partial run | rerun with defaults | 
completed items skipped correctly | no duplicate reprocessing by default | +| `RES-002` | L3 | Force/reprocess semantics | prior successful run | rerun with force/reprocess flag | selected items are reprocessed | no stale completion flags blocking intended rerun | +| `SKP-001` | L3 | Skiplist semantics | run with known problematic items | extract/OCR rerun | skiplist excludes intended stems only | no hidden filtering of healthy items | +| `GPU-001` | L3 | Multi-GPU OCR | real PDF slice on 2 GPUs | DeepSeek OCR in parallel | work is distributed and completes per GPU | no worker success masking failures | +| `CMP-001` | L4 | Baseline quality comparison | Pergamos sample slice | compare pre/post change outputs | no material regression in artifact completeness and downstream usability | runtime improvement alone does not justify quality loss | +| `CMP-002` | L4 | Whole-text vs page-level experiment | long PDFs | compare baseline branch vs page-level branch | quality/runtime tradeoff explicitly measured | experimental branch does not replace baseline without evidence | + +## Detailed test groups + +### Install and runtime compatibility + +What to prove: + +- supported environment installs cleanly +- unsupported/removed OCR components are not required +- Python floor matches actual upstream dependencies + +Critical checks: + +- packaging metadata uses a supported Python minimum +- setup docs expose only supported install paths +- removal of RapidOCR does not leave dead imports or entrypoints + +### Extraction contract + +What to prove: + +- Phase-1 still produces canonical Markdown +- Docling extraction still produces JSON artifacts when requested +- metrics continue to be written where downstream stages expect them + +Artifacts to check: + +- `markdown/<stem>.md` +- `json/<stem>.docling.json(.zst)` +- `json/<stem>.formula_index.jsonl` when requested +- `json/metrics/<stem>.metrics.json` +- `json/metrics/<stem>.per_page.metrics.json` + +### Cleaning and Greek-quality routing + +What to prove: + +- 
cleaner still computes routing decisions required for selective OCR +- Greek-text validation remains first-class rather than incidental cleanup + +Fields to check in metadata parquet: + +- `needs_ocr` +- `filter` +- Greek-quality and badness-related fields currently emitted by the cleaner + +### DeepSeek OCR contract + +What to prove: + +- DeepSeek is the only OCR remediation backend +- no-stub enforcement is real +- recovered documents update metadata correctly + +Required environment behavior: + +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` +- real model weights present +- real CLI/runtime path present + +Negative checks: + +- no markdown contains placeholder stub markers +- no OCR pass succeeds after a DeepSeek CLI failure unless real output exists +- no removed OCR backend is referenced during final validation + +### Formula and code enrichment + +What to prove: + +- if retained, enrichment still works with the upgraded Docling stack +- if later removed, the removal is justified by evaluation rather than convenience + +Checks: + +- enriched Markdown is generated where expected +- `json/<stem>.latex_map.jsonl` remains coherent when enrichment is enabled +- metadata updates for math enrichment still work + +### Section, annotate, and export contracts + +What to prove: + +- downstream stages still consume the extraction outputs +- output layout and metadata structure remain compatible with the documented pipeline + +Artifacts to check: + +- `sections/sections_for_annotation.parquet` +- `classified_sections.parquet` +- `fully_annotated_sections.parquet` +- exported JSONL shards and related metadata + +### Resumability and operational recovery + +What to prove: + +- reruns still honor completion state +- skiplist semantics remain intact +- multi-worker failures remain visible and recoverable + +Checks: + +- default rerun skips completed items +- explicit force/reprocess reruns the intended items +- problematic stems are persisted and not silently lost + +## Comparative evaluation set + 
+Suggested real-world slice: + +- lightweight corpus for smoke validation +- representative real PDFs spanning: + - short documents + - medium documents + - long documents + - structure-rich documents + - math-heavy documents where applicable + +For current local evaluation work, a Pergamos sample manifest has been prepared outside the repo and can be used as the L3/L4 real-PDF slice. + +## Suggested release sequence + +For the planned migration, run gates in this order: + +1. `ENV-*` +2. `EXT-*` +3. `CLN-*` +4. `OCR-*` +5. `MTH-*` +6. `SEC-*`, `ANN-*`, `EXP-*` +7. `RES-*`, `SKP-*`, `GPU-*` +8. `CMP-*` + +This keeps low-level compatibility failures from being confused with downstream quality regressions. + +## Exit criteria per stage + +### Stage 1 exit criteria + +- DeepSeek-only OCR path works on real PDFs +- no-stub enforcement verified +- no remaining release dependency on RapidOCR + +### Stage 2 exit criteria + +- install paths reduced to supported environments +- packaging/docs no longer reference removed OCR components + +### Stage 3 exit criteria + +- upgraded Docling passes `EXT-*`, `MTH-*`, `SEC-*`, `ANN-*`, and `EXP-*` + +### Stage 4 exit criteria + +- retained or removed Docling capabilities are justified by evaluation evidence + +### Stage 5 exit criteria + +- page-level branch is compared against the stabilized baseline before any adoption decision diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6691407..24cc470 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -2,19 +2,15 @@ ## OCR runs on CPU -- Verify ONNXRuntime GPU: `python -c "import onnxruntime as ort; print(ort.get_available_providers())"` — must include `CUDAExecutionProvider`. -- Ensure CPU ORT wheel is not installed: `pip uninstall -y onnxruntime`. -- Make sure you pass `accel_type='CUDA'` (or `use_gpus='multi'`). +- Verify Torch CUDA: `python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"`. 
+- Make sure the DeepSeek runtime is the one configured in `GLOSSAPI_DEEPSEEK_PYTHON`. +- Run `python -m glossapi.ocr.deepseek.preflight` in the DeepSeek env before large OCR jobs. ## Torch doesn’t see the GPU - Check `nvidia-smi` and driver installation. - Match Torch CUDA build to your driver; see getting_started.md for the recommended wheel. -## RapidOCR font download failure - -- The first OCR call might download a visualization font. Ensure egress is allowed; the file is cached afterwards. - ## Out of memory - Lower Phase‑2 `batch_size` (e.g., 8) and reduce inline `GLOSSAPI_FORMULA_BATCH`. diff --git a/mkdocs.yml b/mkdocs.yml index ba13512..1776dd5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: GlossAPI -site_description: Academic document processing pipeline (Docling + RapidOCR + Rust) +site_description: Academic document processing pipeline (Docling + DeepSeek + Rust) repo_url: https://github.com/eellak/glossAPI theme: name: material @@ -22,6 +22,7 @@ nav: - Metadata, Artifacts, and Run Diagnostics: architecture/metadata_artifacts_and_run_diagnostics.md - Artifact Layout and Stage Handoffs: architecture/artifact_layout_and_stage_handoffs.md - Resumability, Recovery, and Retention: architecture/resumability_recovery_and_retention.md + - DeepSeek-Only Upgrade Roadmap: architecture/deepseek_only_upgrade_roadmap.md - Pipeline: - Pipeline Overview: pipeline.md - OCR & Math Enrichment: ocr_and_math_enhancement.md @@ -39,6 +40,7 @@ nav: - Configuration: configuration.md - AWS Job Distribution: aws_job_distribution.md - Troubleshooting: troubleshooting.md + - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - Corpus API: api/corpus.md - Math Enrichment Runtime: math_enrichment_runtime.md diff --git a/pyproject.toml b/pyproject.toml index 3d0d5fa..60b23f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,11 +10,11 @@ authors = [ {name = "GlossAPI Team", email = "glossapi.team@eellak.gr"} ] readme = 
"README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", # ORT+RapidOCR best compatibility + "numpy<2", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -37,28 +37,26 @@ classifiers = [ ] [project.optional-dependencies] -# Docling + RapidOCR ONNX stack (kept optional to preserve import-light installs) -rapidocr = [ +# Docling extraction/layout stack +docling = [ "docling==2.48.0", - # Use RapidOCR core package; avoid rapidocr_onnxruntime to prevent pip - # from auto-installing the CPU-only 'onnxruntime' wheel. - "rapidocr>=3.3.0", - "onnxruntime-gpu==1.18.1", ] # Optional CUDA layout acceleration (Docling) cuda = [ "torch==2.5.1", "torchvision==0.20.1", ] -# DeepSeek OCR backend extras (CUDA 12.1 build of vLLM). Torch is not pinned here -# because users should install the CUDA wheel from the PyTorch index -# (see docs: installing torch==2.5.1+cu121 via extra index URL). +# DeepSeek OCR backend extras (Torch should be installed from the PyTorch index). 
deepseek = [ - "vllm>=0.11.0", - "transformers>=4.45,<5", + "transformers==4.46.3", + "tokenizers==0.20.3", "accelerate>=1.2.1,<2", "pymupdf==1.24.10", "Pillow==10.4.0", + "img2pdf>=0.5.1", + "einops", + "easydict", + "addict", ] docs = [ "mkdocs>=1.5", @@ -78,6 +76,5 @@ glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ - "rapidocr: requires the RapidOCR/Docling execution stack", "deepseek: exercises the DeepSeek OCR pipeline", ] diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index 4539ead..c92d336 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -1,54 +1,7 @@ -""" -GlossAPI Library - -A library for processing academic texts in Greek and other languages: -- Extracting content from PDFs and other formats with Docling -- Robust batch processing with error isolation and automatic resumption -- Clustering documents based on extraction quality -- Extracting and cleaning academic sections -- Classifying sections using machine learning - -This is an open source project that provides tools for linguistic annotations -and text processing, with a special focus on the Greek language. -""" +"""GlossAPI library.""" from __future__ import annotations -import os - -# Keep Docling/RapidOCR bootstrap optional and import‑light by default. -# If the environment requests skipping (common in tests or minimal envs), -# or if Docling is not installed, we avoid importing heavy dependencies here. -_SKIP_DOCLING_BOOT = os.environ.get("GLOSSAPI_SKIP_DOCLING_BOOT") == "1" - -def _attempt_patch_docling() -> bool: - if _SKIP_DOCLING_BOOT: - return False - try: - # Import inside the function to avoid pulling Docling when unused or missing. - from .ocr.rapidocr.safe import patch_docling_rapidocr # type: ignore - - try: - return bool(patch_docling_rapidocr()) - except Exception: - # Swallow any runtime error to keep top‑level import light/safe. - return False - except Exception: - # Docling (or its transitive deps) not available – keep going. 
- return False - - -def patch_docling_rapidocr() -> bool: - """Best‑effort registration of the SafeRapidOcrModel. - - Returns True when the patch was applied; False when unavailable or skipped. - Safe to call multiple times. - """ - return _attempt_patch_docling() - -# Attempt the patch once at import time, but never fail import if it does not apply. -_ = _attempt_patch_docling() - __all__ = [ 'GlossSection', 'GlossSectionClassifier', @@ -56,7 +9,6 @@ def patch_docling_rapidocr() -> bool: 'Sampler', 'Section', 'GlossDownloader', - 'patch_docling_rapidocr', ] def __getattr__(name: str): @@ -81,7 +33,6 @@ def __getattr__(name: str): return GlossDownloader raise AttributeError(name) -# Derive version dynamically from installed package metadata if possible try: from importlib.metadata import version as _pkg_version __version__: str = _pkg_version(__name__) diff --git a/src/glossapi/_pipeline.py b/src/glossapi/_pipeline.py index 73e5ecc..1909b60 100644 --- a/src/glossapi/_pipeline.py +++ b/src/glossapi/_pipeline.py @@ -1,7 +1,7 @@ """Backward-compatible adapter. -Docling pipeline builders moved to `glossapi.ocr.rapidocr.pipeline`. +Docling pipeline builders moved to `glossapi.ocr.docling.pipeline`. This module re-exports the public API to preserve legacy imports. 
""" -from .ocr.rapidocr.pipeline import * # noqa: F401,F403 +from .ocr.docling.pipeline import * # noqa: F401,F403 diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index abdaa5e..e5a4329 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -346,6 +346,8 @@ def finalize(self) -> None: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + encoding="utf-8", + errors="replace", bufsize=1, ) try: diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index a584eaf..a748dcc 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -96,6 +96,12 @@ def prime_extractor( except Exception: images_scale_env = "1.25" + if force_ocr: + self.logger.warning( + "Phase-1 Docling OCR is deprecated and no longer executes OCR. " + "Use Corpus.ocr(backend='deepseek') for OCR remediation." + ) + # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, @@ -154,12 +160,12 @@ def _gpu_preflight( require_math: bool, require_backend_gpu: bool = False, ) -> None: - """Abort early when GPU OCR/math is requested but CUDA is unavailable.""" + """Abort early when GPU-backed Docling work is requested but CUDA is unavailable.""" if not (require_ocr or require_math or require_backend_gpu): return instructions = ( - "GPU OCR and math enrichment require CUDA-enabled torch and onnxruntime-gpu. " + "GPU-backed Docling extraction and math enrichment require CUDA-enabled torch. " "Install the CUDA wheels and ensure NVIDIA drivers expose the desired devices." ) @@ -167,30 +173,15 @@ def _gpu_preflight( accel_lower = str(accel_type or "").strip().lower() if accel_lower.startswith("cpu"): raise RuntimeError( - "GPU OCR was requested (force_ocr/math) but accel_type='CPU'. " + "GPU-backed Docling extraction was requested but accel_type='CPU'. 
" f"{instructions}" ) - try: - import onnxruntime as _ort # type: ignore - providers = _ort.get_available_providers() - except Exception as exc: - raise RuntimeError( - "onnxruntime not available while attempting GPU OCR. " - "Install onnxruntime-gpu and rerun." - ) from exc - - if "CUDAExecutionProvider" not in providers: - raise RuntimeError( - "CUDAExecutionProvider missing from onnxruntime providers. " - f"Detected providers={providers}. {instructions}" - ) - torch_mod = _maybe_import_torch(force=True) if torch_mod is None or not getattr(torch_mod, "cuda", None) or not torch_mod.cuda.is_available(): raise RuntimeError( - "Torch CUDA is not available but GPU OCR/math was requested. " - "Install the CUDA wheel (e.g. torch==2.5.1+cu121) and ensure CUDA drivers/devices are visible." + "Torch CUDA is not available but GPU-backed Docling extraction/math was requested. " + "Install the CUDA wheel and ensure CUDA drivers/devices are visible." ) device_count = torch_mod.cuda.device_count() @@ -208,13 +199,12 @@ def _gpu_preflight( if not self._gpu_banner_logged: self.logger.info( - "GPU preflight: using torch + onnxruntime GPU backends; ensure CUDA drivers are available." + "GPU preflight: using torch-backed Docling extraction; ensure CUDA drivers are available." 
) self._gpu_banner_logged = True self.logger.info( - "GPU preflight OK: providers=%s torch_devices=%s", - ",".join(providers), + "GPU preflight OK: torch_devices=%s", ", ".join(device_names) or "", ) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 4dec423..80afc7f 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -33,7 +33,7 @@ def ocr( *, fix_bad: bool = True, mode: Optional[str] = None, - backend: str = "rapidocr", + backend: str = "deepseek", device: Optional[str] = None, model_dir: Optional[Union[str, Path]] = None, max_pages: Optional[int] = None, @@ -70,8 +70,8 @@ def ocr( fix_bad only -> 'ocr_bad'; math_enhance only -> 'math_only'; neither -> no‑op. - - backend: 'rapidocr' (default) uses the Docling + RapidOCR path via Phase‑1 extract(). - 'deepseek' uses the DeepSeek‑OCR path (no Docling JSON, math unsupported). + - backend: 'deepseek' (default) uses the DeepSeek OCR remediation path. + Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. @@ -82,9 +82,9 @@ def ocr( ``reprocess_completed=False``). Prefer the explicit ``reprocess_completed`` toggle. 
""" # Normalize backend - backend_norm = str(backend or "rapidocr").strip().lower() - if backend_norm not in {"rapidocr", "deepseek"}: - raise ValueError("backend must be 'rapidocr' or 'deepseek'") + backend_norm = str(backend or "deepseek").strip().lower() + if backend_norm != "deepseek": + raise ValueError("backend must be 'deepseek'") # CONTENT_DEBUG override (preferred uppercase alias) # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags @@ -147,13 +147,21 @@ def ocr( reprocess_completed = reprocess_flag # DeepSeek semantics note - if backend_norm == "deepseek": + if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: try: self.logger.info( "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." ) except Exception: pass + if mode_norm == "ocr_bad_then_math": + try: + self.logger.info( + "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." + ) + except Exception: + pass + mode_norm = "ocr_bad" # Identify bad documents from parquet (Rust cleaner output) bad_files: List[str] = [] skipped_completed = 0 @@ -578,24 +586,6 @@ def _run_math(stems: List[str]) -> None: except Exception as _e: self.logger.error("DeepSeek OCR runner failed: %s", _e) raise - else: - # RapidOCR/Docling path via Phase-1 extract - self.extract( - input_format="pdf", - num_threads=os.cpu_count() or 4, - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=bad_files, - skip_existing=False, - use_gpus=use_gpus, - devices=devices, - # Do not generate Docling JSON for OCR targets; math will skip them - export_doc_json=False, - emit_formula_index=False, - phase1_backend="docling", - ) reran_ocr = True # Update metadata to reflect successful OCR reruns try: diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..3788d54 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py 
@@ -10,7 +10,6 @@ AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, - RapidOcrOptions, LayoutOptions, TableStructureOptions, TableFormerMode, @@ -106,11 +105,8 @@ def _ensure_docling_pipeline_loaded() -> None: from docling.pipeline.simple_pipeline import SimplePipeline -# Ensure RapidOCR plugin is registered for factory-based OCR construction -import docling.models.rapid_ocr_model # noqa: F401 -from .ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from .ocr.rapidocr.pool import GLOBAL_RAPID_OCR_POOL import inspect +from .ocr.docling_pipeline import build_layout_pipeline import ftfy import logging @@ -328,7 +324,7 @@ def _apply_thread_caps(self) -> None: self._thread_caps_applied = True def release_resources(self) -> None: - """Release Docling converters, pooled RapidOCR engines, and GPU caches.""" + """Release Docling converters and GPU caches.""" try: self.converter = None except Exception: @@ -343,10 +339,6 @@ def release_resources(self) -> None: setattr(self, attr, None) except Exception: pass - try: - GLOBAL_RAPID_OCR_POOL.clear() - except Exception: - pass torch_mod = _maybe_import_torch() if torch_mod is not None and getattr(torch_mod, "cuda", None): try: @@ -553,12 +545,7 @@ def create_extractor( ocr_langs: list[str] | None = None, profile_timings: bool = True, ): - """Create a document converter with configured options using the canonical builder. - - Delegates PDF pipeline construction to `glossapi.ocr.rapidocr.pipeline.build_rapidocr_pipeline` - to avoid duplicated provider checks and option wiring. Falls back to the legacy - inline path if the canonical builder is unavailable. 
- """ + """Create a Docling document converter for Phase-1 extraction.""" _ensure_docling_converter_loaded() _ensure_docling_pipeline_loaded() # Enable/disable Docling pipeline timings collection (for benchmarks) @@ -574,171 +561,83 @@ def create_extractor( # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: - if formula_enrichment: + if formula_enrichment or code_enrichment: torch_mod = _maybe_import_torch(force=True) if torch_mod is None: - raise RuntimeError("Torch not available but formula enrichment requested.") + raise RuntimeError("Torch not available but Docling GPU enrichment was requested.") if hasattr(torch_mod, "cuda") and isinstance(getattr(self, "pipeline_options", None), PdfPipelineOptions): dev = getattr(self.pipeline_options, "accelerator_options", None) dv = getattr(dev, "device", None) if (isinstance(dv, str) and dv.lower().startswith("cuda")) and not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") + raise RuntimeError("Torch CUDA not available but Docling GPU enrichment was requested.") except Exception as e: raise RuntimeError(f"Torch CUDA preflight failed: {e}") - # Build PDF pipeline via the canonical builder (preferred) - opts = None - active_backend = DoclingParseV2DocumentBackend - try: - from .ocr.rapidocr.pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - except Exception: # pragma: no cover - adapter fallback - from ._pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - - device_str = self._current_device_str() or "cuda:0" - builder = build_rapidocr_pipeline if enable_ocr else build_layout_pipeline - - try: - _, opts = builder( - device=device_str, - images_scale=float(images_scale), - formula_enrichment=bool(formula_enrichment), - code_enrichment=bool(code_enrichment), - **({"text_score": float(text_score)} if enable_ocr else {}), - ) - - if enable_ocr and hasattr(opts, "ocr_options") and 
getattr(opts, "ocr_options", None) is not None: - if use_cls is not None: - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - if ocr_langs: - setattr(opts.ocr_options, "lang", list(ocr_langs)) # type: ignore[attr-defined] - if force_full_page_ocr is not None: - setattr(opts.ocr_options, "force_full_page_ocr", bool(force_full_page_ocr)) # type: ignore[attr-defined] - + if enable_ocr: try: - setattr(opts, "images_scale", float(images_scale)) + self._log.warning( + "Docling Phase-1 OCR is no longer supported. " + "Ignoring enable_ocr/force_full_page_ocr; use Corpus.ocr(backend='deepseek') instead." + ) except Exception: pass - self._active_pdf_options = opts - self._current_ocr_enabled = bool(enable_ocr) - - # Create a multi-format DocumentConverter using the built PDF options - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - if opts is None: - opts = self.pipeline_options - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=opts, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), - InputFormat.XML_JATS: XMLJatsFormatOption(), - InputFormat.HTML: HTMLFormatOption(), - InputFormat.PPTX: PowerpointFormatOption(), - InputFormat.CSV: CsvFormatOption(), - InputFormat.MD: MarkdownFormatOption(), - }, - ) - self._active_pdf_backend = active_backend + active_backend = DoclingParseV2DocumentBackend + device_str = self._current_device_str() or "cuda:0" + _, opts = build_layout_pipeline( + 
device=device_str, + images_scale=float(images_scale), + formula_enrichment=bool(formula_enrichment), + code_enrichment=bool(code_enrichment), + ) + try: + opts.do_ocr = False + setattr(opts, "images_scale", float(images_scale)) except Exception: - # Fallback to legacy inline configuration path - if enable_ocr: - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError( - "RapidOCR ONNX models/keys not found. Ensure models exist under glossapi.models/rapidocr or set GLOSSAPI_RAPIDOCR_ONNX_DIR." - ) - langs = ocr_langs or ["el", "en"] - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=langs, - force_full_page_ocr=bool(force_full_page_ocr), - use_det=True, - use_cls=bool(use_cls), - use_rec=True, - text_score=float(text_score), - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - self.pipeline_options.ocr_options = ocr_opts - # Attach core toggles to existing pipeline_options - try: - self.pipeline_options.do_ocr = bool(enable_ocr) - self.pipeline_options.do_formula_enrichment = bool(formula_enrichment) - self.pipeline_options.do_code_enrichment = bool(code_enrichment) - try: - setattr(self.pipeline_options, "images_scale", float(images_scale)) - except Exception: - pass - except Exception: - pass - if not enable_ocr: - try: - setattr(self.pipeline_options, "ocr_options", None) - except Exception: - pass + pass - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - 
format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=self.pipeline_options, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - }, - ) + self._active_pdf_options = opts + self._current_ocr_enabled = False - self._active_pdf_options = self.pipeline_options - self._current_ocr_enabled = bool(enable_ocr) - self._active_pdf_backend = active_backend + pdf_backend = DoclingParseV2DocumentBackend + try: + if getattr(self, "use_pypdfium_backend", False): + pdf_backend = PyPdfiumDocumentBackend + self.pdf_backend_name = "pypdfium" + except Exception: + pdf_backend = DoclingParseV2DocumentBackend + active_backend = pdf_backend + + self.converter = DocumentConverter( + allowed_formats=[ + InputFormat.PDF, + InputFormat.DOCX, + InputFormat.XML_JATS, + InputFormat.HTML, + InputFormat.PPTX, + InputFormat.CSV, + InputFormat.MD, + ], + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=opts, + pipeline_cls=StandardPdfPipeline, + backend=active_backend, + ), + InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), + InputFormat.XML_JATS: XMLJatsFormatOption(), + InputFormat.HTML: HTMLFormatOption(), + InputFormat.PPTX: PowerpointFormatOption(), + InputFormat.CSV: CsvFormatOption(), + InputFormat.MD: MarkdownFormatOption(), + }, + ) + self._active_pdf_backend = active_backend # Record last configuration for reuse try: self._last_extractor_cfg = self._cfg_signature( - enable_ocr=enable_ocr, + enable_ocr=False, force_full_page_ocr=force_full_page_ocr, text_score=text_score, images_scale=images_scale, diff --git a/src/glossapi/ocr/__init__.py b/src/glossapi/ocr/__init__.py index bb167c4..df79456 100644 --- a/src/glossapi/ocr/__init__.py +++ b/src/glossapi/ocr/__init__.py @@ -1,7 +1,7 @@ """Lightweight OCR backend package. Exports minimal, import-safe helpers for OCR backends. 
Heavy -dependencies (vLLM, transformers, PyMuPDF) are imported lazily +dependencies (transformers, PyMuPDF) are imported lazily inside the specific backend functions so importing this package does not require GPU stacks or model weights. """ @@ -12,17 +12,14 @@ __all__ = [ "deepseek", - "rapidocr", "math", "utils", "deepseek_runner", - "rapidocr_dispatch", ] -_SUBPACKAGES = {"deepseek", "rapidocr", "math", "utils"} +_SUBPACKAGES = {"deepseek", "math", "utils"} _ALIASES = { "deepseek_runner": "glossapi.ocr.deepseek.runner", - "rapidocr_dispatch": "glossapi.ocr.rapidocr.dispatch", } diff --git a/src/glossapi/ocr/deepseek/__init__.py b/src/glossapi/ocr/deepseek/__init__.py index 5326c42..a5fb1ca 100644 --- a/src/glossapi/ocr/deepseek/__init__.py +++ b/src/glossapi/ocr/deepseek/__init__.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR backend with a lightweight stub fallback.""" +"""DeepSeek OCR backend.""" from .runner import run_for_files from . import preflight diff --git a/src/glossapi/ocr/deepseek/preflight.py b/src/glossapi/ocr/deepseek/preflight.py index 76810e6..6669707 100644 --- a/src/glossapi/ocr/deepseek/preflight.py +++ b/src/glossapi/ocr/deepseek/preflight.py @@ -1,17 +1,16 @@ -"""Preflight checks for the DeepSeek OCR CLI environment.""" +"""Preflight checks for the DeepSeek OCR environment.""" from __future__ import annotations import dataclasses import os -import shutil import sys from pathlib import Path from typing import Dict, Iterable, List, Optional -DEFAULT_SCRIPT = Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" -DEFAULT_MODEL_DIR = Path.cwd() / "deepseek-ocr" / "DeepSeek-OCR" -DEFAULT_LIB_DIR = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_MODEL_DIR = REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2" @dataclasses.dataclass(frozen=True) @@ -46,9 +45,6 @@ def summarize(self) -> str: def 
_ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[Path]: - if not path: - errors.append(CheckResult(label, False, "Not provided")) - return None if not path.exists(): errors.append(CheckResult(label, False, f"Missing at {path}")) return None @@ -58,38 +54,45 @@ def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[ def check_deepseek_env( env: Optional[Dict[str, str]] = None, *, - check_flashinfer: bool = True, + check_torch: bool = True, ) -> PreflightReport: - """Validate DeepSeek CLI prerequisites without running the model.""" + """Validate DeepSeek OCR prerequisites without running the model.""" env = dict(env or os.environ) errors: List[CheckResult] = [] warnings: List[CheckResult] = [] infos: List[CheckResult] = [] - allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" + allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") == "1" + allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1" if not allow_cli: - warnings.append( + errors.append( CheckResult( "allow_cli", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 to force the real CLI.", + "DeepSeek OCR requires the real CLI/runtime. Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1.", ) ) if allow_stub: - warnings.append( + errors.append( CheckResult( "allow_stub", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 to fail instead of falling back to stub output.", + "Stub execution is no longer supported. 
Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0.", ) ) - script = Path(env.get("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT") or DEFAULT_SCRIPT) - _ensure_path(script, "vllm_script", errors) + script = Path( + env.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT") + or DEFAULT_SCRIPT + ) + _ensure_path(script, "runner_script", errors) - python_bin = Path(env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") or sys.executable) + python_bin = Path( + env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") + or env.get("GLOSSAPI_DEEPSEEK_PYTHON") + or sys.executable + ) _ensure_path(python_bin, "deepseek_python", errors) model_dir = Path( @@ -99,7 +102,7 @@ def check_deepseek_env( ) model_dir = _ensure_path(model_dir, "model_dir", errors) if model_dir: - has_weights = any(model_dir.glob("*.safetensors")) or (model_dir / "model-00001-of-000001.safetensors").exists() + has_weights = any(model_dir.glob("*.safetensors")) has_config = (model_dir / "config.json").exists() if not has_weights or not has_config: errors.append( @@ -110,34 +113,21 @@ def check_deepseek_env( ) ) - ld_path_env = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - lib_dir = Path(ld_path_env) if ld_path_env else DEFAULT_LIB_DIR - _ensure_path(lib_dir, "ld_library_path", errors) - - cc1plus_path = shutil.which("cc1plus", path=env.get("PATH", "")) - if not cc1plus_path: - errors.append( - CheckResult( - "cc1plus", - False, - "C++ toolchain missing (cc1plus not on PATH); install g++ and ensure PATH includes gcc's cc1plus.", - ) - ) - else: - infos.append(CheckResult("cc1plus", True, f"Found at {cc1plus_path}")) - - if check_flashinfer: + if check_torch: try: - import flashinfer # type: ignore + import torch # type: ignore - infos.append(CheckResult("flashinfer", True, f"flashinfer {flashinfer.__version__} import ok")) + infos.append(CheckResult("torch", True, f"torch {torch.__version__} import ok")) + if not torch.cuda.is_available(): + warnings.append(CheckResult("cuda", False, "Torch CUDA is not available.")) except Exception as exc: # pragma: no cover - depends on env - 
errors.append(CheckResult("flashinfer", False, f"flashinfer import failed: {exc}")) + errors.append(CheckResult("torch", False, f"torch import failed: {exc}")) return PreflightReport(errors=errors, warnings=warnings, infos=infos) def main(argv: Optional[Iterable[str]] = None) -> int: + del argv report = check_deepseek_env() summary = report.summarize() if summary: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py new file mode 100644 index 0000000..0e0e868 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -0,0 +1,188 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files.""" + +from __future__ import annotations + +import argparse +import json +import re +import tempfile +from pathlib import Path +from typing import Iterable, List + +import fitz +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +PROMPT = "\n<|grounding|>Convert the document to markdown. 
" +PAGE_SPLIT = "\n<--- Page Split --->\n" + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--max-pages", type=int, default=None) + parser.add_argument("--device", default="cuda") + parser.add_argument("--content-debug", action="store_true") + return parser.parse_args() + + +def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: + if files: + return [(input_dir / name).resolve() for name in files] + return sorted(input_dir.glob("*.pdf")) + + +def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: + images: List[Image.Image] = [] + doc = fitz.open(pdf_path) + try: + page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) + zoom = 144 / 72.0 + matrix = fitz.Matrix(zoom, zoom) + for idx in range(page_count): + page = doc[idx] + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + images.append(img) + finally: + doc.close() + return images + + +def _clean_markdown(text: str) -> str: + text = (text or "").replace("<|end▁of▁sentence|>", "").strip() + pattern = re.compile(r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL) + matches = pattern.findall(text) + for full_match, label, _coords in matches: + if label == "image": + text = text.replace(full_match, "") + else: + text = text.replace(full_match, "") + return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() + + +def _load_model(model_dir: Path, device: str): + attn_impl = "flash_attention_2" + try: + import flash_attn # noqa: F401 + except Exception: + attn_impl = "eager" + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) + model = 
AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + if device.startswith("cuda"): + model = model.eval().to(device).to(torch.bfloat16) + else: + model = model.eval().to(device) + return tokenizer, model + + +def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: + result = model.infer( + tokenizer, + prompt=PROMPT, + image_file=str(image_path), + output_path=str(output_dir), + base_size=1024, + image_size=768, + crop_mode=True, + save_results=False, + eval_mode=True, + ) + return _clean_markdown(str(result)) + + +def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + progress_dir = output_dir / "sidecars" / "ocr_progress" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + progress_dir.mkdir(parents=True, exist_ok=True) + (md_dir / f"{stem}.md").write_text(markdown.strip() + "\n", encoding="utf-8") + metrics = { + "page_count": page_count, + "model": "deepseek-ai/DeepSeek-OCR-2", + } + (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") + partial_path = progress_dir / f"{stem}.partial.md" + if partial_path.exists(): + partial_path.unlink() + + +def _write_progress( + output_dir: Path, + stem: str, + page_outputs: List[str], + total_pages: int, + completed_pages: int, +) -> None: + """Emit lightweight progress artifacts during long OCR runs.""" + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + progress_dir = output_dir / "sidecars" / "ocr_progress" + metrics_dir.mkdir(parents=True, exist_ok=True) + progress_dir.mkdir(parents=True, exist_ok=True) + partial_markdown = PAGE_SPLIT.join(page_outputs).strip() + if partial_markdown: + (progress_dir / f"{stem}.partial.md").write_text(partial_markdown + "\n", encoding="utf-8") + 
progress = { + "completed_pages": completed_pages, + "total_pages": total_pages, + "status": "running" if completed_pages < total_pages else "complete", + "model": "deepseek-ai/DeepSeek-OCR-2", + } + (metrics_dir / f"{stem}.progress.json").write_text( + json.dumps(progress, indent=2), + encoding="utf-8", + ) + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + pdfs = _iter_pdfs(input_dir, args.files) + if not pdfs: + return 0 + + tokenizer, model = _load_model(model_dir, args.device) + + for pdf_path in pdfs: + images = _render_pages(pdf_path, args.max_pages) + page_outputs: List[str] = [] + total_pages = len(images) + _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) + with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + for idx, image in enumerate(images): + page_png = tmp_dir / f"page_{idx + 1:04d}.png" + image.save(page_png, format="PNG") + page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + if args.content_debug: + page_text = f"\n{page_text}".strip() + page_outputs.append(page_text) + _write_progress( + output_dir, + pdf_path.stem, + page_outputs, + total_pages, + idx + 1, + ) + markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" + _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index d68f05c..2568665 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR runner with stub and optional CLI dispatch.""" +"""DeepSeek OCR runner.""" from __future__ import annotations @@ -17,6 +17,8 @@ _pypdfium2 = None LOGGER = 
logging.getLogger(__name__) +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" def _page_count(pdf_path: Path) -> int: @@ -32,12 +34,13 @@ def _run_cli( input_dir: Path, output_dir: Path, *, + files: List[str], + model_dir: Path, python_bin: Optional[Path], script: Path, max_pages: Optional[int], content_debug: bool, - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, + device: Optional[str], ) -> None: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -47,78 +50,62 @@ def _run_cli( str(input_dir), "--output-dir", str(output_dir), + "--model-dir", + str(model_dir), ] + if files: + cmd += ["--files", *files] if max_pages is not None: cmd += ["--max-pages", str(max_pages)] if content_debug: cmd.append("--content-debug") - if gpu_memory_utilization is not None: - cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)] - if disable_fp8_kv: - cmd.append("--no-fp8-kv") + if device: + cmd += ["--device", str(device)] env = os.environ.copy() if shutil.which("cc1plus", path=env.get("PATH", "")) is None: - # FlashInfer JIT (via vLLM) needs a C++ toolchain; add a known cc1plus location if missing. 
for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): - env["PATH"] = f"{candidate.parent}:{env.get('PATH','')}" + env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" break ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: - env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH','')}" + env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" - LOGGER.info("Running DeepSeek CLI: %s", " ".join(cmd)) + LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments -def _run_one_pdf(pdf_path: Path, md_out: Path, metrics_out: Path, cfg: Dict[str, Any]) -> Dict[str, Any]: - """Stub processor for a single PDF.""" - page_count = _page_count(pdf_path) - max_pages = cfg.get("max_pages") - if max_pages is not None and page_count: - page_count = min(page_count, max_pages) - - md_lines = [ - f"# DeepSeek OCR (stub) — {pdf_path.name}", - "", - f"Pages: {page_count if page_count else 'unknown'}", - ] - if cfg.get("content_debug"): - md_lines.append("") - md_lines.append("") - md_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("\n".join(md_lines) + "\n", encoding="utf-8") - - metrics = {"page_count": page_count} - metrics_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - return metrics - - def run_for_files( self_ref: Any, files: Iterable[str], *, - model_dir: Optional[Path] = None, # kept for API compatibility + model_dir: Optional[Path] = None, output_dir: Optional[Path] = None, - log_dir: Optional[Path] = None, # unused placeholder to mirror rapidocr + log_dir: Optional[Path] = None, # kept for API compatibility max_pages: Optional[int] = None, - allow_stub: bool = True, - allow_cli: bool = False, + allow_stub: bool = False, # ignored after stub removal; kept for compatibility + allow_cli: bool = True, # ignored after stub removal; kept for 
compatibility python_bin: Optional[Path] = None, vllm_script: Optional[Path] = None, content_debug: bool = False, persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved - device: Optional[str] = None, # reserved - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, + device: Optional[str] = None, + gpu_memory_utilization: Optional[float] = None, # reserved + disable_fp8_kv: bool = False, # reserved **_: Any, ) -> Dict[str, Any]: - """Run DeepSeek OCR for the provided files. + """Run DeepSeek OCR for the provided files.""" + + requested_stub = bool(allow_stub) + del log_dir, allow_stub, allow_cli, persist_engine, precision + del gpu_memory_utilization, disable_fp8_kv - Returns a mapping of stem -> minimal metadata (page_count). - """ + if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": + raise RuntimeError( + "DeepSeek stub execution has been removed. " + "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." 
+ ) file_list = [str(f) for f in files or []] if not file_list: @@ -131,67 +118,63 @@ def run_for_files( md_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True) - env_allow_stub = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" - env_allow_cli = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - - use_cli = allow_cli or env_allow_cli - use_stub = allow_stub and env_allow_stub - - script_path = Path(vllm_script) if vllm_script else Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" - # Optional GPU memory utilization override (env wins over kwarg) - env_gpu_mem = os.environ.get("GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION") - gpu_mem_fraction = gpu_memory_utilization - if env_gpu_mem: - try: - gpu_mem_fraction = float(env_gpu_mem) - except Exception: - gpu_mem_fraction = gpu_memory_utilization - disable_fp8_kv = disable_fp8_kv or os.environ.get("GLOSSAPI_DEEPSEEK_NO_FP8_KV") == "1" - - if use_cli and script_path.exists(): - try: - _run_cli( - input_root, - out_root, - python_bin=python_bin, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, - gpu_memory_utilization=gpu_mem_fraction, - disable_fp8_kv=disable_fp8_kv, - ) - results: Dict[str, Any] = {} - for name in file_list: - pdf_path = (input_root / name).resolve() - stem = Path(name).stem - md_path = md_dir / f"{stem}.md" - metrics_path = metrics_dir / f"{stem}.metrics.json" - if not md_path.exists() or not md_path.read_text(encoding="utf-8").strip(): - placeholder = [ - f"# DeepSeek OCR — {pdf_path.name}", - "", - "[[Blank page]]", - ] - md_path.parent.mkdir(parents=True, exist_ok=True) - md_path.write_text("\n".join(placeholder) + "\n", encoding="utf-8") - page_count = _page_count(pdf_path) - if not metrics_path.exists(): - metrics_path.parent.mkdir(parents=True, exist_ok=True) - metrics_path.write_text(json.dumps({"page_count": page_count}, indent=2), encoding="utf-8") - results[stem] = {"page_count": page_count} - return results - 
except Exception as exc: - if not use_stub: - raise - LOGGER.warning("DeepSeek CLI failed (%s); falling back to stub output", exc) - - cfg = {"max_pages": max_pages, "content_debug": content_debug} + model_root = Path( + model_dir + or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR", "") + or (REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2") + ) + if not model_root.exists(): + raise FileNotFoundError( + "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." + ) + + script_path = Path( + vllm_script + or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") + or DEFAULT_SCRIPT + ) + if not script_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") + + python_exe = Path( + python_bin + or os.environ.get("GLOSSAPI_DEEPSEEK_PYTHON", "") + or os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", "") + or sys.executable + ) + if not python_exe.exists(): + raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") + + _run_cli( + input_dir=input_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) + results: Dict[str, Any] = {} for name in file_list: pdf_path = (input_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" - results[stem] = _run_one_pdf(pdf_path, md_path, metrics_path, cfg) + if not md_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR did not produce markdown for {name}: {md_path}") + if not md_path.read_text(encoding="utf-8").strip(): + raise RuntimeError(f"DeepSeek OCR produced empty markdown for {name}: {md_path}") + page_count = _page_count(pdf_path) + if metrics_path.exists(): + try: + results[stem] = json.loads(metrics_path.read_text(encoding="utf-8")) + continue + except Exception: + pass + results[stem] = {"page_count": page_count} + 
metrics_path.write_text(json.dumps(results[stem], indent=2), encoding="utf-8") return results diff --git a/src/glossapi/ocr/docling/__init__.py b/src/glossapi/ocr/docling/__init__.py new file mode 100644 index 0000000..28d4b0a --- /dev/null +++ b/src/glossapi/ocr/docling/__init__.py @@ -0,0 +1,5 @@ +"""Docling PDF pipeline helpers used by GlossAPI.""" + +from .pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"] diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py new file mode 100644 index 0000000..aea64fd --- /dev/null +++ b/src/glossapi/ocr/docling/pipeline.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import Tuple + +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + LayoutOptions, + PdfPipelineOptions, + PictureDescriptionApiOptions, + TableFormerMode, + TableStructureOptions, +) + + +def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: + """Return accelerator options and whether CUDA was requested.""" + dev = device or "cuda:0" + if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): + acc = AcceleratorOptions(device=dev) + want_cuda = dev.lower().startswith("cuda") + else: + want_cuda = str(dev).lower().startswith("cuda") + acc = AcceleratorOptions( + device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU + ) + return acc, want_cuda + + +def _apply_common_pdf_options( + *, + acc: AcceleratorOptions, + images_scale: float, + formula_enrichment: bool, + code_enrichment: bool, +) -> PdfPipelineOptions: + table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) + try: + if hasattr(table_opts, "do_cell_matching"): + table_opts.do_cell_matching = True + except Exception: + pass + + opts = PdfPipelineOptions( + accelerator_options=acc, + layout_options=LayoutOptions(), + do_ocr=False, + do_table_structure=True, + do_formula_enrichment=bool(formula_enrichment), + 
do_code_enrichment=bool(code_enrichment), + force_backend_text=False, + generate_parsed_pages=False, + table_structure_options=table_opts, + allow_external_plugins=True, + ) + try: + if hasattr(opts, "do_picture_description"): + opts.do_picture_description = False + if getattr(opts, "picture_description_options", None) is None: + opts.picture_description_options = PictureDescriptionApiOptions() + if hasattr(opts, "enable_remote_services"): + opts.enable_remote_services = False + except Exception: + pass + try: + setattr(opts, "images_scale", images_scale) + except Exception: + pass + return opts + + +def build_layout_pipeline( + *, + device: str = "cuda:0", + images_scale: float = 1.25, + formula_enrichment: bool = False, + code_enrichment: bool = False, +) -> Tuple[object, PdfPipelineOptions]: + """Create a Docling layout-only PDF pipeline.""" + + acc, _ = _resolve_accelerator(device) + opts = _apply_common_pdf_options( + acc=acc, + images_scale=float(images_scale), + formula_enrichment=formula_enrichment, + code_enrichment=code_enrichment, + ) + + try: + from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + except Exception: # pragma: no cover + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + + pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] + return pipeline, opts diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py new file mode 100644 index 0000000..ef85950 --- /dev/null +++ b/src/glossapi/ocr/docling_pipeline.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Tuple + +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + LayoutOptions, + PdfPipelineOptions, + PictureDescriptionApiOptions, + TableFormerMode, + TableStructureOptions, +) + + +def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: + """Return accelerator options and whether CUDA was 
requested.""" + dev = device or "cuda:0" + if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): + acc = AcceleratorOptions(device=dev) + want_cuda = dev.lower().startswith("cuda") + else: + want_cuda = str(dev).lower().startswith("cuda") + acc = AcceleratorOptions( + device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU + ) + return acc, want_cuda + + +def build_layout_pipeline( + *, + device: str = "cuda:0", + images_scale: float = 1.25, + formula_enrichment: bool = False, + code_enrichment: bool = False, +) -> Tuple[object, PdfPipelineOptions]: + """Build the Docling PDF pipeline used for Phase-1 extraction.""" + + table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) + try: + if hasattr(table_opts, "do_cell_matching"): + table_opts.do_cell_matching = True + except Exception: + pass + + acc, _ = _resolve_accelerator(device) + opts = PdfPipelineOptions( + accelerator_options=acc, + layout_options=LayoutOptions(), + do_ocr=False, + do_table_structure=True, + do_formula_enrichment=bool(formula_enrichment), + do_code_enrichment=bool(code_enrichment), + force_backend_text=False, + generate_parsed_pages=False, + table_structure_options=table_opts, + allow_external_plugins=True, + ) + try: + if hasattr(opts, "do_picture_description"): + opts.do_picture_description = False + if getattr(opts, "picture_description_options", None) is None: + opts.picture_description_options = PictureDescriptionApiOptions() + if hasattr(opts, "enable_remote_services"): + opts.enable_remote_services = False + except Exception: + pass + try: + setattr(opts, "images_scale", float(images_scale)) + except Exception: + pass + + try: + from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + except Exception: # pragma: no cover + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + + pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] + return pipeline, opts + + +__all__ = 
["build_layout_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/__init__.py b/src/glossapi/ocr/rapidocr/__init__.py deleted file mode 100644 index c0d1232..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""RapidOCR subpackage with lazy re-exports.""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - -__all__ = [ - "dispatch", - "docling_pipeline", - "pool", - "safe", - "onnx", - "_paths", - "pipeline", -] - - -def __getattr__(name: str) -> Any: - if name in __all__: - return import_module(f"glossapi.ocr.rapidocr.{name}") - raise AttributeError(name) - - -def __dir__() -> list[str]: - return sorted(set(globals().keys()) | set(__all__)) diff --git a/src/glossapi/ocr/rapidocr/__init__.py.backup b/src/glossapi/ocr/rapidocr/__init__.py.backup deleted file mode 100644 index 865f119..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py.backup +++ /dev/null @@ -1,6 +0,0 @@ -"""RapidOCR subpackage (shim).""" - -from __future__ import annotations - -__all__ = ["dispatch"] - diff --git a/src/glossapi/ocr/rapidocr/_paths.py b/src/glossapi/ocr/rapidocr/_paths.py deleted file mode 100644 index 4c1cc2a..0000000 --- a/src/glossapi/ocr/rapidocr/_paths.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Tuple -import importlib -import os - - -@dataclass -class ResolvedOnnx: - det: Optional[str] - rec: Optional[str] - cls: Optional[str] - keys: Optional[str] - - -def _find_first(base: Path, patterns: list[str]) -> Optional[str]: - for pat in patterns: - for p in base.rglob(pat): - if p.is_file(): - return str(p) - return None - - -def _resolve_packaged_cls_fallback() -> Optional[str]: - try: - rapidocr = importlib.import_module("rapidocr") - base = Path(rapidocr.__file__).resolve().parent / "models" - pref = base / "ch_ppocr_mobile_v2.0_cls_infer.onnx" - if pref.exists(): - return 
str(pref) - return _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - except Exception: - return None - - -def resolve_packaged_onnx_and_keys() -> ResolvedOnnx: - """Locate ONNX det/rec/cls and Greek keys packaged with the glossapi package. - - Search order: - 1) GLOSSAPI_RAPIDOCR_ONNX_DIR (env var) with heuristic file names - 2) Under the installed glossapi package folder `models/` and common subfolders - 3) CLS only: fallback to RapidOCR’s bundled cls model if missing - """ - # 1) Explicit override directory - override = os.getenv("GLOSSAPI_RAPIDOCR_ONNX_DIR") - if override: - base = Path(override) - det = _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - if det or rec or cls or keys: - return ResolvedOnnx(det, rec, cls, keys) - - # 2) Search inside installed glossapi package - try: - glossapi = importlib.import_module("glossapi") - pkg_root = Path(glossapi.__file__).resolve().parent - # Candidate asset directories inside the package - candidates = [ - pkg_root / "models", - pkg_root / "models" / "rapidocr", - pkg_root / "models" / "rapidocr" / "onnx", - pkg_root / "models" / "rapidocr" / "keys", - pkg_root / "resources", - pkg_root / "assets", - pkg_root / "data", - ] - det = rec = cls = keys = None - for base in candidates: - if not base.exists(): - continue - det = det or _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = rec or _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = cls or _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = keys or 
_find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - - if cls is None: - cls = _resolve_packaged_cls_fallback() - return ResolvedOnnx(det, rec, cls, keys) - except Exception: - return ResolvedOnnx(None, None, _resolve_packaged_cls_fallback(), None) - - -def summarize_resolution() -> Tuple[bool, str]: - r = resolve_packaged_onnx_and_keys() - ok = bool(r.det and r.rec and r.cls and r.keys) - msg = f"det={bool(r.det)} rec={bool(r.rec)} cls={bool(r.cls)} keys={bool(r.keys)}" - return ok, msg - diff --git a/src/glossapi/ocr/rapidocr/dispatch.py b/src/glossapi/ocr/rapidocr/dispatch.py deleted file mode 100644 index 7deeba2..0000000 --- a/src/glossapi/ocr/rapidocr/dispatch.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -from typing import Iterable, Optional - - -def run_via_extract( - corpus, - files: Iterable[str], - *, - export_doc_json: bool = False, - internal_debug: bool = False, - content_debug: Optional[bool] = None, -) -> None: - """Thin adapter that forwards to Corpus.extract for RapidOCR/Docling. - - This exists for symmetry with deepseek_runner and to keep the OCR package - as the single entry point for OCR backends. - """ - # Note: internal_debug/content_debug are no-ops for the Docling/RapidOCR path. - # Docling's output already produces a single concatenated Markdown document. 
- corpus.extract( - input_format="pdf", - num_threads=1, # let extract decide; override in tests if needed - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=list(files), - skip_existing=False, - export_doc_json=bool(export_doc_json), - emit_formula_index=bool(export_doc_json), - phase1_backend="docling", - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py b/src/glossapi/ocr/rapidocr/docling_pipeline.py deleted file mode 100644 index bb8988f..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi.ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if torch_mod is not None: - return torch_mod - try: - return 
importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or 
"cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: only enforce CUDA provider when device requests 
CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else 
None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", 
time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = 
int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - 
m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": 
int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = 
int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup b/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup deleted file mode 100644 index f80344d..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi._rapidocr_paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if 
torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi._pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, 
RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: 
only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec 
else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] 
%s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 
0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: 
str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), 
"page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - 
CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/onnx.py b/src/glossapi/ocr/rapidocr/onnx.py deleted file mode 100644 index 57430d1..0000000 --- a/src/glossapi/ocr/rapidocr/onnx.py +++ /dev/null @@ -1,105 +0,0 @@ -"""OCR helpers for GlossAPI using Docling + RapidOCR (ONNXRuntime). - -GPU-first OCR that auto-discovers packaged ONNX models and Greek keys within -the installed `glossapi` package. Designed as a drop-in for Corpus.ocr(). -""" -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Dict, Any, Tuple - -_PIPELINE_CACHE: dict[str, Tuple[object, object]] = {} - - -def _build_pipeline( - device: Optional[str] = None, - *, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, -): - # Delegate to canonical builder to avoid duplication - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline - - engine, opts = build_rapidocr_pipeline( - device=(device or "cuda:0"), - text_score=(0.45 if text_score is None else float(text_score)), - images_scale=(1.25 if images_scale is None else float(images_scale)), - formula_enrichment=False, - code_enrichment=False, - ) - # Apply use_cls override if requested - try: - if use_cls is not None and hasattr(opts, "ocr_options"): - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - except Exception: - pass - return engine, opts - - -def run_rapidocr_onnx( - pdf_path: Path | str, - *, - device: Optional[str] = None, - use_cls: Optional[bool] = None, - 
text_score: Optional[float] = None, - images_scale: Optional[float] = None, - max_pages: Optional[int] = None, -) -> Dict[str, Any]: - """Run Docling + RapidOCR (ONNX) OCR on a PDF and return markdown text. - - Returns - ------- - dict with keys: - - markdown_text: str - - duration_s: float - - pages: int - - models: dict with file names of det/rec/cls/keys - """ - from time import perf_counter - pdf_p = Path(pdf_path) - if not pdf_p.exists(): - raise FileNotFoundError(pdf_p) - - key = str(device or "cuda:0").lower() - cached = _PIPELINE_CACHE.get(key) - if cached is None: - pipe, r = _build_pipeline(device=device, use_cls=use_cls, text_score=text_score, images_scale=images_scale) - _PIPELINE_CACHE[key] = (pipe, r) - else: - pipe, r = cached # type: ignore[misc] - - t0 = perf_counter() - conv = pipe.convert(source=str(pdf_p)) # type: ignore[attr-defined] - doc = conv.document - md_text = doc.export_to_markdown() - duration = perf_counter() - t0 - - # Attempt to get page count from conv/document - pages = 0 - try: - if hasattr(doc, "pages"): - pages = len(doc.pages) # type: ignore[attr-defined] - except Exception: - pages = 0 - - # Return model identifiers as file names only (no full paths) - import os as _os - models = { - "det": _os.path.basename(r.det) if r.det else None, - "rec": _os.path.basename(r.rec) if r.rec else None, - "cls": _os.path.basename(r.cls) if r.cls else None, - "keys": _os.path.basename(r.keys) if r.keys else None, - } - - return { - "markdown_text": md_text or "", - "duration_s": duration, - "pages": int(pages), - "models": models, - } - - -__all__ = [ - "run_rapidocr_onnx", -] diff --git a/src/glossapi/ocr/rapidocr/pipeline.py b/src/glossapi/ocr/rapidocr/pipeline.py deleted file mode 100644 index a623c3d..0000000 --- a/src/glossapi/ocr/rapidocr/pipeline.py +++ /dev/null @@ -1,229 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Tuple - -from docling.datamodel.base_models import InputFormat -from 
docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PictureDescriptionApiOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import DocumentConverter, PdfFormatOption - -from ._paths import resolve_packaged_onnx_and_keys -from .pool import GLOBAL_RAPID_OCR_POOL -from .safe import SafeRapidOcrModel, patch_docling_rapidocr - -_logger = logging.getLogger(__name__) - -patch_docling_rapidocr() - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def _apply_common_pdf_options( - *, - acc: AcceleratorOptions, - images_scale: float, - formula_enrichment: bool, - code_enrichment: bool, -) -> PdfPipelineOptions: - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - # Prefer lightweight placeholder picture descriptions to avoid heavy VLM backends. 
- try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - return opts - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Builder for a Docling PDF pipeline without RapidOCR. - - Returns ``(converter, PdfPipelineOptions)`` where ``converter`` is a - ``StandardPdfPipeline`` configured for layout extraction only. - """ - - acc, _ = _resolve_accelerator(device) - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts - - -def build_rapidocr_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Canonical builder for Docling + RapidOCR pipeline. - - Returns a tuple (engine, PdfPipelineOptions). Prefers explicit RapidOCR injection - when supported; otherwise returns a DocumentConverter using the factory path. - """ - - def _fallback_layout(reason: str) -> Tuple[object, PdfPipelineOptions]: - _logger.warning( - "RapidOCR pipeline fallback: %s. 
Using Docling layout-only configuration.", - reason, - ) - pipeline, opts = build_layout_pipeline( - device=device, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - return pipeline, opts - - acc, want_cuda = _resolve_accelerator(device) - - # Optional provider preflight only when CUDA requested - if want_cuda: - try: - import onnxruntime as ort # type: ignore - - prov = ort.get_available_providers() - if "CUDAExecutionProvider" not in prov: - raise RuntimeError(f"CUDAExecutionProvider not available: {prov}") - except Exception as e: # pragma: no cover - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - return _fallback_layout("packaged RapidOCR ONNX assets missing") - - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=["el", "en"], - force_full_page_ocr=False, - use_det=True, - use_cls=False, - use_rec=True, - text_score=text_score, - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - opts.do_ocr = True - opts.ocr_options = ocr_opts - - # Prefer explicit injection of RapidOCR model when available - try: - from docling.models.rapid_ocr_model import RapidOcrModel # type: ignore - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - import inspect - - sig = inspect.signature(StandardPdfPipeline.__init__) - if "ocr_model" not in sig.parameters: - raise RuntimeError("Docling build does not support RapidOCR injection") - - def _factory(): - try: - return 
SafeRapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - except Exception: # pragma: no cover - # Fall back to the stock implementation if our wrapper misbehaves. - return RapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - - pooled_model = GLOBAL_RAPID_OCR_POOL.get( - str(acc.device), - ocr_opts, - _factory, - expected_type=SafeRapidOcrModel, - ) - pipeline = StandardPdfPipeline(opts, ocr_model=pooled_model) # type: ignore - return pipeline, opts - except Exception as exc: - _logger.warning( - "RapidOCR injection unavailable (%s); using DocumentConverter factory path.", - exc, - ) - - # Fallback: use DocumentConverter factory - try: - converter = DocumentConverter( - format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} - ) - return converter, opts - except Exception as exc: - return _fallback_layout(f"DocumentConverter failed: {exc}") - - -__all__ = ["build_layout_pipeline", "build_rapidocr_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/pool.py b/src/glossapi/ocr/rapidocr/pool.py deleted file mode 100644 index db1e8f2..0000000 --- a/src/glossapi/ocr/rapidocr/pool.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Shared RapidOCR engine pooling utilities.""" -from __future__ import annotations - -from dataclasses import dataclass -from threading import Lock -from typing import Callable, Dict, Optional, Union, Type - -from docling.datamodel.pipeline_options import RapidOcrOptions - - -@dataclass(frozen=True) -class _PoolKey: - device: str - det_model_path: str - rec_model_path: str - cls_model_path: str - lang: Tuple[str, ...] 
- text_score: float - use_det: bool - use_cls: bool - use_rec: bool - - -class RapidOcrEnginePool: - """Process-local cache of RapidOCR models keyed by configuration.""" - - def __init__(self) -> None: - self._lock = Lock() - self._cache: Dict[_PoolKey, object] = {} - - def _make_key(self, device: str, opts: RapidOcrOptions) -> _PoolKey: - lang = tuple(opts.lang or []) - return _PoolKey( - device=str(device), - det_model_path=str(getattr(opts, "det_model_path", "")), - rec_model_path=str(getattr(opts, "rec_model_path", "")), - cls_model_path=str(getattr(opts, "cls_model_path", "")), - lang=lang, - text_score=float(getattr(opts, "text_score", 0.0)), - use_det=bool(getattr(opts, "use_det", True)), - use_cls=bool(getattr(opts, "use_cls", False)), - use_rec=bool(getattr(opts, "use_rec", True)), - ) - - def get( - self, - device: str, - opts: RapidOcrOptions, - factory: Callable[[], object], - *, - expected_type: Optional[Union[Type[object], tuple[Type[object], ...]]] = None, - ) -> object: - key = self._make_key(device, opts) - with self._lock: - model = self._cache.get(key) - if expected_type is not None and model is not None and not isinstance(model, expected_type): - self._cache.pop(key, None) - model = None - if model is None: - model = factory() - if expected_type is None or isinstance(model, expected_type): - self._cache[key] = model - return model - - def clear(self) -> None: - with self._lock: - self._cache.clear() - - -GLOBAL_RAPID_OCR_POOL = RapidOcrEnginePool() - -__all__ = ["RapidOcrEnginePool", "GLOBAL_RAPID_OCR_POOL"] diff --git a/src/glossapi/ocr/rapidocr/safe.py b/src/glossapi/ocr/rapidocr/safe.py deleted file mode 100644 index 5534563..0000000 --- a/src/glossapi/ocr/rapidocr/safe.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Temporary wrappers around Docling's RapidOCR integration. - -The upstream Docling release (2.48.x) does not tolerate RapidOCR returning -``None`` for a given crop. 
That bubbles up as an AttributeError inside the -conversion loop and the entire document fails. Until Docling includes a fix, we -wrap the loader so that ``None`` simply means "no detections" and processing -continues. Once Docling ships a release with the guard we can drop this shim and -revert to the vanilla ``RapidOcrModel``. -""" - -from __future__ import annotations - -import importlib.util -import sys -from collections.abc import Iterable -from pathlib import Path -from typing import Optional, Type - -import numpy - -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrOptions, RapidOcrOptions -from docling.models.rapid_ocr_model import RapidOcrModel as _RapidOcrModel -from docling.models.rapid_ocr_model import TextCell, _log -from docling.utils.profiling import TimeRecorder -from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle - -from ._paths import resolve_packaged_onnx_and_keys - - -class SafeRapidOcrModel(_RapidOcrModel): - """Drop-in RapidOCR wrapper that copes with ``None`` OCR results. - - Docling 2.48.0 assumes ``self.reader`` always returns an object with - ``boxes/txts/scores``. RapidOCR occasionally yields ``None`` for problematic - crops, which crashes the extractor. We normalise the return value before the - original list(zip(...)) call and treat anything unexpected as "no boxes". - Remove this once Docling hardens the upstream implementation. - """ - - # NOTE: keep signature identical so StandardPdfPipeline can instantiate it. 
- _rapidocr_available: Optional[bool] = None - - def __init__( - self, - enabled: bool, - artifacts_path: Optional[Path], - options: RapidOcrOptions, - accelerator_options, - ): - rapidocr_available = self._rapidocr_available - if rapidocr_available is None: - rapidocr_available = bool( - importlib.util.find_spec("rapidocr") is not None or "rapidocr" in sys.modules - ) - SafeRapidOcrModel._rapidocr_available = rapidocr_available - - effective_enabled = bool(enabled and rapidocr_available) - if enabled and not rapidocr_available: - _log.warning( - "RapidOCR python package not found; continuing with Docling pipeline OCR disabled." - ) - - if effective_enabled: - try: - resolved = resolve_packaged_onnx_and_keys() - - _log.warning( - 'SafeRapidOcrModel initial options: det=%s rec=%s cls=%s keys=%s', - getattr(options, 'det_model_path', None), - getattr(options, 'rec_model_path', None), - getattr(options, 'cls_model_path', None), - getattr(options, 'rec_keys_path', None), - ) - - if resolved.det: - options.det_model_path = resolved.det - if resolved.rec: - options.rec_model_path = resolved.rec - if resolved.cls: - options.cls_model_path = resolved.cls - if resolved.keys: - options.rec_keys_path = resolved.keys - - try: - from rapidocr.ch_ppocr_rec import main as _rapidocr_rec_main - - if not getattr(_rapidocr_rec_main.TextRecognizer, '_glossapi_patch', False): - original_get_character_dict = _rapidocr_rec_main.TextRecognizer.get_character_dict - - def _patched_get_character_dict(self, cfg): - try: - current_keys = cfg.get('keys_path', None) - current_rec_keys = cfg.get('rec_keys_path', None) - if current_rec_keys is None and current_keys is not None: - cfg['rec_keys_path'] = current_keys - _log.warning('Patched RapidOCR cfg: set rec_keys_path from keys_path=%s', current_keys) - else: - _log.warning('Patched RapidOCR cfg: existing rec_keys_path=%s keys_path=%s', current_rec_keys, current_keys) - except Exception: - _log.warning('RapidOCR cfg inspection failed', 
exc_info=True) - return original_get_character_dict(self, cfg) - - _rapidocr_rec_main.TextRecognizer.get_character_dict = _patched_get_character_dict - _rapidocr_rec_main.TextRecognizer._glossapi_patch = True - except Exception: - _log.warning('Failed to patch RapidOCR TextRecognizer for keys fallback', exc_info=True) - - _log.warning( - 'SafeRapidOcrModel using packaged assets: det=%s rec=%s cls=%s keys=%s', - options.det_model_path, - options.rec_model_path, - options.cls_model_path, - options.rec_keys_path, - ) - except Exception: - _log.warning( - 'SafeRapidOcrModel bootstrap failed to resolve packaged assets', - exc_info=True, - ) - - super().__init__( - enabled=effective_enabled, - artifacts_path=artifacts_path, - options=options, - accelerator_options=accelerator_options, - ) - - @classmethod - def get_options_type(cls) -> Type[OcrOptions]: - return RapidOcrOptions - - def _normalise_result(self, result): - """Return an iterable of (bbox, text, score) triples. - - RapidOCR returns ``None`` or semi-populated structures in some corner - cases. We swallow those and log a one-line warning so the page still - progresses through the pipeline. 
- """ - - if result is None: - _log.warning("RapidOCR returned None; skipping crop") - return [] - boxes = getattr(result, "boxes", None) - txts = getattr(result, "txts", None) - scores = getattr(result, "scores", None) - if boxes is None or txts is None or scores is None: - _log.warning("RapidOCR returned incomplete data; treating crop as empty") - return [] - try: - return list(zip(boxes.tolist(), txts, scores)) - except Exception as exc: # pragma: no cover - defensive only - _log.warning("RapidOCR result normalisation failed: %s", exc) - return [] - - def __call__( - self, conv_res: ConversionResult, page_batch: Iterable[Page] - ) -> Iterable[Page]: - if not self.enabled: - yield from page_batch - return - - for page in page_batch: - assert page._backend is not None - if not page._backend.is_valid(): - yield page - continue - - with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - raw_result = self.reader( - im, - use_det=self.options.use_det, - use_cls=self.options.use_cls, - use_rec=self.options.use_rec, - ) - result = self._normalise_result(raw_result) - del high_res_image - del im - - if not result: - continue - - cells = [ - TextCell( - index=ix, - text=line[1], - orig=line[1], - confidence=line[2], - from_ocr=True, - rect=BoundingRectangle.from_bounding_box( - BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ) - ), - ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) - - self.post_process_cells(all_ocr_cells, page) - - from docling.datamodel.settings import settings - - if settings.debug.visualize_ocr: - 
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) - - yield page - - -def patch_docling_rapidocr() -> bool: - """Replace Docling's RapidOcrModel with the safe shim if available.""" - - try: - import docling.models.rapid_ocr_model as rapid_module - except Exception: # pragma: no cover - Docling missing - return False - - current = getattr(rapid_module, "RapidOcrModel", None) - if current is SafeRapidOcrModel: - return False - - rapid_module.RapidOcrModel = SafeRapidOcrModel - try: - from docling.models.factories import get_ocr_factory # type: ignore - import logging - except Exception: - return True - - try: - factory = get_ocr_factory() - options_type = SafeRapidOcrModel.get_options_type() - - if hasattr(factory, "classes"): - factory.classes[options_type] = SafeRapidOcrModel - elif hasattr(factory, "_classes"): - factory._classes[options_type] = SafeRapidOcrModel - logging.getLogger(__name__).info( - "Registered SafeRapidOcrModel for %s", options_type - ) - try: - from docling.pipeline import standard_pdf_pipeline as _std_pdf # type: ignore - from docling.datamodel.pipeline_options import RapidOcrOptions # type: ignore - from functools import lru_cache - except Exception as _exc: # pragma: no cover - best effort - logging.getLogger(__name__).warning( - "Docling factory patch limited to local mutation: %s", _exc - ) - else: - original_get_factory = getattr( - _std_pdf.get_ocr_factory, "__wrapped__", _std_pdf.get_ocr_factory - ) - - def _ensure_safe(factory_obj): - try: - current = factory_obj.classes.get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - factory_obj.classes[RapidOcrOptions] = SafeRapidOcrModel - except AttributeError: - current = getattr(factory_obj, "_classes", {}).get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - getattr(factory_obj, "_classes", {})[RapidOcrOptions] = SafeRapidOcrModel - return factory_obj - - @lru_cache(maxsize=None) - def _patched_get_ocr_factory(allow_external_plugins: bool = False): - return 
_ensure_safe(original_get_factory(allow_external_plugins)) - - _patched_get_ocr_factory.__wrapped__ = original_get_factory # type: ignore[attr-defined] - _std_pdf.get_ocr_factory = _patched_get_ocr_factory # type: ignore[attr-defined] - try: - _ensure_safe(_std_pdf.get_ocr_factory(False)) - except Exception: - pass - except Exception as exc: # pragma: no cover - best effort - import logging - - logging.getLogger(__name__).warning( - "Failed to re-register SafeRapidOcrModel: %s", exc - ) - return True - - -__all__ = ["SafeRapidOcrModel", "patch_docling_rapidocr"] diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 29db5be..424d359 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -50,12 +50,6 @@ def make_corpus(tmp_path): return Corpus(input_dir=input_dir, output_dir=output_dir) -def set_onnx_providers(monkeypatch, providers): - stub = SimpleNamespace(get_available_providers=lambda: providers) - monkeypatch.setitem(sys.modules, "onnxruntime", stub) - return stub - - def set_torch_stub(monkeypatch, *, available: bool, device_count: int): cuda_ns = SimpleNamespace( is_available=lambda: available, @@ -70,8 +64,7 @@ def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() - set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CPUExecutionProvider"]) + set_torch_stub(monkeypatch, available=False, device_count=0) with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( @@ -81,7 +74,7 @@ def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): phase1_backend="docling", ) - assert "CUDAExecutionProvider" in str(exc.value) + assert "Torch CUDA is not available" in str(exc.value) def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -89,8 +82,6 @@ def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch corpus.extractor = 
DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", @@ -106,8 +97,6 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CPU", @@ -125,8 +114,6 @@ def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatc corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=2) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CUDA", @@ -147,8 +134,6 @@ def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypa corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", diff --git a/tests/test_deepseek_preflight.py b/tests/test_deepseek_preflight.py index 1900a2b..73e761d 100644 --- a/tests/test_deepseek_preflight.py +++ b/tests/test_deepseek_preflight.py @@ -1,5 +1,4 @@ import sys -from pathlib import Path from glossapi.ocr.deepseek.preflight import check_deepseek_env @@ -9,45 +8,34 @@ def test_preflight_reports_missing_components(tmp_path): "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "0", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "1", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": str(tmp_path / "missing_python"), - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(tmp_path / "missing_script.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(tmp_path / "missing_script.py"), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(tmp_path / "missing_model"), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(tmp_path / 
"missing_lib"), - "PATH": str(tmp_path), # no cc1plus here } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) names = {c.name for c in report.errors} + assert "allow_cli" in names + assert "allow_stub" in names assert "deepseek_python" in names - assert "vllm_script" in names + assert "runner_script" in names assert "model_dir" in names - assert "ld_library_path" in names - assert "cc1plus" in names assert not report.ok def test_preflight_passes_with_complete_env(tmp_path): - script = tmp_path / "run_pdf_ocr_vllm.py" + script = tmp_path / "run_pdf_ocr_transformers.py" script.write_text("#!/usr/bin/env python3\n", encoding="utf-8") - model_dir = tmp_path / "DeepSeek-OCR" + model_dir = tmp_path / "DeepSeek-OCR-2" model_dir.mkdir() (model_dir / "config.json").write_text("{}", encoding="utf-8") (model_dir / "model-00001-of-000001.safetensors").write_bytes(b"stub") - lib_dir = tmp_path / "libjpeg" - lib_dir.mkdir() - fake_bin = tmp_path / "bin" - fake_bin.mkdir() - cc1plus = fake_bin / "cc1plus" - cc1plus.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") - cc1plus.chmod(0o755) env = { "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "1", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "0", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": sys.executable, - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(script), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(script), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(model_dir), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(lib_dir), - "PATH": str(fake_bin), } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) assert report.ok assert not report.errors diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py new file mode 100644 index 0000000..a5a93e4 --- /dev/null +++ b/tests/test_deepseek_runner_contract.py @@ -0,0 +1,62 @@ +from pathlib import Path + +import pandas as pd +import pytest + + +def _mk_corpus(tmp_path: Path): + from glossapi 
import Corpus + + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_deepseek_backend_rejects_stub_mode(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + df = pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ) + parquet_path = dl_dir / "download_results.parquet" + df.to_parquet(parquet_path, index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%real\n") + + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") + + with pytest.raises(RuntimeError, match="stub execution has been removed"): + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + updated = pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc[fname, "ocr_success"]) is False + assert bool(updated.loc[fname, "needs_ocr"]) is True + + +def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _write_outputs, _write_progress + + output_dir = tmp_path / "output" + _write_progress( + output_dir=output_dir, + stem="doc", + page_outputs=["page one"], + total_pages=5, + completed_pages=1, + ) + + canonical_markdown = output_dir / "markdown" / "doc.md" + progress_markdown = output_dir / "sidecars" / "ocr_progress" / "doc.partial.md" + progress_json = output_dir / "json" / "metrics" / "doc.progress.json" + + assert not canonical_markdown.exists() + assert progress_markdown.exists() + assert progress_json.exists() + + _write_outputs(output_dir=output_dir, stem="doc", markdown="final", page_count=5) + + assert canonical_markdown.exists() + assert canonical_markdown.read_text(encoding="utf-8") == "final\n" + assert not progress_markdown.exists() diff --git a/tests/test_deepseek_runner_stub.py b/tests/test_deepseek_runner_stub.py deleted file mode 100644 index 
aee5177..0000000 --- a/tests/test_deepseek_runner_stub.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -import pandas as pd - - -def _mk_corpus(tmp_path: Path): - from glossapi import Corpus - - root = tmp_path / "corpus" - root.mkdir() - return Corpus(input_dir=root, output_dir=root) - - -def test_deepseek_backend_stub_runs_and_updates_parquet(tmp_path, monkeypatch): - corpus = _mk_corpus(tmp_path) - - # Seed a minimal metadata parquet with one bad file - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - fname = "doc.pdf" - df = pd.DataFrame( - [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] - ) - parquet_path = dl_dir / "download_results.parquet" - df.to_parquet(parquet_path, index=False) - - # Create an empty placeholder file for the PDF - (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") - - # Monkeypatch the runner internal to avoid heavy imports - from glossapi.ocr.deepseek import runner - - def fake_run_one(pdf_path, md_out, metrics_out, cfg): - md_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("deepseek stub output\n", encoding="utf-8") - metrics_out.write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") - return {"page_count": 1} - - monkeypatch.setattr(runner, "_run_one_pdf", fake_run_one) - - # Run OCR via dispatcher - corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) - - # Artifacts exist - stem = "doc" - md = corpus.output_dir / "markdown" / f"{stem}.md" - metrics = corpus.output_dir / "json" / "metrics" / f"{stem}.metrics.json" - assert md.exists(), "Markdown output should be created by deepseek stub" - assert metrics.exists(), "Metrics JSON should be created by deepseek stub" - - # Parquet updated - updated = pd.read_parquet(parquet_path).set_index("filename") - row = updated.loc[fname] - assert bool(row["ocr_success"]) is True - assert 
bool(row["needs_ocr"]) is False - # extraction_mode is optional; if present assert value - if "extraction_mode" in updated.columns: - assert updated.loc[fname, "extraction_mode"] == "deepseek" diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index 0419ba4..6c410c5 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -11,7 +11,7 @@ def _mk_corpus(tmp_path: Path): return Corpus(input_dir=root, output_dir=root) -def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): +def test_deepseek_ocr_then_math_only_smoke(tmp_path, monkeypatch): corpus = _mk_corpus(tmp_path) # Two PDFs: one needs OCR, one does not (for math-only later) @@ -28,7 +28,7 @@ def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) - # DeepSeek stub for OCR + # DeepSeek runner is stubbed here only to avoid the heavy model during unit tests. from glossapi.ocr.deepseek import runner def fake_run_for_files(self_ref, files, **kwargs): @@ -45,7 +45,7 @@ def fake_run_for_files(self_ref, files, **kwargs): # Run DeepSeek OCR for bad files corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=True, mode="ocr_bad_then_math") - # RapidOCR math-only pass: ensure JSON for clean.pdf and run math + # Math-only pass: ensure JSON for clean.pdf and run math json_dir = corpus.output_dir / "json" json_dir.mkdir(parents=True, exist_ok=True) (json_dir / "clean.docling.json").write_text("{}", encoding="utf-8") @@ -58,7 +58,7 @@ def fake_enrich(files=None, **kwargs): monkeypatch.setattr(corpus, "formula_enrich_from_json", fake_enrich) - corpus.ocr(backend="rapidocr", fix_bad=False, math_enhance=True, mode="math_only") + corpus.ocr(backend="deepseek", fix_bad=False, math_enhance=True, mode="math_only") # Verify updated = pd.read_parquet(parquet_path).set_index("filename") diff --git a/tests/test_ocr_dispatch_backends.py 
b/tests/test_ocr_dispatch_backends.py index 965692b..3779d07 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -51,29 +51,7 @@ def fail_math(*args, **kwargs): assert calls.get("files") == [fname] -def test_rapidocr_backend_routes_to_extract_with_docling(tmp_path, monkeypatch): +def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) - - # Seed minimal metadata parquet that flags a single file for OCR - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - df = pd.DataFrame([ - {"filename": "doc.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False} - ]) - df.to_parquet(dl_dir / "download_results.parquet", index=False) - - captured = {} - - def fake_extract(**kwargs): - captured.update(kwargs) - return None - - monkeypatch.setattr(corpus, "extract", fake_extract) - - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False, use_gpus="single", devices=[0]) - - assert captured, "Expected extract() to be called for rapidocr backend" - assert captured.get("force_ocr") is True - assert captured.get("phase1_backend") == "docling" - files = captured.get("filenames") or [] - assert files and files[0] == "doc.pdf" + with pytest.raises(ValueError, match="backend must be 'deepseek'"): + corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) diff --git a/tests/test_ocr_imports.py b/tests/test_ocr_imports.py index 3487619..094e72b 100644 --- a/tests/test_ocr_imports.py +++ b/tests/test_ocr_imports.py @@ -8,32 +8,19 @@ def test_import_ocr_package_is_lightweight(): import glossapi.ocr as ocr assert hasattr(ocr, "deepseek") - assert hasattr(ocr, "rapidocr") # New subpackages remain importable lazily import glossapi.ocr.deepseek.runner as deepseek_runner - import glossapi.ocr.rapidocr.dispatch as rapid_dispatch assert ocr.deepseek.runner is deepseek_runner - assert ocr.rapidocr.dispatch is rapid_dispatch assert ocr.deepseek_runner is 
deepseek_runner - assert ocr.rapidocr_dispatch is rapid_dispatch assert hasattr(deepseek_runner, "run_for_files") - assert hasattr(rapid_dispatch, "run_via_extract") # Utilities module always available (pure Python) from glossapi.ocr.utils import json_io as utils_json assert hasattr(utils_json, "export_docling_json") - if importlib.util.find_spec("docling") is not None: - try: - from glossapi.ocr.rapidocr import pool as rapid_pool - except ModuleNotFoundError: - pytest.skip("Docling optional dependencies not available") - else: - assert hasattr(rapid_pool, "GLOBAL_RAPID_OCR_POOL") - if importlib.util.find_spec("docling_core") is not None: try: from glossapi.ocr.math import enrich_from_docling_json, RoiEntry diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py index 4fe7464..7dae1b7 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -1,4 +1,5 @@ import os +import sys from pathlib import Path import pandas as pd @@ -7,10 +8,6 @@ pytest.importorskip("docling") pytest.importorskip("glossapi_rs_cleaner") -pytest.importorskip( - "onnxruntime", reason="RapidOCR/DeepSeek end-to-end tests require onnxruntime" -) -import onnxruntime as ort # noqa: E402 from glossapi import Corpus from glossapi.corpus import _resolve_skiplist_path @@ -106,11 +103,8 @@ def _assert_dir_contents( pytest.fail(f"Unexpected file {entry} in {root}") -@pytest.mark.rapidocr -def test_pipeline_smoke_and_artifacts(tmp_path): +def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for pipeline smoke test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -146,6 +140,21 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert bool(needs.get("blank.pdf")), "Blank PDF should be flagged for OCR" assert not bool(needs.get("text.pdf")) + from glossapi.ocr.deepseek 
import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + (markdown_dir / f"{stem}.md").write_text("[[Blank page]]\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( mode="ocr_bad", use_gpus="single", @@ -193,15 +202,8 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert sections_file.exists() -@pytest.mark.rapidocr def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - - assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -256,6 +258,25 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert bool(greek_row["needs_ocr"]), "Greek consonant doc should require OCR rerun" assert "non_greek_text" in str(greek_row.get("filter", "")), "Filter should record non-Greek text" + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for 
name in files: + stem = Path(name).stem + if stem == "greek_consonants": + text = documents["greek_consonants"] + else: + text = documents.get(stem) or "[[Blank page]]" + (markdown_dir / f"{stem}.md").write_text(f"{text}\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( fix_bad=True, math_enhance=True, @@ -268,6 +289,15 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not bool(greek_after["needs_ocr"]), "Greek consonant doc should be resolved after OCR rerun" assert bool(greek_after.get("ocr_success", False)), "OCR rerun should mark greek consonant doc as success" + corpus.ocr( + backend="deepseek", + fix_bad=False, + math_enhance=True, + mode="math_only", + use_gpus="single", + devices=[device_idx], + ) + json_dir = corpus_dir / "json" assert json_dir.exists(), "Docling JSON directory should exist after extraction" for stem in documents: @@ -304,11 +334,8 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not skiplist_path.read_text(encoding="utf-8").strip(), "Fatal skip-list should remain empty" -@pytest.mark.rapidocr def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for OCR recovery test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -384,8 +411,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): script = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py", + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + Path.cwd() / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py", 
) ) if not script.exists(): @@ -393,8 +420,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): python_bin = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_TEST_PYTHON", - Path("/mnt/data/glossAPI/deepseek_venv/bin/python"), + "GLOSSAPI_DEEPSEEK_PYTHON", + os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", sys.executable), ) ) if not python_bin.exists(): @@ -409,29 +436,17 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): if not model_dir.exists(): pytest.skip(f"DeepSeek model directory missing: {model_dir}") - lib_path = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - if not lib_path: - candidate = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" - if candidate.exists(): - lib_path = str(candidate) - if not lib_path or not Path(lib_path).exists(): - pytest.skip("Set GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH to the libjpeg-turbo library directory") - - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - device_idx = 0 if torch.cuda.device_count() > 1: device_idx = torch.cuda.current_device() - # Force the CLI path (no stub fallback) and point to the desired interpreter/script. + # Force the real runner path and point to the desired interpreter/script. 
monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", str(python_bin)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", str(script)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH", lib_path) - monkeypatch.setenv("VLLM_ALLOW_REMOTE_CODE", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", str(script)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) existing_py_path = os.environ.get("PYTHONPATH", "") src_path = str(Path.cwd() / "src") if existing_py_path: @@ -439,13 +454,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): else: monkeypatch.setenv("PYTHONPATH", src_path) - import glossapi.ocr.deepseek.runner as deepseek_runner - - def _raise_if_stub(*_args, **_kwargs): - raise AssertionError("DeepSeek fallback stub should not run in CLI smoke test") - - monkeypatch.setattr(deepseek_runner, "_run_one_pdf", _raise_if_stub) - corpus_dir = tmp_path / "corpus" corpus_dir.mkdir() diff --git a/tests/test_rapidocr_patch.py b/tests/test_rapidocr_patch.py deleted file mode 100644 index 93a8ca5..0000000 --- a/tests/test_rapidocr_patch.py +++ /dev/null @@ -1,368 +0,0 @@ -import importlib -import sys -import types -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pytest - - -def _clear_modules(prefix: str) -> None: - for name in list(sys.modules): - if name == prefix or name.startswith(f"{prefix}."): - sys.modules.pop(name, None) - - -def _install_docling_stub(*, supports_injection: bool) -> None: - _clear_modules("docling") - _clear_modules("docling_core") - _clear_modules("glossapi") - - def register(name: str) -> types.ModuleType: - module = types.ModuleType(name) - sys.modules[name] = module - return module - - docling = register("docling") - 
register("docling.backend") - register("docling.backend.docling_parse_backend").DoclingParseDocumentBackend = object - register("docling.backend.docling_parse_v2_backend").DoclingParseV2DocumentBackend = object - register("docling.backend.pypdfium2_backend").PyPdfiumDocumentBackend = object - - base_models = register("docling.datamodel.base_models") - - class InputFormat: - PDF = "pdf" - DOCX = "docx" - XML_JATS = "xml" - HTML = "html" - PPTX = "pptx" - CSV = "csv" - MD = "md" - - class ConversionStatus: - SUCCESS = "success" - PARTIAL_SUCCESS = "partial" - - class Page: - def __init__(self): - self._backend = types.SimpleNamespace( - is_valid=lambda: True, - get_page_image=lambda *args, **kwargs: types.SimpleNamespace() - ) - - base_models.InputFormat = InputFormat - base_models.ConversionStatus = ConversionStatus - base_models.Page = Page - - pipeline_opts = register("docling.datamodel.pipeline_options") - - class AcceleratorDevice: - AUTO = "auto" - CUDA = "cuda" - MPS = "mps" - CPU = "cpu" - - class AcceleratorOptions: - def __init__(self, num_threads=None, device=None): - self.num_threads = num_threads - self.device = device - - class PdfPipelineOptions: - def __init__(self, **_kwargs): - self.ocr_options = None - self.do_ocr = False - - class RapidOcrOptions: - def __init__(self, **kwargs): - for key, value in kwargs.items(): - setattr(self, key, value) - self.rec_keys_path = None - - class OcrOptions: - pass - - class LayoutOptions: - pass - - class TableStructureOptions: - def __init__(self, mode=None): - self.mode = mode - self.do_cell_matching = False - - class TableFormerMode: - ACCURATE = "accurate" - - class PictureDescriptionApiOptions: - pass - - pipeline_opts.AcceleratorDevice = AcceleratorDevice - pipeline_opts.AcceleratorOptions = AcceleratorOptions - pipeline_opts.PdfPipelineOptions = PdfPipelineOptions - pipeline_opts.RapidOcrOptions = RapidOcrOptions - pipeline_opts.OcrOptions = OcrOptions - pipeline_opts.LayoutOptions = LayoutOptions - 
pipeline_opts.TableStructureOptions = TableStructureOptions - pipeline_opts.TableFormerMode = TableFormerMode - pipeline_opts.PictureDescriptionApiOptions = PictureDescriptionApiOptions - - register("docling.datamodel.document").ConversionResult = object - - settings_mod = register("docling.datamodel.settings") - - class _Debug: - def __init__(self): - self.profile_pipeline_timings = False - self.visualize_ocr = False - - class _Settings: - def __init__(self): - self.debug = _Debug() - - settings_mod.settings = _Settings() - - converter_mod = register("docling.document_converter") - - class DocumentConverter: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - class PdfFormatOption: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - converter_mod.DocumentConverter = DocumentConverter - converter_mod.PdfFormatOption = PdfFormatOption - converter_mod.WordFormatOption = object - converter_mod.HTMLFormatOption = object - converter_mod.XMLJatsFormatOption = object - converter_mod.PowerpointFormatOption = object - converter_mod.MarkdownFormatOption = object - converter_mod.CsvFormatOption = object - - register("docling.pipeline.simple_pipeline").SimplePipeline = object - - pipelines_mod = register("docling.pipelines.standard_pdf_pipeline") - pipeline_mod = register("docling.pipeline.standard_pdf_pipeline") - - if supports_injection: - class StandardPdfPipeline: - def __init__(self, opts, ocr_model=None, **_): - self.opts = opts - self.ocr_model = ocr_model - else: - class StandardPdfPipeline: - def __init__(self, opts, **_): - self.opts = opts - - pipelines_mod.StandardPdfPipeline = StandardPdfPipeline - pipeline_mod.StandardPdfPipeline = StandardPdfPipeline - - rapid_module = register("docling.models.rapid_ocr_model") - - class DummyReader: - def __call__(self, *_args, **_kwargs): - return [] - - class RapidOcrModel: - def __init__(self, enabled, artifacts_path, options, accelerator_options): - 
self.enabled = enabled - self.reader = DummyReader() - self.options = options - - def get_ocr_rects(self, _page): - return [] - - def post_process_cells(self, _cells, _page): - pass - - class TextCell: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - class _Log: - @staticmethod - def warning(_msg, *args, **kwargs): - return None - - rapid_module.RapidOcrModel = RapidOcrModel - rapid_module.TextCell = TextCell - rapid_module._log = _Log() - - utils_mod = register("docling.utils") - profiling_mod = register("docling.utils.profiling") - - class TimeRecorder: - def __init__(self, *_args, **_kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - profiling_mod.TimeRecorder = TimeRecorder - utils_mod.profiling = profiling_mod - - register("docling.models") - - core_doc = register("docling_core.types.doc") - - class BoundingBox: - @staticmethod - def from_tuple(coord, origin=None): - return SimpleNamespace(coord=coord, origin=origin) - - class CoordOrigin: - TOPLEFT = "topleft" - - core_doc.BoundingBox = BoundingBox - core_doc.CoordOrigin = CoordOrigin - - core_page = register("docling_core.types.doc.page") - - class BoundingRectangle: - @staticmethod - def from_bounding_box(box): - return box - - core_page.BoundingRectangle = BoundingRectangle - - -def _install_onnxruntime_stub(): - sys.modules['onnxruntime'] = types.SimpleNamespace( - get_available_providers=lambda: ['CUDAExecutionProvider'] - ) - - -def _make_safe_ocr() -> SimpleNamespace: - """Return an instantiated SafeRapidOcrModel with stubbed dependencies.""" - rapid_opts = sys.modules['docling.datamodel.pipeline_options'].RapidOcrOptions() - accel_opts = sys.modules['docling.datamodel.pipeline_options'].AcceleratorOptions(device='cuda:0') - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel - - return SafeRapidOcrModel(enabled=True, artifacts_path=None, options=rapid_opts, accelerator_options=accel_opts) - - -@pytest.fixture(autouse=True) 
-def _cleanup_modules(): - yield - for name in [n for n in list(sys.modules) if n.startswith('glossapi') and '_rapidocr_paths' not in n]: - if name.startswith('glossapi_rs_'): - continue - sys.modules.pop(name, None) - _clear_modules('docling') - _clear_modules('docling_core') - sys.modules.pop('onnxruntime', None) - - -def test_patch_runs_on_import(): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - rapid_module = sys.modules['docling.models.rapid_ocr_model'] - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel, patch_docling_rapidocr - - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - patch_docling_rapidocr() - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - -def test_build_rapidocr_pipeline_injects_when_supported(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - glossapi_mod = importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - captured = {} - - def fake_pool_get(device, opts, factory, expected_type): - model = factory() - assert isinstance(model, pipeline.SafeRapidOcrModel) - assert expected_type is pipeline.SafeRapidOcrModel - captured['device'] = device - captured['opts'] = opts - return SimpleNamespace() - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fake_pool_get)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - assert hasattr(engine, 'ocr_model') - assert captured['device'] == 'cuda:0' - assert opts.do_ocr is True - - -def test_build_rapidocr_pipeline_falls_back_without_injection(monkeypatch): - _install_docling_stub(supports_injection=False) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - pipeline = 
importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - def fail_pool(*_args, **_kwargs): - raise AssertionError('Pool should not be used when injection unsupported') - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fail_pool)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - converter_mod = importlib.import_module('docling.document_converter') - assert isinstance(engine, converter_mod.DocumentConverter) - assert opts.do_ocr is True - - -def test_safe_rapidocr_normalises_none(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - assert model._normalise_result(None) == [] - - -def test_safe_rapidocr_normalises_incomplete_and_valid_data(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - class IncompleteResult: - boxes = None - txts = ['foo'] - scores = [0.9] - - assert model._normalise_result(IncompleteResult()) == [] - - box = np.array([ - [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], - ]) - - class FullResult: - boxes = box - txts = ['foo'] - scores = [0.9] - - output = model._normalise_result(FullResult()) - assert output == [ - (box[0].tolist(), 'foo', 0.9) - ] From 83f7bf276078059e6665dec3ed6d548881cda8df Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 23:45:29 +0200 Subject: [PATCH 06/93] Add GitHub Pages docs workflow --- .github/workflows/docs.yml | 40 ++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 8 +------- 2 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/docs.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 
index 0000000..6c7fcbd --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Build and Deploy Docs + +on: + push: + branches: + - development + - main + - master + workflow_dispatch: + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install MkDocs + run: | + python -m pip install --upgrade pip + pip install mkdocs mkdocs-material + + - name: Build site + run: mkdocs build --strict + + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site + publish_branch: gh-pages + force_orphan: true diff --git a/mkdocs.yml b/mkdocs.yml index 1776dd5..ba0a1e4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,14 +42,8 @@ nav: - Troubleshooting: troubleshooting.md - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - - Corpus API: api/corpus.md + - Corpus API: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md - - Divio Skeleton: - - Overview: divio/overview.md - - Tutorials: divio/tutorials.md - - How-to Guides: divio/how_to_guides.md - - Reference: divio/reference.md - - Explanation: divio/explanation.md docs_dir: docs markdown_extensions: - admonition From 1bf4261d3a2a0597c37bce0b68c8d0faa773b5e7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 9 Mar 2026 23:54:20 +0200 Subject: [PATCH 07/93] Fix docs links for Pages build --- .github/workflows/docs.yml | 2 +- docs/index.md | 16 +--------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6c7fcbd..4719481 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -26,7 +26,7 @@ jobs: - name: Install MkDocs run: | python -m pip install --upgrade pip - pip install mkdocs mkdocs-material + pip install 
'mkdocs<2' 'mkdocs-material<10' - name: Build site run: mkdocs build --strict diff --git a/docs/index.md b/docs/index.md index d8ec279..01ad63c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,29 +7,15 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Quickstart Recipes](quickstart.md) — common extraction/OCR flows in copy-paste form. - [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. -## Understand the architecture -- [Architecture Overview](architecture/index.md) — the end-to-end staged model and why it exists. -- [Core Design Principles](architecture/core_design_principles.md) — the design constraints that shape the pipeline. -- [Docling Throughput and Batching](architecture/docling_throughput_and_batching.md) — how throughput and stability trade off. -- [Failure Recovery and Skiplist](architecture/docling_failure_recovery_and_skiplist.md) — how the pipeline survives problematic PDFs. -- [Greek Text Validation](architecture/greek_text_validation.md) — why extraction success is not enough for Greek corpora. -- [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. -- [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. -- [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. -- [DeepSeek-Only Upgrade Roadmap](architecture/deepseek_only_upgrade_roadmap.md) — the staged simplification plan for OCR and dependency upgrades. - ## Learn the pipeline - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. 
- [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. -- [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. ## Configure and debug - [Configuration](configuration.md) lists all environment knobs. - [Troubleshooting](troubleshooting.md) captures the most common pitfalls. - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. -- [Compatibility And Regression Matrix](testing/compatibility_matrix.md) defines the release-validation gates for the migration and upgrades. ## Reference -- [Corpus API](api/corpus.md) details public methods and parameters. -- `docs/divio/` contains placeholder pages for the upcoming Divio restructuring—feel free to open PRs fleshing them out. +- [Corpus API](api_corpus_tmp.md) details public methods and parameters. From 79cc99c237ba9ea685e0e94bc349907c26502bf7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 10 Mar 2026 01:31:58 +0200 Subject: [PATCH 08/93] docs: map pipeline concepts to implementation --- README.md | 17 ++++ docs/api/corpus.md | 216 +++++++++++++++++++++++++++++++++++++++++++++ docs/code_map.md | 61 +++++++++++++ docs/index.md | 4 +- docs/pipeline.md | 105 +++++++++++++++++++--- mkdocs.yml | 4 +- 6 files changed, 395 insertions(+), 12 deletions(-) create mode 100644 docs/api/corpus.md create mode 100644 docs/code_map.md diff --git a/README.md b/README.md index e581361..5ad3b1f 100644 --- a/README.md +++ b/README.md @@ -80,11 +80,28 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips. ## Repo Landmarks +- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files. +- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs. - `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown. 
- `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic. - `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus). - `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides. +## Pipeline map + +Use this as the shortest path from a documentation concept to the public call that implements it. + +| Stage | Main call | Important parameters | Writes | +| --- | --- | --- | --- | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags | +| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` | +| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns | +| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL | + ## Contributing - Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check. - Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together. 
diff --git a/docs/api/corpus.md b/docs/api/corpus.md new file mode 100644 index 0000000..40f8c47 --- /dev/null +++ b/docs/api/corpus.md @@ -0,0 +1,216 @@ +# API Reference — `glossapi.Corpus` + +The `Corpus` class is the high‑level entrypoint for the pipeline. Below are the most commonly used methods. + +Use this page as a compact contract reference. For the stage-by-stage artifact view, see `../pipeline.md`. For the source-level ownership map, see `../code_map.md`. + +## Constructor + +```python +glossapi.Corpus( + input_dir: str | Path, + output_dir: str | Path, + section_classifier_model_path: str | Path | None = None, + extraction_model_path: str | Path | None = None, + metadata_path: str | Path | None = None, + annotation_mapping: dict[str, str] | None = None, + downloader_config: dict[str, Any] | None = None, + log_level: int = logging.INFO, + verbose: bool = False, +) +``` + +- `input_dir`: source files (PDF/DOCX/HTML/…) +- `output_dir`: pipeline outputs (markdown, json, sections, …) +- `downloader_config`: defaults for `download()` (e.g., concurrency, cookies) +- Main side effects: creates the standard output folders and lazy-initializes the extractor, sectioner, and classifier. + +## extract() + +```python +extract( + input_format: str = 'all', + num_threads: int | None = None, + accel_type: str = 'CUDA', # 'CPU'|'CUDA'|'MPS'|'Auto' + *, + force_ocr: bool = False, + formula_enrichment: bool = False, + code_enrichment: bool = False, + filenames: list[str] | None = None, + skip_existing: bool = True, + use_gpus: str = 'single', # 'single'|'multi' + devices: list[int] | None = None, + use_cls: bool = False, + benchmark_mode: bool = False, + export_doc_json: bool = True, + emit_formula_index: bool = False, +) -> None +``` + +- Purpose: Phase‑1 extraction from source files into markdown plus optional JSON intermediates. 
+- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout/OCR + - `force_ocr=True`: turn on OCR during extraction + - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `export_doc_json=True`: write `json/<stem>.docling.json(.zst)` + - `emit_formula_index=True`: also write `json/<stem>.formula_index.jsonl` +- Main outputs: + - `markdown/<stem>.md` + - `json/<stem>.docling.json(.zst)` when enabled + - `json/metrics/<stem>.metrics.json` + - `json/metrics/<stem>.per_page.metrics.json` + +## clean() + +```python +clean( + input_dir: str | Path | None = None, + threshold: float = 0.10, + num_threads: int | None = None, + drop_bad: bool = True, +) -> None +``` + +- Purpose: run the Rust cleaner/noise pipeline and decide which documents are safe for downstream processing. +- Typical inputs: + - `markdown/*.md` + - metadata parquet if present +- Important parameters: + - `threshold`: badness threshold + - `drop_bad`: whether to remove bad files from downstream selection + - `empty_char_threshold`, `empty_min_pages`: heuristics for OCR rerun recommendation +- Main outputs: + - `clean_markdown/<stem>.md` + - cleaner report parquet + - updated parquet columns such as `filter`, `needs_ocr`, and metrics fields +- Operational note: this stage is the quality gate that drives `section()` and `ocr()`. 
+ +## ocr() + +```python +ocr( + *, + fix_bad: bool = True, + mode: str | None = None, + device: str | None = None, + model_dir: str | Path | None = None, + max_pages: int | None = None, + persist_engine: bool = True, + limit: int | None = None, + dpi: int | None = None, + precision: str | None = None, + math_enhance: bool = True, + math_targets: dict[str, list[tuple[int,int]]] | None = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = 'single', + devices: list[int] | None = None, + force: bool | None = None, +) -> None +``` + +- Purpose: selective OCR retry and optional Phase‑2 math/code enrichment. +- Mode selection: + - `ocr_bad`: rerun OCR only for cleaner-flagged docs + - `math_only`: run enrichment from existing Docling JSON + - `ocr_bad_then_math`: OCR flagged docs, then enrich them +- Important parameters: + - `mode`, `fix_bad`, `math_enhance` + - `use_gpus`, `devices` + - `math_targets` to restrict enrichment to specific items +- Main outputs: + - refreshed `markdown/<stem>.md` + - refreshed cleaner/parquet metadata after OCR reruns + - `json/<stem>.latex_map.jsonl` when enrichment runs + +## formula_enrich_from_json() + +```python +formula_enrich_from_json( + files: list[str] | None = None, + *, + device: str = 'cuda', + batch_size: int = 8, + dpi_base: int = 220, + targets_by_stem: dict[str, list[tuple[int,int]]] | None = None, +) -> None +``` + +- Purpose: Phase‑2 GPU enrichment from previously exported Docling JSON. 
+- Typical inputs: + - `json/<stem>.docling.json(.zst)` + - optional formula/code index data +- Important parameters: + - `files`: restrict to specific stems + - `device`, `batch_size`, `dpi_base` + - `targets_by_stem`: target specific `(page_no, item_index)` tuples +- Main outputs: + - enriched markdown back into `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` + +## section(), annotate() + +```python +section() -> None +annotate(annotation_type: str = 'text', fully_annotate: bool = True) -> None +``` + +- `section()`: + - purpose: convert markdown into one row per section with structural flags + - inputs: markdown selected by cleaner/parquet metadata + - outputs: `sections/sections_for_annotation.parquet` +- `annotate()`: + - purpose: classify sections and optionally expand them into full document structure + - important parameters: `annotation_type='text'|'chapter'|'auto'`, `fully_annotate` + - outputs: `classified_sections.parquet` and `fully_annotated_sections.parquet` + +## download() + +```python +download( + input_parquet: str | Path, + *, + links_column: str | None = None, + parallelize_by: str | None = None, + verbose: bool | None = None, + **kwargs, +) -> pd.DataFrame +``` + +- Purpose: fetch source files described in a parquet dataset. +- Typical inputs: + - an explicit `input_parquet` + - or the first parquet file found in `input_dir` +- Important parameters: + - `links_column`: override URL column name + - `parallelize_by`: choose grouping for the scheduler + - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` + - returned `pd.DataFrame` with download status and metadata + +## triage_math() + +- Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs. 
+- Inputs: `json/metrics/<stem>.per_page.metrics.json` +- Outputs: updated `download_results` parquet with routing fields such as formula totals and phase recommendation + +## Suggested Reading Order + +1. `download()` if you start from URLs. +2. `extract()` for Phase‑1 layout/markdown. +3. `clean()` to decide what needs OCR. +4. `ocr()` if you need OCR retry or Phase‑2 enrichment. +5. `section()` and `annotate()` for structured downstream outputs. + +--- + +See also: +- Code map: ../code_map.md +- Pipeline overview and artifacts: ../pipeline.md +- Configuration and environment variables: ../configuration.md +- OCR and math enrichment details: ../ocr_and_math_enhancement.md diff --git a/docs/code_map.md b/docs/code_map.md new file mode 100644 index 0000000..97f12d5 --- /dev/null +++ b/docs/code_map.md @@ -0,0 +1,61 @@ +# Code Map + +This page maps the main documentation ideas to the code that implements them. It is +meant to help you move from "what does GlossAPI do?" to "where do I change it?" +without reading the entire repo. + +## Top-Level Entry Points + +| Area | Main code | Responsibility | +| --- | --- | --- | +| Public package entry | `src/glossapi/__init__.py` | Applies the RapidOCR patch on import and exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes. | +| High-level orchestration | `src/glossapi/corpus.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. | + +## Pipeline Stages + +| Stage | Main methods/classes | Notes | +| --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | Supports URL expansion, deduplication, checkpoints, per-domain scheduling, and resume. 
| +| Extract | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` | Handles backend choice, GPU preflight, and single- vs multi-GPU dispatch. | +| Clean / quality gate | `Corpus.clean()` | Runs the Rust cleaner and merges quality metrics back into parquet metadata. | +| OCR retry / math follow-up | `Corpus.ocr()`, `Corpus.formula_enrich_from_json()` | Re-runs OCR only for flagged documents and optionally performs Phase-2 math/code enrichment from JSON. | +| Sectioning | `Corpus.section()`, `GlossSection.to_parquet()` | Converts markdown documents into section rows for later classification. | +| Classification / annotation | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | Runs the SVM classifier and post-processes section labels into final document structure. | +| Export / triage | `Corpus.jsonl()`, `Corpus.triage_math()` | Produces training/export JSONL and computes routing hints for math-dense documents. | + +## Backend and Runtime Helpers + +| File | Responsibility | +| --- | --- | +| `src/glossapi/_pipeline.py` | Canonical builders for layout-only and RapidOCR-backed Docling pipelines. | +| `src/glossapi/rapidocr_safe.py` | Monkey-patch/shim for Docling 2.48.x so problematic OCR crops do not crash whole documents. | +| `src/glossapi/_rapidocr_paths.py` | Resolves packaged RapidOCR ONNX models and Greek keys, with env-var override support. | +| `src/glossapi/ocr_pool.py` | Reuses RapidOCR model instances where possible. | +| `src/glossapi/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/triage.py` | Summarizes per-page formula density and updates parquet routing metadata. | +| `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. 
| + +## Rust Extensions + +| Crate | Path | Purpose | +| --- | --- | --- | +| Cleaner | `rust/glossapi_rs_cleaner` | Markdown cleaning, script/noise filtering, and report generation used by `Corpus.clean()`. | +| Noise metrics | `rust/glossapi_rs_noise` | Fast quality metrics used by the broader pipeline and package build configuration. | + +## Tests To Read First + +| Test | Why it matters | +| --- | --- | +| `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | +| `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | +| `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. | +| `tests/test_rapidocr_patch.py` | Covers the Docling/RapidOCR compatibility patch and fallback paths. | + +## If You Need To Change... + +- Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. +- Phase-1 parsing, OCR selection, or artifact generation: start in `src/glossapi/corpus.py` and `src/glossapi/gloss_extract.py`. +- Docling/RapidOCR wiring or provider issues: start in `src/glossapi/_pipeline.py`, `src/glossapi/rapidocr_safe.py`, and `src/glossapi/_rapidocr_paths.py`. +- Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. +- Output folder contracts or stage sequencing: start in `src/glossapi/corpus.py`. diff --git a/docs/index.md b/docs/index.md index 01ad63c..997d2d8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,6 +8,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. ## Learn the pipeline +- [Code Map](code_map.md) links the main documentation ideas to the classes and files that implement them. 
- [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. @@ -18,4 +19,5 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. ## Reference -- [Corpus API](api_corpus_tmp.md) details public methods and parameters. +- [Corpus API](api/corpus.md) gives the compact contract view of the main public methods. +- [Legacy Corpus API Notes](api_corpus_tmp.md) remains available while the docs are being consolidated. diff --git a/docs/pipeline.md b/docs/pipeline.md index cb11662..cacc8c4 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -6,16 +6,88 @@ GlossAPI is a staged pipeline. You can enter at any stage and use the same folde The `Corpus` class is the stable surface of the project. New functionality should plug into the existing phase mixins so callers can stick to the small set of entrypoints (`download()`, `extract()`, `clean()`, `ocr()`, `section()`, `annotate()`, `export/jsonl*()`). The expected usage pattern is a short script that chains these calls; avoid ad-hoc monkeypatches or bypassing the orchestrator when adding features so downstream users retain resumability and consistent artifacts. 
-## Stages - -- Download (optional): fetch source files from URLs → `downloads/` -- Extract (Phase‑1): parse PDFs to Markdown; optional GPU OCR → `markdown/<stem>.md` -- Clean: compute quality metrics and filter low‑quality items; decide which to OCR -- OCR (compat shim): re‑run extract on filtered items with `force_ocr=True` -- JSON + index (optional): emit `json/<stem>.docling.json(.zst)` and `json/<stem>.formula_index.jsonl` for Phase‑2 -- Enrich (Phase‑2): decode FORMULA/CODE from JSON on GPU → overwrite `markdown/<stem>.md`, write `json/<stem>.latex_map.jsonl` -- Section: produce `sections/sections_for_annotation.parquet` -- Annotate: classify sections; produce `classified_sections.parquet` and `fully_annotated_sections.parquet` +## Stage Map + +| Stage | Main code | Typical inputs | Important parameters | Main outputs | +| --- | --- | --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `devices`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | +| OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, refreshed cleaner/parquet metadata | +| Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/<stem>.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, `math_dpi_base`, 
`targets_by_stem` | updated `markdown/<stem>.md`, `json/<stem>.latex_map.jsonl` | +| Section | `Corpus.section()`, `GlossSection.to_parquet()` | markdown selected by cleaner/parquet | no major public knobs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | section parquet and classifier model | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage / export | `Corpus.triage_math()`, `Corpus.jsonl()` | metrics, parquet metadata, cleaned markdown | output path for JSONL | parquet routing hints, JSONL export | + +## Stage Contracts + +### 1. Download + +- Main code: `Corpus.download()` -> `GlossDownloader.download_files()` +- Purpose: read a metadata parquet, expand list/JSON URL cells, deduplicate URLs, download supported file types, and checkpoint progress. +- Typical inputs: + - a parquet file in `input_dir` or an explicit `input_parquet` + - a URL column such as `url` or `links_column` +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` +- Read this next if you want the scheduler details: `gloss_downloader.py` + +### 2. Extract (Phase‑1) + +- Main code: `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` +- Purpose: convert source files to markdown and optional intermediate JSON artifacts. +- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'` + - `force_ocr=True` to turn on OCR during extraction + - `use_gpus='single'|'multi'` + - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Main outputs: + - canonical markdown in `markdown/<stem>.md` + - optional Docling JSON and index artifacts in `json/` + - per-document and per-page metrics in `json/metrics/` + +### 3. 
Clean + +- Main code: `Corpus.clean()` +- Purpose: run the Rust cleaner, compute quality/noise signals, and decide what should continue downstream. +- Typical inputs: + - `markdown/*.md` + - metadata parquet if one exists +- Important parameters: + - `threshold` and `drop_bad` + - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions +- Main outputs: + - cleaned markdown in `clean_markdown/` + - merged parquet metadata including OCR-related flags + +### 4. OCR Retry and Phase‑2 Enrichment + +- Main code: `Corpus.ocr()` and `Corpus.formula_enrich_from_json()` +- Purpose: + - rerun OCR only for documents marked bad by the cleaner + - optionally decode formula/code regions from Docling JSON into markdown +- Modes: + - `ocr_bad` + - `math_only` + - `ocr_bad_then_math` +- Main outputs: + - refreshed `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` when math/code enrichment runs + +### 5. Section and Annotate + +- Main code: `Corpus.section()`, `GlossSection.to_parquet()`, `Corpus.annotate()`, `GlossSectionClassifier.*` +- Purpose: + - split markdown into sections suitable for classification + - classify sections and optionally expand coarse labels into full document structure +- Main outputs: + - `sections/sections_for_annotation.parquet` + - `classified_sections.parquet` + - `fully_annotated_sections.parquet` ## Artifact Layout @@ -44,6 +116,19 @@ Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). - Metrics lived under `markdown/` in earlier versions; they now live under `json/metrics/`. - When math enrichment cannot recover after the configured number of respawns, the corresponding PDFs and Docling artifacts are copied into the `problematic_math/` folders above and the stems are added to the fatal skip-list for later review. +- The same folder can act as both `input_dir` and `output_dir`; the pipeline creates its own subdirectories under that root. 
+ +## Readability Shortcut + +If you only need the shortest path through the system: + +1. `Corpus.download()` if you start from URLs. +2. `Corpus.extract()` for Phase‑1 markdown. +3. `Corpus.clean()` to decide what needs OCR. +4. `Corpus.ocr()` for selective OCR and optional math/code enrichment. +5. `Corpus.section()` and `Corpus.annotate()` for structured outputs. + +If you need to jump from these ideas to the source files, see `code_map.md`. ## Exporting corpora diff --git a/mkdocs.yml b/mkdocs.yml index ba0a1e4..43b70fa 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,7 +42,9 @@ nav: - Troubleshooting: troubleshooting.md - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: - - Corpus API: api_corpus_tmp.md + - Code Map: code_map.md + - Corpus API: api/corpus.md + - Legacy Corpus API Notes: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md docs_dir: docs markdown_extensions: From 379b8f0ff65817ee481153a14fe35b96039bd22a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 17 Mar 2026 22:55:01 +0200 Subject: [PATCH 09/93] Handle HTML download interstitials --- src/glossapi/gloss_downloader.py | 85 +++++++++++++++++++++ tests/test_gloss_downloader_dynamic_html.py | 53 +++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 tests/test_gloss_downloader_dynamic_html.py diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index f9a7bf2..5afba9c 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -765,6 +765,77 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes # 5) Fall back to URL ext if any, otherwise 'bin' return url_ext if url_ext else 'bin' + + def _url_looks_like_file_endpoint(self, url: str) -> bool: + """Return True when the URL shape suggests a direct file download endpoint.""" + try: + lowered = str(url or "").lower() + except Exception: + return False + hints = ( + ".pdf", + ".docx", + ".pptx", + 
".xml", + ".csv", + "/pdf", + "format=pdf", + "type=pdf", + "download", + "attachment", + "/file", + "getfile.php", + ) + return any(token in lowered for token in hints) + + def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: bytes) -> Optional[str]: + """ + Detect HTML challenge/viewer pages that should not count as successful downloads. + + We still allow regular HTML documents, but fail fast on common interstitials + such as WAF challenge pages and JavaScript-only document viewers. + """ + try: + lower_headers = {str(k).lower(): str(v).lower() for k, v in (headers or {}).items()} + lower_body = (content or b"")[: 1 << 17].decode("utf-8", errors="ignore").lower() + except Exception: + lower_headers = {} + lower_body = "" + + if not lower_body: + return None + + if ( + "x-amzn-waf-action" in lower_headers + or "awswafintegration" in lower_body + or "challenge.js" in lower_body + or "verify that you're not a robot" in lower_body + ): + return ( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ) + + viewer_markers = ( + "fliphtml5_pages", + "monitor:player:html5", + "javascript/loadingjs.js", + "javascript/main.js", + "bookconfig.totalpagecount", + "getfile.php?lib=", + ) + viewer_hits = sum(1 for marker in viewer_markers if marker in lower_body) + if viewer_hits >= 2: + return ( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ) + + content_type = lower_headers.get("content-type", "") + if self._url_looks_like_file_endpoint(url) and "text/html" in content_type: + return "Expected a file-like response but received HTML instead" + + return None async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], rate_limiter: RateLimiter, retry_count: int = 0, @@ -916,6 +987,15 @@ async def download_file(self, row_index: int, url: str, semaphore: 
Optional[asyn await f.write(chunk) # Infer extension using URL, headers and first bytes file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, bytes(head)) + if html_issue: + try: + os.remove(tmp_path) + except Exception: + pass + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count if not self.is_supported_format(file_ext): # Clean up temp and report try: @@ -946,6 +1026,11 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn session, requester, url, headers, timeout ) file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count if not self.is_supported_format(file_ext): self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count diff --git a/tests/test_gloss_downloader_dynamic_html.py b/tests/test_gloss_downloader_dynamic_html.py new file mode 100644 index 0000000..a1bd678 --- /dev/null +++ b/tests/test_gloss_downloader_dynamic_html.py @@ -0,0 +1,53 @@ +from glossapi.gloss_downloader import GlossDownloader + + +def test_detects_waf_challenge_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + headers = { + "Content-Type": "text/html; charset=UTF-8", + "x-amzn-waf-action": "challenge", + } + body = b""" + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "challenge page" in error.lower() + + +def test_detects_js_document_viewer_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b""" + + + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "document viewer" in error.lower() + + +def test_regular_html_document_is_still_allowed(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://example.org/article" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b"""Article +

Normal HTML document

Body text.

""" + + assert downloader.infer_file_extension(url, headers, body) == "html" + assert downloader._detect_html_interstitial(url, headers, body) is None From aca4dbb695dfc83d70e03887491df82e0efc4fdd Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 18 Mar 2026 01:26:18 +0200 Subject: [PATCH 10/93] Add browser-gated download mode --- README.md | 61 ++- docs/api/corpus.md | 23 + docs/stages/download.md | 25 ++ install_glossapi.py | 23 + pyproject.toml | 4 + src/glossapi/__init__.py | 4 + src/glossapi/corpus/phase_download.py | 35 +- src/glossapi/download_policy.py | 125 ++++++ src/glossapi/gloss_browser_downloader.py | 415 +++++++++++++++++ src/glossapi/gloss_downloader.py | 545 ++++++++++++++--------- src/glossapi/scripts/install_glossapi.py | 230 ++++++++++ tests/test_browser_gloss_downloader.py | 297 ++++++++++++ tests/test_install_glossapi.py | 51 +++ 13 files changed, 1618 insertions(+), 220 deletions(-) create mode 100644 install_glossapi.py create mode 100644 src/glossapi/download_policy.py create mode 100644 src/glossapi/gloss_browser_downloader.py create mode 100644 src/glossapi/scripts/install_glossapi.py create mode 100644 tests/test_browser_gloss_downloader.py create mode 100644 tests/test_install_glossapi.py diff --git a/README.md b/README.md index 5ad3b1f..953c03b 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,65 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend `setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +If you want a guided install that asks which phases you plan to use, run: + +```bash +python install_glossapi.py +``` + +That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them. 
+ +## Browser-Gated Download Mode + +`Corpus.download(...)` now supports three high-level routes for file acquisition: + +- `download_mode="standard"`: direct HTTP downloader only +- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial +- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints + +Use `browser_mode=True` as a legacy alias for `download_mode="browser"`. + +### Policy-driven routing + +If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL: + +```yaml +default: + downloader: standard + +rules: + - match: + domains: [eur-lex.europa.eu] + downloader: browser + + - match: + url_regex: "https://example.org/protected/.*" + downloader: auto +``` + +```python +from glossapi import Corpus + +corpus = Corpus(input_dir="out", output_dir="out") +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +### Operational notes + +- Browser mode is for browser-gated file endpoints, not viewer-only sources. +- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files. +- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory. +- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files. + +### Regression strategy + +The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs. + +For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite. + **DeepSeek runtime checklist** - Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. 
- Export these to force the real runtime and avoid silent stub output: @@ -93,7 +152,7 @@ Use this as the shortest path from a documentation concept to the public call th | Stage | Main call | Important parameters | Writes | | --- | --- | --- | --- | -| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"|"auto"|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | | Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/<stem>.md`, `json/<stem>.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/<stem>.md`, updated parquet metrics/flags | | OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/<stem>.md`, optional `json/<stem>.latex_map.jsonl` | diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 40f8c47..2fb796c 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -187,12 +187,35 @@ download( - Important parameters: - `links_column`: override URL column name - `parallelize_by`: choose grouping for the scheduler + - `download_mode`: one of `standard`, `auto`, or `browser` + - `browser_mode=True`: alias for `download_mode="browser"` + - `download_policy_file`: route specific domains/URL patterns to `standard`, `auto`, or `browser` - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. 
- Main outputs: - downloaded files in `downloads/` - partial/final results in `download_results/` - returned `pd.DataFrame` with download status and metadata +Browser-capable download mode is intended for browser-gated file endpoints where a real file still exists behind session/bootstrap checks. It is not a general viewer extractor. Viewer-only sources should still fail cleanly with a recorded error and no local file artifact. + +Example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_mode="browser", +) +``` + +Policy-routed example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + ## triage_math() - Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs. diff --git a/docs/stages/download.md b/docs/stages/download.md index 99bc4f8..c70c551 100644 --- a/docs/stages/download.md +++ b/docs/stages/download.md @@ -8,6 +8,7 @@ The download stage acquires source documents from parquet-based URL metadata and - read URL-bearing parquet input - download files concurrently +- route known browser-gated sources through browser-assisted acquisition when configured - retain source metadata context - avoid refetching previously successful downloads - assign stable-enough local filenames for downstream processing @@ -42,10 +43,34 @@ Typical issues include: - transient network failures - rate limiting +- browser-gated file endpoints that return HTML challenge/interstitial pages +- viewer-only sources that should fail cleanly instead of being recorded as successful downloads - duplicate URLs - filename collisions - partially completed corpus fetches +## Browser-gated sources + +The downloader now distinguishes between: + +- direct file endpoints +- browser-gated file endpoints +- viewer-only/document-reader sources + +For browser-gated file endpoints: + +- `download_mode="auto"` probes with direct HTTP and escalates to a browser session when it detects 
a recoverable interstitial +- `download_mode="browser"` goes directly to the browser-assisted path +- `download_policy_file=...` can route known domains or URL patterns to the correct path without probing every file + +Browser-assisted mode is designed for retrievable file endpoints, not for sources that only expose page images, tiles, HTML/SVG re-rendering, or DRM-wrapped readers. + +## Session reuse + +Browser-assisted mode reuses cached browser session state per domain so multiple files from the same protected source do not need a fresh browser bootstrap every time. + +This keeps the browser as a session-bootstrap resource rather than the main downloader. + ## Contributor note Any change to filename assignment or result parquet structure can have downstream impact on: diff --git a/install_glossapi.py b/install_glossapi.py new file mode 100644 index 0000000..ef7a7c9 --- /dev/null +++ b/install_glossapi.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +def _bootstrap_repo_src() -> None: + repo_root = Path(__file__).resolve().parent + src_dir = repo_root / "src" + src_str = str(src_dir) + if src_str not in sys.path: + sys.path.insert(0, src_str) + + +def main() -> int: + _bootstrap_repo_src() + from glossapi.scripts.install_glossapi import main as _main + + return int(_main()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index 60b23f8..3c045db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,10 @@ classifiers = [ ] [project.optional-dependencies] +# Browser automation fallback for browser-gated file endpoints +browser = [ + "playwright>=1.52,<2", +] # Docling extraction/layout stack docling = [ "docling==2.48.0", diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index c92d336..14f0c31 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -9,6 +9,7 @@ 'Sampler', 'Section', 'GlossDownloader', + 
'BrowserGlossDownloader', ] def __getattr__(name: str): @@ -31,6 +32,9 @@ def __getattr__(name: str): if name == 'GlossDownloader': from .gloss_downloader import GlossDownloader # type: ignore return GlossDownloader + if name == 'BrowserGlossDownloader': + from .gloss_browser_downloader import BrowserGlossDownloader # type: ignore + return BrowserGlossDownloader raise AttributeError(name) try: diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py index 38179fd..c543076 100644 --- a/src/glossapi/corpus/phase_download.py +++ b/src/glossapi/corpus/phase_download.py @@ -19,6 +19,7 @@ import pandas as pd from .._naming import canonical_stem +from ..gloss_browser_downloader import BrowserGlossDownloader from ..gloss_downloader import GlossDownloader # Avoid importing section/classifier here; download phase does not use them. from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path @@ -212,6 +213,22 @@ def _looks_like_list(s: str) -> bool: # Initialize downloader configuration (kwargs take precedence) dl_cfg = dict(self.downloader_config) dl_cfg.update(kwargs) + browser_mode = dl_cfg.pop('browser_mode', None) + if browser_mode is not None and 'download_mode' not in dl_cfg: + dl_cfg['download_mode'] = 'browser' if browser_mode else 'standard' + download_mode = str(dl_cfg.pop('download_mode', 'standard')).strip().lower() + policy_requested = bool(dl_cfg.get('download_policy_file') or dl_cfg.get('download_policy')) + if download_mode in {'standard', 'default', 'http'} and not policy_requested: + downloader_cls = GlossDownloader + default_download_route = 'standard' + elif download_mode in {'browser', 'browser_protected'} or policy_requested: + downloader_cls = BrowserGlossDownloader + default_download_route = 'browser' if download_mode in {'browser', 'browser_protected'} else 'standard' + elif download_mode in {'auto', 'browser_fallback'}: + downloader_cls = BrowserGlossDownloader + default_download_route = 'auto' + else: + 
raise ValueError(f"Unsupported download_mode: {download_mode}") # Allow caller to override which column holds links if links_column: url_column = links_column @@ -232,14 +249,18 @@ def _looks_like_list(s: str) -> bool: except Exception: pass - downloader = GlossDownloader( - url_column=url_column, - output_dir=str(self.output_dir), - log_level=self.logger.level, - verbose=verbose if verbose is not None else self.verbose, + downloader_kwargs = { + "url_column": url_column, + "output_dir": str(self.output_dir), + "log_level": self.logger.level, + "verbose": verbose if verbose is not None else self.verbose, **{k: v for k, v in dl_cfg.items() if k not in {'input_parquet'}}, - _used_filename_bases=used_bases - ) + "_used_filename_bases": used_bases, + } + if downloader_cls is BrowserGlossDownloader: + downloader_kwargs["default_download_route"] = default_download_route + + downloader = downloader_cls(**downloader_kwargs) # Download files self.logger.info(f"Downloading files from URLs in {input_parquet}...") diff --git a/src/glossapi/download_policy.py b/src/glossapi/download_policy.py new file mode 100644 index 0000000..f42e043 --- /dev/null +++ b/src/glossapi/download_policy.py @@ -0,0 +1,125 @@ +"""Policy routing for downloader selection.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, Optional +from urllib.parse import urlparse + +import yaml + +VALID_DOWNLOADERS = {"standard", "browser", "auto"} +ROUTE_OPTION_KEYS = { + "browser_timeout_ms", + "browser_post_load_wait_ms", + "browser_engine", + "browser_headless", + "browser_session_ttl_seconds", +} + + +def _normalize_downloader(value: Any, default: str = "standard") -> str: + normalized = str(value or default).strip().lower() + if normalized in {"default", "http"}: + normalized = "standard" + if normalized in {"browser_fallback"}: + normalized = "auto" + if normalized in {"browser_protected"}: + normalized = 
"browser" + if normalized not in VALID_DOWNLOADERS: + raise ValueError(f"Unsupported downloader route: {value}") + return normalized + + +@dataclass(frozen=True) +class DownloadPolicyMatch: + domains: tuple[str, ...] = () + url_regex: Optional[re.Pattern[str]] = None + + def matches(self, url: str) -> bool: + parsed = urlparse(url) + hostname = (parsed.hostname or "").lower() + if self.domains: + matched_domain = any( + hostname == domain or hostname.endswith(f".{domain}") + for domain in self.domains + ) + if not matched_domain: + return False + if self.url_regex and not self.url_regex.search(url): + return False + return True + + +@dataclass(frozen=True) +class DownloadPolicyRule: + matcher: DownloadPolicyMatch + downloader: str + options: Dict[str, Any] + + def matches(self, url: str) -> bool: + return self.matcher.matches(url) + + +@dataclass(frozen=True) +class DownloadPolicy: + default_downloader: str = "standard" + default_options: Dict[str, Any] | None = None + rules: tuple[DownloadPolicyRule, ...] 
= () + + def resolve(self, url: str) -> tuple[str, Dict[str, Any]]: + for rule in self.rules: + if rule.matches(url): + return rule.downloader, dict(rule.options) + return self.default_downloader, dict(self.default_options or {}) + + +def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in data.items() if key in ROUTE_OPTION_KEYS} + + +def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch: + domains = tuple(str(item).strip().lower() for item in (raw.get("domains") or []) if str(item).strip()) + url_regex = raw.get("url_regex") + compiled = re.compile(str(url_regex)) if url_regex else None + return DownloadPolicyMatch(domains=domains, url_regex=compiled) + + +def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy: + default_block = dict(data.get("default") or {}) + default_downloader = _normalize_downloader(default_block.get("downloader"), default="standard") + default_options = _extract_route_options(default_block) + + rules = [] + for raw_rule in data.get("rules") or []: + raw_rule = dict(raw_rule or {}) + matcher = _build_matcher(dict(raw_rule.get("match") or {})) + downloader = _normalize_downloader(raw_rule.get("downloader"), default=default_downloader) + options = _extract_route_options(raw_rule) + rules.append(DownloadPolicyRule(matcher=matcher, downloader=downloader, options=options)) + + return DownloadPolicy( + default_downloader=default_downloader, + default_options=default_options, + rules=tuple(rules), + ) + + +def load_download_policy(path: str | Path) -> DownloadPolicy: + policy_path = Path(path).expanduser().resolve() + payload = yaml.safe_load(policy_path.read_text(encoding="utf-8")) or {} + if not isinstance(payload, dict): + raise ValueError("Download policy file must define a mapping at the top level") + return build_download_policy(payload) + + +__all__ = [ + "DownloadPolicy", + "DownloadPolicyMatch", + "DownloadPolicyRule", + "VALID_DOWNLOADERS", + "build_download_policy", + 
"load_download_policy", +] diff --git a/src/glossapi/gloss_browser_downloader.py b/src/glossapi/gloss_browser_downloader.py new file mode 100644 index 0000000..1fc41fa --- /dev/null +++ b/src/glossapi/gloss_browser_downloader.py @@ -0,0 +1,415 @@ +"""Browser-capable downloader mode for browser-gated file endpoints.""" + +from __future__ import annotations + +import asyncio +import os +import time +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Any, Dict, Optional, Tuple + +import aiofiles +import aiohttp + +from .download_policy import DownloadPolicy, load_download_policy +from .gloss_downloader import GlossDownloader + + +@dataclass +class BrowserSessionState: + user_agent: str + cookie_header: str + cached_at: float + + +class BrowserGlossDownloader(GlossDownloader): + """ + Downloader variant that retries browser-gated file endpoints via Playwright. + + This mode only targets file endpoints that are protected by browser/session + checks. It intentionally does not attempt viewer-style extraction. 
+ """ + + def __init__( + self, + *args, + browser_timeout_ms: int = 60000, + browser_post_load_wait_ms: int = 3000, + browser_engine: str = "chromium", + browser_headless: bool = True, + browser_session_ttl_seconds: int = 900, + browser_max_parallel_bootstraps: int = 2, + default_download_route: str = "auto", + **kwargs, + ): + super().__init__(*args, **kwargs) + self.browser_timeout_ms = int(browser_timeout_ms) + self.browser_post_load_wait_ms = int(browser_post_load_wait_ms) + self.browser_engine = str(browser_engine or "chromium") + self.browser_headless = bool(browser_headless) + self.browser_session_ttl_seconds = int(browser_session_ttl_seconds) + self.browser_max_parallel_bootstraps = max(1, int(browser_max_parallel_bootstraps)) + self.browser_bootstrap_semaphore = asyncio.Semaphore(self.browser_max_parallel_bootstraps) + self._browser_session_cache: Dict[str, BrowserSessionState] = {} + self._browser_session_locks: Dict[str, asyncio.Lock] = {} + self.default_download_route = str(default_download_route or "auto").strip().lower() + self.policy = self._load_policy() + + def _load_policy(self) -> Optional[DownloadPolicy]: + if self.download_policy is not None: + return self.download_policy + if self.download_policy_file: + return load_download_policy(self.download_policy_file) + return None + + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + if self.policy is not None: + return self.policy.resolve(url) + return self.default_download_route, {} + + def _route_setting(self, route_options: Dict[str, Any], name: str, fallback: Any) -> Any: + return route_options.get(name, fallback) + + def _domain_key(self, url: str) -> str: + return self._extract_base_domain(url) or (urlparse(url).hostname or "").lower() + + def _choose_browser_bootstrap_url(self, url: str) -> str: + if self._url_looks_like_file_endpoint(url): + return self.get_base_url(url) + return url + + def _should_ignore_navigation_exception(self, url: str, exc: Exception) -> bool: + 
message = str(exc) + if self._url_looks_like_file_endpoint(url) and "net::ERR_ABORTED" in message: + return True + return False + + def _session_lock_for_domain(self, domain_key: str) -> asyncio.Lock: + lock = self._browser_session_locks.get(domain_key) + if lock is None: + lock = asyncio.Lock() + self._browser_session_locks[domain_key] = lock + return lock + + def _is_browser_session_fresh(self, state: BrowserSessionState, route_options: Dict[str, Any]) -> bool: + ttl = int(self._route_setting(route_options, "browser_session_ttl_seconds", self.browser_session_ttl_seconds)) + if ttl <= 0: + return False + return (time.time() - state.cached_at) < ttl + + def _should_attempt_browser_recovery(self, url: str, html_issue: str) -> bool: + issue = str(html_issue or "").lower() + if "document viewer returned" in issue: + return False + if "challenge page returned" in issue: + return True + if "cookie bootstrap is required" in issue: + return True + if "expected a file-like response but received html instead" in issue: + return self._url_looks_like_file_endpoint(url) + return False + + def _build_ssl_connector(self) -> Optional[aiohttp.TCPConnector]: + connector = None + if not self.ssl_verify: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + def _domain_cookies_for_url(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} + for domain_pattern, domain_cookies in self.domain_cookies.items(): + if domain_pattern in url: + cookies.update(domain_cookies) + return cookies + + async def _write_recovered_file(self, row_index: int, filename: str, body: bytes) -> None: + tmp_path = self.downloads_dir / f".part_browser_{row_index}" + async with aiofiles.open(tmp_path, "wb") as handle: + await handle.write(body) + final_path = self.downloads_dir / filename + os.replace(tmp_path, final_path) + + async def 
_fetch_with_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + state: BrowserSessionState, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + request_headers = { + "User-Agent": state.user_agent, + "Accept": "application/pdf,application/octet-stream,*/*;q=0.8", + } + if state.cookie_header: + request_headers["Cookie"] = state.cookie_header + if referer: + request_headers["Referer"] = referer + + connector = self._build_ssl_connector() + timeout = aiohttp.ClientTimeout(total=min(max(self.request_timeout, 30), 180)) + async with aiohttp.ClientSession(connector=connector) as session: + async with session.get(url, headers=request_headers, timeout=timeout) as response: + response.raise_for_status() + body = await response.read() + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + return body, response_headers, {"candidate_url": url, "session_reused": True} + + async def _bootstrap_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + route_options: Dict[str, Any], + ) -> tuple[BrowserSessionState, list[tuple[str, Dict[str, str], str]]]: + timeout_ms = int(self._route_setting(route_options, "browser_timeout_ms", self.browser_timeout_ms)) + post_load_wait_ms = int( + self._route_setting(route_options, "browser_post_load_wait_ms", self.browser_post_load_wait_ms) + ) + browser_engine = str(self._route_setting(route_options, "browser_engine", self.browser_engine)) + browser_headless = bool(self._route_setting(route_options, "browser_headless", self.browser_headless)) + + try: + from playwright.async_api import async_playwright + except ImportError as exc: # pragma: no cover - exercised via monkeypatch + raise RuntimeError( + "Browser download mode requires the optional 'browser' dependencies " + "(install Playwright and browser binaries)" + ) from exc + + accepted_responses: list[tuple[str, Dict[str, str], str]] = [] + bootstrap_url = self._choose_browser_bootstrap_url(url) + + async with 
self.browser_bootstrap_semaphore: + async with async_playwright() as playwright: + browser_type = getattr(playwright, browser_engine, None) + if browser_type is None: + raise RuntimeError(f"Unsupported browser engine: {browser_engine}") + + browser = await browser_type.launch(headless=browser_headless) + context = await browser.new_context(ignore_https_errors=not self.ssl_verify) + parsed = urlparse(url) + browser_cookies = [ + { + "name": key, + "value": str(value), + "domain": parsed.hostname or "", + "path": "/", + } + for key, value in self._domain_cookies_for_url(url).items() + ] + if browser_cookies: + await context.add_cookies(browser_cookies) + page = await context.new_page() + if referer: + await page.set_extra_http_headers({"Referer": referer}) + + async def _route_filter(route: Any) -> None: + req = route.request + if req.resource_type in {"image", "media", "font"}: + await route.abort() + return + req_url = str(req.url or "") + if "googletagmanager" in req_url or "google-analytics.com" in req_url: + await route.abort() + return + await route.continue_() + + await page.route("**/*", _route_filter) + + def _record_response(response: Any) -> None: + try: + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + file_ext = self.infer_file_extension(response.url, response_headers, b"") + if file_ext and file_ext != "html" and self.is_supported_format(file_ext): + accepted_responses.append((response.url, response_headers, file_ext)) + except Exception: + return + + page.on("response", _record_response) + + try: + main_response = None + try: + main_response = await page.goto(bootstrap_url, wait_until="networkidle", timeout=timeout_ms) + except Exception as exc: + if not self._should_ignore_navigation_exception(bootstrap_url, exc): + raise + if main_response is not None: + main_headers = {str(k): str(v) for k, v in (main_response.headers or {}).items()} + main_ext = self.infer_file_extension(main_response.url, main_headers, b"") + if 
main_ext and main_ext != "html" and self.is_supported_format(main_ext): + accepted_responses.insert(0, (main_response.url, main_headers, main_ext)) + if not accepted_responses and post_load_wait_ms > 0: + await page.wait_for_timeout(post_load_wait_ms) + + browser_user_agent = await page.evaluate("() => navigator.userAgent") + browser_cookies = await context.cookies() + finally: + await browser.close() + + cookie_header = "; ".join( + f"{cookie['name']}={cookie['value']}" for cookie in browser_cookies if cookie.get("name") + ) + return BrowserSessionState( + user_agent=browser_user_agent, + cookie_header=cookie_header, + cached_at=time.time(), + ), accepted_responses + + async def _download_via_browser_session( + self, + *, + url: str, + referer: Optional[str], + route_options: Optional[Dict[str, Any]] = None, + force_refresh: bool = False, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + options = dict(route_options or {}) + domain_key = self._domain_key(url) + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + lock = self._session_lock_for_domain(domain_key) + async with lock: + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + state, accepted_responses = await self._bootstrap_browser_session_state( + url=url, + referer=referer, + route_options=options, + ) + self._browser_session_cache[domain_key] = state + candidate_url = accepted_responses[0][0] if accepted_responses else url + body, response_headers, meta = await self._fetch_with_browser_session_state( + url=candidate_url, + referer=referer, + state=state, + ) + meta.update({ + 
"candidate_url": candidate_url, + "session_reused": False, + "domain_key": domain_key, + }) + return body, response_headers, meta + + async def _download_browser_route( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + route_options: Dict[str, Any], + ) -> Tuple[bool, str, str, str, int]: + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + referer=referer, + route_options=route_options, + ) + except Exception as exc: + error_msg = f"Browser-routed download failed: {exc}" + self.logger.warning(error_msg) + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + return await self._finalize_download_result( + row_index=row_index, + url=meta.get("candidate_url") or url, + resp_headers=response_headers, + content=body, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route != "browser": + return None + return await self._download_browser_route( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + route_options=route_options, + ) + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route == "standard": + return None + if route == "auto" and not self._should_attempt_browser_recovery(url, html_issue): + return None + + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + 
referer=referer, + route_options=route_options, + ) + except Exception as exc: + message = f"{html_issue}; browser recovery failed: {exc}" + self.logger.warning(message) + return False, "", "html", message, retry_count + 1 + + file_ext = self.infer_file_extension(meta["candidate_url"], response_headers, body) + if file_ext == "html": + message = ( + f"{html_issue}; browser recovery still returned HTML from {meta['candidate_url']}" + ) + self.logger.warning(message) + return False, "", file_ext, message, retry_count + 1 + if not self.is_supported_format(file_ext): + message = ( + f"{html_issue}; browser recovery returned unsupported format: {file_ext}" + ) + self.logger.warning(message) + return False, "", file_ext or "", message, retry_count + 1 + + if filename_base and str(filename_base).strip(): + filename = f"{filename_base}.{file_ext}" + else: + filename = self.generate_filename(row_index, file_ext) + + await self._write_recovered_file(row_index, filename, body) + self.logger.info( + "Recovered browser-gated download via browser mode: %s -> %s", + url, + filename, + ) + return True, filename, file_ext, "", retry_count diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index 5afba9c..ffce858 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -141,6 +141,8 @@ def __init__( error_burst_window: int = 20, error_burst_threshold: float = 0.5, park_403_seconds: float = 600.0, + download_policy_file: Optional[Union[str, Path]] = None, + download_policy: Optional[Any] = None, _used_filename_bases: Optional[Set[str]] = None, ): """ @@ -241,6 +243,8 @@ def verbose_log(self, message, level=logging.DEBUG): self.checkpoint_seconds = float(checkpoint_seconds) if checkpoint_seconds else None # Warnings JSON path self.domain_warnings_path = self.output_dir / 'domain_scheduler_warnings.json' + self.download_policy_file = Path(download_policy_file).expanduser().resolve() if download_policy_file else None + 
self.download_policy = download_policy # Progress logger (separate file; default to output logs dir) self.progress_logger = self.logger @@ -836,41 +840,44 @@ def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: return "Expected a file-like response but received HTML instead" return None - - async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], - rate_limiter: RateLimiter, retry_count: int = 0, - filename_base: Optional[str] = None, - referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: - """ - Download a file from a URL - - Args: - row_index: Index in the dataframe - url: URL to download - semaphore: Semaphore for concurrency control - rate_limiter: Rate limiter for API limits - retry_count: Current retry count - Returns: - Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) - """ - if not url or pd.isna(url): - return False, "", "", "Empty URL", retry_count - - # Get a new user-agent for each request - user_agent = next(self.user_agents) - domain = urlparse(url).netloc - - # Ensure URL has scheme + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to recover from HTML interstitials via alternate fetch modes.""" + return None + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to short-circuit the direct HTTP path for known routes.""" + return None + + def _normalize_request_url(self, url: str) -> str: if not url.startswith(("http://", "https://")): - url = f"https://{url}" - - # Get base URL for referer header + return 
f"https://{url}" + return url + + def _build_request_headers(self, url: str, user_agent: str, referer: Optional[str]) -> Dict[str, str]: + domain = urlparse(url).netloc base_url = self.get_base_url(url) - - # Enhanced headers with common browser-like attributes to bypass 403 errors - # Prefer caller-provided referer (e.g., the external_link page) - _referer = (referer or '').strip() - headers = { + referer_value = (referer or '').strip() + return { 'User-Agent': user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', @@ -884,74 +891,296 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'TE': 'trailers', - 'Referer': _referer if _referer else f"https://www.google.com/search?q={domain}", + 'Referer': referer_value if referer_value else f"https://www.google.com/search?q={domain}", 'Origin': base_url, 'DNT': '1' } - - # Check for domain-specific cookies - cookies = {} + + def _resolve_request_cookies(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} for domain_pattern, domain_cookies in self.domain_cookies.items(): if domain_pattern in url: cookies.update(domain_cookies) # If the domain needs dynamic values like random IDs - for key, value in cookies.items(): + for key, value in list(cookies.items()): if 'random.randint' in str(value): # Replace with an actual random value (only supporting this pattern for now) - if 'session-id' in value: + if 'session-id' in str(value): cookies[key] = f"session-id-{random.randint(100000000, 999999999)}" + return cookies + + def _build_request_timeout(self, retry_count: int) -> aiohttp.ClientTimeout: + return aiohttp.ClientTimeout( + total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes + connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute + sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap 
socket connect at 1 minute + sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + ) + + def _build_session_connector(self, url: str) -> Optional[aiohttp.TCPConnector]: + connector = None + url_base = self._extract_base_domain(url) + force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) + if (not self.ssl_verify) or force_insecure: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + async def _bootstrap_download_session( + self, + session: aiohttp.ClientSession, + url: str, + headers: Dict[str, str], + ) -> Dict[str, str]: + headers = await self.setup_session(session, url, headers) + + # Set a shorter timeout for the initial connection attempt + base_timeout = aiohttp.ClientTimeout(total=10) + try: + # Visit the base domain to establish cookies if needed + base_domain = urlparse(url).netloc + if any(domain in base_domain for domain in self.domain_cookies.keys()): + base_url = f"https://{base_domain}" + async with session.get(base_url, headers=headers, timeout=base_timeout): + pass + except Exception as e: + # Non-fatal error, just log and continue + self.logger.debug(f"Initial base URL visit failed: {str(e)}") + return headers + + def _best_effort_url_extension(self, url: str) -> str: + try: + return self.get_file_extension_from_url(url) + except Exception: + return "" + + def _build_output_filename(self, row_index: int, file_ext: str, filename_base: Optional[str]) -> str: + if filename_base and str(filename_base).strip(): + return f"{filename_base}.{file_ext}" + return self.generate_filename(row_index, file_ext) + + def _cleanup_temp_file(self, tmp_path: Optional[Path]) -> None: + if not tmp_path: + return + try: + os.remove(tmp_path) + except Exception: + pass + + def _move_temp_file_to_final(self, tmp_path: Path, filename: str) -> None: + final_path = 
Path(self.downloads_dir) / filename + try: + os.replace(tmp_path, final_path) + except Exception: + try: + os.rename(tmp_path, final_path) + except Exception: + pass + + async def _finalize_download_result( + self, + *, + row_index: int, + url: str, + resp_headers: Dict[str, str], + content: bytes, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + tmp_path: Optional[Path] = None, + ) -> Tuple[bool, str, str, str, int]: + file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self._cleanup_temp_file(tmp_path) + recovered = await self._recover_html_interstitial( + row_index=row_index, + url=url, + headers=resp_headers, + content=content, + html_issue=html_issue, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + if recovered is not None: + return recovered + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count + if not self.is_supported_format(file_ext): + self._cleanup_temp_file(tmp_path) + self.logger.warning( + f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}" + ) + return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count + + filename = self._build_output_filename(row_index, file_ext, filename_base) + if tmp_path is not None: + self._move_temp_file_to_final(tmp_path, filename) + else: + await self.write_file(filename, content, self.downloads_dir) + self.logger.info(f"Successfully downloaded {filename} from {url}") + return True, filename, file_ext, "", retry_count + + async def _download_via_streaming_get( + self, + *, + session: aiohttp.ClientSession, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + from tenacity import AsyncRetrying + + head = bytearray() + async for attempt in AsyncRetrying( + stop=stop_after_attempt(max(1, int(self.max_retries))), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=(retry_if_exception_type(aiohttp.ClientError) | + retry_if_exception_type(asyncio.TimeoutError)), + before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), + reraise=True, + ): + with attempt: + async with session.get(url, headers=headers, timeout=timeout) as response: + response.raise_for_status() + resp_headers = dict(response.headers or {}) + tmp_path = Path(self.downloads_dir) / f".part_{row_index}" + async with aiofiles.open(tmp_path, 'wb') as f: + async for chunk in response.content.iter_chunked(1 << 16): + if chunk: + if len(head) < (1 << 16): + need = (1 << 16) - len(head) + head.extend(chunk[:need]) + await f.write(chunk) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=bytes(head), + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + tmp_path=tmp_path, + ) + return False, "", "", "Retry exhaustion", retry_count + 1 + + async def 
_download_via_buffered_request( + self, + *, + session: aiohttp.ClientSession, + requester: str, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + content, status, resp_headers = await self.make_request( + session, requester, url, headers, timeout + ) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=content, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + def _build_http_error_result( + self, + url: str, + error: aiohttp.ClientResponseError, + retry_count: int, + ) -> Tuple[bool, str, str, str, int]: + status = error.status + self.logger.warning(f"Received {status} for {url}") + + if self.verbose: + self.logger.debug(f"HTTP Error Details - Status: {error.status}, Message: {error.message}") + self.logger.debug(f"Headers: {error.headers if hasattr(error, 'headers') else 'No headers available'}") + self.logger.debug(f"Request info: {error.request_info if hasattr(error, 'request_info') else 'No request info available'}") + + retry_after = None + try: + hdrs = dict(getattr(error, 'headers', {}) or {}) + for k, v in hdrs.items(): + if k.lower() == 'retry-after': + val = str(v).strip() + if val.isdigit(): + retry_after = int(val) + else: + try: + dt = parsedate_to_datetime(val) + retry_after = max(0, int((dt.timestamp() - time.time()))) + except Exception: + retry_after = None + break + except Exception: + retry_after = None + error_msg = f"HTTP {status}: {str(error)}" + if status in (429, 503) and retry_after is not None: + error_msg += f" retry_after={retry_after}" + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + + async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], + rate_limiter: RateLimiter, retry_count: int = 0, + 
filename_base: Optional[str] = None, + referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + """ + Download a file from a URL + + Args: + row_index: Index in the dataframe + url: URL to download + semaphore: Semaphore for concurrency control + rate_limiter: Rate limiter for API limits + retry_count: Current retry count + Returns: + Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + """ + if not url or pd.isna(url): + return False, "", "", "Empty URL", retry_count + + url = self._normalize_request_url(url) + user_agent = next(self.user_agents) + headers = self._build_request_headers(url, user_agent, referer) + cookies = self._resolve_request_cookies(url) if semaphore: await semaphore.acquire() try: - # Apply rate limiting await rate_limiter.acquire() - - # Implement exponential backoff sleep_time = self.sleep * (2 ** retry_count) await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) - - # Set up timeout with exponential backoff - timeout = aiohttp.ClientTimeout( - total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes - connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute - sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute - sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + preflight = await self._preflight_download( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - + if preflight is not None: + return preflight + timeout = self._build_request_timeout(retry_count) + try: - # Prepare optional SSL connector - connector = None - # Domain-specific insecure override (discovered via ping) - url_base = self._extract_base_domain(url) - _force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) - if (not self.ssl_verify) or _force_insecure: - connector = aiohttp.TCPConnector(ssl=False) - elif self.ssl_cafile: - import 
ssl as _ssl - ctx = _ssl.create_default_context(cafile=self.ssl_cafile) - connector = aiohttp.TCPConnector(ssl=ctx) - # Create a new session for each download to avoid cookie contamination + connector = self._build_session_connector(url) async with aiohttp.ClientSession(cookies=cookies, connector=connector) as session: try: - # Try to access the base domain first to establish cookies - headers = await self.setup_session(session, url, headers) - - # Set a shorter timeout for the initial connection attempt - base_timeout = aiohttp.ClientTimeout(total=10) - try: - # Visit the base domain to establish cookies if needed - base_domain = urlparse(url).netloc - if any(domain in base_domain for domain in self.domain_cookies.keys()): - base_url = f"https://{base_domain}" - async with session.get(base_url, headers=headers, timeout=base_timeout): - pass - except Exception as e: - # Non-fatal error, just log and continue - self.logger.debug(f"Initial base URL visit failed: {str(e)}") - pass - - # Choose request method and perform streaming for GET + headers = await self._bootstrap_download_session(session, url, headers) requester = self.request_method.lower() try: @@ -960,126 +1189,30 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn self.verbose_log(f"Headers: {headers}") if requester == 'get': - # Streaming GET with retries - from tenacity import AsyncRetrying - head = bytearray() - resp_headers = {} - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max(1, int(self.max_retries))), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=(retry_if_exception_type(aiohttp.ClientError) | - retry_if_exception_type(asyncio.TimeoutError)), - before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), - reraise=True, - ): - with attempt: - async with session.get(url, headers=headers, timeout=timeout) as response: - response.raise_for_status() - resp_headers = dict(response.headers or {}) - # Write to a temp file first 
- tmp_path = Path(self.downloads_dir) / f".part_{row_index}" - async with aiofiles.open(tmp_path, 'wb') as f: - async for chunk in response.content.iter_chunked(1 << 16): - if chunk: - if len(head) < (1 << 16): - need = (1 << 16) - len(head) - head.extend(chunk[:need]) - await f.write(chunk) - # Infer extension using URL, headers and first bytes - file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) - if file_ext == 'html': - html_issue = self._detect_html_interstitial(url, resp_headers, bytes(head)) - if html_issue: - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") - return False, "", file_ext, html_issue, retry_count - if not self.is_supported_format(file_ext): - # Clean up temp and report - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - # Decide final filename - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - final_path = Path(self.downloads_dir) / filename - try: - os.replace(tmp_path, final_path) - except Exception: - # Fallback to copy/rename - try: - os.rename(tmp_path, final_path) - except Exception: - pass - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count - else: - # Fallback to non-streaming POST - content, status, resp_headers = await self.make_request( - session, requester, url, headers, timeout + return await self._download_via_streaming_get( + session=session, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - file_ext = 
self.infer_file_extension(url, resp_headers, content) - if file_ext == 'html': - html_issue = self._detect_html_interstitial(url, resp_headers, content) - if html_issue: - self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") - return False, "", file_ext, html_issue, retry_count - if not self.is_supported_format(file_ext): - self.logger.warning(f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - await self.write_file(filename, content, self.downloads_dir) - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count + return await self._download_via_buffered_request( + session=session, + requester=requester, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) except aiohttp.ClientResponseError as e: - # Handle HTTP errors - status = e.status - self.logger.warning(f"Received {status} for {url}") - - # Detailed verbose logging for HTTP errors - if self.verbose: - self.logger.debug(f"HTTP Error Details - Status: {e.status}, Message: {e.message}") - self.logger.debug(f"Headers: {e.headers if hasattr(e, 'headers') else 'No headers available'}") - self.logger.debug(f"Request info: {e.request_info if hasattr(e, 'request_info') else 'No request info available'}") - - # Build error with optional Retry-After info - retry_after = None - try: - hdrs = dict(getattr(e, 'headers', {}) or {}) - for k, v in hdrs.items(): - if k.lower() == 'retry-after': - val = str(v).strip() - if val.isdigit(): - retry_after = int(val) - else: - try: - dt = parsedate_to_datetime(val) - retry_after = max(0, 
int((dt.timestamp() - time.time()))) - except Exception: - retry_after = None - break - except Exception: - retry_after = None - error_msg = f"HTTP {status}: {str(e)}" - if status in (429, 503) and retry_after is not None: - error_msg += f" retry_after={retry_after}" - # Best-effort ext from URL if possible - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return self._build_http_error_result(url, e, retry_count) except Exception as e: error_msg = str(e) @@ -1092,11 +1225,7 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn import traceback self.logger.debug(f"Traceback: {traceback.format_exc()}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Overall timeout exceeded for {url}") @@ -1108,22 +1237,14 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn except aiohttp.ClientError as e: error_msg = str(e) self.logger.error(f"ClientError while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Timeout while downloading {url}") return False, "", "", "Timeout", retry_count + 1 except Exception as e: error_msg = str(e) self.logger.error(f"Error while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 finally: 
if semaphore: try: diff --git a/src/glossapi/scripts/install_glossapi.py b/src/glossapi/scripts/install_glossapi.py new file mode 100644 index 0000000..195d662 --- /dev/null +++ b/src/glossapi/scripts/install_glossapi.py @@ -0,0 +1,230 @@ +"""Guided installer for GlossAPI extras.""" + +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Set + + +PHASE_TO_EXTRAS: Dict[str, Set[str]] = { + "download": set(), + "browser_download": {"browser"}, + "extract": {"docling"}, + "ocr": set(), + "docs": {"docs"}, +} + + +@dataclass(frozen=True) +class InstallPlan: + phases: tuple[str, ...] + extras: tuple[str, ...] + editable: bool + include_cuda: bool + needs_deepseek_runtime: bool + + +def _supports_color() -> bool: + return sys.stdout.isatty() and os.environ.get("TERM") not in {"", "dumb", None} + + +def _style(text: str, code: str) -> str: + if not _supports_color(): + return text + return f"\033[{code}m{text}\033[0m" + + +def _prompt_yes_no(question: str, default: bool = False) -> bool: + suffix = "[Y/n]" if default else "[y/N]" + while True: + raw = input(f"{question} {suffix} ").strip().lower() + if not raw: + return default + if raw in {"y", "yes"}: + return True + if raw in {"n", "no"}: + return False + print("Please answer 'y' or 'n'.") + + +def _resolve_phase_selection(tokens: Iterable[str]) -> List[str]: + resolved: List[str] = [] + seen: Set[str] = set() + for token in tokens: + phase = str(token).strip().lower() + if not phase: + continue + if phase not in PHASE_TO_EXTRAS: + raise ValueError(f"Unsupported phase '{token}'. 
Valid phases: {', '.join(sorted(PHASE_TO_EXTRAS))}") + if phase not in seen: + seen.add(phase) + resolved.append(phase) + return resolved + + +def build_install_plan( + *, + phases: Sequence[str], + editable: bool, + include_cuda: bool, +) -> InstallPlan: + selected = _resolve_phase_selection(phases) + extras: Set[str] = set() + for phase in selected: + extras.update(PHASE_TO_EXTRAS[phase]) + if include_cuda: + extras.add("cuda") + return InstallPlan( + phases=tuple(selected), + extras=tuple(sorted(extras)), + editable=bool(editable), + include_cuda=bool(include_cuda), + needs_deepseek_runtime=("ocr" in selected), + ) + + +def build_pip_command(plan: InstallPlan, repo_root: Path) -> List[str]: + target = "." + if plan.extras: + target = f".[{','.join(plan.extras)}]" + cmd = [sys.executable, "-m", "pip", "install"] + if plan.editable: + cmd.append("-e") + cmd.append(target) + return cmd + + +def build_deepseek_command(repo_root: Path) -> Optional[List[str]]: + script = repo_root / "dependency_setup" / "setup_deepseek_uv.sh" + if not script.exists(): + return None + shell = shutil.which("bash") or shutil.which("sh") + if not shell: + return None + return [shell, str(script)] + + +def _interactive_plan(default_editable: bool) -> InstallPlan: + print(_style("GlossAPI Installer", "1;36")) + print("Select only the phases you plan to use so optional dependencies stay minimal.\n") + + selected: List[str] = ["download"] + print(_style("Core", "1;37")) + print(" download: base downloader/data pipeline dependencies") + if _prompt_yes_no("Add browser-gated download support?", default=False): + selected.append("browser_download") + if _prompt_yes_no("Add extraction support (Docling)?", default=False): + selected.append("extract") + if _prompt_yes_no("Add OCR support (DeepSeek backend)?", default=False): + selected.append("ocr") + if _prompt_yes_no("Add docs tooling?", default=False): + selected.append("docs") + include_cuda = _prompt_yes_no("Include CUDA extras where 
relevant?", default=False) + editable = _prompt_yes_no("Install in editable mode?", default=default_editable) + return build_install_plan(phases=selected, editable=editable, include_cuda=include_cuda) + + +def _plan_summary(plan: InstallPlan, command: Sequence[str]) -> str: + extras = ", ".join(plan.extras) if plan.extras else "(none)" + phases = ", ".join(plan.phases) if plan.phases else "(none)" + return "\n".join( + [ + _style("Install plan", "1;32"), + f" phases: {phases}", + f" extras: {extras}", + f" editable: {'yes' if plan.editable else 'no'}", + f" command: {shlex.join(command)}", + f" deepseek runtime: {'separate setup required' if plan.needs_deepseek_runtime else 'not requested'}", + ] + ) + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python install_glossapi.py", + description="Guided installer for GlossAPI optional dependency groups.", + ) + parser.add_argument( + "--phases", + default="", + help=( + "Comma-separated phases to install. Valid values: " + + ", ".join(sorted(PHASE_TO_EXTRAS)) + + ". If omitted, an interactive wizard is shown." 
+ ), + ) + parser.add_argument( + "--cuda", + action="store_true", + help="Include the CUDA extra.", + ) + parser.add_argument( + "--editable", + dest="editable", + action="store_true", + help="Install in editable mode.", + ) + parser.add_argument( + "--no-editable", + dest="editable", + action="store_false", + help="Install as a regular package.", + ) + parser.set_defaults(editable=True) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the computed pip command without running it.", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Skip confirmation prompts in non-interactive mode.", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + repo_root = Path(__file__).resolve().parents[3] + + if args.phases.strip(): + plan = build_install_plan( + phases=[token for token in args.phases.split(",") if token.strip()], + editable=args.editable, + include_cuda=bool(args.cuda), + ) + else: + plan = _interactive_plan(default_editable=bool(args.editable)) + + command = build_pip_command(plan, repo_root) + print(_plan_summary(plan, command)) + deepseek_command = build_deepseek_command(repo_root) if plan.needs_deepseek_runtime else None + if deepseek_command: + print(f" deepseek command: {shlex.join(deepseek_command)}") + + if args.dry_run: + return 0 + if not args.yes and not args.phases.strip(): + if not _prompt_yes_no("Run this install command now?", default=True): + print("Aborted.") + return 1 + + completed = subprocess.run(command, cwd=repo_root) + if completed.returncode != 0: + return int(completed.returncode) + if plan.needs_deepseek_runtime and deepseek_command: + print(_style("Provisioning dedicated DeepSeek runtime…", "1;33")) + completed = subprocess.run(deepseek_command, cwd=repo_root) + return int(completed.returncode) + + +if __name__ == "__main__": # pragma: no cover - CLI entrypoint + raise SystemExit(main()) diff --git 
a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py new file mode 100644 index 0000000..9412d23 --- /dev/null +++ b/tests/test_browser_gloss_downloader.py @@ -0,0 +1,297 @@ +import asyncio + +import pandas as pd + +from glossapi import Corpus +from glossapi.download_policy import build_download_policy +from glossapi.gloss_browser_downloader import BrowserGlossDownloader, BrowserSessionState +import glossapi.corpus.phase_download as phase_download_mod + + +def test_browser_downloader_skips_viewer_interstitial(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + called = False + + async def _fake_browser_download(**kwargs): + nonlocal called + called = True + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue=( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ), + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result is None + assert called is False + + +def test_browser_downloader_recovers_challenge_page(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + async def _fake_browser_download(**kwargs): + return ( + b"%PDF-1.7\n%dummy\n", + {"Content-Type": "application/pdf"}, + {"candidate_url": "https://example.org/file.pdf"}, + ) + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://example.org/file.pdf", + headers={"Content-Type": "text/html"}, + 
content=b"challenge", + html_issue=( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ), + retry_count=1, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 1) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.7") + assert not (tmp_path / "downloads" / ".part_browser_0").exists() + + +def test_browser_downloader_domain_cookie_lookup(tmp_path): + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + domain_cookies={"eur-lex.europa.eu": {"token": "abc123"}}, + ) + + cookies = downloader._domain_cookies_for_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) + + assert cookies == {"token": "abc123"} + + +def test_browser_downloader_bootstrap_url_uses_base_for_file_endpoints(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._choose_browser_bootstrap_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) == "https://eur-lex.europa.eu" + + +def test_browser_downloader_ignores_err_aborted_for_file_navigation(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._should_ignore_navigation_exception( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + assert not downloader._should_ignore_navigation_exception( + "https://example.org/article", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + + +def test_browser_downloader_uses_default_browser_route_for_preflight(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="browser") + + async def _fake_download_browser_route(**kwargs): + return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = 
asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://example.org/file.pdf", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + + +def test_browser_downloader_reuses_cached_domain_session(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="auto") + bootstraps = 0 + fetches = 0 + + async def _fake_fetch_with_browser_session_state(**kwargs): + nonlocal fetches + fetches += 1 + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + async def _bootstrap(**kwargs): + nonlocal bootstraps + bootstraps += 1 + return BrowserSessionState(user_agent="UA", cookie_header="a=b", cached_at=10_000.0), [] + + monkeypatch.setattr(downloader, "_bootstrap_browser_session_state", _bootstrap) + monkeypatch.setattr(downloader, "_fetch_with_browser_session_state", _fake_fetch_with_browser_session_state) + monkeypatch.setattr("glossapi.gloss_browser_downloader.time.time", lambda: 10_100.0) + + first = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file.pdf", referer=None) + ) + second = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file2.pdf", referer=None) + ) + + assert first[0].startswith(b"%PDF") + assert second[0].startswith(b"%PDF") + assert bootstraps == 1 + assert fetches == 2 + + +def test_browser_downloader_policy_routes_domain_to_browser(tmp_path, monkeypatch): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["eur-lex.europa.eu"]}, + "downloader": "browser", + "browser_timeout_ms": 1234, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + observed = {} + + async def _fake_download_browser_route(**kwargs): + observed.update(kwargs) + 
return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert observed["route_options"]["browser_timeout_ms"] == 1234 + + +def test_corpus_download_mode_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + result = corpus.download(input_parquet=input_parquet, download_mode="browser") + + assert observed["cls"] == "browser" + assert observed["kwargs"]["default_download_route"] == "browser" + assert bool(result["download_success"].iloc[0]) is True + assert (tmp_path / "download_results" / f"download_results_{input_parquet.name}").exists() + + +def test_corpus_browser_mode_alias_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + + def 
download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, browser_mode=True) + + assert observed["cls"] == "browser" + + +def test_corpus_policy_file_selects_browser_router(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://eur-lex.europa.eu/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + policy_path = tmp_path / "download_policy.yml" + policy_path.write_text( + "default:\n downloader: standard\nrules:\n - match:\n domains: [eur-lex.europa.eu]\n downloader: browser\n", + encoding="utf-8", + ) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://eur-lex.europa.eu/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, download_policy_file=policy_path) + + assert observed["kwargs"]["download_policy_file"] == policy_path.resolve() + assert observed["kwargs"]["default_download_route"] == "standard" diff --git a/tests/test_install_glossapi.py b/tests/test_install_glossapi.py new file mode 100644 index 0000000..5226429 --- /dev/null +++ b/tests/test_install_glossapi.py @@ -0,0 +1,51 @@ +from pathlib import Path + +from glossapi.scripts.install_glossapi import ( + build_deepseek_command, + build_install_plan, + 
build_pip_command, +) + + +def test_build_install_plan_collects_phase_extras(): + plan = build_install_plan( + phases=["download", "browser_download", "extract", "ocr"], + editable=True, + include_cuda=False, + ) + + assert plan.phases == ("download", "browser_download", "extract", "ocr") + assert set(plan.extras) == {"browser", "docling"} + assert plan.editable is True + assert plan.needs_deepseek_runtime is True + + +def test_build_install_plan_adds_cuda_extra(): + plan = build_install_plan( + phases=["download"], + editable=False, + include_cuda=True, + ) + + assert set(plan.extras) == {"cuda"} + assert plan.editable is False + assert plan.needs_deepseek_runtime is False + + +def test_build_pip_command_uses_editable_install(): + plan = build_install_plan( + phases=["download", "browser_download"], + editable=True, + include_cuda=False, + ) + command = build_pip_command(plan, Path("/tmp/repo")) + + assert command[:4] == [command[0], "-m", "pip", "install"] + assert "-e" in command + assert command[-1] == ".[browser]" + + +def test_build_deepseek_command_points_to_setup_script(): + command = build_deepseek_command(Path("/tmp/repo")) + + assert command is None or command[0] From 96241f97cdf7db51c1e4cdc56cdaff0af71fbbad Mon Sep 17 00:00:00 2001 From: adidev001 Date: Sat, 21 Mar 2026 02:07:27 +0530 Subject: [PATCH 11/93] docs: document pipeline artifact contract and runtime outputs --- docs/pipeline.md | 51 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/docs/pipeline.md b/docs/pipeline.md index cacc8c4..2f4b9dd 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -53,16 +53,21 @@ The `Corpus` class is the stable surface of the project. New functionality shoul ### 3. Clean - Main code: `Corpus.clean()` -- Purpose: run the Rust cleaner, compute quality/noise signals, and decide what should continue downstream. 
+- Purpose: run the Rust cleaner, remove low-quality or noisy markdown, + and mark documents that may need OCR retry before moving on. - Typical inputs: - `markdown/*.md` - - metadata parquet if one exists + - metadata parquet, if available - Important parameters: - `threshold` and `drop_bad` - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions - Main outputs: - cleaned markdown in `clean_markdown/` - - merged parquet metadata including OCR-related flags + - updated parquet metadata with quality and OCR-related flags +- Runtime/debug artifacts: + - `.processing_state.pkl` keeps track of progress so interrupted runs can resume + - `problematic_files/` keeps files that could not be cleaned successfully + - `timeout_files/` keeps files that exceeded the cleaning time limit ### 4. OCR Retry and Phase‑2 Enrichment @@ -91,26 +96,40 @@ The `Corpus` class is the stable surface of the project. New functionality shoul ## Artifact Layout -``` +The tree below shows the main folders and files GlossAPI can create under +the output directory. 
+ +To make the layout easier to follow, artifacts are grouped by the role they +play in the pipeline: + +- canonical — the main outputs a stage is expected to produce, and the + files later stages usually depend on +- runtime — state files used to resume work safely if a run is interrupted +- debug — extra files kept around when something fails or needs a closer look + OUT/ -├── downloads/ -│ └── problematic_math/ -├── download_results/ -├── markdown/ +├── downloads/ (canonical) +│ └── problematic_math/ (debug) +├── download_results/ (canonical) +├── markdown/ (canonical) +│ └── .md +├── clean_markdown/ (canonical) │ └── .md -├── json/ +├── json/ (canonical) │ ├── .docling.json(.zst) │ ├── .formula_index.jsonl │ ├── .latex_map.jsonl │ ├── metrics/ -│ ├── .metrics.json -│ └── .per_page.metrics.json -│ └── problematic_math/ -├── sections/ +│ │ ├── .metrics.json +│ │ └── .per_page.metrics.json +│ └── problematic_math/ (debug) +├── sections/ (canonical) │ └── sections_for_annotation.parquet -├── classified_sections.parquet -└── fully_annotated_sections.parquet -``` +├── classified_sections.parquet (canonical) +├── fully_annotated_sections.parquet (canonical) +├── .processing_state.pkl (runtime) +├── problematic_files/ (debug) +└── timeout_files/ (debug) Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). 
From 00aed533af81d314e7bdb7f905a0e46f565bbf3e Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 24 Mar 2026 15:33:50 +0200 Subject: [PATCH 12/93] Upgrade Docling and simplify OCR runtime --- .gitignore | 1 - README.md | 3 +- dependency_setup/deepseek_uv/pyproject.toml | 2 +- dependency_setup/deepseek_uv/uv.lock | 964 +----------------- dependency_setup/dependency_notes.md | 2 +- .../requirements-glossapi-docling.txt | 10 +- .../requirements-glossapi-vanilla.txt | 10 +- dependency_setup/setup_glossapi.sh | 3 - docs/api/corpus.md | 6 +- docs/api_corpus_tmp.md | 4 +- .../deepseek_only_upgrade_roadmap.md | 262 ----- docs/code_map.md | 23 +- docs/configuration.md | 12 + docs/getting_started.md | 1 + docs/multi_gpu.md | 3 +- docs/ocr_and_math_enhancement.md | 2 +- docs/pipeline.md | 6 +- docs/testing/compatibility_matrix.md | 8 +- pyproject.toml | 4 +- requirements.txt | 15 +- src/glossapi/corpus/corpus_orchestrator.py | 56 +- src/glossapi/corpus/phase_extract.py | 105 +- src/glossapi/gloss_extract.py | 34 +- src/glossapi/ocr/deepseek/runner.py | 5 +- src/glossapi/ocr/docling/pipeline.py | 47 + src/glossapi/ocr/docling_pipeline.py | 81 +- src/glossapi/scripts/ocr_gpu_batch.py | 14 +- tests/test_corpus_guards.py | 31 +- tests/test_ocr_dispatch_backends.py | 2 +- tests/test_pipeline_smoke.py | 4 - 30 files changed, 274 insertions(+), 1446 deletions(-) delete mode 100644 docs/architecture/deepseek_only_upgrade_roadmap.md diff --git a/.gitignore b/.gitignore index 929a8c5..74f3edc 100644 --- a/.gitignore +++ b/.gitignore @@ -82,4 +82,3 @@ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ deepseek-ocr-2/ -repro_rapidocr_onnx/ diff --git a/README.md b/README.md index 953c03b..04be81a 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `depend ``` `setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. 
`setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +The uv-managed DeepSeek runtime is OCR-only on purpose: it installs `glossapi[deepseek]` and does not carry the Docling layout stack. If you want a guided install that asks which phases you plan to use, run: @@ -153,7 +154,7 @@ Use this as the shortest path from a documentation concept to the public call th | Stage | Main call | Important parameters | Writes | | --- | --- | --- | --- | | Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"|"auto"|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | -| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `use_gpus`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, updated parquet metrics/flags | | OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/.md`, optional `json/.latex_map.jsonl` | | Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml index 809b499..a1caa65 100644 --- a/dependency_setup/deepseek_uv/pyproject.toml +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 
execution" requires-python = ">=3.11,<3.13" dependencies = [ - "glossapi[docling,deepseek]", + "glossapi[deepseek]", "torch==2.6.0", "torchvision==0.21.0", "torchaudio==2.6.0", diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock index f5eefaa..4f99980 100644 --- a/dependency_setup/deepseek_uv/uv.lock +++ b/dependency_setup/deepseek_uv/uv.lock @@ -119,15 +119,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - [[package]] name = "attrs" version = "25.4.0" @@ -137,19 +128,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, ] -[[package]] -name = "beautifulsoup4" -version = "4.14.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, - { name = "typing-extensions" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, -] - [[package]] name = "certifi" version = "2026.2.25" @@ -270,132 +248,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, ] -[[package]] -name = "dill" -version = "0.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = "2026-01-19T02:36:56.85Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, -] - -[[package]] -name = "docling" -version = "2.48.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "beautifulsoup4" }, - { name = "certifi" }, - { name = "docling-core", extra = ["chunking"] }, - { name = "docling-ibm-models" }, - { name = "docling-parse" }, - { name = 
"easyocr" }, - { name = "filetype" }, - { name = "huggingface-hub" }, - { name = "lxml" }, - { name = "marko" }, - { name = "openpyxl" }, - { name = "pandas" }, - { name = "pillow" }, - { name = "pluggy" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pylatexenc" }, - { name = "pypdfium2" }, - { name = "python-docx" }, - { name = "python-pptx" }, - { name = "requests" }, - { name = "rtree" }, - { name = "scipy" }, - { name = "tqdm" }, - { name = "typer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/be/32/e117cb0dcc76c93828d2cd9b45c3f8ccf6c86314a60e9c65f16067d3df26/docling-2.48.0.tar.gz", hash = "sha256:e94a5f75c544ec1bbb9169d2f4da72e1f497fd2fcda57cfacc454c93b1c92a8e", size = 189422, upload-time = "2025-08-26T05:31:02.666Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/32/a9c6677c66178a397b89b5b6fe1e7b3d3de98ddc2b331fbcd7440419b9f0/docling-2.48.0-py3-none-any.whl", hash = "sha256:8a1c1dfd5ed84cadb0f81fcb1464e5d501c4bfaa121e15306e09e3c0c983cc3e", size = 212266, upload-time = "2025-08-26T05:31:00.779Z" }, -] - -[[package]] -name = "docling-core" -version = "2.68.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "defusedxml" }, - { name = "jsonref" }, - { name = "jsonschema" }, - { name = "latex2mathml" }, - { name = "pandas" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "tabulate" }, - { name = "typer" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5e/b7/95e329d143528decd8f6af5d4db6c2d6bc3dc40f9d53ee5b7d5b901dfe11/docling_core-2.68.0.tar.gz", hash = "sha256:261ecb6281d45fcf0559640297eda728f8f7dd4fe8c8bf7ced42dbf9b4e46223", size = 267551, upload-time = "2026-03-07T12:20:24.523Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/66/d8bbe25dec2bb91d9090b939349b1c9b94c307edceada46c5bc6f213a569/docling_core-2.68.0-py3-none-any.whl", hash = 
"sha256:175145398c005399819a7cfe7b634257caaaecfbb4451840b8ddb31fc2f5ac12", size = 247092, upload-time = "2026-03-07T12:20:23.172Z" }, -] - -[package.optional-dependencies] -chunking = [ - { name = "semchunk" }, - { name = "transformers" }, - { name = "tree-sitter" }, - { name = "tree-sitter-c" }, - { name = "tree-sitter-javascript" }, - { name = "tree-sitter-python" }, - { name = "tree-sitter-typescript" }, -] - -[[package]] -name = "docling-ibm-models" -version = "3.11.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accelerate" }, - { name = "docling-core" }, - { name = "huggingface-hub" }, - { name = "jsonlines" }, - { name = "numpy" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "rtree" }, - { name = "safetensors", extra = ["torch"] }, - { name = "torch" }, - { name = "torchvision" }, - { name = "tqdm" }, - { name = "transformers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b6/91/f883e0a2b3466e1126dfd4463f386c70f5b90d271c27b6f5a97d2f8312e6/docling_ibm_models-3.11.0.tar.gz", hash = "sha256:454401563a8e79cb33b718bc559d9bacca8a0183583e48f8e616c9184c1f5eb1", size = 87721, upload-time = "2026-01-23T12:29:35.384Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/5d/97e9c2e10fbd3ee1723ac82c335f8211a9633c0397cc11ed057c3ba4006e/docling_ibm_models-3.11.0-py3-none-any.whl", hash = "sha256:68f7961069d643bfdab21b1c9ef24a979db293496f4c2283d95b1025a9ac5347", size = 87352, upload-time = "2026-01-23T12:29:34.045Z" }, -] - -[[package]] -name = "docling-parse" -version = "4.7.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "docling-core" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "pywin32", marker = "sys_platform == 'win32'" }, - { name = "tabulate" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bb/7a/653c3b11920113217724fab9b4740f9f8964864f92a2a27590accecec5ac/docling_parse-4.7.3.tar.gz", hash = 
"sha256:5936e6bcb7969c2a13f38ecc75cada3b0919422dc845e96da4b0b7b3bbc394ce", size = 67646746, upload-time = "2026-01-14T14:18:19.376Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/81/dd317e0bce475153dc08a60a9a8615b1a04d4d3c9803175e6cb7b7e9b49b/docling_parse-4.7.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66896bbe925073e4d48f18ec29dcd611a390d6b2378fae72125e77b020cd5664", size = 14615974, upload-time = "2026-01-14T14:17:30.246Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b5/088590e0b32fd0a393ca419c644d1435a1c99fa6b2a87888eef4d0fdea33/docling_parse-4.7.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:281347b3e937c1a5ffa6f8774ee603b64a0899fe8a6885573dec7eb48a3421d8", size = 14981051, upload-time = "2026-01-14T14:17:32.426Z" }, - { url = "https://files.pythonhosted.org/packages/b7/63/2b6c9127924487573d5419d58ec77955f0b7c0a923c8232ad461d71039aa/docling_parse-4.7.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3d86c51f9ce35a1b40b2f410f7271d9bd5fc58e7240f4cae7fdd2cef757e671", size = 15092586, upload-time = "2026-01-14T14:17:34.634Z" }, - { url = "https://files.pythonhosted.org/packages/af/89/ed27a83eb113bdf0b0f82f3c30a0db3c005df58b236f6487b232dacdb57a/docling_parse-4.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:3b04459cc97a8a4929622e341b9981e23987a63af07db599afc5e1c4d389060b", size = 16144866, upload-time = "2026-01-14T14:17:36.742Z" }, - { url = "https://files.pythonhosted.org/packages/d6/26/9d86ae12699a25b7233f76ce062253e9c14e57781e00166b792b3a9d56db/docling_parse-4.7.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d89231aa4fba3e38b80c11beb8edc07569e934c1f3935b51f57904fefe958ba5", size = 14616739, upload-time = "2026-01-14T14:17:38.567Z" }, - { url = "https://files.pythonhosted.org/packages/f2/fd/1aebb8a7f15d658f3be858ddbbc4ef7206089d540a7df0dcd4b846b99901/docling_parse-4.7.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:dffd19ed373b0da5cea124606b183489a8686c3d18643e94485be1bdda5713ea", size = 14980782, upload-time = "2026-01-14T14:17:40.659Z" }, - { url = "https://files.pythonhosted.org/packages/3e/47/a722527c9f89c65f69f8a463be4f12ad73bae18132f29d8de8b2d9f6f082/docling_parse-4.7.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc32b6f25a673e41b9a8112b6b841284f60dbac9427b7848a03b435460f74aee", size = 15092450, upload-time = "2026-01-14T14:17:42.838Z" }, - { url = "https://files.pythonhosted.org/packages/91/c7/316373a92ba42c2aeaee128fc77a34333449fe3e820b9d524e0ee396ea35/docling_parse-4.7.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef691045623863624f2cb7347572d0262a53cb84940ef7dd851d9f13a2eb8833", size = 16147359, upload-time = "2026-01-14T14:17:44.906Z" }, -] - [[package]] name = "easydict" version = "1.13" @@ -405,28 +257,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = "2024-03-04T12:04:39.508Z" }, ] -[[package]] -name = "easyocr" -version = "1.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ninja" }, - { name = "numpy" }, - { name = "opencv-python-headless" }, - { name = "pillow" }, - { name = "pyclipper" }, - { name = "python-bidi" }, - { name = "pyyaml" }, - { name = "scikit-image" }, - { name = "scipy" }, - { name = "shapely" }, - { name = "torch" }, - { name = "torchvision" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/84/4a2cab0e6adde6a85e7ba543862e5fc0250c51f3ac721a078a55cdcff250/easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c", size = 2870178, upload-time = "2024-09-24T11:34:43.554Z" }, -] - [[package]] name = "einops" version = "0.8.2" @@ -436,15 +266,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, ] -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, -] - [[package]] name = "filelock" version = "3.25.0" @@ -454,15 +275,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, ] -[[package]] -name = "filetype" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = 
"sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, -] - [[package]] name = "fonttools" version = "4.61.1" @@ -597,9 +409,6 @@ deepseek = [ { name = "tokenizers" }, { name = "transformers" }, ] -docling = [ - { name = "docling" }, -] [package.metadata] requires-dist = [ @@ -608,7 +417,7 @@ requires-dist = [ { name = "aiofiles", specifier = ">=23.0.0" }, { name = "aiohttp", specifier = ">=3.8.0" }, { name = "dask", specifier = ">=2022.1.0" }, - { name = "docling", marker = "extra == 'docling'", specifier = "==2.48.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.81.0" }, { name = "easydict", marker = "extra == 'deepseek'" }, { name = "einops", marker = "extra == 'deepseek'" }, { name = "ftfy", specifier = ">=6.0.0" }, @@ -616,9 +425,10 @@ requires-dist = [ { name = "joblib", specifier = ">=1.0.0" }, { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, - { name = "numpy", specifier = "<2" }, + { name = "numpy", specifier = ">=1.26,<3" }, { name = "pandas", specifier = ">=1.3.0" }, { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "playwright", marker = "extra == 'browser'", specifier = ">=1.52,<2" }, { name = "pyarrow", specifier = ">=7.0.0" }, { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, { name = "pypdfium2", specifier = ">=4.0.0" }, @@ -632,14 +442,14 @@ requires-dist = [ { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, { name = "zstandard", specifier = ">=0.22.0" }, ] -provides-extras = ["docling", "cuda", "deepseek", "docs"] +provides-extras = ["browser", "docling", "cuda", "deepseek", "docs"] [[package]] name = "glossapi-deepseek-runtime" version = "0.1.0" source = { virtual = "." 
} dependencies = [ - { name = "glossapi", extra = ["deepseek", "docling"] }, + { name = "glossapi", extra = ["deepseek"] }, { name = "torch" }, { name = "torchaudio" }, { name = "torchvision" }, @@ -653,7 +463,7 @@ test = [ [package.metadata] requires-dist = [ - { name = "glossapi", extras = ["docling", "deepseek"], editable = "../../" }, + { name = "glossapi", extras = ["deepseek"], editable = "../../" }, { name = "torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, @@ -709,19 +519,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] -[[package]] -name = "imageio" -version = "2.37.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "pillow" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/6f/606be632e37bf8d05b253e8626c2291d74c691ddc7bcdf7d6aaf33b32f6a/imageio-2.37.2.tar.gz", hash = "sha256:0212ef2727ac9caa5ca4b2c75ae89454312f440a756fcfc8ef1993e718f50f8a", size = 389600, upload-time = "2025-11-04T14:29:39.898Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/fe/301e0936b79bcab4cacc7548bf2853fc28dced0a578bab1f7ef53c9aa75b/imageio-2.37.2-py3-none-any.whl", hash = "sha256:ad9adfb20335d718c03de457358ed69f141021a333c40a53e57273d8a5bd0b9b", size = 317646, upload-time = "2025-11-04T14:29:37.948Z" }, -] - [[package]] name = "img2pdf" version = "0.6.3" @@ -777,75 +574,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] -[[package]] -name = "jsonlines" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/35/87/bcda8e46c88d0e34cad2f09ee2d0c7f5957bccdb9791b0b934ec84d84be4/jsonlines-4.0.0.tar.gz", hash = "sha256:0c6d2c09117550c089995247f605ae4cf77dd1533041d366351f6f298822ea74", size = 11359, upload-time = "2023-09-01T12:34:44.187Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/62/d9ba6323b9202dd2fe166beab8a86d29465c41a0288cbe229fac60c1ab8d/jsonlines-4.0.0-py3-none-any.whl", hash = "sha256:185b334ff2ca5a91362993f42e83588a360cf95ce4b71a73548502bda52a7c55", size = 8701, upload-time = "2023-09-01T12:34:42.563Z" }, -] - -[[package]] -name = "jsonref" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814, upload-time = "2023-01-16T16:10:04.455Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425, upload-time = "2023-01-16T16:10:02.255Z" }, -] - -[[package]] -name = "jsonschema" -version = "4.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "jsonschema-specifications" }, - { name = "referencing" }, - { name = "rpds-py" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, -] - -[[package]] -name = "jsonschema-specifications" -version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "referencing" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, -] - -[[package]] -name = "latex2mathml" -version = "3.78.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1a/26/57b1034c08922d0aefea79430a5e0006ffaee4f0ec59d566613f667ab2f7/latex2mathml-3.78.1.tar.gz", hash = "sha256:f941db80bf41db33f31df87b304e8b588f8166b813b0257c11c98f7a9d0aac71", size = 74030, upload-time = "2025-08-29T23:34:23.178Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/76/d661ea2e529c3d464f9efd73f9ac31626b45279eb4306e684054ea20e3d4/latex2mathml-3.78.1-py3-none-any.whl", hash = 
"sha256:f089b6d75e85b937f99693c93e8c16c0804008672c3dd2a3d25affd36f238100", size = 73892, upload-time = "2025-08-29T23:34:21.98Z" }, -] - -[[package]] -name = "lazy-loader" -version = "0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "packaging" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/49/ac/21a1f8aa3777f5658576777ea76bfb124b702c520bbe90edf4ae9915eafa/lazy_loader-0.5.tar.gz", hash = "sha256:717f9179a0dbed357012ddad50a5ad3d5e4d9a0b8712680d4e687f5e6e6ed9b3", size = 15294, upload-time = "2026-03-06T15:45:09.054Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, -] - [[package]] name = "locket" version = "1.0.0" @@ -897,27 +625,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, ] -[[package]] -name = "markdown-it-py" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = 
"2025-08-11T12:57:51.923Z" }, -] - -[[package]] -name = "marko" -version = "2.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/2f/050b6d485f052ddf17d76a41f9334d6fb2a8a85df35347a12d97ed3bc5c1/marko-2.2.2.tar.gz", hash = "sha256:6940308e655f63733ca518c47a68ec9510279dbb916c83616e4c4b5829f052e8", size = 143641, upload-time = "2026-01-05T11:04:41.935Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/f8/36d79bac5701e6786f9880c61bbe57574760a13c1af84ab71e5ed21faecc/marko-2.2.2-py3-none-any.whl", hash = "sha256:f064ae8c10416285ad1d96048dc11e98ef04e662d3342ae416f662b70aa7959e", size = 42701, upload-time = "2026-01-05T11:04:40.75Z" }, -] - [[package]] name = "markupsafe" version = "3.0.3" @@ -948,34 +655,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, ] -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - -[[package]] -name = "mpire" -version = "2.10.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pygments" }, - { name = "pywin32", marker = "sys_platform == 'win32'" 
}, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3a/93/80ac75c20ce54c785648b4ed363c88f148bf22637e10c9863db4fbe73e74/mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97", size = 271270, upload-time = "2024-05-07T14:00:31.815Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/14/1db1729ad6db4999c3a16c47937d601fcb909aaa4224f5eca5a2f145a605/mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb", size = 272756, upload-time = "2024-05-07T14:00:29.633Z" }, -] - -[package.optional-dependencies] -dill = [ - { name = "multiprocess" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -1030,24 +709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, ] -[[package]] -name = "multiprocess" -version = "0.70.19" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "dill" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" }, - { url = 
"https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = "2026-01-19T06:47:26.636Z" }, - { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" }, - { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" }, - { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" }, - { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" }, - { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477, upload-time = "2026-01-19T06:47:38.619Z" }, -] - [[package]] name = "networkx" version = "3.6.1" @@ -1057,32 +718,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ] -[[package]] -name = "ninja" -version = "1.13.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" }, - { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, - { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, - { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, - { url = 
"https://files.pythonhosted.org/packages/79/22/d1de07632b78ac8e6b785f41fa9aad7a978ec8c0a1bf15772def36d77aac/ninja-1.13.0-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:1c97223cdda0417f414bf864cfb73b72d8777e57ebb279c5f6de368de0062988", size = 179034, upload-time = "2025-08-11T15:09:57.394Z" }, - { url = "https://files.pythonhosted.org/packages/ed/de/0e6edf44d6a04dabd0318a519125ed0415ce437ad5a1ec9b9be03d9048cf/ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fb46acf6b93b8dd0322adc3a4945452a4e774b75b91293bafcc7b7f8e6517dfa", size = 180716, upload-time = "2025-08-11T15:09:58.696Z" }, - { url = "https://files.pythonhosted.org/packages/54/28/938b562f9057aaa4d6bfbeaa05e81899a47aebb3ba6751e36c027a7f5ff7/ninja-1.13.0-py3-none-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4be9c1b082d244b1ad7ef41eb8ab088aae8c109a9f3f0b3e56a252d3e00f42c1", size = 146843, upload-time = "2025-08-11T15:10:00.046Z" }, - { url = "https://files.pythonhosted.org/packages/2a/fb/d06a3838de4f8ab866e44ee52a797b5491df823901c54943b2adb0389fbb/ninja-1.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:6739d3352073341ad284246f81339a384eec091d9851a886dfa5b00a6d48b3e2", size = 154402, upload-time = "2025-08-11T15:10:01.657Z" }, - { url = "https://files.pythonhosted.org/packages/31/bf/0d7808af695ceddc763cf251b84a9892cd7f51622dc8b4c89d5012779f06/ninja-1.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11be2d22027bde06f14c343f01d31446747dbb51e72d00decca2eb99be911e2f", size = 552388, upload-time = "2025-08-11T15:10:03.349Z" }, - { url = "https://files.pythonhosted.org/packages/9d/70/c99d0c2c809f992752453cce312848abb3b1607e56d4cd1b6cded317351a/ninja-1.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:aa45b4037b313c2f698bc13306239b8b93b4680eb47e287773156ac9e9304714", size = 472501, upload-time = "2025-08-11T15:10:04.735Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/43/c217b1153f0e499652f5e0766da8523ce3480f0a951039c7af115e224d55/ninja-1.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f8e1e8a1a30835eeb51db05cf5a67151ad37542f5a4af2a438e9490915e5b72", size = 638280, upload-time = "2025-08-11T15:10:06.512Z" }, - { url = "https://files.pythonhosted.org/packages/8c/45/9151bba2c8d0ae2b6260f71696330590de5850e5574b7b5694dce6023e20/ninja-1.13.0-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:3d7d7779d12cb20c6d054c61b702139fd23a7a964ec8f2c823f1ab1b084150db", size = 642420, upload-time = "2025-08-11T15:10:08.35Z" }, - { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, - { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, - { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, - { url = "https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, - { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = 
"sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, - { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, -] - [[package]] name = "numpy" version = "1.26.4" @@ -1210,35 +845,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, ] -[[package]] -name = "opencv-python-headless" -version = "4.11.0.86" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/36/2f/5b2b3ba52c864848885ba988f24b7f105052f68da9ab0e693cc7c25b0b30/opencv-python-headless-4.11.0.86.tar.gz", hash = "sha256:996eb282ca4b43ec6a3972414de0e2331f5d9cda2b41091a49739c19fb843798", size = 95177929, upload-time = "2025-01-16T13:53:40.22Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/53/2c50afa0b1e05ecdb4603818e85f7d174e683d874ef63a6abe3ac92220c8/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:48128188ade4a7e517237c8e1e11a9cdf5c282761473383e77beb875bb1e61ca", size = 37326460, upload-time = "2025-01-16T13:52:57.015Z" }, - { url = "https://files.pythonhosted.org/packages/3b/43/68555327df94bb9b59a1fd645f63fafb0762515344d2046698762fc19d58/opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:a66c1b286a9de872c343ee7c3553b084244299714ebb50fbdcd76f07ebbe6c81", size = 56723330, upload-time = "2025-01-16T13:55:45.731Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/be/1438ce43ebe65317344a87e4b150865c5585f4c0db880a34cdae5ac46881/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6efabcaa9df731f29e5ea9051776715b1bdd1845d7c9530065c7951d2a2899eb", size = 29487060, upload-time = "2025-01-16T13:51:59.625Z" }, - { url = "https://files.pythonhosted.org/packages/dd/5c/c139a7876099916879609372bfa513b7f1257f7f1a908b0bdc1c2328241b/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e0a27c19dd1f40ddff94976cfe43066fbbe9dfbb2ec1907d66c19caef42a57b", size = 49969856, upload-time = "2025-01-16T13:53:29.654Z" }, - { url = "https://files.pythonhosted.org/packages/95/dd/ed1191c9dc91abcc9f752b499b7928aacabf10567bb2c2535944d848af18/opencv_python_headless-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:f447d8acbb0b6f2808da71fddd29c1cdd448d2bc98f72d9bb78a7a898fc9621b", size = 29324425, upload-time = "2025-01-16T13:52:49.048Z" }, - { url = "https://files.pythonhosted.org/packages/86/8a/69176a64335aed183529207ba8bc3d329c2999d852b4f3818027203f50e6/opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:6c304df9caa7a6a5710b91709dd4786bf20a74d57672b3c31f7033cc638174ca", size = 39402386, upload-time = "2025-01-16T13:52:56.418Z" }, -] - -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = 
"sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, -] - [[package]] name = "packaging" version = "26.0" @@ -1433,111 +1039,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, ] -[[package]] -name = "pyclipper" -version = "1.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/21/3c06205bb407e1f79b73b7b4dfb3950bd9537c4f625a68ab5cc41177f5bc/pyclipper-1.4.0.tar.gz", hash = "sha256:9882bd889f27da78add4dd6f881d25697efc740bf840274e749988d25496c8e1", size = 54489, upload-time = "2025-12-01T13:15:35.015Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/de/e3/64cf7794319b088c288706087141e53ac259c7959728303276d18adc665d/pyclipper-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:adcb7ca33c5bdc33cd775e8b3eadad54873c802a6d909067a57348bcb96e7a2d", size = 264281, upload-time = "2025-12-01T13:14:55.47Z" }, - { url = "https://files.pythonhosted.org/packages/34/cd/44ec0da0306fa4231e76f1c2cb1fa394d7bde8db490a2b24d55b39865f69/pyclipper-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fd24849d2b94ec749ceac7c34c9f01010d23b6e9d9216cf2238b8481160e703d", size = 139426, upload-time = "2025-12-01T13:14:56.683Z" }, - { url = "https://files.pythonhosted.org/packages/ad/88/d8f6c6763ea622fe35e19c75d8b39ed6c55191ddc82d65e06bc46b26cb8e/pyclipper-1.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1b6c8d75ba20c6433c9ea8f1a0feb7e4d3ac06a09ad1fd6d571afc1ddf89b869", size = 989649, upload-time = "2025-12-01T13:14:58.28Z" }, - { url = 
"https://files.pythonhosted.org/packages/ff/e9/ea7d68c8c4af3842d6515bedcf06418610ad75f111e64c92c1d4785a1513/pyclipper-1.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e29d7443d7cc0e83ee9daf43927730386629786d00c63b04fe3b53ac01462c", size = 962842, upload-time = "2025-12-01T13:15:00.044Z" }, - { url = "https://files.pythonhosted.org/packages/4e/b7/0b4a272d8726e51ab05e2b933d8cc47f29757fb8212e38b619e170e6015c/pyclipper-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a8d2b5fb75ebe57e21ce61e79a9131edec2622ff23cc665e4d1d1f201bc1a801", size = 95098, upload-time = "2025-12-01T13:15:01.359Z" }, - { url = "https://files.pythonhosted.org/packages/3a/76/4901de2919198bb2bd3d989f86d4a1dff363962425bb2d63e24e6c990042/pyclipper-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:e9b973467d9c5fa9bc30bb6ac95f9f4d7c3d9fc25f6cf2d1cc972088e5955c01", size = 104362, upload-time = "2025-12-01T13:15:02.439Z" }, - { url = "https://files.pythonhosted.org/packages/90/1b/7a07b68e0842324d46c03e512d8eefa9cb92ba2a792b3b4ebf939dafcac3/pyclipper-1.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:222ac96c8b8281b53d695b9c4fedc674f56d6d4320ad23f1bdbd168f4e316140", size = 265676, upload-time = "2025-12-01T13:15:04.15Z" }, - { url = "https://files.pythonhosted.org/packages/6b/dd/8bd622521c05d04963420ae6664093f154343ed044c53ea260a310c8bb4d/pyclipper-1.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f3672dbafbb458f1b96e1ee3e610d174acb5ace5bd2ed5d1252603bb797f2fc6", size = 140458, upload-time = "2025-12-01T13:15:05.76Z" }, - { url = "https://files.pythonhosted.org/packages/7a/06/6e3e241882bf7d6ab23d9c69ba4e85f1ec47397cbbeee948a16cf75e21ed/pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d1f807e2b4760a8e5c6d6b4e8c1d71ef52b7fe1946ff088f4fa41e16a881a5ca", size = 978235, upload-time = "2025-12-01T13:15:06.993Z" }, - { url = 
"https://files.pythonhosted.org/packages/cf/f4/3418c1cd5eea640a9fa2501d4bc0b3655fa8d40145d1a4f484b987990a75/pyclipper-1.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce1f83c9a4e10ea3de1959f0ae79e9a5bd41346dff648fee6228ba9eaf8b3872", size = 961388, upload-time = "2025-12-01T13:15:08.467Z" }, - { url = "https://files.pythonhosted.org/packages/ac/94/c85401d24be634af529c962dd5d781f3cb62a67cd769534df2cb3feee97a/pyclipper-1.4.0-cp312-cp312-win32.whl", hash = "sha256:3ef44b64666ebf1cb521a08a60c3e639d21b8c50bfbe846ba7c52a0415e936f4", size = 95169, upload-time = "2025-12-01T13:15:10.098Z" }, - { url = "https://files.pythonhosted.org/packages/97/77/dfea08e3b230b82ee22543c30c35d33d42f846a77f96caf7c504dd54fab1/pyclipper-1.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:d1e5498d883b706a4ce636247f0d830c6eb34a25b843a1b78e2c969754ca9037", size = 104619, upload-time = "2025-12-01T13:15:11.592Z" }, - { url = "https://files.pythonhosted.org/packages/18/59/81050abdc9e5b90ffc2c765738c5e40e9abd8e44864aaa737b600f16c562/pyclipper-1.4.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98b2a40f98e1fc1b29e8a6094072e7e0c7dfe901e573bf6cfc6eb7ce84a7ae87", size = 126495, upload-time = "2025-12-01T13:15:33.743Z" }, -] - -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = 
"sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, - { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, - { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, - { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, - { url = 
"https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, - { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, - { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, - { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, - { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, - { url = 
"https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, - { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, - { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, - { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, 
upload-time = "2025-11-04T13:40:12.004Z" }, - { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, - { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, - { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, - { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, - { url = 
"https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, - { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = 
"sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, - { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, - { url = 
"https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, - { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, - { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, -] - -[[package]] -name = "pydantic-settings" -version = "2.13.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "typing-inspection" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = "2026-02-19T13:45:08.055Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, -] - [[package]] name = "pygments" version = "2.19.2" @@ -1547,12 +1048,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] -[[package]] -name = "pylatexenc" -version = "2.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" } - [[package]] name = "pymupdf" version = "1.24.10" @@ -1629,54 +1124,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] -[[package]] -name = "python-bidi" -version = "0.6.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/ed/e3/c0c8bf6fca79ac946a28d57f116e3b9e5b10a4469b6f70bf73f3744c49bf/python_bidi-0.6.7.tar.gz", hash = "sha256:c10065081c0e137975de5d9ba2ff2306286dbf5e0c586d4d5aec87c856239b41", size = 45503, upload-time = "2025-10-22T09:52:49.624Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/de/c30a13ad95239507af472a5fc2cadd2e5e172055068f12ac39b37922c7f8/python_bidi-0.6.7-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a8892a7da0f617135fe9c92dc7070d13a0f96ab3081f9db7ff5b172a3905bd78", size = 274420, upload-time = "2025-10-22T09:51:58.262Z" }, - { url = "https://files.pythonhosted.org/packages/ad/9f/be5efef7eea5f1e2a6415c4052a988f594dcf5a11a15103f2718d324a35b/python_bidi-0.6.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:06650a164e63e94dc8a291cc9d415b4027cb1cce125bc9b02dac0f34d535ed47", size = 264586, upload-time = "2025-10-22T09:51:49.255Z" }, - { url = "https://files.pythonhosted.org/packages/87/ec/2c374b6de35870817ffb3512c0666ea8c3794ef923b5586c69451e0e5395/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6df7be07af867ec1d121c92ea827efad4d77b25457c06eeab477b601e82b2340", size = 293672, upload-time = "2025-10-22T09:50:58.504Z" }, - { url = "https://files.pythonhosted.org/packages/29/1a/722d7d7128bdc9a530351a0d2fdf2ff5f4af66a865a6bca925f99832e2cc/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:73a88dc333efc42281bd800d5182c8625c6e11d109fc183fe3d7a11d48ab1150", size = 302643, upload-time = "2025-10-22T09:51:06.419Z" }, - { url = "https://files.pythonhosted.org/packages/24/d7/5b9b593dd58fc745233d8476e9f4e0edd437547c78c58340619868470349/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f24189dc3aea3a0a94391a047076e1014306b39ba17d7a38ebab510553cd1a97", size = 441692, upload-time = "2025-10-22T09:51:15.39Z" }, - { url = 
"https://files.pythonhosted.org/packages/08/b9/16e7a1db5f022da6654e89875d231ec2e044d42ef7b635feeff61cee564c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a507fe6928a27a308e04ebf2065719b7850d1bf9ff1924f4e601ef77758812bd", size = 326933, upload-time = "2025-10-22T09:51:23.631Z" }, - { url = "https://files.pythonhosted.org/packages/e0/a6/45aaec301292c6a07a9cc3168f5d1a92c8adc2ef36a3cd1f227b9caa980c/python_bidi-0.6.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbbffb948a32f9783d1a28bc0c53616f0a76736ed1e7c1d62e3e99a8dfaab869", size = 302034, upload-time = "2025-10-22T09:51:41.347Z" }, - { url = "https://files.pythonhosted.org/packages/71/a3/7e42cce6e153c21b4e5cc96d429a5910909823f6fedd174b64ff67bc76a7/python_bidi-0.6.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7e507e1e798ebca77ddc9774fd405107833315ad802cfdaa1ab07b6d9154fc8", size = 315738, upload-time = "2025-10-22T09:51:33.409Z" }, - { url = "https://files.pythonhosted.org/packages/43/7c/a5e4c0acc8e6ca61953b4add0576f0483f63b809b5389154e5da13927b0b/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:849a57d39feaf897955d0b19bbf4796bea53d1bcdf83b82e0a7b059167eb2049", size = 473968, upload-time = "2025-10-22T09:52:07.624Z" }, - { url = "https://files.pythonhosted.org/packages/b1/aa/a18bc3cbab7a0e598cbe7b89f2c0913aedcc66dcafce9a4c357465c87859/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5ebc19f24e65a1f5c472e26d88e78b9d316e293bc6f205f32de4c4e99276336e", size = 567038, upload-time = "2025-10-22T09:52:18.594Z" }, - { url = "https://files.pythonhosted.org/packages/92/46/fc6c54a8b5bfbee50e650f885ddef4f8c4f92880467ea0bc2bf133747048/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:24388c77cb00b8aa0f9c84beb7e3e523a3dac4f786ece64a1d8175a07b24da72", size = 493970, upload-time = "2025-10-22T09:52:29.815Z" }, - { url = 
"https://files.pythonhosted.org/packages/e3/f1/2c15f5b938b2e087e4e950cc14dcead5bedbaabfc6c576dac15739bc0c91/python_bidi-0.6.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:19737d217088ef27014f98eac1827c5913e6fb1dea96332ed84ede61791070d9", size = 465161, upload-time = "2025-10-22T09:52:40.517Z" }, - { url = "https://files.pythonhosted.org/packages/56/d7/73a70a1fb819152485521b8dfe627e14ba9d3d5a65213244ab099adf3600/python_bidi-0.6.7-cp311-cp311-win32.whl", hash = "sha256:95c9de7ebc55ffb777548f2ecaf4b96b0fa0c92f42bf4d897b9f4cd164ec7394", size = 157033, upload-time = "2025-10-22T09:52:59.228Z" }, - { url = "https://files.pythonhosted.org/packages/68/84/06999dc54ea047fe33209af7150df4202ab7ad52deeb66b2c2040ac07884/python_bidi-0.6.7-cp311-cp311-win_amd64.whl", hash = "sha256:898db0ea3e4aaa95b7fecba02a7560dfbf368f9d85053f2875f6d610c4d4ec2c", size = 161282, upload-time = "2025-10-22T09:52:51.467Z" }, - { url = "https://files.pythonhosted.org/packages/e5/03/5b2f3e73501d0f41ebc2b075b49473047c6cdfc3465cf890263fc69e3915/python_bidi-0.6.7-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:11c51579e01f768446a7e13a0059fea1530936a707abcbeaad9467a55cb16073", size = 272536, upload-time = "2025-10-22T09:51:59.721Z" }, - { url = "https://files.pythonhosted.org/packages/31/77/c6048e938a73e5a7c6fa3d5e3627a5961109daa728c2e7d050567cecdc26/python_bidi-0.6.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47deaada8949af3a790f2cd73b613f9bfa153b4c9450f91c44a60c3109a81f73", size = 263258, upload-time = "2025-10-22T09:51:50.328Z" }, - { url = "https://files.pythonhosted.org/packages/57/56/ed4dc501cab7de70ce35cd435c86278e4eb1caf238c80bc72297767c9219/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b38ddfab41d10e780edb431edc30aec89bee4ce43d718e3896e99f33dae5c1d3", size = 292700, upload-time = "2025-10-22T09:50:59.628Z" }, - { url = 
"https://files.pythonhosted.org/packages/77/6a/1bf06d7544c940ffddd97cd0e02c55348a92163c5495fa18e34217dfbebe/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a93b0394cc684d64356b0475858c116f1e335ffbaba388db93bf47307deadfa", size = 300881, upload-time = "2025-10-22T09:51:07.507Z" }, - { url = "https://files.pythonhosted.org/packages/22/1d/ce7577a8f50291c06e94f651ac5de0d1678fc2642af26a5dad9901a0244f/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec1694134961b71ac05241ac989b49ccf08e232b5834d5fc46f8a7c3bb1c13a9", size = 439125, upload-time = "2025-10-22T09:51:16.559Z" }, - { url = "https://files.pythonhosted.org/packages/a3/87/4cf6dcd58e22f0fd904e7a161c6b73a5f9d17d4d49073fcb089ba62f1469/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8047c33b85f7790474a1f488bef95689f049976a4e1c6f213a8d075d180a93e4", size = 325816, upload-time = "2025-10-22T09:51:25.12Z" }, - { url = "https://files.pythonhosted.org/packages/2a/0a/4028a088e29ce8f1673e85ec9f64204fc368355c3207e6a71619c2b4579a/python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d9de35eb5987da27dd81e371c52142dd8e924bd61c1006003071ea05a735587", size = 300550, upload-time = "2025-10-22T09:51:42.739Z" }, - { url = "https://files.pythonhosted.org/packages/1f/05/cac15eba462d5a2407ac4ef1c792c45a948652b00c6bd81eaab3834a62d2/python_bidi-0.6.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a99d898ad1a399d9c8cab5561b3667fd24f4385820ac90c3340aa637aa5adfc9", size = 313017, upload-time = "2025-10-22T09:51:34.905Z" }, - { url = "https://files.pythonhosted.org/packages/4b/b1/3ba91b9ea60fa54a9aa730a5fe432bd73095d55be371244584fc6818eae1/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5debaab33562fdfc79ffdbd8d9c51cf07b8529de0e889d8cd145d78137aab21e", size = 472798, upload-time = "2025-10-22T09:52:09.079Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/40/4bf5fb7255e35c218174f322a4d4c80b63b2604d73adc6e32f843e700824/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c11c62a3cdb9d1426b1536de9e3446cb09c7d025bd4df125275cae221f214899", size = 565234, upload-time = "2025-10-22T09:52:19.703Z" }, - { url = "https://files.pythonhosted.org/packages/bd/81/ad23fb85bff69d0a25729cd3834254b87c3c7caa93d657c8f8edcbed08f6/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6c051f2d28ca542092d01da8b5fe110fb6191ff58d298a54a93dc183bece63bf", size = 491844, upload-time = "2025-10-22T09:52:31.216Z" }, - { url = "https://files.pythonhosted.org/packages/65/85/103baaf142b2838f583b71904a2454fa31bd2a912ff505c25874f45d6c3e/python_bidi-0.6.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:95867a07c5dee0ea2340fe1d0e4f6d9f5c5687d473193b6ee6f86fa44aac45d1", size = 463753, upload-time = "2025-10-22T09:52:41.943Z" }, - { url = "https://files.pythonhosted.org/packages/54/c3/6a5c3b9f42a6b188430c83a7e70a76bc7c0db3354302fce7c8ed94a0c062/python_bidi-0.6.7-cp312-cp312-win32.whl", hash = "sha256:4c73cd980d45bb967799c7f0fc98ea93ae3d65b21ef2ba6abef6a057720bf483", size = 155820, upload-time = "2025-10-22T09:53:00.254Z" }, - { url = "https://files.pythonhosted.org/packages/45/c4/683216398ee3abf6b9bb0f26ae15c696fabbe36468ba26d5271f0c11b343/python_bidi-0.6.7-cp312-cp312-win_amd64.whl", hash = "sha256:d524a4ba765bae9b950706472a77a887a525ed21144fe4b41f6190f6e57caa2c", size = 159966, upload-time = "2025-10-22T09:52:52.547Z" }, - { url = "https://files.pythonhosted.org/packages/b8/4e/6135798d84b62eea70c0f9435301c2a4ba854e87be93a3fcd1d935266d24/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c9a679b24f5c6f366a0dec75745e1abeae2f597f033d0d54c74cbe62e7e6ae28", size = 276275, upload-time = "2025-10-22T09:52:05.078Z" }, - { url = 
"https://files.pythonhosted.org/packages/74/83/2123596d43e552af9e2806e361646fa579f34a1d1e9e2c1707a0ab6a02dd/python_bidi-0.6.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:05fe5971110013610f0db40505d0b204edc756e92eafac1372a464f8b9162b11", size = 266951, upload-time = "2025-10-22T09:51:56.216Z" }, - { url = "https://files.pythonhosted.org/packages/5c/8c/8d1e1501717227a6d52fc7b9c47a3de61486b024fbdd4821bfad724c0699/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17572944e6d8fb616d111fc702c759da2bf7cedab85a3e4fa2af0c9eb95ed438", size = 295745, upload-time = "2025-10-22T09:51:04.438Z" }, - { url = "https://files.pythonhosted.org/packages/fd/ff/ef04e7f9067c2c5d862b9f8d9a192486c500c8aa295f0fb756c25ab47fc8/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3b63d19f3f56ff7f99bce5ca9ef8c811dbf0f509d8e84c1bc06105ed26a49528", size = 304123, upload-time = "2025-10-22T09:51:12.559Z" }, - { url = "https://files.pythonhosted.org/packages/be/72/b973895e257a7d4cc8365ab094612f6ee885df863a4964d8865b9f534b67/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1350033431d75be749273236dcfc808e54404cd6ece6204cdb1bc4ccc163455", size = 442484, upload-time = "2025-10-22T09:51:21.575Z" }, - { url = "https://files.pythonhosted.org/packages/c1/1a/68ca9d10bc309828e8cdb2d57a30dd7e5753ac8520c8d7a0322daeb9eef7/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c5fb99f774748de283fadf915106f130b74be1bade934b7f73a7a8488b95da1", size = 329149, upload-time = "2025-10-22T09:51:31.232Z" }, - { url = "https://files.pythonhosted.org/packages/03/40/ab450c06167a7de596d99b1ba5cee2c605b3ff184baccf08210ede706b1b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d28e2bdcadf5b6161bb4ee9313ce41eac746ba57e744168bf723a415a11af05", size = 303529, 
upload-time = "2025-10-22T09:51:46.997Z" }, - { url = "https://files.pythonhosted.org/packages/ec/c5/585b5c413e3b77a32500fb877ea30aa23c45a6064dbd7fe77d87b72cd90b/python_bidi-0.6.7-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3777ae3e088e94df854fbcbd8d59f9239b74aac036cb6bbd19f8035c8e42478", size = 317753, upload-time = "2025-10-22T09:51:39.272Z" }, - { url = "https://files.pythonhosted.org/packages/f9/05/b7b4b447890d614ccb40633f4d65f334bcf9fe3ad13be33aaa54dcbc34f3/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:77bb4cbadf4121db395189065c58c9dd5d1950257cc1983004e6df4a3e2f97ad", size = 476054, upload-time = "2025-10-22T09:52:15.856Z" }, - { url = "https://files.pythonhosted.org/packages/ca/94/64f6d2c09c4426918345b54ca8902f94b663eadd744c9dd89070f546c9bc/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:f1fe71c203f66bc169a393964d5702f9251cfd4d70279cb6453fdd42bd2e675f", size = 568365, upload-time = "2025-10-22T09:52:27.556Z" }, - { url = "https://files.pythonhosted.org/packages/fc/d2/c39a6b82aa0fcedac7cbe6078b78bb9089b43d903f8e00859e42b504bb8e/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:d87ed09e5c9b6d2648e8856a4e556147b9d3cd4d63905fa664dd6706bc414256", size = 495292, upload-time = "2025-10-22T09:52:38.306Z" }, - { url = "https://files.pythonhosted.org/packages/0a/8d/a80f37ab92118e305d7b574306553599f81534c50b4eb23ef34ebe09c09c/python_bidi-0.6.7-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:766d5f5a686eb99b53168a7bdfb338035931a609bdbbcb537cef9e050a86f359", size = 467159, upload-time = "2025-10-22T09:52:48.603Z" }, -] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1689,43 +1136,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = 
"sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] -[[package]] -name = "python-docx" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, -] - -[[package]] -name = "python-dotenv" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135, upload-time = "2026-03-01T16:00:26.196Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, -] - -[[package]] -name = "python-pptx" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "lxml" }, - { name = "pillow" }, - { name = "typing-extensions" }, - { name = "xlsxwriter" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, -] - [[package]] name = "pytz" version = "2026.1.post1" @@ -1735,19 +1145,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, ] -[[package]] -name = "pywin32" -version = "311" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" }, - { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" }, - { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" }, - { url = 
"https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" }, - { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" }, -] - [[package]] name = "pyyaml" version = "6.0.3" @@ -1775,20 +1172,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, ] -[[package]] -name = "referencing" -version = "0.37.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "rpds-py" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = 
"sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, -] - [[package]] name = "regex" version = "2026.2.28" @@ -1844,85 +1227,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] -[[package]] -name = "rich" -version = "14.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, -] - -[[package]] -name = "rpds-py" -version = "0.30.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" }, - 
{ url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" }, - { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" }, - { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" }, - { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" }, - { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" }, - { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" }, - { url = 
"https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" }, - { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" }, - { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" }, - { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" }, - { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" }, - { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" }, - { url = 
"https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" }, - { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" }, - { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, - { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, - { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, - { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, - { url = 
"https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, - { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, - { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, - { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, - { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, - { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, - { url = 
"https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, - { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, - { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, - { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, - { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, - { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" }, - { url = 
"https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" }, - { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" }, - { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" }, - { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" }, - { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" }, - { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = 
"2025-11-30T20:24:27.834Z" }, - { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" }, - { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" }, - { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" }, - { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" }, - { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, -] - -[[package]] -name = "rtree" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/09/7302695875a019514de9a5dd17b8320e7a19d6e7bc8f85dcfb79a4ce2da3/rtree-1.4.1.tar.gz", hash = "sha256:c6b1b3550881e57ebe530cc6cffefc87cd9bf49c30b37b894065a9f810875e46", size = 52425, 
upload-time = "2025-08-13T19:32:01.413Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/d9/108cd989a4c0954e60b3cdc86fd2826407702b5375f6dfdab2802e5fed98/rtree-1.4.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d672184298527522d4914d8ae53bf76982b86ca420b0acde9298a7a87d81d4a4", size = 468484, upload-time = "2025-08-13T19:31:50.593Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cf/2710b6fd6b07ea0aef317b29f335790ba6adf06a28ac236078ed9bd8a91d/rtree-1.4.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a7e48d805e12011c2cf739a29d6a60ae852fb1de9fc84220bbcef67e6e595d7d", size = 436325, upload-time = "2025-08-13T19:31:52.367Z" }, - { url = "https://files.pythonhosted.org/packages/55/e1/4d075268a46e68db3cac51846eb6a3ab96ed481c585c5a1ad411b3c23aad/rtree-1.4.1-py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:efa8c4496e31e9ad58ff6c7df89abceac7022d906cb64a3e18e4fceae6b77f65", size = 459789, upload-time = "2025-08-13T19:31:53.926Z" }, - { url = "https://files.pythonhosted.org/packages/d1/75/e5d44be90525cd28503e7f836d077ae6663ec0687a13ba7810b4114b3668/rtree-1.4.1-py3-none-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12de4578f1b3381a93a655846900be4e3d5f4cd5e306b8b00aa77c1121dc7e8c", size = 507644, upload-time = "2025-08-13T19:31:55.164Z" }, - { url = "https://files.pythonhosted.org/packages/fd/85/b8684f769a142163b52859a38a486493b05bafb4f2fb71d4f945de28ebf9/rtree-1.4.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b558edda52eca3e6d1ee629042192c65e6b7f2c150d6d6cd207ce82f85be3967", size = 1454478, upload-time = "2025-08-13T19:31:56.808Z" }, - { url = "https://files.pythonhosted.org/packages/e9/a4/c2292b95246b9165cc43a0c3757e80995d58bc9b43da5cb47ad6e3535213/rtree-1.4.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:f155bc8d6bac9dcd383481dee8c130947a4866db1d16cb6dff442329a038a0dc", size = 1555140, upload-time = "2025-08-13T19:31:58.031Z" }, - { url = 
"https://files.pythonhosted.org/packages/74/25/5282c8270bfcd620d3e73beb35b40ac4ab00f0a898d98ebeb41ef0989ec8/rtree-1.4.1-py3-none-win_amd64.whl", hash = "sha256:efe125f416fd27150197ab8521158662943a40f87acab8028a1aac4ad667a489", size = 389358, upload-time = "2025-08-13T19:31:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" }, -] - [[package]] name = "safetensors" version = "0.7.0" @@ -1945,47 +1249,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] -[package.optional-dependencies] -torch = [ - { name = "numpy" }, - { name = "packaging" }, - { name = "torch" }, -] - -[[package]] -name = "scikit-image" -version = "0.26.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "imageio" }, - { name = "lazy-loader" }, - { name = "networkx" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "scipy" }, - { name = "tifffile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a1/b4/2528bb43c67d48053a7a649a9666432dc307d66ba02e3a6d5c40f46655df/scikit_image-0.26.0.tar.gz", hash = "sha256:f5f970ab04efad85c24714321fcc91613fcb64ef2a892a13167df2f3e59199fa", size = 22729739, upload-time = "2025-12-20T17:12:21.824Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/16/8a407688b607f86f81f8c649bf0d68a2a6d67375f18c2d660aba20f5b648/scikit_image-0.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b1ede33a0fb3731457eaf53af6361e73dd510f449dac437ab54573b26788baf0", size = 12355510, 
upload-time = "2025-12-20T17:10:31.628Z" }, - { url = "https://files.pythonhosted.org/packages/6b/f9/7efc088ececb6f6868fd4475e16cfafc11f242ce9ab5fc3557d78b5da0d4/scikit_image-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7af7aa331c6846bd03fa28b164c18d0c3fd419dbb888fb05e958ac4257a78fdd", size = 12056334, upload-time = "2025-12-20T17:10:34.559Z" }, - { url = "https://files.pythonhosted.org/packages/9f/1e/bc7fb91fb5ff65ef42346c8b7ee8b09b04eabf89235ab7dbfdfd96cbd1ea/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ea6207d9e9d21c3f464efe733121c0504e494dbdc7728649ff3e23c3c5a4953", size = 13297768, upload-time = "2025-12-20T17:10:37.733Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2a/e71c1a7d90e70da67b88ccc609bd6ae54798d5847369b15d3a8052232f9d/scikit_image-0.26.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74aa5518ccea28121f57a95374581d3b979839adc25bb03f289b1bc9b99c58af", size = 13711217, upload-time = "2025-12-20T17:10:40.935Z" }, - { url = "https://files.pythonhosted.org/packages/d4/59/9637ee12c23726266b91296791465218973ce1ad3e4c56fc81e4d8e7d6e1/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d5c244656de905e195a904e36dbc18585e06ecf67d90f0482cbde63d7f9ad59d", size = 14337782, upload-time = "2025-12-20T17:10:43.452Z" }, - { url = "https://files.pythonhosted.org/packages/e7/5c/a3e1e0860f9294663f540c117e4bf83d55e5b47c281d475cc06227e88411/scikit_image-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21a818ee6ca2f2131b9e04d8eb7637b5c18773ebe7b399ad23dcc5afaa226d2d", size = 14805997, upload-time = "2025-12-20T17:10:45.93Z" }, - { url = "https://files.pythonhosted.org/packages/d3/c6/2eeacf173da041a9e388975f54e5c49df750757fcfc3ee293cdbbae1ea0a/scikit_image-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:9490360c8d3f9a7e85c8de87daf7c0c66507960cf4947bb9610d1751928721c7", size = 11878486, upload-time = "2025-12-20T17:10:48.246Z" }, - 
{ url = "https://files.pythonhosted.org/packages/c3/a4/a852c4949b9058d585e762a66bf7e9a2cd3be4795cd940413dfbfbb0ce79/scikit_image-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:0baa0108d2d027f34d748e84e592b78acc23e965a5de0e4bb03cf371de5c0581", size = 11346518, upload-time = "2025-12-20T17:10:50.575Z" }, - { url = "https://files.pythonhosted.org/packages/99/e8/e13757982264b33a1621628f86b587e9a73a13f5256dad49b19ba7dc9083/scikit_image-0.26.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d454b93a6fa770ac5ae2d33570f8e7a321bb80d29511ce4b6b78058ebe176e8c", size = 12376452, upload-time = "2025-12-20T17:10:52.796Z" }, - { url = "https://files.pythonhosted.org/packages/e3/be/f8dd17d0510f9911f9f17ba301f7455328bf13dae416560126d428de9568/scikit_image-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3409e89d66eff5734cd2b672d1c48d2759360057e714e1d92a11df82c87cba37", size = 12061567, upload-time = "2025-12-20T17:10:55.207Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2b/c70120a6880579fb42b91567ad79feb4772f7be72e8d52fec403a3dde0c6/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c717490cec9e276afb0438dd165b7c3072d6c416709cc0f9f5a4c1070d23a44", size = 13084214, upload-time = "2025-12-20T17:10:57.468Z" }, - { url = "https://files.pythonhosted.org/packages/f4/a2/70401a107d6d7466d64b466927e6b96fcefa99d57494b972608e2f8be50f/scikit_image-0.26.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df650e79031634ac90b11e64a9eedaf5a5e06fcd09bcd03a34be01745744466", size = 13561683, upload-time = "2025-12-20T17:10:59.49Z" }, - { url = "https://files.pythonhosted.org/packages/13/a5/48bdfd92794c5002d664e0910a349d0a1504671ef5ad358150f21643c79a/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cefd85033e66d4ea35b525bb0937d7f42d4cdcfed2d1888e1570d5ce450d3932", size = 14112147, upload-time = "2025-12-20T17:11:02.083Z" }, - { url = 
"https://files.pythonhosted.org/packages/ee/b5/ac71694da92f5def5953ca99f18a10fe98eac2dd0a34079389b70b4d0394/scikit_image-0.26.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3f5bf622d7c0435884e1e141ebbe4b2804e16b2dd23ae4c6183e2ea99233be70", size = 14661625, upload-time = "2025-12-20T17:11:04.528Z" }, - { url = "https://files.pythonhosted.org/packages/23/4d/a3cc1e96f080e253dad2251bfae7587cf2b7912bcd76fd43fd366ff35a87/scikit_image-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:abed017474593cd3056ae0fe948d07d0747b27a085e92df5474f4955dd65aec0", size = 11911059, upload-time = "2025-12-20T17:11:06.61Z" }, - { url = "https://files.pythonhosted.org/packages/35/8a/d1b8055f584acc937478abf4550d122936f420352422a1a625eef2c605d8/scikit_image-0.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:4d57e39ef67a95d26860c8caf9b14b8fb130f83b34c6656a77f191fa6d1d04d8", size = 11348740, upload-time = "2025-12-20T17:11:09.118Z" }, -] - [[package]] name = "scikit-learn" version = "1.6.1" @@ -2041,19 +1304,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, ] -[[package]] -name = "semchunk" -version = "2.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mpire", extra = ["dill"] }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/62/96/c418c322730b385e81d4ab462e68dd48bb2dbda4d8efa17cad2ca468d9ac/semchunk-2.2.2.tar.gz", hash = "sha256:940e89896e64eeb01de97ba60f51c8c7b96c6a3951dfcf574f25ce2146752f52", size = 12271, upload-time = "2024-12-17T22:54:30.332Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/84/94ca7896c7df20032bcb09973e9a4d14c222507c0aadf22e89fa76bb0a04/semchunk-2.2.2-py3-none-any.whl", hash = 
"sha256:94ca19020c013c073abdfd06d79a7c13637b91738335f3b8cdb5655ee7cc94d2", size = 10271, upload-time = "2024-12-17T22:54:27.689Z" }, -] - [[package]] name = "setuptools" version = "82.0.0" @@ -2063,42 +1313,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, ] -[[package]] -name = "shapely" -version = "2.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4d/bc/0989043118a27cccb4e906a46b7565ce36ca7b57f5a18b78f4f1b0f72d9d/shapely-2.1.2.tar.gz", hash = "sha256:2ed4ecb28320a433db18a5bf029986aa8afcfd740745e78847e330d5d94922a9", size = 315489, upload-time = "2025-09-24T13:51:41.432Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/8d/1ff672dea9ec6a7b5d422eb6d095ed886e2e523733329f75fdcb14ee1149/shapely-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91121757b0a36c9aac3427a651a7e6567110a4a67c97edf04f8d55d4765f6618", size = 1820038, upload-time = "2025-09-24T13:50:15.628Z" }, - { url = "https://files.pythonhosted.org/packages/4f/ce/28fab8c772ce5db23a0d86bf0adaee0c4c79d5ad1db766055fa3dab442e2/shapely-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:16a9c722ba774cf50b5d4541242b4cce05aafd44a015290c82ba8a16931ff63d", size = 1626039, upload-time = "2025-09-24T13:50:16.881Z" }, - { url = "https://files.pythonhosted.org/packages/70/8b/868b7e3f4982f5006e9395c1e12343c66a8155c0374fdc07c0e6a1ab547d/shapely-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cc4f7397459b12c0b196c9efe1f9d7e92463cbba142632b4cc6d8bbbbd3e2b09", size = 3001519, upload-time = "2025-09-24T13:50:18.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/13/02/58b0b8d9c17c93ab6340edd8b7308c0c5a5b81f94ce65705819b7416dba5/shapely-2.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:136ab87b17e733e22f0961504d05e77e7be8c9b5a8184f685b4a91a84efe3c26", size = 3110842, upload-time = "2025-09-24T13:50:21.77Z" }, - { url = "https://files.pythonhosted.org/packages/af/61/8e389c97994d5f331dcffb25e2fa761aeedfb52b3ad9bcdd7b8671f4810a/shapely-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:16c5d0fc45d3aa0a69074979f4f1928ca2734fb2e0dde8af9611e134e46774e7", size = 4021316, upload-time = "2025-09-24T13:50:23.626Z" }, - { url = "https://files.pythonhosted.org/packages/d3/d4/9b2a9fe6039f9e42ccf2cb3e84f219fd8364b0c3b8e7bbc857b5fbe9c14c/shapely-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ddc759f72b5b2b0f54a7e7cde44acef680a55019eb52ac63a7af2cf17cb9cd2", size = 4178586, upload-time = "2025-09-24T13:50:25.443Z" }, - { url = "https://files.pythonhosted.org/packages/16/f6/9840f6963ed4decf76b08fd6d7fed14f8779fb7a62cb45c5617fa8ac6eab/shapely-2.1.2-cp311-cp311-win32.whl", hash = "sha256:2fa78b49485391224755a856ed3b3bd91c8455f6121fee0db0e71cefb07d0ef6", size = 1543961, upload-time = "2025-09-24T13:50:26.968Z" }, - { url = "https://files.pythonhosted.org/packages/38/1e/3f8ea46353c2a33c1669eb7327f9665103aa3a8dfe7f2e4ef714c210b2c2/shapely-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:c64d5c97b2f47e3cd9b712eaced3b061f2b71234b3fc263e0fcf7d889c6559dc", size = 1722856, upload-time = "2025-09-24T13:50:28.497Z" }, - { url = "https://files.pythonhosted.org/packages/24/c0/f3b6453cf2dfa99adc0ba6675f9aaff9e526d2224cbd7ff9c1a879238693/shapely-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fe2533caae6a91a543dec62e8360fe86ffcdc42a7c55f9dfd0128a977a896b94", size = 1833550, upload-time = "2025-09-24T13:50:30.019Z" }, - { url = 
"https://files.pythonhosted.org/packages/86/07/59dee0bc4b913b7ab59ab1086225baca5b8f19865e6101db9ebb7243e132/shapely-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ba4d1333cc0bc94381d6d4308d2e4e008e0bd128bdcff5573199742ee3634359", size = 1643556, upload-time = "2025-09-24T13:50:32.291Z" }, - { url = "https://files.pythonhosted.org/packages/26/29/a5397e75b435b9895cd53e165083faed5d12fd9626eadec15a83a2411f0f/shapely-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0bd308103340030feef6c111d3eb98d50dc13feea33affc8a6f9fa549e9458a3", size = 2988308, upload-time = "2025-09-24T13:50:33.862Z" }, - { url = "https://files.pythonhosted.org/packages/b9/37/e781683abac55dde9771e086b790e554811a71ed0b2b8a1e789b7430dd44/shapely-2.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1e7d4d7ad262a48bb44277ca12c7c78cb1b0f56b32c10734ec9a1d30c0b0c54b", size = 3099844, upload-time = "2025-09-24T13:50:35.459Z" }, - { url = "https://files.pythonhosted.org/packages/d8/f3/9876b64d4a5a321b9dc482c92bb6f061f2fa42131cba643c699f39317cb9/shapely-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e9eddfe513096a71896441a7c37db72da0687b34752c4e193577a145c71736fc", size = 3988842, upload-time = "2025-09-24T13:50:37.478Z" }, - { url = "https://files.pythonhosted.org/packages/d1/a0/704c7292f7014c7e74ec84eddb7b109e1fbae74a16deae9c1504b1d15565/shapely-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:980c777c612514c0cf99bc8a9de6d286f5e186dcaf9091252fcd444e5638193d", size = 4152714, upload-time = "2025-09-24T13:50:39.9Z" }, - { url = "https://files.pythonhosted.org/packages/53/46/319c9dc788884ad0785242543cdffac0e6530e4d0deb6c4862bc4143dcf3/shapely-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9111274b88e4d7b54a95218e243282709b330ef52b7b86bc6aaf4f805306f454", size = 1542745, upload-time = "2025-09-24T13:50:41.414Z" }, - { url = 
"https://files.pythonhosted.org/packages/ec/bf/cb6c1c505cb31e818e900b9312d514f381fbfa5c4363edfce0fcc4f8c1a4/shapely-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:743044b4cfb34f9a67205cee9279feaf60ba7d02e69febc2afc609047cb49179", size = 1722861, upload-time = "2025-09-24T13:50:43.35Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - [[package]] name = "six" version = "1.17.0" @@ -2108,15 +1322,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] -[[package]] -name = "soupsieve" -version = "2.8.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = 
"sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" }, -] - [[package]] name = "sympy" version = "1.13.1" @@ -2129,15 +1334,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, ] -[[package]] -name = "tabulate" -version = "0.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, -] - [[package]] name = "tenacity" version = "9.1.4" @@ -2156,18 +1352,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] -[[package]] -name = "tifffile" -version = "2026.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c5/cb/2f6d79c7576e22c116352a801f4c3c8ace5957e9aced862012430b62e14f/tifffile-2026.3.3.tar.gz", hash = "sha256:d9a1266bed6f2ee1dd0abde2018a38b4f8b2935cb843df381d70ac4eac5458b7", size = 388745, upload-time = 
"2026-03-03T19:14:38.134Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" }, -] - [[package]] name = "tokenizers" version = "0.20.3" @@ -2252,10 +1436,10 @@ dependencies = [ { name = "torch" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, - { url = "https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, ] 
[[package]] @@ -2268,10 +1452,10 @@ dependencies = [ { name = "torch" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, - { url = "https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, + { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, ] [[package]] @@ -2307,90 +1491,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, ] -[[package]] -name = 
"tree-sitter" -version = "0.25.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, - { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, - { url = "https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, - { url = "https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, - { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, - { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, - { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, - { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, - { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, - { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, - { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, - { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, - { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, - { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, -] - -[[package]] -name = "tree-sitter-c" -version = "0.24.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, - { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, - { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, - { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, - { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, - { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, - { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, -] - -[[package]] -name = "tree-sitter-javascript" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, - { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, - { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, - { url = "https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, - { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = 
"2025-09-01T07:13:40.866Z" }, - { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, - { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time = "2025-09-01T07:13:42.828Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, -] - -[[package]] -name = "tree-sitter-python" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = "sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, - { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", 
size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, - { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, - { url = "https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, - { url = "https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, - { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, - { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = 
"sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, -] - -[[package]] -name = "tree-sitter-typescript" -version = "0.23.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1e/fc/bb52958f7e399250aee093751e9373a6311cadbe76b6e0d109b853757f35/tree_sitter_typescript-0.23.2.tar.gz", hash = "sha256:7b167b5827c882261cb7a50dfa0fb567975f9b315e87ed87ad0a0a3aedb3834d", size = 773053, upload-time = "2024-11-11T02:36:11.396Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/28/95/4c00680866280e008e81dd621fd4d3f54aa3dad1b76b857a19da1b2cc426/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3cd752d70d8e5371fdac6a9a4df9d8924b63b6998d268586f7d374c9fba2a478", size = 286677, upload-time = "2024-11-11T02:35:58.839Z" }, - { url = "https://files.pythonhosted.org/packages/8f/2f/1f36fda564518d84593f2740d5905ac127d590baf5c5753cef2a88a89c15/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c7cc1b0ff5d91bac863b0e38b1578d5505e718156c9db577c8baea2557f66de8", size = 302008, upload-time = "2024-11-11T02:36:00.733Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/975c2dad292aa9994f982eb0b69cc6fda0223e4b6c4ea714550477d8ec3a/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1eed5b0b3a8134e86126b00b743d667ec27c63fc9de1b7bb23168803879e31", size = 351987, upload-time = "2024-11-11T02:36:02.669Z" }, - { url = "https://files.pythonhosted.org/packages/49/d1/a71c36da6e2b8a4ed5e2970819b86ef13ba77ac40d9e333cb17df6a2c5db/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e96d36b85bcacdeb8ff5c2618d75593ef12ebaf1b4eace3477e2bdb2abb1752c", size = 344960, upload-time = "2024-11-11T02:36:04.443Z" }, - { url = 
"https://files.pythonhosted.org/packages/7f/cb/f57b149d7beed1a85b8266d0c60ebe4c46e79c9ba56bc17b898e17daf88e/tree_sitter_typescript-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8d4f0f9bcb61ad7b7509d49a1565ff2cc363863644a234e1e0fe10960e55aea0", size = 340245, upload-time = "2024-11-11T02:36:06.473Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ab/dd84f0e2337296a5f09749f7b5483215d75c8fa9e33738522e5ed81f7254/tree_sitter_typescript-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:3f730b66396bc3e11811e4465c41ee45d9e9edd6de355a58bbbc49fa770da8f9", size = 278015, upload-time = "2024-11-11T02:36:07.631Z" }, - { url = "https://files.pythonhosted.org/packages/9f/e4/81f9a935789233cf412a0ed5fe04c883841d2c8fb0b7e075958a35c65032/tree_sitter_typescript-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:05db58f70b95ef0ea126db5560f3775692f609589ed6f8dd0af84b7f19f1cbb7", size = 274052, upload-time = "2024-11-11T02:36:09.514Z" }, -] - [[package]] name = "triton" version = "3.2.0" @@ -2400,21 +1500,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, ] -[[package]] -name = "typer" -version = "0.16.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "rich" }, - { name = "shellingham" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" @@ -2424,18 +1509,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - [[package]] name = "tzdata" version = "2025.3" @@ -2494,15 +1567,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, ] -[[package]] -name = "xlsxwriter" -version = "3.2.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, -] - [[package]] name = "yarl" version = "1.23.0" diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index c7a5b58..e0f7707 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -2,7 +2,7 @@ ## Environment Profiles - **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. -- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml` and intentionally excludes the Docling layout stack. 
Recommended installation commands: ```bash diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt index 402261a..73cb17f 100644 --- a/dependency_setup/requirements-glossapi-docling.txt +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling extraction/layout) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/requirements-glossapi-vanilla.txt b/dependency_setup/requirements-glossapi-vanilla.txt index b13df49..eca76ba 100644 --- a/dependency_setup/requirements-glossapi-vanilla.txt +++ b/dependency_setup/requirements-glossapi-vanilla.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling without GPU OCR extras) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/setup_glossapi.sh b/dependency_setup/setup_glossapi.sh index 024095e..70e9754 100755 --- a/dependency_setup/setup_glossapi.sh +++ b/dependency_setup/setup_glossapi.sh @@ -77,9 +77,6 @@ case "${MODE}" in warn "Mode 'vanilla' is deprecated; using 'docling' instead." MODE="docling" ;; - rapidocr) - error "RapidOCR setup has been removed. Use --mode docling or --mode deepseek." - ;; docling|deepseek) ;; *) echo "Invalid mode '${MODE}'. Expected docling or deepseek." 
>&2 diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 2fb796c..8b740d6 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -40,6 +40,7 @@ extract( skip_existing: bool = True, use_gpus: str = 'single', # 'single'|'multi' devices: list[int] | None = None, + workers_per_device: int = 1, use_cls: bool = False, benchmark_mode: bool = False, export_doc_json: bool = True, @@ -52,9 +53,10 @@ extract( - files already present in `downloads/` - or explicit `file_paths` - Important parameters: - - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout/OCR - - `force_ocr=True`: turn on OCR during extraction + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout extraction + - `force_ocr`: deprecated no-op kept for compatibility; OCR remediation now lives in `Corpus.ocr(backend='deepseek')` - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput - `export_doc_json=True`: write `json/.docling.json(.zst)` - `emit_formula_index=True`: also write `json/.formula_index.jsonl` - Main outputs: diff --git a/docs/api_corpus_tmp.md b/docs/api_corpus_tmp.md index 4181094..e584308 100644 --- a/docs/api_corpus_tmp.md +++ b/docs/api_corpus_tmp.md @@ -44,7 +44,7 @@ extract( ) -> None ``` -- Phase‑1 extraction; set `force_ocr=True` for OCR. +- Phase‑1 extraction; `force_ocr` is deprecated and ignored. - Docling layout JSON now writes by default (`json/.docling.json(.zst)`); set `emit_formula_index=True` to also produce `json/.formula_index.jsonl`. - Set `use_gpus='multi'` to use all visible GPUs (shared queue). @@ -85,7 +85,7 @@ ocr( ) -> None ``` -- Convenience shim that re‑runs `extract(force_ocr=True)` on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. 
+- Convenience shim that re-runs OCR on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. ## formula_enrich_from_json() diff --git a/docs/architecture/deepseek_only_upgrade_roadmap.md b/docs/architecture/deepseek_only_upgrade_roadmap.md deleted file mode 100644 index 6ebac64..0000000 --- a/docs/architecture/deepseek_only_upgrade_roadmap.md +++ /dev/null @@ -1,262 +0,0 @@ -# DeepSeek-Only Upgrade Roadmap - -This document describes the planned migration from a mixed OCR stack to a simpler pipeline that keeps Docling for extraction and structure, but uses DeepSeek as the only OCR backend. - -## Current status - -As of March 9, 2026, the following work has already been completed: - -- DeepSeek is the only supported OCR remediation backend in the pipeline -- stub execution is rejected for real OCR runs -- the dedicated DeepSeek runtime is managed through the uv-based setup flow -- RapidOCR implementation files and install profile have been removed -- real extract -> clean/evaluate -> OCR -> section validation has been run on capped Pergamos samples -- OCR progress artifacts were moved out of the canonical `markdown/` tree so downstream stages no longer treat them as real documents - -The following work is intentionally not part of the completed set yet: - -- Docling dependency upgrades -- page-level OCR reevaluation experiments -- broader corpus-level comparative benchmarking beyond the capped validation runs - -## Remaining TODO to wrap up the implemented changes - -These are the remaining tasks for closing out the already-implemented migration work: - -1. review and curate the final commit contents -2. keep only source, docs, and test changes that belong in the `development` branch -3. exclude local artifacts, downloaded models, disposable environments, and ad hoc validation output from the commit -4. optionally run one more small real-PDF compatibility slice if an extra release-confidence check is desired -5. 
create or switch to the `development` branch and push the finalized change set there - -This means the migration implementation itself is effectively done; what remains is mainly release hygiene and branch preparation. - -## Target architecture - -The target shape is: - -1. `download()` -2. `extract()` via safe backend or Docling -3. `clean()` and compute Greek-quality routing -4. `ocr()` via DeepSeek only for documents that need remediation -5. `section()` -6. `annotate()` -7. `export()` - -Important boundary: - -- keep `Docling` for extraction, layout, Markdown, JSON artifacts, and optional formula/code enrichment -- remove `RapidOCR` from the OCR path and installation surface -- enforce `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` for production and release validation - -This is a simplification, not a redesign of the entire pipeline contract. - -## Why this direction - -The current mixed OCR surface adds complexity in three places: - -- dependency installation and CUDA compatibility -- runtime branching and operational support burden -- validation burden when one OCR path succeeds and another fails differently - -The simplified design still preserves the important current properties: - -- selective OCR after Greek-quality validation -- Docling-generated layout and JSON artifacts for downstream stages -- explicit operational metadata and rerun semantics - -## Stage 1: DeepSeek-only OCR - -Goal: - -- make DeepSeek the only OCR remediation backend -- remove silent stub fallback from production paths - -Changes: - -- remove `rapidocr` as a supported OCR backend -- route `Corpus.ocr()` to DeepSeek only -- fail hard when DeepSeek runtime, weights, or CLI are unavailable -- keep the current document-level `needs_ocr` selection model - -Do not change in this stage: - -- Docling extraction contract -- sectioning and annotation behavior -- page-level routing policy -- formula/code enrichment policy - -Why this stage exists: - -- it gives the desired simplification without changing 
the rest of the pipeline contract at the same time -- it isolates OCR-engine risk from Docling-upgrade risk - -Success criteria: - -- no remaining production path imports or dispatches RapidOCR -- no final validation run succeeds via stub output -- documents flagged `needs_ocr=True` can still be remediated through DeepSeek - -Status: - -- completed - -## Stage 2: Installation simplification - -Goal: - -- reduce the environment surface to what the simplified pipeline actually needs - -Changes: - -- remove the `rapidocr` install profile and `onnxruntime-gpu` -- simplify setup profiles around: - - Docling extraction/runtime - - DeepSeek OCR runtime -- remove unused requirement baggage where it is not imported by GlossAPI itself -- make Python version constraints match current upstream reality - -Current constraint to fix: - -- GlossAPI currently declares `requires-python = ">=3.8"` while current Docling requires Python `>=3.10` - -Do not change in this stage: - -- pipeline behavior -- artifact layout -- OCR routing logic - -Why this stage exists: - -- environment simplification should follow architectural simplification -- it is easier to reason about required packages once RapidOCR is gone - -Success criteria: - -- setup documentation exposes only the supported environments -- install instructions no longer mention removed OCR components -- Python floor and dependency pins are internally consistent - -Status: - -- completed for the currently supported DeepSeek-only flow -- final branch hygiene and commit curation still remain - -## Stage 3: Docling upgrade - -Goal: - -- upgrade Docling after the OCR surface has already been simplified - -Changes: - -- update `docling` -- update `docling-core` -- update `docling-parse` -- update `docling-ibm-models` -- adapt any compatibility shims required by changed public APIs - -Do not change in this stage: - -- DeepSeek-only OCR decision -- page-level experiment -- formula/code enrichment policy unless explicitly validated - -Why 
this stage exists: - -- upgrading Docling before removing RapidOCR combines two unrelated breakage sources -- after Stage 1 the Docling integration surface is smaller and easier to validate - -Success criteria: - -- Phase-1 extraction still produces the documented canonical artifacts -- downstream sectioning, annotation, and export still consume the outputs -- metadata and resumability behavior do not regress - -Status: - -- deferred - -## Stage 4: Re-evaluate retained Docling capabilities - -Goal: - -- decide which Docling-powered features remain justified after the simplification - -Features to evaluate: - -- formula enrichment -- code enrichment -- table structure extraction -- any extra model/artifact prefetch currently required for non-default functionality - -Why this stage exists: - -- some capabilities may still be valuable for technical corpora -- some may only be increasing runtime and failure surface - -Rule: - -- do not remove formula/code enrichment just because it simplifies the stack -- remove it only if real-corpus evaluation shows little or no value - -Success criteria: - -- every retained capability has a measurable purpose -- every removed capability has an explicit evaluation-based justification - -Status: - -- pending - -## Stage 5: Page-level reevaluation experiment - -Goal: - -- test whether whole-document OCR reruns should be replaced or complemented by page-level escalation - -Experiment shape: - -- baseline branch: current document-level `needs_ocr` routing -- experiment branch: page-level or ROI-level routing - -What stays fixed: - -- DeepSeek remains the only OCR backend -- Docling remains the structured extraction/layout path - -Why this is separate: - -- it is an architectural experiment, not a prerequisite for the OCR simplification -- it should be compared against the stabilized DeepSeek-only baseline - -Primary evaluation questions: - -- does page-level escalation improve quality on long PDFs -- does it reduce OCR runtime and GPU 
cost -- does it preserve downstream sectioning and annotation quality - -Status: - -- pending - -## Non-goals for the first pass - -These are intentionally out of scope for the initial migration: - -- replacing Docling JSON/layout artifacts with DeepSeek-native structured artifacts -- merging all runtime concerns into one universal environment regardless of ecosystem constraints -- changing artifact layout at the same time as OCR simplification -- treating synthetic, mocked, or stubbed tests as sufficient release validation - -## Release sequence - -The intended order is: - -1. DeepSeek-only OCR and no-stub enforcement -2. installation simplification -3. Docling upgrade -4. retained-capability review -5. page-level experiment - -This order keeps one major architectural assumption changing at a time. diff --git a/docs/code_map.md b/docs/code_map.md index 97f12d5..8616def 100644 --- a/docs/code_map.md +++ b/docs/code_map.md @@ -8,8 +8,8 @@ without reading the entire repo. | Area | Main code | Responsibility | | --- | --- | --- | -| Public package entry | `src/glossapi/__init__.py` | Applies the RapidOCR patch on import and exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes. | -| High-level orchestration | `src/glossapi/corpus.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Public package entry | `src/glossapi/__init__.py` | Lazy-exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes without pulling heavy runtime dependencies at import time. | +| High-level orchestration | `src/glossapi/corpus/corpus_orchestrator.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | | Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. 
| ## Pipeline Stages @@ -28,12 +28,11 @@ without reading the entire repo. | File | Responsibility | | --- | --- | -| `src/glossapi/_pipeline.py` | Canonical builders for layout-only and RapidOCR-backed Docling pipelines. | -| `src/glossapi/rapidocr_safe.py` | Monkey-patch/shim for Docling 2.48.x so problematic OCR crops do not crash whole documents. | -| `src/glossapi/_rapidocr_paths.py` | Resolves packaged RapidOCR ONNX models and Greek keys, with env-var override support. | -| `src/glossapi/ocr_pool.py` | Reuses RapidOCR model instances where possible. | -| `src/glossapi/json_io.py` | Writes and reads compressed Docling JSON artifacts. | -| `src/glossapi/triage.py` | Summarizes per-page formula density and updates parquet routing metadata. | +| `src/glossapi/ocr/docling/pipeline.py` | Canonical builder for the layout-only Docling Phase-1 pipeline, including runtime tuning knobs for the current Docling API. | +| `src/glossapi/ocr/docling_pipeline.py` | Compatibility re-export for the canonical Docling pipeline builder. | +| `src/glossapi/ocr/deepseek/runner.py` | Launches the DeepSeek OCR remediation path from `Corpus.ocr()`. | +| `src/glossapi/ocr/utils/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/corpus/phase_ocr_math.py` | Runs DeepSeek OCR remediation, math/code enrichment, and parquet status updates. | | `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. | ## Rust Extensions @@ -50,12 +49,12 @@ without reading the entire repo. | `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | | `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | | `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. 
| -| `tests/test_rapidocr_patch.py` | Covers the Docling/RapidOCR compatibility patch and fallback paths. | +| `tests/test_ocr_dispatch_backends.py` | Covers the DeepSeek-only OCR dispatch contract and backend validation. | ## If You Need To Change... - Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. -- Phase-1 parsing, OCR selection, or artifact generation: start in `src/glossapi/corpus.py` and `src/glossapi/gloss_extract.py`. -- Docling/RapidOCR wiring or provider issues: start in `src/glossapi/_pipeline.py`, `src/glossapi/rapidocr_safe.py`, and `src/glossapi/_rapidocr_paths.py`. +- Phase-1 parsing, worker fanout, or artifact generation: start in `src/glossapi/corpus/phase_extract.py`, `src/glossapi/corpus/corpus_orchestrator.py`, and `src/glossapi/gloss_extract.py`. +- Docling pipeline wiring or runtime tuning: start in `src/glossapi/ocr/docling/pipeline.py` and `src/glossapi/gloss_extract.py`. - Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. -- Output folder contracts or stage sequencing: start in `src/glossapi/corpus.py`. +- Output folder contracts or stage sequencing: start in `src/glossapi/corpus/corpus_orchestrator.py`. diff --git a/docs/configuration.md b/docs/configuration.md index 0810530..f8dd8de 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -18,6 +18,17 @@ GlossAPI exposes two Phase‑1 profiles. Use `Corpus.extract(..., phase1_backend Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread per worker so multi‑GPU runs do not explode thread counts. +### Docling Runtime Tuning + +These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: + +- `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. +- `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. 
+- `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. +- `GLOSSAPI_DOCLING_QUEUE_MAX_SIZE`: override Docling `queue_max_size`. +- `GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT`: override Docling `document_timeout`. +- `GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL`: override Docling `batch_polling_interval_seconds`. + ### DeepSeek optional dependencies Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: @@ -27,6 +38,7 @@ Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. +The dedicated uv profile is OCR-only and does not install the Docling extraction stack. ### DeepSeek runtime controls diff --git a/docs/getting_started.md b/docs/getting_started.md index 94a2325..e86d492 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -27,6 +27,7 @@ Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `d ``` `setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `glossapi[deepseek]` and leaves Docling in the main environment. **DeepSeek runtime checklist** - Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index b1b8956..feb3283 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -8,10 +8,11 @@ file paths**, so no worker rescans directories. 
```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. +- `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index f401829..1c2b630 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -86,7 +86,7 @@ If you need Phase‑2 math on files that do not require OCR, run `math_only` aft Phase‑1 (extract): ```python -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. diff --git a/docs/pipeline.md b/docs/pipeline.md index 2f4b9dd..2c00354 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -11,7 +11,7 @@ The `Corpus` class is the stable surface of the project. 
New functionality shoul | Stage | Main code | Typical inputs | Important parameters | Main outputs | | --- | --- | --- | --- | --- | | Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | -| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `force_ocr`, `use_gpus`, `devices`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `use_gpus`, `devices`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | | Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | | OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/.md`, refreshed cleaner/parquet metadata | | Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, `math_dpi_base`, `targets_by_stem` | updated `markdown/.md`, `json/.latex_map.jsonl` | @@ -42,9 +42,11 @@ The `Corpus` class is the stable surface of the project. 
New functionality shoul - or explicit `file_paths` - Important parameters: - `phase1_backend='safe'|'docling'|'auto'` - - `force_ocr=True` to turn on OCR during extraction - `use_gpus='single'|'multi'` + - `workers_per_device` to fan out more than one extraction worker onto each GPU - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Operational note: + - `force_ocr` is deprecated and ignored in Phase‑1; use `Corpus.ocr(backend='deepseek')` after `clean()` for OCR remediation - Main outputs: - canonical markdown in `markdown/.md` - optional Docling JSON and index artifacts in `json/` diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md index 0c00d59..29a5e15 100644 --- a/docs/testing/compatibility_matrix.md +++ b/docs/testing/compatibility_matrix.md @@ -97,8 +97,8 @@ The following must remain true unless a change explicitly revises the contract a | ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | | --- | --- | --- | --- | --- | --- | --- | -| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed RapidOCR profile | -| `ENV-002` | L0 | Dependency simplification | Fresh environment | import `glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no runtime import of removed RapidOCR modules | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed legacy OCR install modes | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import `glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no dead imports from removed OCR integrations | | `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | | 
`EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | | `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | @@ -129,7 +129,7 @@ Critical checks: - packaging metadata uses a supported Python minimum - setup docs expose only supported install paths -- removal of RapidOCR does not leave dead imports or entrypoints +- removal of the old OCR integration does not leave dead GlossAPI imports or entrypoints ## Extraction contract @@ -256,7 +256,7 @@ This keeps low-level compatibility failures from being confused with downstream - DeepSeek-only OCR path works on real PDFs - no-stub enforcement verified -- no remaining release dependency on RapidOCR +- no supported GlossAPI OCR backend remains besides DeepSeek ### Stage 2 exit criteria diff --git a/pyproject.toml b/pyproject.toml index 3c045db..f1613ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", + "numpy>=1.26,<3", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -43,7 +43,7 @@ browser = [ ] # Docling extraction/layout stack docling = [ - "docling==2.48.0", + "docling==2.81.0", ] # Optional CUDA layout acceleration (Docling) cuda = [ diff --git a/requirements.txt b/requirements.txt index 95f4678..32b555c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -### GlossAPI runtime requirements (aligned with repro_rapidocr_onnx) +### GlossAPI runtime requirements # Core pipeline deps pandas>=1.3.0 -numpy<2 +numpy>=1.26,<3 python-dateutil>=2.8.2 pytz>=2021.1 scikit-learn==1.6.1 @@ -15,17 +15,12 @@ ftfy>=6.0.0 tenacity>=8.0.0 tqdm>=4.67.0 -# Docling + RapidOCR ONNX stack 
-docling==2.48.0 -# Prefer RapidOCR core package; it works with the GPU ORT wheel without pulling -# the CPU-only 'onnxruntime' dependency. -rapidocr>=3.3.0 -onnxruntime-gpu==1.18.1 +# Docling extraction/layout stack +docling==2.81.0 pyyaml>=6.0 # Enrichment & JSON compression (required for Phase-2 math/code and JSON zstd) pypdfium2>=4.0.0 zstandard>=0.22.0 -# Optional: install Torch CUDA for GPU layout (not required for OCR) -# pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 +# Optional: install Torch CUDA for GPU-backed Docling layout / enrichment diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index dd2fad6..7f254f1 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -350,6 +350,8 @@ def _load_metadata(self) -> None: # Top-level worker function for multi-GPU extraction (picklable by multiprocessing) def gpu_extract_worker_queue( device_id: int, + worker_slot: int, + worker_key: str, in_dir: str, out_dir: str, work_q, # multiprocessing Queue of filename strings @@ -392,12 +394,13 @@ def _ensure_thread_caps(): _ensure_thread_caps() _status_proxy = status_map - _marker_path = _Path(marker_dir).expanduser() / f"gpu{device_id}.current" if marker_dir else None + _worker_label = worker_key or f"gpu{device_id}-w{worker_slot}" + _marker_path = _Path(marker_dir).expanduser() / f"{_worker_label}.current" if marker_dir else None def _update_current(batch_items: List[str]) -> None: if _status_proxy is not None: try: - _status_proxy[device_id] = list(batch_items) + _status_proxy[_worker_label] = list(batch_items) except Exception: pass if _marker_path is not None: @@ -409,7 +412,7 @@ def _update_current(batch_items: List[str]) -> None: def _clear_current() -> None: if _status_proxy is not None: try: - _status_proxy.pop(device_id, None) + _status_proxy.pop(_worker_label, None) except Exception: pass if _marker_path is not 
None: @@ -423,7 +426,7 @@ def _clear_current() -> None: if _log_dir: _log_path = _Path(_log_dir).expanduser() _log_path.mkdir(parents=True, exist_ok=True) - _worker_log_file = _log_path / f"gpu{device_id}_{_os.getpid()}.log" + _worker_log_file = _log_path / f"{_worker_label}_{_os.getpid()}.log" _worker_log_handle = open(_worker_log_file, "a", encoding="utf-8", buffering=1) _sys.stdout = _worker_log_handle _sys.stderr = _worker_log_handle @@ -458,9 +461,13 @@ def _clear_current() -> None: except Exception: _phys = "" try: - print(f"[GPU{device_id}] bound: CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}") + print( + f"[GPU{device_id}/W{worker_slot}] bound: " + f"CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} " + f"pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}" + ) if _phys: - print(f"[GPU{device_id}] physical: {_phys}") + print(f"[GPU{device_id}/W{worker_slot}] physical: {_phys}") except Exception: pass except Exception: @@ -475,13 +482,15 @@ def _clear_current() -> None: _ensure_thread_caps() from glossapi import Corpus as _Corpus # type: ignore except Exception as _e: - print(f"[GPU{device_id}] Cannot import glossapi in worker: {_e}") + print(f"[{_worker_label}] Cannot import glossapi in worker: {_e}") if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -507,14 +516,16 @@ def _clear_current() -> None: phase1_backend=backend, ) except Exception as _e: - msg = f"[GPU{device_id}] Prime failed: {_e}" + msg = f"[{_worker_label}] Prime failed: {_e}" print(msg) if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -534,7 +545,9 @@ 
def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [str(x) for x in ok_list], "problematic": [str(x) for x in bad_list], "pid": _os.getpid(), @@ -553,15 +566,12 @@ def _report_batch(ok_list, bad_list): _batch_env = int(str(_os.environ.get("GLOSSAPI_GPU_BATCH_SIZE", "")).strip() or 0) except Exception: _batch_env = 0 - default_batch = 5 if not force else 1 + default_batch = 5 try: extractor = getattr(c, "extractor", None) if extractor is not None: configured = int(getattr(extractor, "max_batch_files", default_batch)) - if force: - default_batch = 1 - else: - default_batch = max(1, configured) + default_batch = max(1, configured) except Exception: pass BATCH_SIZE = max(1, _batch_env) if _batch_env else max(1, default_batch) @@ -605,7 +615,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -653,7 +665,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [], "problematic": list(batch), "pid": _os.getpid(), @@ -667,7 +681,7 @@ def _report_batch(ok_list, bad_list): # Occasional heartbeat if _time.time() - last_progress > 30: try: - print(f"[GPU{device_id}] processed ~{processed} files…") + print(f"[{_worker_label}] processed ~{processed} files...") except Exception: pass last_progress = _time.time() @@ -692,7 +706,9 @@ def _report_batch(ok_list, bad_list): try: result_q.put({ "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": exit_code, "pid": _os.getpid(), }) diff --git a/src/glossapi/corpus/phase_extract.py 
b/src/glossapi/corpus/phase_extract.py index a748dcc..296429a 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -98,14 +98,14 @@ def prime_extractor( if force_ocr: self.logger.warning( - "Phase-1 Docling OCR is deprecated and no longer executes OCR. " + "Corpus.extract(force_ocr=True) is deprecated and no longer executes OCR. " "Use Corpus.ocr(backend='deepseek') for OCR remediation." ) # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, - require_ocr=bool(force_ocr), + require_ocr=False, require_math=bool(formula_enrichment or code_enrichment), require_backend_gpu=(backend_choice == "docling"), ) @@ -119,8 +119,8 @@ def prime_extractor( # Ensure converter exists (reuse when unchanged) self.extractor.ensure_extractor( - enable_ocr=bool(force_ocr), - force_full_page_ocr=bool(force_ocr), + enable_ocr=False, + force_full_page_ocr=False, formula_enrichment=bool(formula_enrichment), code_enrichment=bool(code_enrichment), images_scale=float(images_scale_env), @@ -142,12 +142,12 @@ def _resolve_phase1_backend( raise ValueError( f"Invalid phase1_backend='{requested}'. Expected one of: 'auto', 'safe', 'docling'." ) - needs_gpu = bool(force_ocr or formula_enrichment or code_enrichment) + needs_gpu = bool(formula_enrichment or code_enrichment) if choice == "auto": choice = "docling" if needs_gpu else "safe" if choice == "safe" and needs_gpu: self.logger.info( - "Phase-1 backend 'safe' overridden to 'docling' because OCR/math enrichment was requested." + "Phase-1 backend 'safe' overridden to 'docling' because math/code enrichment was requested." 
) choice = "docling" return choice @@ -227,6 +227,7 @@ def extract( export_doc_json: bool = True, emit_formula_index: bool = False, phase1_backend: str = "auto", + workers_per_device: int = 1, _prepared: bool = False, ) -> None: """ @@ -240,8 +241,9 @@ def extract( export_doc_json: When True (default), writes Docling layout JSON to `json/.docling.json(.zst)` emit_formula_index: Also emit `json/.formula_index.jsonl` (default: False) phase1_backend: Selects the Phase-1 backend. ``"auto"`` (default) keeps the safe backend unless - OCR/math is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` forces the - Docling backend. + math/code enrichment is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` + forces the Docling backend. + workers_per_device: Number of extraction workers to bind to each visible GPU when ``use_gpus='multi'``. """ if not file_paths: @@ -415,12 +417,14 @@ def extract( except Exception: threads_effective = int(num_threads) if isinstance(num_threads, int) else max(2, 2 * max(1, len(devs))) - batch_hint = 5 if backend_choice == "docling" and not force_ocr else 1 + workers_per_device = max(1, int(workers_per_device or 1)) + batch_hint = 1 self.logger.info( - "Phase-1 config: backend=%s batch_size=%s threads=%s skip_existing=%s benchmark=%s", + "Phase-1 config: backend=%s batch_size=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, batch_hint, threads_effective, + workers_per_device, bool(skip_existing), bool(benchmark_mode), ) @@ -454,6 +458,7 @@ def extract( return # Dynamic work queue across GPUs + from .corpus_orchestrator import gpu_extract_worker_queue from multiprocessing import get_context ctx = get_context("spawn") manager = ctx.Manager() @@ -484,14 +489,29 @@ def extract( marker_base.mkdir(parents=True, exist_ok=True) except Exception as exc: self.logger.debug("Unable to prepare marker directory %s: %s", marker_base, exc) - procs: List[Any] = [] - proc_gpu: Dict[int, int] = 
{} - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} + worker_specs: List[Dict[str, Any]] = [] for dev_id in devs: + for worker_slot in range(workers_per_device): + worker_specs.append( + { + "device_id": int(dev_id), + "worker_slot": int(worker_slot), + "worker_key": f"gpu{dev_id}-w{worker_slot}", + } + ) + procs: List[Any] = [] + proc_specs: Dict[int, Dict[str, Any]] = {} + marker_files: Dict[str, Path] = { + spec["worker_key"]: marker_base / f"{spec['worker_key']}.current" + for spec in worker_specs + } + for spec in worker_specs: p = ctx.Process( target=gpu_extract_worker_queue, args=( - dev_id, + spec["device_id"], + spec["worker_slot"], + spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -514,7 +534,7 @@ def extract( p.start() procs.append(p) if p.pid is not None: - proc_gpu[p.pid] = dev_id + proc_specs[p.pid] = dict(spec) active = list(procs) any_fail = False last_summary = time.time() @@ -531,20 +551,21 @@ def extract( procs.remove(p) pid = p.pid or -1 heartbeat[pid] = time.time() - gpu_id = proc_gpu.pop(pid, None) + worker_spec = proc_specs.pop(pid, None) + worker_key = worker_spec["worker_key"] if worker_spec else None if p.exitcode not in (0, None): any_fail = True self.logger.warning("GPU worker pid=%s exited with code %s", p.pid, p.exitcode) current_paths: List[str] = [] stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) + if worker_key is not None: + current_entry = status_map.pop(worker_key, None) if current_entry: if not isinstance(current_entry, (list, tuple, set)): current_entry = [current_entry] current_paths = [str(x) for x in current_entry] stems_for_skip = [canonical_stem(path) for path in current_paths] - marker_path = marker_files.get(gpu_id) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -555,12 +576,17 @@ def extract( state_mgr.save(processed_files, 
problematic_files) if stems_for_skip: skip_mgr.add(stems_for_skip) - if gpu_id is not None: - self.logger.info("Respawning GPU%s worker after crash.", gpu_id) + if worker_spec is not None: + self.logger.info( + "Respawning %s after crash.", + worker_spec["worker_key"], + ) replacement = ctx.Process( target=gpu_extract_worker_queue, args=( - gpu_id, + worker_spec["device_id"], + worker_spec["worker_slot"], + worker_spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -584,13 +610,13 @@ def extract( procs.append(replacement) active.append(replacement) if replacement.pid is not None: - proc_gpu[replacement.pid] = gpu_id + proc_specs[replacement.pid] = dict(worker_spec) heartbeat[replacement.pid] = time.time() continue else: - if gpu_id is not None: - status_map.pop(gpu_id, None) - marker_path = marker_files.get(gpu_id) + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -618,7 +644,7 @@ def extract( skip_mgr.add(bad_stems) state_mgr.save(processed_files, problematic_files) self.logger.info( - "GPU%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", + "%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", result.get("worker"), len(ok_stems), len(bad_stems), @@ -632,25 +658,20 @@ def extract( if result.get("exitcode", 0) not in (0, None): any_fail = True self.logger.warning( - "GPU%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") + "%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") ) worker_pid = result.get("pid") if worker_pid is not None: heartbeat[worker_pid] = time.time() - worker_gpu = result.get("worker") - if worker_gpu is not None: - try: - worker_gpu_int = int(worker_gpu) - except Exception: - worker_gpu_int = None - else: - status_map.pop(worker_gpu_int, None) - marker_path = 
marker_files.get(worker_gpu_int) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass + worker_key = result.get("worker") + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(str(worker_key)) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass now = time.time() if now - last_summary > 30: diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 861f28b..1c21cf1 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -46,9 +46,9 @@ def _maybe_import_torch(*, force: bool = False): MarkdownFormatOption = None CsvFormatOption = None StandardPdfPipeline = None -DoclingParseV2DocumentBackend = None DoclingParseDocumentBackend = None PyPdfiumDocumentBackend = None +_DOCLING_PARSE_BACKEND_NAME = "docling_parse" class _NoOpOption: # minimal stand-ins for optional helpers @@ -83,19 +83,23 @@ def _ensure_docling_converter_loaded() -> None: def _ensure_docling_pipeline_loaded() -> None: global _DOC_PIPELINE_LOADED, StandardPdfPipeline - global DoclingParseV2DocumentBackend, DoclingParseDocumentBackend, PyPdfiumDocumentBackend + global DoclingParseDocumentBackend, PyPdfiumDocumentBackend, _DOCLING_PARSE_BACKEND_NAME if _DOC_PIPELINE_LOADED: return try: StandardPdfPipeline = importlib.import_module( "docling.pipeline.standard_pdf_pipeline" ).StandardPdfPipeline - DoclingParseV2DocumentBackend = importlib.import_module( - "docling.backend.docling_parse_v2_backend" - ).DoclingParseV2DocumentBackend - DoclingParseDocumentBackend = importlib.import_module( - "docling.backend.docling_parse_backend" - ).DoclingParseDocumentBackend + try: + DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_backend" + ).DoclingParseDocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse" + except Exception: + DoclingParseDocumentBackend = importlib.import_module( + 
"docling.backend.docling_parse_v2_backend" + ).DoclingParseV2DocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse_v2" PyPdfiumDocumentBackend = importlib.import_module( "docling.backend.pypdfium2_backend" ).PyPdfiumDocumentBackend @@ -382,7 +386,7 @@ def _convert_all_with_timeout(self, files: Iterable[Path], timeout_s: int, **kwa timeout_kw = None backend_cls = getattr(self, "_active_pdf_backend", None) - is_native_backend = backend_cls is DoclingParseV2DocumentBackend if backend_cls else False + is_native_backend = backend_cls is DoclingParseDocumentBackend if backend_cls else False if timeout_kw and not is_native_backend and len(set(budgets)) == 1: kw = dict(raises_on_error=False) @@ -556,8 +560,8 @@ def create_extractor( pass # Record the PDF backend name for provenance (default to native backend) - self.pdf_backend_name = "docling_parse_v2" - self._active_pdf_backend = DoclingParseV2DocumentBackend + self.pdf_backend_name = _DOCLING_PARSE_BACKEND_NAME + self._active_pdf_backend = DoclingParseDocumentBackend # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: @@ -582,7 +586,7 @@ def create_extractor( except Exception: pass - active_backend = DoclingParseV2DocumentBackend + active_backend = DoclingParseDocumentBackend device_str = self._current_device_str() or "cuda:0" _, opts = build_layout_pipeline( device=device_str, @@ -599,13 +603,13 @@ def create_extractor( self._active_pdf_options = opts self._current_ocr_enabled = False - pdf_backend = DoclingParseV2DocumentBackend + pdf_backend = DoclingParseDocumentBackend try: if getattr(self, "use_pypdfium_backend", False): pdf_backend = PyPdfiumDocumentBackend self.pdf_backend_name = "pypdfium" except Exception: - pdf_backend = DoclingParseV2DocumentBackend + pdf_backend = DoclingParseDocumentBackend active_backend = pdf_backend self.converter = DocumentConverter( @@ -1198,7 +1202,7 @@ def _update_extraction_metadata( if chunk_manifest_path is not None: data["chunk_manifest_path"] = 
str(chunk_manifest_path) # Backend and failure - backend_name = getattr(self, "pdf_backend_name", None) or ("docling_parse_v2" if getattr(self, "USE_V2", True) else "docling_parse") + backend_name = getattr(self, "pdf_backend_name", None) or _DOCLING_PARSE_BACKEND_NAME data["extraction_backend"] = backend_name if status in ("timeout", "error", "failure"): data["failure_mode"] = status diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 2568665..3005786 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -112,6 +112,7 @@ def run_for_files( return {} input_root = Path(getattr(self_ref, "input_dir", ".")).resolve() + pdf_root = (input_root / "downloads") if (input_root / "downloads").exists() else input_root out_root = Path(output_dir) if output_dir else Path(getattr(self_ref, "output_dir", input_root)) md_dir = out_root / "markdown" metrics_dir = out_root / "json" / "metrics" @@ -146,7 +147,7 @@ def run_for_files( raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") _run_cli( - input_dir=input_root, + input_dir=pdf_root, output_dir=out_root, files=file_list, model_dir=model_root, @@ -159,7 +160,7 @@ def run_for_files( results: Dict[str, Any] = {} for name in file_list: - pdf_path = (input_root / name).resolve() + pdf_path = (pdf_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py index aea64fd..8162e60 100644 --- a/src/glossapi/ocr/docling/pipeline.py +++ b/src/glossapi/ocr/docling/pipeline.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from typing import Tuple from docling.datamodel.pipeline_options import ( @@ -66,9 +67,55 @@ def _apply_common_pdf_options( setattr(opts, "images_scale", images_scale) except Exception: pass + _apply_runtime_overrides(opts) return opts +def 
_apply_runtime_overrides(opts: PdfPipelineOptions) -> None: + """Apply optional runtime tuning knobs exposed by newer Docling releases.""" + + int_env_map = { + "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE": "layout_batch_size", + "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE": "table_batch_size", + "GLOSSAPI_DOCLING_OCR_BATCH_SIZE": "ocr_batch_size", + "GLOSSAPI_DOCLING_QUEUE_MAX_SIZE": "queue_max_size", + "GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT": "document_timeout", + } + float_env_map = { + "GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL": "batch_polling_interval_seconds", + } + + for env_name, attr_name in int_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = int(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + for env_name, attr_name in float_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = float(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + def build_layout_pipeline( *, device: str = "cuda:0", diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py index ef85950..4a96e09 100644 --- a/src/glossapi/ocr/docling_pipeline.py +++ b/src/glossapi/ocr/docling_pipeline.py @@ -1,82 +1,5 @@ -from __future__ import annotations - -from typing import Tuple - -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - PictureDescriptionApiOptions, - TableFormerMode, - TableStructureOptions, -) - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - 
want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Build the Docling PDF pipeline used for Phase-1 extraction.""" - - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - acc, _ = _resolve_accelerator(device) - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", float(images_scale)) - except Exception: - pass - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts +"""Compatibility wrapper for the canonical Docling pipeline builder.""" +from .docling.pipeline import build_layout_pipeline __all__ = ["build_layout_pipeline"] diff --git 
a/src/glossapi/scripts/ocr_gpu_batch.py b/src/glossapi/scripts/ocr_gpu_batch.py index 2183664..2646baa 100644 --- a/src/glossapi/scripts/ocr_gpu_batch.py +++ b/src/glossapi/scripts/ocr_gpu_batch.py @@ -115,15 +115,21 @@ def main(argv: Optional[List[str]] = None) -> int: "--force-ocr", dest="force_ocr", action="store_true", - help="Force GPU OCR during extraction (default).", + help="Deprecated no-op retained for compatibility; OCR now runs through Corpus.ocr(...).", ) parser.add_argument( "--no-force-ocr", dest="force_ocr", action="store_false", - help="Skip forced OCR (only run math/layout).", + help="Explicitly disable the deprecated Phase-1 OCR flag.", + ) + parser.set_defaults(force_ocr=False) + parser.add_argument( + "--workers-per-device", + type=int, + default=1, + help="Number of extraction workers to bind to each visible GPU (default: 1).", ) - parser.set_defaults(force_ocr=True) parser.add_argument( "--dry-run", action="store_true", @@ -182,6 +188,7 @@ def main(argv: Optional[List[str]] = None) -> int: export_doc_json=True, emit_formula_index=emit_formula_index, phase1_backend=args.phase1_backend, + workers_per_device=max(1, int(args.workers_per_device)), ) print("[ocr_gpu_batch] Extraction complete.") @@ -190,4 +197,3 @@ def main(argv: Optional[List[str]] = None) -> int: if __name__ == "__main__": # pragma: no cover - CLI entrypoint raise SystemExit(main()) - diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 424d359..a5ea0b1 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -60,21 +60,23 @@ def set_torch_stub(monkeypatch, *, available: bool, device_count: int): return torch_ns -def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_force_ocr_is_ignored_for_backend_selection(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - with pytest.raises(RuntimeError) 
as exc: - corpus.prime_extractor( - input_format="pdf", - accel_type="CUDA", - force_ocr=True, - phase1_backend="docling", - ) + corpus.prime_extractor( + input_format="pdf", + accel_type="CPU", + force_ocr=True, + phase1_backend="auto", + ) - assert "Torch CUDA is not available" in str(exc.value) + assert corpus.extractor.last_policy == "safe" + ensure_kwargs = corpus.extractor.ensure_calls[0] + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -109,7 +111,7 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey assert corpus.extractor.ensure_calls[0]["enable_ocr"] is False -def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_configures_docling_backend_explicitly(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() @@ -117,16 +119,15 @@ def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatc corpus.prime_extractor( input_format="pdf", accel_type="CUDA", - force_ocr=True, - phase1_backend="auto", + phase1_backend="docling", ) assert corpus.extractor.last_policy == "docling" assert corpus.extractor.last_max_batch_files == 1 assert corpus.extractor.last_prefer_safe_backend is False ensure_kwargs = corpus.extractor.ensure_calls[0] - assert ensure_kwargs["enable_ocr"] is True - assert ensure_kwargs["force_full_page_ocr"] is True + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypatch): @@ -188,6 +189,8 @@ def extract(self, *, file_paths=None, **kwargs): with pytest.raises(SystemExit) as exit_info: corpus_mod.gpu_extract_worker_queue( device_id=0, + worker_slot=0, + worker_key="gpu0-w0", in_dir=str(tmp_path), out_dir=str(tmp_path), work_q=work_q, diff --git 
a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 3779d07..e2198b7 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -54,4 +54,4 @@ def fail_math(*args, **kwargs): def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) + corpus.ocr(backend="bogus", fix_bad=True, math_enhance=False) diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py index 7dae1b7..f673a83 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -126,7 +126,6 @@ def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -244,7 +243,6 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -378,7 +376,6 @@ def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): accel_type="CUDA", num_threads=1, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -469,7 +466,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) From efd169835a414441b487e02948f73e09984d24e4 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 19:13:29 +0300 Subject: [PATCH 13/93] add multi-worker deepseek gpu sharding --- src/glossapi/corpus/phase_ocr_math.py | 12 ++ src/glossapi/ocr/deepseek/runner.py | 278 +++++++++++++++++++++++-- tests/test_deepseek_runner_contract.py | 88 ++++++++ tests/test_ocr_dispatch_backends.py | 39 ++++ 4 files changed, 405 insertions(+), 12 
deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 80afc7f..722f39a 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -48,6 +48,7 @@ def ocr( math_dpi_base: int = 220, use_gpus: str = "single", devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -74,6 +75,10 @@ def ocr( Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). + - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use + ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. + Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers + per visible GPU. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -581,6 +586,13 @@ def _run_math(stems: List[str]) -> None: self, bad_files, model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=persist_engine, + precision=precision, + device=device, + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=workers_per_gpu, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 2568665..95cd2ae 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -2,6 +2,7 @@ from __future__ import annotations +from contextlib import ExitStack import json import logging import os @@ -30,7 +31,7 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, @@ -41,7 +42,7 @@ def _run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], -) -> None: +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -61,8 +62,19 @@ def _run_cli( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + return cmd + +def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" @@ -70,11 +82,228 @@ def _run_cli( ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: env["LD_LIBRARY_PATH"] = 
f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device) LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None + + +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + 
if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + "weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + 
lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, +) -> None: + lanes = _plan_lanes( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + for lane in lanes: + lane_files = list(lane["files"]) + if not lane_files: + continue + visible_device = int(lane["visible_device"]) + log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=lane_files, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s files=%d weight=%d: %s", + lane["lane_id"], + visible_device, + len(lane_files), + lane["weight"], + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) + + for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + + def run_for_files( self_ref: Any, files: Iterable[str], @@ -91,6 +320,9 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + 
workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, # reserved disable_fp8_kv: bool = False, # reserved **_: Any, @@ -98,7 +330,7 @@ def run_for_files( """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) - del log_dir, allow_stub, allow_cli, persist_engine, precision + del allow_stub, allow_cli, persist_engine, precision del gpu_memory_utilization, disable_fp8_kv if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": @@ -145,17 +377,39 @@ def run_for_files( if not python_exe.exists(): raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - _run_cli( - input_dir=input_root, - output_dir=out_root, - files=file_list, - model_dir=model_root, - python_bin=python_exe, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), device=device, ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=input_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ) + else: + _run_cli( + input_dir=input_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ) results: Dict[str, Any] = {} for name in file_list: diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index a5a93e4..783f4e6 100644 --- a/tests/test_deepseek_runner_contract.py +++ 
b/tests/test_deepseek_runner_contract.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path import pandas as pd @@ -60,3 +61,90 @@ def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): assert canonical_markdown.exists() assert canonical_markdown.read_text(encoding="utf-8") == "final\n" assert not progress_markdown.exists() + + +def test_deepseek_runner_multi_uses_visible_device_isolation(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + input_dir = tmp_path / "input" + output_dir = tmp_path / "output" + input_dir.mkdir() + output_dir.mkdir() + + files = ["a.pdf", "b.pdf", "c.pdf", "d.pdf"] + weights = {"a.pdf": 40, "b.pdf": 30, "c.pdf": 20, "d.pdf": 10} + for name in files: + (input_dir / name).write_bytes(b"%PDF-1.4\n%stub\n") + + class DummyCorpus: + def __init__(self, input_dir: Path, output_dir: Path): + self.input_dir = input_dir + self.output_dir = output_dir + + class FakePopen: + calls = [] + + def __init__(self, cmd, stdout=None, stderr=None, env=None): + self.cmd = list(cmd) + self.env = dict(env or {}) + self.returncode = 0 + FakePopen.calls.append(self) + + args = list(cmd) + out_root = Path(args[args.index("--output-dir") + 1]) + lane_files = [] + idx = args.index("--files") + 1 + while idx < len(args) and not args[idx].startswith("--"): + lane_files.append(args[idx]) + idx += 1 + md_dir = out_root / "markdown" + metrics_dir = out_root / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in lane_files: + stem = Path(name).stem + (md_dir / f"{stem}.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + "{\n \"page_count\": 1\n}\n", + encoding="utf-8", + ) + + def wait(self): + return self.returncode + + script = tmp_path / "run_pdf_ocr_transformers.py" + script.write_text("# stub\n", encoding="utf-8") + model_dir = tmp_path / "DeepSeek-OCR-2" + model_dir.mkdir() + + monkeypatch.setattr(runner, 
"_page_count", lambda path: weights[path.name]) + monkeypatch.setattr(runner.subprocess, "Popen", FakePopen) + + results = runner.run_for_files( + DummyCorpus(input_dir, output_dir), + files, + model_dir=model_dir, + python_bin=Path(sys.executable), + vllm_script=script, + use_gpus="multi", + devices=[2, 5], + workers_per_gpu=2, + ) + + assert sorted(results) == ["a", "b", "c", "d"] + assert len(FakePopen.calls) == 4 + + seen_files = [] + seen_visible_devices = [] + for call in FakePopen.calls: + args = call.cmd + assert "--device" in args + assert args[args.index("--device") + 1] == "cuda" + seen_visible_devices.append(call.env.get("CUDA_VISIBLE_DEVICES")) + idx = args.index("--files") + 1 + while idx < len(args) and not args[idx].startswith("--"): + seen_files.append(args[idx]) + idx += 1 + + assert sorted(seen_files) == sorted(files) + assert sorted(seen_visible_devices) == ["2", "2", "5", "5"] diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 3779d07..89ad4d0 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -55,3 +55,42 @@ def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False) + + +def test_deepseek_backend_forwards_parallelism_controls(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return 
{"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + use_gpus="multi", + devices=[1, 3], + workers_per_gpu=2, + max_pages=7, + ) + + assert calls["files"] == [fname] + assert calls["kwargs"]["use_gpus"] == "multi" + assert calls["kwargs"]["devices"] == [1, 3] + assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["max_pages"] == 7 From 8ed469b7f895499270680bcf0a5ff9ced340c593 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 23:25:21 +0300 Subject: [PATCH 14/93] add deepseek throughput tuning controls --- src/glossapi/corpus/phase_ocr_math.py | 15 +++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 105 +++++++++++++++--- src/glossapi/ocr/deepseek/runner.py | 62 +++++++++++ tests/test_deepseek_runner_contract.py | 39 +++++++ tests/test_ocr_dispatch_backends.py | 12 ++ 5 files changed, 218 insertions(+), 15 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 722f39a..719253f 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -49,6 +49,12 @@ def ocr( use_gpus: str = "single", devices: Optional[List[int]] = None, workers_per_gpu: int = 1, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -79,6 +85,9 @@ def ocr( ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. 
+ - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + DeepSeek throughput and quality controls for benchmarking lighter OCR + modes and more efficient attention backends. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -590,6 +599,12 @@ def _run_math(stems: List[str]) -> None: persist_engine=persist_engine, precision=precision, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 0e0e868..80912d0 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -14,10 +14,28 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer -PROMPT = "\n<|grounding|>Convert the document to markdown. " +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
PAGE_SPLIT = "\n<--- Page Split --->\n" +def _profile_defaults(profile: str) -> dict: + profile_norm = str(profile or "markdown_grounded").strip().lower() + if profile_norm == "plain_ocr": + return { + "prompt": PROMPT_PLAIN_OCR, + "base_size": 768, + "image_size": 512, + "crop_mode": False, + } + return { + "prompt": PROMPT_GROUNDED_MARKDOWN, + "base_size": 1024, + "image_size": 768, + "crop_mode": True, + } + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--input-dir", required=True) @@ -26,6 +44,14 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--files", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -36,12 +62,12 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: return sorted(input_dir.glob("*.pdf")) -def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: +def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) - zoom = 144 / 72.0 + zoom = float(render_dpi) / 72.0 matrix = 
fitz.Matrix(zoom, zoom) for idx in range(page_count): page = doc[idx] @@ -65,12 +91,19 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() -def _load_model(model_dir: Path, device: str): - attn_impl = "flash_attention_2" +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested try: import flash_attn # noqa: F401 + return "flash_attention_2" except Exception: - attn_impl = "eager" + return "sdpa" + + +def _load_model(model_dir: Path, device: str, attn_backend: str): + attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) model = AutoModel.from_pretrained( model_dir, @@ -82,18 +115,28 @@ def _load_model(model_dir: Path, device: str): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - return tokenizer, model + return tokenizer, model, attn_impl -def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: +def _infer_page( + model, + tokenizer, + image_path: Path, + output_dir: Path, + *, + prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, +) -> str: result = model.infer( tokenizer, - prompt=PROMPT, + prompt=prompt, image_file=str(image_path), output_path=str(output_dir), - base_size=1024, - image_size=768, - crop_mode=True, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, save_results=False, eval_mode=True, ) @@ -155,10 +198,16 @@ def main() -> int: if not pdfs: return 0 - tokenizer, model = _load_model(model_dir, args.device) + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + 
crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model(model_dir, args.device, args.attn_backend) for pdf_path in pdfs: - images = _render_pages(pdf_path, args.max_pages) + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) page_outputs: List[str] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) @@ -167,7 +216,16 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) @@ -180,6 +238,23 @@ def main() -> int: ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + metrics_path = output_dir / "json" / "metrics" / f"{pdf_path.stem}.metrics.json" + if metrics_path.exists(): + try: + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + metrics.update( + { + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + } + ) + metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8") + except Exception: + pass return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 95cd2ae..e2c677f 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -42,6 +42,12 @@ def _build_cli_command( max_pages: Optional[int], content_debug: bool, device: Optional[str], + 
ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -62,6 +68,20 @@ def _build_cli_command( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] return cmd @@ -96,6 +116,12 @@ def _run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -108,6 +134,12 @@ def _run_cli( max_pages=max_pages, content_debug=content_debug, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -249,6 +281,12 @@ def _run_multi_cli( max_pages: Optional[int], content_debug: bool, log_dir: Path, + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -281,6 +319,12 @@ def _run_multi_cli( max_pages=max_pages, content_debug=content_debug, device="cuda", + ocr_profile=ocr_profile, + attn_backend=attn_backend, + 
base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -320,6 +364,12 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -397,6 +447,12 @@ def run_for_files( max_pages=max_pages, content_debug=content_debug, log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) else: _run_cli( @@ -409,6 +465,12 @@ def run_for_files( max_pages=max_pages, content_debug=content_debug, device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 783f4e6..c7c4d2f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -148,3 +148,42 @@ def wait(self): assert sorted(seen_files) == sorted(files) assert sorted(seen_visible_devices) == ["2", "2", "5", "5"] + + +def test_deepseek_runner_builds_speed_control_flags(tmp_path): + from glossapi.ocr.deepseek import runner + + script = tmp_path / "run_pdf_ocr_transformers.py" + script.write_text("# stub\n", encoding="utf-8") + model_dir = tmp_path / "DeepSeek-OCR-2" + model_dir.mkdir() + + cmd = runner._build_cli_command( + input_dir=tmp_path / "input", 
+ output_dir=tmp_path / "output", + files=["doc.pdf"], + model_dir=model_dir, + python_bin=Path(sys.executable), + script=script, + max_pages=3, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=False, + render_dpi=120, + ) + + assert "--ocr-profile" in cmd + assert cmd[cmd.index("--ocr-profile") + 1] == "plain_ocr" + assert "--attn-backend" in cmd + assert cmd[cmd.index("--attn-backend") + 1] == "sdpa" + assert "--base-size" in cmd + assert cmd[cmd.index("--base-size") + 1] == "640" + assert "--image-size" in cmd + assert cmd[cmd.index("--image-size") + 1] == "448" + assert "--no-crop-mode" in cmd + assert "--render-dpi" in cmd + assert cmd[cmd.index("--render-dpi") + 1] == "120" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 89ad4d0..7774145 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -86,6 +86,12 @@ def fake_run_for_files(self_ref, files, **kwargs): use_gpus="multi", devices=[1, 3], workers_per_gpu=2, + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=False, + render_dpi=120, max_pages=7, ) @@ -93,4 +99,10 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["use_gpus"] == "multi" assert calls["kwargs"]["devices"] == [1, 3] assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["ocr_profile"] == "plain_ocr" + assert calls["kwargs"]["attn_backend"] == "sdpa" + assert calls["kwargs"]["base_size"] == 640 + assert calls["kwargs"]["image_size"] == 448 + assert calls["kwargs"]["crop_mode"] is False + assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 From b749225384f24f3f6084804c3ca11a87316344dd Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sun, 29 Mar 2026 23:50:53 +0300 Subject: [PATCH 15/93] fallback to eager when deepseek sdpa is unsupported --- 
.../ocr/deepseek/run_pdf_ocr_transformers.py | 41 ++++++++++++++++--- tests/test_deepseek_runner_contract.py | 36 ++++++++++++++++ 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 80912d0..2ac927f 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -4,6 +4,7 @@ import argparse import json +import logging import re import tempfile from pathlib import Path @@ -14,6 +15,7 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer +LOGGER = logging.getLogger(__name__) PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." PAGE_SPLIT = "\n<--- Page Split --->\n" @@ -102,15 +104,42 @@ def _resolve_attn_backend(attn_backend: str) -> str: return "sdpa" +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', + ) + return any(marker in message for marker in markers) + + def _load_model(model_dir: Path, device: str, attn_backend: str): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) - model = AutoModel.from_pretrained( - model_dir, - _attn_implementation=attn_impl, - trust_remote_code=True, - use_safetensors=True, - ) + try: + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + except ValueError as exc: + if not _supports_retry_with_eager(exc, attn_impl): + raise + LOGGER.warning( + "DeepSeek model rejected 
attention backend `%s`; retrying with eager attention: %s", + attn_impl, + exc, + ) + attn_impl = "eager" + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) if device.startswith("cuda"): model = model.eval().to(device).to(torch.bfloat16) else: diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index c7c4d2f..bf30602 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -187,3 +187,39 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert "--no-crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" + + +def test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + class DummyModel: + def eval(self): + return self + + def to(self, *_args, **_kwargs): + return self + + monkeypatch.setattr( + cli.AutoTokenizer, + "from_pretrained", + lambda *args, **kwargs: "tokenizer", + ) + + calls: list[str] = [] + + def fake_from_pretrained(*_args, **kwargs): + attn = kwargs.get("_attn_implementation") + calls.append(attn) + if attn == "sdpa": + raise ValueError( + "DeepseekOCR2ForCausalLM does not support an attention implementation through " + "torch.nn.functional.scaled_dot_product_attention yet." 
+ ) + return DummyModel() + + monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) + + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto") + + assert calls == ["sdpa", "eager"] + assert attn_impl == "eager" From 864b0eaa10062a2a04787dde75b26c55c4a2dd37 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 00:14:50 +0300 Subject: [PATCH 16/93] fix deepseek plain ocr crop defaults --- src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py | 2 +- tests/test_deepseek_runner_contract.py | 4 ++-- tests/test_ocr_dispatch_backends.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 2ac927f..7e9391b 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -28,7 +28,7 @@ def _profile_defaults(profile: str) -> dict: "prompt": PROMPT_PLAIN_OCR, "base_size": 768, "image_size": 512, - "crop_mode": False, + "crop_mode": True, } return { "prompt": PROMPT_GROUNDED_MARKDOWN, diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index bf30602..1c7d987 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -172,7 +172,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): attn_backend="sdpa", base_size=640, image_size=448, - crop_mode=False, + crop_mode=True, render_dpi=120, ) @@ -184,7 +184,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert cmd[cmd.index("--base-size") + 1] == "640" assert "--image-size" in cmd assert cmd[cmd.index("--image-size") + 1] == "448" - assert "--no-crop-mode" in cmd + assert "--crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 
7774145..7b7fd15 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -90,7 +90,7 @@ def fake_run_for_files(self_ref, files, **kwargs): attn_backend="sdpa", base_size=640, image_size=448, - crop_mode=False, + crop_mode=True, render_dpi=120, max_pages=7, ) @@ -103,6 +103,6 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["attn_backend"] == "sdpa" assert calls["kwargs"]["base_size"] == 640 assert calls["kwargs"]["image_size"] == 448 - assert calls["kwargs"]["crop_mode"] is False + assert calls["kwargs"]["crop_mode"] is True assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 From b319ae5ae54689d30c4706fb3eb431d71e926812 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 00:42:20 +0300 Subject: [PATCH 17/93] add deepseek max token cap control --- src/glossapi/corpus/phase_ocr_math.py | 5 ++++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 30 +++++++++++++++++-- src/glossapi/ocr/deepseek/runner.py | 10 +++++++ tests/test_deepseek_runner_contract.py | 22 +++++++++++++- tests/test_ocr_dispatch_backends.py | 2 ++ 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 719253f..f028a9a 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -55,6 +55,7 @@ def ocr( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -88,6 +89,9 @@ def ocr( - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek throughput and quality controls for benchmarking lighter OCR modes and more efficient attention backends. + - max_new_tokens: optional cap for DeepSeek generation per page. 
Useful + for benchmarking and for containing long-tail pages with pathological + output lengths. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -605,6 +609,7 @@ def _run_math(stems: List[str]) -> None: image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 7e9391b..cf380e8 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -51,6 +51,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") parser.set_defaults(crop_mode=None) @@ -115,7 +116,26 @@ def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: return any(marker in message for marker in markers) -def _load_model(model_dir: Path, device: str, attn_backend: str): +def _cap_generate_tokens(model, max_new_tokens: int | None): + if max_new_tokens is None: + return + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + return 
original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: int | None): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: @@ -144,6 +164,7 @@ def _load_model(model_dir: Path, device: str, attn_backend: str): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) + _cap_generate_tokens(model, max_new_tokens) return tokenizer, model, attn_impl @@ -233,7 +254,12 @@ def main() -> int: image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) - tokenizer, model, attn_impl = _load_model(model_dir, args.device, args.attn_backend) + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + ) for pdf_path in pdfs: images = _render_pages(pdf_path, args.max_pages, args.render_dpi) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index e2c677f..503d16d 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -48,6 +48,7 @@ def _build_cli_command( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -82,6 +83,8 @@ def _build_cli_command( cmd.append("--no-crop-mode") if render_dpi is not None: cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] return cmd @@ -122,6 +125,7 @@ def _run_cli( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], visible_device: Optional[int] = None, ) 
-> None: cmd = _build_cli_command( @@ -140,6 +144,7 @@ def _run_cli( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -287,6 +292,7 @@ def _run_multi_cli( image_size: Optional[int], crop_mode: Optional[bool], render_dpi: Optional[int], + max_new_tokens: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -325,6 +331,7 @@ def _run_multi_cli( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -370,6 +377,7 @@ def run_for_files( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -453,6 +461,7 @@ def run_for_files( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) else: _run_cli( @@ -471,6 +480,7 @@ def run_for_files( image_size=image_size, crop_mode=crop_mode, render_dpi=render_dpi, + max_new_tokens=max_new_tokens, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 1c7d987..62c9a98 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -174,6 +174,7 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): image_size=448, crop_mode=True, render_dpi=120, + max_new_tokens=2048, ) assert "--ocr-profile" in cmd @@ -187,6 +188,8 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert "--crop-mode" in cmd assert "--render-dpi" in cmd assert cmd[cmd.index("--render-dpi") + 1] == "120" + assert "--max-new-tokens" in cmd + assert cmd[cmd.index("--max-new-tokens") + 1] == "2048" def 
test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): @@ -219,7 +222,24 @@ def fake_from_pretrained(*_args, **kwargs): monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) - _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto") + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None) assert calls == ["sdpa", "eager"] assert attn_impl == "eager" + + +def test_deepseek_generate_cap_applies_max_new_tokens(): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + seen = {} + + class DummyModel: + def generate(self, *args, **kwargs): + seen["kwargs"] = dict(kwargs) + return "ok" + + model = DummyModel() + cli._cap_generate_tokens(model, 2048) + model.generate(max_new_tokens=8192, foo="bar") + assert seen["kwargs"]["max_new_tokens"] == 2048 + assert seen["kwargs"]["foo"] == "bar" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 7b7fd15..20efd77 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -93,6 +93,7 @@ def fake_run_for_files(self_ref, files, **kwargs): crop_mode=True, render_dpi=120, max_pages=7, + max_new_tokens=2048, ) assert calls["files"] == [fname] @@ -106,3 +107,4 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["crop_mode"] is True assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 + assert calls["kwargs"]["max_new_tokens"] == 2048 From 2635c0cdd8a42a8abc87a81df52a8951ac44a944 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Mon, 30 Mar 2026 02:52:00 +0300 Subject: [PATCH 18/93] add deepseek generation guards and page metrics --- src/glossapi/corpus/phase_ocr_math.py | 7 + .../ocr/deepseek/run_pdf_ocr_transformers.py | 166 ++++++++++++++---- src/glossapi/ocr/deepseek/runner.py | 20 +++ tests/test_deepseek_runner_contract.py | 35 +++- tests/test_ocr_dispatch_backends.py | 4 + 5 files 
changed, 197 insertions(+), 35 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index f028a9a..420fe57 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -56,6 +56,8 @@ def ocr( crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, max_new_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, @@ -92,6 +94,9 @@ def ocr( - max_new_tokens: optional cap for DeepSeek generation per page. Useful for benchmarking and for containing long-tail pages with pathological output lengths. + - repetition_penalty/no_repeat_ngram_size: optional generation guards + for DeepSeek. These are useful when OCR runs fall into repeated or + looping output on difficult pages. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -610,6 +615,8 @@ def _run_math(stems: List[str]) -> None: crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, use_gpus=use_gpus, devices=devices, workers_per_gpu=workers_per_gpu, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index cf380e8..356fc38 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -6,7 +6,9 @@ import json import logging import re +import sys import tempfile +import time from pathlib import Path from typing import Iterable, List @@ -15,6 +17,17 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + LOGGER = logging.getLogger(__name__) PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
@@ -52,6 +65,8 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") parser.set_defaults(crop_mode=None) @@ -94,6 +109,21 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + def _resolve_attn_backend(attn_backend: str) -> str: requested = str(attn_backend or "auto").strip().lower() if requested != "auto": @@ -116,26 +146,60 @@ def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: return any(marker in message for marker in markers) -def _cap_generate_tokens(model, max_new_tokens: int | None): - if max_new_tokens is None: +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): return - capped = int(max_new_tokens) - if capped <= 0: - raise ValueError("max_new_tokens must be > 0") + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise 
ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") original_generate = model.generate def _wrapped_generate(*args, **kwargs): - current = kwargs.get("max_new_tokens") - if current is None: - kwargs["max_new_tokens"] = capped - else: - kwargs["max_new_tokens"] = min(int(current), capped) + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value return original_generate(*args, **kwargs) model.generate = _wrapped_generate -def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: int | None): +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: @@ -164,7 +228,12 @@ def _load_model(model_dir: Path, device: str, attn_backend: str, max_new_tokens: model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - _cap_generate_tokens(model, max_new_tokens) + _configure_generate( + model, + max_new_tokens=max_new_tokens, + 
repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) return tokenizer, model, attn_impl @@ -193,7 +262,13 @@ def _infer_page( return _clean_markdown(str(result)) -def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: +def _write_outputs( + output_dir: Path, + stem: str, + markdown: str, + page_count: int, + extra_metrics: dict | None = None, +) -> None: md_dir = output_dir / "markdown" metrics_dir = output_dir / "json" / "metrics" progress_dir = output_dir / "sidecars" / "ocr_progress" @@ -205,6 +280,8 @@ def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) "page_count": page_count, "model": "deepseek-ai/DeepSeek-OCR-2", } + if extra_metrics: + metrics.update(extra_metrics) (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") partial_path = progress_dir / f"{stem}.partial.md" if partial_path.exists(): @@ -259,11 +336,17 @@ def main() -> int: args.device, args.attn_backend, args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, ) for pdf_path in pdfs: + doc_start = time.perf_counter() + render_start = time.perf_counter() images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + render_sec = time.perf_counter() - render_start page_outputs: List[str] = [] + page_metrics: List[dict] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: @@ -271,7 +354,8 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page( + infer_start = time.perf_counter() + raw_page_text = _infer_page( model, tokenizer, page_png, @@ -281,9 +365,24 @@ def main() -> int: image_size=image_size, crop_mode=crop_mode, ) + infer_sec = time.perf_counter() - infer_start + page_text, 
postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) _write_progress( output_dir, pdf_path.stem, @@ -292,24 +391,27 @@ def main() -> int: idx + 1, ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" - _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) - metrics_path = output_dir / "json" / "metrics" / f"{pdf_path.stem}.metrics.json" - if metrics_path.exists(): - try: - metrics = json.loads(metrics_path.read_text(encoding="utf-8")) - metrics.update( - { - "ocr_profile": args.ocr_profile, - "attn_backend": attn_impl, - "base_size": base_size, - "image_size": image_size, - "crop_mode": crop_mode, - "render_dpi": int(args.render_dpi), - } - ) - metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - except Exception: - pass + _write_outputs( + output_dir, + pdf_path.stem, + markdown, + len(images), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 503d16d..de52e24 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ 
b/src/glossapi/ocr/deepseek/runner.py @@ -49,6 +49,8 @@ def _build_cli_command( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -85,6 +87,10 @@ def _build_cli_command( cmd += ["--render-dpi", str(int(render_dpi))] if max_new_tokens is not None: cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] return cmd @@ -126,6 +132,8 @@ def _run_cli( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -145,6 +153,8 @@ def _run_cli( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -293,6 +303,8 @@ def _run_multi_cli( crop_mode: Optional[bool], render_dpi: Optional[int], max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -332,6 +344,8 @@ def _run_multi_cli( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -378,6 +392,8 @@ def run_for_files( crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, max_new_tokens: Optional[int] = None, + repetition_penalty: 
Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, @@ -462,6 +478,8 @@ def run_for_files( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) else: _run_cli( @@ -481,6 +499,8 @@ def run_for_files( crop_mode=crop_mode, render_dpi=render_dpi, max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 62c9a98..65d1d56 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -175,6 +175,8 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): crop_mode=True, render_dpi=120, max_new_tokens=2048, + repetition_penalty=1.05, + no_repeat_ngram_size=8, ) assert "--ocr-profile" in cmd @@ -190,6 +192,10 @@ def test_deepseek_runner_builds_speed_control_flags(tmp_path): assert cmd[cmd.index("--render-dpi") + 1] == "120" assert "--max-new-tokens" in cmd assert cmd[cmd.index("--max-new-tokens") + 1] == "2048" + assert "--repetition-penalty" in cmd + assert cmd[cmd.index("--repetition-penalty") + 1] == "1.05" + assert "--no-repeat-ngram-size" in cmd + assert cmd[cmd.index("--no-repeat-ngram-size") + 1] == "8" def test_deepseek_model_load_falls_back_to_eager_when_sdpa_is_unsupported(tmp_path, monkeypatch): @@ -222,13 +228,13 @@ def fake_from_pretrained(*_args, **kwargs): monkeypatch.setattr(cli.AutoModel, "from_pretrained", fake_from_pretrained) - _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None) + _tokenizer, _model, attn_impl = cli._load_model(tmp_path, "cpu", "auto", None, None, None) assert calls == ["sdpa", "eager"] assert attn_impl == "eager" -def 
test_deepseek_generate_cap_applies_max_new_tokens(): +def test_deepseek_generate_controls_apply(): from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli seen = {} @@ -239,7 +245,30 @@ def generate(self, *args, **kwargs): return "ok" model = DummyModel() - cli._cap_generate_tokens(model, 2048) + cli._configure_generate( + model, + max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, + ) model.generate(max_new_tokens=8192, foo="bar") assert seen["kwargs"]["max_new_tokens"] == 2048 + assert seen["kwargs"]["repetition_penalty"] == 1.08 + assert seen["kwargs"]["no_repeat_ngram_size"] == 12 assert seen["kwargs"]["foo"] == "bar" + + +def test_postprocess_page_text_strips_prompt_and_truncates_repetition(): + from glossapi.ocr.deepseek import run_pdf_ocr_transformers as cli + + prompt = cli.PROMPT_PLAIN_OCR + raw = ( + "\nExtract the text from the document page in reading order.\n" + "Γραμμή 1\n" + + "\n".join(["ΕΠΑΝΑΛΗΨΗ"] * 12) + ) + cleaned, metrics = cli._postprocess_page_text(raw, prompt=prompt, content_debug=False) + + assert "Extract the text from the document page in reading order." 
not in cleaned + assert cleaned.splitlines().count("ΕΠΑΝΑΛΗΨΗ") <= 10 + assert metrics["early_stops"] == 1 diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 20efd77..2d075df 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -94,6 +94,8 @@ def fake_run_for_files(self_ref, files, **kwargs): render_dpi=120, max_pages=7, max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, ) assert calls["files"] == [fname] @@ -108,3 +110,5 @@ def fake_run_for_files(self_ref, files, **kwargs): assert calls["kwargs"]["render_dpi"] == 120 assert calls["kwargs"]["max_pages"] == 7 assert calls["kwargs"]["max_new_tokens"] == 2048 + assert calls["kwargs"]["repetition_penalty"] == 1.08 + assert calls["kwargs"]["no_repeat_ngram_size"] == 12 From 4536e0e1995b4729c41d502360174f56b55029a8 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 03:44:19 +0300 Subject: [PATCH 19/93] Add DeepSeek OCR speed controls and sharding --- src/glossapi/corpus/phase_ocr_math.py | 37 ++ .../ocr/deepseek/run_pdf_ocr_transformers.py | 281 +++++++++++-- src/glossapi/ocr/deepseek/runner.py | 370 +++++++++++++++++- tests/test_deepseek_runner_contract.py | 83 ++++ 4 files changed, 736 insertions(+), 35 deletions(-) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 80afc7f..1e75a1b 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -41,6 +41,16 @@ def ocr( limit: Optional[int] = None, dpi: Optional[int] = None, # reserved for future use precision: Optional[str] = None, # reserved for future use ("fp16","bf16") + workers_per_gpu: int = 1, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, + repetition_penalty: 
Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -74,6 +84,17 @@ def ocr( Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - math_enhance: run math/code enrichment after OCR (default True). + - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use + ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. + Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers + per visible GPU. + - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + DeepSeek rendering and attention controls used for throughput/quality + benchmarking. + - max_new_tokens/repetition_penalty/no_repeat_ngram_size: + Optional generation controls forwarded to DeepSeek. These are exposed + for runtime experiments; leave them unset unless a benchmark calls for + them explicitly. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -581,6 +602,22 @@ def _run_math(stems: List[str]) -> None: self, bad_files, model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=bool(persist_engine), + precision=precision, + device=device, + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 0e0e868..e46fadf 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -4,8 +4,11 @@ import argparse import json +import logging import re +import sys import tempfile +import time from pathlib import Path from typing import Iterable, List @@ -14,10 +17,40 @@ from PIL import Image from transformers import AutoModel, AutoTokenizer -PROMPT = "\n<|grounding|>Convert the document to markdown. " +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + +LOGGER = logging.getLogger(__name__) +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
PAGE_SPLIT = "\n<--- Page Split --->\n" +def _profile_defaults(profile: str) -> dict: + profile_norm = str(profile or "markdown_grounded").strip().lower() + if profile_norm == "plain_ocr": + return { + "prompt": PROMPT_PLAIN_OCR, + "base_size": 768, + "image_size": 512, + "crop_mode": True, + } + return { + "prompt": PROMPT_GROUNDED_MARKDOWN, + "base_size": 1024, + "image_size": 768, + "crop_mode": True, + } + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--input-dir", required=True) @@ -26,6 +59,17 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--files", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -36,12 +80,12 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: return sorted(input_dir.glob("*.pdf")) -def _render_pages(pdf_path: Path, max_pages: int | None) -> List[Image.Image]: +def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: images: 
List[Image.Image] = [] doc = fitz.open(pdf_path) try: page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) - zoom = 144 / 72.0 + zoom = float(render_dpi) / 72.0 matrix = fitz.Matrix(zoom, zoom) for idx in range(page_count): page = doc[idx] @@ -65,42 +109,169 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() -def _load_model(model_dir: Path, device: str): - attn_impl = "flash_attention_2" +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested try: import flash_attn # noqa: F401 + return "flash_attention_2" except Exception: - attn_impl = "eager" - tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) - model = AutoModel.from_pretrained( - model_dir, - _attn_implementation=attn_impl, - trust_remote_code=True, - use_safetensors=True, + # DeepSeek-OCR-2's custom decoder path has not behaved reliably with SDPA + # on the stacks we have exercised; if FA2 is unavailable, prefer the known + # fallback instead of silently selecting a backend that then downgrades. 
+ return "eager" + + +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', ) + return any(marker in message for marker in markers) + + +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): + return + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value + return original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | 
None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + attn_impl = _resolve_attn_backend(attn_backend) + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) + try: + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) + except ValueError as exc: + if not _supports_retry_with_eager(exc, attn_impl): + raise + LOGGER.warning( + "DeepSeek model rejected attention backend `%s`; retrying with eager attention: %s", + attn_impl, + exc, + ) + attn_impl = "eager" + model = AutoModel.from_pretrained( + model_dir, + _attn_implementation=attn_impl, + trust_remote_code=True, + use_safetensors=True, + ) if device.startswith("cuda"): model = model.eval().to(device).to(torch.bfloat16) else: model = model.eval().to(device) - return tokenizer, model + _configure_generate( + model, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + return tokenizer, model, attn_impl -def _infer_page(model, tokenizer, image_path: Path, output_dir: Path) -> str: +def _infer_page( + model, + tokenizer, + image_path: Path, + output_dir: Path, + *, + prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, +) -> str: result = model.infer( tokenizer, - prompt=PROMPT, + prompt=prompt, image_file=str(image_path), output_path=str(output_dir), - base_size=1024, - image_size=768, - crop_mode=True, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, save_results=False, eval_mode=True, ) return _clean_markdown(str(result)) -def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) -> None: +def _write_outputs( + output_dir: Path, + stem: str, + markdown: str, + page_count: int, + extra_metrics: dict | None = None, +) -> None: md_dir = output_dir / "markdown" metrics_dir = output_dir / "json" / "metrics" progress_dir = output_dir / "sidecars" / 
"ocr_progress" @@ -112,6 +283,8 @@ def _write_outputs(output_dir: Path, stem: str, markdown: str, page_count: int) "page_count": page_count, "model": "deepseek-ai/DeepSeek-OCR-2", } + if extra_metrics: + metrics.update(extra_metrics) (metrics_dir / f"{stem}.metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8") partial_path = progress_dir / f"{stem}.partial.md" if partial_path.exists(): @@ -155,11 +328,28 @@ def main() -> int: if not pdfs: return 0 - tokenizer, model = _load_model(model_dir, args.device) + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, + ) for pdf_path in pdfs: - images = _render_pages(pdf_path, args.max_pages) + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + render_sec = time.perf_counter() - render_start page_outputs: List[str] = [] + page_metrics: List[dict] = [] total_pages = len(images) _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: @@ -167,10 +357,35 @@ def main() -> int: for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" image.save(page_png, format="PNG") - page_text = _infer_page(model, tokenizer, page_png, tmp_dir / f"page_{idx + 1:04d}") + infer_start = time.perf_counter() + raw_page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / 
f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) + infer_sec = time.perf_counter() - infer_start + page_text, postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) if args.content_debug: page_text = f"\n{page_text}".strip() page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) _write_progress( output_dir, pdf_path.stem, @@ -179,7 +394,27 @@ def main() -> int: idx + 1, ) markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" - _write_outputs(output_dir, pdf_path.stem, markdown, len(images)) + _write_outputs( + output_dir, + pdf_path.stem, + markdown, + len(images), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) return 0 diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 3005786..61ba307 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -2,6 +2,7 @@ from __future__ import annotations +from contextlib import ExitStack import json import logging import os @@ -30,7 +31,7 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, @@ -41,7 +42,16 @@ def 
_run_cli( max_pages: Optional[int], content_debug: bool, device: Optional[str], -) -> None: + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -61,8 +71,39 @@ def _run_cli( cmd.append("--content-debug") if device: cmd += ["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + return cmd + +def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): env["PATH"] = 
f"{candidate.parent}:{env.get('PATH', '')}" @@ -70,11 +111,264 @@ def _run_cli( ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device) LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None + + +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + 
return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in 
range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + "weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, + ocr_profile: str, + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], +) -> None: + lanes = _plan_lanes( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + for lane in lanes: + lane_files = list(lane["files"]) + if not lane_files: + continue + visible_device = int(lane["visible_device"]) + log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=lane_files, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + 
ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s files=%d weight=%d: %s", + lane["lane_id"], + visible_device, + len(lane_files), + lane["weight"], + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) + + for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + + def run_for_files( self_ref: Any, files: Iterable[str], @@ -91,6 +385,18 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + ocr_profile: str = "markdown_grounded", + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, # reserved disable_fp8_kv: bool = False, # reserved **_: Any, @@ -98,7 +404,7 @@ def run_for_files( """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) - del log_dir, allow_stub, allow_cli, persist_engine, precision + del allow_stub, allow_cli, persist_engine, precision del 
gpu_memory_utilization, disable_fp8_kv if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": @@ -146,17 +452,57 @@ def run_for_files( if not python_exe.exists(): raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - _run_cli( - input_dir=pdf_root, - output_dir=out_root, - files=file_list, - model_dir=model_root, - python_bin=python_exe, - script=script_path, - max_pages=max_pages, - content_debug=content_debug, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), device=device, ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=pdf_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) + else: + _run_cli( + input_dir=pdf_root, + output_dir=out_root, + files=file_list, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + ) results: Dict[str, Any] = {} for name in file_list: diff --git 
a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index a5a93e4..81ec66f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path import pandas as pd @@ -60,3 +61,85 @@ def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): assert canonical_markdown.exists() assert canonical_markdown.read_text(encoding="utf-8") == "final\n" assert not progress_markdown.exists() + + +def test_auto_attn_backend_prefers_eager_when_flash_attn_is_unavailable(monkeypatch): + import builtins + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _resolve_attn_backend + + original_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "flash_attn": + raise ImportError("flash_attn unavailable") + return original_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert _resolve_attn_backend("auto") == "eager" + + +def test_runner_uses_downloads_subdir_when_present(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["input_dir"] = input_dir + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + 
str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_transformers.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"]) + + assert calls["input_dir"] == downloads_dir.resolve() + assert result["doc"]["page_count"] == 1 + + +def test_build_cli_command_includes_speed_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + attn_backend="flash_attention_2", + base_size=768, + image_size=512, + crop_mode=True, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=1.05, + no_repeat_ngram_size=12, + ) + + assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--attn-backend" in cmd and "flash_attention_2" in cmd + assert "--base-size" in cmd and "768" in cmd + assert "--image-size" in cmd and "512" in cmd + assert "--crop-mode" in cmd + assert "--render-dpi" in cmd and "144" in cmd + assert "--max-new-tokens" in cmd and "1024" in cmd From 0ebabe7a774b39862e96e3d00f8ab2348f963139 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 04:05:01 +0300 Subject: [PATCH 20/93] Update DeepSeek runtime to Torch 2.9.1 cu130 --- dependency_setup/deepseek_uv/pyproject.toml | 16 +- dependency_setup/deepseek_uv/uv.lock | 280 +++++++++++++------- 2 files changed, 199 insertions(+), 97 deletions(-) diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml index a1caa65..0bfebb2 100644 --- a/dependency_setup/deepseek_uv/pyproject.toml +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -5,9 +5,9 @@ description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" requires-python = ">=3.11,<3.13" dependencies = [ 
"glossapi[deepseek]", - "torch==2.6.0", - "torchvision==0.21.0", - "torchaudio==2.6.0", + "torch==2.9.1", + "torchvision==0.24.1", + "torchaudio==2.9.1", ] [dependency-groups] @@ -18,11 +18,11 @@ test = [ [tool.uv.sources] glossapi = { path = "../..", editable = true } -torch = { index = "pytorch-cu118" } -torchvision = { index = "pytorch-cu118" } -torchaudio = { index = "pytorch-cu118" } +torch = { index = "pytorch-cu130" } +torchvision = { index = "pytorch-cu130" } +torchaudio = { index = "pytorch-cu130" } [[tool.uv.index]] -name = "pytorch-cu118" -url = "https://download.pytorch.org/whl/cu118" +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock index 4f99980..a136794 100644 --- a/dependency_setup/deepseek_uv/uv.lock +++ b/dependency_setup/deepseek_uv/uv.lock @@ -451,8 +451,10 @@ source = { virtual = "." } dependencies = [ { name = "glossapi", extra = ["deepseek"] }, { name = "torch" }, - { name = "torchaudio" }, - { name = "torchvision" }, + { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", version = "2.9.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] [package.dev-dependencies] @@ -464,9 +466,9 @@ test = [ [package.metadata] requires-dist = [ { name = "glossapi", extras = ["deepseek"], editable = "../../" }, - { name = 
"torch", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, - { name = "torchaudio", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu118" }, - { name = "torchvision", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu118" }, + { name = "torch", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchaudio", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchvision", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu130" }, ] [package.metadata.requires-dev] @@ -743,106 +745,152 @@ wheels = [ ] [[package]] -name = "nvidia-cublas-cu11" -version = "11.11.3.6" +name = "nvidia-cublas" +version = "13.0.0.19" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/46/be/c222e33e60d28ecd496a46fc4d78ccae0ee28e1fd7dc705b6288b4cad27e/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux1_x86_64.whl", hash = "sha256:39fb40e8f486dd8a2ddb8fdeefe1d5b28f5b99df01c87ab3676f057a74a5a6f3", size = 417870452, upload-time = "2022-10-18T21:17:48.638Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2e/9d99c60771d275ecf6c914a612e9a577f740a615bc826bec132368e1d3ae/nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl", hash = "sha256:60252822adea5d0b10cd990a7dc7bedf7435f30ae40083c7a624a85a43225abc", size = 417870460, upload-time = "2024-08-17T00:00:26.889Z" }, + { url = "https://files.pythonhosted.org/packages/02/99/8447b9ee9f070522ee66604ee819d632ab4568c68b3134cebd3837a015cd/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:381b1a0ca636fdcb6920a871e8fc89dbfd1f6157f421ed0a6f2673e14cffd3bd", size = 539001158, upload-time = "2025-08-04T10:19:50.761Z" }, + { url = "https://files.pythonhosted.org/packages/5a/99/210e113dde53955e97042bd76dc4ad927eca04c5b4645ec157cc59f4f3ae/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:f6723af2e8e2600a11dc384037d90d9bf93070e346c24ef2e8f9001658c99896", size = 419392356, upload-time = "2025-08-04T10:20:19.449Z" }, ] [[package]] -name = "nvidia-cuda-cupti-cu11" -version = "11.8.87" +name = "nvidia-cuda-cupti" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/27/c9/b4b15f709a694ea9f84871c6c4fbeeb54bab225962d852665a2c6f77f90d/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux1_x86_64.whl", hash = "sha256:0e50c707df56c75a2c0703dc6b886f3c97a22f37d6f63839f75b7418ba672a8d", size = 13093657, upload-time = "2022-10-03T21:46:12.544Z" }, - { url = "https://files.pythonhosted.org/packages/74/42/9f5c5cc084ce6f3073048c4f6806f45ba4c8c73f227c9587215d9c372e05/nvidia_cuda_cupti_cu11-11.8.87-py3-none-manylinux2014_x86_64.whl", hash = "sha256:4191a17913a706b5098681280cd089cd7d8d3df209a6f5cb79384974a96d24f2", size = 13093662, upload-time = "2024-08-16T23:56:38.082Z" }, + { url = "https://files.pythonhosted.org/packages/72/63/e9c12c3ae07c1f3a0821536bc188d7bf76e1b633b3bcd2bd393b00bb3426/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:67c22627ef436afcf080b48e4ad17b3f83d9e7c0d990ad0c6c0627b01fb92ccc", size = 10171189, upload-time = "2025-08-04T10:16:24.39Z" }, + { url = "https://files.pythonhosted.org/packages/ba/28/e37d62ff27b4462953fdd5713d8a78760578dfa12685c30b71b55fab57b1/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:417699e216b23d81bc0bbcb7032352f81b9c5372ef73c097a01abb83125a3d09", size = 10718148, upload-time = "2025-08-04T10:16:33.605Z" }, ] [[package]] -name = "nvidia-cuda-nvrtc-cu11" -version = "11.8.89" +name = "nvidia-cuda-nvrtc" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/83/08/a9833e4e9f9165bedb7f36033b47aa399b053b9cb2eaf7b84d1e28705cf7/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = 
"sha256:1f27d67b0f72902e9065ae568b4f6268dfe49ba3ed269c9a3da99bb86d1d2008", size = 23173264, upload-time = "2022-10-03T21:47:00.705Z" }, - { url = "https://files.pythonhosted.org/packages/60/44/202e027c224c26e15a53f01c5c7604c7f6b4fd368882d3164ea08fead207/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a8d02f3cba345be56b1ffc3e74d8f61f02bb758dd31b0f20e12277a5a244f756", size = 23173745, upload-time = "2024-08-16T23:58:16.539Z" }, + { url = "https://files.pythonhosted.org/packages/be/5b/f7636b3d66caefade6a0a0dc5b705c259a2062c20ad18b432b3129d348e0/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:87e13d186905a35e7c04ad553a2abded0fba22f93b43d02e5da6f6cf73fb4d0a", size = 90214268, upload-time = "2025-08-04T10:18:09.305Z" }, + { url = "https://files.pythonhosted.org/packages/c0/bd/eb18593b43dae42312612ffbac24b8e68149e590102c3b6cc2e3d3792069/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6ccf1ef1b90a0763ac7536f3c17046659d89869d76b98ac358efc2e09b348365", size = 43013627, upload-time = "2025-08-04T10:17:57.338Z" }, ] [[package]] -name = "nvidia-cuda-runtime-cu11" -version = "11.8.89" +name = "nvidia-cuda-runtime" +version = "13.0.48" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/3e/84db02be49fe6d6df6e42f69fd64501c22d0f9ada9c9877f885612085d20/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl", hash = "sha256:f587bd726eb2f7612cf77ce38a2c1e65cf23251ff49437f6161ce0d647f64f7c", size = 875585, upload-time = "2022-10-03T21:46:03.05Z" }, - { url = "https://files.pythonhosted.org/packages/a6/ec/a540f28b31de7bc1ed49eecc72035d4cb77db88ead1d42f7bfa5ae407ac6/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl", hash = "sha256:92d04069a987e1fbc9213f8376d265df0f7bb42617d44f5eda1f496acea7f2d1", size = 875592, upload-time = "2024-08-16T23:56:18.774Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/3b/c5e5d8aafd355e2ff9922472ba71251331af6cc866e5b04a3b1dc8f58977/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b807c0bb925a307bfa667a24f24d253aef8eda3ac4be66b333f2c9d357557008", size = 2260687, upload-time = "2025-08-04T10:15:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/cc/78/edb119083ca2ff0f09ab0cd597e97775ac3f575b8aa0caf10d68ed49e032/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b54d12087a1abff81a4cbfa6556876e3afea1fc60da2e0816da374619810c89", size = 2242632, upload-time = "2025-08-04T10:15:49.339Z" }, ] [[package]] -name = "nvidia-cudnn-cu11" -version = "9.1.0.70" +name = "nvidia-cudnn-cu13" +version = "9.13.0.50" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/00/3b/0b776f04e364cd99e4cf152c2a9eadb5934c67c9a91429da55169a9447fd/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e6135ac63fe9d5b0b89cfb35c3fc1c1349f2b995becadf2e9dc21bca89d9633d", size = 663919573, upload-time = "2024-04-22T15:20:24.839Z" }, + { url = "https://files.pythonhosted.org/packages/8a/9c/9e99c00dc23db324244ec257d1e84d79539202ee2f185dee2c1fa97c9549/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:33f0aa0b64230101b348648fd0693342188071d3f8a137c0cf50051c24b3584b", size = 412337597, upload-time = "2025-09-04T20:22:31.535Z" }, + { url = "https://files.pythonhosted.org/packages/cf/68/2712854561170b2a81bea7b6b35cc1ae264d9794c0c218986e5c685d45f7/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:2150b4850725d30653ec3e365f0732e3e2e3eb8633cf3bd2d3117628dea8b4f9", size = 348571624, upload-time = "2025-09-04T20:23:26.544Z" }, ] [[package]] -name = "nvidia-cufft-cu11" -version = "10.9.0.58" +name = "nvidia-cufft" +version = "12.0.0.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/e9/4e49b1baf6899e42eeec324a49d7aa2219fec42076327c4e468000dd375a/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1885731254835797572ff075f3daf43a2a0a2801210dea26971940dae7e1a367", size = 214053580, upload-time = "2025-08-04T10:20:45.781Z" }, + { url = "https://files.pythonhosted.org/packages/9b/9f/e298b66e584ad25bd78ad4a45b061fe7bb57a1ec011128089404ce3fcc7d/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9f160b1f018e80bcb0d7c0fa50564b042fa26b13edc1b1ff14b6375a9edd2812", size = 214085489, upload-time = "2025-08-04T10:21:02.975Z" }, +] + +[[package]] +name = "nvidia-cufile" +version = "1.15.0.42" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/0a/4adf0c9bb1241cd1314fc923fde00f3749c7fc785b1e3b3f4a104cd3090c/nvidia_cufile-1.15.0.42-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8f9813eff24d61586699c615e39817e2b4e4f642cace32733c2ab6f663a7eab", size = 1223104, upload-time = "2025-08-04T10:21:31.131Z" }, + { url = "https://files.pythonhosted.org/packages/bf/a5/636baa43399ea10d22b63e7454f22a92ace4a7eaa3c45b94607250857e2d/nvidia_cufile-1.15.0.42-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bced4036b5a8dbf57e4d78cd4fafefec58ad754b784a9eaa272b011896754c62", size = 1136527, upload-time = "2025-08-04T10:21:22.441Z" }, +] + +[[package]] +name = "nvidia-curand" +version = "10.4.0.35" source = { registry = "https://pypi.org/simple" } 
wheels = [ - { url = "https://files.pythonhosted.org/packages/74/79/b912a77e38e41f15a0581a59f5c3548d1ddfdda3225936fb67c342719e7a/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl", hash = "sha256:222f9da70c80384632fd6035e4c3f16762d64ea7a843829cb278f98b3cb7dd81", size = 168405414, upload-time = "2022-10-03T23:29:47.505Z" }, - { url = "https://files.pythonhosted.org/packages/64/c8/133717b43182ba063803e983e7680a94826a9f4ff5734af0ca315803f1b3/nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e21037259995243cc370dd63c430d77ae9280bedb68d5b5a18226bfc92e5d748", size = 168405419, upload-time = "2024-08-17T00:02:03.562Z" }, + { url = "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, + { url = "https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, ] [[package]] -name = "nvidia-curand-cu11" -version = "10.3.0.86" +name = "nvidia-cusolver" +version = "12.0.3.29" source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-cusparse", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] wheels = [ - { url = "https://files.pythonhosted.org/packages/49/28/c47f8e2439ddbcbeae3cf74d43ed572b651d630ea72863d5357f3759eb66/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:ac439548c88580269a1eb6aeb602a5aed32f0dbb20809a31d9ed7d01d77f6bf5", size = 58124493, upload-time 
= "2022-10-03T23:30:05.413Z" }, - { url = "https://files.pythonhosted.org/packages/58/e5/ce5806afc48a6e4e0dddd25316ac60b6fa94fd1791bdbf4ca17bf52696ea/nvidia_curand_cu11-10.3.0.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:cd4cffbf78bb06580206b4814d5dc696d1161c902aae37b2bba00056832379e6", size = 58124497, upload-time = "2024-08-17T00:03:01.833Z" }, + { url = "https://files.pythonhosted.org/packages/a7/bb/2e60de9bb1f0c3395eabd91ccad00f4ba3ef736dc9190a158a9d268419f5/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:3bb6e65ce0beaeafdd069b320246e8f17c1cd30ddb27a0539143a3706733a4d8", size = 193104180, upload-time = "2025-08-04T10:22:19.821Z" }, + { url = "https://files.pythonhosted.org/packages/a5/87/e3c9ee227b750e5b61572e7509f586cc8d494a4f7874b5163e734ed852c2/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:6f54c2eed5edab54c224dd1852dde80ba76b2b78e6d3ce7344fef5dfc66d16ab", size = 193474165, upload-time = "2025-08-04T10:22:47.976Z" }, ] [[package]] -name = "nvidia-cusolver-cu11" -version = "11.4.1.48" +name = "nvidia-cusparse" +version = "12.6.2.49" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu11", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/55/ee/939ff0104991dd7bdabb4c9767994c612ba0e1c9a55672a1ddd42f5e5b16/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl", hash = "sha256:ca538f545645b7e6629140786d3127fe067b3d5a085bd794cde5bfe877c8926f", size = 128240842, upload-time = "2022-10-03T23:30:24.348Z" }, - { url = "https://files.pythonhosted.org/packages/52/fe/866e87e6e6a1b0a5fcf8524a058042656702f2057e22bfdb8899a7c38e10/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:ea9fb1ad8c644ca9ed55af13cc39af3b7ba4c3eb5aef18471fe1fe77d94383cb", size = 128246438, upload-time = "2024-08-17T00:03:52.432Z" }, + { url = "https://files.pythonhosted.org/packages/fc/30/f32023427f2ef4ec27e8293dfddb5068de566912cd0a45eccfd400017a62/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d3269c19283a0057fb5ebfb003ae2a10c97a28a6958f4238354826b055827c7", size = 155888587, upload-time = "2025-08-04T10:23:04.091Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e8/b3f7a87cc719dca926c7baee92f2544de8909573a4126c85a9f1625431e8/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efcf0b01e3a0827c144feff5391456b8a06e9ce63dcd51c0943e32e605251952", size = 140247612, upload-time = "2025-08-04T10:23:29.844Z" }, ] [[package]] -name = "nvidia-cusparse-cu11" -version = "11.7.5.86" +name = "nvidia-cusparselt-cu13" +version = "0.8.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/e0/21b829c535d569831835a4ca5d049a19ba00d3e91f3e12ab4ad27bd7385f/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:4ae709fe78d3f23f60acaba8c54b8ad556cf16ca486e0cc1aa92dca7555d2d2b", size = 204126221, upload-time = "2022-10-18T21:19:28.04Z" }, - { url = "https://files.pythonhosted.org/packages/ed/5c/b0333b07c51ced77397c2fb0d9826072cea0da9d421aa7e792aa0f8ecc72/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8d7cf1628fd8d462b5d2ba6678fae34733a48ecb80495b9c68672ec6a6dde5ef", size = 204126227, upload-time = "2024-08-17T00:05:20.798Z" }, + { url = "https://files.pythonhosted.org/packages/46/10/8dcd1175260706a2fc92a16a52e306b71d4c1ea0b0cc4a9484183399818a/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:400c6ed1cf6780fc6efedd64ec9f1345871767e6a1a0a552a1ea0578117ea77c", size = 220791277, upload-time = "2025-08-13T19:22:40.982Z" }, + { 
url = "https://files.pythonhosted.org/packages/fd/53/43b0d71f4e702fa9733f8b4571fdca50a8813f1e450b656c239beff12315/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:25e30a8a7323935d4ad0340b95a0b69926eee755767e8e0b1cf8dd85b197d3fd", size = 169884119, upload-time = "2025-08-13T19:23:41.967Z" }, ] [[package]] -name = "nvidia-nccl-cu11" -version = "2.21.5" +name = "nvidia-nccl-cu13" +version = "2.27.7" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/9a/8b6a28b3b87d5fddab0e92cd835339eb8fbddaa71ae67518c8c1b3d05bae/nvidia_nccl_cu11-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:49d8350629c7888701d1fd200934942671cb5c728f49acc5a0b3a768820bed29", size = 147811630, upload-time = "2024-04-03T15:33:12.879Z" }, + { url = "https://files.pythonhosted.org/packages/49/61/2c7762da6febee96341ea17d1f7309ac7559ac3cab00f3f7e1e7bd0e5d00/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5e3cc863e52bf9dd1e3ab1941bddb414098f489ae7342f6b3a274602303da123", size = 194014855, upload-time = "2025-09-23T16:30:27.56Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/dabb10684e60edfaf1a1c9984d12a668bc1091582099d4e03ac5b9983b51/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b28a524abd8389b76a4a3f133c76a7aaa7005e47fcaa9d9603b90103927a3f93", size = 193901479, upload-time = "2025-09-23T16:30:41.165Z" }, ] [[package]] -name = "nvidia-nvtx-cu11" -version = "11.8.86" +name = "nvidia-nvjitlink" +version = "13.0.39" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/a2/23214c23118784dc2189ac2d2e48190df3e4206e2f73eb17d47140797a2b/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux1_x86_64.whl", hash = "sha256:890656d8bd9b4e280231c832e1f0d03459200ba4824ddda3dcb59b1e1989b9f5", size = 99125, upload-time = "2022-10-03T21:47:19.565Z" }, - { url = 
"https://files.pythonhosted.org/packages/b5/ad/973a187b137a3d45dc3faac421ef1275fb41fc169fd3889e2d5ceb0daa54/nvidia_nvtx_cu11-11.8.86-py3-none-manylinux2014_x86_64.whl", hash = "sha256:979f5b2aef5da164c5c53c64c85c3dfa61b8b4704f4f963bb568bf98fa8472e8", size = 99130, upload-time = "2024-08-16T23:58:33.479Z" }, + { url = "https://files.pythonhosted.org/packages/95/39/726edebeb76f3efc25c79f885429fa1227c9d200e20ea219bf724b382e19/nvidia_nvjitlink-13.0.39-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:bc3179be558329ef9687884c6faa27cdc0659bdbc642432ec8cc6cc00d182627", size = 40709605, upload-time = "2025-08-04T10:25:04.129Z" }, + { url = "https://files.pythonhosted.org/packages/bc/7a/0fb4c4413b3b14519f8934edd4dcd9f411c4e14e2a2c0ae58709e4dda255/nvidia_nvjitlink-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce0d63fa5ebedf542056e7491c49feed2297c900980aa6269b6a55f478056ad7", size = 38767126, upload-time = "2025-08-04T10:24:53.05Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu13" +version = "3.3.24" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/7e/b8797780e442eabd9046cd6eb54100b8d0cb047ebc2f70931710cb03bcfe/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:28ae82a4d14b322b93409535de62df6b7b83f4f7672ca97fc89107c2d40ce2c2", size = 60168129, upload-time = "2025-08-22T19:56:28.818Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e9/8530afb8ed38d16bbc89cec80a4dd6a52dbf59bc93e546c3658cfa8b1f9b/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c14d09571697d2e57cb079c8daec88ab1c68cb3586532bfbd4886125a08339b7", size = 60390470, upload-time = "2025-08-22T19:56:49.848Z" }, +] + +[[package]] +name = "nvidia-nvtx" +version = "13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/38/37/0d103c84e7884382a79a569b720965141f83dd1c5df9e3e00cbc02d7099c/nvidia_nvtx-13.0.39-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc113127785c96db8a0fe715df92db9788777b4b3d1bd713d42f75969201b5ce", size = 147197, upload-time = "2025-08-04T10:18:39.829Z" }, + { url = "https://files.pythonhosted.org/packages/86/91/8b486ba85f71a2859dd705a4ec6aab38c37a389b8b7f94343db027732999/nvidia_nvtx-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cddd2e08b35144f1000631c3880c9ebbcb8a2863d762e76f92d47d30ecaf87cc", size = 148037, upload-time = "2025-08-04T10:18:31.763Z" }, ] [[package]] @@ -1324,14 +1372,14 @@ wheels = [ [[package]] name = "sympy" -version = "1.13.1" +version = "1.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mpmath" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, 
upload-time = "2025-04-27T18:04:59.103Z" }, ] [[package]] @@ -1398,64 +1446,116 @@ wheels = [ [[package]] name = "torch" -version = "2.6.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu11", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft", marker = "sys_platform == 'linux'" }, + { name = 
"nvidia-cufile", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx", marker = "sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:3e73419aab6dbcd888a3cc6a00d1f52f5950d918d7289ea6aeae751346613edc" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:6ab0417ce9b78ab0a34721a99734b5fd4cc3d7b62ff1c068a7d636fd829772db" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:9f7d170d6c78726945d95fcc3a3d7601f36aed0e6e0dc9ca377a64d6a8fd7b3a" }, - { url = "https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:6c040e4181c5dae73b965b61394ec431c93b2018165e2be8f15fc68d44444cb3" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fd6c7d297e21758a7fa07624f2b5bb15607ee3b1dcc52519e8e796c6d4fcf960" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:f40778951ca1533dc634b3842392641fa0b641181ff2f71d62728ef33cc36a5c" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:8db2814e63f2b365bda88526587ca75a6083a0b957a24b2b0d45ddc5ee350176" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e7f84cb10c7e7d9f862c318f056d64840544ab4f0bcbf8cf7ed6047fe04051f" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e70e1b18881e6b3c1ce402d0a989da39f956a3a057526e03c354df23d704ce9b" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:cd3232a562ad2a2699d48130255e1b24c07dfe694a40dcd24fad683c752de121" }, ] [[package]] name = "torchaudio" -version = "2.6.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "2.9.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "torch" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:493421d061375074ce84840ca619605f625892e16dead63ec97181ef02da3357" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b3c75f87e325946276c952864dbce2c8fabc88a00d86730c3d5bc0999ebf7789" }, +] + +[[package]] +name = "torchaudio" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform 
== 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:089b54fb6d4f8348a07d4c460cf2da4da2de57f068154c1401b385626917d434" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:065ea2e015ef6d02ec289e0a5ecc4c8e7acd4b30a8612879637395e7e16217e4" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:e77fe770130b54fdbcecda829024fbd4235075e905f5c6019c19664577c70e1d" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:885bdd94f19f0dbad81e08c54f85ffbf10f00af8452c25d2b3b533cf2884d6b8" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1023bb6598fa6312e1990fdc78660f4b4ef128d8942a1f10c5827aea23d6bd7e" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:817e2660d35a3c9a2638dd80d63c7a488cbbe87446ddbb564a5cf88b9de632f7" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6c58d5e846da5a90d50bd425e2c24368747cd04297d95c6dd51d3f7f85fea26" }, + { url = 
"https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7533a17bed21e5b86b8c49fd79656779779f2c991aef2804af6f318d2022ea6a" }, ] [[package]] name = "torchvision" -version = "0.21.0+cu118" -source = { registry = "https://download.pytorch.org/whl/cu118" } +version = "0.24.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] dependencies = [ - { name = "numpy" }, - { name = "pillow" }, - { name = "torch" }, + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d4ba2532440a93c23a99c41423a765a0cdd47556afa3acf7c318dd1d3d6793e9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:89743dcee13e943f58b37c7647aff14b5bb24c11c84826376d457acf97586fec" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and 
sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl", hash = "sha256:5ebe0267c872ac55b387008f772052bbf1f2fdfdd8afb011d4751e124759295e" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-win_amd64.whl", hash = "sha256:4e1325aa1189f97c89ae008cf645b7de8f283853193bf68ea7750856c194b6cc" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-linux_x86_64.whl", hash = "sha256:5d3679e0df9ab1725eaa7300d550cf8fe0a477119483bef12673957f30c768dc" }, - { url = "https://download-r2.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp312-cp312-win_amd64.whl", hash = "sha256:301eefd1d4df6619fab94cae539cb0cdcb029cc992e4686ef97c8366f77cf6a4" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b0cc84c57c1fd54644698a70a74d1ea1eddfa44ee2df3354b7bb2c619a5d2923" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:f564b9fdbc336ac187780931331fb4253f8511deae914dde12dca5bf17b3045f" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6939dd403cc28ab0a46f53e6c86e2e852cf65771c1b0ddd09c44c541a1cdbad9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:d31ceaded0d9b737471fa680ccd9e1acb6d5f0f70f03ef3a8d786a99c79da7cf" }, ] [[package]] @@ -1493,11 +1593,13 @@ wheels = [ [[package]] name = 
"triton" -version = "3.2.0" +version = "3.5.1" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636, upload-time = "2025-01-22T19:12:51.322Z" }, - { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, + { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" }, + { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" }, + { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" }, + { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" }, ] [[package]] From 502f8bc065ce660e4b222d606b95cc8642d63f3d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 12:24:46 +0300 Subject: [PATCH 21/93] Add vLLM DeepSeek OCR runtime --- src/glossapi/corpus/phase_ocr_math.py | 11 + src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 239 ++++++++++++++++++ src/glossapi/ocr/deepseek/runner.py | 54 +++- tests/test_deepseek_runner_contract.py | 66 +++++ 4 files changed, 365 insertions(+), 5 deletions(-) create mode 100644 src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 1e75a1b..1897aa2 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -42,6 +42,7 @@ def ocr( dpi: Optional[int] = None, # reserved for future use precision: Optional[str] = None, # reserved for future use ("fp16","bf16") workers_per_gpu: int = 1, + runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", attn_backend: str = "auto", base_size: Optional[int] = None, @@ -51,6 +52,9 @@ def ocr( max_new_tokens: Optional[int] = None, repetition_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = None, + disable_fp8_kv: bool = False, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -88,6 +92,7 @@ def ocr( ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. + - runtime_backend: ``transformers`` (default) or ``vllm``. - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek rendering and attention controls used for throughput/quality benchmarking. 
@@ -95,6 +100,8 @@ def ocr( Optional generation controls forwarded to DeepSeek. These are exposed for runtime experiments; leave them unset unless a benchmark calls for them explicitly. + - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv: + Optional vLLM controls. These are ignored by the transformers runtime. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False @@ -609,6 +616,7 @@ def _run_math(stems: List[str]) -> None: use_gpus=use_gpus, devices=devices, workers_per_gpu=int(max(1, workers_per_gpu)), + runtime_backend=runtime_backend, ocr_profile=ocr_profile, attn_backend=attn_backend, base_size=base_size, @@ -618,6 +626,9 @@ def _run_math(stems: List[str]) -> None: max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py new file mode 100644 index 0000000..2c547b3 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -0,0 +1,239 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files using vLLM.""" + +from __future__ import annotations + +import argparse +import logging +import tempfile +import time +from pathlib import Path +from typing import Dict, List + +from PIL import Image + +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( + PAGE_SPLIT, + _iter_pdfs, + _postprocess_page_text, + _profile_defaults, + _render_pages, + _write_outputs, + _write_progress, +) + +LOGGER = logging.getLogger(__name__) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) 
+ parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--max-pages", type=int, default=None) + parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--attn-backend", default="vllm") + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) + parser.add_argument("--disable-fp8-kv", action="store_true") + parser.add_argument("--content-debug", action="store_true") + return parser.parse_args() + + +def _load_vllm(model_dir: Path, gpu_memory_utilization: float, disable_fp8_kv: bool): + from vllm import LLM + + logits_processors = None + try: + from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor + + logits_processors = [NGramPerReqLogitsProcessor] + except Exception as exc: # pragma: no cover - environment dependent + LOGGER.warning("DeepSeek OCR logits processor unavailable in vLLM; continuing without it: %s", exc) + + engine_kwargs = { + "model": str(model_dir), + "tokenizer": str(model_dir), + "trust_remote_code": True, + "dtype": "bfloat16", + "enable_prefix_caching": False, + 
"mm_processor_cache_gb": 0, + "gpu_memory_utilization": float(gpu_memory_utilization), + "tensor_parallel_size": 1, + } + if disable_fp8_kv: + engine_kwargs["kv_cache_dtype"] = "auto" + if logits_processors: + engine_kwargs["logits_processors"] = logits_processors + return LLM(**engine_kwargs) + + +def _sampling_params(max_new_tokens: int | None): + from vllm import SamplingParams + + return SamplingParams( + temperature=0.0, + max_tokens=int(max_new_tokens or 8192), + skip_special_tokens=False, + extra_args={ + "ngram_size": 30, + "window_size": 90, + "whitelist_token_ids": {128821, 128822}, + }, + ) + + +def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: + size = max(1, int(batch_size)) + return [items[idx : idx + size] for idx in range(0, len(items), size)] + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + pdfs = _iter_pdfs(input_dir, args.files) + if not pdfs: + return 0 + + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = profile_defaults["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + llm = _load_vllm( + model_dir, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + ) + sampling_params = _sampling_params(args.max_new_tokens) + + with tempfile.TemporaryDirectory(prefix="deepseek_vllm_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + doc_states: Dict[str, dict] = {} + jobs: List[dict] = [] + + for pdf_path in pdfs: + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + 
render_sec = time.perf_counter() - render_start + total_pages = len(images) + state = { + "stem": pdf_path.stem, + "page_outputs": [""] * total_pages, + "page_metrics": [], + "render_sec": float(render_sec), + "doc_start": float(doc_start), + "completed_pages": 0, + "total_pages": total_pages, + } + doc_states[pdf_path.stem] = state + _write_progress(output_dir, pdf_path.stem, [], total_pages, 0) + for idx, image in enumerate(images): + page_path = tmp_dir / f"{pdf_path.stem}_page_{idx + 1:04d}.png" + image.save(page_path, format="PNG") + image.close() + jobs.append( + { + "stem": pdf_path.stem, + "page_number": int(idx + 1), + "image_path": page_path, + } + ) + + for batch in _batched(jobs, args.batch_size): + prompt_batch = [] + images: List[Image.Image] = [] + for item in batch: + image = Image.open(item["image_path"]).convert("RGB") + images.append(image) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + infer_start = time.perf_counter() + outputs = llm.generate(prompt_batch, sampling_params=sampling_params) + infer_sec = time.perf_counter() - infer_start + per_item_sec = infer_sec / max(1, len(batch)) + for image in images: + image.close() + + for item, output in zip(batch, outputs): + state = doc_states[item["stem"]] + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + state["page_outputs"][item["page_number"] - 1] = page_text + state["page_metrics"].append( + { + "page_number": int(item["page_number"]), + "infer_sec": float(per_item_sec), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) + state["completed_pages"] = int(state["completed_pages"]) + 1 + progress_pages = [page for page in 
state["page_outputs"] if page] + _write_progress( + output_dir, + item["stem"], + progress_pages, + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + for stem, state in doc_states.items(): + markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" + page_metrics = sorted(state["page_metrics"], key=lambda item: int(item["page_number"])) + _write_outputs( + output_dir, + stem, + markdown, + int(state["total_pages"]), + extra_metrics={ + "ocr_profile": args.ocr_profile, + "attn_backend": "vllm", + "runtime_backend": "vllm", + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "batch_size": int(args.batch_size), + "gpu_memory_utilization": float(args.gpu_memory_utilization), + "disable_fp8_kv": bool(args.disable_fp8_kv), + "render_sec": float(state["render_sec"]), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "page_metrics": page_metrics, + }, + ) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 61ba307..fe60390 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -20,6 +20,7 @@ LOGGER = logging.getLogger(__name__) REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" def _page_count(pdf_path: Path) -> int: @@ -51,6 +52,10 @@ def _build_cli_command( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: 
Optional[float], + disable_fp8_kv: bool, ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -91,6 +96,14 @@ def _build_cli_command( cmd += ["--repetition-penalty", str(float(repetition_penalty))] if no_repeat_ngram_size is not None: cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + if runtime_backend_norm == "vllm": + if vllm_batch_size is not None: + cmd += ["--batch-size", str(int(vllm_batch_size))] + if gpu_memory_utilization is not None: + cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] + if disable_fp8_kv: + cmd.append("--disable-fp8-kv") return cmd @@ -134,6 +147,10 @@ def _run_cli( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -155,6 +172,10 @@ def _run_cli( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -305,6 +326,10 @@ def _run_multi_cli( max_new_tokens: Optional[int], repetition_penalty: Optional[float], no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -346,6 +371,10 @@ def _run_multi_cli( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + 
vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -385,6 +414,7 @@ def run_for_files( persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved device: Optional[str] = None, + runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", attn_backend: str = "auto", base_size: Optional[int] = None, @@ -397,22 +427,27 @@ def run_for_files( use_gpus: Optional[str] = None, devices: Optional[List[int]] = None, workers_per_gpu: int = 1, - gpu_memory_utilization: Optional[float] = None, # reserved - disable_fp8_kv: bool = False, # reserved + gpu_memory_utilization: Optional[float] = None, + disable_fp8_kv: bool = False, + vllm_batch_size: Optional[int] = None, **_: Any, ) -> Dict[str, Any]: """Run DeepSeek OCR for the provided files.""" requested_stub = bool(allow_stub) del allow_stub, allow_cli, persist_engine, precision - del gpu_memory_utilization, disable_fp8_kv - if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": raise RuntimeError( "DeepSeek stub execution has been removed. " "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." ) + runtime_backend_norm = str( + runtime_backend or os.environ.get("GLOSSAPI_DEEPSEEK_RUNTIME_BACKEND", "transformers") + ).strip().lower() + if runtime_backend_norm not in {"transformers", "vllm"}: + raise ValueError("runtime_backend must be 'transformers' or 'vllm'") + file_list = [str(f) for f in files or []] if not file_list: return {} @@ -435,10 +470,11 @@ def run_for_files( "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." 
) + default_script = DEFAULT_VLLM_SCRIPT if runtime_backend_norm == "vllm" else DEFAULT_SCRIPT script_path = Path( vllm_script or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") - or DEFAULT_SCRIPT + or default_script ) if not script_path.exists(): raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") @@ -481,6 +517,10 @@ def run_for_files( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) else: _run_cli( @@ -502,6 +542,10 @@ def run_for_files( max_new_tokens=max_new_tokens, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 81ec66f..4629d2f 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -134,6 +134,10 @@ def test_build_cli_command_includes_speed_flags(tmp_path): max_new_tokens=1024, repetition_penalty=1.05, no_repeat_ngram_size=12, + runtime_backend="transformers", + vllm_batch_size=None, + gpu_memory_utilization=None, + disable_fp8_kv=False, ) assert "--ocr-profile" in cmd and "plain_ocr" in cmd @@ -143,3 +147,65 @@ def test_build_cli_command_includes_speed_flags(tmp_path): assert "--crop-mode" in cmd assert "--render-dpi" in cmd and "144" in cmd assert "--max-new-tokens" in cmd and "1024" in cmd + + +def test_build_cli_command_includes_vllm_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + model_dir=tmp_path / 
"model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=110, + max_new_tokens=768, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=16, + gpu_memory_utilization=0.92, + disable_fp8_kv=True, + ) + + assert "--batch-size" in cmd and "16" in cmd + assert "--gpu-memory-utilization" in cmd and "0.92" in cmd + assert "--disable-fp8-kv" in cmd + + +def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["script"] = kwargs["script"] + calls["runtime_backend"] = kwargs["runtime_backend"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["runtime_backend"] == "vllm" + assert Path(calls["script"]).name == "run_pdf_ocr_vllm.py" + assert result["doc"]["page_count"] == 1 From cbeb638913f43df13f4704e3646b6b6b75961101 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:11:17 +0300 Subject: [PATCH 22/93] Add DeepSeek markdown repair pipeline --- docs/ocr_and_math_enhancement.md | 50 +++ src/glossapi/corpus/phase_ocr_math.py | 13 
+- .../ocr/deepseek/run_pdf_ocr_transformers.py | 3 +- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 404 ++++++++++++++++-- src/glossapi/ocr/deepseek/runner.py | 20 + tests/test_deepseek_runner_contract.py | 21 + 6 files changed, 460 insertions(+), 51 deletions(-) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 1c2b630..ac2a5b7 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -82,6 +82,40 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma If you need Phase‑2 math on files that do not require OCR, run `math_only` after Docling extraction with JSON enabled. +### DeepSeek fast path + +The current recommended high-throughput DeepSeek configuration is: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `repair_mode='auto'` to keep markdown as the primary output while selectively rerunning suspicious pages +- large `vllm_batch_size` chosen to keep `sec/page/GPU` at or below the best validated floor for the target hardware + +Example: + +```python +c.ocr( + backend='deepseek', + fix_bad=True, + math_enhance=False, + runtime_backend='vllm', + ocr_profile='markdown_grounded', + vllm_batch_size=160, + gpu_memory_utilization=0.9, + repair_mode='auto', + use_gpus='multi', +) +``` + +`repair_mode='auto'` runs the pipeline in distinct phases inside the vLLM runner: + +1. markdown first pass over all rendered pages +2. cheap per-page triage using output quality plus simple image density statistics +3. plain-text rerun bucket for garbage markdown pages +4. tiled markdown rerun bucket for short coverage failures + +This keeps the fast path batched while avoiding per-page sequential fallback overhead. + ## Multi‑GPU Phase‑1 (extract): @@ -105,9 +139,25 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning +### Validated benchmark floor + +The current non-regression metric is `sec/page/GPU`. 
+ +Validated on 2026-03-30: + +- Host: AWS `g7e.48xlarge` +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Render DPI: `144` +- GPU memory utilization: `0.9` +- Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` + +That number is the floor to preserve or beat when tuning the full markdown pipeline. Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. + - Batch sizes - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. + - DeepSeek vLLM: push `vllm_batch_size` as high as the hardware allows while tracking `sec/page/GPU`; on the validated `g7e.48xlarge` path, larger batches continued improving throughput through `batch_size=160`. - Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 1897aa2..cd261ed 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -44,6 +44,7 @@ def ocr( workers_per_gpu: int = 1, runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, attn_backend: str = "auto", base_size: Optional[int] = None, image_size: Optional[int] = None, @@ -55,6 +56,7 @@ def ocr( vllm_batch_size: Optional[int] = None, gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, + repair_mode: str = "auto", # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -93,15 +95,18 @@ def ocr( Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. 
- runtime_backend: ``transformers`` (default) or ``vllm``. - - ocr_profile/attn_backend/base_size/image_size/crop_mode/render_dpi: + - ocr_profile/prompt_override/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek rendering and attention controls used for throughput/quality benchmarking. - max_new_tokens/repetition_penalty/no_repeat_ngram_size: Optional generation controls forwarded to DeepSeek. These are exposed for runtime experiments; leave them unset unless a benchmark calls for them explicitly. - - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv: - Optional vLLM controls. These are ignored by the transformers runtime. + - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode: + Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first + repair pipeline (plain fallback for garbage pages, tiled fallback for + short coverage failures). These are ignored by the transformers runtime + except for ``prompt_override``. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -618,6 +623,7 @@ def _run_math(stems: List[str]) -> None: workers_per_gpu=int(max(1, workers_per_gpu)), runtime_backend=runtime_backend, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -629,6 +635,7 @@ def _run_math(stems: List[str]) -> None: vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index e46fadf..071b3b5 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -60,6 +60,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--prompt-override", default=None) parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) @@ -329,7 +330,7 @@ def main() -> int: return 0 profile_defaults = _profile_defaults(args.ocr_profile) - prompt = profile_defaults["prompt"] + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py 
b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 2c547b3..56870e5 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -7,7 +7,7 @@ import tempfile import time from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Tuple from PIL import Image @@ -22,6 +22,18 @@ ) LOGGER = logging.getLogger(__name__) +REPAIR_TILE_SPECS: Tuple[Tuple[str, float, float], ...] = ( + ("top", 0.0, 0.5), + ("mid", 0.35, 0.8), + ("bottom", 0.65, 1.0), +) +REPAIR_DARK_THRESHOLD = 235 +REPAIR_SHORT_CHARS = 700 +REPAIR_EXTREME_SHORT_CHARS = 120 +REPAIR_PUA_THRESHOLD = 64 +REPAIR_MIN_HALF_DARK = 0.08 +REPAIR_MAX_OVERALL_DARK = 0.25 +REPAIR_MIN_OVERALL_DARK = 0.04 def _parse_args() -> argparse.Namespace: @@ -33,6 +45,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--prompt-override", default=None) parser.add_argument("--attn-backend", default="vllm") parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) @@ -46,6 +59,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--batch-size", type=int, default=8) parser.add_argument("--gpu-memory-utilization", type=float, default=0.9) parser.add_argument("--disable-fp8-kv", action="store_true") + parser.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) parser.add_argument("--content-debug", action="store_true") return parser.parse_args() @@ -98,6 +112,161 @@ def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: return [items[idx : idx + size] for idx in range(0, len(items), size)] +def _image_content_stats(image: Image.Image) -> dict: + sample = image.convert("L") + sample.thumbnail((256, 256)) + width, height 
= sample.size + pixels = list(sample.getdata()) + + def _dark_ratio(y0: int, y1: int) -> float: + values = [] + for row in range(y0, y1): + start = row * width + values.extend(pixels[start : start + width]) + total = len(values) + if total <= 0: + return 0.0 + dark = sum(1 for value in values if value < REPAIR_DARK_THRESHOLD) + return float(dark) / float(total) + + half = max(1, height // 2) + dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) + return { + "top_dark_ratio": _dark_ratio(0, half), + "bottom_dark_ratio": _dark_ratio(half, height), + "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), + } + + +def _count_private_use_chars(text: str) -> int: + return sum( + 1 + for ch in str(text or "") + if 0xE000 <= ord(ch) <= 0xF8FF + or 0xF0000 <= ord(ch) <= 0xFFFFD + or 0x100000 <= ord(ch) <= 0x10FFFD + ) + + +def _text_quality_metrics(text: str) -> dict: + stripped = str(text or "").strip() + letters = sum(1 for ch in stripped if ch.isalpha()) + digits = sum(1 for ch in stripped if ch.isdigit()) + pua_chars = _count_private_use_chars(stripped) + score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) + return { + "chars": int(len(stripped)), + "letters": int(letters), + "digits": int(digits), + "pua_chars": int(pua_chars), + "quality_score": float(score), + } + + +def _classify_repair(text: str, image_stats: dict, repair_mode: str) -> tuple[str, str | None]: + if str(repair_mode or "off").strip().lower() != "auto": + return "none", None + quality = _text_quality_metrics(text) + chars = int(quality["chars"]) + pua_chars = int(quality["pua_chars"]) + pua_ratio = float(pua_chars) / float(max(1, chars)) + if pua_chars >= REPAIR_PUA_THRESHOLD or pua_ratio >= 0.10: + return "plain", "markdown_garbage" + if chars <= REPAIR_EXTREME_SHORT_CHARS: + return "plain", "extreme_short" + top_dark = float(image_stats.get("top_dark_ratio", 0.0)) + bottom_dark = 
float(image_stats.get("bottom_dark_ratio", 0.0)) + overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) + if ( + chars <= REPAIR_SHORT_CHARS + and top_dark >= REPAIR_MIN_HALF_DARK + and bottom_dark >= REPAIR_MIN_HALF_DARK + and REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK + ): + return "tile", "short_coverage" + return "none", None + + +def _load_job_image(item: dict) -> Image.Image: + image = Image.open(item["image_path"]).convert("RGB") + crop_box = item.get("crop_box") + if not crop_box: + return image + width, height = image.size + x0_norm, y0_norm, x1_norm, y1_norm = crop_box + crop_pixels = ( + int(round(float(x0_norm) * width)), + int(round(float(y0_norm) * height)), + int(round(float(x1_norm) * width)), + int(round(float(y1_norm) * height)), + ) + cropped = image.crop(crop_pixels) + image.close() + return cropped + + +def _generate_batch_outputs( + llm, + *, + jobs: List[dict], + prompt: str, + batch_size: int, + sampling_params, +) -> List[dict]: + outputs_by_key: Dict[tuple[str, int, str], dict] = {} + for batch in _batched(jobs, batch_size): + prompt_batch = [] + opened_images: List[Image.Image] = [] + keys: List[tuple[str, int, str]] = [] + for item in batch: + image = _load_job_image(item) + opened_images.append(image) + keys.append((str(item["stem"]), int(item["page_number"]), str(item.get("variant", "page")))) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + infer_start = time.perf_counter() + batch_outputs = llm.generate(prompt_batch, sampling_params=sampling_params) + infer_sec = time.perf_counter() - infer_start + per_item_sec = infer_sec / max(1, len(batch)) + for image in opened_images: + image.close() + for item, key, output in zip(batch, keys, batch_outputs): + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + outputs_by_key[key] = { + "item": item, + "raw_text": raw_text, + "infer_sec": float(per_item_sec), + } + 
ordered = [] + for item in jobs: + ordered.append(outputs_by_key[(str(item["stem"]), int(item["page_number"]), str(item.get("variant", "page")))]) + return ordered + + +def _stitch_tiled_markdown(parts: List[str]) -> str: + stitched: List[str] = [] + previous_lines: List[str] = [] + for part in parts: + lines = [line.rstrip() for line in str(part or "").splitlines() if line.strip()] + if not lines: + continue + overlap = 0 + max_overlap = min(len(previous_lines), len(lines), 12) + for size in range(max_overlap, 0, -1): + if previous_lines[-size:] == lines[:size]: + overlap = size + break + stitched.extend(lines[overlap:]) + previous_lines = lines + return "\n".join(stitched).strip() + + def main() -> int: args = _parse_args() input_dir = Path(args.input_dir).resolve() @@ -108,7 +277,8 @@ def main() -> int: return 0 profile_defaults = _profile_defaults(args.ocr_profile) - prompt = profile_defaults["prompt"] + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] + plain_prompt = _profile_defaults("plain_ocr")["prompt"] base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) @@ -134,7 +304,7 @@ def main() -> int: state = { "stem": pdf_path.stem, "page_outputs": [""] * total_pages, - "page_metrics": [], + "page_metrics": [None] * total_pages, "render_sec": float(render_sec), "doc_start": float(doc_start), "completed_pages": 0, @@ -144,6 +314,7 @@ def main() -> int: _write_progress(output_dir, pdf_path.stem, [], total_pages, 0) for idx, image in enumerate(images): page_path = tmp_dir / f"{pdf_path.stem}_page_{idx + 1:04d}.png" + image_stats = _image_content_stats(image) image.save(page_path, format="PNG") image.close() jobs.append( @@ -151,63 +322,200 @@ def main() -> int: "stem": 
pdf_path.stem, "page_number": int(idx + 1), "image_path": page_path, + "image_stats": image_stats, + "variant": "page", } ) - for batch in _batched(jobs, args.batch_size): - prompt_batch = [] - images: List[Image.Image] = [] - for item in batch: - image = Image.open(item["image_path"]).convert("RGB") - images.append(image) - prompt_batch.append( - { - "prompt": prompt, - "multi_modal_data": {"image": image}, - } - ) - infer_start = time.perf_counter() - outputs = llm.generate(prompt_batch, sampling_params=sampling_params) - infer_sec = time.perf_counter() - infer_start - per_item_sec = infer_sec / max(1, len(batch)) - for image in images: - image.close() + plain_repair_jobs: List[dict] = [] + tile_repair_requests: List[dict] = [] + first_pass_outputs = _generate_batch_outputs( + llm, + jobs=jobs, + prompt=prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in first_pass_outputs: + item = result["item"] + state = doc_states[item["stem"]] + raw_text = str(result["raw_text"]) + image_stats = dict(item.get("image_stats", {})) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + state["page_outputs"][item["page_number"] - 1] = page_text + quality = _text_quality_metrics(page_text) + repair_strategy, repair_reason = _classify_repair( + page_text, + image_stats=image_stats, + repair_mode=args.repair_mode, + ) + metric = { + "page_number": int(item["page_number"]), + "infer_sec": float(result["infer_sec"]), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + "first_pass_quality_score": float(quality["quality_score"]), + "first_pass_letters": int(quality["letters"]), + "first_pass_digits": int(quality["digits"]), + "first_pass_pua_chars": int(quality["pua_chars"]), + "repair_strategy": repair_strategy, + "repair_reason": repair_reason, + 
"repair_attempted": False, + "repair_applied": False, + **image_stats, + **postprocess_metrics, + } + state["page_metrics"][item["page_number"] - 1] = metric + if repair_strategy == "plain": + plain_repair_jobs.append(item) + elif repair_strategy == "tile": + tile_repair_requests.append(item) + state["completed_pages"] = int(state["completed_pages"]) + 1 + progress_pages = [page for page in state["page_outputs"] if page] + _write_progress( + output_dir, + item["stem"], + progress_pages, + int(state["total_pages"]), + int(state["completed_pages"]), + ) - for item, output in zip(batch, outputs): + if plain_repair_jobs: + plain_repair_outputs = _generate_batch_outputs( + llm, + jobs=plain_repair_jobs, + prompt=plain_prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + for result in plain_repair_outputs: + item = result["item"] state = doc_states[item["stem"]] - raw_text = "" - if getattr(output, "outputs", None): - raw_text = str(output.outputs[0].text) - page_text, postprocess_metrics = _postprocess_page_text( - raw_text, - prompt=prompt, + metric = state["page_metrics"][item["page_number"] - 1] + original_text = state["page_outputs"][item["page_number"] - 1] + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, content_debug=bool(args.content_debug), ) if args.content_debug: - page_text = f"\n{page_text}".strip() - state["page_outputs"][item["page_number"] - 1] = page_text - state["page_metrics"].append( - { - "page_number": int(item["page_number"]), - "infer_sec": float(per_item_sec), - "raw_chars": int(len(raw_text.strip())), - "final_chars": int(len(page_text.strip())), - **postprocess_metrics, - } + repair_text = f"\n{repair_text}".strip() + original_quality = _text_quality_metrics(original_text) + repair_quality = _text_quality_metrics(repair_text) + apply_repair = bool(repair_text.strip()) and ( + float(repair_quality["quality_score"]) >= 
float(original_quality["quality_score"]) + or str(metric.get("repair_reason")) in {"markdown_garbage", "extreme_short"} + ) + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + metric["repair_final_chars"] = int(len(repair_text.strip())) + metric["repair_quality_score"] = float(repair_quality["quality_score"]) + metric["repair_profile"] = "plain_ocr" + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) + if apply_repair: + state["page_outputs"][item["page_number"] - 1] = repair_text + metric["repair_applied"] = True + metric["final_chars"] = int(len(repair_text.strip())) + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + if tile_repair_requests: + tile_jobs: List[dict] = [] + for item in tile_repair_requests: + for tile_name, y0, y1 in REPAIR_TILE_SPECS: + tile_jobs.append( + { + "stem": item["stem"], + "page_number": int(item["page_number"]), + "image_path": item["image_path"], + "variant": tile_name, + "crop_box": (0.0, y0, 1.0, y1), + } + ) + tile_outputs = _generate_batch_outputs( + llm, + jobs=tile_jobs, + prompt=prompt, + batch_size=int(args.batch_size), + sampling_params=sampling_params, + ) + grouped_tile_outputs: Dict[tuple[str, int], List[dict]] = {} + for result in tile_outputs: + key = (str(result["item"]["stem"]), int(result["item"]["page_number"])) + grouped_tile_outputs.setdefault(key, []).append(result) + for item in tile_repair_requests: + key = (str(item["stem"]), int(item["page_number"])) + state = doc_states[item["stem"]] + metric = state["page_metrics"][item["page_number"] - 1] + original_text = state["page_outputs"][item["page_number"] - 1] + grouped = sorted( + grouped_tile_outputs.get(key, []), + 
key=lambda value: {"top": 0, "mid": 1, "bottom": 2}.get(str(value["item"].get("variant")), 99), ) - state["completed_pages"] = int(state["completed_pages"]) + 1 - progress_pages = [page for page in state["page_outputs"] if page] - _write_progress( - output_dir, - item["stem"], - progress_pages, - int(state["total_pages"]), - int(state["completed_pages"]), + tile_parts: List[str] = [] + repair_infer_sec = 0.0 + for result in grouped: + repair_infer_sec += float(result["infer_sec"]) + tile_text, _ = _postprocess_page_text( + str(result["raw_text"]), + prompt=prompt, + content_debug=bool(args.content_debug), + ) + tile_parts.append(tile_text) + stitched = _stitch_tiled_markdown(tile_parts) + if args.content_debug: + stitched = f"\n{stitched}".strip() + original_quality = _text_quality_metrics(original_text) + stitched_quality = _text_quality_metrics(stitched) + apply_repair = bool(stitched.strip()) and ( + float(stitched_quality["quality_score"]) > float(original_quality["quality_score"]) + and int(stitched_quality["chars"]) >= int(original_quality["chars"]) ) + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(metric.get("repair_infer_sec", 0.0)) + float(repair_infer_sec) + metric["repair_final_chars"] = int(len(stitched.strip())) + metric["repair_quality_score"] = float(stitched_quality["quality_score"]) + metric["repair_tile_count"] = int(len(grouped)) + metric["repair_profile"] = "markdown_grounded_tiled" + metric["infer_sec"] = float(metric["infer_sec"]) + float(repair_infer_sec) + if apply_repair: + state["page_outputs"][item["page_number"] - 1] = stitched + metric["repair_applied"] = True + metric["final_chars"] = int(len(stitched.strip())) + _write_progress( + output_dir, + item["stem"], + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) for stem, state in doc_states.items(): markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" - 
page_metrics = sorted(state["page_metrics"], key=lambda item: int(item["page_number"])) + page_metrics = sorted( + [item for item in state["page_metrics"] if item], + key=lambda item: int(item["page_number"]), + ) + repair_summary = { + "repair_mode": str(args.repair_mode), + "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), + "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), + "plain_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied")))), + "tiled_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "markdown_grounded_tiled" and bool(item.get("repair_applied")))), + } _write_outputs( output_dir, stem, @@ -225,9 +533,11 @@ def main() -> int: "batch_size": int(args.batch_size), "gpu_memory_utilization": float(args.gpu_memory_utilization), "disable_fp8_kv": bool(args.disable_fp8_kv), + "repair_mode": str(args.repair_mode), "render_sec": float(state["render_sec"]), "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "repair_summary": repair_summary, "page_metrics": page_metrics, }, ) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index fe60390..8959e25 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -44,6 +44,7 @@ def _build_cli_command( content_debug: bool, device: Optional[str], ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -56,6 +57,7 @@ def _build_cli_command( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -78,6 +80,8 @@ 
def _build_cli_command( cmd += ["--device", str(device)] if ocr_profile: cmd += ["--ocr-profile", str(ocr_profile)] + if prompt_override: + cmd += ["--prompt-override", str(prompt_override)] if attn_backend: cmd += ["--attn-backend", str(attn_backend)] if base_size is not None: @@ -104,6 +108,8 @@ def _build_cli_command( cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] if disable_fp8_kv: cmd.append("--disable-fp8-kv") + if repair_mode: + cmd += ["--repair-mode", str(repair_mode)] return cmd @@ -139,6 +145,7 @@ def _run_cli( content_debug: bool, device: Optional[str], ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -151,6 +158,7 @@ def _run_cli( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -164,6 +172,7 @@ def _run_cli( content_debug=content_debug, device=device, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -176,6 +185,7 @@ def _run_cli( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) env = _build_env(python_bin=python_bin, visible_device=visible_device) @@ -318,6 +328,7 @@ def _run_multi_cli( content_debug: bool, log_dir: Path, ocr_profile: str, + prompt_override: Optional[str], attn_backend: str, base_size: Optional[int], image_size: Optional[int], @@ -330,6 +341,7 @@ def _run_multi_cli( vllm_batch_size: Optional[int], gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, + repair_mode: Optional[str], ) -> None: lanes = _plan_lanes( file_list=file_list, @@ -363,6 +375,7 @@ def _run_multi_cli( content_debug=content_debug, device="cuda", ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, 
base_size=base_size, image_size=image_size, @@ -375,6 +388,7 @@ def _run_multi_cli( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( @@ -416,6 +430,7 @@ def run_for_files( device: Optional[str] = None, runtime_backend: str = "transformers", ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, attn_backend: str = "auto", base_size: Optional[int] = None, image_size: Optional[int] = None, @@ -430,6 +445,7 @@ def run_for_files( gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, vllm_batch_size: Optional[int] = None, + repair_mode: str = "auto", **_: Any, ) -> Dict[str, Any]: """Run DeepSeek OCR for the provided files.""" @@ -509,6 +525,7 @@ def run_for_files( content_debug=content_debug, log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -521,6 +538,7 @@ def run_for_files( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) else: _run_cli( @@ -534,6 +552,7 @@ def run_for_files( content_debug=content_debug, device=device, ocr_profile=ocr_profile, + prompt_override=prompt_override, attn_backend=attn_backend, base_size=base_size, image_size=image_size, @@ -546,6 +565,7 @@ def run_for_files( vllm_batch_size=vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, ) results: Dict[str, Any] = {} diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 4629d2f..bc20acd 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -126,6 +126,7 @@ def 
test_build_cli_command_includes_speed_flags(tmp_path): content_debug=False, device="cuda", ocr_profile="plain_ocr", + prompt_override="custom prompt", attn_backend="flash_attention_2", base_size=768, image_size=512, @@ -138,9 +139,11 @@ def test_build_cli_command_includes_speed_flags(tmp_path): vllm_batch_size=None, gpu_memory_utilization=None, disable_fp8_kv=False, + repair_mode=None, ) assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--prompt-override" in cmd and "custom prompt" in cmd assert "--attn-backend" in cmd and "flash_attention_2" in cmd assert "--base-size" in cmd and "768" in cmd assert "--image-size" in cmd and "512" in cmd @@ -163,6 +166,7 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): content_debug=False, device="cuda", ocr_profile="markdown_grounded", + prompt_override=None, attn_backend="auto", base_size=None, image_size=None, @@ -175,11 +179,28 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): vllm_batch_size=16, gpu_memory_utilization=0.92, disable_fp8_kv=True, + repair_mode="auto", ) assert "--batch-size" in cmd and "16" in cmd assert "--gpu-memory-utilization" in cmd and "0.92" in cmd assert "--disable-fp8-kv" in cmd + assert "--repair-mode" in cmd and "auto" in cmd + + +def test_vllm_repair_classifier_routes_garbage_and_short_pages(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _classify_repair + + dense_page = { + "top_dark_ratio": 0.16, + "bottom_dark_ratio": 0.16, + "overall_dark_ratio": 0.15, + } + assert _classify_repair("\uf0b7" * 80, dense_page, "auto") == ("plain", "markdown_garbage") + assert _classify_repair("42", dense_page, "auto") == ("plain", "extreme_short") + assert _classify_repair("Α" * 300, dense_page, "auto") == ("tile", "short_coverage") + assert _classify_repair("Α" * 1200, dense_page, "auto") == ("none", None) + assert _classify_repair("Α" * 300, dense_page, "off") == ("none", None) def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): From 
5ad862043d628aa1871eabf9b851e702914a543a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:25:11 +0300 Subject: [PATCH 23/93] Add DeepSeek pipeline benchmark harness --- src/glossapi/ocr/deepseek/runner.py | 43 +- .../scripts/deepseek_pipeline_benchmark.py | 435 ++++++++++++++++++ 2 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 src/glossapi/scripts/deepseek_pipeline_benchmark.py diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 8959e25..906f30d 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -21,6 +21,7 @@ REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" +AUTO_VLLM_BATCH_PAGE_CAP = 160 def _page_count(pdf_path: Path) -> int: @@ -314,6 +315,24 @@ def _plan_lanes( return lanes +def _auto_vllm_batch_size( + *, + runtime_backend: str, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + total_pages = 0 + for name in file_list: + pdf_path = (input_root / name).resolve() + total_pages += int(_effective_page_count(pdf_path, max_pages)) + if total_pages <= 0: + return 1 + return min(int(total_pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + def _run_multi_cli( *, input_root: Path, @@ -362,6 +381,16 @@ def _run_multi_cli( if not lane_files: continue visible_device = int(lane["visible_device"]) + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend, + file_list=lane_files, + input_root=input_root, + max_pages=max_pages, + ) + ) log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" fh = stack.enter_context(log_path.open("w", 
encoding="utf-8")) cmd = _build_cli_command( @@ -385,7 +414,7 @@ def _run_multi_cli( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, runtime_backend=runtime_backend, - vllm_batch_size=vllm_batch_size, + vllm_batch_size=resolved_vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, @@ -541,6 +570,16 @@ def run_for_files( repair_mode=repair_mode, ) else: + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend_norm, + file_list=file_list, + input_root=pdf_root, + max_pages=max_pages, + ) + ) _run_cli( input_dir=pdf_root, output_dir=out_root, @@ -562,7 +601,7 @@ def run_for_files( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, runtime_backend=runtime_backend_norm, - vllm_batch_size=vllm_batch_size, + vllm_batch_size=resolved_vllm_batch_size, gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py new file mode 100644 index 0000000..b9a0c94 --- /dev/null +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ -0,0 +1,435 @@ +from __future__ import annotations + +import argparse +import json +import os +import random +import shutil +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + + +def _parse_devices(spec: str) -> List[int]: + tokens = [piece.strip() for piece in str(spec or "").split(",") if piece.strip()] + if not tokens: + raise argparse.ArgumentTypeError("--devices must contain at least one GPU id") + try: + return [int(token) for token in tokens] + except ValueError as exc: + raise argparse.ArgumentTypeError(f"Invalid GPU list: {spec}") from exc + + +def _parse_args() -> argparse.Namespace: + p = 
argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_pipeline_benchmark", + description="Benchmark DeepSeek OCR pipeline throughput for static and streaming-style scheduling.", + ) + p.add_argument("--repo", required=True) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--python-bin", required=True) + p.add_argument("--model-dir", required=True) + p.add_argument("--label", required=True) + p.add_argument("--mode", default="static", choices=["static", "streaming"]) + p.add_argument("--devices", default="0,1,2,3,4,5,6,7") + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--max-docs", type=int, default=None) + p.add_argument("--doc-order", default="name", choices=["name", "random", "largest_first"]) + p.add_argument("--seed", type=int, default=20260330) + p.add_argument("--stream-batch-pages", type=int, default=160) + p.add_argument("--runtime-backend", default="vllm", choices=["transformers", "vllm"]) + p.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + p.add_argument("--prompt-override", default=None) + p.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) + p.add_argument("--attn-backend", default="auto") + p.add_argument("--base-size", type=int, default=None) + p.add_argument("--image-size", type=int, default=None) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--max-new-tokens", type=int, default=None) + p.add_argument("--vllm-batch-size", type=int, default=None) + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + p.add_argument("--disable-fp8-kv", action="store_true") + p.add_argument("--clean", action="store_true") + return p.parse_args() + + +def _weighted_files( + *, + input_dir: Path, + max_docs: Optional[int], + doc_order: str, + seed: int, +) -> List[Dict[str, Any]]: + from glossapi.ocr.deepseek import runner as deepseek_runner + 
+ weighted = [] + for path in sorted(input_dir.glob("*.pdf")): + pages = int(deepseek_runner._effective_page_count(path, None)) + weighted.append({"name": path.name, "pages": pages}) + if doc_order == "largest_first": + weighted.sort(key=lambda item: (-int(item["pages"]), str(item["name"]))) + elif doc_order == "random": + rng = random.Random(int(seed)) + rng.shuffle(weighted) + if max_docs is not None: + weighted = weighted[: max(0, int(max_docs))] + return weighted + + +def _empty_lanes(devices: List[int], workers_per_gpu: int) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "batches": [], + "assigned_pages": 0, + } + ) + lane_id += 1 + return lanes + + +def _plan_static( + weighted_files: List[Dict[str, Any]], + devices: List[int], + workers_per_gpu: int, + input_dir: Path, +) -> List[Dict[str, Any]]: + from glossapi.ocr.deepseek import runner as deepseek_runner + + lanes = deepseek_runner._plan_lanes( + file_list=[str(item["name"]) for item in weighted_files], + input_root=input_dir, + lane_devices=devices, + workers_per_gpu=max(1, int(workers_per_gpu)), + max_pages=None, + ) + weights = {str(item["name"]): int(item["pages"]) for item in weighted_files} + planned: List[Dict[str, Any]] = [] + for lane in lanes: + files = list(lane["files"]) + if not files: + continue + weight = sum(int(weights.get(name, 0)) for name in files) + planned.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": int(lane["visible_device"]), + "assigned_pages": int(weight), + "batches": [ + { + "batch_id": 0, + "files": files, + "pages": int(weight), + } + ], + } + ) + return planned + + +def _plan_streaming( + weighted_files: List[Dict[str, Any]], + devices: List[int], + workers_per_gpu: int, + stream_batch_pages: int, +) -> List[Dict[str, Any]]: + lanes = _empty_lanes(devices, 
workers_per_gpu) + batch_target = max(1, int(stream_batch_pages)) + current: Dict[int, Dict[str, Any]] = { + int(lane["lane_id"]): {"files": [], "pages": 0} + for lane in lanes + } + + def flush(lane: Dict[str, Any]) -> None: + lane_id = int(lane["lane_id"]) + state = current[lane_id] + if not state["files"]: + return + lane["batches"].append( + { + "batch_id": len(lane["batches"]), + "files": list(state["files"]), + "pages": int(state["pages"]), + } + ) + state["files"] = [] + state["pages"] = 0 + + for item in weighted_files: + lane = min(lanes, key=lambda value: (int(value["assigned_pages"]) + int(current[int(value["lane_id"])]["pages"]), int(value["lane_id"]))) + lane_id = int(lane["lane_id"]) + current[lane_id]["files"].append(str(item["name"])) + current[lane_id]["pages"] = int(current[lane_id]["pages"]) + int(item["pages"]) + lane["assigned_pages"] = int(lane["assigned_pages"]) + int(item["pages"]) + if int(current[lane_id]["pages"]) >= batch_target: + flush(lane) + + for lane in lanes: + flush(lane) + return [lane for lane in lanes if lane["batches"]] + + +def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: + metrics_dir = run_dir / "json" / "metrics" + totals = { + "docs_with_metrics": 0, + "pages_flagged": 0, + "pages_repaired": 0, + "plain_repairs": 0, + "tiled_repairs": 0, + } + if not metrics_dir.exists(): + return totals + for path in metrics_dir.glob("*.metrics.json"): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + totals["docs_with_metrics"] += 1 + summary = data.get("repair_summary") or {} + totals["pages_flagged"] += int(summary.get("pages_flagged", 0)) + totals["pages_repaired"] += int(summary.get("pages_repaired", 0)) + totals["plain_repairs"] += int(summary.get("plain_repairs", 0)) + totals["tiled_repairs"] += int(summary.get("tiled_repairs", 0)) + return totals + + +def main() -> int: + args = _parse_args() + repo = Path(args.repo).resolve() + input_dir = Path(args.input_dir).resolve() + 
output_root = Path(args.output_dir).resolve() + python_bin = Path(args.python_bin).expanduser() + model_dir = Path(args.model_dir).resolve() + devices = _parse_devices(args.devices) + + from glossapi.ocr.deepseek import runner as deepseek_runner + + weighted_files = _weighted_files( + input_dir=input_dir, + max_docs=args.max_docs, + doc_order=args.doc_order, + seed=int(args.seed), + ) + if not weighted_files: + raise SystemExit("No PDFs found for benchmark input set.") + + if str(args.mode) == "streaming": + lanes = _plan_streaming( + weighted_files=weighted_files, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + stream_batch_pages=max(1, int(args.stream_batch_pages)), + ) + else: + lanes = _plan_static( + weighted_files=weighted_files, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + input_dir=input_dir, + ) + + run_dir = output_root / args.label + if args.clean and run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + logs_dir = run_dir / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "lane_plan.json").write_text(json.dumps(lanes, indent=2), encoding="utf-8") + + script_path = ( + deepseek_runner.DEFAULT_VLLM_SCRIPT + if str(args.runtime_backend) == "vllm" + else deepseek_runner.DEFAULT_SCRIPT + ) + + py_env = {"PYTHONPATH": str(repo / "src")} + + def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: + lane_id = int(lane["lane_id"]) + visible_device = int(lane["visible_device"]) + batch_id = int(batch["batch_id"]) + files = list(batch["files"]) + pages = int(batch["pages"]) + resolved_vllm_batch_size = ( + int(args.vllm_batch_size) + if args.vllm_batch_size is not None + else deepseek_runner._auto_vllm_batch_size( + runtime_backend=str(args.runtime_backend), + file_list=files, + input_root=input_dir, + max_pages=None, + ) + ) + log_path = logs_dir / f"lane_{lane_id:02d}_batch_{batch_id:03d}_gpu{visible_device}.log" + fh = 
log_path.open("w", encoding="utf-8") + cmd = deepseek_runner._build_cli_command( + input_dir=input_dir, + output_dir=run_dir, + files=files, + model_dir=model_dir, + python_bin=python_bin, + script=script_path, + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile=str(args.ocr_profile), + prompt_override=args.prompt_override, + attn_backend=str(args.attn_backend), + base_size=args.base_size, + image_size=args.image_size, + crop_mode=None, + render_dpi=int(args.render_dpi), + max_new_tokens=args.max_new_tokens, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend=str(args.runtime_backend), + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + repair_mode=str(args.repair_mode), + ) + env = deepseek_runner._build_env(python_bin=python_bin, visible_device=visible_device) + if env.get("PYTHONPATH"): + env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" + else: + env["PYTHONPATH"] = py_env["PYTHONPATH"] + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + return { + "lane_id": lane_id, + "visible_device": visible_device, + "batch_id": batch_id, + "pages": pages, + "files": files, + "resolved_vllm_batch_size": resolved_vllm_batch_size, + "log_path": str(log_path), + "fh": fh, + "proc": proc, + "start_ts": time.perf_counter(), + "cmd": cmd, + } + + pending_batches: Dict[int, List[Dict[str, Any]]] = { + int(lane["lane_id"]): list(lane["batches"]) + for lane in lanes + } + active: List[Dict[str, Any]] = [] + global_start = time.perf_counter() + for lane in lanes: + lane_id = int(lane["lane_id"]) + if pending_batches[lane_id]: + first_batch = pending_batches[lane_id].pop(0) + active.append(start_batch(lane, first_batch)) + + batch_results: List[Dict[str, Any]] = [] + while active: + time.sleep(0.2) + for item in list(active): + rc = item["proc"].poll() + if rc is None: + 
continue + end_ts = time.perf_counter() + item["fh"].close() + elapsed = max(0.000001, float(end_ts - item["start_ts"])) + batch_results.append( + { + "lane_id": int(item["lane_id"]), + "visible_device": int(item["visible_device"]), + "batch_id": int(item["batch_id"]), + "pages": int(item["pages"]), + "files": list(item["files"]), + "return_code": int(rc), + "resolved_vllm_batch_size": item["resolved_vllm_batch_size"], + "start_offset_sec": float(item["start_ts"] - global_start), + "end_offset_sec": float(end_ts - global_start), + "elapsed_sec": float(elapsed), + "sec_per_page": float(elapsed / max(1, int(item["pages"]))), + "log_path": str(item["log_path"]), + "cmd": item["cmd"], + } + ) + active.remove(item) + lane = next(lane for lane in lanes if int(lane["lane_id"]) == int(item["lane_id"])) + if pending_batches[int(item["lane_id"])]: + next_batch = pending_batches[int(item["lane_id"])].pop(0) + active.append(start_batch(lane, next_batch)) + + total_elapsed = max(0.000001, time.perf_counter() - global_start) + total_pages = sum(int(item["pages"]) for item in weighted_files) + failures = [item for item in batch_results if int(item["return_code"]) != 0] + + lane_results: List[Dict[str, Any]] = [] + for lane in lanes: + lane_batches = [item for item in batch_results if int(item["lane_id"]) == int(lane["lane_id"])] + if not lane_batches: + continue + lane_start = min(float(item["start_offset_sec"]) for item in lane_batches) + lane_end = max(float(item["end_offset_sec"]) for item in lane_batches) + lane_elapsed = max(0.000001, lane_end - lane_start) + lane_pages = sum(int(item["pages"]) for item in lane_batches) + lane_results.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": int(lane["visible_device"]), + "batch_count": len(lane_batches), + "pages": int(lane_pages), + "active_elapsed_sec": float(lane_elapsed), + "sec_per_page": float(lane_elapsed / max(1, lane_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in 
lane_batches), + } + ) + + gpu_results: List[Dict[str, Any]] = [] + for visible_device in sorted({int(item["visible_device"]) for item in batch_results}): + gpu_batches = [item for item in batch_results if int(item["visible_device"]) == visible_device] + gpu_start = min(float(item["start_offset_sec"]) for item in gpu_batches) + gpu_end = max(float(item["end_offset_sec"]) for item in gpu_batches) + gpu_elapsed = max(0.000001, gpu_end - gpu_start) + gpu_pages = sum(int(item["pages"]) for item in gpu_batches) + gpu_results.append( + { + "visible_device": visible_device, + "batch_count": len(gpu_batches), + "pages": int(gpu_pages), + "active_elapsed_sec": float(gpu_elapsed), + "sec_per_page": float(gpu_elapsed / max(1, gpu_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in gpu_batches), + } + ) + + repair_metrics = _collect_repair_metrics(run_dir) + summary = { + "label": str(args.label), + "status": "pass" if not failures else "fail", + "mode": str(args.mode), + "runtime_backend": str(args.runtime_backend), + "ocr_profile": str(args.ocr_profile), + "repair_mode": str(args.repair_mode), + "devices": devices, + "workers_per_gpu": int(args.workers_per_gpu), + "doc_order": str(args.doc_order), + "stream_batch_pages": int(args.stream_batch_pages), + "docs": len(weighted_files), + "pages": int(total_pages), + "wall_time_sec": float(total_elapsed), + "sec_per_page": float(total_elapsed / max(1, total_pages)), + "batch_results": batch_results, + "lane_results": lane_results, + "gpu_results": gpu_results, + "repair_metrics": repair_metrics, + "failures": failures, + } + (run_dir / "pipeline_benchmark_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + print(json.dumps(summary, indent=2)) + return 1 if failures else 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) From 41b983e4ac925f9f419ce687b3d78980df80fe29 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 18:52:32 +0300 
Subject: [PATCH 24/93] Document DeepSeek pipeline benchmark results --- docs/ocr_and_math_enhancement.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index ac2a5b7..3c9f584 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -152,6 +152,30 @@ Validated on 2026-03-30: - GPU memory utilization: `0.9` - Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` +Production markdown+repair benchmark on the same host: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Max new tokens: `2048` +- GPUs: `8` +- Static sharding (`1` shard/GPU): `574.87s` wall, `0.0754 sec/page` overall, `0.4971` to `0.5484 sec/page/GPU` +- Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` +- Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU +- Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches + +Decision: + +- Keep static sharding as the default large-run pipeline shape for now +- Do not enable streaming admission by default yet; on this benchmark it regressed badly versus static sharding +- Treat the earlier `0.3109 sec/page/GPU` result as the raw floor, and the static repaired-markdown result above as the current production-like baseline on this hardware + +Attention/runtime note: + +- The production fast path is `vllm`; logs on this stack show `flashinfer` autotuning plus CUDA graph capture +- Transformers remain the fallback path; prefer `flash_attention_2` there and do not optimize around `sdpa` + That number is the floor to preserve or beat when tuning the full markdown pipeline. 
Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. - Batch sizes From 0a863238261fb402c32687161daa78218a8f946d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 21:37:39 +0300 Subject: [PATCH 25/93] Harden DeepSeek repair classification --- .../ocr/deepseek/run_pdf_ocr_transformers.py | 8 +-- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 54 ++++++++++++++++++- tests/test_deepseek_runner_contract.py | 11 ++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 071b3b5..d7b0387 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -12,10 +12,7 @@ from pathlib import Path from typing import Iterable, List -import fitz -import torch from PIL import Image -from transformers import AutoModel, AutoTokenizer SRC_ROOT = Path(__file__).resolve().parents[3] if str(SRC_ROOT) not in sys.path: @@ -82,6 +79,8 @@ def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: + import fitz + images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: @@ -204,6 +203,9 @@ def _load_model( repetition_penalty: float | None, no_repeat_ngram_size: int | None, ): + import torch + from transformers import AutoModel, AutoTokenizer + attn_impl = _resolve_attn_backend(attn_backend) tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) try: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 56870e5..6d8354c 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -32,8 +32,12 @@ REPAIR_EXTREME_SHORT_CHARS = 120 REPAIR_PUA_THRESHOLD = 64 REPAIR_MIN_HALF_DARK = 0.08 
+REPAIR_MIN_THIRD_DARK = 0.07 REPAIR_MAX_OVERALL_DARK = 0.25 REPAIR_MIN_OVERALL_DARK = 0.04 +REPAIR_FOOTNOTE_SHORT_CHARS = 1100 +REPAIR_MIN_FOOTNOTE_LINES = 2 +REPAIR_FOOTNOTE_RATIO = 0.40 def _parse_args() -> argparse.Namespace: @@ -130,10 +134,16 @@ def _dark_ratio(y0: int, y1: int) -> float: return float(dark) / float(total) half = max(1, height // 2) + third = max(1, height // 3) + top_third_end = min(height, third) + middle_third_end = min(height, third * 2) dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) return { "top_dark_ratio": _dark_ratio(0, half), "bottom_dark_ratio": _dark_ratio(half, height), + "top_third_dark_ratio": _dark_ratio(0, top_third_end), + "middle_third_dark_ratio": _dark_ratio(top_third_end, middle_third_end), + "bottom_third_dark_ratio": _dark_ratio(middle_third_end, height), "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), } @@ -153,34 +163,74 @@ def _text_quality_metrics(text: str) -> dict: letters = sum(1 for ch in stripped if ch.isalpha()) digits = sum(1 for ch in stripped if ch.isdigit()) pua_chars = _count_private_use_chars(stripped) + lines = [line.strip() for line in stripped.splitlines() if line.strip()] + footnote_like_lines = sum(1 for line in lines if _is_footnote_like_line(line)) + avg_line_length = (sum(len(line) for line in lines) / float(len(lines))) if lines else 0.0 score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) return { "chars": int(len(stripped)), "letters": int(letters), "digits": int(digits), "pua_chars": int(pua_chars), + "line_count": int(len(lines)), + "footnote_like_lines": int(footnote_like_lines), + "avg_line_length": float(avg_line_length), "quality_score": float(score), } +def _is_footnote_like_line(line: str) -> bool: + stripped = str(line or "").strip() + if not stripped: + return False + if len(stripped) <= 2: + return False + if stripped[0].isdigit(): + if len(stripped) > 1 and stripped[1] in {".", 
")", "]"}: + return True + if len(stripped) > 2 and stripped[1].isspace(): + return True + if stripped[0] in {"*", "•", "-", "†", "‡"}: + return True + return False + + def _classify_repair(text: str, image_stats: dict, repair_mode: str) -> tuple[str, str | None]: if str(repair_mode or "off").strip().lower() != "auto": return "none", None quality = _text_quality_metrics(text) chars = int(quality["chars"]) pua_chars = int(quality["pua_chars"]) + line_count = int(quality["line_count"]) + footnote_like_lines = int(quality["footnote_like_lines"]) + footnote_ratio = float(footnote_like_lines) / float(max(1, line_count)) pua_ratio = float(pua_chars) / float(max(1, chars)) if pua_chars >= REPAIR_PUA_THRESHOLD or pua_ratio >= 0.10: return "plain", "markdown_garbage" - if chars <= REPAIR_EXTREME_SHORT_CHARS: - return "plain", "extreme_short" top_dark = float(image_stats.get("top_dark_ratio", 0.0)) bottom_dark = float(image_stats.get("bottom_dark_ratio", 0.0)) + top_third_dark = float(image_stats.get("top_third_dark_ratio", top_dark)) + middle_third_dark = float(image_stats.get("middle_third_dark_ratio", 0.0)) + bottom_third_dark = float(image_stats.get("bottom_third_dark_ratio", bottom_dark)) overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) + if ( + chars <= REPAIR_FOOTNOTE_SHORT_CHARS + and footnote_like_lines >= REPAIR_MIN_FOOTNOTE_LINES + and footnote_ratio >= REPAIR_FOOTNOTE_RATIO + and top_third_dark >= REPAIR_MIN_THIRD_DARK + and middle_third_dark >= REPAIR_MIN_THIRD_DARK + and REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK + ): + return "tile", "footnote_dominant" + if chars <= REPAIR_EXTREME_SHORT_CHARS: + return "plain", "extreme_short" if ( chars <= REPAIR_SHORT_CHARS and top_dark >= REPAIR_MIN_HALF_DARK and bottom_dark >= REPAIR_MIN_HALF_DARK + and top_third_dark >= REPAIR_MIN_THIRD_DARK + and middle_third_dark >= REPAIR_MIN_THIRD_DARK + and bottom_third_dark >= REPAIR_MIN_THIRD_DARK and REPAIR_MIN_OVERALL_DARK <= overall_dark 
<= REPAIR_MAX_OVERALL_DARK ): return "tile", "short_coverage" diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index bc20acd..d58472d 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -194,11 +194,22 @@ def test_vllm_repair_classifier_routes_garbage_and_short_pages(): dense_page = { "top_dark_ratio": 0.16, "bottom_dark_ratio": 0.16, + "top_third_dark_ratio": 0.15, + "middle_third_dark_ratio": 0.15, + "bottom_third_dark_ratio": 0.15, "overall_dark_ratio": 0.15, } assert _classify_repair("\uf0b7" * 80, dense_page, "auto") == ("plain", "markdown_garbage") assert _classify_repair("42", dense_page, "auto") == ("plain", "extreme_short") assert _classify_repair("Α" * 300, dense_page, "auto") == ("tile", "short_coverage") + footnote_only = "\n".join( + [ + "1. υποσημείωση πρώτη γραμμή", + "2. υποσημείωση δεύτερη γραμμή", + "3. υποσημείωση τρίτη γραμμή", + ] + ) + assert _classify_repair(footnote_only, dense_page, "auto") == ("tile", "footnote_dominant") assert _classify_repair("Α" * 1200, dense_page, "auto") == ("none", None) assert _classify_repair("Α" * 300, dense_page, "off") == ("none", None) From 3038fa8e4d71f5cd68ab6e215bec7c2dcbedef77 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Mon, 30 Mar 2026 23:45:19 +0300 Subject: [PATCH 26/93] Update DeepSeek benchmark note --- docs/ocr_and_math_enhancement.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 3c9f584..dd26569 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -160,7 +160,7 @@ Production markdown+repair benchmark on the same host: - Repair mode: `auto` - Max new tokens: `2048` - GPUs: `8` -- Static sharding (`1` shard/GPU): `574.87s` wall, `0.0754 sec/page` overall, `0.4971` to `0.5484 sec/page/GPU` +- Static sharding (`1` shard/GPU), validated rerun after classifier hardening: `558.88s` wall, 
`0.0733 sec/page` overall, `0.4912` to `0.5475 sec/page/GPU` - Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` - Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU - Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches From 6ab1e49aae957e16a1ad25f176a088941983e6c4 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 04:48:18 +0300 Subject: [PATCH 27/93] Improve DeepSeek scheduling and standardize defaults --- docs/configuration.md | 11 + docs/getting_started.md | 9 +- docs/ocr_and_math_enhancement.md | 62 +++ src/glossapi/corpus/phase_ocr_math.py | 17 +- .../ocr/deepseek/run_pdf_ocr_transformers.py | 103 ++++- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 437 ++++++++---------- src/glossapi/ocr/deepseek/runner.py | 183 +++++++- src/glossapi/ocr/deepseek/scheduling.py | 242 ++++++++++ src/glossapi/ocr/utils/cleaning.py | 211 ++++++++- .../scripts/deepseek_pipeline_benchmark.py | 263 +++++------ tests/test_deepseek_runner_contract.py | 143 +++++- tests/test_deepseek_scheduling.py | 238 ++++++++++ tests/test_streaming_garbage_detector.py | 83 ++++ 13 files changed, 1551 insertions(+), 451 deletions(-) create mode 100644 src/glossapi/ocr/deepseek/scheduling.py create mode 100644 tests/test_deepseek_scheduling.py create mode 100644 tests/test_streaming_garbage_detector.py diff --git a/docs/configuration.md b/docs/configuration.md index f8dd8de..af8737a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -49,6 +49,17 @@ The dedicated uv profile is OCR-only and does not install the Docling extraction - `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. 
+Standard OCR defaults: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` +- `repair_mode='auto'` +- `scheduler='auto'` +- `target_batch_pages=160` + +The DeepSeek runners now default to `max_new_tokens=2048`. Do not leave the token cap implicit in one environment and explicit in another when comparing benchmarks. + ## Math Enrichment (Phase‑2) - `GLOSSAPI_LATEX_EARLYSTOP` = `1|0` (default 1): enable/disable early‑stop wrapper. diff --git a/docs/getting_started.md b/docs/getting_started.md index e86d492..d1557d3 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -35,9 +35,16 @@ The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `gl - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- Standard OCR defaults after setup: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `scheduler='auto'` + - `target_batch_pages=160` - `flash-attn` is optional. The runner uses it when available and otherwise falls back to the Transformers `eager` attention implementation. +- Do not benchmark against an ad hoc DeepSeek venv and compare it to the validated `dependency_setup/.venvs/deepseek` results as if they were the same stack. ### Option 1 — pip (evaluate quickly) diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index dd26569..b013dd3 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -27,6 +27,29 @@ Policy: never OCR and math on the same file - Torch CUDA installed in the DeepSeek env (the uv setup pins the tested stack). - Optional helpers for Phase‑2 JSON: `pypdfium2`, `zstandard`. 
+### Standard DeepSeek venv + +Use a dedicated OCR runtime and treat it as the source of truth for DeepSeek runs: + +```bash +./dependency_setup/setup_deepseek_uv.sh \ + --venv dependency_setup/.venvs/deepseek \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ + --run-tests --smoke-test +``` + +Recommended environment variables after setup: + +```bash +export GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 +export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 +export GLOSSAPI_DEEPSEEK_PYTHON="$PWD/dependency_setup/.venvs/deepseek/bin/python" +export GLOSSAPI_DEEPSEEK_MODEL_DIR="/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2" +``` + +The OCR runtime should not silently drift between ad hoc virtual environments during benchmarking. If a benchmark uses a different DeepSeek venv, treat the result as a different runtime stack. + Verify GPU readiness before forcing OCR or math: ```bash @@ -88,7 +111,10 @@ The current recommended high-throughput DeepSeek configuration is: - `runtime_backend='vllm'` - `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` as the standard default ceiling - `repair_mode='auto'` to keep markdown as the primary output while selectively rerunning suspicious pages +- `scheduler='auto'` so multi-GPU vLLM runs resolve to exact-fill page-range batching +- `target_batch_pages=160` - large `vllm_batch_size` chosen to keep `sec/page/GPU` at or below the best validated floor for the target hardware Example: @@ -100,9 +126,12 @@ c.ocr( math_enhance=False, runtime_backend='vllm', ocr_profile='markdown_grounded', + max_new_tokens=2048, vllm_batch_size=160, gpu_memory_utilization=0.9, repair_mode='auto', + scheduler='auto', + target_batch_pages=160, use_gpus='multi', ) ``` @@ -116,6 +145,15 @@ c.ocr( This keeps the fast path batched while avoiding per-page sequential fallback overhead. 
+### What is now implemented + +- Empty-page skipping before OCR dispatch +- Streaming garbage early-stop during markdown generation +- Plain-text retry for pages that hit the garbage early-stop +- Multi-GPU exact-fill page-range scheduling for the DeepSeek runner +- Benchmark harness support for `whole_doc`, `fixed_shard`, and `exact_fill` +- Corpus API forwarding for the scheduler controls + ## Multi‑GPU Phase‑1 (extract): @@ -165,11 +203,29 @@ Production markdown+repair benchmark on the same host: - Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU - Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches +Validated on 2026-03-31 after standardizing the DeepSeek runtime ceiling back to `2048` and restoring the persistent one-process-per-lane architecture: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Scheduler: `whole_doc` +- Max new tokens: `2048` +- GPUs: `8` +- Clean rebuilt whole-document rerun: about `541s` wall, `0.0710 sec/page` overall, and `0.3927` to `0.5000 sec/page/GPU` + +Interpretation: + +- The rebuilt stack is back near the validated March 30 throughput once the silent `8192` ceiling regression is removed. +- The remaining performance problem is not raw inference speed; it is whole-document tail imbalance, where one oversized PDF can keep a single GPU busy after the other lanes finish. +- Multi-GPU `exact_fill` must therefore be benchmarked only on the persistent lane-worker architecture. The earlier exact-fill regression was caused by spawning a fresh OCR CLI per batch, not by the scheduling idea itself. 
+ Decision: - Keep static sharding as the default large-run pipeline shape for now - Do not enable streaming admission by default yet; on this benchmark it regressed badly versus static sharding - Treat the earlier `0.3109 sec/page/GPU` result as the raw floor, and the static repaired-markdown result above as the current production-like baseline on this hardware +- Treat the 2026-03-31 clean whole-document rerun as the restored benchmark sanity check for the standardized `2048` ceiling on the rebuilt runtime Attention/runtime note: @@ -178,6 +234,12 @@ Attention/runtime note: That number is the floor to preserve or beat when tuning the full markdown pipeline. Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. +Default policy note: + +- The standard DeepSeek OCR default is now `max_new_tokens=2048` for both the Transformers and vLLM runners. +- Leaving the flag unset must not silently expand to a larger ceiling such as `8192`. +- When comparing benchmark runs, treat a different token ceiling or a different DeepSeek venv as a different runtime/configuration. + - Batch sizes - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. 
diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index cd261ed..552af09 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -50,13 +50,17 @@ def ocr( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, - max_new_tokens: Optional[int] = None, + max_new_tokens: Optional[int] = 2048, repetition_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, vllm_batch_size: Optional[int] = None, gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, repair_mode: str = "auto", + scheduler: str = "auto", + target_batch_pages: int = 160, + shard_pages: int = 0, + shard_threshold_pages: int = 0, # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, @@ -94,6 +98,13 @@ def ocr( ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers per visible GPU. + - scheduler/target_batch_pages/shard_pages/shard_threshold_pages: + Multi-GPU scheduling controls. ``scheduler='auto'`` resolves to + exact-fill page-range batching for multi-GPU vLLM runs and falls back + to whole-document scheduling elsewhere. ``target_batch_pages`` is the + per-lane page budget the scheduler tries to fill. ``fixed_shard`` uses + ``shard_pages`` and ``shard_threshold_pages`` when explicit shard-based + planning is requested. - runtime_backend: ``transformers`` (default) or ``vllm``. 
- ocr_profile/prompt_override/attn_backend/base_size/image_size/crop_mode/render_dpi: DeepSeek rendering and attention controls used for throughput/quality @@ -636,6 +647,10 @@ def _run_math(stems: List[str]) -> None: gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + scheduler=scheduler, + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), content_debug=bool(content_debug), ) except Exception as _e: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index d7b0387..213fdcf 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -29,6 +29,7 @@ PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
PAGE_SPLIT = "\n<--- Page Split --->\n" +DEFAULT_MAX_NEW_TOKENS = 2048 def _profile_defaults(profile: str) -> dict: @@ -54,6 +55,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--output-dir", required=True) parser.add_argument("--model-dir", required=True) parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--page-ranges", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) @@ -62,7 +64,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) - parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS) parser.add_argument("--repetition-penalty", type=float, default=None) parser.add_argument("--no-repeat-ngram-size", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") @@ -72,22 +74,81 @@ def _parse_args() -> argparse.Namespace: return parser.parse_args() -def _iter_pdfs(input_dir: Path, files: List[str]) -> List[Path]: +def _parse_page_range_spec(input_dir: Path, spec: str) -> dict: + try: + name, start_raw, end_raw = str(spec).rsplit(":", 2) + except ValueError as exc: + raise ValueError(f"Invalid page range spec: {spec}") from exc + start_page = int(start_raw) + end_page = int(end_raw) + if start_page <= 0 or end_page < start_page: + raise ValueError(f"Invalid page range bounds: {spec}") + pdf_path = (input_dir / name).resolve() + return { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": start_page, + "end_page": end_page, + "stem": f"{pdf_path.stem}__p{start_page:05d}-{end_page:05d}", + } 
+ + +def _iter_pdf_jobs(input_dir: Path, files: List[str], page_ranges: List[str]) -> List[dict]: + jobs: List[dict] = [] if files: - return [(input_dir / name).resolve() for name in files] - return sorted(input_dir.glob("*.pdf")) + for name in files: + pdf_path = (input_dir / name).resolve() + jobs.append( + { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": 1, + "end_page": None, + "stem": pdf_path.stem, + } + ) + if page_ranges: + jobs.extend(_parse_page_range_spec(input_dir, spec) for spec in page_ranges) + if jobs: + return jobs + return [ + { + "pdf_path": path.resolve(), + "source_name": path.name, + "source_stem": path.stem, + "start_page": 1, + "end_page": None, + "stem": path.stem, + } + for path in sorted(input_dir.glob("*.pdf")) + ] -def _render_pages(pdf_path: Path, max_pages: int | None, render_dpi: int) -> List[Image.Image]: +def _render_pages( + pdf_path: Path, + max_pages: int | None, + render_dpi: int, + *, + start_page: int = 1, + end_page: int | None = None, +) -> List[Image.Image]: import fitz images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: - page_count = doc.page_count if max_pages is None else min(doc.page_count, max_pages) + doc_page_count = int(doc.page_count) + first_idx = max(0, int(start_page) - 1) + last_idx = doc_page_count - 1 if end_page is None else min(doc_page_count - 1, int(end_page) - 1) + if max_pages is not None: + last_idx = min(last_idx, first_idx + int(max_pages) - 1) + if last_idx < first_idx: + return images zoom = float(render_dpi) / 72.0 matrix = fitz.Matrix(zoom, zoom) - for idx in range(page_count): + for idx in range(first_idx, last_idx + 1): page = doc[idx] pixmap = page.get_pixmap(matrix=matrix, alpha=False) img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) @@ -327,8 +388,8 @@ def main() -> int: input_dir = Path(args.input_dir).resolve() output_dir = Path(args.output_dir).resolve() model_dir = Path(args.model_dir).resolve() - 
pdfs = _iter_pdfs(input_dir, args.files) - if not pdfs: + jobs = _iter_pdf_jobs(input_dir, args.files, args.page_ranges) + if not jobs: return 0 profile_defaults = _profile_defaults(args.ocr_profile) @@ -346,16 +407,24 @@ def main() -> int: args.no_repeat_ngram_size, ) - for pdf_path in pdfs: + for job in jobs: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) doc_start = time.perf_counter() render_start = time.perf_counter() - images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + images = _render_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ) render_sec = time.perf_counter() - render_start page_outputs: List[str] = [] page_metrics: List[dict] = [] total_pages = len(images) - _write_progress(output_dir, pdf_path.stem, page_outputs, total_pages, 0) - with tempfile.TemporaryDirectory(prefix=f"{pdf_path.stem}_deepseek_") as tmp_dir_str: + _write_progress(output_dir, stem, page_outputs, total_pages, 0) + with tempfile.TemporaryDirectory(prefix=f"{stem}_deepseek_") as tmp_dir_str: tmp_dir = Path(tmp_dir_str) for idx, image in enumerate(images): page_png = tmp_dir / f"page_{idx + 1:04d}.png" @@ -391,7 +460,7 @@ def main() -> int: ) _write_progress( output_dir, - pdf_path.stem, + stem, page_outputs, total_pages, idx + 1, @@ -399,10 +468,14 @@ def main() -> int: markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" _write_outputs( output_dir, - pdf_path.stem, + stem, markdown, len(images), extra_metrics={ + "source_file": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), + "source_end_page": int(job["start_page"]) + max(0, len(images) - 1), "ocr_profile": args.ocr_profile, "attn_backend": attn_impl, "base_size": base_size, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 6d8354c..6368f81 100644 --- 
a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -7,37 +7,28 @@ import tempfile import time from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List from PIL import Image from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( + DEFAULT_MAX_NEW_TOKENS, PAGE_SPLIT, - _iter_pdfs, + _iter_pdf_jobs, _postprocess_page_text, _profile_defaults, _render_pages, _write_outputs, _write_progress, ) +from glossapi.ocr.utils.cleaning import StreamingGarbageDetector LOGGER = logging.getLogger(__name__) -REPAIR_TILE_SPECS: Tuple[Tuple[str, float, float], ...] = ( - ("top", 0.0, 0.5), - ("mid", 0.35, 0.8), - ("bottom", 0.65, 1.0), -) REPAIR_DARK_THRESHOLD = 235 -REPAIR_SHORT_CHARS = 700 -REPAIR_EXTREME_SHORT_CHARS = 120 -REPAIR_PUA_THRESHOLD = 64 -REPAIR_MIN_HALF_DARK = 0.08 -REPAIR_MIN_THIRD_DARK = 0.07 -REPAIR_MAX_OVERALL_DARK = 0.25 -REPAIR_MIN_OVERALL_DARK = 0.04 -REPAIR_FOOTNOTE_SHORT_CHARS = 1100 -REPAIR_MIN_FOOTNOTE_LINES = 2 -REPAIR_FOOTNOTE_RATIO = 0.40 +EMPTY_PAGE_OVERALL_DARK_MAX = 0.0015 +EMPTY_PAGE_BAND_DARK_MAX = 0.0025 +GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS = 48 +GARBAGE_EARLY_STOP_WINDOW_TOKENS = 160 def _parse_args() -> argparse.Namespace: @@ -46,6 +37,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--output-dir", required=True) parser.add_argument("--model-dir", required=True) parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--page-ranges", nargs="*", default=[]) parser.add_argument("--max-pages", type=int, default=None) parser.add_argument("--device", default="cuda") parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) @@ -54,7 +46,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--base-size", type=int, default=None) parser.add_argument("--image-size", type=int, default=None) parser.add_argument("--render-dpi", type=int, default=144) - 
parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS) parser.add_argument("--repetition-penalty", type=float, default=None) parser.add_argument("--no-repeat-ngram-size", type=int, default=None) parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") @@ -71,14 +63,100 @@ def _parse_args() -> argparse.Namespace: def _load_vllm(model_dir: Path, gpu_memory_utilization: float, disable_fp8_kv: bool): from vllm import LLM - logits_processors = None + logits_processors = [] try: from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor - logits_processors = [NGramPerReqLogitsProcessor] + logits_processors.append(NGramPerReqLogitsProcessor) except Exception as exc: # pragma: no cover - environment dependent LOGGER.warning("DeepSeek OCR logits processor unavailable in vLLM; continuing without it: %s", exc) + try: + from transformers import AutoTokenizer + from vllm.sampling_params import SamplingParams + from vllm.v1.sample.logits_processor import AdapterLogitsProcessor + + class _GarbageStopPerReqLogitsProcessor: + def __init__( + self, + tokenizer, + eos_token_id: int | None, + *, + min_output_tokens: int, + window_tokens: int, + ) -> None: + self.tokenizer = tokenizer + self.eos_token_id = eos_token_id + self.min_output_tokens = int(min_output_tokens) + self.window_tokens = int(window_tokens) + self.detector = StreamingGarbageDetector() + self.seen_output_tokens = 0 + + def __call__(self, prompt_ids: list[int], output_ids: list[int], logits): + del prompt_ids + if self.eos_token_id is None: + return logits + current_len = len(output_ids) + if current_len <= self.seen_output_tokens: + return logits + new_ids = output_ids[self.seen_output_tokens :] + self.seen_output_tokens = current_len + if not new_ids: + return logits + new_text = self.tokenizer.decode(new_ids, skip_special_tokens=False) + if new_text: + self.detector.feed(new_text) + if 
current_len < self.min_output_tokens or self.detector.triggered_reason is None: + return logits + eos_token_id = int(self.eos_token_id) + eos_value = logits[eos_token_id].clone() + logits[:] = float("-inf") + logits[eos_token_id] = eos_value + return logits + + class GarbageEarlyStopLogitsProcessor(AdapterLogitsProcessor): + @classmethod + def validate_params(cls, params: SamplingParams): + extra = params.extra_args or {} + enabled = extra.get("garbage_early_stop") + if enabled is None: + return + if not isinstance(enabled, bool): + raise ValueError("garbage_early_stop must be a bool when provided") + min_output_tokens = extra.get("garbage_min_output_tokens") + if min_output_tokens is not None and int(min_output_tokens) <= 0: + raise ValueError("garbage_min_output_tokens must be > 0") + window_tokens = extra.get("garbage_window_tokens") + if window_tokens is not None and int(window_tokens) <= 0: + raise ValueError("garbage_window_tokens must be > 0") + + def __init__(self, vllm_config, device, is_pin_memory): + super().__init__(vllm_config, device, is_pin_memory) + self._tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True) + self._eos_token_id = self._tokenizer.eos_token_id + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor(self, params: SamplingParams): + extra = params.extra_args or {} + if not bool(extra.get("garbage_early_stop", False)): + return None + return _GarbageStopPerReqLogitsProcessor( + self._tokenizer, + self._eos_token_id, + min_output_tokens=int( + extra.get("garbage_min_output_tokens", GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS) + ), + window_tokens=int( + extra.get("garbage_window_tokens", GARBAGE_EARLY_STOP_WINDOW_TOKENS) + ), + ) + + logits_processors.append(GarbageEarlyStopLogitsProcessor) + except Exception as exc: # pragma: no cover - environment dependent + LOGGER.warning("Garbage-stop logits processor unavailable in vLLM; continuing without it: %s", exc) + engine_kwargs = { 
"model": str(model_dir), "tokenizer": str(model_dir), @@ -96,17 +174,20 @@ def _load_vllm(model_dir: Path, gpu_memory_utilization: float, disable_fp8_kv: b return LLM(**engine_kwargs) -def _sampling_params(max_new_tokens: int | None): +def _sampling_params(max_new_tokens: int | None, *, enable_garbage_early_stop: bool): from vllm import SamplingParams return SamplingParams( temperature=0.0, - max_tokens=int(max_new_tokens or 8192), + max_tokens=int(max_new_tokens or DEFAULT_MAX_NEW_TOKENS), skip_special_tokens=False, extra_args={ "ngram_size": 30, "window_size": 90, "whitelist_token_ids": {128821, 128822}, + "garbage_early_stop": bool(enable_garbage_early_stop), + "garbage_min_output_tokens": int(GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS), + "garbage_window_tokens": int(GARBAGE_EARLY_STOP_WINDOW_TOKENS), }, ) @@ -148,23 +229,18 @@ def _dark_ratio(y0: int, y1: int) -> float: } -def _count_private_use_chars(text: str) -> int: - return sum( +def _text_quality_metrics(text: str) -> dict: + stripped = str(text or "").strip() + letters = sum(1 for ch in stripped if ch.isalpha()) + digits = sum(1 for ch in stripped if ch.isdigit()) + pua_chars = sum( 1 - for ch in str(text or "") + for ch in stripped if 0xE000 <= ord(ch) <= 0xF8FF or 0xF0000 <= ord(ch) <= 0xFFFFD or 0x100000 <= ord(ch) <= 0x10FFFD ) - - -def _text_quality_metrics(text: str) -> dict: - stripped = str(text or "").strip() - letters = sum(1 for ch in stripped if ch.isalpha()) - digits = sum(1 for ch in stripped if ch.isdigit()) - pua_chars = _count_private_use_chars(stripped) lines = [line.strip() for line in stripped.splitlines() if line.strip()] - footnote_like_lines = sum(1 for line in lines if _is_footnote_like_line(line)) avg_line_length = (sum(len(line) for line in lines) / float(len(lines))) if lines else 0.0 score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) return { @@ -173,86 +249,31 @@ def _text_quality_metrics(text: str) -> dict: "digits": 
int(digits), "pua_chars": int(pua_chars), "line_count": int(len(lines)), - "footnote_like_lines": int(footnote_like_lines), "avg_line_length": float(avg_line_length), "quality_score": float(score), } -def _is_footnote_like_line(line: str) -> bool: - stripped = str(line or "").strip() - if not stripped: - return False - if len(stripped) <= 2: - return False - if stripped[0].isdigit(): - if len(stripped) > 1 and stripped[1] in {".", ")", "]"}: - return True - if len(stripped) > 2 and stripped[1].isspace(): - return True - if stripped[0] in {"*", "•", "-", "†", "‡"}: - return True - return False - - -def _classify_repair(text: str, image_stats: dict, repair_mode: str) -> tuple[str, str | None]: +def _is_effectively_empty_page(image_stats: dict, repair_mode: str) -> bool: if str(repair_mode or "off").strip().lower() != "auto": - return "none", None - quality = _text_quality_metrics(text) - chars = int(quality["chars"]) - pua_chars = int(quality["pua_chars"]) - line_count = int(quality["line_count"]) - footnote_like_lines = int(quality["footnote_like_lines"]) - footnote_ratio = float(footnote_like_lines) / float(max(1, line_count)) - pua_ratio = float(pua_chars) / float(max(1, chars)) - if pua_chars >= REPAIR_PUA_THRESHOLD or pua_ratio >= 0.10: - return "plain", "markdown_garbage" - top_dark = float(image_stats.get("top_dark_ratio", 0.0)) - bottom_dark = float(image_stats.get("bottom_dark_ratio", 0.0)) - top_third_dark = float(image_stats.get("top_third_dark_ratio", top_dark)) - middle_third_dark = float(image_stats.get("middle_third_dark_ratio", 0.0)) - bottom_third_dark = float(image_stats.get("bottom_third_dark_ratio", bottom_dark)) + return False overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) - if ( - chars <= REPAIR_FOOTNOTE_SHORT_CHARS - and footnote_like_lines >= REPAIR_MIN_FOOTNOTE_LINES - and footnote_ratio >= REPAIR_FOOTNOTE_RATIO - and top_third_dark >= REPAIR_MIN_THIRD_DARK - and middle_third_dark >= REPAIR_MIN_THIRD_DARK - and 
REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK - ): - return "tile", "footnote_dominant" - if chars <= REPAIR_EXTREME_SHORT_CHARS: - return "plain", "extreme_short" - if ( - chars <= REPAIR_SHORT_CHARS - and top_dark >= REPAIR_MIN_HALF_DARK - and bottom_dark >= REPAIR_MIN_HALF_DARK - and top_third_dark >= REPAIR_MIN_THIRD_DARK - and middle_third_dark >= REPAIR_MIN_THIRD_DARK - and bottom_third_dark >= REPAIR_MIN_THIRD_DARK - and REPAIR_MIN_OVERALL_DARK <= overall_dark <= REPAIR_MAX_OVERALL_DARK - ): - return "tile", "short_coverage" - return "none", None + if overall_dark > EMPTY_PAGE_OVERALL_DARK_MAX: + return False + return all( + float(image_stats.get(key, 0.0)) <= EMPTY_PAGE_BAND_DARK_MAX + for key in ( + "top_dark_ratio", + "bottom_dark_ratio", + "top_third_dark_ratio", + "middle_third_dark_ratio", + "bottom_third_dark_ratio", + ) + ) def _load_job_image(item: dict) -> Image.Image: - image = Image.open(item["image_path"]).convert("RGB") - crop_box = item.get("crop_box") - if not crop_box: - return image - width, height = image.size - x0_norm, y0_norm, x1_norm, y1_norm = crop_box - crop_pixels = ( - int(round(float(x0_norm) * width)), - int(round(float(y0_norm) * height)), - int(round(float(x1_norm) * width)), - int(round(float(y1_norm) * height)), - ) - cropped = image.crop(crop_pixels) - image.close() - return cropped + return Image.open(item["image_path"]).convert("RGB") def _generate_batch_outputs( @@ -263,15 +284,15 @@ def _generate_batch_outputs( batch_size: int, sampling_params, ) -> List[dict]: - outputs_by_key: Dict[tuple[str, int, str], dict] = {} + outputs_by_key: Dict[tuple[str, int], dict] = {} for batch in _batched(jobs, batch_size): prompt_batch = [] opened_images: List[Image.Image] = [] - keys: List[tuple[str, int, str]] = [] + keys: List[tuple[str, int]] = [] for item in batch: image = _load_job_image(item) opened_images.append(image) - keys.append((str(item["stem"]), int(item["page_number"]), str(item.get("variant", 
"page")))) + keys.append((str(item["stem"]), int(item["page_number"]))) prompt_batch.append( { "prompt": prompt, @@ -293,28 +314,7 @@ def _generate_batch_outputs( "raw_text": raw_text, "infer_sec": float(per_item_sec), } - ordered = [] - for item in jobs: - ordered.append(outputs_by_key[(str(item["stem"]), int(item["page_number"]), str(item.get("variant", "page")))]) - return ordered - - -def _stitch_tiled_markdown(parts: List[str]) -> str: - stitched: List[str] = [] - previous_lines: List[str] = [] - for part in parts: - lines = [line.rstrip() for line in str(part or "").splitlines() if line.strip()] - if not lines: - continue - overlap = 0 - max_overlap = min(len(previous_lines), len(lines), 12) - for size in range(max_overlap, 0, -1): - if previous_lines[-size:] == lines[:size]: - overlap = size - break - stitched.extend(lines[overlap:]) - previous_lines = lines - return "\n".join(stitched).strip() + return [outputs_by_key[(str(item["stem"]), int(item["page_number"]))] for item in jobs] def main() -> int: @@ -322,8 +322,8 @@ def main() -> int: input_dir = Path(args.input_dir).resolve() output_dir = Path(args.output_dir).resolve() model_dir = Path(args.model_dir).resolve() - pdfs = _iter_pdfs(input_dir, args.files) - if not pdfs: + jobs_to_run = _iter_pdf_jobs(input_dir, args.files, args.page_ranges) + if not jobs_to_run: return 0 profile_defaults = _profile_defaults(args.ocr_profile) @@ -338,21 +338,36 @@ def main() -> int: gpu_memory_utilization=float(args.gpu_memory_utilization), disable_fp8_kv=bool(args.disable_fp8_kv), ) - sampling_params = _sampling_params(args.max_new_tokens) + sampling_params = _sampling_params( + args.max_new_tokens, + enable_garbage_early_stop=str(args.repair_mode or "off").strip().lower() == "auto", + ) with tempfile.TemporaryDirectory(prefix="deepseek_vllm_") as tmp_dir_str: tmp_dir = Path(tmp_dir_str) doc_states: Dict[str, dict] = {} jobs: List[dict] = [] + plain_retry_jobs: List[dict] = [] - for pdf_path in pdfs: + for job in 
jobs_to_run: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) doc_start = time.perf_counter() render_start = time.perf_counter() - images = _render_pages(pdf_path, args.max_pages, args.render_dpi) + images = _render_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ) render_sec = time.perf_counter() - render_start total_pages = len(images) state = { - "stem": pdf_path.stem, + "stem": stem, + "source_name": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), "page_outputs": [""] * total_pages, "page_metrics": [None] * total_pages, "render_sec": float(render_sec), @@ -360,25 +375,50 @@ def main() -> int: "completed_pages": 0, "total_pages": total_pages, } - doc_states[pdf_path.stem] = state - _write_progress(output_dir, pdf_path.stem, [], total_pages, 0) + doc_states[stem] = state + _write_progress(output_dir, stem, [], total_pages, 0) for idx, image in enumerate(images): - page_path = tmp_dir / f"{pdf_path.stem}_page_{idx + 1:04d}.png" + page_path = tmp_dir / f"{stem}_page_{idx + 1:04d}.png" image_stats = _image_content_stats(image) + if _is_effectively_empty_page(image_stats, args.repair_mode): + state["page_metrics"][idx] = { + "page_number": int(idx + 1), + "infer_sec": 0.0, + "raw_chars": 0, + "final_chars": 0, + "first_pass_quality_score": 0.0, + "first_pass_letters": 0, + "first_pass_digits": 0, + "first_pass_pua_chars": 0, + "repair_strategy": "skip_empty", + "repair_reason": "empty_page", + "repair_attempted": False, + "repair_applied": False, + "empty_page_skipped": True, + "garbage_early_stop_applied": False, + **image_stats, + } + state["completed_pages"] = int(state["completed_pages"]) + 1 + _write_progress( + output_dir, + stem, + [page for page in state["page_outputs"] if page], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + image.close() + continue image.save(page_path, format="PNG") 
image.close() jobs.append( { - "stem": pdf_path.stem, + "stem": stem, "page_number": int(idx + 1), "image_path": page_path, "image_stats": image_stats, - "variant": "page", } ) - plain_repair_jobs: List[dict] = [] - tile_repair_requests: List[dict] = [] first_pass_outputs = _generate_batch_outputs( llm, jobs=jobs, @@ -400,11 +440,6 @@ def main() -> int: page_text = f"\n{page_text}".strip() state["page_outputs"][item["page_number"] - 1] = page_text quality = _text_quality_metrics(page_text) - repair_strategy, repair_reason = _classify_repair( - page_text, - image_stats=image_stats, - repair_mode=args.repair_mode, - ) metric = { "page_number": int(item["page_number"]), "infer_sec": float(result["infer_sec"]), @@ -414,32 +449,31 @@ def main() -> int: "first_pass_letters": int(quality["letters"]), "first_pass_digits": int(quality["digits"]), "first_pass_pua_chars": int(quality["pua_chars"]), - "repair_strategy": repair_strategy, - "repair_reason": repair_reason, + "repair_strategy": "plain" if bool(postprocess_metrics.get("early_stops", 0)) else "none", + "repair_reason": "early_stop_markdown_garbage" if bool(postprocess_metrics.get("early_stops", 0)) else None, "repair_attempted": False, "repair_applied": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": bool(postprocess_metrics.get("early_stops", 0)), **image_stats, **postprocess_metrics, } state["page_metrics"][item["page_number"] - 1] = metric - if repair_strategy == "plain": - plain_repair_jobs.append(item) - elif repair_strategy == "tile": - tile_repair_requests.append(item) + if bool(postprocess_metrics.get("early_stops", 0)) and str(args.repair_mode or "off").strip().lower() == "auto": + plain_retry_jobs.append(item) state["completed_pages"] = int(state["completed_pages"]) + 1 - progress_pages = [page for page in state["page_outputs"] if page] _write_progress( output_dir, item["stem"], - progress_pages, + [page for page in state["page_outputs"] if page], int(state["total_pages"]), 
int(state["completed_pages"]), ) - if plain_repair_jobs: + if plain_retry_jobs: plain_repair_outputs = _generate_batch_outputs( llm, - jobs=plain_repair_jobs, + jobs=plain_retry_jobs, prompt=plain_prompt, batch_size=int(args.batch_size), sampling_params=sampling_params, @@ -448,7 +482,6 @@ def main() -> int: item = result["item"] state = doc_states[item["stem"]] metric = state["page_metrics"][item["page_number"] - 1] - original_text = state["page_outputs"][item["page_number"] - 1] repair_text, repair_postprocess = _postprocess_page_text( str(result["raw_text"]), prompt=plain_prompt, @@ -456,21 +489,16 @@ def main() -> int: ) if args.content_debug: repair_text = f"\n{repair_text}".strip() - original_quality = _text_quality_metrics(original_text) - repair_quality = _text_quality_metrics(repair_text) - apply_repair = bool(repair_text.strip()) and ( - float(repair_quality["quality_score"]) >= float(original_quality["quality_score"]) - or str(metric.get("repair_reason")) in {"markdown_garbage", "extreme_short"} - ) metric["repair_attempted"] = True metric["repair_infer_sec"] = float(result["infer_sec"]) metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) metric["repair_final_chars"] = int(len(repair_text.strip())) - metric["repair_quality_score"] = float(repair_quality["quality_score"]) metric["repair_profile"] = "plain_ocr" + metric["repair_quality_score"] = float(_text_quality_metrics(repair_text)["quality_score"]) + metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) - if apply_repair: + if repair_text.strip(): state["page_outputs"][item["page_number"] - 1] = repair_text metric["repair_applied"] = True metric["final_chars"] = int(len(repair_text.strip())) @@ -482,77 +510,6 @@ def main() -> int: int(state["completed_pages"]), ) - if 
tile_repair_requests: - tile_jobs: List[dict] = [] - for item in tile_repair_requests: - for tile_name, y0, y1 in REPAIR_TILE_SPECS: - tile_jobs.append( - { - "stem": item["stem"], - "page_number": int(item["page_number"]), - "image_path": item["image_path"], - "variant": tile_name, - "crop_box": (0.0, y0, 1.0, y1), - } - ) - tile_outputs = _generate_batch_outputs( - llm, - jobs=tile_jobs, - prompt=prompt, - batch_size=int(args.batch_size), - sampling_params=sampling_params, - ) - grouped_tile_outputs: Dict[tuple[str, int], List[dict]] = {} - for result in tile_outputs: - key = (str(result["item"]["stem"]), int(result["item"]["page_number"])) - grouped_tile_outputs.setdefault(key, []).append(result) - for item in tile_repair_requests: - key = (str(item["stem"]), int(item["page_number"])) - state = doc_states[item["stem"]] - metric = state["page_metrics"][item["page_number"] - 1] - original_text = state["page_outputs"][item["page_number"] - 1] - grouped = sorted( - grouped_tile_outputs.get(key, []), - key=lambda value: {"top": 0, "mid": 1, "bottom": 2}.get(str(value["item"].get("variant")), 99), - ) - tile_parts: List[str] = [] - repair_infer_sec = 0.0 - for result in grouped: - repair_infer_sec += float(result["infer_sec"]) - tile_text, _ = _postprocess_page_text( - str(result["raw_text"]), - prompt=prompt, - content_debug=bool(args.content_debug), - ) - tile_parts.append(tile_text) - stitched = _stitch_tiled_markdown(tile_parts) - if args.content_debug: - stitched = f"\n{stitched}".strip() - original_quality = _text_quality_metrics(original_text) - stitched_quality = _text_quality_metrics(stitched) - apply_repair = bool(stitched.strip()) and ( - float(stitched_quality["quality_score"]) > float(original_quality["quality_score"]) - and int(stitched_quality["chars"]) >= int(original_quality["chars"]) - ) - metric["repair_attempted"] = True - metric["repair_infer_sec"] = float(metric.get("repair_infer_sec", 0.0)) + float(repair_infer_sec) - 
metric["repair_final_chars"] = int(len(stitched.strip())) - metric["repair_quality_score"] = float(stitched_quality["quality_score"]) - metric["repair_tile_count"] = int(len(grouped)) - metric["repair_profile"] = "markdown_grounded_tiled" - metric["infer_sec"] = float(metric["infer_sec"]) + float(repair_infer_sec) - if apply_repair: - state["page_outputs"][item["page_number"] - 1] = stitched - metric["repair_applied"] = True - metric["final_chars"] = int(len(stitched.strip())) - _write_progress( - output_dir, - item["stem"], - [page for page in state["page_outputs"] if page], - int(state["total_pages"]), - int(state["completed_pages"]), - ) - for stem, state in doc_states.items(): markdown = PAGE_SPLIT.join(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" page_metrics = sorted( @@ -564,7 +521,9 @@ def main() -> int: "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), "plain_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied")))), - "tiled_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "markdown_grounded_tiled" and bool(item.get("repair_applied")))), + "tiled_repairs": 0, + "empty_pages_skipped": int(sum(1 for item in page_metrics if bool(item.get("empty_page_skipped")))), + "pages_with_early_stop": int(sum(1 for item in page_metrics if bool(item.get("garbage_early_stop_applied")))), } _write_outputs( output_dir, @@ -572,6 +531,10 @@ def main() -> int: markdown, int(state["total_pages"]), extra_metrics={ + "source_file": str(state["source_name"]), + "source_stem": str(state["source_stem"]), + "source_start_page": int(state["source_start_page"]), + "source_end_page": int(state["source_start_page"]) + max(0, len(page_metrics) - 1), "ocr_profile": args.ocr_profile, "attn_backend": "vllm", 
"runtime_backend": "vllm", diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 906f30d..7a22018 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -12,6 +12,15 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) + try: import pypdfium2 as _pypdfium2 except Exception: # pragma: no cover - optional dependency @@ -22,6 +31,7 @@ DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" AUTO_VLLM_BATCH_PAGE_CAP = 160 +DEFAULT_MAX_NEW_TOKENS = 2048 def _page_count(pdf_path: Path) -> int: @@ -38,6 +48,7 @@ def _build_cli_command( output_dir: Path, *, files: List[str], + page_ranges: Optional[List[str]], model_dir: Path, python_bin: Optional[Path], script: Path, @@ -73,6 +84,8 @@ def _build_cli_command( ] if files: cmd += ["--files", *files] + if page_ranges: + cmd += ["--page-ranges", *page_ranges] if max_pages is not None: cmd += ["--max-pages", str(max_pages)] if content_debug: @@ -166,6 +179,7 @@ def _run_cli( input_dir=input_dir, output_dir=output_dir, files=files, + page_ranges=None, model_dir=model_dir, python_bin=python_bin, script=script, @@ -278,6 +292,24 @@ def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: return max(1, count) +def _source_documents( + *, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> List[SourceDocument]: + documents: List[SourceDocument] = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + documents.append( + SourceDocument( + name=str(name), + pages=int(_effective_page_count(pdf_path, max_pages)), + ) + ) + 
return documents + + def _plan_lanes( *, file_list: List[str], @@ -315,6 +347,75 @@ def _plan_lanes( return lanes +def _resolve_scheduler( + *, + scheduler: Optional[str], + runtime_backend: str, + lane_devices: List[int], + workers_per_gpu: int, +) -> str: + scheduler_norm = str(scheduler or "auto").strip().lower() + if scheduler_norm not in {"auto", "whole_doc", "fixed_shard", "exact_fill"}: + raise ValueError("scheduler must be one of 'auto', 'whole_doc', 'fixed_shard', or 'exact_fill'") + if scheduler_norm != "auto": + return scheduler_norm + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + lane_count = max(1, len(lane_devices)) * max(1, int(workers_per_gpu)) + if runtime_backend_norm == "vllm" and lane_count > 1: + return "exact_fill" + return "whole_doc" + + +def _plan_lane_batches( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], + runtime_backend: str, + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + documents = _source_documents( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + ) + scheduler_norm = _resolve_scheduler( + scheduler=scheduler, + runtime_backend=runtime_backend, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches( + documents, + target_batch_pages=max(1, int(target_batch_pages)), + ) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches( + slices, + target_batch_pages=max(1, int(target_batch_pages)), + ) + lanes = assign_batches_to_lanes( + batches, + devices=lane_devices, + 
workers_per_gpu=workers_per_gpu, + ) + return [lane.to_dict() for lane in lanes if lane.batches] + + def _auto_vllm_batch_size( *, runtime_backend: str, @@ -333,6 +434,34 @@ def _auto_vllm_batch_size( return min(int(total_pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) +def _auto_vllm_batch_size_for_pages(*, runtime_backend: str, pages: int) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + if int(pages) <= 0: + return 1 + return min(int(pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: List[str] = [] + page_ranges: List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + def _run_multi_cli( *, input_root: Path, @@ -361,13 +490,22 @@ def _run_multi_cli( gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, repair_mode: Optional[str], + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, ) -> None: - lanes = _plan_lanes( + lanes = _plan_lane_batches( file_list=file_list, input_root=input_root, lane_devices=lane_devices, workers_per_gpu=workers_per_gpu, max_pages=max_pages, + runtime_backend=runtime_backend, + scheduler=scheduler, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, ) if not lanes: return @@ -376,27 +514,31 @@ def _run_multi_cli( failures: List[str] = [] with ExitStack() as stack: procs = [] + for lane in lanes: - lane_files = list(lane["files"]) - if not lane_files: - continue + 
lane_id = int(lane["lane_id"]) visible_device = int(lane["visible_device"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) + if pages <= 0: + continue resolved_vllm_batch_size = ( int(vllm_batch_size) if vllm_batch_size is not None - else _auto_vllm_batch_size( + else _auto_vllm_batch_size_for_pages( runtime_backend=runtime_backend, - file_list=lane_files, - input_root=input_root, - max_pages=max_pages, + pages=min(int(target_batch_pages), int(pages)), ) ) - log_path = log_dir / f"lane_{lane['lane_id']}_gpu{visible_device}.log" + log_path = log_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" fh = stack.enter_context(log_path.open("w", encoding="utf-8")) cmd = _build_cli_command( input_dir=input_root, output_dir=out_root, - files=lane_files, + files=files, + page_ranges=page_ranges, model_dir=model_root, python_bin=python_exe, script=script_path, @@ -421,11 +563,13 @@ def _run_multi_cli( ) env = _build_env(python_bin=python_exe, visible_device=visible_device) LOGGER.info( - "Running DeepSeek OCR lane=%s visible_gpu=%s files=%d weight=%d: %s", - lane["lane_id"], + "Running DeepSeek OCR lane=%s visible_gpu=%s pages=%s planned_batches=%s files=%d ranges=%d: %s", + lane_id, visible_device, - len(lane_files), - lane["weight"], + pages, + lane_plan["planned_batch_count"], + len(files), + len(page_ranges), " ".join(cmd), ) proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args @@ -465,7 +609,7 @@ def run_for_files( image_size: Optional[int] = None, crop_mode: Optional[bool] = None, render_dpi: Optional[int] = None, - max_new_tokens: Optional[int] = None, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, repetition_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, use_gpus: Optional[str] = None, @@ -475,6 +619,10 @@ def run_for_files( disable_fp8_kv: bool = False, vllm_batch_size: 
Optional[int] = None, repair_mode: str = "auto", + scheduler: str = "auto", + target_batch_pages: int = AUTO_VLLM_BATCH_PAGE_CAP, + shard_pages: int = 0, + shard_threshold_pages: int = 0, **_: Any, ) -> Dict[str, Any]: """Run DeepSeek OCR for the provided files.""" @@ -568,6 +716,10 @@ def run_for_files( gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + scheduler=scheduler, + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), ) else: resolved_vllm_batch_size = ( @@ -584,6 +736,7 @@ def run_for_files( input_dir=pdf_root, output_dir=out_root, files=file_list, + page_ranges=None, model_dir=model_root, python_bin=python_exe, script=script_path, diff --git a/src/glossapi/ocr/deepseek/scheduling.py b/src/glossapi/ocr/deepseek/scheduling.py new file mode 100644 index 0000000..339b3e6 --- /dev/null +++ b/src/glossapi/ocr/deepseek/scheduling.py @@ -0,0 +1,242 @@ +"""Scheduling helpers for DeepSeek OCR page-range planning. + +The core abstraction is a divisible PDF page stream. We can cut a document into +page ranges exactly where a batch boundary needs it, then reconstruct outputs +later by `(doc_id, page_number)`. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +import heapq +from typing import Iterable, List, Optional + + +@dataclass(frozen=True) +class SourceDocument: + name: str + pages: int + + +@dataclass(frozen=True) +class WorkSlice: + source_name: str + source_pages: int + start_page: int + end_page: int + + @property + def pages(self) -> int: + return int(self.end_page) - int(self.start_page) + 1 + + @property + def is_full_document(self) -> bool: + return int(self.start_page) == 1 and int(self.end_page) == int(self.source_pages) + + @property + def item_id(self) -> str: + if self.is_full_document: + return str(self.source_name) + return f"{self.source_name}:{int(self.start_page)}:{int(self.end_page)}" + + @property + def cli_file(self) -> Optional[str]: + return str(self.source_name) if self.is_full_document else None + + @property + def cli_page_range(self) -> Optional[str]: + if self.is_full_document: + return None + return self.item_id + + def to_dict(self) -> dict: + return { + "item_id": self.item_id, + "pages": int(self.pages), + "file": self.cli_file, + "page_range": self.cli_page_range, + "source_name": str(self.source_name), + "start_page": int(self.start_page), + "end_page": int(self.end_page), + "is_full_document": bool(self.is_full_document), + } + + +@dataclass +class DocumentCursor: + name: str + total_pages: int + next_page: int = 1 + + @property + def remaining_pages(self) -> int: + return max(0, int(self.total_pages) - int(self.next_page) + 1) + + def take(self, requested_pages: int) -> WorkSlice: + take_pages = min(max(1, int(requested_pages)), int(self.remaining_pages)) + start_page = int(self.next_page) + end_page = start_page + take_pages - 1 + self.next_page = end_page + 1 + return WorkSlice( + source_name=str(self.name), + source_pages=int(self.total_pages), + start_page=int(start_page), + end_page=int(end_page), + ) + + +@dataclass +class BatchPlan: + batch_id: int + items: List[WorkSlice] = 
field(default_factory=list) + + @property + def pages(self) -> int: + return sum(int(item.pages) for item in self.items) + + def to_dict(self) -> dict: + return { + "batch_id": int(self.batch_id), + "item_ids": [item.item_id for item in self.items], + "files": [item.cli_file for item in self.items if item.cli_file], + "page_ranges": [item.cli_page_range for item in self.items if item.cli_page_range], + "pages": int(self.pages), + "items": [item.to_dict() for item in self.items], + } + + +@dataclass +class LanePlan: + lane_id: int + visible_device: int + batches: List[BatchPlan] = field(default_factory=list) + + @property + def assigned_pages(self) -> int: + return sum(int(batch.pages) for batch in self.batches) + + def to_dict(self) -> dict: + return { + "lane_id": int(self.lane_id), + "visible_device": int(self.visible_device), + "assigned_pages": int(self.assigned_pages), + "batches": [batch.to_dict() for batch in self.batches], + } + + +def build_whole_document_slices(documents: Iterable[SourceDocument]) -> List[WorkSlice]: + return [ + WorkSlice( + source_name=str(doc.name), + source_pages=int(doc.pages), + start_page=1, + end_page=int(doc.pages), + ) + for doc in documents + ] + + +def build_fixed_shard_slices( + documents: Iterable[SourceDocument], + *, + shard_pages: int, + shard_threshold_pages: int, +) -> List[WorkSlice]: + shard_size = max(0, int(shard_pages)) + threshold = max(0, int(shard_threshold_pages)) + slices: List[WorkSlice] = [] + for doc in documents: + total_pages = int(doc.pages) + if shard_size <= 0 or total_pages <= max(threshold, shard_size): + slices.extend(build_whole_document_slices([doc])) + continue + start_page = 1 + while start_page <= total_pages: + end_page = min(total_pages, start_page + shard_size - 1) + slices.append( + WorkSlice( + source_name=str(doc.name), + source_pages=total_pages, + start_page=int(start_page), + end_page=int(end_page), + ) + ) + start_page = end_page + 1 + return slices + + +def build_exact_fill_batches( 
+ documents: Iterable[SourceDocument], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + heap: List[tuple[int, int, DocumentCursor]] = [] + for idx, doc in enumerate(documents): + cursor = DocumentCursor(name=str(doc.name), total_pages=int(doc.pages)) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + + batches: List[BatchPlan] = [] + while heap: + remaining_capacity = int(target) + items: List[WorkSlice] = [] + while remaining_capacity > 0 and heap: + _neg_remaining, idx, cursor = heapq.heappop(heap) + take_pages = min(int(cursor.remaining_pages), int(remaining_capacity)) + items.append(cursor.take(take_pages)) + remaining_capacity -= int(take_pages) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + batches.append(BatchPlan(batch_id=len(batches), items=items)) + return batches + + +def pack_slices_into_batches( + slices: Iterable[WorkSlice], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + ordered = sorted(list(slices), key=lambda item: (-int(item.pages), item.item_id)) + batches: List[BatchPlan] = [] + current: List[WorkSlice] = [] + current_pages = 0 + + def flush() -> None: + nonlocal current, current_pages + if not current: + return + batches.append(BatchPlan(batch_id=len(batches), items=list(current))) + current = [] + current_pages = 0 + + for item in ordered: + item_pages = int(item.pages) + if current and current_pages + item_pages > target: + flush() + current.append(item) + current_pages += item_pages + if current_pages >= target: + flush() + flush() + return batches + + +def assign_batches_to_lanes( + batches: Iterable[BatchPlan], + *, + devices: List[int], + workers_per_gpu: int, +) -> List[LanePlan]: + lanes: List[LanePlan] = [] + lane_id = 0 + for visible_device in devices: + for _ in range(max(1, int(workers_per_gpu))): + 
lanes.append(LanePlan(lane_id=lane_id, visible_device=int(visible_device))) + lane_id += 1 + for batch in batches: + lane = min(lanes, key=lambda item: (int(item.assigned_pages), int(item.lane_id))) + lane.batches.append(batch) + return lanes + diff --git a/src/glossapi/ocr/utils/cleaning.py b/src/glossapi/ocr/utils/cleaning.py index 9b4e287..c194c72 100644 --- a/src/glossapi/ocr/utils/cleaning.py +++ b/src/glossapi/ocr/utils/cleaning.py @@ -260,11 +260,207 @@ def _detect_repeated_lines_cut(text: str, *, threshold: int = 10) -> Optional[in return None +def _is_private_use_char(ch: str) -> bool: + codepoint = ord(ch) + return ( + 0xE000 <= codepoint <= 0xF8FF + or 0xF0000 <= codepoint <= 0xFFFFD + or 0x100000 <= codepoint <= 0x10FFFD + ) + + +def _is_symbol_garbage_char(ch: str) -> bool: + if _is_private_use_char(ch): + return True + return ch in { + "•", + "", + "·", + "◦", + "▪", + "▫", + "‣", + "∙", + "⋅", + "●", + "○", + "◉", + "◌", + "◆", + "◇", + "■", + "□", + "▲", + "△", + "▼", + "▽", + "►", + "◄", + "◊", + "", + "", + "", + "", + "", + "", + } + + +def _detect_symbol_garbage_cut(text: str, *, threshold: int = 16) -> Optional[int]: + """Cut on long runs of isolated bullet/dingbat/private-use symbols. + + This targets the common DeepSeek garbage mode where the model emits long + whitespace-separated runs of bullets or private-use glyphs instead of text. + """ + if threshold <= 1: + return 0 + run_count = 0 + run_start: Optional[int] = None + last_non_ws = -10_000 + for index, ch in enumerate(text): + if ch.isspace(): + continue + if _is_symbol_garbage_char(ch): + if run_count == 0 or (index - last_non_ws) > 3: + run_start = index + run_count = 1 + else: + run_count += 1 + last_non_ws = index + if run_count >= threshold: + return run_start + continue + run_count = 0 + run_start = None + last_non_ws = index + return None + + +NUMERIC_LIST_TOKEN_PATTERN = re.compile(r"(? Optional[int]: + """Cut on degenerate `1. 2. 3. 
...` style list output.""" + if threshold <= 1: + return 0 + matches = list(NUMERIC_LIST_TOKEN_PATTERN.finditer(text)) + if len(matches) < threshold: + return None + run_start = matches[0].start() + run_count = 1 + prev_value = int(matches[0].group(1)) + prev_end = matches[0].end() + for match in matches[1:]: + current_value = int(match.group(1)) + gap = text[prev_end : match.start()] + if current_value == prev_value + 1 and len(gap) <= 4 and gap.strip() == "": + run_count += 1 + else: + run_start = match.start() + run_count = 1 + if run_count >= threshold: + return run_start + prev_value = current_value + prev_end = match.end() + return None + + +class StreamingGarbageDetector: + """Incremental detector for common OCR garbage generation modes. + + This is designed for hot decode loops: feed only newly decoded text chunks + and keep O(1) mutable state instead of rescanning the whole suffix. + """ + + def __init__( + self, + *, + symbol_threshold: int = 16, + numeric_list_threshold: int = 12, + ) -> None: + self.symbol_threshold = int(symbol_threshold) + self.numeric_list_threshold = int(numeric_list_threshold) + self._symbol_run = 0 + self._numeric_run = 0 + self._expected_next_number: Optional[int] = None + self._digits_buffer: str = "" + self.triggered_reason: Optional[str] = None + + def reset(self) -> None: + self._symbol_run = 0 + self._numeric_run = 0 + self._expected_next_number = None + self._digits_buffer = "" + self.triggered_reason = None + + def _reset_numeric(self) -> None: + self._numeric_run = 0 + self._expected_next_number = None + self._digits_buffer = "" + + def _feed_symbol_char(self, ch: str) -> bool: + if ch.isspace(): + return False + if _is_symbol_garbage_char(ch): + self._symbol_run += 1 + if self._symbol_run >= self.symbol_threshold: + self.triggered_reason = "symbol_garbage" + return True + return False + self._symbol_run = 0 + return False + + def _feed_numeric_char(self, ch: str) -> bool: + if ch.isspace(): + if self._digits_buffer: + 
self._reset_numeric() + return False + if "0" <= ch <= "9": + self._digits_buffer += ch + return False + if ch in {".", ")"} and self._digits_buffer: + value = int(self._digits_buffer) + self._digits_buffer = "" + if self._expected_next_number is None: + if value == 1: + self._numeric_run = 1 + self._expected_next_number = 2 + else: + self._reset_numeric() + else: + if value == self._expected_next_number: + self._numeric_run += 1 + self._expected_next_number += 1 + elif value == 1: + self._numeric_run = 1 + self._expected_next_number = 2 + else: + self._reset_numeric() + if self._numeric_run >= self.numeric_list_threshold: + self.triggered_reason = "numeric_list_garbage" + return True + return False + self._reset_numeric() + return False + + def feed(self, text: str) -> bool: + if self.triggered_reason is not None: + return True + for ch in str(text or ""): + if self._feed_symbol_char(ch): + return True + if self._feed_numeric_char(ch): + return True + return False + + def detect_early_stop_index( text: str, *, line_repeat_threshold: int = 10, char_repeat_threshold: int = 200, + symbol_garbage_threshold: int = 16, + numeric_list_threshold: int = 12, ) -> Optional[int]: """Find earliest cut index based on repetition heuristics. 
@@ -273,11 +469,12 @@ def detect_early_stop_index( """ idx_char = _detect_repeated_char_cut(text, threshold=char_repeat_threshold) idx_line = _detect_repeated_lines_cut(text, threshold=line_repeat_threshold) - if idx_char is None: - return idx_line - if idx_line is None: - return idx_char - return min(idx_char, idx_line) + idx_symbol = _detect_symbol_garbage_cut(text, threshold=symbol_garbage_threshold) + idx_numeric = _detect_numeric_list_garbage_cut(text, threshold=numeric_list_threshold) + candidates = [idx for idx in (idx_char, idx_line, idx_symbol, idx_numeric) if idx is not None] + if not candidates: + return None + return min(candidates) def apply_early_stop( @@ -286,6 +483,8 @@ def apply_early_stop( content_debug: bool = False, line_repeat_threshold: int = 10, char_repeat_threshold: int = 200, + symbol_garbage_threshold: int = 16, + numeric_list_threshold: int = 12, metrics: Optional[dict] = None, ) -> str: """Apply early termination heuristics to ``text`` and optionally append notice. 
@@ -299,6 +498,8 @@ def apply_early_stop( text, line_repeat_threshold=line_repeat_threshold, char_repeat_threshold=char_repeat_threshold, + symbol_garbage_threshold=symbol_garbage_threshold, + numeric_list_threshold=numeric_list_threshold, ) if cut is None: return text diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py index b9a0c94..83a8a8b 100644 --- a/src/glossapi/scripts/deepseek_pipeline_benchmark.py +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ -2,7 +2,6 @@ import argparse import json -import os import random import shutil import subprocess @@ -10,6 +9,15 @@ from pathlib import Path from typing import Any, Dict, List, Optional +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) + def _parse_devices(spec: str) -> List[int]: tokens = [piece.strip() for piece in str(spec or "").split(",") if piece.strip()] @@ -24,7 +32,7 @@ def _parse_devices(spec: str) -> List[int]: def _parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( prog="python -m glossapi.scripts.deepseek_pipeline_benchmark", - description="Benchmark DeepSeek OCR pipeline throughput for static and streaming-style scheduling.", + description="Benchmark DeepSeek OCR pipeline throughput for different scheduling strategies.", ) p.add_argument("--repo", required=True) p.add_argument("--input-dir", required=True) @@ -33,12 +41,20 @@ def _parse_args() -> argparse.Namespace: p.add_argument("--model-dir", required=True) p.add_argument("--label", required=True) p.add_argument("--mode", default="static", choices=["static", "streaming"]) + p.add_argument( + "--scheduler", + default="whole_doc", + choices=["whole_doc", "fixed_shard", "exact_fill"], + ) p.add_argument("--devices", default="0,1,2,3,4,5,6,7") p.add_argument("--workers-per-gpu", type=int, default=1) 
p.add_argument("--max-docs", type=int, default=None) p.add_argument("--doc-order", default="name", choices=["name", "random", "largest_first"]) p.add_argument("--seed", type=int, default=20260330) + p.add_argument("--target-batch-pages", type=int, default=160) p.add_argument("--stream-batch-pages", type=int, default=160) + p.add_argument("--shard-pages", type=int, default=0) + p.add_argument("--shard-threshold-pages", type=int, default=0) p.add_argument("--runtime-backend", default="vllm", choices=["transformers", "vllm"]) p.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) p.add_argument("--prompt-override", default=None) @@ -47,7 +63,7 @@ def _parse_args() -> argparse.Namespace: p.add_argument("--base-size", type=int, default=None) p.add_argument("--image-size", type=int, default=None) p.add_argument("--render-dpi", type=int, default=144) - p.add_argument("--max-new-tokens", type=int, default=None) + p.add_argument("--max-new-tokens", type=int, default=2048) p.add_argument("--vllm-batch-size", type=int, default=None) p.add_argument("--gpu-memory-utilization", type=float, default=0.9) p.add_argument("--disable-fp8-kv", action="store_true") @@ -55,125 +71,58 @@ def _parse_args() -> argparse.Namespace: return p.parse_args() -def _weighted_files( +def _weighted_documents( *, input_dir: Path, max_docs: Optional[int], doc_order: str, seed: int, -) -> List[Dict[str, Any]]: +) -> List[SourceDocument]: from glossapi.ocr.deepseek import runner as deepseek_runner - weighted = [] - for path in sorted(input_dir.glob("*.pdf")): - pages = int(deepseek_runner._effective_page_count(path, None)) - weighted.append({"name": path.name, "pages": pages}) + documents = [ + SourceDocument(name=path.name, pages=int(deepseek_runner._effective_page_count(path, None))) + for path in sorted(input_dir.glob("*.pdf")) + ] if doc_order == "largest_first": - weighted.sort(key=lambda item: (-int(item["pages"]), str(item["name"]))) + 
documents.sort(key=lambda item: (-int(item.pages), str(item.name))) elif doc_order == "random": rng = random.Random(int(seed)) - rng.shuffle(weighted) + rng.shuffle(documents) if max_docs is not None: - weighted = weighted[: max(0, int(max_docs))] - return weighted - + documents = documents[: max(0, int(max_docs))] + return documents -def _empty_lanes(devices: List[int], workers_per_gpu: int) -> List[Dict[str, Any]]: - lanes: List[Dict[str, Any]] = [] - lane_id = 0 - for visible_device in devices: - for _ in range(max(1, int(workers_per_gpu))): - lanes.append( - { - "lane_id": lane_id, - "visible_device": int(visible_device), - "batches": [], - "assigned_pages": 0, - } - ) - lane_id += 1 - return lanes - -def _plan_static( - weighted_files: List[Dict[str, Any]], +def _plan_lanes( + *, + documents: List[SourceDocument], devices: List[int], workers_per_gpu: int, - input_dir: Path, + scheduler: str, + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, ) -> List[Dict[str, Any]]: - from glossapi.ocr.deepseek import runner as deepseek_runner - - lanes = deepseek_runner._plan_lanes( - file_list=[str(item["name"]) for item in weighted_files], - input_root=input_dir, - lane_devices=devices, + scheduler_norm = str(scheduler or "whole_doc").strip().lower() + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches(documents, target_batch_pages=max(1, int(target_batch_pages))) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches(slices, target_batch_pages=max(1, int(target_batch_pages))) + lanes = assign_batches_to_lanes( + batches, + devices=devices, workers_per_gpu=max(1, int(workers_per_gpu)), - max_pages=None, ) - weights = {str(item["name"]): int(item["pages"]) for item in weighted_files} - 
planned: List[Dict[str, Any]] = [] - for lane in lanes: - files = list(lane["files"]) - if not files: - continue - weight = sum(int(weights.get(name, 0)) for name in files) - planned.append( - { - "lane_id": int(lane["lane_id"]), - "visible_device": int(lane["visible_device"]), - "assigned_pages": int(weight), - "batches": [ - { - "batch_id": 0, - "files": files, - "pages": int(weight), - } - ], - } - ) - return planned - - -def _plan_streaming( - weighted_files: List[Dict[str, Any]], - devices: List[int], - workers_per_gpu: int, - stream_batch_pages: int, -) -> List[Dict[str, Any]]: - lanes = _empty_lanes(devices, workers_per_gpu) - batch_target = max(1, int(stream_batch_pages)) - current: Dict[int, Dict[str, Any]] = { - int(lane["lane_id"]): {"files": [], "pages": 0} - for lane in lanes - } - - def flush(lane: Dict[str, Any]) -> None: - lane_id = int(lane["lane_id"]) - state = current[lane_id] - if not state["files"]: - return - lane["batches"].append( - { - "batch_id": len(lane["batches"]), - "files": list(state["files"]), - "pages": int(state["pages"]), - } - ) - state["files"] = [] - state["pages"] = 0 - - for item in weighted_files: - lane = min(lanes, key=lambda value: (int(value["assigned_pages"]) + int(current[int(value["lane_id"])]["pages"]), int(value["lane_id"]))) - lane_id = int(lane["lane_id"]) - current[lane_id]["files"].append(str(item["name"])) - current[lane_id]["pages"] = int(current[lane_id]["pages"]) + int(item["pages"]) - lane["assigned_pages"] = int(lane["assigned_pages"]) + int(item["pages"]) - if int(current[lane_id]["pages"]) >= batch_target: - flush(lane) - - for lane in lanes: - flush(lane) - return [lane for lane in lanes if lane["batches"]] + return [lane.to_dict() for lane in lanes if lane.batches] def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: @@ -201,6 +150,26 @@ def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: return totals +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: 
List[str] = [] + page_ranges: List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + def main() -> int: args = _parse_args() repo = Path(args.repo).resolve() @@ -212,29 +181,23 @@ def main() -> int: from glossapi.ocr.deepseek import runner as deepseek_runner - weighted_files = _weighted_files( + documents = _weighted_documents( input_dir=input_dir, max_docs=args.max_docs, doc_order=args.doc_order, seed=int(args.seed), ) - if not weighted_files: + if not documents: raise SystemExit("No PDFs found for benchmark input set.") - - if str(args.mode) == "streaming": - lanes = _plan_streaming( - weighted_files=weighted_files, - devices=devices, - workers_per_gpu=max(1, int(args.workers_per_gpu)), - stream_batch_pages=max(1, int(args.stream_batch_pages)), - ) - else: - lanes = _plan_static( - weighted_files=weighted_files, - devices=devices, - workers_per_gpu=max(1, int(args.workers_per_gpu)), - input_dir=input_dir, - ) + lanes = _plan_lanes( + documents=documents, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + scheduler=str(args.scheduler), + target_batch_pages=int(args.target_batch_pages), + shard_pages=int(args.shard_pages), + shard_threshold_pages=int(args.shard_threshold_pages), + ) run_dir = output_root / args.label if args.clean and run_dir.exists(): @@ -249,31 +212,27 @@ def main() -> int: if str(args.runtime_backend) == "vllm" else deepseek_runner.DEFAULT_SCRIPT ) - py_env = {"PYTHONPATH": str(repo / "src")} - def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: 
+ def start_lane(lane: Dict[str, Any]) -> Dict[str, Any]: lane_id = int(lane["lane_id"]) visible_device = int(lane["visible_device"]) - batch_id = int(batch["batch_id"]) - files = list(batch["files"]) - pages = int(batch["pages"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) resolved_vllm_batch_size = ( int(args.vllm_batch_size) if args.vllm_batch_size is not None - else deepseek_runner._auto_vllm_batch_size( - runtime_backend=str(args.runtime_backend), - file_list=files, - input_root=input_dir, - max_pages=None, - ) + else min(max(1, int(args.target_batch_pages)), max(1, pages)) ) - log_path = logs_dir / f"lane_{lane_id:02d}_batch_{batch_id:03d}_gpu{visible_device}.log" + log_path = logs_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" fh = log_path.open("w", encoding="utf-8") cmd = deepseek_runner._build_cli_command( input_dir=input_dir, output_dir=run_dir, files=files, + page_ranges=page_ranges, model_dir=model_dir, python_bin=python_bin, script=script_path, @@ -297,17 +256,17 @@ def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: repair_mode=str(args.repair_mode), ) env = deepseek_runner._build_env(python_bin=python_bin, visible_device=visible_device) - if env.get("PYTHONPATH"): - env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" - else: - env["PYTHONPATH"] = py_env["PYTHONPATH"] + env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" if env.get("PYTHONPATH") else py_env["PYTHONPATH"] proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args return { "lane_id": lane_id, "visible_device": visible_device, - "batch_id": batch_id, + "batch_id": 0, "pages": pages, "files": files, + "page_ranges": page_ranges, + "planned_batch_count": int(lane_plan["planned_batch_count"]), + "planned_batch_pages": list(lane_plan["planned_batch_pages"]), 
"resolved_vllm_batch_size": resolved_vllm_batch_size, "log_path": str(log_path), "fh": fh, @@ -316,17 +275,8 @@ def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: "cmd": cmd, } - pending_batches: Dict[int, List[Dict[str, Any]]] = { - int(lane["lane_id"]): list(lane["batches"]) - for lane in lanes - } - active: List[Dict[str, Any]] = [] global_start = time.perf_counter() - for lane in lanes: - lane_id = int(lane["lane_id"]) - if pending_batches[lane_id]: - first_batch = pending_batches[lane_id].pop(0) - active.append(start_batch(lane, first_batch)) + active: List[Dict[str, Any]] = [start_lane(lane) for lane in lanes] batch_results: List[Dict[str, Any]] = [] while active: @@ -345,8 +295,11 @@ def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: "batch_id": int(item["batch_id"]), "pages": int(item["pages"]), "files": list(item["files"]), + "page_ranges": list(item.get("page_ranges") or []), + "planned_batch_count": int(item.get("planned_batch_count", 1)), + "planned_batch_pages": list(item.get("planned_batch_pages") or []), "return_code": int(rc), - "resolved_vllm_batch_size": item["resolved_vllm_batch_size"], + "resolved_vllm_batch_size": int(item["resolved_vllm_batch_size"]), "start_offset_sec": float(item["start_ts"] - global_start), "end_offset_sec": float(end_ts - global_start), "elapsed_sec": float(elapsed), @@ -356,13 +309,9 @@ def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: } ) active.remove(item) - lane = next(lane for lane in lanes if int(lane["lane_id"]) == int(item["lane_id"])) - if pending_batches[int(item["lane_id"])]: - next_batch = pending_batches[int(item["lane_id"])].pop(0) - active.append(start_batch(lane, next_batch)) total_elapsed = max(0.000001, time.perf_counter() - global_start) - total_pages = sum(int(item["pages"]) for item in weighted_files) + total_pages = sum(int(doc.pages) for doc in documents) failures = [item for item in batch_results if 
int(item["return_code"]) != 0] lane_results: List[Dict[str, Any]] = [] @@ -409,15 +358,19 @@ def start_batch(lane: Dict[str, Any], batch: Dict[str, Any]) -> Dict[str, Any]: "label": str(args.label), "status": "pass" if not failures else "fail", "mode": str(args.mode), + "scheduler": str(args.scheduler), "runtime_backend": str(args.runtime_backend), "ocr_profile": str(args.ocr_profile), "repair_mode": str(args.repair_mode), "devices": devices, "workers_per_gpu": int(args.workers_per_gpu), "doc_order": str(args.doc_order), + "target_batch_pages": int(args.target_batch_pages), "stream_batch_pages": int(args.stream_batch_pages), - "docs": len(weighted_files), + "docs": len(documents), "pages": int(total_pages), + "shard_pages": int(args.shard_pages), + "shard_threshold_pages": int(args.shard_threshold_pages), "wall_time_sec": float(total_elapsed), "sec_per_page": float(total_elapsed / max(1, total_pages)), "batch_results": batch_results, diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index d58472d..1e39cd5 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -119,6 +119,7 @@ def test_build_cli_command_includes_speed_flags(tmp_path): input_dir=tmp_path / "in", output_dir=tmp_path / "out", files=["a.pdf"], + page_ranges=None, model_dir=tmp_path / "model", python_bin=Path("/usr/bin/python3"), script=tmp_path / "run.py", @@ -152,6 +153,14 @@ def test_build_cli_command_includes_speed_flags(tmp_path): assert "--max-new-tokens" in cmd and "1024" in cmd +def test_deepseek_default_max_new_tokens_is_standardized(): + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import DEFAULT_MAX_NEW_TOKENS + + assert DEFAULT_MAX_NEW_TOKENS == 2048 + assert runner.DEFAULT_MAX_NEW_TOKENS == 2048 + + def test_build_cli_command_includes_vllm_flags(tmp_path): from glossapi.ocr.deepseek.runner import _build_cli_command @@ -159,6 +168,7 @@ def 
test_build_cli_command_includes_vllm_flags(tmp_path): input_dir=tmp_path / "in", output_dir=tmp_path / "out", files=["a.pdf"], + page_ranges=None, model_dir=tmp_path / "model", python_bin=Path("/usr/bin/python3"), script=tmp_path / "run_vllm.py", @@ -188,30 +198,79 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): assert "--repair-mode" in cmd and "auto" in cmd -def test_vllm_repair_classifier_routes_garbage_and_short_pages(): - from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _classify_repair +def test_build_cli_command_includes_page_ranges(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command - dense_page = { - "top_dark_ratio": 0.16, - "bottom_dark_ratio": 0.16, - "top_third_dark_ratio": 0.15, - "middle_third_dark_ratio": 0.15, - "bottom_third_dark_ratio": 0.15, - "overall_dark_ratio": 0.15, - } - assert _classify_repair("\uf0b7" * 80, dense_page, "auto") == ("plain", "markdown_garbage") - assert _classify_repair("42", dense_page, "auto") == ("plain", "extreme_short") - assert _classify_repair("Α" * 300, dense_page, "auto") == ("tile", "short_coverage") - footnote_only = "\n".join( - [ - "1. υποσημείωση πρώτη γραμμή", - "2. υποσημείωση δεύτερη γραμμή", - "3. 
υποσημείωση τρίτη γραμμή", - ] + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=[], + page_ranges=["a.pdf:1:64", "b.pdf:65:128"], + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=32, + gpu_memory_utilization=0.9, + disable_fp8_kv=False, + repair_mode="auto", ) - assert _classify_repair(footnote_only, dense_page, "auto") == ("tile", "footnote_dominant") - assert _classify_repair("Α" * 1200, dense_page, "auto") == ("none", None) - assert _classify_repair("Α" * 300, dense_page, "off") == ("none", None) + + assert "--page-ranges" in cmd + assert "a.pdf:1:64" in cmd + assert "b.pdf:65:128" in cmd + + +def test_vllm_empty_page_detector_is_conservative(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _is_effectively_empty_page + + empty_page = { + "top_dark_ratio": 0.0004, + "bottom_dark_ratio": 0.0006, + "top_third_dark_ratio": 0.0002, + "middle_third_dark_ratio": 0.0005, + "bottom_third_dark_ratio": 0.0007, + "overall_dark_ratio": 0.0008, + } + non_empty_sparse_page = { + "top_dark_ratio": 0.003, + "bottom_dark_ratio": 0.004, + "top_third_dark_ratio": 0.0028, + "middle_third_dark_ratio": 0.0031, + "bottom_third_dark_ratio": 0.0042, + "overall_dark_ratio": 0.0022, + } + assert _is_effectively_empty_page(empty_page, "auto") is True + assert _is_effectively_empty_page(non_empty_sparse_page, "auto") is False + assert _is_effectively_empty_page(empty_page, "off") is False + + +def test_early_stop_detects_symbol_and_numeric_list_garbage(): + from glossapi.ocr.utils.cleaning import detect_early_stop_index + + symbol_garbage = "Κανονικό 
κείμενο\n" + (" " * 20) + numeric_list_garbage = "Πρόλογος\n" + " ".join(f"{idx}." for idx in range(1, 20)) + + symbol_cut = detect_early_stop_index(symbol_garbage) + numeric_cut = detect_early_stop_index(numeric_list_garbage) + + assert symbol_cut is not None + assert "Κανονικό κείμενο" in symbol_garbage[:symbol_cut] + assert numeric_cut is not None + assert "Πρόλογος" in numeric_list_garbage[:numeric_cut] def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): @@ -241,3 +300,43 @@ def fake_run_cli(input_dir, output_dir, **kwargs): assert calls["runtime_backend"] == "vllm" assert Path(calls["script"]).name == "run_pdf_ocr_vllm.py" assert result["doc"]["page_count"] == 1 + + +def test_runner_forwards_scheduler_controls_to_multi_cli(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_multi_cli(**kwargs): + calls.update(kwargs) + md_dir = kwargs["out_root"] / "markdown" + metrics_dir = kwargs["out_root"] / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_multi_cli", fake_run_multi_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files( + corpus, + ["doc.pdf"], + runtime_backend="vllm", + use_gpus="multi", + devices=[0, 1], + scheduler="exact_fill", + target_batch_pages=196, + shard_pages=64, + shard_threshold_pages=256, + ) + + assert calls["scheduler"] == "exact_fill" + assert calls["target_batch_pages"] == 196 + assert calls["shard_pages"] == 64 + assert calls["shard_threshold_pages"] == 256 + assert result["doc"]["page_count"] == 1 diff --git 
a/tests/test_deepseek_scheduling.py b/tests/test_deepseek_scheduling.py new file mode 100644 index 0000000..25983a8 --- /dev/null +++ b/tests/test_deepseek_scheduling.py @@ -0,0 +1,238 @@ +from pathlib import Path + + +def _touch_files(root: Path, names: list[str]) -> None: + root.mkdir(parents=True, exist_ok=True) + for name in names: + (root / name).write_bytes(b"%PDF-1.4\n%stub\n") + + +def test_plan_lanes_balances_weighted_docs_greedily(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "huge.pdf": 500, + "mid_a.pdf": 300, + "mid_b.pdf": 300, + "small_a.pdf": 200, + "tiny_a.pdf": 100, + "tiny_b.pdf": 100, + } + _touch_files(tmp_path, list(weights)) + + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + lanes = runner._plan_lanes( + file_list=["tiny_b.pdf", "mid_a.pdf", "huge.pdf", "small_a.pdf", "tiny_a.pdf", "mid_b.pdf"], + input_root=tmp_path, + lane_devices=[0, 1, 2], + workers_per_gpu=1, + max_pages=None, + ) + + assert [int(lane["weight"]) for lane in lanes] == [500, 500, 500] + assigned = [name for lane in lanes for name in lane["files"]] + assert sorted(assigned) == sorted(weights) + assert len(assigned) == len(set(assigned)) + + +def test_auto_vllm_batch_size_caps_total_pages(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "a.pdf": 90, + "b.pdf": 120, + "c.pdf": 400, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + capped = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=None, + ) + reduced = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=20, + ) + + assert capped == 160 + assert reduced == 60 + + +def test_auto_scheduler_prefers_exact_fill_for_multi_gpu_vllm(): + from glossapi.ocr.deepseek import runner + + assert runner._resolve_scheduler( + 
scheduler="auto", + runtime_backend="vllm", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "exact_fill" + assert runner._resolve_scheduler( + scheduler="auto", + runtime_backend="transformers", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "whole_doc" + + +def test_fixed_shard_builder_only_splits_large_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument, build_fixed_shard_slices + + documents = [ + SourceDocument(name="huge.pdf", pages=310), + SourceDocument(name="mid.pdf", pages=120), + SourceDocument(name="small.pdf", pages=40), + ] + + slices = build_fixed_shard_slices(documents, shard_pages=128, shard_threshold_pages=200) + + assert [item.item_id for item in slices] == [ + "huge.pdf:1:128", + "huge.pdf:129:256", + "huge.pdf:257:310", + "mid.pdf", + "small.pdf", + ] + + +def test_exact_fill_batches_split_documents_to_fill_target(): + from glossapi.ocr.deepseek.scheduling import SourceDocument, build_exact_fill_batches + + documents = [ + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=60), + SourceDocument(name="c.pdf", pages=60), + SourceDocument(name="d.pdf", pages=20), + ] + + batches = build_exact_fill_batches(documents, target_batch_pages=160) + + assert [batch.pages for batch in batches] == [160, 160, 20] + assert [item.item_id for item in batches[0].items] == ["a.pdf:1:160"] + assert set(item.item_id for item in batches[1].items) == {"a.pdf:161:200", "b.pdf", "c.pdf"} + assert [item.item_id for item in batches[2].items] == ["d.pdf"] + + +def test_assign_batches_to_lanes_balances_full_batches(): + from glossapi.ocr.deepseek.scheduling import ( + BatchPlan, + WorkSlice, + assign_batches_to_lanes, + ) + + batches = [ + BatchPlan(batch_id=0, items=[WorkSlice("a.pdf", 160, 1, 160)]), + BatchPlan(batch_id=1, items=[WorkSlice("b.pdf", 160, 1, 160)]), + BatchPlan(batch_id=2, items=[WorkSlice("c.pdf", 160, 1, 160)]), + BatchPlan(batch_id=3, items=[WorkSlice("d.pdf", 20, 1, 20)]), + ] + + lanes = 
assign_batches_to_lanes(batches, devices=[0, 1], workers_per_gpu=1) + + assert sorted(lane.assigned_pages for lane in lanes) == [180, 320] + assert [len(lane.batches) for lane in lanes] == [2, 2] + + +def test_benchmark_planner_exact_fill_mixes_ranges_and_whole_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=200), + SourceDocument(name="tiny.pdf", pages=20), + SourceDocument(name="mid.pdf", pages=60), + SourceDocument(name="mid2.pdf", pages=60), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", "mid2.pdf", "tiny.pdf"] + + +def test_benchmark_planner_whole_doc_preserves_whole_files(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=1085), + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=200), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="whole_doc", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + assigned = [name for lane in lanes for batch in lane["batches"] for name in batch["files"]] + assert sorted(assigned) == ["a.pdf", "b.pdf", "monster.pdf"] + + +def test_runner_lane_batches_exact_fill_split_large_docs(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + 
"monster.pdf": 200, + "mid.pdf": 60, + "mid2.pdf": 60, + "tiny.pdf": 20, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + lanes = runner._plan_lane_batches( + file_list=list(weights), + input_root=tmp_path, + lane_devices=[0, 1], + workers_per_gpu=1, + max_pages=None, + runtime_backend="vllm", + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", "mid2.pdf", "tiny.pdf"] diff --git a/tests/test_streaming_garbage_detector.py b/tests/test_streaming_garbage_detector.py new file mode 100644 index 0000000..0d12fdd --- /dev/null +++ b/tests/test_streaming_garbage_detector.py @@ -0,0 +1,83 @@ +from pathlib import Path + +import pytest + +from glossapi.ocr.utils.cleaning import StreamingGarbageDetector + + +DOWNLOAD_EXPORT = ( + Path.home() + / "Downloads" + / "deepseek_ocr_43pdfs_allpages_20260331" +) + + +def _stream_detect(text: str, *, chunk_size: int) -> tuple[bool, str | None]: + detector = StreamingGarbageDetector() + for idx in range(0, len(text), max(1, int(chunk_size))): + if detector.feed(text[idx : idx + chunk_size]): + return True, detector.triggered_reason + return False, detector.triggered_reason + + +def _load_real_markdown_garbage() -> str: + root = DOWNLOAD_EXPORT / "corrections_markdown_garbage" + if not root.exists(): + pytest.skip(f"missing local export: {root}") + for path in sorted(root.glob("*__markdown_original.md")): + text = path.read_text(encoding="utf-8", errors="ignore") + if "\uf0b7" in text or "" in text or "" in text: + return text + pytest.skip("no local 
symbol-garbage sample found") + + +def _load_real_empty_page_numeric_garbage() -> str: + if not DOWNLOAD_EXPORT.exists(): + pytest.skip(f"missing local export: {DOWNLOAD_EXPORT}") + preferred = DOWNLOAD_EXPORT / ( + "000008__04afb897cb954a76fe378b2ca22f2f059097876fa60a57666de75e37319e5968__p0008__markdown_original.md" + ) + candidates = [preferred] if preferred.exists() else sorted(DOWNLOAD_EXPORT.glob("*__markdown_original.md")) + for path in candidates: + text = path.read_text(encoding="utf-8", errors="ignore") + if "1. 2. 3." in text: + return text + pytest.skip("no local numeric-list garbage sample found") + + +@pytest.mark.parametrize("chunk_size", [1, 2, 5, 17]) +def test_streaming_detector_catches_symbol_garbage_across_chunks(chunk_size): + text = "Κανονικό κείμενο\n" + (" " * 20) + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "symbol_garbage" + + +@pytest.mark.parametrize("chunk_size", [1, 2, 4, 11]) +def test_streaming_detector_catches_numeric_list_garbage_across_chunks(chunk_size): + text = " ".join(f"{idx}." 
for idx in range(1, 25)) + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "numeric_list_garbage" + + +def test_streaming_detector_ignores_non_ascii_digit_glyphs(): + triggered, reason = _stream_detect("x³ y² z¹", chunk_size=1) + assert triggered is False + assert reason is None + + +@pytest.mark.parametrize("chunk_size", [1, 3, 9, 23]) +def test_streaming_detector_real_faulty_page_from_downloads(chunk_size): + text = _load_real_markdown_garbage() + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "symbol_garbage" + + +@pytest.mark.parametrize("chunk_size", [1, 3, 8, 21]) +def test_streaming_detector_real_empty_page_generation_from_downloads(chunk_size): + text = _load_real_empty_page_numeric_garbage() + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "numeric_list_garbage" From f59bd96de9e6f0470d5e8132cdda757dfaf3ddc4 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 13:02:18 +0300 Subject: [PATCH 28/93] Add OpenArchives OCR rollout plan --- .../openarchives_ocr_rollout_plan.md | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 docs/operations/openarchives_ocr_rollout_plan.md diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md new file mode 100644 index 0000000..9aa5424 --- /dev/null +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -0,0 +1,307 @@ +# OpenArchives OCR Rollout Plan + +This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. 
+ +## Current validated baseline + +- Validated OCR node type: `g7e.48xlarge` +- Validated AMI: `ami-052266c3e21dff7db` +- AMI name: `Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04) 20260320` +- Validated runtime stack on the OCR node: + - `torch 2.10.0+cu130` + - `vllm 0.18.0` + - `transformers 4.57.6` +- Standard DeepSeek settings: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` +- Restored clean benchmark on the stopped OCR box: + - `7,624` pages in about `541s` + - about `0.0710 sec/page` overall on one `8`-GPU node + - about `0.3927` to `0.5000 sec/page/GPU` +- Derived per-node throughput: + - about `14.08 pages/sec` + - about `50,700 pages/hour` + +## Current AWS capacity + +`us-east-1` service quotas currently allow: + +- `Running On-Demand G and VT instances = 768` +- `Running On-Demand Standard instances = 640` + +For the validated OCR node: + +- `g7e.48xlarge = 192 vCPU, 8 GPUs` + +So the current maximum concurrent validated OCR fleet is: + +- `floor(768 / 192) = 4` nodes +- total rollout capacity: `32 GPUs` + +## Phase 1: Recover or regenerate the canonical OCR routing state + +Goal: + +- produce one canonical `download_results/download_results.parquet` for the OpenArchives corpus root +- ensure it contains, at minimum: + - `filename` + - `needs_ocr` + - `greek_badness_score` + - `mojibake_badness_score` + - `ocr_success` + - `page_count` or `pages_total` + +Decision order: + +1. Check the stopped GPU OCR instance first. +2. If the full corpus parquet is not there, run a dedicated CPU cleaning pass. + +### 1A. Check the stopped OCR instance first + +Reason: + +- the NVMe volume persists across stop/start +- if the full OpenArchives cleaning pass was already run there, this is the fastest path + +Concrete steps: + +1. Start instance `i-0504a326a1fee541f`. +2. 
SSH in and search for the full OpenArchives corpus root and canonical parquet: + - `find /opt /data /home -name download_results.parquet` + - verify row count is the full OpenArchives set, not the `43`-document benchmark subset +3. Validate that the parquet has the required OCR routing columns listed above. +4. If found: + - copy the canonical parquet and any supporting cleaner outputs back to stable storage + - stage a copy on `home` + - upload the parquet artifact to the Hugging Face dataset repo as routing metadata + +Acceptance check: + +- row count matches the full OpenArchives working set +- `needs_ocr=True` count is available directly from the parquet +- page totals are available + +### 1B. Fallback: regenerate the routing state on a CPU instance + +If the OCR box does not contain the full canonical parquet: + +- launch a dedicated CPU node for the cleaner pass +- recommended instance family: `c7i` or `r7i` +- recommended first choice: `c7i.8xlarge` with sufficient gp3 storage for the OpenArchives markdown/output root + +Reason: + +- `Corpus.clean()` is CPU-bound and does not need GPUs +- we only need one clean, reproducible routing pass + +Concrete steps: + +1. Launch one Ubuntu 24.04 CPU instance. +2. Clone `glossapi-development` at `development`. +3. Bootstrap the standard GlossAPI environment. +4. Mount or sync the full OpenArchives corpus root. +5. Run `Corpus.clean()` over the full markdown corpus. +6. Verify that `download_results/download_results.parquet` now exists and includes the required OCR routing columns. +7. Store the resulting parquet: + - on the corpus root + - on `home` + - in the Hugging Face dataset repo as routing metadata + +## Phase 2: Quantify the actual OCR workload + +Once the canonical parquet exists: + +1. Filter `needs_ocr == True` +2. Count: + - total documents + - total pages from `pages_total` or `page_count` +3. 
Also record: + - `greek_badness_score > 60` + - `mojibake_badness_score > 0.1` + - overlap between those conditions and `needs_ocr` + +This step defines the real production workload and the true ETA. + +## Phase 3: Shard across nodes + +Shard across nodes by document, not by page range. + +Reason: + +- cross-node merge stays trivial +- node-local GPU scheduling already exists in GlossAPI +- splitting one document across nodes adds complexity without clear benefit + +### Coordinator manifest + +Build one coordinator manifest from the canonical parquet with: + +- `filename` +- stable OpenArchives document id or canonical filename +- `pages_total` +- `needs_ocr` + +Then: + +1. keep only `needs_ocr=True` +2. greedily bin-pack documents across `N=4` nodes by page count +3. write one shard manifest parquet per node + +Each shard manifest should contain: + +- `filename` +- `pages_total` +- `node_id` +- `shard_id` +- original metadata keys needed for rejoin + +### Node-local execution + +Each node: + +1. loads only its shard manifest +2. runs GlossAPI OCR over that subset +3. keeps standard GlossAPI outputs only: + - `markdown/<stem>.md` + - `json/metrics/*.json` + - shard-local `download_results.parquet` + +Inside each node: + +- use the existing GlossAPI DeepSeek path +- let node-local scheduling handle GPU balance +- do not invent a separate OCR metadata format + +## Phase 4: Merge back into the canonical corpus + +Merge rules: + +1. Markdown: + - copy updated `markdown/<stem>.md` into the canonical corpus root +2. Metrics: + - copy `json/metrics/*.json` into the canonical corpus root +3.
Metadata parquet: + - concatenate shard metadata + - upsert by canonical document id / filename into the master parquet + - preserve the standard GlossAPI contract: + - `needs_ocr` + - `ocr_success` + - `processing_stage` + - page and quality fields + +Recommended additional execution metadata: + +- `ocr_node_id` +- `ocr_shard_id` +- `ocr_started_at` +- `ocr_finished_at` +- `ocr_attempt_count` + +These fields are operational and should not replace the existing GlossAPI routing fields. + +## Phase 5: Standardize all OCR nodes + +All OCR nodes should use the exact same: + +- AMI +- bootstrap script +- DeepSeek venv setup +- model path +- runtime defaults + +Standard production recipe: + +- AMI: `ami-052266c3e21dff7db` +- instance type: `g7e.48xlarge` +- DeepSeek venv created by `dependency_setup/setup_deepseek_uv.sh` +- defaults: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` + +Do not allow per-node env drift during the rollout. + +## Phase 6: ETA + +Validated throughput on one node: + +- about `50,700 pages/hour` + +With `4` nodes: + +- about `202,800 pages/hour` + +Exact ETA formula: + +- `ETA_hours = total_needs_ocr_pages / 202800` + +Reference scenarios: + +- `400,000` pages: about `1.97h` +- `600,000` pages: about `2.96h` +- `800,000` pages: about `3.95h` +- `1,000,000` pages: about `4.93h` + +Equivalent document scenarios for `40,000` documents: + +- average `10` pages/doc: about `1.97h` +- average `15` pages/doc: about `2.96h` +- average `20` pages/doc: about `3.95h` +- average `25` pages/doc: about `4.93h` + +The exact ETA should be recalculated once the canonical parquet gives the real total page count for `needs_ocr=True`. + +## Phase 7: Deployment and monitoring + +### Deployment + +1. Produce canonical parquet +2. Compute shard manifests +3. Stage manifests and source data +4. Launch `4` OCR nodes +5. 
Bootstrap the same OCR environment on all nodes +6. Run one shard per node +7. Collect outputs +8. Merge back into the canonical corpus + +### Monitoring + +Each node should write a heartbeat JSON at a fixed interval with: + +- `node_id` +- `docs_done` +- `pages_done` +- current file +- GPU utilization snapshot +- VRAM usage snapshot +- last successful write time +- error count + +The coordinator should watch: + +- stale heartbeat +- zero progress +- failed OCR process +- low GPU utilization for a sustained period + +### Recovery + +- rerun only failed shard manifests +- keep shard manifests immutable +- merge is idempotent by canonical document id / filename + +## Immediate next actions + +1. Start the stopped OCR instance and search for the full OpenArchives canonical parquet. +2. If found, validate and upload the routing parquet to stable storage and Hugging Face. +3. If not found, launch one CPU instance and run the full `Corpus.clean()` pass. +4. Compute exact `needs_ocr` doc/page totals from the canonical parquet. +5. Generate the `4` node shard manifests. +6. Launch the `4` OCR nodes and execute the distributed run. 
From 39fe1c168c77e70e95967e8843aff16dd4984876 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 14:24:57 +0300 Subject: [PATCH 29/93] Add OpenArchives OCR shard and merge tooling --- .../openarchives_ocr_rollout_plan.md | 51 ++++ .../scripts/openarchives_ocr_merge.py | 62 +++++ .../scripts/openarchives_ocr_shards.py | 222 ++++++++++++++++++ tests/test_openarchives_ocr_shards.py | 80 +++++++ 4 files changed, 415 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_ocr_merge.py create mode 100644 src/glossapi/scripts/openarchives_ocr_shards.py create mode 100644 tests/test_openarchives_ocr_shards.py diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 9aa5424..789e357 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -2,6 +2,21 @@ This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. +## Implemented tooling + +The rollout is backed by concrete scripts in `src/glossapi/scripts/`: + +- `openarchives_ocr_shards.py` + - reads the canonical parquet + - filters `needs_ocr=True` + - balances documents across `N` nodes by page count + - writes one shard manifest parquet per node + - writes a JSON summary with page totals and ETA +- `openarchives_ocr_merge.py` + - merges shard-level OCR metadata back into the canonical parquet by `filename` + +These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. + ## Current validated baseline - Validated OCR node type: `g7e.48xlarge` @@ -228,6 +243,42 @@ Standard production recipe: Do not allow per-node env drift during the rollout. 
+Cleaner/fallback venv decision: + +- CPU cleaning pass should use the standard GlossAPI environment from `development` +- OCR nodes should use the dedicated DeepSeek venv only +- do not mix the cleaner runtime and the OCR runtime on the same benchmark measurement path + +## Instance options + +Primary OCR choice: + +- `g7e.48xlarge` + - validated benchmarked path + - `192 vCPU` + - `8` RTX PRO Server 6000 GPUs + - current recommended production OCR node + +Secondary OCR options, only if we intentionally rebenchmark: + +- `g6e.48xlarge` + - `192 vCPU` + - `8` L40S GPUs +- `g5.48xlarge` + - `192 vCPU` + - `8` A10G GPUs +- `p5.48xlarge` + - technically available, but not the cost/default target for this rollout + +Cleaner node options: + +- first choice: `c7i.8xlarge` + - `32 vCPU` + - good CPU-bound cleaner candidate +- alternative: `r7i.8xlarge` + - `32 vCPU` + - use if the cleaner pass needs more memory headroom + ## Phase 6: ETA Validated throughput on one node: diff --git a/src/glossapi/scripts/openarchives_ocr_merge.py b/src/glossapi/scripts/openarchives_ocr_merge.py new file mode 100644 index 0000000..a66f564 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_merge.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import List + +import pandas as pd + + +def _parse_args(argv: List[str] | None = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_merge", + description="Merge shard-level OCR metadata back into a canonical GlossAPI download_results parquet.", + ) + p.add_argument("--master-parquet", required=True) + p.add_argument("--shard-parquets", nargs="+", required=True) + p.add_argument("--output-parquet", required=True) + p.add_argument("--key-column", default="filename") + return p.parse_args(argv) + + +def _normalize_key(df: pd.DataFrame, key: str) -> pd.Series: + if key not in df.columns: + raise SystemExit(f"Key column 
def main(argv: List[str] | None = None) -> int:
    """Merge shard-level OCR metadata into the master parquet and write the result.

    Rows are matched on the stripped string form of ``--key-column``.  When the
    same key appears in several shard rows, the last one wins.  Every shard
    column (including columns the master does not have yet) is written onto the
    matching master rows; master rows without a shard match are left untouched.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the key column is missing from an input frame, or if a
            shard row refers to a key that does not exist in the master.
    """
    args = _parse_args(argv)
    key_column = str(args.key_column)
    master_path = Path(args.master_parquet).expanduser().resolve()
    out_path = Path(args.output_parquet).expanduser().resolve()

    master = pd.read_parquet(master_path).copy()
    master["_merge_key"] = _normalize_key(master, key_column)

    shard_frames: List[pd.DataFrame] = []
    for shard in args.shard_parquets:
        shard_df = pd.read_parquet(Path(shard).expanduser().resolve()).copy()
        shard_df["_merge_key"] = _normalize_key(shard_df, key_column)
        shard_frames.append(shard_df)
    shards = pd.concat(shard_frames, ignore_index=True)
    # Later shard files override earlier ones for the same document.
    shards = shards.drop_duplicates(subset=["_merge_key"], keep="last")

    master = master.set_index("_merge_key", drop=False)
    shards = shards.set_index("_merge_key", drop=False)

    # Assigning through .loc with labels that are absent from the master fails
    # with an opaque pandas KeyError; report the mismatch explicitly instead.
    unknown = shards.index.difference(master.index)
    if len(unknown) > 0:
        preview = ", ".join(str(key) for key in unknown[:5])
        raise SystemExit(
            f"{len(unknown)} shard row(s) have no match in the master parquet "
            f"on '{key_column}' (e.g. {preview}); refusing to merge."
        )

    for column in shards.columns:
        if column == "_merge_key":
            continue
        master.loc[shards.index, column] = shards[column]

    master = master.reset_index(drop=True).drop(columns=["_merge_key"], errors="ignore")
    # Create the destination only once there is something to write.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    master.to_parquet(out_path, index=False)
    print(
        f"Merged {len(shards)} shard row(s) into {master_path} -> {out_path}"
    )
    return 0
= argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_shards", + description="Create page-balanced OCR shard manifests from a canonical GlossAPI parquet.", + ) + p.add_argument("--parquet", required=True, help="Canonical download_results parquet with needs_ocr flags.") + p.add_argument("--output-dir", required=True, help="Directory where shard manifests and summaries will be written.") + p.add_argument("--nodes", type=int, default=4, help="Number of OCR nodes to shard across.") + p.add_argument( + "--pages-per-hour-per-node", + type=float, + default=50700.0, + help="Validated throughput per OCR node, used for ETA calculations.", + ) + p.add_argument("--filename-column", default="filename") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument( + "--page-column", + default=None, + help="Explicit page-count column. If omitted, the script searches common page columns.", + ) + p.add_argument( + "--copy-columns", + default="", + help="Comma-separated extra metadata columns to preserve in every shard manifest.", + ) + p.add_argument( + "--allow-threshold-derive", + action="store_true", + help="If needs_ocr is missing, derive the target set from greek/mojibake thresholds.", + ) + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + return p.parse_args(argv) + + +def _resolve_page_column(df: pd.DataFrame, explicit: Optional[str]) -> str: + if explicit: + if explicit not in df.columns: + raise SystemExit(f"--page-column '{explicit}' not found in parquet.") + return explicit + for candidate in PAGE_COLUMN_CANDIDATES: + if candidate in df.columns: + return candidate + raise SystemExit( + "No page-count column found. Expected one of: " + + ", ".join(PAGE_COLUMN_CANDIDATES) + + " or pass --page-column." 
+ ) + + +def _coerce_bool_series(series: pd.Series) -> pd.Series: + if series.dtype == bool: + return series.fillna(False) + lowered = series.astype(str).str.strip().str.lower() + return lowered.isin({"1", "true", "t", "yes", "y"}) + + +def _resolve_targets( + df: pd.DataFrame, + *, + needs_ocr_column: str, + allow_threshold_derive: bool, + greek_threshold: float, + mojibake_threshold: float, +) -> pd.Series: + if needs_ocr_column in df.columns: + return _coerce_bool_series(df[needs_ocr_column]) + if not allow_threshold_derive: + raise SystemExit( + f"Column '{needs_ocr_column}' not found and threshold derivation is disabled." + ) + greek = pd.to_numeric(df.get("greek_badness_score"), errors="coerce") + moj = pd.to_numeric(df.get("mojibake_badness_score"), errors="coerce") + if greek is None and moj is None: + raise SystemExit( + "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present." + ) + greek_mask = (greek > float(greek_threshold)).fillna(False) if greek is not None else False + moj_mask = (moj > float(mojibake_threshold)).fillna(False) if moj is not None else False + return greek_mask | moj_mask + + +def _page_int(value: object) -> int: + try: + return max(1, int(value)) + except Exception: + return 1 + + +def _make_node_bins(node_count: int) -> List[Dict[str, object]]: + return [ + { + "node_id": idx, + "pages_total": 0, + "docs_total": 0, + "rows": [], + } + for idx in range(max(1, int(node_count))) + ] + + +def _assign_rows(df: pd.DataFrame, *, page_column: str, node_count: int) -> List[Dict[str, object]]: + ordered = df.copy() + ordered["_pages_int"] = ordered[page_column].map(_page_int) + ordered = ordered.sort_values(["_pages_int"], ascending=[False]).reset_index(drop=True) + bins = _make_node_bins(node_count) + for row in ordered.to_dict(orient="records"): + node = min(bins, key=lambda item: (int(item["pages_total"]), int(item["node_id"]))) + row["node_id"] = int(node["node_id"]) + node["rows"].append(row) + 
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Write one page-balanced shard manifest per node plus a JSON summary.

    Reads the parquet given by ``--parquet``, selects the OCR target rows,
    distributes them across the nodes with ``_assign_rows`` and writes
    ``openarchives_ocr_shard_node_NN.parquet`` files together with
    ``openarchives_ocr_shard_summary.json`` into ``--output-dir``.  The summary
    is also printed to stdout.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: On a non-positive throughput value, a missing filename
            column, or an empty target selection (plus whatever the column
            resolution helpers raise).
    """
    args = _parse_args(argv)

    # The helpers clamp the node count to at least one; use the same effective
    # value everywhere so the summary agrees with the manifests written.
    node_count = max(1, int(args.nodes))
    pages_per_hour = float(args.pages_per_hour_per_node)
    if pages_per_hour <= 0:
        # Would otherwise surface as ZeroDivisionError (or negative ETAs).
        raise SystemExit("--pages-per-hour-per-node must be greater than zero.")

    parquet_path = Path(args.parquet).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_parquet(parquet_path)
    if args.filename_column not in df.columns:
        raise SystemExit(f"Filename column '{args.filename_column}' not found in parquet.")

    page_column = _resolve_page_column(df, args.page_column)
    target_mask = _resolve_targets(
        df,
        needs_ocr_column=str(args.needs_ocr_column),
        allow_threshold_derive=bool(args.allow_threshold_derive),
        greek_threshold=float(args.greek_threshold),
        mojibake_threshold=float(args.mojibake_threshold),
    )
    shard_df = df.loc[target_mask].copy()
    if shard_df.empty:
        raise SystemExit("No OCR target rows selected; shard manifests were not created.")

    copy_columns = [c.strip() for c in str(args.copy_columns or "").split(",") if c.strip()]
    selected_columns = [args.filename_column, page_column]
    optional_columns = [
        # Keep the routing flag even when it lives under a custom column name.
        str(args.needs_ocr_column),
        "needs_ocr",
        "greek_badness_score",
        "mojibake_badness_score",
        "ocr_success",
        "source_row",
        "document_type",
    ] + copy_columns
    for optional in optional_columns:
        if optional in shard_df.columns and optional not in selected_columns:
            selected_columns.append(optional)
    shard_df = shard_df[selected_columns].copy()

    bins = _assign_rows(shard_df, page_column=page_column, node_count=node_count)
    summaries: List[Dict[str, object]] = []
    total_pages = 0
    total_docs = 0
    for node in bins:
        node_id = int(node["node_id"])
        # Passing the column list keeps the manifest schema identical for
        # every node (including one that received no rows) and leaves out the
        # internal "_pages_int" helper column.
        node_df = pd.DataFrame(list(node["rows"]), columns=selected_columns)
        node_df["node_id"] = node_id
        node_df["shard_id"] = f"node-{node_id:02d}"
        out_path = output_dir / f"openarchives_ocr_shard_node_{node_id:02d}.parquet"
        node_df.to_parquet(out_path, index=False)

        node_pages = int(node["pages_total"])
        node_docs = int(node["docs_total"])
        total_pages += node_pages
        total_docs += node_docs
        summaries.append(
            {
                "node_id": node_id,
                "manifest_path": str(out_path),
                "docs_total": node_docs,
                "pages_total": node_pages,
                "eta_hours_at_validated_speed": float(node_pages / pages_per_hour),
            }
        )

    overall = {
        "source_parquet": str(parquet_path),
        "nodes": node_count,
        "filename_column": str(args.filename_column),
        "page_column": str(page_column),
        "docs_total": int(total_docs),
        "pages_total": int(total_pages),
        "pages_per_hour_per_node": pages_per_hour,
        "eta_hours_one_node": float(total_pages / pages_per_hour),
        "eta_hours_all_nodes": float(total_pages / (pages_per_hour * node_count)),
        "node_summaries": summaries,
    }
    (output_dir / "openarchives_ocr_shard_summary.json").write_text(
        json.dumps(overall, indent=2),
        encoding="utf-8",
    )
    print(json.dumps(overall, indent=2))
    return 0
{"filename": "skip.pdf", "needs_ocr": False, "pages_total": 999}, + ] + ) + source = tmp_path / "download_results.parquet" + out_dir = tmp_path / "shards" + df.to_parquet(source, index=False) + + rc = openarchives_ocr_shards.main( + [ + "--parquet", + str(source), + "--output-dir", + str(out_dir), + "--nodes", + "2", + ] + ) + assert rc == 0 + + summary = json.loads((out_dir / "openarchives_ocr_shard_summary.json").read_text()) + assert summary["docs_total"] == 4 + assert summary["pages_total"] == 260 + manifests = sorted(out_dir.glob("openarchives_ocr_shard_node_*.parquet")) + assert len(manifests) == 2 + page_totals = [int(pd.read_parquet(path)["pages_total"].sum()) for path in manifests] + assert max(page_totals) - min(page_totals) <= 20 + + +def test_openarchives_ocr_merge_updates_master(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "ocr_success": False}, + {"filename": "b.pdf", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": False, "ocr_success": True, "ocr_node_id": 2}, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("filename") + assert bool(merged.loc["a.pdf", "ocr_success"]) is True + assert bool(merged.loc["a.pdf", "needs_ocr"]) is False + assert int(merged.loc["a.pdf", "ocr_node_id"]) == 2 + assert bool(merged.loc["b.pdf", "ocr_success"]) is False From 13710404719393fbbf4ad8846c1bad3ace75b627 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 14:27:24 +0300 Subject: [PATCH 30/93] Record OpenArchives parquet recovery 
result --- docs/operations/openarchives_ocr_rollout_plan.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 789e357..1272fe1 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -100,6 +100,13 @@ Acceptance check: - `needs_ocr=True` count is available directly from the parquet - page totals are available +Current state on 2026-03-31: + +- checked OCR instance `i-0504a326a1fee541f` +- no `download_results.parquet` was found under `/opt`, `/data`, or `/home` +- therefore this path did not recover the canonical OpenArchives routing parquet +- the rollout should proceed with the CPU cleaning-pass fallback below + ### 1B. Fallback: regenerate the routing state on a CPU instance If the OCR box does not contain the full canonical parquet: From 151105cd9e4b5cc940b29782a9ba1d9b89213bf7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 15:33:47 +0300 Subject: [PATCH 31/93] Add OpenArchives OCR enrichment manifest tooling --- .../openarchives_ocr_rollout_plan.md | 43 ++++ .../scripts/openarchives_ocr_enrich.py | 226 ++++++++++++++++++ .../scripts/openarchives_ocr_shards.py | 2 + tests/test_openarchives_ocr_enrich.py | 144 +++++++++++ 4 files changed, 415 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_ocr_enrich.py create mode 100644 tests/test_openarchives_ocr_enrich.py diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 1272fe1..6cb04ad 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -6,6 +6,11 @@ This document records the concrete execution plan for running DeepSeek OCR over The rollout is backed by concrete scripts in `src/glossapi/scripts/`: +- `openarchives_ocr_enrich.py` + - reads the canonical OpenArchives parquet + - 
scans raw HF JSONL shards for the target docs + - extracts `page_count_source`, `pages_total_source`, and `pdf_url` + - writes a shard-ready enriched parquet for OCR deployment - `openarchives_ocr_shards.py` - reads the canonical parquet - filters `needs_ocr=True` @@ -17,6 +22,44 @@ The rollout is backed by concrete scripts in `src/glossapi/scripts/`: These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. +## Executed result on 2026-03-31 + +The CPU fallback path has now been executed successfully on AWS: + +- CPU cleaner node: + - instance: `c7i.8xlarge` + - instance id: `i-0ccf5ab1a510b31d8` +- Full OA reevaluation fill: + - input rows: `179,845` + - missing `greek_badness_score` rows materialized and cleaned: `89,892` + - unique raw JSONL shards needed for the fill subset: `108` +- Filled routing result: + - `greek_badness_score` coverage: `179,845 / 179,845` + - `needs_ocr == true`: `45,547` +- Enriched OCR target manifest: + - OCR-target docs: `45,547` + - OCR-target pages: `3,292,392` + - raw JSONL shards needed for the full OCR target set: `218` +- Balanced 4-node shard result: + - `4` shard manifests + - `823,098` pages per node + - `11,386` or `11,387` docs per node +- ETA from validated `g7e.48xlarge` throughput: + - one node: `64.94h` + - four nodes: `16.23h` + +Published artifacts on Hugging Face dataset `glossAPI/openarchives.gr`: + +- `data/openarchives_ocr_completion/20260331/summary.json` +- `data/openarchives_ocr_completion/20260331/filled_document_level.parquet` +- `data/openarchives_ocr_completion/20260331/filled_document_quality.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/needs_ocr_enriched.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_00.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_01.parquet` +- 
`data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_02.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_03.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_summary.json` + ## Current validated baseline - Validated OCR node type: `g7e.48xlarge` diff --git a/src/glossapi/scripts/openarchives_ocr_enrich.py b/src/glossapi/scripts/openarchives_ocr_enrich.py new file mode 100644 index 0000000..7bfd767 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_enrich.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import argparse +import io +import json +from pathlib import Path +from typing import Dict, Iterable, Optional, Sequence + +import pandas as pd +import zstandard as zstd + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_enrich", + description="Enrich OpenArchives OCR routing rows with page counts and PDF URLs from raw JSONL shards.", + ) + p.add_argument("--parquet", required=True, help="Canonical parquet after OpenArchives cleaning/fill.") + p.add_argument("--raw-repo-root", required=True, help="Local root of the raw HF OpenArchives dataset.") + p.add_argument("--output-parquet", required=True, help="Where the enriched parquet will be written.") + p.add_argument("--filename-column", default="filename") + p.add_argument("--doc-id-column", default="source_doc_id") + p.add_argument("--source-jsonl-column", default="source_jsonl") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument( + "--allow-threshold-derive", + action="store_true", + help="If needs_ocr is missing, derive targets from greek/mojibake thresholds.", + ) + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + return p.parse_args(argv) + + +def 
_coerce_bool_series(series: pd.Series) -> pd.Series: + if series.dtype == bool: + return series.fillna(False) + lowered = series.astype(str).str.strip().str.lower() + return lowered.isin({"1", "true", "t", "yes", "y"}) + + +def _resolve_targets( + df: pd.DataFrame, + *, + needs_ocr_column: str, + allow_threshold_derive: bool, + greek_threshold: float, + mojibake_threshold: float, +) -> pd.Series: + if needs_ocr_column in df.columns: + return _coerce_bool_series(df[needs_ocr_column]) + if not allow_threshold_derive: + raise SystemExit( + f"Column '{needs_ocr_column}' not found and threshold derivation is disabled." + ) + greek = pd.to_numeric(df.get("greek_badness_score"), errors="coerce") + moj = pd.to_numeric(df.get("mojibake_badness_score"), errors="coerce") + if greek is None and moj is None: + raise SystemExit( + "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present." + ) + greek_mask = (greek > float(greek_threshold)).fillna(False) if greek is not None else False + moj_mask = (moj > float(mojibake_threshold)).fillna(False) if moj is not None else False + return greek_mask | moj_mask + + +def _resolve_jsonl_path(raw_repo_root: Path, recorded_path: str) -> Path: + candidate = Path(recorded_path) + if candidate.exists(): + return candidate + + marker = "data/openarchives/" + text = str(recorded_path) + idx = text.find(marker) + if idx != -1: + rel = Path(text[idx:]) + rewritten = raw_repo_root / rel + if rewritten.exists(): + return rewritten + + name = Path(recorded_path).name + matches = list((raw_repo_root / "data" / "openarchives").glob(f"**/{name}")) + if len(matches) == 1: + return matches[0] + raise FileNotFoundError(f"could not resolve JSONL path for {recorded_path}") + + +def _pick_pdf_url(source_meta: dict) -> str: + for key in ("refined_pdf_links_json", "pdf_links_json"): + value = source_meta.get(key) + url = _normalize_pdf_link(value) + if url: + return url + for key in ("external_link", "handle_url", "url"): + 
value = source_meta.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + return "" + + +def _normalize_pdf_link(value: object) -> str: + if value is None: + return "" + if isinstance(value, str): + text = value.strip() + if not text: + return "" + if text.startswith("http://") or text.startswith("https://"): + return text + try: + parsed = json.loads(text) + except Exception: + return text + return _normalize_pdf_link(parsed) + if isinstance(value, list): + for item in value: + normalized = _normalize_pdf_link(item) + if normalized: + return normalized + return "" + if isinstance(value, dict): + for key in ("url", "href", "pdf_url", "link"): + if key in value: + normalized = _normalize_pdf_link(value[key]) + if normalized: + return normalized + return "" + return "" + + +def _coerce_page_count(value: object) -> Optional[int]: + if value is None: + return None + try: + return max(1, int(float(value))) + except Exception: + return None + + +def _enrich_targets( + targets: pd.DataFrame, + *, + raw_repo_root: Path, + doc_id_column: str, + source_jsonl_column: str, +) -> pd.DataFrame: + work = targets.copy() + work["_resolved_jsonl"] = work[source_jsonl_column].map( + lambda p: str(_resolve_jsonl_path(raw_repo_root, str(p))) + ) + grouped: Dict[str, Dict[str, int]] = {} + for row_index, row in work[[doc_id_column, "_resolved_jsonl"]].iterrows(): + grouped.setdefault(str(row["_resolved_jsonl"]), {})[str(row[doc_id_column])] = int(row_index) + + dctx = zstd.ZstdDecompressor() + for jsonl_path, doc_map in grouped.items(): + with Path(jsonl_path).open("rb") as fh, dctx.stream_reader(fh) as reader: + text_reader = io.TextIOWrapper(reader, encoding="utf-8") + for line in text_reader: + record = json.loads(line) + doc_id = str(record.get("doc_id") or "") + row_index = doc_map.get(doc_id) + if row_index is None: + continue + pipeline = record.get("pipeline_metadata") or {} + source_meta = record.get("source_metadata") or {} + page_count = 
_coerce_page_count(pipeline.get("page_count")) + pages_total = _coerce_page_count(pipeline.get("pages_total")) + if page_count is None: + page_count = pages_total + if pages_total is None: + pages_total = page_count + work.at[row_index, "page_count_source"] = page_count + work.at[row_index, "pages_total_source"] = pages_total + work.at[row_index, "pdf_url"] = _pick_pdf_url(source_meta) + work.at[row_index, "source_collection_slug"] = source_meta.get("collection_slug") or "" + work.at[row_index, "source_language_code"] = source_meta.get("language_code") or "" + + return work.drop(columns=["_resolved_jsonl"]) + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + raw_repo_root = Path(args.raw_repo_root).expanduser().resolve() + output_path = Path(args.output_parquet).expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(parquet_path) + for required in (args.filename_column, args.doc_id_column, args.source_jsonl_column): + if required not in df.columns: + raise SystemExit(f"Required column '{required}' not found in parquet.") + + target_mask = _resolve_targets( + df, + needs_ocr_column=str(args.needs_ocr_column), + allow_threshold_derive=bool(args.allow_threshold_derive), + greek_threshold=float(args.greek_threshold), + mojibake_threshold=float(args.mojibake_threshold), + ) + targets = df.loc[target_mask].copy() + if targets.empty: + raise SystemExit("No OCR target rows selected; enriched parquet was not created.") + + enriched_targets = _enrich_targets( + targets, + raw_repo_root=raw_repo_root, + doc_id_column=str(args.doc_id_column), + source_jsonl_column=str(args.source_jsonl_column), + ) + + enriched_targets.to_parquet(output_path, index=False) + summary = { + "source_parquet": str(parquet_path), + "output_parquet": str(output_path), + "target_docs": int(len(enriched_targets)), + "page_count_source_non_null": 
int(enriched_targets["page_count_source"].notna().sum()), + "pdf_url_non_empty": int(enriched_targets["pdf_url"].fillna("").astype(str).str.len().gt(0).sum()), + "pages_total_sum": int(pd.to_numeric(enriched_targets["page_count_source"], errors="coerce").fillna(0).sum()), + } + print(json.dumps(summary, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_shards.py b/src/glossapi/scripts/openarchives_ocr_shards.py index c85a718..e68833c 100644 --- a/src/glossapi/scripts/openarchives_ocr_shards.py +++ b/src/glossapi/scripts/openarchives_ocr_shards.py @@ -9,6 +9,8 @@ PAGE_COLUMN_CANDIDATES: Sequence[str] = ( + "page_count_source", + "pages_total_source", "pages_total", "page_count", "total_pages", diff --git a/tests/test_openarchives_ocr_enrich.py b/tests/test_openarchives_ocr_enrich.py new file mode 100644 index 0000000..16d683a --- /dev/null +++ b/tests/test_openarchives_ocr_enrich.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows).encode("utf-8") + cctx = zstd.ZstdCompressor() + path.write_bytes(cctx.compress(payload)) + + +def test_openarchives_ocr_enrich_extracts_page_counts_and_pdf_url(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + jsonl_path = raw_root / "data" / "openarchives" / "shard_01" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "pipeline_metadata": {"page_count": 98, "pages_total": 98}, + "source_metadata": { + "pdf_links_json": "https://example.com/a.pdf", + "collection_slug": "Dione", + "language_code": 
"el", + }, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "pipeline_metadata": {"pages_total": 12}, + "source_metadata": { + "pdf_links_json": json.dumps( + [ + {"url": "https://example.com/b.pdf"}, + {"url": "https://example.com/b2.pdf"}, + ] + ), + "collection_slug": "Pandemos", + "language_code": "el", + }, + }, + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "filename": "AAA_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-b", + "filename": "BBB_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-c", + "filename": "CCC_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": False, + }, + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output).sort_values("filename").reset_index(drop=True) + assert enriched["filename"].tolist() == ["AAA_000.pdf", "BBB_000.pdf"] + assert enriched["page_count_source"].tolist() == [98, 12] + assert enriched["pages_total_source"].tolist() == [98, 12] + assert enriched["pdf_url"].tolist() == ["https://example.com/a.pdf", "https://example.com/b.pdf"] + assert enriched["source_collection_slug"].tolist() == ["Dione", "Pandemos"] + + +def test_openarchives_ocr_enrich_resolves_rewritten_source_jsonl_path(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + jsonl_path = raw_root / "data" / "openarchives" / "shard_02" / "chunk-001.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-x", + "filename": "XXX_000", + "text": "x", + "pipeline_metadata": {"page_count": 7}, + "source_metadata": {"external_link": "https://example.com/x"}, + } + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + 
{ + "source_doc_id": "doc-x", + "filename": "XXX_000.pdf", + "source_jsonl": "/home/foivos/data/glossapi_raw/hf/openarchives.gr/data/openarchives/shard_02/chunk-001.jsonl.zst", + "needs_ocr": True, + } + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output) + assert int(enriched.loc[0, "page_count_source"]) == 7 + assert enriched.loc[0, "pdf_url"] == "https://example.com/x" From 4e506d157776b0ea7cdbaa6513e12803818b7ab1 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 15:50:49 +0300 Subject: [PATCH 32/93] Add OpenArchives OCR node runner --- .../openarchives_ocr_rollout_plan.md | 43 +++ .../scripts/openarchives_ocr_run_node.py | 312 ++++++++++++++++++ tests/test_openarchives_ocr_run_node.py | 55 +++ 3 files changed, 410 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_ocr_run_node.py create mode 100644 tests/test_openarchives_ocr_run_node.py diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 6cb04ad..09dc3b2 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -60,6 +60,49 @@ Published artifacts on Hugging Face dataset `glossAPI/openarchives.gr`: - `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_03.parquet` - `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_summary.json` +## Node runner contract + +Each OCR node should materialize one shard into its own GlossAPI corpus root and +run DeepSeek OCR through the standard `Corpus.ocr(...)` API, not through a +standalone benchmark wrapper. + +Stored runner: + +- `python -m glossapi.scripts.openarchives_ocr_run_node` + +The runner does four things in order: + +1. 
reads one shard parquet +2. downloads the shard PDFs into `downloads/` using their OA filenames +3. writes the shard metadata as canonical `download_results/download_results.parquet` +4. runs `Corpus.ocr(...)` with the validated DeepSeek settings + +Standard node command: + +```bash +PYTHONPATH=src /home/ubuntu/venvs/deepseek/bin/python -m glossapi.scripts.openarchives_ocr_run_node \ + --shard-parquet /data/openarchives/shards/openarchives_ocr_shard_node_00.parquet \ + --work-root /data/openarchives/node_00 \ + --heartbeat-path /data/openarchives/heartbeats/node_00.json \ + --instance-id "$INSTANCE_ID" \ + --node-id node-00 \ + --scheduler whole_doc \ + --runtime-backend vllm \ + --ocr-profile markdown_grounded \ + --render-dpi 144 \ + --max-new-tokens 2048 \ + --repair-mode auto \ + --gpu-memory-utilization 0.9 +``` + +Current rollout note: + +- use `scheduler=whole_doc` for the first production OA pass because that is the + last large-run configuration validated cleanly on the standardized stack +- keep `exact_fill` as the next benchmarking target, but do not silently switch + the production rollout to it until the same stack shows a non-regression or + improvement + ## Current validated baseline - Validated OCR node type: `g7e.48xlarge` diff --git a/src/glossapi/scripts/openarchives_ocr_run_node.py b/src/glossapi/scripts/openarchives_ocr_run_node.py new file mode 100644 index 0000000..7ebbe39 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +import argparse +import json +import logging +import os +import socket +import threading +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from glossapi import Corpus +from glossapi.parquet_schema import ParquetSchema + + +DEFAULT_DOWNLOAD_CONCURRENCY = 24 +DEFAULT_DOWNLOAD_TIMEOUT = 60 +DEFAULT_HEARTBEAT_INTERVAL = 60 + + +def _parse_args(argv: Optional[List[str]] = None) -> 
argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_run_node", + description=( + "Materialize one OpenArchives OCR shard into a normal GlossAPI corpus root, " + "download its PDFs, and run DeepSeek OCR with the standardized settings." + ), + ) + p.add_argument("--shard-parquet", required=True) + p.add_argument("--work-root", required=True) + p.add_argument("--python-log-level", default="INFO") + p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) + p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-group-by", default="repository_collection") + p.add_argument("--heartbeat-path") + p.add_argument("--heartbeat-interval", type=int, default=DEFAULT_HEARTBEAT_INTERVAL) + p.add_argument("--instance-id", default="") + p.add_argument("--node-id", default="") + p.add_argument("--dry-run", action="store_true") + p.add_argument("--scheduler", default="whole_doc") + p.add_argument("--target-batch-pages", type=int, default=160) + p.add_argument("--shard-pages", type=int, default=0) + p.add_argument("--shard-threshold-pages", type=int, default=0) + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--runtime-backend", default="vllm") + p.add_argument("--ocr-profile", default="markdown_grounded") + p.add_argument("--max-new-tokens", type=int, default=2048) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--repair-mode", default="auto") + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + return p.parse_args(argv) + + +def _hostname() -> str: + try: + return socket.gethostname() + except Exception: + return "" + + +def _atomic_write_json(path: Path, payload: Dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + 
os.replace(tmp, path) + + +def _prepare_download_input(df: pd.DataFrame) -> pd.DataFrame: + required = {"filename", "pdf_url"} + missing = sorted(required - set(df.columns)) + if missing: + raise SystemExit(f"Shard parquet missing required column(s): {', '.join(missing)}") + out = df.copy() + out["url"] = out["pdf_url"].astype(str) + out["filename_base"] = out["filename"].astype(str).map(lambda s: Path(s).stem) + return out + + +def _load_frame(path: Path) -> pd.DataFrame: + return pd.read_parquet(path).copy() + + +def _normalize_download_results( + *, + shard_df: pd.DataFrame, + download_results_df: pd.DataFrame, + url_column: str = "url", +) -> pd.DataFrame: + shard = shard_df.copy() + if "filename_base" not in shard.columns: + shard["filename_base"] = shard["filename"].astype(str).map(lambda s: Path(s).stem) + + dl = download_results_df.copy() + if "filename_base" not in dl.columns: + dl["filename_base"] = dl["filename"].astype(str).map(lambda s: Path(s).stem) + + merged = dl.merge( + shard, + on="filename_base", + how="left", + suffixes=("", "_shard"), + ) + if "filename_shard" in merged.columns: + merged["filename"] = merged["filename_shard"].fillna(merged["filename"]) + merged = merged.drop(columns=["filename_shard"]) + if "pdf_url" in merged.columns and url_column in merged.columns: + merged[url_column] = merged["pdf_url"].fillna(merged[url_column]) + elif "pdf_url" in merged.columns and url_column not in merged.columns: + merged[url_column] = merged["pdf_url"] + if "download_success" not in merged.columns: + merged["download_success"] = False + if "download_error" not in merged.columns: + merged["download_error"] = "" + if "ocr_success" not in merged.columns: + merged["ocr_success"] = False + if "needs_ocr" not in merged.columns: + merged["needs_ocr"] = True + return merged + + +def _write_canonical_metadata(work_root: Path, df: pd.DataFrame) -> Path: + schema = ParquetSchema({"url_column": "url"}) + canonical = work_root / "download_results" / 
"download_results.parquet" + canonical.parent.mkdir(parents=True, exist_ok=True) + normalized = schema.normalize_metadata_frame(df) + schema.write_metadata_parquet(normalized, canonical) + return canonical + + +def _read_progress(parquet_path: Path, page_col: str = "page_count_source") -> Dict[str, Any]: + try: + df = pd.read_parquet(parquet_path) + except Exception as exc: + return {"parquet_error": str(exc)} + total_docs = int(len(df)) + docs_done = int(df.get("ocr_success", pd.Series(dtype=bool)).fillna(False).sum()) if "ocr_success" in df.columns else 0 + total_pages = 0 + pages_done = 0 + if page_col in df.columns: + page_values = pd.to_numeric(df[page_col], errors="coerce").fillna(0) + total_pages = int(page_values.sum()) + if "ocr_success" in df.columns: + pages_done = int(page_values[df["ocr_success"].fillna(False)].sum()) + return { + "docs_total": total_docs, + "docs_done": docs_done, + "pages_total": total_pages, + "pages_done": pages_done, + } + + +class _HeartbeatThread(threading.Thread): + def __init__( + self, + *, + heartbeat_path: Path, + interval: int, + parquet_path: Path, + context: Dict[str, Any], + ) -> None: + super().__init__(daemon=True) + self.heartbeat_path = heartbeat_path + self.interval = max(10, int(interval)) + self.parquet_path = parquet_path + self.context = dict(context) + self.stage = "init" + self.error = "" + self.stop_event = threading.Event() + self.started_at = time.time() + + def set_stage(self, stage: str) -> None: + self.stage = str(stage) + + def set_error(self, error: str) -> None: + self.error = str(error) + + def stop(self) -> None: + self.stop_event.set() + + def _payload(self) -> Dict[str, Any]: + payload = dict(self.context) + payload.update( + { + "timestamp": int(time.time()), + "hostname": _hostname(), + "stage": self.stage, + "error": self.error, + "uptime_sec": round(time.time() - self.started_at, 1), + "parquet_path": str(self.parquet_path), + } + ) + payload.update(_read_progress(self.parquet_path)) + return 
payload + + def run(self) -> None: + while not self.stop_event.is_set(): + try: + _atomic_write_json(self.heartbeat_path, self._payload()) + except Exception: + pass + self.stop_event.wait(self.interval) + try: + _atomic_write_json(self.heartbeat_path, self._payload()) + except Exception: + pass + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + shard_path = Path(args.shard_parquet).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + work_root.mkdir(parents=True, exist_ok=True) + manifests_dir = work_root / "manifests" + manifests_dir.mkdir(parents=True, exist_ok=True) + + shard_df = _prepare_download_input(_load_frame(shard_path)) + download_input = manifests_dir / "download_input.parquet" + shard_df.to_parquet(download_input, index=False) + + metadata_path = work_root / "download_results" / "download_results.parquet" + if not metadata_path.exists(): + metadata_path.parent.mkdir(parents=True, exist_ok=True) + _write_canonical_metadata(work_root, shard_df) + + heartbeat: Optional[_HeartbeatThread] = None + if args.heartbeat_path: + heartbeat = _HeartbeatThread( + heartbeat_path=Path(args.heartbeat_path).expanduser().resolve(), + interval=int(args.heartbeat_interval), + parquet_path=metadata_path, + context={ + "instance_id": str(args.instance_id or ""), + "node_id": str(args.node_id or ""), + "shard_parquet": str(shard_path), + "work_root": str(work_root), + }, + ) + heartbeat.start() + + try: + if args.dry_run: + if heartbeat: + heartbeat.set_stage("dry_run") + return 0 + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + + if heartbeat: + heartbeat.set_stage("download") + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), 
+ request_timeout=int(args.download_timeout), + ) + canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") + metadata_path = _write_canonical_metadata(work_root, canonical_df) + if heartbeat: + heartbeat.parquet_path = metadata_path + heartbeat.set_stage("ocr") + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + corpus.ocr( + fix_bad=True, + mode="ocr_bad", + backend="deepseek", + runtime_backend=str(args.runtime_backend), + ocr_profile=str(args.ocr_profile), + use_gpus="multi", + workers_per_gpu=int(args.workers_per_gpu), + render_dpi=int(args.render_dpi), + max_new_tokens=int(args.max_new_tokens), + repair_mode=str(args.repair_mode), + scheduler=str(args.scheduler), + target_batch_pages=int(args.target_batch_pages), + shard_pages=int(args.shard_pages), + shard_threshold_pages=int(args.shard_threshold_pages), + gpu_memory_utilization=float(args.gpu_memory_utilization), + math_enhance=False, + ) + if heartbeat: + heartbeat.set_stage("done") + return 0 + except Exception as exc: + if heartbeat: + heartbeat.set_stage("failed") + heartbeat.set_error(str(exc)) + raise + finally: + if heartbeat: + heartbeat.stop() + heartbeat.join(timeout=5) + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/test_openarchives_ocr_run_node.py b/tests/test_openarchives_ocr_run_node.py new file mode 100644 index 0000000..0b66d52 --- /dev/null +++ b/tests/test_openarchives_ocr_run_node.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_ocr_run_node import ( + _normalize_download_results, + _prepare_download_input, +) + + +def test_prepare_download_input_adds_url_and_filename_base() -> None: + df = pd.DataFrame( + [ + { + "filename": 
"ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ) + out = _prepare_download_input(df) + assert out.loc[0, "url"] == "https://example.com/a.pdf" + assert out.loc[0, "filename_base"] == "ABC_001" + + +def test_normalize_download_results_preserves_shard_filename_and_metadata() -> None: + shard = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "filename_base": "ABC_001", + "needs_ocr": True, + "source_doc_id": "doc-1", + } + ] + ) + dl = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "filename_base": "ABC_001", + "download_success": True, + "download_error": "", + "url": "https://example.com/a.pdf", + } + ] + ) + out = _normalize_download_results(shard_df=shard, download_results_df=dl) + assert out.loc[0, "filename"] == "ABC_001.pdf" + assert out.loc[0, "source_doc_id"] == "doc-1" + assert bool(out.loc[0, "download_success"]) is True + assert bool(out.loc[0, "needs_ocr"]) is True From b4086f2e2daf37f442348f9ec85bbf0d6c6455cb Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 15:54:37 +0300 Subject: [PATCH 33/93] Align DeepSeek runtime pins with validated OCR nodes --- dependency_setup/deepseek_uv/pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml index 0bfebb2..6f7ffe0 100644 --- a/dependency_setup/deepseek_uv/pyproject.toml +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -5,9 +5,9 @@ description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" requires-python = ">=3.11,<3.13" dependencies = [ "glossapi[deepseek]", - "torch==2.9.1", - "torchvision==0.24.1", - "torchaudio==2.9.1", + "torch==2.10.0", + "torchvision==0.25.0", + "torchaudio==2.10.0", ] [dependency-groups] From 03c28234899551d624b870085f248aff9c8ba97a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 15:59:25 +0300 Subject: [PATCH 34/93] Update 
DeepSeek extra to validated vLLM stack --- pyproject.toml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f1613ad..a296c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,11 +52,12 @@ cuda = [ ] # DeepSeek OCR backend extras (Torch should be installed from the PyTorch index). deepseek = [ - "transformers==4.46.3", - "tokenizers==0.20.3", + "vllm==0.18.0", + "transformers==4.57.6", + "tokenizers==0.22.2", "accelerate>=1.2.1,<2", "pymupdf==1.24.10", - "Pillow==10.4.0", + "Pillow==12.1.1", "img2pdf>=0.5.1", "einops", "easydict", From 5d2a386caf96eeea3abb613ad185c8bcc59d18ef Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 16:40:59 +0300 Subject: [PATCH 35/93] Add host-aware OpenArchives download policy overrides --- .../openarchives_ocr_rollout_plan.md | 17 +++ src/glossapi/download_policy.py | 10 ++ src/glossapi/gloss_downloader.py | 119 ++++++++++++++---- tests/test_browser_gloss_downloader.py | 79 ++++++++++++ 4 files changed, 200 insertions(+), 25 deletions(-) diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 09dc3b2..ee659c0 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -77,6 +77,23 @@ The runner does four things in order: 3. writes the shard metadata as canonical `download_results/download_results.parquet` 4. runs `Corpus.ocr(...)` with the validated DeepSeek settings +Download policy note: + +- OpenArchives download should be host-first, not collection-first. 
+- GlossAPI now supports host-specific download policy overrides in the normal downloader path for: + - `downloader` + - `request_timeout` + - `ssl_verify` + - `ssl_cafile` + - `request_method` + - `sleep` + - `per_domain_concurrency` + - `domain_concurrency_floor` + - `domain_concurrency_ceiling` + - `skip_failed_after` + - `domain_cookies` +- That means the OA freeze-download phase can stay inside `Corpus.download(...)`; we do not need a separate downloader implementation. + Standard node command: ```bash diff --git a/src/glossapi/download_policy.py b/src/glossapi/download_policy.py index f42e043..36d3ce6 100644 --- a/src/glossapi/download_policy.py +++ b/src/glossapi/download_policy.py @@ -12,6 +12,16 @@ VALID_DOWNLOADERS = {"standard", "browser", "auto"} ROUTE_OPTION_KEYS = { + "request_timeout", + "ssl_verify", + "ssl_cafile", + "request_method", + "sleep", + "per_domain_concurrency", + "domain_concurrency_floor", + "domain_concurrency_ceiling", + "skip_failed_after", + "domain_cookies", "browser_timeout_ms", "browser_post_load_wait_ms", "browser_engine", diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index ffce858..b1b6c61 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -534,12 +534,47 @@ def _extract_base_domain(self, url: str) -> str: except Exception: return '' + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + return "standard", {} + + def _route_setting(self, route_options: Optional[Dict[str, Any]], name: str, fallback: Any) -> Any: + if route_options and name in route_options: + return route_options[name] + return fallback + + def _resolve_domain_scheduler_settings( + self, + route_options: Optional[Dict[str, Any]], + ) -> tuple[int, int, int, int]: + floor = max( + 1, + int(self._route_setting(route_options, "domain_concurrency_floor", self.domain_concurrency_floor)), + ) + raw_ceiling = self._route_setting(route_options, "domain_concurrency_ceiling", 
self.domain_concurrency_ceiling) + if raw_ceiling is None: + ceiling = max(floor, int(self.domain_concurrency_ceiling)) + else: + ceiling = max(floor, int(raw_ceiling)) + start = max( + floor, + min( + int(self._route_setting(route_options, "per_domain_concurrency", self.per_domain_concurrency)), + max(1, self.concurrency), + ceiling, + ), + ) + skip_after = max(1, int(self._route_setting(route_options, "skip_failed_after", self.skip_failed_after))) + return floor, ceiling, start, skip_after + @dataclass class _DomainState: base: str queue: deque = field(default_factory=deque) active: int = 0 concurrency: int = 1 + concurrency_floor: int = 1 + concurrency_ceiling: int = 1 + skip_failed_after: int = 3 successes: int = 0 failures: int = 0 http_429: int = 0 @@ -896,7 +931,7 @@ def _build_request_headers(self, url: str, user_agent: str, referer: Optional[st 'DNT': '1' } - def _resolve_request_cookies(self, url: str) -> Dict[str, str]: + def _resolve_request_cookies(self, url: str, route_options: Optional[Dict[str, Any]] = None) -> Dict[str, str]: cookies: Dict[str, str] = {} for domain_pattern, domain_cookies in self.domain_cookies.items(): if domain_pattern in url: @@ -907,25 +942,39 @@ def _resolve_request_cookies(self, url: str) -> Dict[str, str]: # Replace with an actual random value (only supporting this pattern for now) if 'session-id' in str(value): cookies[key] = f"session-id-{random.randint(100000000, 999999999)}" + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict): + cookies.update({str(k): str(v) for k, v in extra_cookies.items()}) return cookies - def _build_request_timeout(self, retry_count: int) -> aiohttp.ClientTimeout: + def _build_request_timeout( + self, + retry_count: int, + route_options: Optional[Dict[str, Any]] = None, + ) -> aiohttp.ClientTimeout: + base_request_timeout = float(self._route_setting(route_options, "request_timeout", self.request_timeout)) return aiohttp.ClientTimeout( - 
total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes + total=min(base_request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes ) - def _build_session_connector(self, url: str) -> Optional[aiohttp.TCPConnector]: + def _build_session_connector( + self, + url: str, + route_options: Optional[Dict[str, Any]] = None, + ) -> Optional[aiohttp.TCPConnector]: connector = None url_base = self._extract_base_domain(url) force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) - if (not self.ssl_verify) or force_insecure: + ssl_verify = bool(self._route_setting(route_options, "ssl_verify", self.ssl_verify)) + ssl_cafile = self._route_setting(route_options, "ssl_cafile", self.ssl_cafile) + if (not ssl_verify) or force_insecure: connector = aiohttp.TCPConnector(ssl=False) - elif self.ssl_cafile: + elif ssl_cafile: import ssl as _ssl - ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + ctx = _ssl.create_default_context(cafile=str(ssl_cafile)) connector = aiohttp.TCPConnector(ssl=ctx) return connector @@ -934,6 +983,7 @@ async def _bootstrap_download_session( session: aiohttp.ClientSession, url: str, headers: Dict[str, str], + route_options: Optional[Dict[str, Any]] = None, ) -> Dict[str, str]: headers = await self.setup_session(session, url, headers) @@ -942,7 +992,11 @@ async def _bootstrap_download_session( try: # Visit the base domain to establish cookies if needed base_domain = urlparse(url).netloc - if any(domain in base_domain for domain in self.domain_cookies.keys()): + all_cookie_domains = set(self.domain_cookies.keys()) + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict) and extra_cookies: + 
all_cookie_domains.add(base_domain) + if any(domain in base_domain for domain in all_cookie_domains): base_url = f"https://{base_domain}" async with session.get(base_url, headers=headers, timeout=base_timeout): pass @@ -1155,15 +1209,17 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn return False, "", "", "Empty URL", retry_count url = self._normalize_request_url(url) + _, route_options = self._resolve_route(url) user_agent = next(self.user_agents) headers = self._build_request_headers(url, user_agent, referer) - cookies = self._resolve_request_cookies(url) + cookies = self._resolve_request_cookies(url, route_options=route_options) if semaphore: await semaphore.acquire() try: await rate_limiter.acquire() - sleep_time = self.sleep * (2 ** retry_count) + base_sleep = float(self._route_setting(route_options, "sleep", self.sleep)) + sleep_time = base_sleep * (2 ** retry_count) await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) preflight = await self._preflight_download( row_index=row_index, @@ -1174,14 +1230,19 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn ) if preflight is not None: return preflight - timeout = self._build_request_timeout(retry_count) + timeout = self._build_request_timeout(retry_count, route_options=route_options) try: - connector = self._build_session_connector(url) + connector = self._build_session_connector(url, route_options=route_options) async with aiohttp.ClientSession(cookies=cookies, connector=connector) as session: try: - headers = await self._bootstrap_download_session(session, url, headers) - requester = self.request_method.lower() + headers = await self._bootstrap_download_session( + session, + url, + headers, + route_options=route_options, + ) + requester = str(self._route_setting(route_options, "request_method", self.request_method)).lower() try: self.verbose_log(f"Attempting download request to URL: {url}") @@ -1343,6 +1404,8 @@ def 
_write_checkpoint() -> None: for i, row_idx in enumerate(batch_indices): url = df.loc[row_idx, self.url_column] retry_count = df.loc[row_idx, 'download_retry_count'] + _, route_options = self._resolve_route(url) + _, _, _, skip_after = self._resolve_domain_scheduler_settings(route_options) # Optional per-row referer (e.g., external_link page) ref_val = None if self.referer_column and self.referer_column in df.columns: @@ -1362,7 +1425,7 @@ def _write_checkpoint() -> None: pass # Skip URLs that have failed too many times - if retry_count >= self.skip_failed_after: + if retry_count >= skip_after: self.logger.info(f"Skipping URL at row {row_idx} - too many failures: {retry_count}") continue @@ -1573,6 +1636,7 @@ def _write_checkpoint() -> None: domains: Dict[str, GlossDownloader._DomainState] = {} for idx in row_indices: url = df.at[idx, self.url_column] + _, route_options = self._resolve_route(url) # Determine grouping key if self.scheduler_group_by and self.scheduler_group_by != 'base_domain': key = str(df.at[idx, self.scheduler_group_by]) if self.scheduler_group_by in df.columns else '' @@ -1583,9 +1647,14 @@ def _write_checkpoint() -> None: if not key: key = '' if key not in domains: - # Each group starts with up to per_domain_concurrency, but not exceeding global - start_c = min(self.per_domain_concurrency, max(1, self.concurrency)) - domains[key] = GlossDownloader._DomainState(base=key, concurrency=start_c) + floor_c, ceiling_c, start_c, skip_after = self._resolve_domain_scheduler_settings(route_options) + domains[key] = GlossDownloader._DomainState( + base=key, + concurrency=start_c, + concurrency_floor=floor_c, + concurrency_ceiling=ceiling_c, + skip_failed_after=skip_after, + ) domains[key].queue.append(idx) if not domains: @@ -1844,7 +1913,7 @@ def estimate_eta_s(state: GlossDownloader._DomainState) -> float: if remaining <= 0: return 0.0 avg = state.avg_duration() or 5.0 # default initial guess - eff_c = max(self.domain_concurrency_floor, 
min(state.concurrency, self.domain_concurrency_ceiling)) + eff_c = max(state.concurrency_floor, min(state.concurrency, state.concurrency_ceiling)) # ETA ≈ remaining * avg / eff_c (assuming steady parallelism) return float(remaining) * avg / max(1, eff_c) @@ -1928,7 +1997,7 @@ async def dispatch_ready(): if pending_domains: active_order.append(pending_domains.popleft()) continue - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Unparked domain: {dom}; resuming at concurrency={state.concurrency}") # Attempt to launch up to (state.concurrency - state.active) while ( @@ -1940,7 +2009,7 @@ async def dispatch_ready(): url = df.at[row_idx, self.url_column] retry_count = int(df.at[row_idx, 'download_retry_count']) if 'download_retry_count' in df.columns else 0 # Skip rows with too many failures - if retry_count >= self.skip_failed_after: + if retry_count >= state.skip_failed_after: continue # Launch task t0 = time.time() @@ -2122,7 +2191,7 @@ async def dispatch_ready(): # Dynamic tuning: ease if overloaded if self.dynamic_tuning and should_ease(state): - if state.concurrency > self.domain_concurrency_floor: + if state.concurrency > state.concurrency_floor: state.concurrency -= 1 self.logger.info(f"Easing concurrency for {dom} -> {state.concurrency}") @@ -2142,14 +2211,14 @@ async def dispatch_ready(): if retry_after is None: retry_after = max(1, int(self.ping_recheck_seconds)) state.parked_until = now2 + retry_after - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Rate limited: {dom}; parked for {retry_after}s") # Timeout streak -> exponential backoff elif state.timeout_streak >= int(getattr(self, 'timeout_streak_threshold', 5)): backoff = min(float(getattr(self, 'backoff_min_s', 60.0)) * (2 ** max(0, state.ping_failures)), float(getattr(self, 'backoff_max_s', 900.0))) 
state.ping_failures += 1 state.parked_until = now2 + backoff - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) state.timeout_streak = 0 self.progress_logger.info(f"[park] Timeout streak: {dom}; parked for {int(backoff)}s (level={state.ping_failures})") else: @@ -2171,7 +2240,7 @@ async def dispatch_ready(): state.eta_exceeded_count += 1 if state.eta_exceeded_count == 1: # Try to increase concurrency gently to improve ETA, up to ceiling - if state.concurrency < self.domain_concurrency_ceiling: + if state.concurrency < state.concurrency_ceiling: state.concurrency += 1 self.logger.info( f"ETA high for {dom} ({int(eta_s)}s). Bumping concurrency -> {state.concurrency}" diff --git a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py index 9412d23..20707e7 100644 --- a/tests/test_browser_gloss_downloader.py +++ b/tests/test_browser_gloss_downloader.py @@ -200,6 +200,85 @@ async def _fake_download_browser_route(**kwargs): assert observed["route_options"]["browser_timeout_ms"] == 1234 +def test_download_policy_preserves_transport_and_scheduler_options(): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ikee.lib.auth.gr"]}, + "downloader": "standard", + "request_timeout": 120, + "ssl_verify": False, + "per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 3, + "skip_failed_after": 5, + "domain_cookies": {"sessionid": "abc"}, + } + ], + } + ) + + route, options = policy.resolve("https://ikee.lib.auth.gr/record/123/files/file.pdf") + + assert route == "standard" + assert options["request_timeout"] == 120 + assert options["ssl_verify"] is False + assert options["per_domain_concurrency"] == 2 + assert options["domain_concurrency_floor"] == 1 + assert options["domain_concurrency_ceiling"] == 3 + assert options["skip_failed_after"] == 5 + assert options["domain_cookies"] == 
{"sessionid": "abc"} + + +def test_browser_downloader_route_options_apply_standard_transport_settings(tmp_path): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ktisis.cut.ac.cy"]}, + "downloader": "standard", + "request_timeout": 90, + "ssl_verify": False, + "per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 2, + "skip_failed_after": 4, + "domain_cookies": {"sessionid": "abc"}, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + async def _build_connector(): + return downloader._build_session_connector( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + + route, route_options = downloader._resolve_route("https://ktisis.cut.ac.cy/items/123/file.pdf") + timeout = downloader._build_request_timeout(0, route_options=route_options) + connector = asyncio.run(_build_connector()) + cookies = downloader._resolve_request_cookies( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + floor, ceiling, start, skip_after = downloader._resolve_domain_scheduler_settings(route_options) + + assert route == "standard" + assert timeout.total == 90 + assert connector is not None + assert cookies["sessionid"] == "abc" + assert (floor, ceiling, start, skip_after) == (1, 2, 2, 4) + + def test_corpus_download_mode_selects_browser_downloader(tmp_path, monkeypatch): input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) input_parquet = tmp_path / "urls.parquet" From fde296781b0606f3c5b38dc5974c4fc7d3ed01ec Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 16:55:05 +0300 Subject: [PATCH 36/93] Add OpenArchives download probe and policy --- .../openarchives_ocr_rollout_plan.md | 4 + samples/openarchives_download_policy.yml | 73 ++++++++ .../scripts/openarchives_download_probe.py | 156 
++++++++++++++++++ .../scripts/openarchives_ocr_run_node.py | 4 +- tests/test_openarchives_download_probe.py | 32 ++++ 5 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 samples/openarchives_download_policy.yml create mode 100644 src/glossapi/scripts/openarchives_download_probe.py create mode 100644 tests/test_openarchives_download_probe.py diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index ee659c0..a427288 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -93,6 +93,10 @@ Download policy note: - `skip_failed_after` - `domain_cookies` - That means the OA freeze-download phase can stay inside `Corpus.download(...)`; we do not need a separate downloader implementation. +- Stored OA policy sample: + - `samples/openarchives_download_policy.yml` +- Stored OA probe runner: + - `python -m glossapi.scripts.openarchives_download_probe` Standard node command: diff --git a/samples/openarchives_download_policy.yml b/samples/openarchives_download_policy.yml new file mode 100644 index 0000000..a693604 --- /dev/null +++ b/samples/openarchives_download_policy.yml @@ -0,0 +1,73 @@ +default: + downloader: standard + request_timeout: 60 + ssl_verify: true + per_domain_concurrency: 8 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.25 + +rules: + - match: + domains: [ikee.lib.auth.gr] + downloader: standard + request_timeout: 120 + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 5 + sleep: 1.0 + + - match: + domains: [ktisis.cut.ac.cy] + downloader: standard + request_timeout: 90 + ssl_verify: false + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 4 + sleep: 0.5 + + - match: + domains: [repository.academyofathens.gr] + downloader: standard + request_timeout: 45 + 
per_domain_concurrency: 16 + domain_concurrency_floor: 2 + domain_concurrency_ceiling: 16 + skip_failed_after: 3 + sleep: 0.1 + + - match: + domains: + - dspace.lib.ntua.gr + - olympias.lib.uoi.gr + - dione.lib.unipi.gr + - pergamos.lib.uoa.gr + - hellanicus.lib.aegean.gr + - dias.library.tuc.gr + downloader: standard + request_timeout: 60 + per_domain_concurrency: 12 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.2 + + - match: + domains: + - repository.ihu.gr + - dlib.statistics.gr + - apothesis.eap.gr + - repository.edulll.gr + - dspace.lib.uom.gr + - dspace.aua.gr + downloader: standard + request_timeout: 75 + per_domain_concurrency: 6 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 8 + skip_failed_after: 4 + sleep: 0.25 diff --git a/src/glossapi/scripts/openarchives_download_probe.py b/src/glossapi/scripts/openarchives_download_probe.py new file mode 100644 index 0000000..76329c3 --- /dev/null +++ b/src/glossapi/scripts/openarchives_download_probe.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable, Optional +from urllib.parse import urlparse + +import pandas as pd + +from glossapi import Corpus + + +def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_probe", + description=( + "Sample OpenArchives OCR-target PDFs by host, run a controlled download probe, " + "and write per-host success summaries." 
+ ), + ) + p.add_argument("--parquet", required=True, help="needs_ocr_enriched parquet with pdf_url and filename columns") + p.add_argument("--output-dir", required=True) + p.add_argument("--policy-file", default="") + p.add_argument("--samples-per-host", type=int, default=12) + p.add_argument("--max-hosts", type=int, default=12) + p.add_argument("--seed", type=int, default=42) + p.add_argument("--concurrency", type=int, default=12) + p.add_argument("--request-timeout", type=int, default=60) + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--hosts", nargs="*", default=None, help="Optional explicit host allowlist") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _host_from_url(url: str) -> str: + try: + return (urlparse(str(url)).hostname or "").lower() + except Exception: + return "" + + +def _prepare_probe_frame( + df: pd.DataFrame, + *, + samples_per_host: int, + max_hosts: int, + seed: int, + hosts: Optional[Iterable[str]] = None, +) -> pd.DataFrame: + frame = df.copy() + if "pdf_url" not in frame.columns or "filename" not in frame.columns: + raise SystemExit("Probe parquet must include at least 'pdf_url' and 'filename' columns") + frame["host"] = frame["pdf_url"].astype(str).map(_host_from_url) + frame = frame[frame["host"].astype(bool)].copy() + if hosts: + allowed = {str(h).strip().lower() for h in hosts if str(h).strip()} + frame = frame[frame["host"].isin(allowed)].copy() + ranked_hosts = ( + frame.groupby("host", dropna=False) + .size() + .sort_values(ascending=False) + .head(max(1, int(max_hosts))) + .index.tolist() + ) + probe = frame[frame["host"].isin(ranked_hosts)].copy() + sampled = ( + probe.groupby("host", group_keys=True) + .apply( + lambda grp: grp.sample(n=min(len(grp), int(samples_per_host)), random_state=int(seed)), + include_groups=False, + ) + .reset_index(level=0) + .reset_index(drop=True) + ) + sampled["url"] = sampled["pdf_url"].astype(str) + sampled["base_domain"] = 
sampled["pdf_url"].astype(str).map( + lambda s: f"{urlparse(str(s)).scheme or 'https'}://{(urlparse(str(s)).netloc or '').lower()}".rstrip("/") + if _host_from_url(str(s)) + else "" + ) + return sampled + + +def _summary_payload(df: pd.DataFrame, *, source_rows: int) -> dict: + out = df.copy() + if "download_success" not in out.columns: + out["download_success"] = False + grouped = ( + out.groupby("host", dropna=False) + .agg( + docs=("host", "size"), + successes=("download_success", lambda s: int(pd.Series(s).fillna(False).sum())), + failures=("download_success", lambda s: int((~pd.Series(s).fillna(False)).sum())), + ) + .reset_index() + .sort_values(["docs", "successes"], ascending=[False, False]) + ) + return { + "source_rows": int(source_rows), + "probe_rows": int(len(out)), + "hosts": grouped.to_dict(orient="records"), + } + + +def main(argv: Optional[list[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + source_df = pd.read_parquet(parquet_path) + probe_df = _prepare_probe_frame( + source_df, + samples_per_host=int(args.samples_per_host), + max_hosts=int(args.max_hosts), + seed=int(args.seed), + hosts=args.hosts, + ) + probe_input = output_dir / "probe_input.parquet" + probe_df.to_parquet(probe_input, index=False) + + if args.dry_run: + summary = _summary_payload(probe_df, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + corpus = Corpus( + input_dir=output_dir / "downloads", + output_dir=output_dir, + log_level="INFO", + verbose=False, + ) + results = corpus.download( + input_parquet=probe_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.concurrency), + request_timeout=int(args.request_timeout), + 
download_policy_file=(str(args.policy_file) if str(args.policy_file or "").strip() else None), + ) + merged = results.merge( + probe_df[["url", "host", "filename"]], + on="url", + how="left", + suffixes=("", "_probe"), + ) + merged_path = output_dir / "probe_results.parquet" + merged.to_parquet(merged_path, index=False) + summary = _summary_payload(merged, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_run_node.py b/src/glossapi/scripts/openarchives_ocr_run_node.py index 7ebbe39..5b949a3 100644 --- a/src/glossapi/scripts/openarchives_ocr_run_node.py +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -34,7 +34,8 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--python-log-level", default="INFO") p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) - p.add_argument("--download-group-by", default="repository_collection") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--download-policy-file", default="") p.add_argument("--heartbeat-path") p.add_argument("--heartbeat-interval", type=int, default=DEFAULT_HEARTBEAT_INTERVAL) p.add_argument("--instance-id", default="") @@ -262,6 +263,7 @@ def main(argv: Optional[List[str]] = None) -> int: parallelize_by=str(args.download_group_by), concurrency=int(args.download_concurrency), request_timeout=int(args.download_timeout), + download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), ) canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") metadata_path = _write_canonical_metadata(work_root, 
canonical_df) diff --git a/tests/test_openarchives_download_probe.py b/tests/test_openarchives_download_probe.py new file mode 100644 index 0000000..0213438 --- /dev/null +++ b/tests/test_openarchives_download_probe.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_download_probe import _prepare_probe_frame + + +def test_prepare_probe_frame_limits_per_host_and_adds_runtime_columns() -> None: + df = pd.DataFrame( + [ + {"filename": "a.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/a.pdf"}, + {"filename": "b.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/b.pdf"}, + {"filename": "c.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/c.pdf"}, + {"filename": "d.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/d.pdf"}, + {"filename": "e.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/e.pdf"}, + ] + ) + + out = _prepare_probe_frame( + df, + samples_per_host=2, + max_hosts=2, + seed=7, + ) + + counts = out.groupby("host").size().to_dict() + assert counts["ikee.lib.auth.gr"] == 2 + assert counts["dspace.lib.ntua.gr"] == 2 + assert set(out["url"]) <= set(df["pdf_url"]) + assert set(out["base_domain"]) == {"https://ikee.lib.auth.gr", "https://dspace.lib.ntua.gr"} From 36b76668923f49ee8b90b45788d1cd94633604b9 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 17:28:30 +0300 Subject: [PATCH 37/93] Tighten OpenArchives download scheduling defaults --- .../openarchives_ocr_rollout_plan.md | 16 ++++++++++ samples/openarchives_download_policy.yml | 29 +++++++++++++++---- .../scripts/openarchives_download_probe.py | 2 ++ .../scripts/openarchives_ocr_run_node.py | 2 ++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index a427288..78e7112 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -97,6 
+97,22 @@ Download policy note: - `samples/openarchives_download_policy.yml` - Stored OA probe runner: - `python -m glossapi.scripts.openarchives_download_probe` +- OA download runs should use `scheduler_mode=per_domain` together with `parallelize_by=base_domain`, + otherwise the host-level concurrency policy is mostly inert. +- Probe result on the CPU box: + - `dspace.lib.ntua.gr` succeeds cleanly once OA downloads use `scheduler_mode=per_domain` + and the host is throttled to a single in-flight request + - `ktisis.cut.ac.cy` succeeds with `ssl_verify=false` + - `repository.academyofathens.gr`, `repository.ihu.gr`, `pergamos.lib.uoa.gr`, + and `dione.lib.unipi.gr` behaved like standard hosts in the probe + - `ikee.lib.auth.gr` is not just a pre-ping false negative; direct PDF requests hit + real connection timeouts + - `olympias.lib.uoi.gr` is not just a pre-ping false negative either; direct PDF + requests reach the host but stall on response reads +- Operational recommendation: + - bulk-freeze the good hosts first + - keep `ikee.lib.auth.gr` and `olympias.lib.uoi.gr` in a dedicated slow-path download phase + so they do not dominate the main corpus freeze run Standard node command: diff --git a/samples/openarchives_download_policy.yml b/samples/openarchives_download_policy.yml index a693604..8e1091e 100644 --- a/samples/openarchives_download_policy.yml +++ b/samples/openarchives_download_policy.yml @@ -12,11 +12,32 @@ rules: - match: domains: [ikee.lib.auth.gr] downloader: standard - request_timeout: 120 - per_domain_concurrency: 2 + request_timeout: 180 + per_domain_concurrency: 1 domain_concurrency_floor: 1 - domain_concurrency_ceiling: 2 + domain_concurrency_ceiling: 1 skip_failed_after: 5 + sleep: 1.5 + + - match: + domains: [dspace.lib.ntua.gr] + downloader: standard + request_timeout: 120 + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 + sleep: 1.0 + + - match: + domains: [olympias.lib.uoi.gr] 
+ downloader: standard + request_timeout: 180 + ssl_verify: false + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 sleep: 1.0 - match: @@ -42,8 +63,6 @@ rules: - match: domains: - - dspace.lib.ntua.gr - - olympias.lib.uoi.gr - dione.lib.unipi.gr - pergamos.lib.uoa.gr - hellanicus.lib.aegean.gr diff --git a/src/glossapi/scripts/openarchives_download_probe.py b/src/glossapi/scripts/openarchives_download_probe.py index 76329c3..d253b9b 100644 --- a/src/glossapi/scripts/openarchives_download_probe.py +++ b/src/glossapi/scripts/openarchives_download_probe.py @@ -27,6 +27,7 @@ def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: p.add_argument("--seed", type=int, default=42) p.add_argument("--concurrency", type=int, default=12) p.add_argument("--request-timeout", type=int, default=60) + p.add_argument("--scheduler-mode", default="per_domain") p.add_argument("--download-group-by", default="base_domain") p.add_argument("--hosts", nargs="*", default=None, help="Optional explicit host allowlist") p.add_argument("--dry-run", action="store_true") @@ -137,6 +138,7 @@ def main(argv: Optional[list[str]] = None) -> int: parallelize_by=str(args.download_group_by), concurrency=int(args.concurrency), request_timeout=int(args.request_timeout), + scheduler_mode=str(args.scheduler_mode), download_policy_file=(str(args.policy_file) if str(args.policy_file or "").strip() else None), ) merged = results.merge( diff --git a/src/glossapi/scripts/openarchives_ocr_run_node.py b/src/glossapi/scripts/openarchives_ocr_run_node.py index 5b949a3..6970161 100644 --- a/src/glossapi/scripts/openarchives_ocr_run_node.py +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -34,6 +34,7 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--python-log-level", default="INFO") p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) 
p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-scheduler-mode", default="per_domain") p.add_argument("--download-group-by", default="base_domain") p.add_argument("--download-policy-file", default="") p.add_argument("--heartbeat-path") @@ -263,6 +264,7 @@ def main(argv: Optional[List[str]] = None) -> int: parallelize_by=str(args.download_group_by), concurrency=int(args.download_concurrency), request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), ) canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") From 95146184e1174cb566bbcb30eb442b32063c45b2 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 17:34:39 +0300 Subject: [PATCH 38/93] Add OpenArchives download freeze runner --- .../openarchives_ocr_rollout_plan.md | 8 ++ .../scripts/openarchives_download_freeze.py | 81 +++++++++++++++++++ tests/test_openarchives_download_freeze.py | 26 ++++++ 3 files changed, 115 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_download_freeze.py create mode 100644 tests/test_openarchives_download_freeze.py diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 78e7112..590d18b 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -69,6 +69,7 @@ standalone benchmark wrapper. Stored runner: - `python -m glossapi.scripts.openarchives_ocr_run_node` +- `python -m glossapi.scripts.openarchives_download_freeze` The runner does four things in order: @@ -77,6 +78,13 @@ The runner does four things in order: 3. writes the shard metadata as canonical `download_results/download_results.parquet` 4. 
runs `Corpus.ocr(...)` with the validated DeepSeek settings +The download-freeze runner is the matching download-only entrypoint: + +1. reads one OA manifest parquet +2. downloads the PDFs into `downloads/` using their OA filenames +3. writes canonical `download_results/download_results.parquet` +4. stops there, without starting OCR + Download policy note: - OpenArchives download should be host-first, not collection-first. diff --git a/src/glossapi/scripts/openarchives_download_freeze.py b/src/glossapi/scripts/openarchives_download_freeze.py new file mode 100644 index 0000000..8188e9a --- /dev/null +++ b/src/glossapi/scripts/openarchives_download_freeze.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import argparse +import logging +from pathlib import Path +from typing import List, Optional + +from glossapi import Corpus +from glossapi.scripts.openarchives_ocr_run_node import ( + DEFAULT_DOWNLOAD_CONCURRENCY, + DEFAULT_DOWNLOAD_TIMEOUT, + _load_frame, + _normalize_download_results, + _prepare_download_input, + _write_canonical_metadata, +) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_freeze", + description=( + "Materialize one OpenArchives manifest into a canonical GlossAPI downloads root " + "without starting OCR. This is the reproducible PDF-freeze entrypoint." 
+ ), + ) + p.add_argument("--input-parquet", required=True) + p.add_argument("--work-root", required=True) + p.add_argument("--python-log-level", default="INFO") + p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) + p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--download-policy-file", default="") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + input_path = Path(args.input_parquet).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + work_root.mkdir(parents=True, exist_ok=True) + manifests_dir = work_root / "manifests" + manifests_dir.mkdir(parents=True, exist_ok=True) + + manifest_df = _prepare_download_input(_load_frame(input_path)) + download_input = manifests_dir / "download_input.parquet" + manifest_df.to_parquet(download_input, index=False) + + metadata_path = work_root / "download_results" / "download_results.parquet" + if not metadata_path.exists(): + metadata_path.parent.mkdir(parents=True, exist_ok=True) + _write_canonical_metadata(work_root, manifest_df) + + if args.dry_run: + return 0 + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), + request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), + download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), + ) + canonical_df 
= _normalize_download_results(shard_df=manifest_df, download_results_df=dl_df, url_column="url") + _write_canonical_metadata(work_root, canonical_df) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/test_openarchives_download_freeze.py b/tests/test_openarchives_download_freeze.py new file mode 100644 index 0000000..6420372 --- /dev/null +++ b/tests/test_openarchives_download_freeze.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_download_freeze import main + + +def test_download_freeze_dry_run_materializes_manifest(tmp_path: Path) -> None: + src = tmp_path / "input.parquet" + pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ).to_parquet(src, index=False) + + work_root = tmp_path / "work" + rc = main(["--input-parquet", str(src), "--work-root", str(work_root), "--dry-run"]) + assert rc == 0 + assert (work_root / "manifests" / "download_input.parquet").exists() + assert (work_root / "download_results" / "download_results.parquet").exists() From 5f9dbd5c665292020012c2f661e965c844c5ea1c Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 17:51:24 +0300 Subject: [PATCH 39/93] Add OpenArchives HF shard refresh tool --- .../scripts/openarchives_hf_refresh.py | 232 ++++++++++++++++++ tests/test_openarchives_hf_refresh.py | 160 ++++++++++++ 2 files changed, 392 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_hf_refresh.py create mode 100644 tests/test_openarchives_hf_refresh.py diff --git a/src/glossapi/scripts/openarchives_hf_refresh.py b/src/glossapi/scripts/openarchives_hf_refresh.py new file mode 100644 index 0000000..911c795 --- /dev/null +++ b/src/glossapi/scripts/openarchives_hf_refresh.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +from pathlib 
import Path +from typing import Dict, Iterable, Optional, Sequence + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import _resolve_jsonl_path + + +PIPELINE_FIELDS = ( + "greek_badness_score", + "mojibake_badness_score", + "latin_percentage", + "polytonic_ratio", + "char_count_no_comments", + "is_empty", + "filter", + "needs_ocr", + "ocr_success", + "quality_method", + "reevaluated_at", +) + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_hf_refresh", + description=( + "Refresh the canonical OpenArchives HF jsonl.zst shards in place from a refreshed " + "document-level parquet and update the dataset card counts." + ), + ) + p.add_argument("--dataset-root", required=True, help="Local clone/snapshot root of the HF dataset repo.") + p.add_argument("--metadata-parquet", required=True, help="Refreshed document-level parquet with source_jsonl/doc ids.") + p.add_argument("--output-root", default="", help="Optional separate output root. 
Defaults to in-place dataset-root.") + p.add_argument("--readme-path", default="README.md", help="Dataset card path relative to dataset-root/output-root.") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _normalize_source_key(dataset_root: Path, recorded_path: str) -> str: + resolved = _resolve_jsonl_path(dataset_root, recorded_path) + return str(resolved.relative_to(dataset_root)) + + +def _clean_value(value: object) -> object: + if pd.isna(value): # type: ignore[arg-type] + return None + if isinstance(value, pd.Timestamp): + return value.isoformat() + if hasattr(value, "item"): + try: + return value.item() + except Exception: + return value + return value + + +def _build_update_index(metadata_df: pd.DataFrame, *, dataset_root: Path) -> Dict[str, Dict[str, dict]]: + required = {"source_doc_id", "source_jsonl"} + missing = sorted(required - set(metadata_df.columns)) + if missing: + raise SystemExit(f"Metadata parquet missing required column(s): {', '.join(missing)}") + updates: Dict[str, Dict[str, dict]] = {} + work = metadata_df.copy() + work["_source_key"] = work["source_jsonl"].astype(str).map(lambda p: _normalize_source_key(dataset_root, p)) + for _, row in work.iterrows(): + source_key = str(row["_source_key"]) + doc_id = str(row["source_doc_id"] or "") + payload = {field: _clean_value(row[field]) for field in PIPELINE_FIELDS if field in row.index} + updates.setdefault(source_key, {})[doc_id] = payload + return updates + + +def _iter_jsonl_rows(path: Path) -> Iterable[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text_reader = io.TextIOWrapper(reader, encoding="utf-8") + for line in text_reader: + yield json.loads(line) + + +def _write_jsonl_rows(path: Path, rows: Iterable[dict]) -> int: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + count = 0 + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for 
row in rows: + payload = (json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8") + writer.write(payload) + count += 1 + return count + + +def _refresh_readme(readme_text: str, *, total_docs: int, needs_ocr_docs: int) -> str: + title_text = f"OpenArchives.gr {total_docs:,} docs".replace(",", ",") + percent = (100.0 * needs_ocr_docs / total_docs) if total_docs else 0.0 + pct_text = f"{percent:.2f}%" + + replacements = [ + (r"pretty_name:\s*OpenArchives\.gr [^\n]+", f"pretty_name: {title_text}"), + (r"# OpenArchives\.gr [^\n]+", f"# {title_text}"), + ( + r"- Σύνολο markdown αρχείων: \*\*[0-9,]+\*\* from openarchives\.gr", + f"- Σύνολο markdown αρχείων: **{total_docs:,}** from openarchives.gr", + ), + ( + r"- Total markdown files: \*\*[0-9,]+\*\* from openarchives\.gr", + f"- Total markdown files: **{total_docs:,}** from openarchives.gr", + ), + ( + r"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ( + r"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ] + updated = readme_text + for pattern, replacement in replacements: + updated = re.sub(pattern, replacement, updated) + return updated + + +def _refresh_shard( + *, + input_path: Path, + output_path: Path, + updates: Dict[str, dict], + dry_run: bool, +) -> dict: + total = 0 + matched = 0 + needs_ocr = 0 + unmatched_doc_ids: list[str] = [] + rows_out: list[dict] = [] + + for row in _iter_jsonl_rows(input_path): + total += 1 + doc_id = str(row.get("doc_id") or "") + payload = 
updates.get(doc_id) + if payload is not None: + pipeline = dict(row.get("pipeline_metadata") or {}) + pipeline.update({k: v for k, v in payload.items() if v is not None}) + row["pipeline_metadata"] = pipeline + matched += 1 + else: + unmatched_doc_ids.append(doc_id) + pipeline = row.get("pipeline_metadata") or {} + if bool(pipeline.get("needs_ocr")): + needs_ocr += 1 + rows_out.append(row) + + if not dry_run: + _write_jsonl_rows(output_path, rows_out) + + return { + "path": str(input_path), + "total_rows": total, + "matched_rows": matched, + "unmatched_rows": total - matched, + "needs_ocr_rows": needs_ocr, + "sample_unmatched_doc_ids": unmatched_doc_ids[:5], + } + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = _parse_args(argv) + dataset_root = Path(args.dataset_root).expanduser().resolve() + output_root = Path(args.output_root).expanduser().resolve() if str(args.output_root).strip() else dataset_root + output_root.mkdir(parents=True, exist_ok=True) + metadata_path = Path(args.metadata_parquet).expanduser().resolve() + + metadata_df = pd.read_parquet(metadata_path).copy() + updates_by_shard = _build_update_index(metadata_df, dataset_root=dataset_root) + + summaries: list[dict] = [] + total_rows = 0 + matched_rows = 0 + needs_ocr_rows = 0 + shard_root = dataset_root / "data" / "openarchives" + for rel_key, updates in sorted(updates_by_shard.items()): + input_path = dataset_root / rel_key + output_path = output_root / rel_key + summary = _refresh_shard( + input_path=input_path, + output_path=output_path, + updates=updates, + dry_run=bool(args.dry_run), + ) + summaries.append(summary) + total_rows += int(summary["total_rows"]) + matched_rows += int(summary["matched_rows"]) + needs_ocr_rows += int(summary["needs_ocr_rows"]) + + readme_rel = Path(args.readme_path) + readme_in = dataset_root / readme_rel + readme_out = output_root / readme_rel + if readme_in.exists() and not args.dry_run: + readme_text = readme_in.read_text(encoding="utf-8") + 
readme_out.write_text( + _refresh_readme(readme_text, total_docs=matched_rows, needs_ocr_docs=int(metadata_df["needs_ocr"].fillna(False).sum())), + encoding="utf-8", + ) + + summary = { + "dataset_root": str(dataset_root), + "output_root": str(output_root), + "metadata_parquet": str(metadata_path), + "shards_touched": len(summaries), + "total_rows_seen": total_rows, + "matched_rows": matched_rows, + "unmatched_rows": total_rows - matched_rows, + "needs_ocr_rows_after_refresh": needs_ocr_rows, + "metadata_rows": int(len(metadata_df)), + "metadata_needs_ocr_rows": int(metadata_df["needs_ocr"].fillna(False).sum()) if "needs_ocr" in metadata_df.columns else None, + "sample_shards": summaries[:5], + } + print(json.dumps(summary, ensure_ascii=False, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/test_openarchives_hf_refresh.py b/tests/test_openarchives_hf_refresh.py new file mode 100644 index 0000000..81f015e --- /dev/null +++ b/tests/test_openarchives_hf_refresh.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_hf_refresh import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for row in rows: + writer.write((json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8")) + + +def _read_jsonl_zst(path: Path) -> list[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text = io.TextIOWrapper(reader, encoding="utf-8").read() + return [json.loads(line) for line in text.splitlines() if line.strip()] + + +def test_openarchives_hf_refresh_updates_pipeline_metadata_and_readme(tmp_path: Path) -> None: + dataset_root = tmp_path / 
"openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {"filename": "AAA_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 1.0}, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "source_metadata": {"filename": "BBB_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 2.0}, + }, + ], + ) + (dataset_root / "README.md").write_text( + "---\npretty_name: OpenArchives.gr 191,000 docs\n---\n\n# OpenArchives.gr 191,000 docs\n\n" + "- Σύνολο markdown αρχείων: **191,301** from openarchives.gr\n" + "- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **23,083 / 191,301 (12.07%)**\n" + "- Total markdown files: **191,301** from openarchives.gr\n" + "- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **23,083 / 191,301 (12.07%)**\n", + encoding="utf-8", + ) + + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + "ocr_success": False, + "greek_badness_score": 72.0, + "mojibake_badness_score": 0.2, + "latin_percentage": 33.3, + "polytonic_ratio": 0.0, + "char_count_no_comments": 1234.0, + "is_empty": False, + "filter": "ok", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + { + "source_doc_id": "doc-b", + "source_jsonl": str(shard_path), + "needs_ocr": False, + "ocr_success": False, + "greek_badness_score": 2.0, + "mojibake_badness_score": 0.0, + "latin_percentage": 22.0, + "polytonic_ratio": 0.0, + "char_count_no_comments": 456.0, + "is_empty": True, + "filter": "empty_text==0", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + ] + 
).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + "--output-root", + str(out_root), + ] + ) + assert rc == 0 + + rows = _read_jsonl_zst(out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst") + assert rows[0]["pipeline_metadata"]["needs_ocr"] is True + assert rows[0]["pipeline_metadata"]["greek_badness_score"] == 72.0 + assert rows[1]["pipeline_metadata"]["is_empty"] is True + assert rows[1]["pipeline_metadata"]["filter"] == "empty_text==0" + + readme = (out_root / "README.md").read_text(encoding="utf-8") + assert "OpenArchives.gr 2 docs" in readme + assert "**1 / 2 (50.00%)**" in readme + + +def test_openarchives_hf_refresh_dry_run_does_not_write_outputs(tmp_path: Path) -> None: + dataset_root = tmp_path / "openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {}, + "pipeline_metadata": {"needs_ocr": False}, + } + ], + ) + (dataset_root / "README.md").write_text("# OpenArchives.gr 191,000 docs\n", encoding="utf-8") + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + } + ] + ).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + "--output-root", + str(out_root), + "--dry-run", + ] + ) + assert rc == 0 + assert not (out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst").exists() From 31df75b89c9177af46627151a9f03e4b6abe2b0f Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 17:52:35 +0300 Subject: [PATCH 40/93] Add resumable OpenArchives PDF staging puller --- 
.../scripts/openarchives_pdf_stage_pull.py | 457 ++++++++++++++++++ tests/test_openarchives_pdf_stage_pull.py | 89 ++++ 2 files changed, 546 insertions(+) create mode 100644 src/glossapi/scripts/openarchives_pdf_stage_pull.py create mode 100644 tests/test_openarchives_pdf_stage_pull.py diff --git a/src/glossapi/scripts/openarchives_pdf_stage_pull.py b/src/glossapi/scripts/openarchives_pdf_stage_pull.py new file mode 100644 index 0000000..4165a08 --- /dev/null +++ b/src/glossapi/scripts/openarchives_pdf_stage_pull.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import argparse +import csv +import json +import os +import shutil +import signal +import sqlite3 +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterable, Optional, Sequence + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +@dataclass(frozen=True) +class TransferItem: + canonical_filename: str + remote_path: str + remote_size_bytes: int + remote_name: str + + +SCHEMA = """ +CREATE TABLE IF NOT EXISTS transfer_items ( + canonical_filename TEXT PRIMARY KEY, + remote_path TEXT NOT NULL, + remote_size_bytes INTEGER NOT NULL, + remote_name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT NOT NULL DEFAULT '', + transfer_started_at TEXT, + transfer_finished_at TEXT, + last_seen_size_bytes INTEGER NOT NULL DEFAULT 0 +); +""" + + +def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_pdf_stage_pull", + description="Resumable staged pull of OpenArchives PDFs from the Greece storage box.", + ) + p.add_argument("--manifest", required=True, help="TSV manifest with canonical_filename, remote_path, remote_size_bytes, remote_name.") + p.add_argument("--work-root", required=True, 
help="Root directory for downloads, partials, logs, and state.") + p.add_argument("--remote-host", default="debian@83.212.80.170") + p.add_argument("--password-env", default="GREECE_BOX_PASSWORD", help="Environment variable containing the remote SSH password.") + p.add_argument("--max-attempts", type=int, default=20) + p.add_argument("--connect-timeout", type=int, default=30) + p.add_argument("--io-timeout", type=int, default=180) + p.add_argument("--sleep-after-failure", type=float, default=10.0) + p.add_argument("--summary-interval-seconds", type=float, default=5.0) + p.add_argument("--limit", type=int, default=0, help="Optional limit for testing.") + return p.parse_args(argv) + + +class TransferState: + def __init__(self, db_path: Path): + self.db_path = db_path + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self.conn = sqlite3.connect(str(self.db_path)) + self.conn.execute("PRAGMA journal_mode=WAL") + self.conn.execute(SCHEMA) + self.conn.commit() + + def close(self) -> None: + self.conn.close() + + def sync_manifest(self, items: Iterable[TransferItem]) -> None: + rows = [ + (item.canonical_filename, item.remote_path, int(item.remote_size_bytes), item.remote_name) + for item in items + ] + self.conn.executemany( + """ + INSERT INTO transfer_items ( + canonical_filename, remote_path, remote_size_bytes, remote_name, status + ) VALUES (?, ?, ?, ?, 'pending') + ON CONFLICT(canonical_filename) DO UPDATE SET + remote_path=excluded.remote_path, + remote_size_bytes=excluded.remote_size_bytes, + remote_name=excluded.remote_name + """, + rows, + ) + self.conn.commit() + + def reset_stale_in_progress(self) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='pending', + last_error=CASE + WHEN last_error = '' THEN 'Recovered from interrupted transfer' + ELSE last_error || ' | Recovered from interrupted transfer' + END + WHERE status='in_progress' + """ + ) + self.conn.commit() + + def mark_completed_if_present(self, downloads_dir: Path, 
partial_dir: Path) -> None: + cur = self.conn.execute( + "SELECT canonical_filename, remote_size_bytes, status FROM transfer_items" + ) + updates = [] + for canonical_filename, remote_size_bytes, status in cur.fetchall(): + final_path = downloads_dir / canonical_filename + if final_path.exists() and final_path.stat().st_size == int(remote_size_bytes): + updates.append((int(remote_size_bytes), utc_now(), canonical_filename)) + continue + part_path = partial_dir / f"{canonical_filename}.part" + if part_path.exists() and status == "completed": + self.conn.execute( + """ + UPDATE transfer_items + SET status='pending', + last_error='Final file missing; resuming from partial', + transfer_finished_at=NULL + WHERE canonical_filename=? + """, + (canonical_filename,), + ) + if updates: + self.conn.executemany( + """ + UPDATE transfer_items + SET status='completed', + last_seen_size_bytes=?, + transfer_finished_at=?, + last_error='' + WHERE canonical_filename=? + """, + updates, + ) + self.conn.commit() + + def next_item(self, *, max_attempts: int) -> Optional[sqlite3.Row]: + self.conn.row_factory = sqlite3.Row + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? + ORDER BY attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) + return cur.fetchone() + + def mark_in_progress(self, canonical_filename: str, current_size: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='in_progress', + attempts=attempts+1, + transfer_started_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? + """, + (utc_now(), int(current_size), canonical_filename), + ) + self.conn.commit() + + def mark_completed(self, canonical_filename: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='completed', + transfer_finished_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? 
+ """, + (utc_now(), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def mark_failed(self, canonical_filename: str, error: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='failed', + last_error=?, + last_seen_size_bytes=? + WHERE canonical_filename=? + """, + (str(error), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT status, COUNT(*) AS c + FROM transfer_items + GROUP BY status + """ + ) + counts = {"pending": 0, "in_progress": 0, "completed": 0, "failed": 0} + for status, count in cur.fetchall(): + counts[str(status)] = int(count) + counts["total"] = sum(counts.values()) + return counts + + def byte_counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT + COALESCE(SUM(remote_size_bytes), 0) AS bytes_total, + COALESCE(SUM(CASE WHEN status = 'completed' THEN remote_size_bytes ELSE 0 END), 0) AS bytes_completed, + COALESCE(SUM(CASE WHEN status = 'in_progress' THEN last_seen_size_bytes ELSE 0 END), 0) AS bytes_in_progress + FROM transfer_items + """ + ) + row = cur.fetchone() + bytes_total = int(row[0] or 0) + bytes_completed = int(row[1] or 0) + bytes_in_progress = int(row[2] or 0) + bytes_remaining = max(0, bytes_total - bytes_completed) + return { + "bytes_total": bytes_total, + "bytes_completed": bytes_completed, + "bytes_in_progress": bytes_in_progress, + "bytes_remaining": bytes_remaining, + } + + +def read_manifest(path: Path) -> list[TransferItem]: + items: list[TransferItem] = [] + with path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + required = {"canonical_filename", "remote_path", "remote_size_bytes", "remote_name"} + if not required.issubset(reader.fieldnames or set()): + raise SystemExit(f"Manifest missing required columns: {sorted(required)}") + for row in reader: + items.append( + TransferItem( + 
canonical_filename=str(row["canonical_filename"]).strip(), + remote_path=str(row["remote_path"]).strip(), + remote_size_bytes=int(row["remote_size_bytes"]), + remote_name=str(row["remote_name"]).strip(), + ) + ) + return items + + +def write_json(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + os.replace(tmp, path) + + +def append_event(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload, ensure_ascii=False) + "\n") + + +def sftp_one( + *, + remote_host: str, + remote_path: str, + temp_path: Path, + password_env: str, + connect_timeout: int, + io_timeout: int, +) -> subprocess.CompletedProcess[str]: + cmd = [ + "sshpass", + "-e", + "sftp", + "-o", + "BatchMode=no", + "-o", + "PreferredAuthentications=password", + "-o", + "PubkeyAuthentication=no", + "-o", + "KbdInteractiveAuthentication=yes", + "-o", + f"ConnectTimeout={int(connect_timeout)}", + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", + "-o", + "ConnectionAttempts=3", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/greece_box_known_hosts", + "-b", + "-", + remote_host, + ] + env = os.environ.copy() + secret = env.get(password_env) + if not secret: + raise SystemExit(f"Password env var '{password_env}' is not set.") + env["SSHPASS"] = secret + batch = f'reget "{remote_path}" "{temp_path}"\n' + return subprocess.run(cmd, capture_output=True, text=True, env=env, input=batch) + + +def run(argv: Optional[Sequence[str]] = None) -> int: + args = parse_args(argv) + manifest_path = Path(args.manifest).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + downloads_dir = work_root / "downloads" + partial_dir = work_root / "partials" + logs_dir = work_root / 
"logs" + state_dir = work_root / "state" + downloads_dir.mkdir(parents=True, exist_ok=True) + partial_dir.mkdir(parents=True, exist_ok=True) + logs_dir.mkdir(parents=True, exist_ok=True) + state_dir.mkdir(parents=True, exist_ok=True) + + state = TransferState(state_dir / "transfer_state.sqlite3") + items = read_manifest(manifest_path) + if args.limit and int(args.limit) > 0: + items = items[: int(args.limit)] + state.sync_manifest(items) + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads_dir, partial_dir) + + stop_requested = False + + def _handle_signal(signum, _frame) -> None: + nonlocal stop_requested + stop_requested = True + print(f"[transfer] signal {signum} received; stopping after current file", file=sys.stderr) + + signal.signal(signal.SIGINT, _handle_signal) + signal.signal(signal.SIGTERM, _handle_signal) + + last_summary_ts = 0.0 + current_path = state_dir / "current_transfer.json" + summary_path = state_dir / "summary.json" + events_path = logs_dir / "events.jsonl" + + while not stop_requested: + row = state.next_item(max_attempts=int(args.max_attempts)) + if row is None: + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + break + + canonical = str(row["canonical_filename"]) + remote_path = str(row["remote_path"]) + remote_size = int(row["remote_size_bytes"]) + final_path = downloads_dir / canonical + temp_path = partial_dir / f"{canonical}.part" + current_size = temp_path.stat().st_size if temp_path.exists() else 0 + + state.mark_in_progress(canonical, current_size) + write_json( + current_path, + { + "updated_at": utc_now(), + "canonical_filename": canonical, + "remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_path": str(temp_path), + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + ) + append_event( + events_path, + { + "ts": utc_now(), + "event": "start", + "canonical_filename": canonical, + 
"remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + ) + + result = sftp_one( + remote_host=str(args.remote_host), + remote_path=remote_path, + temp_path=temp_path, + password_env=str(args.password_env), + connect_timeout=int(args.connect_timeout), + io_timeout=int(args.io_timeout), + ) + + if result.returncode == 0 and temp_path.exists(): + actual_size = temp_path.stat().st_size + if remote_size > 0 and actual_size != remote_size: + state.mark_failed( + canonical, + f"Size mismatch after transfer: expected {remote_size}, got {actual_size}", + actual_size, + ) + else: + final_path.parent.mkdir(parents=True, exist_ok=True) + os.replace(temp_path, final_path) + state.mark_completed(canonical, actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "completed", + "canonical_filename": canonical, + "size_bytes": actual_size, + }, + ) + else: + actual_size = temp_path.stat().st_size if temp_path.exists() else 0 + error = (result.stderr or result.stdout or "").strip()[-4000:] + state.mark_failed(canonical, error or f"transfer failed with code {result.returncode}", actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "failed", + "canonical_filename": canonical, + "return_code": int(result.returncode), + "partial_size_bytes": actual_size, + "error": error or f"transfer failed with code {result.returncode}", + }, + ) + time.sleep(float(args.sleep_after_failure)) + + now = time.time() + if now - last_summary_ts >= float(args.summary_interval_seconds): + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": False}) + last_summary_ts = now + + if current_path.exists(): + try: + current_path.unlink() + except Exception: + pass + + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + state.close() + return 0 + + +if __name__ == 
"__main__": # pragma: no cover + raise SystemExit(run()) diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py new file mode 100644 index 0000000..f115370 --- /dev/null +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from pathlib import Path + +from glossapi.scripts.openarchives_pdf_stage_pull import TransferItem, TransferState, read_manifest + + +def _write_manifest(path: Path) -> None: + path.write_text( + "\t".join(["canonical_filename", "remote_path", "remote_size_bytes", "remote_name"]) + + "\n" + + "\t".join(["AAA_456.pdf", "/remote/AAA_456.pdf", "10", "AAA_456.pdf"]) + + "\n" + + "\t".join(["VFK_368.pdf", "/remote/VFK_368.pdf.Ac6Dc3BA", "20", "VFK_368.pdf.Ac6Dc3BA"]) + + "\n", + encoding="utf-8", + ) + + +def test_read_manifest_parses_rows(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + + items = read_manifest(manifest) + + assert items == [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("VFK_368.pdf", "/remote/VFK_368.pdf.Ac6Dc3BA", 20, "VFK_368.pdf.Ac6Dc3BA"), + ] + + +def test_transfer_state_resets_stale_and_marks_completed(tmp_path: Path) -> None: + db_path = tmp_path / "state.sqlite3" + downloads = tmp_path / "downloads" + partials = tmp_path / "partials" + downloads.mkdir() + partials.mkdir() + state = TransferState(db_path) + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + + state.mark_in_progress("AAA_456.pdf", 5) + (downloads / "BBB_001.pdf").write_bytes(b"x" * 12) + + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads, partials) + + cur = state.conn.execute( + "SELECT canonical_filename, status, last_seen_size_bytes, last_error FROM transfer_items ORDER BY canonical_filename" + ) + rows = cur.fetchall() + 
assert rows[0][0] == "AAA_456.pdf" + assert rows[0][1] == "pending" + assert "Recovered from interrupted transfer" in rows[0][3] + assert rows[1][0] == "BBB_001.pdf" + assert rows[1][1] == "completed" + assert rows[1][2] == 12 + + counts = state.counts() + assert counts["pending"] == 1 + assert counts["completed"] == 1 + state.close() + + +def test_transfer_state_next_item_respects_attempt_limit(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=25 WHERE canonical_filename='AAA_456.pdf'" + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=2 WHERE canonical_filename='BBB_001.pdf'" + ) + state.conn.commit() + + row = state.next_item(max_attempts=20) + + assert row is not None + assert row["canonical_filename"] == "BBB_001.pdf" + state.close() From 820889865ffbadf51204cc9874a01dc98b177fd7 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 18:20:19 +0300 Subject: [PATCH 41/93] Fix OpenArchives HF refresh card totals --- src/glossapi/scripts/openarchives_hf_refresh.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/glossapi/scripts/openarchives_hf_refresh.py b/src/glossapi/scripts/openarchives_hf_refresh.py index 911c795..133852f 100644 --- a/src/glossapi/scripts/openarchives_hf_refresh.py +++ b/src/glossapi/scripts/openarchives_hf_refresh.py @@ -108,8 +108,8 @@ def _refresh_readme(readme_text: str, *, total_docs: int, needs_ocr_docs: int) - (r"pretty_name:\s*OpenArchives\.gr [^\n]+", f"pretty_name: {title_text}"), (r"# OpenArchives\.gr [^\n]+", f"# {title_text}"), ( - r"- Σύνολο markdown αρχείων: \*\*[0-9,]+\*\* from openarchives\.gr", - f"- Σύνολο markdown αρχείων: **{total_docs:,}** from openarchives.gr", + r"- Σύνολο 
markdown αρχείων: \*\*[0-9,]+\*\* (?:from|από) openarchives\.gr", + f"- Σύνολο markdown αρχείων: **{total_docs:,}** από openarchives.gr", ), ( r"- Total markdown files: \*\*[0-9,]+\*\* from openarchives\.gr", From b966086f793e35710e6648a1f3b7b955caeaef27 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 18:34:44 +0300 Subject: [PATCH 42/93] Prioritize staged OA pulls from dynamic unreachable lists --- .../scripts/openarchives_pdf_stage_pull.py | 139 +++++++++++++++++- tests/test_openarchives_pdf_stage_pull.py | 46 +++++- 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/src/glossapi/scripts/openarchives_pdf_stage_pull.py b/src/glossapi/scripts/openarchives_pdf_stage_pull.py index 4165a08..be7ef55 100644 --- a/src/glossapi/scripts/openarchives_pdf_stage_pull.py +++ b/src/glossapi/scripts/openarchives_pdf_stage_pull.py @@ -4,7 +4,7 @@ import csv import json import os -import shutil +import re import signal import sqlite3 import subprocess @@ -36,6 +36,7 @@ class TransferItem: remote_name TEXT NOT NULL, status TEXT NOT NULL DEFAULT 'pending', attempts INTEGER NOT NULL DEFAULT 0, + priority_rank INTEGER NOT NULL DEFAULT 0, last_error TEXT NOT NULL DEFAULT '', transfer_started_at TEXT, transfer_finished_at TEXT, @@ -43,6 +44,8 @@ class TransferItem: ); """ +PDF_NAME_PATTERN = re.compile(r"([A-Za-z0-9._-]+\.pdf(?:\.[A-Za-z0-9_-]+)?)", re.IGNORECASE) + def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: p = argparse.ArgumentParser( @@ -59,6 +62,11 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: p.add_argument("--sleep-after-failure", type=float, default=10.0) p.add_argument("--summary-interval-seconds", type=float, default=5.0) p.add_argument("--limit", type=int, default=0, help="Optional limit for testing.") + p.add_argument( + "--priority-dir", + default=None, + help="Directory of dynamic priority files or filename lists. 
Items here are transferred first.", + ) return p.parse_args(argv) @@ -69,11 +77,17 @@ def __init__(self, db_path: Path): self.conn = sqlite3.connect(str(self.db_path)) self.conn.execute("PRAGMA journal_mode=WAL") self.conn.execute(SCHEMA) + self._ensure_columns() self.conn.commit() def close(self) -> None: self.conn.close() + def _ensure_columns(self) -> None: + cols = {row[1] for row in self.conn.execute("PRAGMA table_info(transfer_items)").fetchall()} + if "priority_rank" not in cols: + self.conn.execute("ALTER TABLE transfer_items ADD COLUMN priority_rank INTEGER NOT NULL DEFAULT 0") + def sync_manifest(self, items: Iterable[TransferItem]) -> None: rows = [ (item.canonical_filename, item.remote_path, int(item.remote_size_bytes), item.remote_name) @@ -151,7 +165,7 @@ def next_item(self, *, max_attempts: int) -> Optional[sqlite3.Row]: FROM transfer_items WHERE status IN ('pending', 'failed') AND attempts < ? - ORDER BY attempts ASC, canonical_filename ASC + ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC LIMIT 1 """, (max_attempts,), @@ -236,6 +250,46 @@ def byte_counts(self) -> dict[str, int]: "bytes_remaining": bytes_remaining, } + def set_priorities(self, canonical_filenames: set[str]) -> None: + self.conn.execute("UPDATE transfer_items SET priority_rank=0 WHERE priority_rank != 0") + if canonical_filenames: + batch = [] + for name in sorted(canonical_filenames): + batch.append(name) + if len(batch) >= 500: + placeholders = ",".join("?" for _ in batch) + self.conn.execute( + f"UPDATE transfer_items SET priority_rank=100 WHERE canonical_filename IN ({placeholders})", + batch, + ) + batch.clear() + if batch: + placeholders = ",".join("?" 
for _ in batch) + self.conn.execute( + f"UPDATE transfer_items SET priority_rank=100 WHERE canonical_filename IN ({placeholders})", + batch, + ) + self.conn.commit() + + def priority_counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT + COALESCE(SUM(CASE WHEN priority_rank > 0 THEN 1 ELSE 0 END), 0) AS priority_total, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='pending' THEN 1 ELSE 0 END), 0) AS priority_pending, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='completed' THEN 1 ELSE 0 END), 0) AS priority_completed, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='failed' THEN 1 ELSE 0 END), 0) AS priority_failed + FROM transfer_items + """ + ) + row = cur.fetchone() + return { + "priority_total": int(row[0] or 0), + "priority_pending": int(row[1] or 0), + "priority_completed": int(row[2] or 0), + "priority_failed": int(row[3] or 0), + } + def read_manifest(path: Path) -> list[TransferItem]: items: list[TransferItem] = [] @@ -269,6 +323,42 @@ def append_event(path: Path, payload: dict) -> None: handle.write(json.dumps(payload, ensure_ascii=False) + "\n") +def canonicalize_pdf_name(raw: str) -> Optional[str]: + text = os.path.basename(str(raw).strip()) + if not text: + return None + lower = text.lower() + marker = ".pdf." 
+ if marker in lower: + idx = lower.index(marker) + return text[: idx + 4] + if lower.endswith(".pdf"): + return text + return None + + +def load_priority_filenames(priority_dir: Path) -> set[str]: + results: set[str] = set() + if not priority_dir.exists(): + return results + for path in sorted(priority_dir.rglob("*")): + if not path.is_file(): + continue + direct = canonicalize_pdf_name(path.name) + if direct is not None: + results.add(direct) + continue + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except Exception: + continue + for match in PDF_NAME_PATTERN.findall(text): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + def sftp_one( *, remote_host: str, @@ -319,6 +409,7 @@ def run(argv: Optional[Sequence[str]] = None) -> int: args = parse_args(argv) manifest_path = Path(args.manifest).expanduser().resolve() work_root = Path(args.work_root).expanduser().resolve() + priority_dir = Path(args.priority_dir).expanduser().resolve() if args.priority_dir else (work_root / "unreachable_from_source_20260331") downloads_dir = work_root / "downloads" partial_dir = work_root / "partials" logs_dir = work_root / "logs" @@ -335,6 +426,7 @@ def run(argv: Optional[Sequence[str]] = None) -> int: state.sync_manifest(items) state.reset_stale_in_progress() state.mark_completed_if_present(downloads_dir, partial_dir) + manifest_names = {item.canonical_filename for item in items} stop_requested = False @@ -350,11 +442,46 @@ def _handle_signal(signum, _frame) -> None: current_path = state_dir / "current_transfer.json" summary_path = state_dir / "summary.json" events_path = logs_dir / "events.jsonl" + priority_summary_path = state_dir / "priority_summary.json" + priority_available_path = state_dir / "priority_available_in_manifest.txt" + priority_missing_path = state_dir / "priority_missing_in_manifest.txt" + last_priority_set: Optional[set[str]] = None + + def refresh_priorities() -> dict[str, int]: 
+ nonlocal last_priority_set + requested = load_priority_filenames(priority_dir) + if last_priority_set is None or requested != last_priority_set: + available = requested & manifest_names + missing = requested - manifest_names + state.set_priorities(available) + priority_available_path.write_text( + "".join(f"{name}\n" for name in sorted(available)), + encoding="utf-8", + ) + priority_missing_path.write_text( + "".join(f"{name}\n" for name in sorted(missing)), + encoding="utf-8", + ) + write_json( + priority_summary_path, + { + "updated_at": utc_now(), + "priority_dir": str(priority_dir), + "requested_total": len(requested), + "available_in_manifest_total": len(available), + "missing_in_manifest_total": len(missing), + }, + ) + last_priority_set = requested + return state.priority_counts() + + priority_counts = refresh_priorities() while not stop_requested: + priority_counts = refresh_priorities() row = state.next_item(max_attempts=int(args.max_attempts)) if row is None: - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": True}) break canonical = str(row["canonical_filename"]) @@ -439,7 +566,8 @@ def _handle_signal(signum, _frame) -> None: now = time.time() if now - last_summary_ts >= float(args.summary_interval_seconds): - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": False}) + priority_counts = refresh_priorities() + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": False}) last_summary_ts = now if current_path.exists(): @@ -448,7 +576,8 @@ def _handle_signal(signum, _frame) -> None: except Exception: pass - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), "done": True}) + priority_counts = refresh_priorities() + 
write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": True}) state.close() return 0 diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py index f115370..404d4cd 100644 --- a/tests/test_openarchives_pdf_stage_pull.py +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -2,7 +2,13 @@ from pathlib import Path -from glossapi.scripts.openarchives_pdf_stage_pull import TransferItem, TransferState, read_manifest +from glossapi.scripts.openarchives_pdf_stage_pull import ( + TransferItem, + TransferState, + canonicalize_pdf_name, + load_priority_filenames, + read_manifest, +) def _write_manifest(path: Path) -> None: @@ -87,3 +93,41 @@ def test_transfer_state_next_item_respects_attempt_limit(tmp_path: Path) -> None assert row is not None assert row["canonical_filename"] == "BBB_001.pdf" state.close() + + +def test_load_priority_filenames_supports_lists_and_suffix_forms(tmp_path: Path) -> None: + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "manual.txt").write_text( + "AAA_456.pdf\n" + "/tmp/VFK_368.pdf.Ac6Dc3BA\n" + "ignore me\n", + encoding="utf-8", + ) + (priority_dir / "BBB_001.pdf").write_text("", encoding="utf-8") + + names = load_priority_filenames(priority_dir) + + assert names == {"AAA_456.pdf", "VFK_368.pdf", "BBB_001.pdf"} + assert canonicalize_pdf_name("VFK_368.pdf.Ac6Dc3BA") == "VFK_368.pdf" + + +def test_transfer_state_priorities_are_selected_first(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + TransferItem("CCC_002.pdf", "/remote/CCC_002.pdf", 14, "CCC_002.pdf"), + ] + ) + state.set_priorities({"CCC_002.pdf"}) + + row = state.next_item(max_attempts=20) + + assert row is not None + assert row["canonical_filename"] == 
"CCC_002.pdf" + counts = state.priority_counts() + assert counts["priority_total"] == 1 + assert counts["priority_pending"] == 1 + state.close() From e23d4f1829cb020c23731618685ead976cebfb24 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 19:24:07 +0300 Subject: [PATCH 43/93] Add rsync transport for staged OA PDF pulls --- .../scripts/openarchives_pdf_stage_pull.py | 209 ++++++++++++++---- tests/test_openarchives_pdf_stage_pull.py | 58 +++++ 2 files changed, 228 insertions(+), 39 deletions(-) diff --git a/src/glossapi/scripts/openarchives_pdf_stage_pull.py b/src/glossapi/scripts/openarchives_pdf_stage_pull.py index be7ef55..3af4571 100644 --- a/src/glossapi/scripts/openarchives_pdf_stage_pull.py +++ b/src/glossapi/scripts/openarchives_pdf_stage_pull.py @@ -45,6 +45,7 @@ class TransferItem: """ PDF_NAME_PATTERN = re.compile(r"([A-Za-z0-9._-]+\.pdf(?:\.[A-Za-z0-9_-]+)?)", re.IGNORECASE) +FILENAME_KEYS = ("filename", "canonical_filename", "md_filename", "source_filename") def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: @@ -56,6 +57,7 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: p.add_argument("--work-root", required=True, help="Root directory for downloads, partials, logs, and state.") p.add_argument("--remote-host", default="debian@83.212.80.170") p.add_argument("--password-env", default="GREECE_BOX_PASSWORD", help="Environment variable containing the remote SSH password.") + p.add_argument("--transport", choices=("sftp", "rsync"), default="sftp") p.add_argument("--max-attempts", type=int, default=20) p.add_argument("--connect-timeout", type=int, default=30) p.add_argument("--io-timeout", type=int, default=180) @@ -323,6 +325,40 @@ def append_event(path: Path, payload: dict) -> None: handle.write(json.dumps(payload, ensure_ascii=False) + "\n") +def sshpass_env(password_env: str) -> dict[str, str]: + env = os.environ.copy() + secret = env.get(password_env) + if not secret: + raise 
SystemExit(f"Password env var '{password_env}' is not set.") + env["SSHPASS"] = secret + return env + + +def ssh_transport_options(connect_timeout: int) -> list[str]: + return [ + "-o", + "BatchMode=no", + "-o", + "PreferredAuthentications=password", + "-o", + "PubkeyAuthentication=no", + "-o", + "KbdInteractiveAuthentication=yes", + "-o", + f"ConnectTimeout={int(connect_timeout)}", + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", + "-o", + "ConnectionAttempts=3", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/greece_box_known_hosts", + ] + + def canonicalize_pdf_name(raw: str) -> Optional[str]: text = os.path.basename(str(raw).strip()) if not text: @@ -337,6 +373,74 @@ def canonicalize_pdf_name(raw: str) -> Optional[str]: return None +def _walk_json_strings(obj) -> Iterable[str]: + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(key, str): + yield key + yield from _walk_json_strings(value) + elif isinstance(obj, list): + for item in obj: + yield from _walk_json_strings(item) + elif isinstance(obj, str): + yield obj + + +def _extract_priority_filenames_from_csv(path: Path) -> set[str]: + results: set[str] = set() + with path.open("r", encoding="utf-8", errors="ignore", newline="") as handle: + reader = csv.DictReader(handle) + fields = {field.strip() for field in (reader.fieldnames or []) if field} + keyed = any(key in fields for key in FILENAME_KEYS) + for row in reader: + if keyed: + for key in FILENAME_KEYS: + value = row.get(key) + if value: + canonical = canonicalize_pdf_name(value) + if canonical is not None: + results.add(canonical) + break + else: + for value in row.values(): + if not value: + continue + for match in PDF_NAME_PATTERN.findall(str(value)): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + +def _extract_priority_filenames_from_json(path: Path) -> set[str]: + results: set[str] = set() + data = 
json.loads(path.read_text(encoding="utf-8", errors="ignore")) + for text in _walk_json_strings(data): + canonical = canonicalize_pdf_name(text) + if canonical is not None: + results.add(canonical) + continue + for match in PDF_NAME_PATTERN.findall(text): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + +def _extract_priority_filenames_from_text(path: Path) -> set[str]: + results: set[str] = set() + text = path.read_text(encoding="utf-8", errors="ignore") + for line in text.splitlines(): + canonical = canonicalize_pdf_name(line) + if canonical is not None: + results.add(canonical) + for match in PDF_NAME_PATTERN.findall(text): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + def load_priority_filenames(priority_dir: Path) -> set[str]: results: set[str] = set() if not priority_dir.exists(): @@ -348,17 +452,60 @@ def load_priority_filenames(priority_dir: Path) -> set[str]: if direct is not None: results.add(direct) continue + suffix = path.suffix.lower() try: - text = path.read_text(encoding="utf-8", errors="ignore") + if suffix == ".csv": + results.update(_extract_priority_filenames_from_csv(path)) + elif suffix == ".json": + results.update(_extract_priority_filenames_from_json(path)) + elif suffix in {".txt", ".list", ".lst", ".log"}: + results.update(_extract_priority_filenames_from_text(path)) + else: + continue except Exception: continue - for match in PDF_NAME_PATTERN.findall(text): - canonical = canonicalize_pdf_name(match) - if canonical is not None: - results.add(canonical) return results +def rsync_one( + *, + remote_host: str, + remote_path: str, + temp_path: Path, + password_env: str, + connect_timeout: int, + io_timeout: int, +) -> subprocess.CompletedProcess[str]: + ssh_cmd = ( + "ssh " + "-o BatchMode=no " + "-o PreferredAuthentications=password " + "-o PubkeyAuthentication=no " + "-o KbdInteractiveAuthentication=yes " 
+ f"-o ConnectTimeout={int(connect_timeout)} " + "-o ServerAliveInterval=15 " + "-o ServerAliveCountMax=3 " + "-o ConnectionAttempts=3 " + "-o StrictHostKeyChecking=no " + "-o UserKnownHostsFile=/tmp/greece_box_known_hosts" + ) + cmd = [ + "sshpass", + "-e", + "rsync", + "-av", + "--partial", + "--append-verify", + "--inplace", + f"--timeout={int(io_timeout)}", + "-e", + ssh_cmd, + f"{remote_host}:{remote_path}", + str(temp_path), + ] + return subprocess.run(cmd, capture_output=True, text=True, env=sshpass_env(password_env)) + + def sftp_one( *, remote_host: str, @@ -372,37 +519,13 @@ def sftp_one( "sshpass", "-e", "sftp", - "-o", - "BatchMode=no", - "-o", - "PreferredAuthentications=password", - "-o", - "PubkeyAuthentication=no", - "-o", - "KbdInteractiveAuthentication=yes", - "-o", - f"ConnectTimeout={int(connect_timeout)}", - "-o", - "ServerAliveInterval=15", - "-o", - "ServerAliveCountMax=3", - "-o", - "ConnectionAttempts=3", - "-o", - "StrictHostKeyChecking=no", - "-o", - "UserKnownHostsFile=/tmp/greece_box_known_hosts", + *ssh_transport_options(connect_timeout), "-b", "-", remote_host, ] - env = os.environ.copy() - secret = env.get(password_env) - if not secret: - raise SystemExit(f"Password env var '{password_env}' is not set.") - env["SSHPASS"] = secret batch = f'reget "{remote_path}" "{temp_path}"\n' - return subprocess.run(cmd, capture_output=True, text=True, env=env, input=batch) + return subprocess.run(cmd, capture_output=True, text=True, env=sshpass_env(password_env), input=batch) def run(argv: Optional[Sequence[str]] = None) -> int: @@ -496,6 +619,7 @@ def refresh_priorities() -> dict[str, int]: current_path, { "updated_at": utc_now(), + "transport": str(args.transport), "canonical_filename": canonical, "remote_path": remote_path, "remote_size_bytes": remote_size, @@ -509,6 +633,7 @@ def refresh_priorities() -> dict[str, int]: { "ts": utc_now(), "event": "start", + "transport": str(args.transport), "canonical_filename": canonical, "remote_path": 
remote_path, "remote_size_bytes": remote_size, @@ -517,14 +642,18 @@ def refresh_priorities() -> dict[str, int]: }, ) - result = sftp_one( - remote_host=str(args.remote_host), - remote_path=remote_path, - temp_path=temp_path, - password_env=str(args.password_env), - connect_timeout=int(args.connect_timeout), - io_timeout=int(args.io_timeout), - ) + transfer_kwargs = { + "remote_host": str(args.remote_host), + "remote_path": remote_path, + "temp_path": temp_path, + "password_env": str(args.password_env), + "connect_timeout": int(args.connect_timeout), + "io_timeout": int(args.io_timeout), + } + if str(args.transport) == "rsync": + result = rsync_one(**transfer_kwargs) + else: + result = sftp_one(**transfer_kwargs) if result.returncode == 0 and temp_path.exists(): actual_size = temp_path.stat().st_size @@ -543,6 +672,7 @@ def refresh_priorities() -> dict[str, int]: { "ts": utc_now(), "event": "completed", + "transport": str(args.transport), "canonical_filename": canonical, "size_bytes": actual_size, }, @@ -556,6 +686,7 @@ def refresh_priorities() -> dict[str, int]: { "ts": utc_now(), "event": "failed", + "transport": str(args.transport), "canonical_filename": canonical, "return_code": int(result.returncode), "partial_size_bytes": actual_size, diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py index 404d4cd..52e63ed 100644 --- a/tests/test_openarchives_pdf_stage_pull.py +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -1,5 +1,6 @@ from __future__ import annotations +import subprocess from pathlib import Path from glossapi.scripts.openarchives_pdf_stage_pull import ( @@ -8,6 +9,7 @@ canonicalize_pdf_name, load_priority_filenames, read_manifest, + run, ) @@ -131,3 +133,59 @@ def test_transfer_state_priorities_are_selected_first(tmp_path: Path) -> None: assert counts["priority_total"] == 1 assert counts["priority_pending"] == 1 state.close() + + +def 
test_load_priority_filenames_ignores_parquet_and_reads_csv_columns(tmp_path: Path) -> None: + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "unreachable_from_source_20260331.csv").write_text( + "filename,source_unreachable_reason\n" + "ZFV_051.pdf,connect_timeout\n" + "ZGA_056.pdf,connect_timeout\n", + encoding="utf-8", + ) + (priority_dir / "unreachable_from_source_20260331.parquet").write_bytes(b"PAR1junkZXY_999.pdfjunk") + + names = load_priority_filenames(priority_dir) + + assert names == {"ZFV_051.pdf", "ZGA_056.pdf"} + + +def test_run_uses_rsync_transport_when_requested(tmp_path: Path, monkeypatch) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + work_root = tmp_path / "work" + seen: list[str] = [] + + def _fake_rsync_one(**kwargs): + seen.append("rsync") + Path(kwargs["temp_path"]).parent.mkdir(parents=True, exist_ok=True) + Path(kwargs["temp_path"]).write_bytes(b"x" * 10) + return subprocess.CompletedProcess(args=["rsync"], returncode=0, stdout="", stderr="") + + def _fake_sftp_one(**kwargs): + seen.append("sftp") + return subprocess.CompletedProcess(args=["sftp"], returncode=1, stdout="", stderr="unexpected") + + monkeypatch.setenv("GREECE_BOX_PASSWORD", "secret") + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.rsync_one", _fake_rsync_one) + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.sftp_one", _fake_sftp_one) + + rc = run( + [ + "--manifest", + str(manifest), + "--work-root", + str(work_root), + "--transport", + "rsync", + "--limit", + "1", + "--summary-interval-seconds", + "0", + ] + ) + + assert rc == 0 + assert seen == ["rsync"] + assert (work_root / "downloads" / "AAA_456.pdf").exists() From 93f5d1255627d1b5f2cca0dd76a5619fc0a32bae Mon Sep 17 00:00:00 2001 From: fffoivos Date: Tue, 31 Mar 2026 23:34:16 +0300 Subject: [PATCH 44/93] Add cutoff-based OpenArchives OCR sharding --- .../scripts/openarchives_ocr_cutoff_shards.py | 194 ++++++++++++++++++ 
.../scripts/openarchives_ocr_merge.py | 51 ++++- .../scripts/openarchives_ocr_run_node.py | 103 +++++++--- tests/test_openarchives_ocr_shards.py | 99 ++++++++- 4 files changed, 420 insertions(+), 27 deletions(-) create mode 100644 src/glossapi/scripts/openarchives_ocr_cutoff_shards.py diff --git a/src/glossapi/scripts/openarchives_ocr_cutoff_shards.py b/src/glossapi/scripts/openarchives_ocr_cutoff_shards.py new file mode 100644 index 0000000..8548faa --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_cutoff_shards.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import argparse +import hashlib +import json +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +import pandas as pd + +from glossapi.scripts.openarchives_ocr_shards import ( + PAGE_COLUMN_CANDIDATES, + _assign_rows, + _coerce_bool_series, + _resolve_page_column, + _resolve_targets, +) + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_cutoff_shards", + description=( + "Build OCR shard manifests from the materialized local PDFs available at a cutoff, " + "plus residual manifests for missing OCR targets." 
+ ), + ) + p.add_argument("--parquet", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--local-download-root", action="append", default=[]) + p.add_argument("--nodes", type=int, default=4) + p.add_argument("--pages-per-hour-per-node", type=float, default=50700.0) + p.add_argument("--filename-column", default="filename") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument("--page-column", default=None) + p.add_argument("--allow-threshold-derive", action="store_true") + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + p.add_argument("--key-column", default="source_doc_id") + p.add_argument("--cutoff-id", default="") + return p.parse_args(argv) + + +def _canonical_stem_from_row(row: pd.Series, filename_column: str) -> str: + if "filename_base" in row.index and str(row.get("filename_base") or "").strip(): + return str(row.get("filename_base")).strip() + return Path(str(row.get(filename_column) or "")).stem + + +def _scan_local_pdfs(roots: Sequence[Path]) -> Dict[str, Tuple[Path, Path]]: + available: Dict[str, Tuple[Path, Path]] = {} + for root in roots: + root = root.expanduser().resolve() + if not root.exists(): + continue + for pdf in sorted(p for p in root.rglob("*.pdf") if p.is_file()): + stem = pdf.stem + if stem not in available: + available[stem] = (root, pdf) + return available + + +def _stable_item_id(cutoff_id: str, key_value: str, stem: str) -> str: + payload = f"{cutoff_id}|{key_value}|{stem}" + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + local_roots = [Path(p).expanduser().resolve() for p in (args.local_download_root or [])] + if not local_roots: + raise 
SystemExit("Pass at least one --local-download-root.") + + df = pd.read_parquet(parquet_path).copy() + if args.filename_column not in df.columns: + raise SystemExit(f"Filename column '{args.filename_column}' not found in parquet.") + + page_column = _resolve_page_column(df, args.page_column) + target_mask = _resolve_targets( + df, + needs_ocr_column=str(args.needs_ocr_column), + allow_threshold_derive=bool(args.allow_threshold_derive), + greek_threshold=float(args.greek_threshold), + mojibake_threshold=float(args.mojibake_threshold), + ) + target_df = df.loc[target_mask].copy() + if target_df.empty: + raise SystemExit("No OCR target rows selected at cutoff.") + + cutoff_id = str(args.cutoff_id or pd.Timestamp.utcnow().strftime("%Y%m%dT%H%M%SZ")) + target_df["filename_base"] = target_df.apply( + lambda row: _canonical_stem_from_row(row, str(args.filename_column)), + axis=1, + ) + available = _scan_local_pdfs(local_roots) + + rows_available: List[Dict[str, object]] = [] + rows_missing: List[Dict[str, object]] = [] + key_column = str(args.key_column) + preserve_columns = [c for c in target_df.columns if c not in {"filename_base"}] + + for row in target_df.to_dict(orient="records"): + stem = str(row.get("filename_base") or "") + key_value = str(row.get(key_column) or stem or row.get(args.filename_column) or "") + base = {col: row.get(col) for col in preserve_columns} + item_id = _stable_item_id(cutoff_id, key_value, stem) + if stem in available: + root, pdf_path = available[stem] + rel_path = pdf_path.relative_to(root) + out = dict(base) + out["source_filename"] = str(row.get(args.filename_column) or "") + out["filename"] = pdf_path.name + out["md_filename"] = f"{stem}.md" + out["filename_base"] = stem + out["ocr_item_id"] = item_id + out["ocr_cutoff_id"] = cutoff_id + out["local_pdf_path"] = str(pdf_path) + out["local_pdf_root"] = str(root) + out["local_pdf_relpath"] = str(rel_path) + out["available_at_cutoff"] = True + rows_available.append(out) + else: + out = 
dict(base) + out["filename_base"] = stem + out["ocr_item_id"] = item_id + out["ocr_cutoff_id"] = cutoff_id + out["available_at_cutoff"] = False + rows_missing.append(out) + + available_df = pd.DataFrame(rows_available) + missing_df = pd.DataFrame(rows_missing) + available_path = output_dir / "openarchives_ocr_available_at_cutoff.parquet" + missing_path = output_dir / "openarchives_ocr_missing_at_cutoff.parquet" + if not available_df.empty: + bins = _assign_rows(available_df, page_column=page_column, node_count=int(args.nodes)) + else: + bins = [] + + summaries: List[Dict[str, object]] = [] + total_pages = 0 + total_docs = 0 + for node in bins: + node_id = int(node["node_id"]) + node_df = pd.DataFrame(list(node["rows"])) + if "_pages_int" in node_df.columns: + node_df = node_df.drop(columns=["_pages_int"]) + node_df["shard_id"] = f"node-{node_id:02d}" + node_df["node_id"] = node_id + out_path = output_dir / f"openarchives_ocr_shard_node_{node_id:02d}.parquet" + node_df.to_parquet(out_path, index=False) + node_pages = int(node["pages_total"]) + node_docs = int(node["docs_total"]) + total_pages += node_pages + total_docs += node_docs + summaries.append( + { + "node_id": node_id, + "manifest_path": str(out_path), + "docs_total": node_docs, + "pages_total": node_pages, + "eta_hours_at_validated_speed": float(node_pages / float(args.pages_per_hour_per_node)), + } + ) + + available_df.to_parquet(available_path, index=False) + missing_df.to_parquet(missing_path, index=False) + overall = { + "source_parquet": str(parquet_path), + "cutoff_id": cutoff_id, + "nodes": int(args.nodes), + "key_column": key_column, + "filename_column": str(args.filename_column), + "page_column": str(page_column), + "available_docs_total": int(len(available_df)), + "available_pages_total": int(total_pages), + "missing_docs_total": int(len(missing_df)), + "missing_pages_total": int(pd.to_numeric(missing_df.get(page_column, pd.Series(dtype=float)), errors="coerce").fillna(0).sum()) if not 
missing_df.empty else 0, + "pages_per_hour_per_node": float(args.pages_per_hour_per_node), + "eta_hours_one_node": float(total_pages / float(args.pages_per_hour_per_node)) if total_pages else 0.0, + "eta_hours_all_nodes": float(total_pages / (float(args.pages_per_hour_per_node) * max(1, int(args.nodes)))) if total_pages else 0.0, + "available_manifest_path": str(available_path), + "missing_manifest_path": str(missing_path), + "node_summaries": summaries, + } + (output_dir / "openarchives_ocr_cutoff_summary.json").write_text(json.dumps(overall, indent=2), encoding="utf-8") + print(json.dumps(overall, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_merge.py b/src/glossapi/scripts/openarchives_ocr_merge.py index a66f564..a06af90 100644 --- a/src/glossapi/scripts/openarchives_ocr_merge.py +++ b/src/glossapi/scripts/openarchives_ocr_merge.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import shutil from pathlib import Path from typing import List @@ -16,6 +17,9 @@ def _parse_args(argv: List[str] | None = None) -> argparse.Namespace: p.add_argument("--shard-parquets", nargs="+", required=True) p.add_argument("--output-parquet", required=True) p.add_argument("--key-column", default="filename") + p.add_argument("--preserve-master-columns", default="") + p.add_argument("--artifact-work-roots", nargs="*", default=[]) + p.add_argument("--artifact-output-root", default="") return p.parse_args(argv) @@ -25,12 +29,45 @@ def _normalize_key(df: pd.DataFrame, key: str) -> pd.Series: return df[key].astype(str).str.strip() +def _copy_artifacts( + *, + shard_rows: pd.DataFrame, + work_roots: List[Path], + output_root: Path, +) -> int: + copied = 0 + markdown_out = output_root / "markdown" + metrics_out = output_root / "json" / "metrics" + markdown_out.mkdir(parents=True, exist_ok=True) + metrics_out.mkdir(parents=True, exist_ok=True) + for row in 
shard_rows.to_dict(orient="records"): + stem = str(row.get("filename_base") or Path(str(row.get("filename") or "")).stem).strip() + if not stem: + continue + md_name = str(row.get("md_filename") or f"{stem}.md") + for root in work_roots: + md_src = root / "markdown" / f"{stem}.md" + if md_src.exists(): + shutil.copy2(md_src, markdown_out / md_name) + copied += 1 + break + for suffix in (".metrics.json", ".per_page.metrics.json"): + for root in work_roots: + src = root / "json" / "metrics" / f"{stem}{suffix}" + if src.exists(): + shutil.copy2(src, metrics_out / src.name) + copied += 1 + break + return copied + + def main(argv: List[str] | None = None) -> int: args = _parse_args(argv) master_path = Path(args.master_parquet).expanduser().resolve() out_path = Path(args.output_parquet).expanduser().resolve() out_path.parent.mkdir(parents=True, exist_ok=True) + preserve_master_columns = [c.strip() for c in str(args.preserve_master_columns or "").split(",") if c.strip()] master = pd.read_parquet(master_path).copy() master["_merge_key"] = _normalize_key(master, str(args.key_column)) @@ -48,13 +85,21 @@ def main(argv: List[str] | None = None) -> int: for column in shards.columns: if column == "_merge_key": continue + if column in preserve_master_columns: + continue master.loc[shards.index, column] = shards[column] master = master.reset_index(drop=True).drop(columns=["_merge_key"], errors="ignore") master.to_parquet(out_path, index=False) - print( - f"Merged {len(shards)} shard row(s) into {master_path} -> {out_path}" - ) + copied = 0 + if args.artifact_work_roots and str(args.artifact_output_root or "").strip(): + roots = [Path(p).expanduser().resolve() for p in args.artifact_work_roots] + copied = _copy_artifacts( + shard_rows=shards.reset_index(drop=True), + work_roots=roots, + output_root=Path(args.artifact_output_root).expanduser().resolve(), + ) + print(f"Merged {len(shards)} shard row(s) into {master_path} -> {out_path}; copied {copied} artifact file(s)") return 0 
diff --git a/src/glossapi/scripts/openarchives_ocr_run_node.py b/src/glossapi/scripts/openarchives_ocr_run_node.py index 6970161..4ffdf41 100644 --- a/src/glossapi/scripts/openarchives_ocr_run_node.py +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -42,6 +42,7 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--instance-id", default="") p.add_argument("--node-id", default="") p.add_argument("--dry-run", action="store_true") + p.add_argument("--skip-download", action="store_true") p.add_argument("--scheduler", default="whole_doc") p.add_argument("--target-batch-pages", type=int, default=160) p.add_argument("--shard-pages", type=int, default=0) @@ -81,6 +82,15 @@ def _prepare_download_input(df: pd.DataFrame) -> pd.DataFrame: return out +def _prepare_materialized_input(df: pd.DataFrame) -> pd.DataFrame: + if "filename" not in df.columns: + raise SystemExit("Shard parquet missing required column: filename") + out = df.copy() + if "filename_base" not in out.columns: + out["filename_base"] = out["filename"].astype(str).map(lambda s: Path(s).stem) + return out + + def _load_frame(path: Path) -> pd.DataFrame: return pd.read_parquet(path).copy() @@ -132,6 +142,32 @@ def _write_canonical_metadata(work_root: Path, df: pd.DataFrame) -> Path: return canonical +def _normalize_materialized_results( + *, + shard_df: pd.DataFrame, + downloads_dir: Path, +) -> pd.DataFrame: + out = shard_df.copy() + if "filename_base" not in out.columns: + out["filename_base"] = out["filename"].astype(str).map(lambda s: Path(s).stem) + if "local_pdf_path" in out.columns: + local_exists = out["local_pdf_path"].astype(str).map(lambda s: Path(s).exists()) + else: + local_exists = out["filename"].astype(str).map(lambda s: (downloads_dir / s).exists()) + out["download_success"] = local_exists.astype(bool) + out["download_error"] = out["download_success"].map(lambda ok: "" if ok else "materialized_pdf_missing") + if "needs_ocr" not in out.columns: + 
out["needs_ocr"] = True + if "ocr_success" not in out.columns: + out["ocr_success"] = False + if "url" not in out.columns: + if "pdf_url" in out.columns: + out["url"] = out["pdf_url"].fillna("").astype(str) + else: + out["url"] = "" + return out + + def _read_progress(parquet_path: Path, page_col: str = "page_count_source") -> Dict[str, Any]: try: df = pd.read_parquet(parquet_path) @@ -218,14 +254,26 @@ def main(argv: Optional[List[str]] = None) -> int: manifests_dir = work_root / "manifests" manifests_dir.mkdir(parents=True, exist_ok=True) - shard_df = _prepare_download_input(_load_frame(shard_path)) + raw_shard_df = _load_frame(shard_path) + shard_df = ( + _prepare_materialized_input(raw_shard_df) + if args.skip_download + else _prepare_download_input(raw_shard_df) + ) download_input = manifests_dir / "download_input.parquet" - shard_df.to_parquet(download_input, index=False) + if not args.skip_download: + shard_df.to_parquet(download_input, index=False) metadata_path = work_root / "download_results" / "download_results.parquet" if not metadata_path.exists(): metadata_path.parent.mkdir(parents=True, exist_ok=True) - _write_canonical_metadata(work_root, shard_df) + if args.skip_download: + _write_canonical_metadata( + work_root, + _normalize_materialized_results(shard_df=shard_df, downloads_dir=work_root / "downloads"), + ) + else: + _write_canonical_metadata(work_root, shard_df) heartbeat: Optional[_HeartbeatThread] = None if args.heartbeat_path: @@ -248,27 +296,36 @@ def main(argv: Optional[List[str]] = None) -> int: heartbeat.set_stage("dry_run") return 0 - corpus = Corpus( - input_dir=work_root / "downloads", - output_dir=work_root, - metadata_path=metadata_path, - log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), - verbose=False, - ) + if args.skip_download: + if heartbeat: + heartbeat.set_stage("materialized") + canonical_df = _normalize_materialized_results( + shard_df=shard_df, + downloads_dir=work_root / "downloads", + ) + 
metadata_path = _write_canonical_metadata(work_root, canonical_df) + else: + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) - if heartbeat: - heartbeat.set_stage("download") - dl_df = corpus.download( - input_parquet=download_input, - links_column="url", - parallelize_by=str(args.download_group_by), - concurrency=int(args.download_concurrency), - request_timeout=int(args.download_timeout), - scheduler_mode=str(args.download_scheduler_mode), - download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), - ) - canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") - metadata_path = _write_canonical_metadata(work_root, canonical_df) + if heartbeat: + heartbeat.set_stage("download") + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), + request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), + download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), + ) + canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") + metadata_path = _write_canonical_metadata(work_root, canonical_df) if heartbeat: heartbeat.parquet_path = metadata_path heartbeat.set_stage("ocr") diff --git a/tests/test_openarchives_ocr_shards.py b/tests/test_openarchives_ocr_shards.py index 314b785..f724c68 100644 --- a/tests/test_openarchives_ocr_shards.py +++ b/tests/test_openarchives_ocr_shards.py @@ -5,7 +5,7 @@ import pandas as pd -from glossapi.scripts import openarchives_ocr_merge, openarchives_ocr_shards +from glossapi.scripts import openarchives_ocr_cutoff_shards, 
openarchives_ocr_merge, openarchives_ocr_shards def test_openarchives_ocr_shards_balances_pages(tmp_path: Path) -> None: @@ -78,3 +78,100 @@ def test_openarchives_ocr_merge_updates_master(tmp_path: Path) -> None: assert bool(merged.loc["a.pdf", "needs_ocr"]) is False assert int(merged.loc["a.pdf", "ocr_node_id"]) == 2 assert bool(merged.loc["b.pdf", "ocr_success"]) is False + + +def test_openarchives_ocr_cutoff_shards_uses_only_available_local_pdfs(tmp_path: Path) -> None: + df = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "filename_base": "A", "needs_ocr": True, "pages_total_source": 100}, + {"source_doc_id": "doc-2", "filename": "b.html", "filename_base": "B", "needs_ocr": True, "pages_total_source": 50}, + {"source_doc_id": "doc-3", "filename": "c.html", "filename_base": "C", "needs_ocr": False, "pages_total_source": 999}, + ] + ) + source = tmp_path / "master.parquet" + downloads = tmp_path / "downloads" + downloads.mkdir() + (downloads / "A.pdf").write_bytes(b"%PDF-1.4\n") + df.to_parquet(source, index=False) + + out_dir = tmp_path / "cutoff" + rc = openarchives_ocr_cutoff_shards.main( + [ + "--parquet", + str(source), + "--output-dir", + str(out_dir), + "--local-download-root", + str(downloads), + "--nodes", + "2", + "--cutoff-id", + "cutoff-x", + ] + ) + assert rc == 0 + summary = json.loads((out_dir / "openarchives_ocr_cutoff_summary.json").read_text()) + assert summary["available_docs_total"] == 1 + assert summary["missing_docs_total"] == 1 + shard = pd.read_parquet(out_dir / "openarchives_ocr_shard_node_00.parquet") + assert shard.loc[0, "source_filename"] == "a.html" + assert shard.loc[0, "filename"] == "A.pdf" + assert shard.loc[0, "md_filename"] == "A.md" + assert bool(shard.loc[0, "available_at_cutoff"]) is True + missing = pd.read_parquet(out_dir / "openarchives_ocr_missing_at_cutoff.parquet") + assert set(missing["source_doc_id"]) == {"doc-2"} + + +def test_openarchives_ocr_merge_copies_markdown_artifacts(tmp_path: Path) -> 
None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "md_filename": "a.md", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": "doc-1", + "filename": "A.pdf", + "filename_base": "A", + "md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + (work_root / "markdown").mkdir(parents=True) + (work_root / "json" / "metrics").mkdir(parents=True) + (work_root / "markdown" / "A.md").write_text("ocr text", encoding="utf-8") + (work_root / "json" / "metrics" / "A.metrics.json").write_text("{}", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--preserve-master-columns", + "filename,md_filename", + "--artifact-work-roots", + str(work_root), + "--artifact-output-root", + str(tmp_path / "final"), + ] + ) + assert rc == 0 + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "filename"] == "a.html" + assert bool(merged.loc["doc-1", "ocr_success"]) is True + assert (tmp_path / "final" / "markdown" / "A.md").exists() + assert (tmp_path / "final" / "json" / "metrics" / "A.metrics.json").exists() From dbe14b7a5568a88bab62c5607eea567946a463bc Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 1 Apr 2026 13:24:30 +0300 Subject: [PATCH 45/93] Add priority-only mode for staged OA pulls --- .../scripts/openarchives_pdf_stage_pull.py | 52 +++++++++++----- tests/test_openarchives_pdf_stage_pull.py | 61 +++++++++++++++++++ 2 files changed, 97 insertions(+), 16 deletions(-) diff --git 
a/src/glossapi/scripts/openarchives_pdf_stage_pull.py b/src/glossapi/scripts/openarchives_pdf_stage_pull.py index 3af4571..330f0c5 100644 --- a/src/glossapi/scripts/openarchives_pdf_stage_pull.py +++ b/src/glossapi/scripts/openarchives_pdf_stage_pull.py @@ -69,6 +69,11 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: default=None, help="Directory of dynamic priority files or filename lists. Items here are transferred first.", ) + p.add_argument( + "--priority-only", + action="store_true", + help="Transfer only files currently present in the priority set; do not fall through to the rest of the manifest.", + ) return p.parse_args(argv) @@ -159,19 +164,33 @@ def mark_completed_if_present(self, downloads_dir: Path, partial_dir: Path) -> N ) self.conn.commit() - def next_item(self, *, max_attempts: int) -> Optional[sqlite3.Row]: + def next_item(self, *, max_attempts: int, priority_only: bool = False) -> Optional[sqlite3.Row]: self.conn.row_factory = sqlite3.Row - cur = self.conn.execute( - """ - SELECT * - FROM transfer_items - WHERE status IN ('pending', 'failed') - AND attempts < ? - ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC - LIMIT 1 - """, - (max_attempts,), - ) + if priority_only: + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? + AND priority_rank > 0 + ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) + else: + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? 
+ ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) return cur.fetchone() def mark_in_progress(self, canonical_filename: str, current_size: int) -> None: @@ -590,6 +609,7 @@ def refresh_priorities() -> dict[str, int]: { "updated_at": utc_now(), "priority_dir": str(priority_dir), + "priority_only": bool(args.priority_only), "requested_total": len(requested), "available_in_manifest_total": len(available), "missing_in_manifest_total": len(missing), @@ -602,9 +622,9 @@ def refresh_priorities() -> dict[str, int]: while not stop_requested: priority_counts = refresh_priorities() - row = state.next_item(max_attempts=int(args.max_attempts)) + row = state.next_item(max_attempts=int(args.max_attempts), priority_only=bool(args.priority_only)) if row is None: - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": True}) + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": True}) break canonical = str(row["canonical_filename"]) @@ -698,7 +718,7 @@ def refresh_priorities() -> dict[str, int]: now = time.time() if now - last_summary_ts >= float(args.summary_interval_seconds): priority_counts = refresh_priorities() - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": False}) + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": False}) last_summary_ts = now if current_path.exists(): @@ -708,7 +728,7 @@ def refresh_priorities() -> dict[str, int]: pass priority_counts = refresh_priorities() - write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "done": True}) + write_json(summary_path, {"updated_at": utc_now(), 
**state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": True}) state.close() return 0 diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py index 52e63ed..fbdfbed 100644 --- a/tests/test_openarchives_pdf_stage_pull.py +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -135,6 +135,28 @@ def test_transfer_state_priorities_are_selected_first(tmp_path: Path) -> None: state.close() +def test_transfer_state_priority_only_skips_non_priority(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + state.set_priorities({"BBB_001.pdf"}) + + row = state.next_item(max_attempts=20, priority_only=True) + assert row is not None + assert row["canonical_filename"] == "BBB_001.pdf" + + state.mark_in_progress("BBB_001.pdf", 0) + state.mark_completed("BBB_001.pdf", 12) + + row2 = state.next_item(max_attempts=20, priority_only=True) + assert row2 is None + state.close() + + def test_load_priority_filenames_ignores_parquet_and_reads_csv_columns(tmp_path: Path) -> None: priority_dir = tmp_path / "priority" priority_dir.mkdir() @@ -189,3 +211,42 @@ def _fake_sftp_one(**kwargs): assert rc == 0 assert seen == ["rsync"] assert (work_root / "downloads" / "AAA_456.pdf").exists() + + +def test_run_priority_only_ignores_non_priority_items(tmp_path: Path, monkeypatch) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + work_root = tmp_path / "work" + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "priority.csv").write_text("filename\nVFK_368.pdf\n", encoding="utf-8") + seen: list[str] = [] + + def _fake_sftp_one(**kwargs): + seen.append(Path(kwargs["remote_path"]).name) + size = 20 if "VFK_368" in kwargs["remote_path"] else 10 + 
Path(kwargs["temp_path"]).parent.mkdir(parents=True, exist_ok=True) + Path(kwargs["temp_path"]).write_bytes(b"x" * size) + return subprocess.CompletedProcess(args=["sftp"], returncode=0, stdout="", stderr="") + + monkeypatch.setenv("GREECE_BOX_PASSWORD", "secret") + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.sftp_one", _fake_sftp_one) + + rc = run( + [ + "--manifest", + str(manifest), + "--work-root", + str(work_root), + "--priority-dir", + str(priority_dir), + "--priority-only", + "--summary-interval-seconds", + "0", + ] + ) + + assert rc == 0 + assert seen == ["VFK_368.pdf.Ac6Dc3BA"] + assert (work_root / "downloads" / "VFK_368.pdf").exists() + assert not (work_root / "downloads" / "AAA_456.pdf").exists() From e38b189277c0a049c858fbc3a79cbaa87ca4f4d9 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 1 Apr 2026 14:58:37 +0300 Subject: [PATCH 46/93] Document OA single-machine download and supplement runbook --- .../openarchives_ocr_rollout_plan.md | 4 + ...rchives_single_machine_download_runbook.md | 410 ++++++++++++++++++ 2 files changed, 414 insertions(+) create mode 100644 docs/operations/openarchives_single_machine_download_runbook.md diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 590d18b..6fa9a85 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -2,6 +2,10 @@ This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. 
+For the reproducible single-machine source-download plus Greek supplementation workflow, see: + +- [openarchives_single_machine_download_runbook.md](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/docs/operations/openarchives_single_machine_download_runbook.md) + ## Implemented tooling The rollout is backed by concrete scripts in `src/glossapi/scripts/`: diff --git a/docs/operations/openarchives_single_machine_download_runbook.md b/docs/operations/openarchives_single_machine_download_runbook.md new file mode 100644 index 0000000..e83c5ad --- /dev/null +++ b/docs/operations/openarchives_single_machine_download_runbook.md @@ -0,0 +1,410 @@ +# OpenArchives Single-Machine Download And Greek Supplement Runbook + +This runbook documents how to reproduce the current OpenArchives download corpus on a single remote machine, using: + +- the normal GlossAPI downloader for the source-side pull +- a deterministic NTUA retry pass +- a single combined high-priority list for Greek-box supplementation + +This document is repo-safe. It does not store the Greek-box password in git. A private companion note with the password and direct copy commands is stored outside the repo. + +## Goal + +Rebuild the currently available OpenArchives PDF corpus on one machine so it can later be: + +- frozen as a local PDF corpus +- backed up +- sharded for OCR + +The intended end state is: + +1. source-download pass from the original OA URLs +2. targeted retry pass for the NTUA rows that the first pass stranded +3. 
supplementation from the Greek backup box for the high-priority unreachable set + +## Source Of Truth + +The OA routing state used here comes from the enriched OpenArchives parquet: + +- current cleaner-box path: + - `/home/ubuntu/glossapi/work/needs_ocr_enriched.parquet` + +This enriched parquet is produced from: + +- canonical OA document-level parquet after fill/clean +- raw HF OpenArchives dataset snapshot + +The script that creates it is: + +- [openarchives_ocr_enrich.py](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/src/glossapi/scripts/openarchives_ocr_enrich.py) + +CLI contract: + +```bash +PYTHONPATH=src python -m glossapi.scripts.openarchives_ocr_enrich \ + --parquet /data/openarchives/filled_document_level.parquet \ + --raw-repo-root /data/openarchives_hf \ + --output-parquet /data/openarchives/needs_ocr_enriched.parquet +``` + +If `needs_ocr_enriched.parquet` is already available, use it directly. + +## Required Inputs + +You need: + +1. GlossAPI checkout at `development` +2. the sample OA download policy file: + - [openarchives_download_policy.yml](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/samples/openarchives_download_policy.yml) +3. a local snapshot of the HF dataset `glossAPI/openarchives.gr`, at least: + - `data/openarchives/**` +4. the enriched routing parquet: + - `needs_ocr_enriched.parquet` +5. 
for Greek supplementation: + - the combined high-priority list + +Current combined high-priority list: + +- transfer box: + - `/home/ubuntu/openarchives_stage/unreachable_from_source_20260331/priority_high_combined_20260401.csv` + - `/home/ubuntu/openarchives_stage/unreachable_from_source_20260331/priority_high_combined_20260401.txt` +- local copy: + - [priority_high_combined_20260401.csv](/Users/foivoskarounos-zamparloukos/Projects/glossapi-ocr-local/reports/artifacts-20260401-source-backup/priority_high_combined_20260401.csv) + - [priority_high_combined_20260401.txt](/Users/foivoskarounos-zamparloukos/Projects/glossapi-ocr-local/reports/artifacts-20260401-source-backup/priority_high_combined_20260401.txt) + +That combined list is the union of: + +- `unreachable_from_source_20260331.csv` +- `hard_fail_from_source_20260401.csv` + +Current combined count: + +- `14,532` filenames + +## HF Dataset Snapshot + +Recommended pull method: + +```bash +export HF_TOKEN=... +python - <<'PY' +from huggingface_hub import snapshot_download +snapshot_download( + repo_id="glossAPI/openarchives.gr", + repo_type="dataset", + local_dir="/data/openarchives_hf", + allow_patterns=[ + "README.md", + "data/openarchives/**", + ], + token=True, +) +PY +``` + +If you already have a cleaner-produced enriched parquet, keep it outside the HF snapshot, for example: + +- `/data/openarchives_work/needs_ocr_enriched.parquet` + +## Collections Targeted + +This workflow targets every OA row with `needs_ocr=True` in the enriched parquet. 
+ +Current collection breakdown from the enriched parquet: + +```text +IKEE_AUT 19966 +ntua 8118 +Pandemos 3441 +Dione 3170 +hellanicus 2121 +elocus 1647 +estia 1024 +dias 850 +helios 701 +edulll 643 +KEIED 549 +elstat 542 +nemertes 427 +cetpe 427 +deltion 374 +aua 322 +apothesis 299 +kallipos 254 +ekke 143 +JHVMS 126 +anaktisis 121 +ktisis 105 +KEEF 103 +ariadne 73 +geosociety 1 +``` + +## Host Split Used In Practice + +The current download plan was not collection-first. It was host-first. + +Phase 1 bulk-good-host source download: + +- include all `needs_ocr=True` rows except: + - `ikee.lib.auth.gr` + - `olympias.lib.uoi.gr` + +Targeted NTUA retry pass: + +- rows from phase 1 with: + - `download_success == false` + - `download_error == ""` + - `host == "dspace.lib.ntua.gr"` + +Greek-box supplementation: + +- union of: + - original source-unreachable set + - explicit hard source failures + +## Exact Download Policy + +Use: + +- [openarchives_download_policy.yml](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/samples/openarchives_download_policy.yml) + +Current important host rules: + +- `ikee.lib.auth.gr` + - `request_timeout: 180` + - `per_domain_concurrency: 1` + - `sleep: 1.5` +- `dspace.lib.ntua.gr` + - `request_timeout: 120` + - `per_domain_concurrency: 1` + - `sleep: 1.0` +- `olympias.lib.uoi.gr` + - `request_timeout: 180` + - `ssl_verify: false` + - `per_domain_concurrency: 1` + - `sleep: 1.0` +- `ktisis.cut.ac.cy` + - `ssl_verify: false` +- `repository.academyofathens.gr` + - high concurrency, short timeout + +## GlossAPI Downloader Entry Point + +Use: + +- [openarchives_download_freeze.py](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/src/glossapi/scripts/openarchives_download_freeze.py) + +The script: + +1. reads one manifest parquet +2. downloads PDFs into `downloads/` +3. writes canonical `download_results/download_results.parquet` +4. 
stops without OCR + +Important arguments: + +- `--input-parquet` +- `--work-root` +- `--download-concurrency` +- `--download-timeout` +- `--download-scheduler-mode` +- `--download-group-by` +- `--download-policy-file` + +Current proven values for the source bulk pass: + +- `download_concurrency = 24` +- `download_timeout = 60` +- `download_scheduler_mode = per_domain` +- `download_group_by = base_domain` +- `download_policy_file = samples/openarchives_download_policy.yml` + +## Single-Machine Directory Layout + +Suggested layout: + +```text +/data/openarchives_hf +/data/openarchives_work/needs_ocr_enriched.parquet +/data/openarchives_work/openarchives_download_phases_20260401/ +/data/openarchives_work/phase1_bulk_good_hosts/ +/data/openarchives_work/retry_ntua_zeroerror_all/ +/data/openarchives_work/greek_supplement/ +``` + +## Step 1: Build The Bulk-Good-Hosts Manifest + +```bash +python - <<'PY' +from pathlib import Path +import pandas as pd + +src = Path("/data/openarchives_work/needs_ocr_enriched.parquet") +outdir = Path("/data/openarchives_work/openarchives_download_phases_20260401") +outdir.mkdir(parents=True, exist_ok=True) + +df = pd.read_parquet(src) +target = df[df["needs_ocr"].fillna(False)].copy() +host = target["host"].fillna("").astype(str) +bulk = target[~host.isin(["ikee.lib.auth.gr", "olympias.lib.uoi.gr"])].copy() + +bulk.to_parquet(outdir / "phase1_bulk_good_hosts.parquet", index=False) +host.value_counts().rename_axis("host").reset_index(name="docs").to_parquet( + outdir / "phase1_bulk_good_hosts_host_stats.parquet", index=False +) +print({ + "bulk_docs": int(len(bulk)), + "bulk_pages": float(pd.to_numeric(bulk.get("pages_total_source", 0), errors="coerce").fillna(0).sum()), +}) +PY +``` + +## Step 2: Run The Bulk Source Download + +```bash +cd /path/to/glossapi-development +PYTHONPATH=src python -m glossapi.scripts.openarchives_download_freeze \ + --input-parquet 
/data/openarchives_work/openarchives_download_phases_20260401/phase1_bulk_good_hosts.parquet \ + --work-root /data/openarchives_work/phase1_bulk_good_hosts \ + --download-concurrency 24 \ + --download-timeout 60 \ + --download-scheduler-mode per_domain \ + --download-group-by base_domain \ + --download-policy-file samples/openarchives_download_policy.yml +``` + +Outputs: + +- `/data/openarchives_work/phase1_bulk_good_hosts/downloads/*.pdf` +- `/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results.parquet` +- `/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results_download_input.parquet` + +## Step 3: Build The NTUA Retry Manifest + +This is the critical correction from the first run. + +```bash +python - <<'PY' +from pathlib import Path +import pandas as pd + +src = Path("/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results_download_input.parquet") +outdir = Path("/data/openarchives_work/openarchives_retry_20260401") +outdir.mkdir(parents=True, exist_ok=True) + +df = pd.read_parquet(src) +err = df["download_error"].fillna("").astype(str) +host = df["host"].fillna("").astype(str) +succ = df["download_success"].fillna(False) + +ntua = df[(~succ) & err.eq("") & host.eq("dspace.lib.ntua.gr")].copy() +ntua.to_parquet(outdir / "ntua_zeroerror_all.parquet", index=False) +ntua.to_csv(outdir / "ntua_zeroerror_all.csv", index=False) +print({"ntua_retry_rows": int(len(ntua))}) +PY +``` + +## Step 4: Run The NTUA Retry + +Use the exact settings that recovered `6879/6879` in the current run: + +```bash +cd /path/to/glossapi-development +PYTHONPATH=src python -m glossapi.scripts.openarchives_download_freeze \ + --input-parquet /data/openarchives_work/openarchives_retry_20260401/ntua_zeroerror_all.parquet \ + --work-root /data/openarchives_work/retry_ntua_zeroerror_all \ + --download-concurrency 6 \ + --download-timeout 90 \ + --download-scheduler-mode global +``` + +Do not reuse the phase-1 
per-domain setup for this retry. The global retry pattern is what recovered the full missed NTUA bucket. + +## Step 5: Greek-Box Supplementation + +You need: + +- the single combined priority list +- SSH access to the Greek box +- the Greek raw path + +Repo-safe details: + +- host: `83.212.80.170` +- user: `debian` +- raw path: + - `/glossapi/1000/s3-backup/open-archive-data/raw` + +For the password and direct commands, see the private companion note outside the repo. + +### 5A. Copy The Combined Priority List To The Remote Machine + +Use: + +- `priority_high_combined_20260401.txt` + +### 5B. Reduce It To The Still-Missing Local Files + +On the single remote machine: + +```bash +find /data/openarchives_work/phase1_bulk_good_hosts/downloads -maxdepth 1 -type f -name '*.pdf' -printf '%f\n' | sort -u > /tmp/source_phase1_have.txt +find /data/openarchives_work/retry_ntua_zeroerror_all/downloads -maxdepth 1 -type f -name '*.pdf' -printf '%f\n' | sort -u > /tmp/source_ntua_have.txt +cat /tmp/source_phase1_have.txt /tmp/source_ntua_have.txt | sort -u > /tmp/source_have_all.txt +comm -23 priority_high_combined_20260401.txt /tmp/source_have_all.txt > /tmp/priority_still_missing.txt +``` + +### 5C. Build The Matched Greek Relative-Path Manifest + +On the Greek box, generate the list of relative PDF paths whose basenames are in `priority_still_missing.txt`. + +Because the Greek raw tree is nested under `.part_*`, the supplement step must be path-aware. The simplest reproducible pattern is: + +```bash +python3 - <<'PY' +from pathlib import Path +wanted = set(Path('/tmp/priority_still_missing.txt').read_text().splitlines()) +raw_root = Path('/glossapi/1000/s3-backup/open-archive-data/raw') +out = Path('/tmp/greek_matched_relative_paths.txt') +with out.open('w') as f: + for p in raw_root.rglob('*.pdf'): + if p.name in wanted: + f.write(str(p.relative_to(raw_root)) + '\n') +print(out) +PY +``` + +### 5D. 
Pull The Missing Files From The Greek Box + +From the single remote machine: + +```bash +mkdir -p /data/openarchives_work/greek_supplement/downloads +rsync -av --files-from=/tmp/greek_matched_relative_paths.txt \ + debian@83.212.80.170:/glossapi/1000/s3-backup/open-archive-data/raw/ \ + /data/openarchives_work/greek_supplement/raw/ +find /data/openarchives_work/greek_supplement/raw -type f -name '*.pdf' -exec cp -n {} /data/openarchives_work/greek_supplement/downloads/ \\; +``` + +That produces a flat supplement directory of Greek-recovered PDFs. + +## Step 6: Freeze The Final Available Corpus + +Final available set is the union of: + +- `/data/openarchives_work/phase1_bulk_good_hosts/downloads` +- `/data/openarchives_work/retry_ntua_zeroerror_all/downloads` +- `/data/openarchives_work/greek_supplement/downloads` + +At that point: + +- rebuild the cutoff inventory +- shard for OCR +- or archive the source-only-new subset if desired + +## Notes + +- The Greek priority queue was built as one combined list so the supplement step is reproducible on a single remote instance. +- The first bulk run missed many NTUA files for execution reasons, not because the URLs were dead. The dedicated NTUA retry is therefore mandatory. +- The Greek box should be treated as a supplement path, not the default source path. From 94ec617cf6ddd2029a3001b7d6965d213a11244a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 1 Apr 2026 15:00:15 +0300 Subject: [PATCH 47/93] Revert "Document OA single-machine download and supplement runbook" This reverts commit e38b189277c0a049c858fbc3a79cbaa87ca4f4d9. 
--- .../openarchives_ocr_rollout_plan.md | 4 - ...rchives_single_machine_download_runbook.md | 410 ------------------ 2 files changed, 414 deletions(-) delete mode 100644 docs/operations/openarchives_single_machine_download_runbook.md diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 6fa9a85..590d18b 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -2,10 +2,6 @@ This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. -For the reproducible single-machine source-download plus Greek supplementation workflow, see: - -- [openarchives_single_machine_download_runbook.md](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/docs/operations/openarchives_single_machine_download_runbook.md) - ## Implemented tooling The rollout is backed by concrete scripts in `src/glossapi/scripts/`: diff --git a/docs/operations/openarchives_single_machine_download_runbook.md b/docs/operations/openarchives_single_machine_download_runbook.md deleted file mode 100644 index e83c5ad..0000000 --- a/docs/operations/openarchives_single_machine_download_runbook.md +++ /dev/null @@ -1,410 +0,0 @@ -# OpenArchives Single-Machine Download And Greek Supplement Runbook - -This runbook documents how to reproduce the current OpenArchives download corpus on a single remote machine, using: - -- the normal GlossAPI downloader for the source-side pull -- a deterministic NTUA retry pass -- a single combined high-priority list for Greek-box supplementation - -This document is repo-safe. It does not store the Greek-box password in git. A private companion note with the password and direct copy commands is stored outside the repo. 
- -## Goal - -Rebuild the currently available OpenArchives PDF corpus on one machine so it can later be: - -- frozen as a local PDF corpus -- backed up -- sharded for OCR - -The intended end state is: - -1. source-download pass from the original OA URLs -2. targeted retry pass for the NTUA rows that the first pass stranded -3. supplementation from the Greek backup box for the high-priority unreachable set - -## Source Of Truth - -The OA routing state used here comes from the enriched OpenArchives parquet: - -- current cleaner-box path: - - `/home/ubuntu/glossapi/work/needs_ocr_enriched.parquet` - -This enriched parquet is produced from: - -- canonical OA document-level parquet after fill/clean -- raw HF OpenArchives dataset snapshot - -The script that creates it is: - -- [openarchives_ocr_enrich.py](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/src/glossapi/scripts/openarchives_ocr_enrich.py) - -CLI contract: - -```bash -PYTHONPATH=src python -m glossapi.scripts.openarchives_ocr_enrich \ - --parquet /data/openarchives/filled_document_level.parquet \ - --raw-repo-root /data/openarchives_hf \ - --output-parquet /data/openarchives/needs_ocr_enriched.parquet -``` - -If `needs_ocr_enriched.parquet` is already available, use it directly. - -## Required Inputs - -You need: - -1. GlossAPI checkout at `development` -2. the sample OA download policy file: - - [openarchives_download_policy.yml](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/samples/openarchives_download_policy.yml) -3. a local snapshot of the HF dataset `glossAPI/openarchives.gr`, at least: - - `data/openarchives/**` -4. the enriched routing parquet: - - `needs_ocr_enriched.parquet` -5. 
for Greek supplementation: - - the combined high-priority list - -Current combined high-priority list: - -- transfer box: - - `/home/ubuntu/openarchives_stage/unreachable_from_source_20260331/priority_high_combined_20260401.csv` - - `/home/ubuntu/openarchives_stage/unreachable_from_source_20260331/priority_high_combined_20260401.txt` -- local copy: - - [priority_high_combined_20260401.csv](/Users/foivoskarounos-zamparloukos/Projects/glossapi-ocr-local/reports/artifacts-20260401-source-backup/priority_high_combined_20260401.csv) - - [priority_high_combined_20260401.txt](/Users/foivoskarounos-zamparloukos/Projects/glossapi-ocr-local/reports/artifacts-20260401-source-backup/priority_high_combined_20260401.txt) - -That combined list is the union of: - -- `unreachable_from_source_20260331.csv` -- `hard_fail_from_source_20260401.csv` - -Current combined count: - -- `14,532` filenames - -## HF Dataset Snapshot - -Recommended pull method: - -```bash -export HF_TOKEN=... -python - <<'PY' -from huggingface_hub import snapshot_download -snapshot_download( - repo_id="glossAPI/openarchives.gr", - repo_type="dataset", - local_dir="/data/openarchives_hf", - allow_patterns=[ - "README.md", - "data/openarchives/**", - ], - token=True, -) -PY -``` - -If you already have a cleaner-produced enriched parquet, keep it outside the HF snapshot, for example: - -- `/data/openarchives_work/needs_ocr_enriched.parquet` - -## Collections Targeted - -This workflow targets every OA row with `needs_ocr=True` in the enriched parquet. 
- -Current collection breakdown from the enriched parquet: - -```text -IKEE_AUT 19966 -ntua 8118 -Pandemos 3441 -Dione 3170 -hellanicus 2121 -elocus 1647 -estia 1024 -dias 850 -helios 701 -edulll 643 -KEIED 549 -elstat 542 -nemertes 427 -cetpe 427 -deltion 374 -aua 322 -apothesis 299 -kallipos 254 -ekke 143 -JHVMS 126 -anaktisis 121 -ktisis 105 -KEEF 103 -ariadne 73 -geosociety 1 -``` - -## Host Split Used In Practice - -The current download plan was not collection-first. It was host-first. - -Phase 1 bulk-good-host source download: - -- include all `needs_ocr=True` rows except: - - `ikee.lib.auth.gr` - - `olympias.lib.uoi.gr` - -Targeted NTUA retry pass: - -- rows from phase 1 with: - - `download_success == false` - - `download_error == ""` - - `host == "dspace.lib.ntua.gr"` - -Greek-box supplementation: - -- union of: - - original source-unreachable set - - explicit hard source failures - -## Exact Download Policy - -Use: - -- [openarchives_download_policy.yml](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/samples/openarchives_download_policy.yml) - -Current important host rules: - -- `ikee.lib.auth.gr` - - `request_timeout: 180` - - `per_domain_concurrency: 1` - - `sleep: 1.5` -- `dspace.lib.ntua.gr` - - `request_timeout: 120` - - `per_domain_concurrency: 1` - - `sleep: 1.0` -- `olympias.lib.uoi.gr` - - `request_timeout: 180` - - `ssl_verify: false` - - `per_domain_concurrency: 1` - - `sleep: 1.0` -- `ktisis.cut.ac.cy` - - `ssl_verify: false` -- `repository.academyofathens.gr` - - high concurrency, short timeout - -## GlossAPI Downloader Entry Point - -Use: - -- [openarchives_download_freeze.py](/Users/foivoskarounos-zamparloukos/Projects/glossapi-development/src/glossapi/scripts/openarchives_download_freeze.py) - -The script: - -1. reads one manifest parquet -2. downloads PDFs into `downloads/` -3. writes canonical `download_results/download_results.parquet` -4. 
stops without OCR - -Important arguments: - -- `--input-parquet` -- `--work-root` -- `--download-concurrency` -- `--download-timeout` -- `--download-scheduler-mode` -- `--download-group-by` -- `--download-policy-file` - -Current proven values for the source bulk pass: - -- `download_concurrency = 24` -- `download_timeout = 60` -- `download_scheduler_mode = per_domain` -- `download_group_by = base_domain` -- `download_policy_file = samples/openarchives_download_policy.yml` - -## Single-Machine Directory Layout - -Suggested layout: - -```text -/data/openarchives_hf -/data/openarchives_work/needs_ocr_enriched.parquet -/data/openarchives_work/openarchives_download_phases_20260401/ -/data/openarchives_work/phase1_bulk_good_hosts/ -/data/openarchives_work/retry_ntua_zeroerror_all/ -/data/openarchives_work/greek_supplement/ -``` - -## Step 1: Build The Bulk-Good-Hosts Manifest - -```bash -python - <<'PY' -from pathlib import Path -import pandas as pd - -src = Path("/data/openarchives_work/needs_ocr_enriched.parquet") -outdir = Path("/data/openarchives_work/openarchives_download_phases_20260401") -outdir.mkdir(parents=True, exist_ok=True) - -df = pd.read_parquet(src) -target = df[df["needs_ocr"].fillna(False)].copy() -host = target["host"].fillna("").astype(str) -bulk = target[~host.isin(["ikee.lib.auth.gr", "olympias.lib.uoi.gr"])].copy() - -bulk.to_parquet(outdir / "phase1_bulk_good_hosts.parquet", index=False) -host.value_counts().rename_axis("host").reset_index(name="docs").to_parquet( - outdir / "phase1_bulk_good_hosts_host_stats.parquet", index=False -) -print({ - "bulk_docs": int(len(bulk)), - "bulk_pages": float(pd.to_numeric(bulk.get("pages_total_source", 0), errors="coerce").fillna(0).sum()), -}) -PY -``` - -## Step 2: Run The Bulk Source Download - -```bash -cd /path/to/glossapi-development -PYTHONPATH=src python -m glossapi.scripts.openarchives_download_freeze \ - --input-parquet 
/data/openarchives_work/openarchives_download_phases_20260401/phase1_bulk_good_hosts.parquet \ - --work-root /data/openarchives_work/phase1_bulk_good_hosts \ - --download-concurrency 24 \ - --download-timeout 60 \ - --download-scheduler-mode per_domain \ - --download-group-by base_domain \ - --download-policy-file samples/openarchives_download_policy.yml -``` - -Outputs: - -- `/data/openarchives_work/phase1_bulk_good_hosts/downloads/*.pdf` -- `/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results.parquet` -- `/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results_download_input.parquet` - -## Step 3: Build The NTUA Retry Manifest - -This is the critical correction from the first run. - -```bash -python - <<'PY' -from pathlib import Path -import pandas as pd - -src = Path("/data/openarchives_work/phase1_bulk_good_hosts/download_results/download_results_download_input.parquet") -outdir = Path("/data/openarchives_work/openarchives_retry_20260401") -outdir.mkdir(parents=True, exist_ok=True) - -df = pd.read_parquet(src) -err = df["download_error"].fillna("").astype(str) -host = df["host"].fillna("").astype(str) -succ = df["download_success"].fillna(False) - -ntua = df[(~succ) & err.eq("") & host.eq("dspace.lib.ntua.gr")].copy() -ntua.to_parquet(outdir / "ntua_zeroerror_all.parquet", index=False) -ntua.to_csv(outdir / "ntua_zeroerror_all.csv", index=False) -print({"ntua_retry_rows": int(len(ntua))}) -PY -``` - -## Step 4: Run The NTUA Retry - -Use the exact settings that recovered `6879/6879` in the current run: - -```bash -cd /path/to/glossapi-development -PYTHONPATH=src python -m glossapi.scripts.openarchives_download_freeze \ - --input-parquet /data/openarchives_work/openarchives_retry_20260401/ntua_zeroerror_all.parquet \ - --work-root /data/openarchives_work/retry_ntua_zeroerror_all \ - --download-concurrency 6 \ - --download-timeout 90 \ - --download-scheduler-mode global -``` - -Do not reuse the phase-1 
per-domain setup for this retry. The global retry pattern is what recovered the full missed NTUA bucket. - -## Step 5: Greek-Box Supplementation - -You need: - -- the single combined priority list -- SSH access to the Greek box -- the Greek raw path - -Repo-safe details: - -- host: `83.212.80.170` -- user: `debian` -- raw path: - - `/glossapi/1000/s3-backup/open-archive-data/raw` - -For the password and direct commands, see the private companion note outside the repo. - -### 5A. Copy The Combined Priority List To The Remote Machine - -Use: - -- `priority_high_combined_20260401.txt` - -### 5B. Reduce It To The Still-Missing Local Files - -On the single remote machine: - -```bash -find /data/openarchives_work/phase1_bulk_good_hosts/downloads -maxdepth 1 -type f -name '*.pdf' -printf '%f\n' | sort -u > /tmp/source_phase1_have.txt -find /data/openarchives_work/retry_ntua_zeroerror_all/downloads -maxdepth 1 -type f -name '*.pdf' -printf '%f\n' | sort -u > /tmp/source_ntua_have.txt -cat /tmp/source_phase1_have.txt /tmp/source_ntua_have.txt | sort -u > /tmp/source_have_all.txt -comm -23 priority_high_combined_20260401.txt /tmp/source_have_all.txt > /tmp/priority_still_missing.txt -``` - -### 5C. Build The Matched Greek Relative-Path Manifest - -On the Greek box, generate the list of relative PDF paths whose basenames are in `priority_still_missing.txt`. - -Because the Greek raw tree is nested under `.part_*`, the supplement step must be path-aware. The simplest reproducible pattern is: - -```bash -python3 - <<'PY' -from pathlib import Path -wanted = set(Path('/tmp/priority_still_missing.txt').read_text().splitlines()) -raw_root = Path('/glossapi/1000/s3-backup/open-archive-data/raw') -out = Path('/tmp/greek_matched_relative_paths.txt') -with out.open('w') as f: - for p in raw_root.rglob('*.pdf'): - if p.name in wanted: - f.write(str(p.relative_to(raw_root)) + '\n') -print(out) -PY -``` - -### 5D. 
Pull The Missing Files From The Greek Box - -From the single remote machine: - -```bash -mkdir -p /data/openarchives_work/greek_supplement/downloads -rsync -av --files-from=/tmp/greek_matched_relative_paths.txt \ - debian@83.212.80.170:/glossapi/1000/s3-backup/open-archive-data/raw/ \ - /data/openarchives_work/greek_supplement/raw/ -find /data/openarchives_work/greek_supplement/raw -type f -name '*.pdf' -exec cp -n {} /data/openarchives_work/greek_supplement/downloads/ \\; -``` - -That produces a flat supplement directory of Greek-recovered PDFs. - -## Step 6: Freeze The Final Available Corpus - -Final available set is the union of: - -- `/data/openarchives_work/phase1_bulk_good_hosts/downloads` -- `/data/openarchives_work/retry_ntua_zeroerror_all/downloads` -- `/data/openarchives_work/greek_supplement/downloads` - -At that point: - -- rebuild the cutoff inventory -- shard for OCR -- or archive the source-only-new subset if desired - -## Notes - -- The Greek priority queue was built as one combined list so the supplement step is reproducible on a single remote instance. -- The first bulk run missed many NTUA files for execution reasons, not because the URLs were dead. The dedicated NTUA retry is therefore mandatory. -- The Greek box should be treated as a supplement path, not the default source path. 
From 63a9a6729bcbc8be6ae2666359690bf3e02fbfee Mon Sep 17 00:00:00 2001 From: fffoivos Date: Wed, 1 Apr 2026 22:51:22 +0300 Subject: [PATCH 48/93] Harden OpenArchives PDF download validation --- samples/openarchives_download_policy.yml | 20 +++- src/glossapi/gloss_browser_downloader.py | 112 ++++++++++++++++++ src/glossapi/gloss_downloader.py | 40 +++++-- .../scripts/openarchives_download_freeze.py | 4 + tests/test_browser_gloss_downloader.py | 101 ++++++++++++++++ tests/test_openarchives_download_freeze.py | 43 +++++++ 6 files changed, 309 insertions(+), 11 deletions(-) diff --git a/samples/openarchives_download_policy.yml b/samples/openarchives_download_policy.yml index 8e1091e..180e5fe 100644 --- a/samples/openarchives_download_policy.yml +++ b/samples/openarchives_download_policy.yml @@ -53,11 +53,11 @@ rules: - match: domains: [repository.academyofathens.gr] - downloader: standard + downloader: auto request_timeout: 45 - per_domain_concurrency: 16 + per_domain_concurrency: 6 domain_concurrency_floor: 2 - domain_concurrency_ceiling: 16 + domain_concurrency_ceiling: 8 skip_failed_after: 3 sleep: 0.1 @@ -66,7 +66,6 @@ rules: - dione.lib.unipi.gr - pergamos.lib.uoa.gr - hellanicus.lib.aegean.gr - - dias.library.tuc.gr downloader: standard request_timeout: 60 per_domain_concurrency: 12 @@ -75,6 +74,19 @@ rules: skip_failed_after: 3 sleep: 0.2 + - match: + domains: + - dias.library.tuc.gr + downloader: auto + request_timeout: 90 + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 4 + sleep: 0.5 + browser_timeout_ms: 90000 + browser_post_load_wait_ms: 4000 + - match: domains: - repository.ihu.gr diff --git a/src/glossapi/gloss_browser_downloader.py b/src/glossapi/gloss_browser_downloader.py index 1fc41fa..66a7c6e 100644 --- a/src/glossapi/gloss_browser_downloader.py +++ b/src/glossapi/gloss_browser_downloader.py @@ -3,7 +3,10 @@ from __future__ import annotations import asyncio +import io +import json import 
os +import re import time from dataclasses import dataclass from urllib.parse import urlparse @@ -11,6 +14,7 @@ import aiofiles import aiohttp +from PIL import Image from .download_policy import DownloadPolicy, load_download_policy from .gloss_downloader import GlossDownloader @@ -110,6 +114,105 @@ def _should_attempt_browser_recovery(self, url: str, html_issue: str) -> bool: return self._url_looks_like_file_endpoint(url) return False + def _extract_academy_document_id(self, url: str) -> Optional[str]: + parsed = urlparse(str(url or "")) + host = (parsed.hostname or "").lower() + if host != "repository.academyofathens.gr": + return None + match = re.match(r"^/document/(\d+)(?:\.pdf)?/?$", parsed.path or "") + if not match: + return None + return match.group(1) + + async def _fetch_bytes(self, session: aiohttp.ClientSession, url: str) -> bytes: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=min(max(self.request_timeout, 60), 180))) as response: + response.raise_for_status() + return await response.read() + + def _academy_images_to_pdf_bytes(self, image_blobs: list[bytes]) -> bytes: + if not image_blobs: + raise RuntimeError("No Academy image pages available to synthesize PDF") + images = [] + try: + for blob in image_blobs: + img = Image.open(io.BytesIO(blob)).convert("RGB") + images.append(img) + out = io.BytesIO() + images[0].save(out, format="PDF", save_all=True, append_images=images[1:]) + return out.getvalue() + finally: + for img in images: + try: + img.close() + except Exception: + pass + + async def _download_academy_bookreader_pdf(self, url: str) -> Optional[bytes]: + item_id = self._extract_academy_document_id(url) + if not item_id: + return None + + candidate_bases = [ + "https://repo.academyofathens.gr", + "https://digitallibrary.academyofathens.gr", + ] + connector = self._build_ssl_connector() + headers = {"User-Agent": "Mozilla/5.0", "Accept": "application/json,*/*"} + async with aiohttp.ClientSession(connector=connector, 
headers=headers) as session: + for base_url in candidate_bases: + try: + payload_bytes = await self._fetch_bytes(session, f"{base_url}/archive/bookreader_options/{item_id}") + payload = json.loads(payload_bytes.decode("utf-8", errors="ignore")) + except Exception: + continue + + page_data = payload.get("data") + if not isinstance(page_data, list) or not page_data: + continue + + image_urls: list[str] = [] + for page in page_data: + if not page or not isinstance(page, list): + continue + first = page[0] if page else None + uri = first.get("uri") if isinstance(first, dict) else None + if not uri: + continue + image_urls.append(uri if uri.startswith("http") else f"{base_url}{uri}") + + if not image_urls: + continue + + image_blobs: list[bytes] = [] + try: + for image_url in image_urls: + image_blobs.append(await self._fetch_bytes(session, image_url)) + except Exception: + continue + + try: + return await asyncio.to_thread(self._academy_images_to_pdf_bytes, image_blobs) + except Exception: + continue + return None + + async def _recover_source_specific_html_interstitial( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + pdf_body = await self._download_academy_bookreader_pdf(url) + if not pdf_body: + return None + + filename = f"{filename_base}.pdf" if filename_base and str(filename_base).strip() else self.generate_filename(row_index, "pdf") + await self._write_recovered_file(row_index, filename, pdf_body) + self.logger.info("Recovered Academy document via bookreader image->PDF fallback: %s -> %s", url, filename) + return True, filename, "pdf", "", retry_count + def _build_ssl_connector(self) -> Optional[aiohttp.TCPConnector]: connector = None if not self.ssl_verify: @@ -370,6 +473,15 @@ async def _recover_html_interstitial( filename_base: Optional[str], referer: Optional[str], ) -> Optional[Tuple[bool, str, str, str, int]]: + source_specific = await 
self._recover_source_specific_html_interstitial( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + ) + if source_specific is not None: + return source_specific + route, route_options = self._resolve_route(url) if route == "standard": return None diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index b1b6c61..45f0d39 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -752,15 +752,17 @@ def _ext_from_magic_bytes(self, content: bytes) -> Optional[str]: if not content: return None head = content[:4096] - # PDF - if head.startswith(b'%PDF-'): + lower_head = head.lower() + lstripped = lower_head.lstrip() + # PDF: allow a small junk prefix before the real header. + pdf_idx = head.find(b'%PDF-') + if 0 <= pdf_idx <= 1024: return 'pdf' # HTML (very simple heuristic) - lower_head = head.lower() - if b' Optional[str]: pass return None + def _looks_like_pdf_bytes(self, content: bytes) -> bool: + """Lightweight PDF sanity check for content we are about to persist as a PDF.""" + if not content: + return False + head = content[:4096] + pdf_idx = head.find(b'%PDF-') + return 0 <= pdf_idx <= 1024 + def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes) -> str: """Infer the most likely file extension using URL, headers and content bytes""" + # Strong content sniffing first for the two cases that matter most here: + # real PDFs and HTML bodies masquerading as direct-file endpoints. 
+ sniff_ext = self._ext_from_magic_bytes(content) + if sniff_ext == 'pdf': + return 'pdf' + if sniff_ext == 'html': + return 'html' + # 1) URL path extension url_ext = self.get_file_extension_from_url(url) if self.is_supported_format(url_ext): @@ -797,8 +815,7 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes if ct_ext and self.is_supported_format(ct_ext): return ct_ext - # 4) Magic byte sniffing - sniff_ext = self._ext_from_magic_bytes(content) + # 4) Magic byte sniffing for the remaining supported formats if sniff_ext and self.is_supported_format(sniff_ext): return sniff_ext @@ -849,6 +866,10 @@ def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: or "awswafintegration" in lower_body or "challenge.js" in lower_body or "verify that you're not a robot" in lower_body + or "making sure you're not a bot" in lower_body + or "making sure you're not a bot" in lower_body + or "/.within.website/" in lower_body + or "anubis" in lower_body ): return ( "HTML challenge page returned instead of a document; " @@ -1071,6 +1092,11 @@ async def _finalize_download_result( f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}" ) return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count + if file_ext == 'pdf' and not self._looks_like_pdf_bytes(content): + self._cleanup_temp_file(tmp_path) + message = "Invalid PDF signature in downloaded content" + self.logger.warning("%s for %s", message, url) + return False, "", file_ext, message, retry_count filename = self._build_output_filename(row_index, file_ext, filename_base) if tmp_path is not None: diff --git a/src/glossapi/scripts/openarchives_download_freeze.py b/src/glossapi/scripts/openarchives_download_freeze.py index 8188e9a..e358781 100644 --- a/src/glossapi/scripts/openarchives_download_freeze.py +++ b/src/glossapi/scripts/openarchives_download_freeze.py @@ -29,9 +29,11 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--python-log-level", default="INFO") p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-mode", default="auto") p.add_argument("--download-scheduler-mode", default="per_domain") p.add_argument("--download-group-by", default="base_domain") p.add_argument("--download-policy-file", default="") + p.add_argument("--supported-formats", default="pdf") p.add_argument("--dry-run", action="store_true") return p.parse_args(argv) @@ -66,10 +68,12 @@ def main(argv: Optional[List[str]] = None) -> int: dl_df = corpus.download( input_parquet=download_input, links_column="url", + download_mode=str(args.download_mode), parallelize_by=str(args.download_group_by), concurrency=int(args.download_concurrency), request_timeout=int(args.download_timeout), scheduler_mode=str(args.download_scheduler_mode), + supported_formats=[part.strip() for part in str(args.supported_formats).split(",") if part.strip()], download_policy_file=(str(args.download_policy_file) if 
str(args.download_policy_file or "").strip() else None), ) canonical_df = _normalize_download_results(shard_df=manifest_df, download_results_df=dl_df, url_column="url") diff --git a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py index 20707e7..ab94f15 100644 --- a/tests/test_browser_gloss_downloader.py +++ b/tests/test_browser_gloss_downloader.py @@ -1,6 +1,8 @@ import asyncio +import io import pandas as pd +from PIL import Image from glossapi import Corpus from glossapi.download_policy import build_download_policy @@ -72,6 +74,105 @@ async def _fake_browser_download(**kwargs): assert not (tmp_path / "downloads" / ".part_browser_0").exists() +def test_browser_downloader_detects_anubis_challenge(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + issue = downloader._detect_html_interstitial( + "https://dias.library.tuc.gr/view/view/manf/77495", + {"Content-Type": "text/html"}, + b"Making sure you're not a bot!" + b"anubis /.within.website/", + ) + + assert issue is not None + assert "challenge page returned" in issue.lower() + + +def test_infer_file_extension_prefers_html_magic_over_pdf_url(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + file_ext = downloader.infer_file_extension( + "https://repository.academyofathens.gr/document/43963.pdf", + {"Content-Type": "text/html"}, + b"spa shell", + ) + + assert file_ext == "html" + + +def test_infer_file_extension_accepts_pdf_header_after_small_prefix(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + file_ext = downloader.infer_file_extension( + "https://pergamos.lib.uoa.gr/uoa/dl/object/1316268/file.pdf", + {"Content-Type": "application/pdf"}, + b"test123%PDF-1.5\nrest", + ) + + assert file_ext == "pdf" + + +def test_finalize_download_result_rejects_invalid_pdf_payload(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + result = asyncio.run( + 
downloader._finalize_download_result( + row_index=0, + url="https://example.org/file.pdf", + resp_headers={"Content-Type": "application/pdf"}, + content=b"this is not a pdf payload", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result[0] is False + assert result[2] == "pdf" + assert "invalid pdf signature" in result[3].lower() + assert not (tmp_path / "downloads" / "AAA_000.pdf").exists() + + +def test_browser_downloader_recovers_academy_bookreader_pdf(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="standard") + + async def _fake_download_academy(url: str): + return b"%PDF-1.4\n%academy\n" + + monkeypatch.setattr(downloader, "_download_academy_bookreader_pdf", _fake_download_academy) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://repository.academyofathens.gr/document/43963.pdf", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue="Expected a file-like response but received HTML instead", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.4") + + +def test_academy_images_to_pdf_bytes_builds_pdf(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + blobs = [] + for color in ("red", "blue"): + image = Image.new("RGB", (16, 16), color=color) + buf = io.BytesIO() + image.save(buf, format="JPEG") + blobs.append(buf.getvalue()) + + pdf_bytes = downloader._academy_images_to_pdf_bytes(blobs) + + assert pdf_bytes.startswith(b"%PDF-") + + def test_browser_downloader_domain_cookie_lookup(tmp_path): downloader = BrowserGlossDownloader( output_dir=str(tmp_path), diff --git a/tests/test_openarchives_download_freeze.py b/tests/test_openarchives_download_freeze.py index 6420372..e76b24e 100644 --- 
a/tests/test_openarchives_download_freeze.py +++ b/tests/test_openarchives_download_freeze.py @@ -4,6 +4,7 @@ import pandas as pd +import glossapi.scripts.openarchives_download_freeze as freeze_mod from glossapi.scripts.openarchives_download_freeze import main @@ -24,3 +25,45 @@ def test_download_freeze_dry_run_materializes_manifest(tmp_path: Path) -> None: assert rc == 0 assert (work_root / "manifests" / "download_input.parquet").exists() assert (work_root / "download_results" / "download_results.parquet").exists() + + +def test_download_freeze_uses_pdf_only_auto_mode(tmp_path: Path, monkeypatch) -> None: + src = tmp_path / "input.parquet" + pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ).to_parquet(src, index=False) + + observed = {} + + class DummyCorpus: + def __init__(self, *args, **kwargs): + observed["init"] = kwargs + + def download(self, **kwargs): + observed["download"] = kwargs + return pd.DataFrame( + [ + { + "url": "https://example.com/a.pdf", + "filename": "ABC_001.pdf", + "download_success": True, + "download_error": "", + "file_ext": "pdf", + } + ] + ) + + monkeypatch.setattr(freeze_mod, "Corpus", DummyCorpus) + + work_root = tmp_path / "work" + rc = main(["--input-parquet", str(src), "--work-root", str(work_root)]) + + assert rc == 0 + assert observed["download"]["download_mode"] == "auto" + assert observed["download"]["supported_formats"] == ["pdf"] From 489698e4e004c1dc6df4017d7b386e642baad4a9 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Thu, 2 Apr 2026 03:24:30 +0300 Subject: [PATCH 49/93] deepseek reliability hardening --- .../artifact_layout_and_stage_handoffs.md | 26 + docs/multi_gpu.md | 29 + ...deepseek_reliability_pending_2026-04-02.md | 39 + docs/stages/ocr.md | 13 + src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 1055 +++++++++++++---- src/glossapi/ocr/deepseek/runner.py | 727 +++++++++++- src/glossapi/ocr/deepseek/work_queue.py | 380 ++++++ 
.../scripts/deepseek_pipeline_benchmark.py | 13 + tests/test_deepseek_multi_gpu_runtime.py | 407 +++++++ tests/test_deepseek_runner_contract.py | 321 +++++ 10 files changed, 2809 insertions(+), 201 deletions(-) create mode 100644 docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md create mode 100644 src/glossapi/ocr/deepseek/work_queue.py create mode 100644 tests/test_deepseek_multi_gpu_runtime.py diff --git a/docs/architecture/artifact_layout_and_stage_handoffs.md b/docs/architecture/artifact_layout_and_stage_handoffs.md index f3b5b6d..27cfef0 100644 --- a/docs/architecture/artifact_layout_and_stage_handoffs.md +++ b/docs/architecture/artifact_layout_and_stage_handoffs.md @@ -92,6 +92,32 @@ That affects: Chunk suffix behavior is therefore part of the current contract. +For DeepSeek OCR, there is an important distinction between execution-time shards and stage handoff artifacts: + +- Multi-GPU `exact_fill` may execute shards such as `doc__p00001-00096` internally to keep GPU lanes full. +- Those shard names are operational artifacts, not the downstream contract for OCR outputs. +- After worker completion, the runner reassembles canonical `markdown/<stem>.md` and `json/metrics/<stem>.metrics.json` files for each source PDF. +- Canonical OCR markdown page boundaries are annotated with `` comments next to the page-split marker, and the parser remains backward-compatible with legacy unnumbered separators. +- Original shard markdown and shard metrics are moved under `sidecars/ocr_shards/` for debugging and audit trails. +- If a repair retry trips the garbage cutoff again, the canonical markdown keeps the page slot but blanks the page content rather than preserving the bad first-pass OCR.
+ +For multi-GPU vLLM OCR, there is now a second class of operational artifacts under `sidecars/ocr_runtime/`: + +- `work_queue.sqlite`: durable batch queue state for the current OCR run +- `worker_*.runtime.json`: per-worker heartbeat and timing state +- `gpu_preflight.json`: GPU readiness checks such as persistence mode +- `gpu_telemetry.jsonl`: sampled GPU utilization and process telemetry +- `runtime_summary.json`: queue completion state plus steady-state timing windows + +The runtime queue now has two phases inside the same operational state: + +- first-pass shard batches +- repair shard batches published after first pass completes + +These runtime artifacts are operational state, not downstream stage inputs. They are intended for monitoring, debugging, and safe resumption logic. + +Downstream stages should therefore consume canonical OCR outputs, not shard artifacts. + ## Authoritative state vs derived artifacts Not every file has equal semantic importance. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index feb3283..598fb1f 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -32,6 +32,35 @@ c.ocr(use_gpus='multi', math_batch_size=12) - Crashed workers are respawned automatically; control the retry budget per GPU with `GLOSSAPI_MATH_RESPAWN_CAP` (default `5`). Use `GLOSSAPI_WORKER_LOG_VERBOSE=0` to silence the banner that prints the binding info. - When a device exceeds the respawn cap, remaining stems are added to the fatal skip-list and their artifacts are quarantined under `downloads/problematic_math/` and `json/problematic_math/` for follow-up. +## DeepSeek OCR on Multiple GPUs + +```python +from glossapi import Corpus +c = Corpus("OUT", "OUT") +c.ocr( + use_gpus="multi", + runtime_backend="vllm", + workers_per_gpu=1, + scheduler="exact_fill", + target_batch_pages=96, +) +``` + +- `scheduler="exact_fill"` is the preferred multi-GPU vLLM scheduler when PDFs vary widely in length. 
It shards large documents into page ranges and keeps GPU lanes filled more evenly. +- Internal shard runs now preserve the public `Corpus.ocr()` contract. Canonical outputs are reassembled back into `markdown/<stem>.md` and `json/metrics/<stem>.metrics.json` for each source PDF. +- Shard markdown and shard metrics are retained for debugging under `sidecars/ocr_shards/` instead of remaining in the canonical handoff directories. +- The vLLM path now renders pages into memory and feeds a bounded queue directly into inference, which removes the temporary PNG round-trip and overlaps rendering with generation. +- Empty-page detection still happens before inference, and repair retries reuse the in-memory page image instead of reopening a file from disk. +- Final OCR markdown now tags each page split with `` so page images, markdown, and metrics stay aligned during inspection. +- If a repair retry hits the garbage cutoff again, the page is blanked rather than keeping the failed first-pass garbage. +- Multi-GPU vLLM workers now pull from a durable shared batch queue in `sidecars/ocr_runtime/work_queue.sqlite`, so finished batches survive worker crashes and respawned workers can continue without rescanning completed work. +- Repair work now runs as a second global queue phase. First-pass batches finish and persist shard outputs first; then any worker can claim the queued repair shards. This keeps repair tails balanced across GPUs without mixing worker-local repair state into the controller. +- Each worker writes `sidecars/ocr_runtime/worker_*.runtime.json` with heartbeat state and steady-state timing markers. The runner also emits `gpu_preflight.json`, `gpu_telemetry.jsonl`, and `runtime_summary.json`. +- The runner checks GPU persistence mode before launch by default. Control it with `GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT=off|warn|ensure`. The default is `ensure`, which will try `sudo -n nvidia-smi -pm 1` and record the result in `gpu_preflight.json`. 
+- Worker reliability knobs are environment-driven: `GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP`, `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS`, `GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC`, `GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC`, and `GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC`. +- The default `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS=2` means one retry after the first failed claim, then the batch is marked failed instead of retrying forever. +- `workers_per_gpu=1` remains the safe default on A100 40GB nodes. Prefer increasing `target_batch_pages` before adding more workers per device. + ## Provider & Device Checks - ONNXRuntime providers must include `CUDAExecutionProvider`. diff --git a/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md b/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md new file mode 100644 index 0000000..2e6605d --- /dev/null +++ b/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md @@ -0,0 +1,39 @@ +# DELETE ME: DeepSeek Reliability Pending Work + +This note is temporary. Delete it after the first production soak confirms the +merged reliability path is stable and the follow-up items below are either done +or explicitly discarded. + +## What shipped in this merge + +- durable multi-GPU DeepSeek work queue with separate main and repair phases +- worker respawn with process-group teardown so orphaned `VLLM::EngineCore` + processes do not pin VRAM after a crash +- GPU preflight and telemetry sidecars under `sidecars/ocr_runtime/` +- steady-state timing in the runtime summary +- default work-item retry ceiling of two total attempts + - first failure: retry once + - second failure: mark the batch failed and stop retrying it + +## Pending follow-up + +1. Capture and archive one clean fault-injection receipt on the merged + `development` branch. 
+ - Goal: preserve one explicit production-like run where a worker is killed + mid-run, the supervisor respawns it, the in-flight batch is retried once, + and the run still completes. + +2. Add operator-facing handling for terminally failed batches. + - The durable queue already marks them `failed`. + - The remaining work is a cleaner operator handoff, for example a dedicated + quarantine/export path or a documented replay workflow. + +3. Replace the current image-content stats implementation in + `run_pdf_ocr_vllm.py`. + - It still uses a CPU-heavy PIL pixel scan and currently emits a Pillow + deprecation warning. + +4. Run a longer unattended soak after merge. + - The current validation covers targeted tests, full end-to-end runs, and + reliability-path implementation, but production confidence still benefits + from a longer multi-hour burn-in on the merged branch. diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 3a7e57c..179b211 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -41,6 +41,19 @@ OCR reruns should preserve: - explicit indication that remediation was attempted - visibility into files that remain problematic +## DeepSeek runtime contract + +- `ocr()` may execute page-range shards internally when `use_gpus="multi"` and `scheduler="exact_fill"`, but the stage contract remains one canonical Markdown file and one canonical metrics file per source PDF. +- When shard execution is used, the runner reassembles `markdown/<stem>.md` and `json/metrics/<stem>.metrics.json` after the CLI workers finish. +- Execution-time shard artifacts are moved under `sidecars/ocr_shards/` so downstream stages do not mistake them for canonical stage outputs. +- The vLLM runtime now streams rendered pages through an in-memory queue, overlaps rendering with inference, skips empty pages before inference, and reuses the same in-memory image for repair retries. 
+- Canonical OCR markdown now annotates page boundaries with `` comments alongside each page-split marker so downstream inspection can line up page images and markdown more easily. +- In `repair_mode="auto"`, a page that trips the garbage cutoff again during the plain-OCR repair pass is now blanked instead of keeping the original garbage text. +- Multi-GPU vLLM runs now execute through a durable shared batch queue rather than one fragile subprocess per preassigned lane. Workers claim first-pass batches dynamically, heartbeat while a batch is active, and can be respawned without losing finished batch outputs. +- Repair retries are now durable too. Flagged pages are published back into the same runtime database as a second global repair queue, and any GPU worker can drain those repair shards after the first-pass queue is complete. +- By default each durable batch gets at most two total attempts, so one retry is allowed after the first failure and then the batch is marked failed for operator follow-up. +- Operational sidecars for these runs live under `sidecars/ocr_runtime/`, including the durable work queue state, per-worker runtime JSON, GPU telemetry samples, GPU preflight output, and a final runtime summary with steady-state inference timestamps. + ## Contributor note Any change to candidate selection, skiplist semantics, or OCR-success metadata affects both rerun behavior and corpus analysis quality. 
diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 6368f81..2f76b67 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -3,24 +3,45 @@ from __future__ import annotations import argparse +import json import logging -import tempfile +import queue +import sys +import threading import time from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional, Tuple from PIL import Image +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( DEFAULT_MAX_NEW_TOKENS, - PAGE_SPLIT, + _join_page_outputs, + _count_rendered_pages, _iter_pdf_jobs, + _iter_rendered_pages, _postprocess_page_text, _profile_defaults, - _render_pages, + _split_page_outputs, _write_outputs, _write_progress, ) +from glossapi.ocr.deepseek.work_queue import ( + QUEUE_MAIN, + QUEUE_REPAIR, + STATUS_PENDING, + STATUS_RUNNING, + claim_next_batch, + enqueue_batches, + heartbeat_batch, + mark_batch_done, + mark_batch_failed, + work_queue_counts, +) from glossapi.ocr.utils.cleaning import StreamingGarbageDetector LOGGER = logging.getLogger(__name__) @@ -57,6 +78,12 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--disable-fp8-kv", action="store_true") parser.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) parser.add_argument("--content-debug", action="store_true") + parser.add_argument("--work-db", default=None) + parser.add_argument("--worker-id", default=None) + parser.add_argument("--worker-runtime-file", default=None) + parser.add_argument("--work-stale-after-sec", type=float, default=900.0) + parser.add_argument("--work-heartbeat-sec", type=float, default=10.0) + parser.add_argument("--work-max-attempts", type=int, default=2) return parser.parse_args() @@ -272,212 +299,458 @@ def 
_is_effectively_empty_page(image_stats: dict, repair_mode: str) -> bool: ) -def _load_job_image(item: dict) -> Image.Image: - return Image.open(item["image_path"]).convert("RGB") +def _resolve_job_image(item: dict) -> Tuple[Image.Image, bool]: + image = item.get("image") + if isinstance(image, Image.Image): + return image, False + return Image.open(item["image_path"]).convert("RGB"), True -def _generate_batch_outputs( +def _close_job_image(item: dict) -> None: + image = item.pop("image", None) + if isinstance(image, Image.Image): + image.close() + + +def _empty_page_metric(*, page_number: int, image_stats: dict) -> dict: + return { + "page_number": int(page_number), + "infer_sec": 0.0, + "raw_chars": 0, + "final_chars": 0, + "first_pass_quality_score": 0.0, + "first_pass_letters": 0, + "first_pass_digits": 0, + "first_pass_pua_chars": 0, + "repair_strategy": "skip_empty", + "repair_reason": "empty_page", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": True, + "garbage_early_stop_applied": False, + **image_stats, + } + + +def _utc_now_iso(now_ts: Optional[float] = None) -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(float(now_ts) if now_ts is not None else time.time())) + + +def _write_worker_runtime(runtime_file: Optional[Path], state: dict) -> None: + if runtime_file is None: + return + runtime_path = Path(runtime_file).expanduser().resolve() + runtime_path.parent.mkdir(parents=True, exist_ok=True) + payload = dict(state) + payload["updated_at"] = _utc_now_iso() + runtime_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") + + +def _build_jobs_from_batch(input_dir: Path, batch: dict) -> List[dict]: + files = list(batch.get("files") or []) + page_ranges = list(batch.get("page_ranges") or []) + return _iter_pdf_jobs(input_dir, files, page_ranges) + + +def _iter_selected_rendered_pages( + pdf_path: Path, + *, + render_dpi: int, + source_page_numbers: 
List[int], +): + import fitz + + doc = fitz.open(pdf_path) + try: + zoom = float(render_dpi) / 72.0 + matrix = fitz.Matrix(zoom, zoom) + for source_page_number in source_page_numbers: + idx = int(source_page_number) - 1 + if idx < 0 or idx >= int(doc.page_count): + raise ValueError(f"Requested page {source_page_number} outside document bounds for {pdf_path}") + page = doc[idx] + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + yield int(source_page_number), Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + finally: + doc.close() + + +def _emit_progress(output_dir: Path, stem: str, state: dict) -> None: + _write_progress( + output_dir, + stem, + state["page_outputs"], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + +def _resolve_repair_disposition(*, repair_text: str, repair_postprocess: dict) -> dict: + if bool(repair_postprocess.get("early_stops", 0)): + return { + "final_text": "", + "repair_applied": False, + "page_dropped_after_repair": True, + "drop_reason": "repeat_garbage_cutoff", + } + if repair_text.strip(): + return { + "final_text": repair_text, + "repair_applied": True, + "page_dropped_after_repair": False, + "drop_reason": None, + } + return { + "final_text": None, + "repair_applied": False, + "page_dropped_after_repair": False, + "drop_reason": None, + } + + +def _repair_summary_from_page_metrics(page_metrics: List[dict], repair_mode: str) -> dict: + return { + "repair_mode": str(repair_mode), + "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), + "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), + "plain_repairs": int( + sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied"))) + ), + "tiled_repairs": 0, + "pages_dropped_after_repeat_cutoff": int(sum(1 for item in page_metrics if bool(item.get("page_dropped_after_repair")))), + "empty_pages_skipped": 
int(sum(1 for item in page_metrics if bool(item.get("empty_page_skipped")))), + "pages_with_early_stop": int(sum(1 for item in page_metrics if bool(item.get("garbage_early_stop_applied")))), + } + + +def _load_persisted_doc_state(output_dir: Path, stem: str) -> dict: + markdown_path = output_dir / "markdown" / f"{stem}.md" + metrics_path = output_dir / "json" / "metrics" / f"{stem}.metrics.json" + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + page_count = int(metrics.get("page_count", 0)) + page_outputs = _split_page_outputs(markdown_path.read_text(encoding="utf-8")) if markdown_path.exists() else [] + if len(page_outputs) < page_count: + page_outputs.extend([""] * (page_count - len(page_outputs))) + elif len(page_outputs) > page_count: + page_outputs = page_outputs[:page_count] + metrics_by_page = { + int(item["page_number"]): dict(item) + for item in list(metrics.get("page_metrics") or []) + if item is not None and "page_number" in item + } + page_metrics = [metrics_by_page.get(page_number) for page_number in range(1, page_count + 1)] + extra_metrics = dict(metrics) + extra_metrics.pop("page_count", None) + extra_metrics.pop("model", None) + return { + "stem": stem, + "page_outputs": page_outputs, + "page_metrics": page_metrics, + "total_pages": page_count, + "extra_metrics": extra_metrics, + } + + +def _build_repair_batches(*, doc_states: Dict[str, dict], retry_pages_by_stem: Dict[str, List[int]], origin_batch_id: int) -> List[dict]: + batches: List[dict] = [] + for stem, retry_pages in sorted(retry_pages_by_stem.items()): + unique_retry_pages = sorted({int(page_number) for page_number in retry_pages}) + if not unique_retry_pages: + continue + state = doc_states[stem] + batches.append( + { + "queue_key": f"repair:{int(origin_batch_id)}:{stem}", + "origin_batch_id": int(origin_batch_id), + "stem": stem, + "pdf_path": str(state["pdf_path"]), + "source_name": str(state["source_name"]), + "source_stem": str(state["source_stem"]), + 
"source_start_page": int(state["source_start_page"]), + "source_end_page": int(state["source_start_page"]) + max(0, int(state["total_pages"]) - 1), + "repair_page_numbers": unique_retry_pages, + "pages": int(len(unique_retry_pages)), + } + ) + return batches + + +def _run_vllm_batch( llm, *, - jobs: List[dict], + batch: List[dict], prompt: str, - batch_size: int, sampling_params, ) -> List[dict]: - outputs_by_key: Dict[tuple[str, int], dict] = {} - for batch in _batched(jobs, batch_size): - prompt_batch = [] - opened_images: List[Image.Image] = [] - keys: List[tuple[str, int]] = [] - for item in batch: - image = _load_job_image(item) + if not batch: + return [] + + prompt_batch = [] + opened_images: List[Image.Image] = [] + keys: List[tuple[str, int]] = [] + for item in batch: + image, should_close = _resolve_job_image(item) + if should_close: opened_images.append(image) - keys.append((str(item["stem"]), int(item["page_number"]))) - prompt_batch.append( - { - "prompt": prompt, - "multi_modal_data": {"image": image}, - } - ) + keys.append((str(item["stem"]), int(item["page_number"]))) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + + try: infer_start = time.perf_counter() batch_outputs = llm.generate(prompt_batch, sampling_params=sampling_params) infer_sec = time.perf_counter() - infer_start - per_item_sec = infer_sec / max(1, len(batch)) + finally: for image in opened_images: image.close() - for item, key, output in zip(batch, keys, batch_outputs): - raw_text = "" - if getattr(output, "outputs", None): - raw_text = str(output.outputs[0].text) - outputs_by_key[key] = { + + per_item_sec = infer_sec / max(1, len(batch)) + results: List[dict] = [] + for item, key, output in zip(batch, keys, batch_outputs): + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + results.append( + { + "key": key, "item": item, "raw_text": raw_text, "infer_sec": float(per_item_sec), } - return 
[outputs_by_key[(str(item["stem"]), int(item["page_number"]))] for item in jobs] + ) + return results -def main() -> int: - args = _parse_args() - input_dir = Path(args.input_dir).resolve() - output_dir = Path(args.output_dir).resolve() - model_dir = Path(args.model_dir).resolve() - jobs_to_run = _iter_pdf_jobs(input_dir, args.files, args.page_ranges) - if not jobs_to_run: - return 0 +def _generate_batch_outputs( + llm, + *, + jobs: List[dict], + prompt: str, + batch_size: int, + sampling_params, +) -> List[dict]: + outputs_by_key: Dict[tuple[str, int], dict] = {} + for batch in _batched(jobs, batch_size): + for result in _run_vllm_batch( + llm, + batch=batch, + prompt=prompt, + sampling_params=sampling_params, + ): + outputs_by_key[result["key"]] = { + "item": result["item"], + "raw_text": result["raw_text"], + "infer_sec": result["infer_sec"], + } + return [outputs_by_key[(str(item["stem"]), int(item["page_number"]))] for item in jobs] - profile_defaults = _profile_defaults(args.ocr_profile) - prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] - plain_prompt = _profile_defaults("plain_ocr")["prompt"] - base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) - image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) - crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) - llm = _load_vllm( - model_dir, - gpu_memory_utilization=float(args.gpu_memory_utilization), - disable_fp8_kv=bool(args.disable_fp8_kv), - ) - sampling_params = _sampling_params( - args.max_new_tokens, - enable_garbage_early_stop=str(args.repair_mode or "off").strip().lower() == "auto", +def _run_jobs_to_outputs( + args: argparse.Namespace, + *, + jobs_to_run: List[dict], + output_dir: Path, + work_db: Optional[Path], + origin_batch_id: Optional[int], + llm, + prompt: str, + plain_prompt: str, + base_size: int, + 
image_size: int, + crop_mode: bool, + sampling_params, +) -> dict: + batch_wall_start = time.perf_counter() + batch_size = max(1, int(args.batch_size)) + doc_states: Dict[str, dict] = {} + plain_retry_jobs: List[dict] = [] + retry_pages_by_stem: Dict[str, List[int]] = {} + state_lock = threading.Lock() + render_queue: "queue.Queue[dict | None]" = queue.Queue(maxsize=max(2, batch_size * 2)) + producer_errors: List[BaseException] = [] + first_infer_started_at: Optional[float] = None + last_infer_completed_at: Optional[float] = None + shared_repair_queue = ( + work_db is not None + and origin_batch_id is not None + and str(args.repair_mode or "off").strip().lower() == "auto" ) - with tempfile.TemporaryDirectory(prefix="deepseek_vllm_") as tmp_dir_str: - tmp_dir = Path(tmp_dir_str) - doc_states: Dict[str, dict] = {} - jobs: List[dict] = [] - plain_retry_jobs: List[dict] = [] - - for job in jobs_to_run: - pdf_path = Path(job["pdf_path"]) - stem = str(job["stem"]) - doc_start = time.perf_counter() - render_start = time.perf_counter() - images = _render_pages( - pdf_path, - args.max_pages, - args.render_dpi, - start_page=int(job["start_page"]), - end_page=job["end_page"], - ) - render_sec = time.perf_counter() - render_start - total_pages = len(images) - state = { - "stem": stem, - "source_name": str(job["source_name"]), - "source_stem": str(job["source_stem"]), - "source_start_page": int(job["start_page"]), - "page_outputs": [""] * total_pages, - "page_metrics": [None] * total_pages, - "render_sec": float(render_sec), - "doc_start": float(doc_start), - "completed_pages": 0, - "total_pages": total_pages, - } - doc_states[stem] = state - _write_progress(output_dir, stem, [], total_pages, 0) - for idx, image in enumerate(images): - page_path = tmp_dir / f"{stem}_page_{idx + 1:04d}.png" - image_stats = _image_content_stats(image) - if _is_effectively_empty_page(image_stats, args.repair_mode): - state["page_metrics"][idx] = { - "page_number": int(idx + 1), - "infer_sec": 0.0, 
- "raw_chars": 0, - "final_chars": 0, - "first_pass_quality_score": 0.0, - "first_pass_letters": 0, - "first_pass_digits": 0, - "first_pass_pua_chars": 0, - "repair_strategy": "skip_empty", - "repair_reason": "empty_page", - "repair_attempted": False, - "repair_applied": False, - "empty_page_skipped": True, - "garbage_early_stop_applied": False, - **image_stats, - } - state["completed_pages"] = int(state["completed_pages"]) + 1 - _write_progress( - output_dir, - stem, - [page for page in state["page_outputs"] if page], - int(state["total_pages"]), - int(state["completed_pages"]), - ) - image.close() - continue - image.save(page_path, format="PNG") - image.close() - jobs.append( - { - "stem": stem, - "page_number": int(idx + 1), - "image_path": page_path, - "image_stats": image_stats, - } + def _render_producer() -> None: + try: + for job in jobs_to_run: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) + doc_start = time.perf_counter() + total_pages = _count_rendered_pages( + pdf_path, + args.max_pages, + start_page=int(job["start_page"]), + end_page=job["end_page"], ) + state = { + "stem": stem, + "pdf_path": str(pdf_path), + "source_name": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), + "page_outputs": [""] * total_pages, + "page_metrics": [None] * total_pages, + "render_sec": 0.0, + "doc_start": float(doc_start), + "completed_pages": 0, + "total_pages": total_pages, + } + with state_lock: + doc_states[stem] = state + _emit_progress(output_dir, stem, state) - first_pass_outputs = _generate_batch_outputs( - llm, - jobs=jobs, - prompt=prompt, - batch_size=int(args.batch_size), - sampling_params=sampling_params, - ) - for result in first_pass_outputs: - item = result["item"] - state = doc_states[item["stem"]] - raw_text = str(result["raw_text"]) - image_stats = dict(item.get("image_stats", {})) - page_text, postprocess_metrics = _postprocess_page_text( - raw_text, + render_start = 
time.perf_counter() + for page_number, image in enumerate( + _iter_rendered_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ), + start=1, + ): + image_stats = _image_content_stats(image) + if _is_effectively_empty_page(image_stats, args.repair_mode): + with state_lock: + state["page_metrics"][page_number - 1] = _empty_page_metric( + page_number=page_number, + image_stats=image_stats, + ) + state["completed_pages"] = int(state["completed_pages"]) + 1 + _emit_progress(output_dir, stem, state) + image.close() + continue + render_queue.put( + { + "stem": stem, + "page_number": int(page_number), + "image": image, + "image_stats": image_stats, + } + ) + + with state_lock: + state["render_sec"] = float(time.perf_counter() - render_start) + except BaseException as exc: # pragma: no cover - exercised in integration flows + producer_errors.append(exc) + finally: + render_queue.put(None) + + producer = threading.Thread(target=_render_producer, name="deepseek-vllm-render", daemon=True) + producer.start() + + in_flight_batch: List[dict] = [] + producer_done = False + queue_wait_timeout = 0.05 + queue_flush_marker = "__flush__" + try: + while not producer_done or in_flight_batch: + if not producer_done and len(in_flight_batch) < batch_size: + try: + item = render_queue.get(timeout=queue_wait_timeout) + except queue.Empty: + item = queue_flush_marker if in_flight_batch else None + if item is None: + if producer.is_alive(): + continue + producer_done = True + elif item == queue_flush_marker: + pass + else: + in_flight_batch.append(item) + if len(in_flight_batch) < batch_size: + continue + + if not in_flight_batch: + continue + + batch_infer_started_at = time.time() + if first_infer_started_at is None: + first_infer_started_at = batch_infer_started_at + batch_results = _run_vllm_batch( + llm, + batch=in_flight_batch, prompt=prompt, - content_debug=bool(args.content_debug), - ) - if args.content_debug: - 
page_text = f"\n{page_text}".strip() - state["page_outputs"][item["page_number"] - 1] = page_text - quality = _text_quality_metrics(page_text) - metric = { - "page_number": int(item["page_number"]), - "infer_sec": float(result["infer_sec"]), - "raw_chars": int(len(raw_text.strip())), - "final_chars": int(len(page_text.strip())), - "first_pass_quality_score": float(quality["quality_score"]), - "first_pass_letters": int(quality["letters"]), - "first_pass_digits": int(quality["digits"]), - "first_pass_pua_chars": int(quality["pua_chars"]), - "repair_strategy": "plain" if bool(postprocess_metrics.get("early_stops", 0)) else "none", - "repair_reason": "early_stop_markdown_garbage" if bool(postprocess_metrics.get("early_stops", 0)) else None, - "repair_attempted": False, - "repair_applied": False, - "empty_page_skipped": False, - "garbage_early_stop_applied": bool(postprocess_metrics.get("early_stops", 0)), - **image_stats, - **postprocess_metrics, - } - state["page_metrics"][item["page_number"] - 1] = metric - if bool(postprocess_metrics.get("early_stops", 0)) and str(args.repair_mode or "off").strip().lower() == "auto": - plain_retry_jobs.append(item) - state["completed_pages"] = int(state["completed_pages"]) + 1 - _write_progress( - output_dir, - item["stem"], - [page for page in state["page_outputs"] if page], - int(state["total_pages"]), - int(state["completed_pages"]), + sampling_params=sampling_params, ) + last_infer_completed_at = time.time() + for result in batch_results: + item = result["item"] + state = doc_states[item["stem"]] + raw_text = str(result["raw_text"]) + image_stats = dict(item.get("image_stats", {})) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + quality = _text_quality_metrics(page_text) + metric = { + "page_number": int(item["page_number"]), + "infer_sec": float(result["infer_sec"]), + 
"raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + "first_pass_quality_score": float(quality["quality_score"]), + "first_pass_letters": int(quality["letters"]), + "first_pass_digits": int(quality["digits"]), + "first_pass_pua_chars": int(quality["pua_chars"]), + "repair_strategy": "plain" if bool(postprocess_metrics.get("early_stops", 0)) else "none", + "repair_reason": "early_stop_markdown_garbage" if bool(postprocess_metrics.get("early_stops", 0)) else None, + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": bool(postprocess_metrics.get("early_stops", 0)), + **image_stats, + **postprocess_metrics, + } + with state_lock: + state["page_outputs"][item["page_number"] - 1] = page_text + state["page_metrics"][item["page_number"] - 1] = metric + state["completed_pages"] = int(state["completed_pages"]) + 1 + _emit_progress(output_dir, item["stem"], state) + + if bool(postprocess_metrics.get("early_stops", 0)) and str(args.repair_mode or "off").strip().lower() == "auto": + if shared_repair_queue: + retry_pages_by_stem.setdefault(str(item["stem"]), []).append(int(item["page_number"])) + _close_job_image(item) + else: + plain_retry_jobs.append(item) + else: + _close_job_image(item) + + in_flight_batch = [] + + producer.join() + if producer_errors: + raise producer_errors[0] if plain_retry_jobs: + repair_started_at = time.time() + if first_infer_started_at is None: + first_infer_started_at = repair_started_at plain_repair_outputs = _generate_batch_outputs( llm, jobs=plain_retry_jobs, prompt=plain_prompt, - batch_size=int(args.batch_size), + batch_size=batch_size, sampling_params=sampling_params, ) + last_infer_completed_at = time.time() for result in plain_repair_outputs: item = result["item"] state = doc_states[item["stem"]] @@ -492,39 +765,40 @@ def main() -> int: metric["repair_attempted"] = True metric["repair_infer_sec"] = 
float(result["infer_sec"]) metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) - metric["repair_final_chars"] = int(len(repair_text.strip())) metric["repair_profile"] = "plain_ocr" - metric["repair_quality_score"] = float(_text_quality_metrics(repair_text)["quality_score"]) + disposition = _resolve_repair_disposition( + repair_text=repair_text, + repair_postprocess=repair_postprocess, + ) + repair_effective_text = disposition["final_text"] or "" + metric["repair_final_chars"] = int(len(repair_effective_text.strip())) + metric["repair_quality_score"] = float(_text_quality_metrics(repair_effective_text)["quality_score"]) metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) + metric["repair_applied"] = bool(disposition["repair_applied"]) + metric["page_dropped_after_repair"] = bool(disposition["page_dropped_after_repair"]) + if disposition["drop_reason"] is not None: + metric["drop_reason"] = str(disposition["drop_reason"]) metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) - if repair_text.strip(): - state["page_outputs"][item["page_number"] - 1] = repair_text - metric["repair_applied"] = True - metric["final_chars"] = int(len(repair_text.strip())) - _write_progress( - output_dir, - item["stem"], - [page for page in state["page_outputs"] if page], - int(state["total_pages"]), - int(state["completed_pages"]), - ) + with state_lock: + if disposition["final_text"] is not None: + state["page_outputs"][item["page_number"] - 1] = repair_effective_text + metric["final_chars"] = int(len(repair_effective_text.strip())) + _emit_progress(output_dir, item["stem"], state) + _close_job_image(item) + finally: + for item in in_flight_batch: + _close_job_image(item) + for item in plain_retry_jobs: + _close_job_image(item) for stem, state in doc_states.items(): - markdown = PAGE_SPLIT.join(state["page_outputs"]) if 
state["page_outputs"] else "[[Blank page]]" + markdown = _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" page_metrics = sorted( [item for item in state["page_metrics"] if item], key=lambda item: int(item["page_number"]), ) - repair_summary = { - "repair_mode": str(args.repair_mode), - "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), - "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), - "plain_repairs": int(sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied")))), - "tiled_repairs": 0, - "empty_pages_skipped": int(sum(1 for item in page_metrics if bool(item.get("empty_page_skipped")))), - "pages_with_early_stop": int(sum(1 for item in page_metrics if bool(item.get("garbage_early_stop_applied")))), - } + repair_summary = _repair_summary_from_page_metrics(page_metrics, str(args.repair_mode)) _write_outputs( output_dir, stem, @@ -554,7 +828,392 @@ def main() -> int: "page_metrics": page_metrics, }, ) + if shared_repair_queue and retry_pages_by_stem: + enqueue_batches( + work_db, + queue_name=QUEUE_REPAIR, + batches=_build_repair_batches( + doc_states=doc_states, + retry_pages_by_stem=retry_pages_by_stem, + origin_batch_id=int(origin_batch_id), + ), + ) + + return { + "docs": int(len(doc_states)), + "pages": int(sum(int(state["total_pages"]) for state in doc_states.values())), + "render_sec_total": float(sum(float(state["render_sec"]) for state in doc_states.values())), + "infer_sec_total": float( + sum( + sum(float(item["infer_sec"]) for item in state["page_metrics"] if item is not None) + for state in doc_states.values() + ) + ), + "first_infer_started_at": _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None, + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None, + 
"repair_batches_enqueued": int(sum(1 for pages in retry_pages_by_stem.values() if pages)), + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + } + + +def _run_repair_batch_to_outputs( + args: argparse.Namespace, + *, + batch: dict, + output_dir: Path, + llm, + plain_prompt: str, + sampling_params, +) -> dict: + batch_wall_start = time.perf_counter() + stem = str(batch["stem"]) + state = _load_persisted_doc_state(output_dir, stem) + source_start_page = int(batch["source_start_page"]) + repair_page_numbers = sorted({int(page_number) for page_number in list(batch.get("repair_page_numbers") or [])}) + if not repair_page_numbers: + return { + "docs": 1, + "pages": 0, + "render_sec_total": 0.0, + "infer_sec_total": 0.0, + "first_infer_started_at": None, + "last_infer_completed_at": None, + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + } + + render_start = time.perf_counter() + source_page_numbers = [source_start_page + page_number - 1 for page_number in repair_page_numbers] + repair_jobs: List[dict] = [] + for source_page_number, image in _iter_selected_rendered_pages( + Path(str(batch["pdf_path"])), + render_dpi=int(args.render_dpi), + source_page_numbers=source_page_numbers, + ): + repair_jobs.append( + { + "stem": stem, + "page_number": int(source_page_number) - source_start_page + 1, + "image": image, + } + ) + render_sec = float(time.perf_counter() - render_start) + if not repair_jobs: + return { + "docs": 1, + "pages": 0, + "render_sec_total": render_sec, + "infer_sec_total": 0.0, + "first_infer_started_at": None, + "last_infer_completed_at": None, + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + } + + first_infer_started_at = time.time() + repair_outputs = _generate_batch_outputs( + llm, + jobs=repair_jobs, + prompt=plain_prompt, + batch_size=max(1, int(args.batch_size)), + sampling_params=sampling_params, + ) + last_infer_completed_at = time.time() + try: + for result in repair_outputs: + 
item = result["item"] + page_number = int(item["page_number"]) + metric = state["page_metrics"][page_number - 1] + if metric is None: + metric = { + "page_number": page_number, + "infer_sec": 0.0, + "raw_chars": 0, + "final_chars": 0, + "first_pass_quality_score": 0.0, + "first_pass_letters": 0, + "first_pass_digits": 0, + "first_pass_pua_chars": 0, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + } + state["page_metrics"][page_number - 1] = metric + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + repair_text = f"\n{repair_text}".strip() + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + metric["repair_profile"] = "plain_ocr" + disposition = _resolve_repair_disposition( + repair_text=repair_text, + repair_postprocess=repair_postprocess, + ) + repair_effective_text = disposition["final_text"] or "" + metric["repair_final_chars"] = int(len(repair_effective_text.strip())) + metric["repair_quality_score"] = float(_text_quality_metrics(repair_effective_text)["quality_score"]) + metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) + metric["repair_applied"] = bool(disposition["repair_applied"]) + metric["page_dropped_after_repair"] = bool(disposition["page_dropped_after_repair"]) + if disposition["drop_reason"] is not None: + metric["drop_reason"] = str(disposition["drop_reason"]) + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric.get("infer_sec", 0.0)) + float(result["infer_sec"]) + if disposition["final_text"] is not 
None: + state["page_outputs"][page_number - 1] = repair_effective_text + metric["final_chars"] = int(len(repair_effective_text.strip())) + _close_job_image(item) + finally: + for item in repair_jobs: + _close_job_image(item) + + page_metrics = sorted([item for item in state["page_metrics"] if item], key=lambda item: int(item["page_number"])) + extra_metrics = dict(state["extra_metrics"]) + extra_metrics["repair_summary"] = _repair_summary_from_page_metrics(page_metrics, extra_metrics.get("repair_mode", args.repair_mode)) + extra_metrics["page_metrics"] = page_metrics + extra_metrics["infer_sec_total"] = float(sum(float(item["infer_sec"]) for item in page_metrics)) + _write_outputs( + output_dir, + stem, + _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]", + int(state["total_pages"]), + extra_metrics=extra_metrics, + ) + return { + "docs": 1, + "pages": int(len(repair_page_numbers)), + "render_sec_total": render_sec, + "infer_sec_total": float(sum(float(result["infer_sec"]) for result in repair_outputs)), + "first_infer_started_at": _utc_now_iso(first_infer_started_at), + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at), + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + } + + +def _queue_has_pending_or_running(counts: Dict[str, object], queue_name: str) -> bool: + queue_counts = counts.get("by_queue", {}).get(queue_name, {}) + return int(queue_counts.get(STATUS_PENDING, 0)) > 0 or int(queue_counts.get(STATUS_RUNNING, 0)) > 0 + + +def _claim_next_phase_batch( + work_db: Path, + *, + worker_id: str, + stale_after_sec: float, +) -> Tuple[Optional[str], Optional[Dict[str, object]], bool]: + batch = claim_next_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + queue_name=QUEUE_MAIN, + ) + if batch is not None: + return QUEUE_MAIN, batch, False + + counts = work_queue_counts(work_db) + # Repairs are a distinct global phase: no worker should start repair work + # while 
any first-pass batch is still pending or running elsewhere. + if _queue_has_pending_or_running(counts, QUEUE_MAIN): + return None, None, True + + batch = claim_next_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + queue_name=QUEUE_REPAIR, + ) + if batch is not None: + return QUEUE_REPAIR, batch, False + + counts = work_queue_counts(work_db) + if _queue_has_pending_or_running(counts, QUEUE_REPAIR): + return None, None, True + return None, None, False + + +def _run_work_queue( + args: argparse.Namespace, + *, + input_dir: Path, + output_dir: Path, + llm, + prompt: str, + plain_prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, + sampling_params, +) -> int: + work_db = Path(str(args.work_db)).expanduser().resolve() + worker_id = str(args.worker_id or f"worker-{int(time.time())}") + runtime_file = Path(str(args.worker_runtime_file)).expanduser().resolve() if args.worker_runtime_file else None + heartbeat_interval = float(max(1.0, args.work_heartbeat_sec)) + stale_after_sec = float(max(30.0, args.work_stale_after_sec)) + max_attempts = int(max(1, args.work_max_attempts)) + runtime_state = { + "worker_id": worker_id, + "status": "starting", + "started_at": _utc_now_iso(), + "engine_ready_at": _utc_now_iso(), + "current_batch_id": None, + "current_queue_name": None, + "completed_batches": [], + "first_batch_started_at": None, + "last_batch_finished_at": None, + } + _write_worker_runtime(runtime_file, runtime_state) + + while True: + queue_name, batch, should_wait = _claim_next_phase_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + ) + if batch is None: + if should_wait: + time.sleep(min(heartbeat_interval, 1.0)) + continue + runtime_state["status"] = "complete" + runtime_state["current_batch_id"] = None + runtime_state["current_queue_name"] = None + _write_worker_runtime(runtime_file, runtime_state) + return 0 + + batch_id = int(batch["batch_id"]) + heartbeat_stop = threading.Event() + + def 
_heartbeat_loop() -> None: + while not heartbeat_stop.wait(heartbeat_interval): + heartbeat_batch(work_db, batch_id=batch_id, worker_id=worker_id) + runtime_state["heartbeat_at"] = _utc_now_iso() + _write_worker_runtime(runtime_file, runtime_state) + + heartbeat_thread = threading.Thread(target=_heartbeat_loop, name=f"{worker_id}-heartbeat", daemon=True) + heartbeat_thread.start() + try: + runtime_state["status"] = f"running_{queue_name}" + runtime_state["current_batch_id"] = batch_id + runtime_state["current_queue_name"] = queue_name + runtime_state["current_batch_pages"] = int(batch.get("pages", 0)) + runtime_state["heartbeat_at"] = _utc_now_iso() + _write_worker_runtime(runtime_file, runtime_state) + if queue_name == QUEUE_MAIN: + result = _run_jobs_to_outputs( + args, + jobs_to_run=_build_jobs_from_batch(input_dir, batch), + output_dir=output_dir, + work_db=work_db, + origin_batch_id=batch_id, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) + else: + result = _run_repair_batch_to_outputs( + args, + batch=batch, + output_dir=output_dir, + llm=llm, + plain_prompt=plain_prompt, + sampling_params=sampling_params, + ) + if runtime_state["first_batch_started_at"] is None: + runtime_state["first_batch_started_at"] = result.get("first_infer_started_at") + runtime_state["last_batch_finished_at"] = result.get("last_infer_completed_at") + runtime_state["completed_batches"].append( + { + "batch_id": batch_id, + "queue_name": queue_name, + } + ) + mark_batch_done(work_db, batch_id=batch_id, worker_id=worker_id, result=result) + except Exception as exc: + runtime_state["status"] = "failed" + runtime_state["current_batch_id"] = batch_id + runtime_state["current_queue_name"] = queue_name + runtime_state["last_error"] = str(exc) + _write_worker_runtime(runtime_file, runtime_state) + mark_batch_failed( + work_db, + batch_id=batch_id, + worker_id=worker_id, + 
error=str(exc), + max_attempts=max_attempts, + ) + raise + finally: + heartbeat_stop.set() + heartbeat_thread.join(timeout=max(1.0, heartbeat_interval)) + runtime_state["current_batch_id"] = None + runtime_state["current_queue_name"] = None + _write_worker_runtime(runtime_file, runtime_state) + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] + plain_prompt = _profile_defaults("plain_ocr")["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + llm = _load_vllm( + model_dir, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + ) + sampling_params = _sampling_params( + args.max_new_tokens, + enable_garbage_early_stop=str(args.repair_mode or "off").strip().lower() == "auto", + ) + + if args.work_db: + return _run_work_queue( + args, + input_dir=input_dir, + output_dir=output_dir, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) + jobs_to_run = _iter_pdf_jobs(input_dir, args.files, args.page_ranges) + if not jobs_to_run: + return 0 + _run_jobs_to_outputs( + args, + jobs_to_run=jobs_to_run, + output_dir=output_dir, + work_db=None, + origin_batch_id=None, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) return 0 diff --git 
a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 7a22018..07f5285 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -3,12 +3,17 @@ from __future__ import annotations from contextlib import ExitStack +import calendar import json import logging import os +import re +import signal import shutil import subprocess import sys +import threading +import time from pathlib import Path from typing import Any, Dict, Iterable, List, Optional @@ -20,6 +25,15 @@ build_whole_document_slices, pack_slices_into_batches, ) +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs +from glossapi.ocr.deepseek.work_queue import ( + STATUS_DONE, + STATUS_FAILED, + init_work_db, + iter_work_items, + requeue_worker_batches, + work_queue_counts, +) try: import pypdfium2 as _pypdfium2 @@ -32,6 +46,26 @@ DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" AUTO_VLLM_BATCH_PAGE_CAP = 160 DEFAULT_MAX_NEW_TOKENS = 2048 +DEFAULT_WORKER_RESPAWN_CAP = 3 +DEFAULT_WORK_ITEM_MAX_ATTEMPTS = 2 +DEFAULT_WORK_STALE_AFTER_SEC = 900.0 +DEFAULT_WORK_HEARTBEAT_SEC = 10.0 +DEFAULT_TELEMETRY_INTERVAL_SEC = 15.0 +SHARD_STEM_RE = re.compile(r"^(?P.+)__p(?P\d{5})-(?P\d{5})$") +REASSEMBLED_CONFIG_KEYS = ( + "ocr_profile", + "attn_backend", + "runtime_backend", + "base_size", + "image_size", + "crop_mode", + "render_dpi", + "max_new_tokens", + "batch_size", + "gpu_memory_utilization", + "disable_fp8_kv", + "repair_mode", +) def _page_count(pdf_path: Path) -> int: @@ -70,6 +104,12 @@ def _build_cli_command( gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, repair_mode: Optional[str], + work_db: Optional[Path] = None, + worker_id: Optional[str] = None, + worker_runtime_file: Optional[Path] = None, + work_stale_after_sec: Optional[float] = None, + work_heartbeat_sec: Optional[float] = None, + work_max_attempts: Optional[int] = 
None, ) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ @@ -110,6 +150,18 @@ def _build_cli_command( cmd += ["--render-dpi", str(int(render_dpi))] if max_new_tokens is not None: cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if work_db is not None: + cmd += ["--work-db", str(work_db)] + if worker_id: + cmd += ["--worker-id", str(worker_id)] + if worker_runtime_file is not None: + cmd += ["--worker-runtime-file", str(worker_runtime_file)] + if work_stale_after_sec is not None: + cmd += ["--work-stale-after-sec", str(float(work_stale_after_sec))] + if work_heartbeat_sec is not None: + cmd += ["--work-heartbeat-sec", str(float(work_heartbeat_sec))] + if work_max_attempts is not None: + cmd += ["--work-max-attempts", str(int(work_max_attempts))] if repetition_penalty is not None: cmd += ["--repetition-penalty", str(float(repetition_penalty))] if no_repeat_ngram_size is not None: @@ -127,13 +179,32 @@ def _build_cli_command( return cmd -def _build_env(*, python_bin: Optional[Path], visible_device: Optional[int] = None) -> Dict[str, str]: +def _build_env( + *, + python_bin: Optional[Path], + visible_device: Optional[int] = None, + script: Optional[Path] = None, +) -> Dict[str, str]: env = os.environ.copy() if python_bin: python_path = Path(python_bin).expanduser() venv_bin = str(python_path.parent) env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" env["VIRTUAL_ENV"] = str(python_path.parent.parent) + if script is not None: + script_path = Path(script).expanduser().resolve() + src_root = next((parent for parent in script_path.parents if (parent / "glossapi").is_dir()), None) + if src_root is not None: + src_root_str = str(src_root) + existing_pythonpath = str(env.get("PYTHONPATH", "")).strip() + pythonpath_entries = [src_root_str] + if existing_pythonpath: + pythonpath_entries.extend( + entry + for entry in existing_pythonpath.split(os.pathsep) + if entry and entry != src_root_str + ) + env["PYTHONPATH"] = 
os.pathsep.join(pythonpath_entries) env.pop("PYTHONHOME", None) if visible_device is not None: env["CUDA_VISIBLE_DEVICES"] = str(visible_device) @@ -202,7 +273,7 @@ def _run_cli( disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, ) - env = _build_env(python_bin=python_bin, visible_device=visible_device) + env = _build_env(python_bin=python_bin, visible_device=visible_device, script=script) LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments @@ -416,6 +487,51 @@ def _plan_lane_batches( return [lane.to_dict() for lane in lanes if lane.batches] +def _plan_work_batches( + *, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], + runtime_backend: str, + scheduler: Optional[str], + lane_devices: List[int], + workers_per_gpu: int, + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + documents = _source_documents( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + ) + scheduler_norm = _resolve_scheduler( + scheduler=scheduler, + runtime_backend=runtime_backend, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches( + documents, + target_batch_pages=max(1, int(target_batch_pages)), + ) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches( + slices, + target_batch_pages=max(1, int(target_batch_pages)), + ) + return [batch.to_dict() for batch in batches if int(batch.pages) > 0] + + def _auto_vllm_batch_size( *, runtime_backend: str, @@ -462,6 +578,435 @@ def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: } +def _utc_now_iso(now_ts: Optional[float] = None) -> 
str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(float(now_ts) if now_ts is not None else time.time())) + + +def _parse_utc_iso(value: Optional[str]) -> Optional[float]: + if not value: + return None + try: + return float(calendar.timegm(time.strptime(str(value), "%Y-%m-%dT%H:%M:%SZ"))) + except Exception: + return None + + +def _run_text_command(cmd: List[str]) -> str: + proc = subprocess.run(cmd, check=True, capture_output=True, text=True) # nosec: controlled args + return str(proc.stdout or "").strip() + + +def _process_group_members(pgid: int) -> List[int]: + proc = subprocess.run(["pgrep", "-g", str(int(pgid))], check=False, capture_output=True, text=True) # nosec: controlled args + if int(proc.returncode) not in {0, 1}: + return [] + members: List[int] = [] + for line in str(proc.stdout or "").splitlines(): + line = line.strip() + if line: + try: + members.append(int(line)) + except ValueError: + continue + return members + + +def _wait_for_process_group_exit(pgid: int, *, timeout_sec: float) -> bool: + deadline = time.time() + float(max(0.0, timeout_sec)) + while time.time() <= deadline: + if not _process_group_members(pgid): + return True + time.sleep(0.2) + return not _process_group_members(pgid) + + +def _terminate_worker_process_group(worker: Dict[str, Any]) -> bool: + pgid = int(worker["proc"].pid) + worker_id = str(worker["worker_id"]) + for sig, grace_sec in ((signal.SIGTERM, 5.0), (signal.SIGKILL, 5.0)): + try: + os.killpg(pgid, sig) + except ProcessLookupError: + return True + except Exception as exc: + LOGGER.warning("Failed to signal worker process group %s pgid=%s: %s", worker_id, pgid, exc) + return False + if _wait_for_process_group_exit(pgid, timeout_sec=grace_sec): + return True + LOGGER.warning("Worker process group %s pgid=%s did not exit cleanly", worker_id, pgid) + return False + + +def _launch_worker_process(cmd: List[str], *, fh, env: Dict[str, str]) -> subprocess.Popen: + return subprocess.Popen( + cmd, + stdout=fh, + 
stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) # nosec: controlled args + + +def _parse_csv_table(text: str, columns: List[str]) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + for raw_line in str(text or "").splitlines(): + line = raw_line.strip() + if not line: + continue + parts = [piece.strip() for piece in line.split(",")] + if len(parts) < len(columns): + parts.extend([""] * (len(columns) - len(parts))) + rows.append({name: str(parts[idx]) for idx, name in enumerate(columns)}) + return rows + + +def _collect_gpu_snapshot(*, visible_devices: List[int]) -> Dict[str, Any]: + gpu_text = _run_text_command( + [ + "nvidia-smi", + f"--id={','.join(str(device) for device in visible_devices)}", + "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,persistence_mode", + "--format=csv,noheader,nounits", + ] + ) + process_text = _run_text_command( + [ + "nvidia-smi", + "--query-compute-apps=gpu_uuid,pid,process_name,used_memory", + "--format=csv,noheader,nounits", + ] + ) + return { + "captured_at": _utc_now_iso(), + "gpus": _parse_csv_table( + gpu_text, + [ + "index", + "name", + "utilization_gpu", + "memory_used_mib", + "memory_total_mib", + "temperature_c", + "power_draw_w", + "persistence_mode", + ], + ), + "processes": _parse_csv_table( + process_text, + [ + "gpu_uuid", + "pid", + "process_name", + "used_memory_mib", + ], + ), + } + + +def _read_worker_runtime(runtime_path: Path) -> Dict[str, Any]: + try: + return json.loads(Path(runtime_path).read_text(encoding="utf-8")) + except Exception: + return {} + + +def _write_runtime_summary(*, runtime_dir: Path, db_path: Path) -> Path: + runtime_dir.mkdir(parents=True, exist_ok=True) + workers = [] + first_batch_started = [] + last_batch_finished = [] + engine_ready = [] + for path in sorted(runtime_dir.glob("worker_*.runtime.json")): + data = _read_worker_runtime(path) + workers.append(data) + first_batch_started_ts = 
_parse_utc_iso(data.get("first_batch_started_at")) + last_batch_finished_ts = _parse_utc_iso(data.get("last_batch_finished_at")) + engine_ready_ts = _parse_utc_iso(data.get("engine_ready_at")) + if first_batch_started_ts is not None: + first_batch_started.append(first_batch_started_ts) + if last_batch_finished_ts is not None: + last_batch_finished.append(last_batch_finished_ts) + if engine_ready_ts is not None: + engine_ready.append(engine_ready_ts) + steady_summary = { + "first_batch_started_at": _utc_now_iso(min(first_batch_started)) if first_batch_started else None, + "last_batch_finished_at": _utc_now_iso(max(last_batch_finished)) if last_batch_finished else None, + "all_workers_ready_at": _utc_now_iso(max(engine_ready)) if engine_ready else None, + "first_batch_to_last_batch_window_sec": ( + float(max(last_batch_finished) - min(first_batch_started)) + if first_batch_started and last_batch_finished + else None + ), + "all_workers_ready_to_last_batch_window_sec": ( + float(max(last_batch_finished) - max(engine_ready)) + if engine_ready and last_batch_finished + else None + ), + } + summary_path = runtime_dir / "runtime_summary.json" + summary_path.write_text( + json.dumps( + { + "generated_at": _utc_now_iso(), + "queue_counts": work_queue_counts(db_path), + "work_items": list(iter_work_items(db_path)), + "workers": workers, + "steady_state": steady_summary, + }, + indent=2, + ), + encoding="utf-8", + ) + return summary_path + + +def _query_persistence_mode(*, visible_devices: List[int]) -> List[Dict[str, str]]: + raw = _run_text_command( + [ + "nvidia-smi", + f"--id={','.join(str(device) for device in visible_devices)}", + "--query-gpu=index,persistence_mode", + "--format=csv,noheader,nounits", + ] + ) + return _parse_csv_table(raw, ["index", "persistence_mode"]) + + +def _ensure_gpu_preflight(*, visible_devices: List[int], mode: str) -> Dict[str, Any]: + mode_norm = str(mode or "warn").strip().lower() + status = { + "mode": mode_norm, + "checked_at": 
_utc_now_iso(), + "before": _query_persistence_mode(visible_devices=visible_devices), + "changed": False, + } + disabled = [item for item in status["before"] if str(item.get("persistence_mode", "")).lower() != "enabled"] + if not disabled or mode_norm == "off": + status["after"] = list(status["before"]) + return status + if mode_norm == "ensure": + try: + subprocess.run(["sudo", "-n", "nvidia-smi", "-pm", "1"], check=True, capture_output=True, text=True) # nosec: controlled args + status["changed"] = True + except Exception as exc: + status["ensure_error"] = str(exc) + status["after"] = _query_persistence_mode(visible_devices=visible_devices) + return status + + +def _collect_xid_faults(*, start_utc_iso: str) -> Dict[str, Any]: + cmd = [ + "journalctl", + "-k", + "--since", + str(start_utc_iso), + "--no-pager", + ] + try: + output = _run_text_command(cmd) + except Exception as exc: + return { + "supported": False, + "error": str(exc), + "faults": [], + } + faults = [line for line in output.splitlines() if "NVRM: Xid" in line] + return { + "supported": True, + "faults": faults, + } + + +def _start_gpu_telemetry( + *, + telemetry_path: Path, + visible_devices: List[int], + interval_sec: float, + stop_event: threading.Event, +) -> threading.Thread: + telemetry_path.parent.mkdir(parents=True, exist_ok=True) + + def _loop() -> None: + while not stop_event.wait(float(max(1.0, interval_sec))): + try: + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + except Exception as exc: # pragma: no cover - best effort logging + LOGGER.warning("GPU telemetry sample failed: %s", exc) + + thread = threading.Thread(target=_loop, name="deepseek-gpu-telemetry", daemon=True) + thread.start() + return thread + + +def _parse_shard_stem(stem: str) -> Optional[Dict[str, Any]]: + match = SHARD_STEM_RE.match(str(stem)) + if match is None: + return None + return { + "source_stem": 
str(match.group("source_stem")), + "start_page": int(match.group("start")), + "end_page": int(match.group("end")), + } + + +def _split_markdown_pages(markdown_text: str, *, expected_pages: int) -> List[str]: + pages = _split_page_outputs(markdown_text) + if len(pages) < int(expected_pages): + pages.extend([""] * (int(expected_pages) - len(pages))) + elif len(pages) > int(expected_pages): + pages = pages[: int(expected_pages)] + return pages + + +def _archive_shard_artifact(*, out_root: Path, source_path: Path, relative_path: Path) -> None: + archive_path = out_root / "sidecars" / "ocr_shards" / relative_path + archive_path.parent.mkdir(parents=True, exist_ok=True) + if archive_path.exists(): + archive_path.unlink() + source_path.replace(archive_path) + + +def _reassemble_canonical_output_for_source( + *, + out_root: Path, + pdf_path: Path, + source_name: str, +) -> bool: + md_dir = out_root / "markdown" + metrics_dir = out_root / "json" / "metrics" + source_stem = Path(source_name).stem + canonical_md = md_dir / f"{source_stem}.md" + canonical_metrics = metrics_dir / f"{source_stem}.metrics.json" + if canonical_md.exists() and canonical_metrics.exists(): + return True + + shard_records: List[Dict[str, Any]] = [] + for metrics_path in sorted(metrics_dir.glob(f"{source_stem}__p*.metrics.json")): + shard_stem = metrics_path.name.removesuffix(".metrics.json") + shard_md = md_dir / f"{shard_stem}.md" + if not shard_md.exists(): + continue + shard_meta = _parse_shard_stem(shard_stem) + if shard_meta is None: + continue + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + start_page = int(metrics.get("source_start_page", shard_meta["start_page"])) + end_page = int(metrics.get("source_end_page", shard_meta["end_page"])) + shard_records.append( + { + "stem": shard_stem, + "md_path": shard_md, + "metrics_path": metrics_path, + "metrics": metrics, + "start_page": start_page, + "end_page": end_page, + } + ) + + if not shard_records: + return False + + 
shard_records.sort(key=lambda item: (int(item["start_page"]), int(item["end_page"]), str(item["stem"]))) + page_count = max(int(_page_count(pdf_path)), max(int(item["end_page"]) for item in shard_records)) + merged_pages = [""] * int(page_count) + merged_page_metrics: List[Optional[Dict[str, Any]]] = [None] * int(page_count) + merged_extra_metrics: Dict[str, Any] = {} + repair_totals: Dict[str, int] = {} + render_sec_total = 0.0 + infer_sec_total = 0.0 + wall_time_sec_total = 0.0 + reassembled_ranges: List[Dict[str, int]] = [] + + for shard in shard_records: + metrics = dict(shard["metrics"]) + start_page = int(shard["start_page"]) + end_page = int(shard["end_page"]) + expected_pages = max(0, end_page - start_page + 1) + reassembled_ranges.append({"start_page": start_page, "end_page": end_page}) + + shard_pages = _split_markdown_pages( + shard["md_path"].read_text(encoding="utf-8"), + expected_pages=expected_pages, + ) + for offset, page_text in enumerate(shard_pages): + merged_pages[start_page - 1 + offset] = page_text + + for idx, page_metric in enumerate(list(metrics.get("page_metrics") or []), start=1): + absolute_page = start_page + int(page_metric.get("page_number", idx)) - 1 + if absolute_page <= 0 or absolute_page > int(page_count): + continue + merged_metric = dict(page_metric) + merged_metric["page_number"] = int(absolute_page) + merged_page_metrics[absolute_page - 1] = merged_metric + + render_sec_total += float(metrics.get("render_sec", 0.0)) + infer_sec_total += float(metrics.get("infer_sec_total", 0.0)) + wall_time_sec_total += float(metrics.get("wall_time_sec", 0.0)) + for key, value in dict(metrics.get("repair_summary") or {}).items(): + if key == "repair_mode": + continue + repair_totals[key] = int(repair_totals.get(key, 0)) + int(value) + for key in REASSEMBLED_CONFIG_KEYS: + if key in metrics and key not in merged_extra_metrics: + merged_extra_metrics[key] = metrics[key] + + merged_extra_metrics.update( + { + "source_file": str(source_name), + 
"source_stem": str(source_stem), + "source_start_page": 1, + "source_end_page": int(page_count), + "reassembled_from_shards": True, + "reassembled_shard_count": len(shard_records), + "reassembled_source_ranges": reassembled_ranges, + "render_sec": float(render_sec_total), + "infer_sec_total": float(infer_sec_total), + "wall_time_sec": float(wall_time_sec_total), + "wall_time_sec_semantics": "sum_of_shard_wall_times", + "page_metrics": [item for item in merged_page_metrics if item is not None], + } + ) + if repair_totals: + merged_extra_metrics["repair_summary"] = { + "repair_mode": str(merged_extra_metrics.get("repair_mode", "unknown")), + **{key: int(value) for key, value in repair_totals.items()}, + } + + merged_markdown = _join_page_outputs(merged_pages) if merged_pages else "[[Blank page]]" + _write_outputs( + output_dir=out_root, + stem=source_stem, + markdown=merged_markdown, + page_count=int(page_count), + extra_metrics=merged_extra_metrics, + ) + for shard in shard_records: + _archive_shard_artifact( + out_root=out_root, + source_path=Path(shard["md_path"]), + relative_path=Path("markdown") / Path(shard["md_path"]).name, + ) + _archive_shard_artifact( + out_root=out_root, + source_path=Path(shard["metrics_path"]), + relative_path=Path("json") / "metrics" / Path(shard["metrics_path"]).name, + ) + return True + + +def _ensure_canonical_outputs(*, out_root: Path, pdf_root: Path, file_list: List[str]) -> None: + for name in file_list: + pdf_path = (pdf_root / name).resolve() + if _reassemble_canonical_output_for_source( + out_root=out_root, + pdf_path=pdf_path, + source_name=name, + ): + continue + + + def _run_multi_cli( *, input_root: Path, @@ -495,6 +1040,180 @@ def _run_multi_cli( shard_pages: int, shard_threshold_pages: int, ) -> None: + if str(runtime_backend or "").strip().lower() == "vllm": + batches = _plan_work_batches( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + runtime_backend=runtime_backend, + scheduler=scheduler, + 
lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + ) + if not batches: + return + + log_dir.mkdir(parents=True, exist_ok=True) + runtime_dir = out_root / "sidecars" / "ocr_runtime" + runtime_dir.mkdir(parents=True, exist_ok=True) + work_db = runtime_dir / "work_queue.sqlite" + init_work_db(work_db, batches=batches, replace=True) + + visible_devices = sorted({int(device) for device in lane_devices}) + preflight_mode = str(os.environ.get("GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT", "ensure")).strip().lower() + preflight = _ensure_gpu_preflight(visible_devices=visible_devices, mode=preflight_mode) + (runtime_dir / "gpu_preflight.json").write_text(json.dumps(preflight, indent=2), encoding="utf-8") + + telemetry_path = runtime_dir / "gpu_telemetry.jsonl" + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "preflight", **preflight}) + "\n") + fh.write(json.dumps({"kind": "initial_sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + + telemetry_stop = threading.Event() + telemetry_thread = _start_gpu_telemetry( + telemetry_path=telemetry_path, + visible_devices=visible_devices, + interval_sec=float(os.environ.get("GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC", DEFAULT_TELEMETRY_INTERVAL_SEC)), + stop_event=telemetry_stop, + ) + stale_after_sec = float(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC", DEFAULT_WORK_STALE_AFTER_SEC)) + heartbeat_sec = float(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC", DEFAULT_WORK_HEARTBEAT_SEC)) + respawn_cap = int(os.environ.get("GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP", DEFAULT_WORKER_RESPAWN_CAP)) + work_max_attempts = int( + max(1, int(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS", DEFAULT_WORK_ITEM_MAX_ATTEMPTS))) + ) + xid_start = _utc_now_iso() + + def _start_worker(*, worker_id: str, visible_device: int, respawns: int) -> Dict[str, 
Any]: + log_path = log_dir / f"{worker_id}.r{int(respawns)}.log" + fh = log_path.open("w", encoding="utf-8") + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size_for_pages( + runtime_backend=runtime_backend, + pages=int(target_batch_pages), + ) + ) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=[], + page_ranges=None, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + work_db=work_db, + worker_id=worker_id, + worker_runtime_file=runtime_dir / f"{worker_id}.runtime.json", + work_stale_after_sec=stale_after_sec, + work_heartbeat_sec=heartbeat_sec, + work_max_attempts=work_max_attempts, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device, script=script_path) + LOGGER.info( + "Running DeepSeek OCR worker=%s visible_gpu=%s batches=%d: %s", + worker_id, + visible_device, + len(batches), + " ".join(cmd), + ) + proc = _launch_worker_process(cmd, fh=fh, env=env) + return { + "worker_id": worker_id, + "visible_device": int(visible_device), + "proc": proc, + "fh": fh, + "log_path": log_path, + "respawns": int(respawns), + } + + active_workers: List[Dict[str, Any]] = [] + worker_index = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + worker_id = f"worker_{worker_index:02d}_gpu{int(visible_device)}" + 
active_workers.append(_start_worker(worker_id=worker_id, visible_device=int(visible_device), respawns=0)) + worker_index += 1 + + failures: List[str] = [] + try: + while active_workers: + time.sleep(0.5) + for worker in list(active_workers): + rc = worker["proc"].poll() + if rc is None: + continue + worker["fh"].close() + active_workers.remove(worker) + if int(rc) == 0: + continue + error_message = f"{worker['worker_id']} rc={int(rc)} log={worker['log_path']}" + LOGGER.warning("DeepSeek OCR worker failed: %s", error_message) + _terminate_worker_process_group(worker) + requeue_worker_batches( + work_db, + worker_id=str(worker["worker_id"]), + error=error_message, + max_attempts=work_max_attempts, + ) + counts = work_queue_counts(work_db) + # Only respawn while there is retryable work left in the + # durable queue; terminally failed items should stop the run. + remaining_work = int(counts.get("pending", 0)) + int(counts.get("running", 0)) + if remaining_work > 0 and int(worker["respawns"]) < respawn_cap: + active_workers.append( + _start_worker( + worker_id=str(worker["worker_id"]), + visible_device=int(worker["visible_device"]), + respawns=int(worker["respawns"]) + 1, + ) + ) + continue + failures.append(error_message) + counts = work_queue_counts(work_db) + if int(counts.get(STATUS_FAILED, 0)) > 0 or int(counts.get(STATUS_DONE, 0)) < int(counts.get("total", 0)): + failures.append(f"incomplete_work queue_counts={counts}") + finally: + for worker in list(active_workers): + _terminate_worker_process_group(worker) + try: + worker["proc"].wait(timeout=5) + except Exception: + pass + worker["fh"].close() + telemetry_stop.set() + telemetry_thread.join(timeout=max(1.0, DEFAULT_TELEMETRY_INTERVAL_SEC)) + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "final_sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + fh.write(json.dumps({"kind": "xid_faults", **_collect_xid_faults(start_utc_iso=xid_start)}) + "\n") + 
_write_runtime_summary(runtime_dir=runtime_dir, db_path=work_db) + + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + return + lanes = _plan_lane_batches( file_list=file_list, input_root=input_root, @@ -561,7 +1280,7 @@ def _run_multi_cli( disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, ) - env = _build_env(python_bin=python_exe, visible_device=visible_device) + env = _build_env(python_bin=python_exe, visible_device=visible_device, script=script_path) LOGGER.info( "Running DeepSeek OCR lane=%s visible_gpu=%s pages=%s planned_batches=%s files=%d ranges=%d: %s", lane_id, @@ -760,6 +1479,8 @@ def run_for_files( repair_mode=repair_mode, ) + _ensure_canonical_outputs(out_root=out_root, pdf_root=pdf_root, file_list=file_list) + results: Dict[str, Any] = {} for name in file_list: pdf_path = (pdf_root / name).resolve() diff --git a/src/glossapi/ocr/deepseek/work_queue.py b/src/glossapi/ocr/deepseek/work_queue.py new file mode 100644 index 0000000..9cf8d0b --- /dev/null +++ b/src/glossapi/ocr/deepseek/work_queue.py @@ -0,0 +1,380 @@ +"""Durable batch queue helpers for multi-GPU DeepSeek OCR runs.""" + +from __future__ import annotations + +import json +import sqlite3 +import time +from pathlib import Path +from typing import Any, Dict, Iterable, Optional + +STATUS_DONE = "done" +STATUS_FAILED = "failed" +STATUS_PENDING = "pending" +STATUS_RUNNING = "running" +QUEUE_MAIN = "main" +QUEUE_REPAIR = "repair" + + +def _empty_counts() -> Dict[str, int]: + return { + STATUS_PENDING: 0, + STATUS_RUNNING: 0, + STATUS_DONE: 0, + STATUS_FAILED: 0, + "total": 0, + } + + +def _normalize_queue_name(queue_name: str) -> str: + queue_norm = str(queue_name or QUEUE_MAIN).strip().lower() + if queue_norm not in {QUEUE_MAIN, QUEUE_REPAIR}: + raise ValueError(f"Unsupported queue name: {queue_name}") + return queue_norm + + +def _connect(db_path: Path) -> sqlite3.Connection: + db_path = Path(db_path).expanduser().resolve() + 
db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path), timeout=30.0, isolation_level=None) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") + return conn + + +def init_work_db(db_path: Path, *, batches: Iterable[Dict[str, Any]], replace: bool = True) -> None: + db_path = Path(db_path).expanduser().resolve() + if replace and db_path.exists(): + db_path.unlink() + with _connect(db_path) as conn: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS work_items ( + batch_id INTEGER PRIMARY KEY, + queue_name TEXT NOT NULL, + queue_key TEXT NOT NULL UNIQUE, + batch_json TEXT NOT NULL, + pages INTEGER NOT NULL, + status TEXT NOT NULL, + worker_id TEXT, + attempt_count INTEGER NOT NULL DEFAULT 0, + started_at REAL, + finished_at REAL, + last_heartbeat REAL, + last_error TEXT, + result_json TEXT + ); + CREATE INDEX IF NOT EXISTS idx_work_items_status ON work_items(status); + CREATE INDEX IF NOT EXISTS idx_work_items_queue_status ON work_items(queue_name, status); + CREATE INDEX IF NOT EXISTS idx_work_items_worker ON work_items(worker_id); + """ + ) + rows = [ + ( + int(batch["batch_id"]), + QUEUE_MAIN, + str(batch.get("queue_key") or f"{QUEUE_MAIN}:{int(batch['batch_id'])}"), + json.dumps(batch, sort_keys=True), + int(batch.get("pages", 0)), + STATUS_PENDING, + ) + for batch in batches + ] + conn.executemany( + """ + INSERT OR REPLACE INTO work_items(batch_id, queue_name, queue_key, batch_json, pages, status) + VALUES (?, ?, ?, ?, ?, ?) 
+ """, + rows, + ) + + +def enqueue_batches( + db_path: Path, + *, + queue_name: str, + batches: Iterable[Dict[str, Any]], +) -> list[int]: + queue_norm = _normalize_queue_name(queue_name) + inserted_ids: list[int] = [] + with _connect(db_path) as conn: + _with_transaction(conn) + next_batch_id = int( + conn.execute("SELECT COALESCE(MAX(batch_id), -1) + 1 AS next_id FROM work_items").fetchone()["next_id"] + ) + for batch in batches: + payload = dict(batch) + queue_key = str(payload.get("queue_key") or f"{queue_norm}:{next_batch_id}") + row = conn.execute( + "SELECT batch_id FROM work_items WHERE queue_key = ?", + (queue_key,), + ).fetchone() + if row is None: + batch_id = int(payload.get("batch_id", next_batch_id)) + next_batch_id = max(next_batch_id, batch_id + 1) + else: + batch_id = int(row["batch_id"]) + payload["batch_id"] = batch_id + payload["queue_name"] = queue_norm + payload_json = json.dumps(payload, sort_keys=True) + pages = int(payload.get("pages", 0)) + if row is None: + conn.execute( + """ + INSERT INTO work_items(batch_id, queue_name, queue_key, batch_json, pages, status) + VALUES (?, ?, ?, ?, ?, ?) + """, + (batch_id, queue_norm, queue_key, payload_json, pages, STATUS_PENDING), + ) + else: + conn.execute( + """ + UPDATE work_items + SET queue_name = ?, batch_json = ?, pages = ?, status = ?, worker_id = NULL, attempt_count = 0, + started_at = NULL, finished_at = NULL, last_heartbeat = NULL, last_error = NULL, result_json = NULL + WHERE batch_id = ? 
+ """, + (queue_norm, payload_json, pages, STATUS_PENDING, batch_id), + ) + inserted_ids.append(batch_id) + conn.commit() + return inserted_ids + + +def _with_transaction(conn: sqlite3.Connection) -> None: + conn.execute("BEGIN IMMEDIATE") + + +def requeue_stale_running_batches( + db_path: Path, + *, + stale_after_sec: float, + now_ts: Optional[float] = None, +) -> int: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + cutoff = now_value - float(max(1.0, stale_after_sec)) + with _connect(db_path) as conn: + _with_transaction(conn) + cursor = conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = NULL, started_at = NULL, finished_at = NULL + WHERE status = ? AND COALESCE(last_heartbeat, started_at, 0) < ? + """, + (STATUS_PENDING, STATUS_RUNNING, cutoff), + ) + conn.commit() + return int(cursor.rowcount or 0) + + +def requeue_worker_batches( + db_path: Path, + *, + worker_id: str, + error: Optional[str] = None, + max_attempts: int = 2, +) -> int: + max_attempts_value = max(1, int(max_attempts)) + with _connect(db_path) as conn: + _with_transaction(conn) + # `attempt_count` is incremented on claim. With the default max_attempts=2 + # each work item gets one retry after its first failed claim, then becomes + # terminally failed instead of bouncing forever between workers. + cursor = conn.execute( + """ + UPDATE work_items + SET status = CASE WHEN attempt_count < ? THEN ? ELSE ? END, + worker_id = CASE WHEN attempt_count < ? THEN NULL ELSE ? END, + started_at = NULL, + finished_at = NULL, + last_heartbeat = NULL, + last_error = ?, + result_json = NULL + WHERE status = ? AND worker_id = ? 
+ """, + ( + max_attempts_value, + STATUS_PENDING, + STATUS_FAILED, + max_attempts_value, + str(worker_id), + str(error) if error else None, + STATUS_RUNNING, + str(worker_id), + ), + ) + conn.commit() + return int(cursor.rowcount or 0) + + +def claim_next_batch( + db_path: Path, + *, + worker_id: str, + stale_after_sec: float, + queue_name: str = QUEUE_MAIN, + now_ts: Optional[float] = None, +) -> Optional[Dict[str, Any]]: + queue_norm = _normalize_queue_name(queue_name) + now_value = float(now_ts) if now_ts is not None else float(time.time()) + cutoff = now_value - float(max(1.0, stale_after_sec)) + with _connect(db_path) as conn: + _with_transaction(conn) + conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = NULL, started_at = NULL, finished_at = NULL + WHERE status = ? AND COALESCE(last_heartbeat, started_at, 0) < ? + """, + (STATUS_PENDING, STATUS_RUNNING, cutoff), + ) + row = conn.execute( + """ + SELECT batch_id, batch_json + FROM work_items + WHERE status = ? AND queue_name = ? + ORDER BY batch_id ASC + LIMIT 1 + """, + (STATUS_PENDING, queue_norm), + ).fetchone() + if row is None: + conn.commit() + return None + conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = ?, attempt_count = attempt_count + 1, started_at = ?, last_heartbeat = ?, last_error = NULL + WHERE batch_id = ? + """, + (STATUS_RUNNING, str(worker_id), now_value, now_value, int(row["batch_id"])), + ) + conn.commit() + return json.loads(str(row["batch_json"])) + + +def heartbeat_batch(db_path: Path, *, batch_id: int, worker_id: str, now_ts: Optional[float] = None) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET last_heartbeat = ? + WHERE batch_id = ? AND status = ? AND worker_id = ? 
+ """, + (now_value, int(batch_id), STATUS_RUNNING, str(worker_id)), + ) + + +def mark_batch_done( + db_path: Path, + *, + batch_id: int, + worker_id: str, + result: Optional[Dict[str, Any]] = None, + now_ts: Optional[float] = None, +) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET status = ?, finished_at = ?, last_heartbeat = ?, result_json = ? + WHERE batch_id = ? AND worker_id = ? + """, + ( + STATUS_DONE, + now_value, + now_value, + json.dumps(result, sort_keys=True) if result is not None else None, + int(batch_id), + str(worker_id), + ), + ) + + +def mark_batch_failed( + db_path: Path, + *, + batch_id: int, + worker_id: str, + error: str, + max_attempts: int = 2, + now_ts: Optional[float] = None, +) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + max_attempts_value = max(1, int(max_attempts)) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET status = CASE WHEN attempt_count < ? THEN ? ELSE ? END, + worker_id = CASE WHEN attempt_count < ? THEN NULL ELSE ? END, + started_at = NULL, + finished_at = ?, + last_heartbeat = ?, + last_error = ?, + result_json = NULL + WHERE batch_id = ? AND worker_id = ? 
+ """, + ( + max_attempts_value, + STATUS_PENDING, + STATUS_FAILED, + max_attempts_value, + str(worker_id), + now_value, + now_value, + str(error), + int(batch_id), + str(worker_id), + ), + ) + + +def work_queue_counts(db_path: Path) -> Dict[str, int]: + counts = _empty_counts() + counts["by_queue"] = { + QUEUE_MAIN: _empty_counts(), + QUEUE_REPAIR: _empty_counts(), + } + with _connect(db_path) as conn: + for row in conn.execute("SELECT queue_name, status, COUNT(*) AS count FROM work_items GROUP BY queue_name, status"): + queue_name = _normalize_queue_name(str(row["queue_name"])) + status = str(row["status"]) + count = int(row["count"]) + counts[status] = int(counts.get(status, 0)) + count + counts["total"] += count + counts["by_queue"][queue_name][status] = count + counts["by_queue"][queue_name]["total"] += count + return counts + + +def iter_work_items(db_path: Path) -> Iterable[Dict[str, Any]]: + with _connect(db_path) as conn: + for row in conn.execute( + """ + SELECT batch_id, queue_name, queue_key, batch_json, pages, status, worker_id, attempt_count, started_at, + finished_at, last_heartbeat, last_error, result_json + FROM work_items + ORDER BY batch_id ASC + """ + ): + item = json.loads(str(row["batch_json"])) + item.update( + { + "queue_name": str(row["queue_name"]), + "queue_key": str(row["queue_key"]), + "status": str(row["status"]), + "worker_id": row["worker_id"], + "attempt_count": int(row["attempt_count"]), + "started_at": row["started_at"], + "finished_at": row["finished_at"], + "last_heartbeat": row["last_heartbeat"], + "last_error": row["last_error"], + "result": json.loads(str(row["result_json"])) if row["result_json"] else None, + "pages": int(row["pages"]), + } + ) + yield item diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py index 83a8a8b..4ffb064 100644 --- a/src/glossapi/scripts/deepseek_pipeline_benchmark.py +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ 
-150,6 +150,16 @@ def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: return totals +def _collect_runtime_summary(run_dir: Path) -> Dict[str, Any]: + summary_path = run_dir / "sidecars" / "ocr_runtime" / "runtime_summary.json" + if not summary_path.exists(): + return {} + try: + return json.loads(summary_path.read_text(encoding="utf-8")) + except Exception: + return {} + + def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: files: List[str] = [] page_ranges: List[str] = [] @@ -354,6 +364,7 @@ def start_lane(lane: Dict[str, Any]) -> Dict[str, Any]: ) repair_metrics = _collect_repair_metrics(run_dir) + runtime_summary = _collect_runtime_summary(run_dir) summary = { "label": str(args.label), "status": "pass" if not failures else "fail", @@ -377,6 +388,8 @@ def start_lane(lane: Dict[str, Any]) -> Dict[str, Any]: "lane_results": lane_results, "gpu_results": gpu_results, "repair_metrics": repair_metrics, + "runtime_summary": runtime_summary, + "steady_state": dict(runtime_summary.get("steady_state") or {}), "failures": failures, } (run_dir / "pipeline_benchmark_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py new file mode 100644 index 0000000..06d7b58 --- /dev/null +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -0,0 +1,407 @@ +import json +from pathlib import Path +from types import SimpleNamespace + + +def test_work_queue_requeues_stale_running_batch(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 0, + "pages": 12, + "files": ["a.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-a", + stale_after_sec=30.0, + now_ts=100.0, + ) + + assert claimed["batch_id"] == 0 + assert work_queue.work_queue_counts(db_path)["running"] == 1 + + 
requeued = work_queue.requeue_stale_running_batches( + db_path, + stale_after_sec=30.0, + now_ts=200.0, + ) + + assert requeued == 1 + assert work_queue.work_queue_counts(db_path)["pending"] == 1 + + +def test_work_queue_mark_done_persists_result(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 1, + "pages": 8, + "files": [], + "page_ranges": ["b.pdf:1:8"], + "items": [], + } + ], + ) + + work_queue.claim_next_batch( + db_path, + worker_id="worker-b", + stale_after_sec=60.0, + now_ts=50.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=1, + worker_id="worker-b", + result={"pages": 8, "first_infer_started_at": "2026-04-02T10:00:00Z"}, + now_ts=75.0, + ) + + items = list(work_queue.iter_work_items(db_path)) + + assert items[0]["status"] == work_queue.STATUS_DONE + assert items[0]["result"]["pages"] == 8 + assert work_queue.work_queue_counts(db_path)["done"] == 1 + + +def test_work_queue_repair_enqueue_reuses_queue_key(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db(db_path, batches=[]) + + inserted = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:5:doc", + "stem": "doc", + "repair_page_numbers": [2, 5], + "pages": 2, + } + ], + ) + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-r", + stale_after_sec=60.0, + queue_name=work_queue.QUEUE_REPAIR, + now_ts=10.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=claimed["batch_id"], + worker_id="worker-r", + result={"pages": 2}, + now_ts=12.0, + ) + + inserted_again = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:5:doc", + "stem": "doc", + "repair_page_numbers": [2], + "pages": 1, + } + ], + ) + repair_item = [ + item + for item in 
work_queue.iter_work_items(db_path) + if item["queue_name"] == work_queue.QUEUE_REPAIR + ][0] + + assert inserted_again == inserted + assert repair_item["batch_id"] == inserted[0] + assert repair_item["status"] == work_queue.STATUS_PENDING + assert repair_item["repair_page_numbers"] == [2] + + +def test_work_queue_marks_batch_failed_after_one_retry(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 2, + "pages": 4, + "files": ["c.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + + first = work_queue.claim_next_batch( + db_path, + worker_id="worker-a", + stale_after_sec=60.0, + now_ts=10.0, + ) + work_queue.mark_batch_failed( + db_path, + batch_id=first["batch_id"], + worker_id="worker-a", + error="first failure", + max_attempts=2, + now_ts=20.0, + ) + + second = work_queue.claim_next_batch( + db_path, + worker_id="worker-b", + stale_after_sec=60.0, + now_ts=30.0, + ) + work_queue.mark_batch_failed( + db_path, + batch_id=second["batch_id"], + worker_id="worker-b", + error="second failure", + max_attempts=2, + now_ts=40.0, + ) + + item = list(work_queue.iter_work_items(db_path))[0] + + assert item["attempt_count"] == 2 + assert item["status"] == work_queue.STATUS_FAILED + assert item["worker_id"] == "worker-b" + assert item["last_error"] == "second failure" + + +def test_claim_next_phase_batch_switches_to_repair_after_main_drains(tmp_path): + from glossapi.ocr.deepseek import run_pdf_ocr_vllm + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 0, + "pages": 8, + "files": ["a.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-main", + stale_after_sec=60.0, + now_ts=10.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=claimed["batch_id"], + worker_id="worker-main", 
+ result={"pages": 8}, + now_ts=20.0, + ) + work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:0:doc", + "stem": "doc", + "repair_page_numbers": [2, 5], + "pages": 2, + } + ], + ) + + queue_name, batch, should_wait = run_pdf_ocr_vllm._claim_next_phase_batch( + db_path, + worker_id="worker-repair", + stale_after_sec=60.0, + ) + + assert queue_name == work_queue.QUEUE_REPAIR + assert batch is not None + assert batch["queue_key"] == "repair:0:doc" + assert should_wait is False + + +def test_runner_runtime_summary_reports_steady_state_windows(tmp_path): + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek import work_queue + + runtime_dir = tmp_path / "runtime" + runtime_dir.mkdir(parents=True, exist_ok=True) + (runtime_dir / "worker_00.runtime.json").write_text( + json.dumps( + { + "worker_id": "worker_00", + "engine_ready_at": "2026-04-02T10:00:10Z", + "first_batch_started_at": "2026-04-02T10:00:20Z", + "last_batch_finished_at": "2026-04-02T10:05:20Z", + } + ), + encoding="utf-8", + ) + (runtime_dir / "worker_01.runtime.json").write_text( + json.dumps( + { + "worker_id": "worker_01", + "engine_ready_at": "2026-04-02T10:00:12Z", + "first_batch_started_at": "2026-04-02T10:00:24Z", + "last_batch_finished_at": "2026-04-02T10:04:20Z", + } + ), + encoding="utf-8", + ) + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + {"batch_id": 0, "pages": 50, "files": ["a.pdf"], "page_ranges": [], "items": []}, + {"batch_id": 1, "pages": 50, "files": ["b.pdf"], "page_ranges": [], "items": []}, + ], + ) + work_queue.claim_next_batch(db_path, worker_id="worker_00", stale_after_sec=60.0, now_ts=1.0) + work_queue.mark_batch_done(db_path, batch_id=0, worker_id="worker_00", now_ts=2.0) + work_queue.claim_next_batch(db_path, worker_id="worker_01", stale_after_sec=60.0, now_ts=3.0) + work_queue.mark_batch_done(db_path, batch_id=1, worker_id="worker_01", now_ts=4.0) + + 
summary_path = runner._write_runtime_summary(runtime_dir=runtime_dir, db_path=db_path) + summary = json.loads(summary_path.read_text(encoding="utf-8")) + + assert summary["queue_counts"]["done"] == 2 + assert summary["steady_state"]["first_batch_started_at"] == "2026-04-02T10:00:20Z" + assert summary["steady_state"]["all_workers_ready_at"] == "2026-04-02T10:00:12Z" + assert summary["steady_state"]["last_batch_finished_at"] == "2026-04-02T10:05:20Z" + assert summary["steady_state"]["first_batch_to_last_batch_window_sec"] == 300.0 + assert summary["steady_state"]["all_workers_ready_to_last_batch_window_sec"] == 308.0 + assert summary["queue_counts"]["by_queue"]["main"]["done"] == 2 + assert summary["queue_counts"]["by_queue"]["repair"]["done"] == 0 + + +def test_runner_preflight_can_ensure_persistence_mode(monkeypatch): + from glossapi.ocr.deepseek import runner + + responses = [ + [{"index": "0", "persistence_mode": "Disabled"}], + [{"index": "0", "persistence_mode": "Enabled"}], + ] + + monkeypatch.setattr(runner, "_query_persistence_mode", lambda *, visible_devices: responses.pop(0)) + + calls = {} + + def fake_run(cmd, check, capture_output, text): + calls["cmd"] = cmd + return SimpleNamespace(returncode=0) + + monkeypatch.setattr(runner.subprocess, "run", fake_run) + + status = runner._ensure_gpu_preflight(visible_devices=[0], mode="ensure") + + assert calls["cmd"] == ["sudo", "-n", "nvidia-smi", "-pm", "1"] + assert status["changed"] is True + assert status["after"] == [{"index": "0", "persistence_mode": "Enabled"}] + + +def test_build_cli_command_includes_work_queue_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=[], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + 
prompt_override=None, + attn_backend="vllm", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=144, + max_new_tokens=2048, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=96, + gpu_memory_utilization=0.9, + disable_fp8_kv=False, + repair_mode="auto", + work_db=tmp_path / "work.sqlite", + worker_id="worker_00_gpu0", + worker_runtime_file=tmp_path / "worker_00.runtime.json", + work_stale_after_sec=900.0, + work_heartbeat_sec=10.0, + work_max_attempts=2, + ) + + assert "--work-db" in cmd + assert str(tmp_path / "work.sqlite") in cmd + assert "--worker-id" in cmd and "worker_00_gpu0" in cmd + assert "--worker-runtime-file" in cmd and str(tmp_path / "worker_00.runtime.json") in cmd + assert "--work-stale-after-sec" in cmd and "900.0" in cmd + assert "--work-heartbeat-sec" in cmd and "10.0" in cmd + assert "--work-max-attempts" in cmd and "2" in cmd + + +def test_launch_worker_process_uses_start_new_session(monkeypatch): + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_popen(cmd, stdout, stderr, env, start_new_session): + calls["cmd"] = cmd + calls["start_new_session"] = start_new_session + return SimpleNamespace(pid=1234) + + monkeypatch.setattr(runner.subprocess, "Popen", fake_popen) + + proc = runner._launch_worker_process(["python", "worker.py"], fh=object(), env={"A": "1"}) + + assert calls["cmd"] == ["python", "worker.py"] + assert calls["start_new_session"] is True + assert proc.pid == 1234 + + +def test_terminate_worker_process_group_signals_group(monkeypatch): + from glossapi.ocr.deepseek import runner + + signals = [] + monkeypatch.setattr(runner.os, "killpg", lambda pgid, sig: signals.append((pgid, sig))) + monkeypatch.setattr(runner, "_wait_for_process_group_exit", lambda pgid, *, timeout_sec: True) + + ok = runner._terminate_worker_process_group( + { + "worker_id": "worker_00_gpu0", + "proc": SimpleNamespace(pid=4321), + } + ) + + assert ok is True + assert signals == 
[(4321, runner.signal.SIGTERM)] diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 1e39cd5..db30b9a 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -1,5 +1,7 @@ +import json import sys from pathlib import Path +from types import SimpleNamespace import pandas as pd import pytest @@ -63,6 +65,42 @@ def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): assert not progress_markdown.exists() +def test_page_output_helpers_roundtrip_numbered_blank_pages(): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs + + page_outputs = ["page one", "", "page three"] + + markdown = _join_page_outputs(page_outputs) + + assert markdown == ( + "page one\n" + "\n" + "<--- Page Split --->\n" + "\n" + "\n" + "<--- Page Split --->\n" + "page three" + ) + assert _split_page_outputs(markdown) == page_outputs + + +def test_write_outputs_preserves_blank_first_page_structure(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + + output_dir = tmp_path / "output" + markdown = _join_page_outputs(["", "page two"]) + + _write_outputs(output_dir=output_dir, stem="doc", markdown=markdown, page_count=2) + + written = (output_dir / "markdown" / "doc.md").read_text(encoding="utf-8") + assert written == ( + "\n" + "<--- Page Split --->\n" + "page two\n" + ) + assert _split_page_outputs(written) == ["", "page two"] + + def test_auto_attn_backend_prefers_eager_when_flash_attn_is_unavailable(monkeypatch): import builtins @@ -198,6 +236,32 @@ def test_build_cli_command_includes_vllm_flags(tmp_path): assert "--repair-mode" in cmd and "auto" in cmd +def test_build_env_prepends_script_src_to_pythonpath(tmp_path, monkeypatch): + import os + + from glossapi.ocr.deepseek.runner import _build_env + + repo_root = tmp_path / "repo" + script = repo_root / "src" / "glossapi" / "ocr" 
/ "deepseek" / "run_pdf_ocr_vllm.py" + script.parent.mkdir(parents=True, exist_ok=True) + script.write_text("# stub\n", encoding="utf-8") + (repo_root / "src" / "glossapi").mkdir(parents=True, exist_ok=True) + + monkeypatch.setenv("PYTHONPATH", os.pathsep.join(["/tmp/old-a", "/tmp/old-b"])) + env = _build_env( + python_bin=Path("/usr/bin/python3"), + visible_device=1, + script=script, + ) + + assert env["PYTHONPATH"].split(os.pathsep) == [ + str((repo_root / "src").resolve()), + "/tmp/old-a", + "/tmp/old-b", + ] + assert env["CUDA_VISIBLE_DEVICES"] == "1" + + def test_build_cli_command_includes_page_ranges(tmp_path): from glossapi.ocr.deepseek.runner import _build_cli_command @@ -258,6 +322,126 @@ def test_vllm_empty_page_detector_is_conservative(): assert _is_effectively_empty_page(empty_page, "off") is False +def test_repair_disposition_drops_repeat_garbage_cutoff(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _resolve_repair_disposition + + disposition = _resolve_repair_disposition( + repair_text="garbage", + repair_postprocess={"early_stops": 1}, + ) + + assert disposition == { + "final_text": "", + "repair_applied": False, + "page_dropped_after_repair": True, + "drop_reason": "repeat_garbage_cutoff", + } + + +def test_repair_batch_updates_persisted_outputs_with_repeat_cutoff_drop(tmp_path, monkeypatch): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _run_repair_batch_to_outputs + + output_dir = tmp_path / "output" + _write_outputs( + output_dir=output_dir, + stem="doc", + markdown=_join_page_outputs(["bad first page", "page two"]), + page_count=2, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 1.0, + "raw_chars": 20, + "final_chars": 14, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + 
"repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + }, + { + "page_number": 2, + "infer_sec": 0.5, + "raw_chars": 8, + "final_chars": 8, + "repair_strategy": "none", + "repair_reason": None, + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + }, + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._iter_selected_rendered_pages", + lambda pdf_path, *, render_dpi, source_page_numbers: [(1, Image.new("RGB", (4, 4), "white"))], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._generate_batch_outputs", + lambda llm, *, jobs, prompt, batch_size, sampling_params: [ + {"item": jobs[0], "raw_text": "still broken", "infer_sec": 0.25} + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._postprocess_page_text", + lambda text, *, prompt, content_debug: ("garbage", {"early_stops": 1}), + ) + + result = _run_repair_batch_to_outputs( + SimpleNamespace(render_dpi=144, batch_size=8, content_debug=False, repair_mode="auto"), + batch={ + "stem": "doc", + "pdf_path": str(tmp_path / "doc.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + }, + output_dir=output_dir, + llm=object(), + plain_prompt="plain prompt", + sampling_params=object(), + ) + + markdown = (output_dir / "markdown" / "doc.md").read_text(encoding="utf-8") + metrics = json.loads((output_dir / "json" / "metrics" / "doc.metrics.json").read_text(encoding="utf-8")) + + assert result["pages"] == 1 + assert _split_page_outputs(markdown) == ["", "page two"] + assert metrics["repair_summary"]["pages_dropped_after_repeat_cutoff"] == 1 + + +def test_vllm_progress_sidecar_keeps_absolute_page_numbers(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import 
_emit_progress + + state = { + "page_outputs": ["", "page two"], + "total_pages": 2, + "completed_pages": 2, + } + + _emit_progress(tmp_path / "output", "doc", state) + + partial_markdown = (tmp_path / "output" / "sidecars" / "ocr_progress" / "doc.partial.md").read_text( + encoding="utf-8" + ) + assert partial_markdown == ( + "\n" + "<--- Page Split --->\n" + "page two\n" + ) + + def test_early_stop_detects_symbol_and_numeric_list_garbage(): from glossapi.ocr.utils.cleaning import detect_early_stop_index @@ -340,3 +524,140 @@ def fake_run_multi_cli(**kwargs): assert calls["shard_pages"] == 64 assert calls["shard_threshold_pages"] == 256 assert result["doc"]["page_count"] == 1 + + + +def test_runner_reassembles_exact_fill_shards_into_canonical_outputs(tmp_path, monkeypatch): + import json + + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _write_outputs + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + def fake_run_multi_cli(*, out_root, **kwargs): + del kwargs + common_metrics = { + "source_file": "doc.pdf", + "source_stem": "doc", + "ocr_profile": "markdown_grounded", + "attn_backend": "vllm", + "runtime_backend": "vllm", + "batch_size": 96, + "repair_mode": "auto", + } + _write_outputs( + output_dir=out_root, + stem="doc__p00001-00002", + markdown=_join_page_outputs(["page one", "page two"]), + page_count=2, + extra_metrics={ + **common_metrics, + "source_start_page": 1, + "source_end_page": 2, + "render_sec": 1.5, + "infer_sec_total": 2.5, + "wall_time_sec": 3.5, + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 1}, + "page_metrics": [ + {"page_number": 1, "infer_sec": 1.0, "repair_strategy": "none", "repair_applied": False}, + {"page_number": 2, "infer_sec": 1.5, "repair_strategy": "plain", 
"repair_applied": True}, + ], + }, + ) + _write_outputs( + output_dir=out_root, + stem="doc__p00003-00004", + markdown=_join_page_outputs(["page three", "page four"]), + page_count=2, + extra_metrics={ + **common_metrics, + "source_start_page": 3, + "source_end_page": 4, + "render_sec": 0.5, + "infer_sec_total": 1.5, + "wall_time_sec": 2.0, + "repair_summary": {"repair_mode": "auto", "pages_flagged": 0, "pages_repaired": 0}, + "page_metrics": [ + {"page_number": 1, "infer_sec": 0.7, "repair_strategy": "none", "repair_applied": False}, + {"page_number": 2, "infer_sec": 0.8, "repair_strategy": "none", "repair_applied": False}, + ], + }, + ) + + monkeypatch.setattr(runner, "_run_multi_cli", fake_run_multi_cli) + monkeypatch.setattr(runner, "_page_count", lambda path: 4) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_vllm.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files( + corpus, + ["doc.pdf"], + use_gpus="multi", + devices=[0, 1], + runtime_backend="vllm", + scheduler="exact_fill", + target_batch_pages=2, + ) + + canonical_md = corpus.output_dir / "markdown" / "doc.md" + canonical_metrics = corpus.output_dir / "json" / "metrics" / "doc.metrics.json" + assert canonical_md.exists() + assert canonical_metrics.exists() + assert canonical_md.read_text(encoding="utf-8") == _join_page_outputs( + ["page one", "page two", "page three", "page four"] + ) + "\n" + + metrics = json.loads(canonical_metrics.read_text(encoding="utf-8")) + assert metrics["reassembled_from_shards"] is True + assert metrics["reassembled_shard_count"] == 2 + assert [item["page_number"] for item in metrics["page_metrics"]] == [1, 2, 3, 4] + assert metrics["repair_summary"]["pages_flagged"] == 1 + assert metrics["repair_summary"]["pages_repaired"] == 1 + assert result["doc"]["page_count"] == 4 + + assert not 
(corpus.output_dir / "markdown" / "doc__p00001-00002.md").exists() + assert (corpus.output_dir / "sidecars" / "ocr_shards" / "markdown" / "doc__p00001-00002.md").exists() + assert (corpus.output_dir / "sidecars" / "ocr_shards" / "json" / "metrics" / "doc__p00003-00004.metrics.json").exists() + + +def test_vllm_batch_outputs_accept_in_memory_images_without_disk_roundtrip(): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _generate_batch_outputs + + class FakeOutput: + def __init__(self, text): + self.outputs = [type("TokenOutput", (), {"text": text})()] + + class FakeLLM: + def generate(self, prompt_batch, sampling_params=None): + del sampling_params + assert len(prompt_batch) == 2 + assert all(item["multi_modal_data"]["image"].mode == "RGB" for item in prompt_batch) + return [FakeOutput("alpha"), FakeOutput("beta")] + + jobs = [ + {"stem": "doc", "page_number": 1, "image": Image.new("RGB", (4, 4), color="white")}, + {"stem": "doc", "page_number": 2, "image": Image.new("RGB", (4, 4), color="black")}, + ] + outputs = _generate_batch_outputs( + FakeLLM(), + jobs=jobs, + prompt="prompt", + batch_size=2, + sampling_params=object(), + ) + + assert [item["raw_text"] for item in outputs] == ["alpha", "beta"] + assert jobs[0]["image"].size == (4, 4) + assert jobs[1]["image"].size == (4, 4) + for item in jobs: + item["image"].close() From f584ab18e6aff68126103f5ebbd0ab17a1fbd058 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 02:58:02 +0300 Subject: [PATCH 50/93] Pack repair batches and preserve OCR text continuity --- docs/api/corpus.md | 1 + .../artifact_layout_and_stage_handoffs.md | 6 + docs/multi_gpu.md | 2 + .../openarchives_ocr_rollout_plan.md | 1 + docs/stages/ocr.md | 3 + .../ocr/deepseek/run_pdf_ocr_transformers.py | 126 ++++++- src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py | 309 +++++++++++++----- src/glossapi/ocr/deepseek/runner.py | 22 ++ .../scripts/openarchives_ocr_merge.py | 79 ++++- 
tests/test_deepseek_multi_gpu_runtime.py | 43 +++ tests/test_deepseek_runner_contract.py | 129 ++++++++ tests/test_openarchives_ocr_shards.py | 55 ++++ 12 files changed, 667 insertions(+), 109 deletions(-) diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 8b740d6..a0d6d10 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -126,6 +126,7 @@ ocr( - Main outputs: - refreshed `markdown/.md` - refreshed cleaner/parquet metadata after OCR reruns + - when metadata parquet is present, a canonical OCR parquet should preserve the same row identity and carry corrected `text` together with the updated metadata - `json/.latex_map.jsonl` when enrichment runs ## formula_enrich_from_json() diff --git a/docs/architecture/artifact_layout_and_stage_handoffs.md b/docs/architecture/artifact_layout_and_stage_handoffs.md index 27cfef0..53cbcec 100644 --- a/docs/architecture/artifact_layout_and_stage_handoffs.md +++ b/docs/architecture/artifact_layout_and_stage_handoffs.md @@ -97,6 +97,7 @@ For DeepSeek OCR, there is an important distinction between execution-time shard - Multi-GPU `exact_fill` may execute shards such as `doc__p00001-00096` internally to keep GPU lanes full. - Those shard names are operational artifacts, not the downstream contract for OCR outputs. - After worker completion, the runner reassembles canonical `markdown/.md` and `json/metrics/.metrics.json` files for each source PDF. +- If OCR started from canonical corpus metadata, the authoritative OCR handoff should also include a canonical parquet where corrected `text` is embedded back into the same document rows. Detached markdown alone is not the full stage handoff in that case. - Canonical OCR markdown page boundaries are annotated with `` comments next to the page-split marker, and the parser remains backward-compatible with legacy unnumbered separators. - Original shard markdown and shard metrics are moved under `sidecars/ocr_shards/` for debugging and audit trails. 
- If a repair retry trips the garbage cutoff again, the canonical markdown keeps the page slot but blanks the page content rather than preserving the bad first-pass OCR. @@ -114,6 +115,11 @@ The runtime queue now has two phases inside the same operational state: - first-pass shard batches - repair shard batches published after first pass completes +Repair queue durability and repair execution batching are intentionally separate concerns: + +- the durable queue records individual repair work items so retries, failure accounting, and resume logic stay precise +- workers may pack multiple pending repair items into one larger execution batch to keep GPUs busy during the repair tail + These runtime artifacts are operational state, not downstream stage inputs. They are intended for monitoring, debugging, and safe resumption logic. Downstream stages should therefore consume canonical OCR outputs, not shard artifacts. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 598fb1f..b67f238 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -48,6 +48,7 @@ c.ocr( - `scheduler="exact_fill"` is the preferred multi-GPU vLLM scheduler when PDFs vary widely in length. It shards large documents into page ranges and keeps GPU lanes filled more evenly. - Internal shard runs now preserve the public `Corpus.ocr()` contract. Canonical outputs are reassembled back into `markdown/.md` and `json/metrics/.metrics.json` for each source PDF. +- When OCR starts from canonical corpus rows, the preferred stage handoff is also a canonical parquet where corrected `text` is embedded back into the same row identity. Markdown and metrics remain sidecars for inspection and audit. - Shard markdown and shard metrics are retained for debugging under `sidecars/ocr_shards/` instead of remaining in the canonical handoff directories. 
- The vLLM path now renders pages into memory and feeds a bounded queue directly into inference, which removes the temporary PNG round-trip and overlaps rendering with generation. - Empty-page detection still happens before inference, and repair retries reuse the in-memory page image instead of reopening a file from disk. @@ -55,6 +56,7 @@ c.ocr( - If a repair retry hits the garbage cutoff again, the page is blanked rather than keeping the failed first-pass garbage. - Multi-GPU vLLM workers now pull from a durable shared batch queue in `sidecars/ocr_runtime/work_queue.sqlite`, so finished batches survive worker crashes and respawned workers can continue without rescanning completed work. - Repair work now runs as a second global queue phase. First-pass batches finish and persist shard outputs first; then any worker can claim the queued repair shards. This keeps repair tails balanced across GPUs without mixing worker-local repair state into the controller. +- Workers may pack multiple pending repair items into one larger execution batch. Queue durability stays item-granular, but the runtime no longer has to execute the repair tail as one tiny origin-shard retry at a time. - Each worker writes `sidecars/ocr_runtime/worker_*.runtime.json` with heartbeat state and steady-state timing markers. The runner also emits `gpu_preflight.json`, `gpu_telemetry.jsonl`, and `runtime_summary.json`. - The runner checks GPU persistence mode before launch by default. Control it with `GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT=off|warn|ensure`. The default is `ensure`, which will try `sudo -n nvidia-smi -pm 1` and record the result in `gpu_preflight.json`. - Worker reliability knobs are environment-driven: `GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP`, `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS`, `GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC`, `GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC`, and `GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC`. 
diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 590d18b..848fb72 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -19,6 +19,7 @@ The rollout is backed by concrete scripts in `src/glossapi/scripts/`: - writes a JSON summary with page totals and ETA - `openarchives_ocr_merge.py` - merges shard-level OCR metadata back into the canonical parquet by `filename` + - can also embed merged OCR `text` plus artifact linkage fields back into the canonical rows when OCR markdown artifacts are available These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 179b211..83f260b 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -22,6 +22,7 @@ The OCR stage repairs documents whose extracted text is considered unreliable, a - corrected Markdown or OCR-enriched outputs - backend-specific JSON or related artifacts - metadata updates such as OCR success markers +- when metadata parquet is available, a canonical OCR parquet should preserve the same row identity and carry corrected `text` together with the updated metadata ## Backend choices @@ -46,11 +47,13 @@ OCR reruns should preserve: - `ocr()` may execute page-range shards internally when `use_gpus="multi"` and `scheduler="exact_fill"`, but the stage contract remains one canonical Markdown file and one canonical metrics file per source PDF. - When shard execution is used, the runner reassembles `markdown/.md` and `json/metrics/.metrics.json` after the CLI workers finish. - Execution-time shard artifacts are moved under `sidecars/ocr_shards/` so downstream stages do not mistake them for canonical stage outputs. 
+- When OCR starts from canonical corpus rows, the authoritative stage handoff should preserve that metadata continuity instead of reducing the result to detached markdown files. Corrected `text` belongs in the canonical parquet row; markdown and metrics stay as sidecars. - The vLLM runtime now streams rendered pages through an in-memory queue, overlaps rendering with inference, skips empty pages before inference, and reuses the same in-memory image for repair retries. - Canonical OCR markdown now annotates page boundaries with `` comments alongside each page-split marker so downstream inspection can line up page images and markdown more easily. - In `repair_mode="auto"`, a page that trips the garbage cutoff again during the plain-OCR repair pass is now blanked instead of keeping the original garbage text. - Multi-GPU vLLM runs now execute through a durable shared batch queue rather than one fragile subprocess per preassigned lane. Workers claim first-pass batches dynamically, heartbeat while a batch is active, and can be respawned without losing finished batch outputs. - Repair retries are now durable too. Flagged pages are published back into the same runtime database as a second global repair queue, and any GPU worker can drain those repair shards after the first-pass queue is complete. +- Repair queue durability and repair execution packing are intentionally separate. The queue tracks individual retry items for precise resume/failure accounting, while workers can combine multiple repair items into one larger execution batch to keep the repair tail GPU-efficient. - By default each durable batch gets at most two total attempts, so one retry is allowed after the first failure and then the batch is marked failed for operator follow-up. 
- Operational sidecars for these runs live under `sidecars/ocr_runtime/`, including the durable work queue state, per-worker runtime JSON, GPU telemetry samples, GPU preflight output, and a final runtime summary with steady-state inference timestamps. diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py index 213fdcf..9b318e1 100644 --- a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -10,7 +10,7 @@ import tempfile import time from pathlib import Path -from typing import Iterable, List +from typing import Iterable, Iterator, List from PIL import Image @@ -29,6 +29,7 @@ PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." PAGE_SPLIT = "\n<--- Page Split --->\n" +PAGE_SPLIT_RE = re.compile(r"(?:^|\n)(?:\n)?<--- Page Split --->\n?") DEFAULT_MAX_NEW_TOKENS = 2048 @@ -126,36 +127,96 @@ def _iter_pdf_jobs(input_dir: Path, files: List[str], page_ranges: List[str]) -> ] -def _render_pages( +def _resolve_render_window( + *, + doc_page_count: int, + max_pages: int | None, + start_page: int = 1, + end_page: int | None = None, +) -> tuple[int, int] | None: + first_idx = max(0, int(start_page) - 1) + last_idx = int(doc_page_count) - 1 if end_page is None else min(int(doc_page_count) - 1, int(end_page) - 1) + if max_pages is not None: + last_idx = min(last_idx, first_idx + int(max_pages) - 1) + if last_idx < first_idx: + return None + return first_idx, last_idx + + +def _count_rendered_pages( + pdf_path: Path, + max_pages: int | None, + *, + start_page: int = 1, + end_page: int | None = None, +) -> int: + import fitz + + doc = fitz.open(pdf_path) + try: + window = _resolve_render_window( + doc_page_count=int(doc.page_count), + max_pages=max_pages, + start_page=start_page, + end_page=end_page, + ) + if window is None: + return 0 + 
first_idx, last_idx = window + return max(0, int(last_idx) - int(first_idx) + 1) + finally: + doc.close() + + +def _iter_rendered_pages( pdf_path: Path, max_pages: int | None, render_dpi: int, *, start_page: int = 1, end_page: int | None = None, -) -> List[Image.Image]: +) -> Iterator[Image.Image]: import fitz - images: List[Image.Image] = [] doc = fitz.open(pdf_path) try: - doc_page_count = int(doc.page_count) - first_idx = max(0, int(start_page) - 1) - last_idx = doc_page_count - 1 if end_page is None else min(doc_page_count - 1, int(end_page) - 1) - if max_pages is not None: - last_idx = min(last_idx, first_idx + int(max_pages) - 1) - if last_idx < first_idx: - return images + window = _resolve_render_window( + doc_page_count=int(doc.page_count), + max_pages=max_pages, + start_page=start_page, + end_page=end_page, + ) + if window is None: + return + first_idx, last_idx = window zoom = float(render_dpi) / 72.0 matrix = fitz.Matrix(zoom, zoom) for idx in range(first_idx, last_idx + 1): page = doc[idx] pixmap = page.get_pixmap(matrix=matrix, alpha=False) img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) - images.append(img) + yield img finally: doc.close() - return images + + +def _render_pages( + pdf_path: Path, + max_pages: int | None, + render_dpi: int, + *, + start_page: int = 1, + end_page: int | None = None, +) -> List[Image.Image]: + return list( + _iter_rendered_pages( + pdf_path, + max_pages, + render_dpi, + start_page=start_page, + end_page=end_page, + ) + ) def _clean_markdown(text: str) -> str: @@ -170,6 +231,37 @@ def _clean_markdown(text: str) -> str: return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() +def _page_split_comment(page_number: int) -> str: + return f"\n\n<--- Page Split --->\n" + + +def _join_page_outputs(page_outputs: List[str]) -> str: + if not page_outputs: + return "" + first_page = str(page_outputs[0]) + parts = [first_page] + emitted = bool(first_page) + for page_number, page_text in 
enumerate(page_outputs[1:], start=2): + separator = _page_split_comment(page_number) + if not emitted: + separator = separator.lstrip("\n") + parts.append(separator) + emitted = True + parts.append(str(page_text)) + return "".join(parts) + + +def _split_page_outputs(markdown_text: str) -> List[str]: + content = str(markdown_text or "").rstrip("\n") + if not content: + return [] + return PAGE_SPLIT_RE.split(content) + + +def _serialize_markdown(markdown: str) -> str: + return str(markdown or "").rstrip("\n") + "\n" + + def _postprocess_page_text( text: str, *, @@ -342,7 +434,7 @@ def _write_outputs( md_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True) progress_dir.mkdir(parents=True, exist_ok=True) - (md_dir / f"{stem}.md").write_text(markdown.strip() + "\n", encoding="utf-8") + (md_dir / f"{stem}.md").write_text(_serialize_markdown(markdown), encoding="utf-8") metrics = { "page_count": page_count, "model": "deepseek-ai/DeepSeek-OCR-2", @@ -368,9 +460,9 @@ def _write_progress( progress_dir = output_dir / "sidecars" / "ocr_progress" metrics_dir.mkdir(parents=True, exist_ok=True) progress_dir.mkdir(parents=True, exist_ok=True) - partial_markdown = PAGE_SPLIT.join(page_outputs).strip() + partial_markdown = _join_page_outputs(page_outputs) if partial_markdown: - (progress_dir / f"{stem}.partial.md").write_text(partial_markdown + "\n", encoding="utf-8") + (progress_dir / f"{stem}.partial.md").write_text(_serialize_markdown(partial_markdown), encoding="utf-8") progress = { "completed_pages": completed_pages, "total_pages": total_pages, @@ -465,7 +557,7 @@ def main() -> int: total_pages, idx + 1, ) - markdown = PAGE_SPLIT.join(page_outputs) if page_outputs else "[[Blank page]]" + markdown = _join_page_outputs(page_outputs) if page_outputs else "[[Blank page]]" _write_outputs( output_dir, stem, diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py index 2f76b67..edc838b 100644 --- 
a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -50,6 +50,8 @@ EMPTY_PAGE_BAND_DARK_MAX = 0.0025 GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS = 48 GARBAGE_EARLY_STOP_WINDOW_TOKENS = 160 +DEFAULT_REPAIR_EXEC_BATCH_TARGET_PAGES = 48 +DEFAULT_REPAIR_EXEC_BATCH_TARGET_ITEMS = 32 def _parse_args() -> argparse.Namespace: @@ -84,6 +86,8 @@ def _parse_args() -> argparse.Namespace: parser.add_argument("--work-stale-after-sec", type=float, default=900.0) parser.add_argument("--work-heartbeat-sec", type=float, default=10.0) parser.add_argument("--work-max-attempts", type=int, default=2) + parser.add_argument("--repair-exec-batch-target-pages", type=int, default=DEFAULT_REPAIR_EXEC_BATCH_TARGET_PAGES) + parser.add_argument("--repair-exec-batch-target-items", type=int, default=DEFAULT_REPAIR_EXEC_BATCH_TARGET_ITEMS) return parser.parse_args() @@ -476,6 +480,64 @@ def _build_repair_batches(*, doc_states: Dict[str, dict], retry_pages_by_stem: D return batches +def _claim_additional_repair_batches( + work_db: Path, + *, + worker_id: str, + stale_after_sec: float, + first_batch: dict, + target_pages: int, + target_items: int, +) -> List[dict]: + claimed_batches = [dict(first_batch)] + first_batch_pages = max(0, int(first_batch.get("pages", len(list(first_batch.get("repair_page_numbers") or []))))) + claimed_pages = first_batch_pages + target_pages = max(1, int(target_pages)) + target_items = max(1, int(target_items)) + if "batch_id" in first_batch: + heartbeat_batch(work_db, batch_id=int(first_batch["batch_id"]), worker_id=worker_id) + while len(claimed_batches) < target_items and claimed_pages < target_pages: + next_batch = claim_next_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + queue_name=QUEUE_REPAIR, + ) + if next_batch is None: + break + claimed_batches.append(dict(next_batch)) + claimed_pages += max(0, int(next_batch.get("pages", 0))) + return claimed_batches + + +def _repair_batch_result( + *, + 
batch: dict, + render_sec_total: float, + infer_sec_total: float, + first_infer_started_at: Optional[float], + last_infer_completed_at: Optional[float], + batch_wall_time_sec: float, + execution_pack_batch_ids: List[int], + execution_pack_pages: int, +) -> dict: + batch_pages = int(batch.get("pages", len(list(batch.get("repair_page_numbers") or [])))) + return { + "docs": 1, + "pages": int(batch_pages), + "render_sec_total": float(render_sec_total), + "infer_sec_total": float(infer_sec_total), + "first_infer_started_at": _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None, + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None, + "batch_wall_time_sec": float(batch_wall_time_sec), + "execution_pack_batch_ids": [int(batch_id) for batch_id in execution_pack_batch_ids], + "execution_pack_pages": int(execution_pack_pages), + "execution_pack_items": int(len(execution_pack_batch_ids)), + "queue_name": QUEUE_REPAIR, + "batch_id": int(batch["batch_id"]) if "batch_id" in batch else None, + } + + def _run_vllm_batch( llm, *, @@ -856,71 +918,97 @@ def _render_producer() -> None: } -def _run_repair_batch_to_outputs( +def _run_repair_batches_to_outputs( args: argparse.Namespace, *, - batch: dict, + batches: List[dict], output_dir: Path, llm, plain_prompt: str, sampling_params, ) -> dict: batch_wall_start = time.perf_counter() - stem = str(batch["stem"]) - state = _load_persisted_doc_state(output_dir, stem) - source_start_page = int(batch["source_start_page"]) - repair_page_numbers = sorted({int(page_number) for page_number in list(batch.get("repair_page_numbers") or [])}) - if not repair_page_numbers: + claimed_batches = [dict(batch) for batch in batches] + if not claimed_batches: return { - "docs": 1, + "docs": 0, "pages": 0, "render_sec_total": 0.0, "infer_sec_total": 0.0, "first_infer_started_at": None, "last_infer_completed_at": None, "batch_wall_time_sec": float(time.perf_counter() 
- batch_wall_start), + "per_batch_results": {}, } - render_start = time.perf_counter() - source_page_numbers = [source_start_page + page_number - 1 for page_number in repair_page_numbers] + state_by_stem: Dict[str, dict] = {} repair_jobs: List[dict] = [] - for source_page_number, image in _iter_selected_rendered_pages( - Path(str(batch["pdf_path"])), - render_dpi=int(args.render_dpi), - source_page_numbers=source_page_numbers, - ): - repair_jobs.append( - { - "stem": stem, - "page_number": int(source_page_number) - source_start_page + 1, - "image": image, - } + per_batch_results: Dict[int, dict] = {} + execution_pack_batch_ids = [int(batch["batch_id"]) for batch in claimed_batches if "batch_id" in batch] + execution_pack_pages = int(sum(max(0, int(batch.get("pages", 0))) for batch in claimed_batches)) + render_sec_total = 0.0 + + for batch in claimed_batches: + batch_id = int(batch["batch_id"]) if "batch_id" in batch else None + stem = str(batch["stem"]) + state = state_by_stem.get(stem) + if state is None: + state = _load_persisted_doc_state(output_dir, stem) + state_by_stem[stem] = state + source_start_page = int(batch["source_start_page"]) + repair_page_numbers = sorted({int(page_number) for page_number in list(batch.get("repair_page_numbers") or [])}) + render_start = time.perf_counter() + if repair_page_numbers: + source_page_numbers = [source_start_page + page_number - 1 for page_number in repair_page_numbers] + for source_page_number, image in _iter_selected_rendered_pages( + Path(str(batch["pdf_path"])), + render_dpi=int(args.render_dpi), + source_page_numbers=source_page_numbers, + ): + repair_jobs.append( + { + "batch_id": batch_id, + "stem": stem, + "page_number": int(source_page_number) - source_start_page + 1, + "image": image, + } + ) + render_sec = float(time.perf_counter() - render_start) + render_sec_total += render_sec + if batch_id is not None: + per_batch_results[batch_id] = _repair_batch_result( + batch=batch, + render_sec_total=render_sec, + 
infer_sec_total=0.0, + first_infer_started_at=None, + last_infer_completed_at=None, + batch_wall_time_sec=float(time.perf_counter() - batch_wall_start), + execution_pack_batch_ids=execution_pack_batch_ids, + execution_pack_pages=execution_pack_pages, + ) + + first_infer_started_at: Optional[float] = None + last_infer_completed_at: Optional[float] = None + if repair_jobs: + first_infer_started_at = time.time() + repair_outputs = _generate_batch_outputs( + llm, + jobs=repair_jobs, + prompt=plain_prompt, + batch_size=max(1, int(args.batch_size)), + sampling_params=sampling_params, ) - render_sec = float(time.perf_counter() - render_start) - if not repair_jobs: - return { - "docs": 1, - "pages": 0, - "render_sec_total": render_sec, - "infer_sec_total": 0.0, - "first_infer_started_at": None, - "last_infer_completed_at": None, - "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), - } + last_infer_completed_at = time.time() + else: + repair_outputs = [] - first_infer_started_at = time.time() - repair_outputs = _generate_batch_outputs( - llm, - jobs=repair_jobs, - prompt=plain_prompt, - batch_size=max(1, int(args.batch_size)), - sampling_params=sampling_params, - ) - last_infer_completed_at = time.time() try: for result in repair_outputs: item = result["item"] + stem = str(item["stem"]) page_number = int(item["page_number"]) + batch_id = int(item["batch_id"]) if item.get("batch_id") is not None else None + state = state_by_stem[stem] metric = state["page_metrics"][page_number - 1] if metric is None: metric = { @@ -969,34 +1057,80 @@ def _run_repair_batch_to_outputs( if disposition["final_text"] is not None: state["page_outputs"][page_number - 1] = repair_effective_text metric["final_chars"] = int(len(repair_effective_text.strip())) + if batch_id is not None and batch_id in per_batch_results: + per_batch_results[batch_id]["infer_sec_total"] = float( + per_batch_results[batch_id]["infer_sec_total"] + float(result["infer_sec"]) + ) _close_job_image(item) 
finally: for item in repair_jobs: _close_job_image(item) - page_metrics = sorted([item for item in state["page_metrics"] if item], key=lambda item: int(item["page_number"])) - extra_metrics = dict(state["extra_metrics"]) - extra_metrics["repair_summary"] = _repair_summary_from_page_metrics(page_metrics, extra_metrics.get("repair_mode", args.repair_mode)) - extra_metrics["page_metrics"] = page_metrics - extra_metrics["infer_sec_total"] = float(sum(float(item["infer_sec"]) for item in page_metrics)) - _write_outputs( - output_dir, - stem, - _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]", - int(state["total_pages"]), - extra_metrics=extra_metrics, - ) + for stem, state in state_by_stem.items(): + page_metrics = sorted([item for item in state["page_metrics"] if item], key=lambda item: int(item["page_number"])) + extra_metrics = dict(state["extra_metrics"]) + extra_metrics["repair_summary"] = _repair_summary_from_page_metrics( + page_metrics, + extra_metrics.get("repair_mode", args.repair_mode), + ) + extra_metrics["page_metrics"] = page_metrics + extra_metrics["infer_sec_total"] = float(sum(float(item["infer_sec"]) for item in page_metrics)) + _write_outputs( + output_dir, + stem, + _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]", + int(state["total_pages"]), + extra_metrics=extra_metrics, + ) + + batch_wall_time_sec = float(time.perf_counter() - batch_wall_start) + for batch_id, result in per_batch_results.items(): + result["first_infer_started_at"] = ( + _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None + ) + result["last_infer_completed_at"] = ( + _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None + ) + result["batch_wall_time_sec"] = batch_wall_time_sec + return { - "docs": 1, - "pages": int(len(repair_page_numbers)), - "render_sec_total": render_sec, + "docs": int(len(state_by_stem)), + "pages": int( + sum(max(0, 
int(batch.get("pages", len(list(batch.get("repair_page_numbers") or []))))) for batch in claimed_batches) + ), + "render_sec_total": float(render_sec_total), "infer_sec_total": float(sum(float(result["infer_sec"]) for result in repair_outputs)), - "first_infer_started_at": _utc_now_iso(first_infer_started_at), - "last_infer_completed_at": _utc_now_iso(last_infer_completed_at), - "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + "first_infer_started_at": _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None, + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None, + "batch_wall_time_sec": batch_wall_time_sec, + "per_batch_results": per_batch_results, } +def _run_repair_batch_to_outputs( + args: argparse.Namespace, + *, + batch: dict, + output_dir: Path, + llm, + plain_prompt: str, + sampling_params, +) -> dict: + result = _run_repair_batches_to_outputs( + args, + batches=[batch], + output_dir=output_dir, + llm=llm, + plain_prompt=plain_prompt, + sampling_params=sampling_params, + ) + batch_id = int(batch["batch_id"]) if "batch_id" in batch else None + if batch_id is not None and batch_id in result["per_batch_results"]: + return dict(result["per_batch_results"][batch_id]) + result.pop("per_batch_results", None) + return result + + def _queue_has_pending_or_running(counts: Dict[str, object], queue_name: str) -> bool: queue_counts = counts.get("by_queue", {}).get(queue_name, {}) return int(queue_counts.get(STATUS_PENDING, 0)) > 0 or int(queue_counts.get(STATUS_RUNNING, 0)) > 0 @@ -1064,6 +1198,7 @@ def _run_work_queue( "engine_ready_at": _utc_now_iso(), "current_batch_id": None, "current_queue_name": None, + "current_batch_ids": [], "completed_batches": [], "first_batch_started_at": None, "last_batch_finished_at": None, @@ -1086,12 +1221,24 @@ def _run_work_queue( _write_worker_runtime(runtime_file, runtime_state) return 0 - batch_id = int(batch["batch_id"]) 
+ claimed_batches = [dict(batch)] + if queue_name == QUEUE_REPAIR: + claimed_batches = _claim_additional_repair_batches( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + first_batch=batch, + target_pages=int(args.repair_exec_batch_target_pages), + target_items=int(args.repair_exec_batch_target_items), + ) + batch_ids = [int(claimed_batch["batch_id"]) for claimed_batch in claimed_batches if "batch_id" in claimed_batch] + batch_id = batch_ids[0] heartbeat_stop = threading.Event() def _heartbeat_loop() -> None: while not heartbeat_stop.wait(heartbeat_interval): - heartbeat_batch(work_db, batch_id=batch_id, worker_id=worker_id) + for heartbeat_batch_id in batch_ids: + heartbeat_batch(work_db, batch_id=heartbeat_batch_id, worker_id=worker_id) runtime_state["heartbeat_at"] = _utc_now_iso() _write_worker_runtime(runtime_file, runtime_state) @@ -1101,7 +1248,8 @@ def _heartbeat_loop() -> None: runtime_state["status"] = f"running_{queue_name}" runtime_state["current_batch_id"] = batch_id runtime_state["current_queue_name"] = queue_name - runtime_state["current_batch_pages"] = int(batch.get("pages", 0)) + runtime_state["current_batch_ids"] = batch_ids + runtime_state["current_batch_pages"] = int(sum(int(claimed_batch.get("pages", 0)) for claimed_batch in claimed_batches)) runtime_state["heartbeat_at"] = _utc_now_iso() _write_worker_runtime(runtime_file, runtime_state) if queue_name == QUEUE_MAIN: @@ -1119,44 +1267,57 @@ def _heartbeat_loop() -> None: crop_mode=crop_mode, sampling_params=sampling_params, ) + per_batch_results = {batch_id: dict(result)} else: - result = _run_repair_batch_to_outputs( + result = _run_repair_batches_to_outputs( args, - batch=batch, + batches=claimed_batches, output_dir=output_dir, llm=llm, plain_prompt=plain_prompt, sampling_params=sampling_params, ) + per_batch_results = dict(result.get("per_batch_results") or {}) if runtime_state["first_batch_started_at"] is None: runtime_state["first_batch_started_at"] = 
result.get("first_infer_started_at") runtime_state["last_batch_finished_at"] = result.get("last_infer_completed_at") - runtime_state["completed_batches"].append( + runtime_state["completed_batches"].extend( { - "batch_id": batch_id, + "batch_id": int(claimed_batch["batch_id"]), "queue_name": queue_name, } + for claimed_batch in claimed_batches + if "batch_id" in claimed_batch ) - mark_batch_done(work_db, batch_id=batch_id, worker_id=worker_id, result=result) + for claimed_batch in claimed_batches: + claimed_batch_id = int(claimed_batch["batch_id"]) + mark_batch_done( + work_db, + batch_id=claimed_batch_id, + worker_id=worker_id, + result=per_batch_results.get(claimed_batch_id, dict(result)), + ) except Exception as exc: runtime_state["status"] = "failed" runtime_state["current_batch_id"] = batch_id runtime_state["current_queue_name"] = queue_name runtime_state["last_error"] = str(exc) _write_worker_runtime(runtime_file, runtime_state) - mark_batch_failed( - work_db, - batch_id=batch_id, - worker_id=worker_id, - error=str(exc), - max_attempts=max_attempts, - ) + for claimed_batch in claimed_batches: + mark_batch_failed( + work_db, + batch_id=int(claimed_batch["batch_id"]), + worker_id=worker_id, + error=str(exc), + max_attempts=max_attempts, + ) raise finally: heartbeat_stop.set() heartbeat_thread.join(timeout=max(1.0, heartbeat_interval)) runtime_state["current_batch_id"] = None runtime_state["current_queue_name"] = None + runtime_state["current_batch_ids"] = [] _write_worker_runtime(runtime_file, runtime_state) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 07f5285..fca7c2a 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -104,6 +104,8 @@ def _build_cli_command( gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, repair_mode: Optional[str], + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, work_db: 
Optional[Path] = None, worker_id: Optional[str] = None, worker_runtime_file: Optional[Path] = None, @@ -176,6 +178,10 @@ def _build_cli_command( cmd.append("--disable-fp8-kv") if repair_mode: cmd += ["--repair-mode", str(repair_mode)] + if repair_exec_batch_target_pages is not None: + cmd += ["--repair-exec-batch-target-pages", str(int(repair_exec_batch_target_pages))] + if repair_exec_batch_target_items is not None: + cmd += ["--repair-exec-batch-target-items", str(int(repair_exec_batch_target_items))] return cmd @@ -244,6 +250,8 @@ def _run_cli( gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, repair_mode: Optional[str], + repair_exec_batch_target_pages: Optional[int], + repair_exec_batch_target_items: Optional[int], visible_device: Optional[int] = None, ) -> None: cmd = _build_cli_command( @@ -272,6 +280,8 @@ def _run_cli( gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, ) env = _build_env(python_bin=python_bin, visible_device=visible_device, script=script) @@ -1035,6 +1045,8 @@ def _run_multi_cli( gpu_memory_utilization: Optional[float], disable_fp8_kv: bool, repair_mode: Optional[str], + repair_exec_batch_target_pages: Optional[int], + repair_exec_batch_target_items: Optional[int], scheduler: Optional[str], target_batch_pages: int, shard_pages: int, @@ -1124,6 +1136,8 @@ def _start_worker(*, worker_id: str, visible_device: int, respawns: int) -> Dict gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, work_db=work_db, worker_id=worker_id, worker_runtime_file=runtime_dir / f"{worker_id}.runtime.json", @@ -1279,6 +1293,8 @@ def _start_worker(*, worker_id: str, visible_device: int, respawns: 
int) -> Dict gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, ) env = _build_env(python_bin=python_exe, visible_device=visible_device, script=script_path) LOGGER.info( @@ -1338,6 +1354,8 @@ def run_for_files( disable_fp8_kv: bool = False, vllm_batch_size: Optional[int] = None, repair_mode: str = "auto", + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, scheduler: str = "auto", target_batch_pages: int = AUTO_VLLM_BATCH_PAGE_CAP, shard_pages: int = 0, @@ -1435,6 +1453,8 @@ def run_for_files( gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, scheduler=scheduler, target_batch_pages=int(max(1, target_batch_pages)), shard_pages=int(max(0, shard_pages)), @@ -1477,6 +1497,8 @@ def run_for_files( gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, ) _ensure_canonical_outputs(out_root=out_root, pdf_root=pdf_root, file_list=file_list) diff --git a/src/glossapi/scripts/openarchives_ocr_merge.py b/src/glossapi/scripts/openarchives_ocr_merge.py index a06af90..8b441be 100644 --- a/src/glossapi/scripts/openarchives_ocr_merge.py +++ b/src/glossapi/scripts/openarchives_ocr_merge.py @@ -1,9 +1,10 @@ from __future__ import annotations import argparse +import hashlib import shutil from pathlib import Path -from typing import List +from typing import Dict, List, Optional import pandas as pd @@ -29,36 +30,63 @@ def _normalize_key(df: pd.DataFrame, key: str) -> pd.Series: return 
df[key].astype(str).str.strip() -def _copy_artifacts( +def _collect_artifact_updates( *, shard_rows: pd.DataFrame, work_roots: List[Path], - output_root: Path, -) -> int: + output_root: Optional[Path], +) -> tuple[int, pd.DataFrame]: copied = 0 - markdown_out = output_root / "markdown" - metrics_out = output_root / "json" / "metrics" - markdown_out.mkdir(parents=True, exist_ok=True) - metrics_out.mkdir(parents=True, exist_ok=True) + markdown_out = output_root / "markdown" if output_root is not None else None + metrics_out = output_root / "json" / "metrics" if output_root is not None else None + if markdown_out is not None: + markdown_out.mkdir(parents=True, exist_ok=True) + if metrics_out is not None: + metrics_out.mkdir(parents=True, exist_ok=True) + updates: List[Dict[str, object]] = [] for row in shard_rows.to_dict(orient="records"): + merge_key = str(row.get("_merge_key") or "").strip() stem = str(row.get("filename_base") or Path(str(row.get("filename") or "")).stem).strip() if not stem: continue md_name = str(row.get("md_filename") or f"{stem}.md") + md_payload = None + md_relpath = None for root in work_roots: md_src = root / "markdown" / f"{stem}.md" if md_src.exists(): - shutil.copy2(md_src, markdown_out / md_name) - copied += 1 + md_payload = md_src.read_text(encoding="utf-8") + if markdown_out is not None: + shutil.copy2(md_src, markdown_out / md_name) + copied += 1 + md_relpath = str(Path("markdown") / md_name) break + metrics_relpath = None for suffix in (".metrics.json", ".per_page.metrics.json"): for root in work_roots: src = root / "json" / "metrics" / f"{stem}{suffix}" if src.exists(): - shutil.copy2(src, metrics_out / src.name) - copied += 1 + if metrics_out is not None: + shutil.copy2(src, metrics_out / src.name) + copied += 1 + metrics_relpath = str(Path("json") / "metrics" / src.name) break - return copied + if metrics_relpath is not None: + break + updates.append( + { + "_merge_key": merge_key, + "text": md_payload, + "ocr_markdown_relpath": 
md_relpath, + "ocr_metrics_relpath": metrics_relpath, + "ocr_text_sha256": ( + hashlib.sha256(md_payload.encode("utf-8")).hexdigest() + if isinstance(md_payload, str) + else None + ), + } + ) + return copied, pd.DataFrame(updates) def main(argv: List[str] | None = None) -> int: @@ -89,16 +117,31 @@ def main(argv: List[str] | None = None) -> int: continue master.loc[shards.index, column] = shards[column] - master = master.reset_index(drop=True).drop(columns=["_merge_key"], errors="ignore") - master.to_parquet(out_path, index=False) copied = 0 - if args.artifact_work_roots and str(args.artifact_output_root or "").strip(): + if args.artifact_work_roots: roots = [Path(p).expanduser().resolve() for p in args.artifact_work_roots] - copied = _copy_artifacts( + artifact_output_root = ( + Path(args.artifact_output_root).expanduser().resolve() + if str(args.artifact_output_root or "").strip() + else None + ) + copied, artifact_updates = _collect_artifact_updates( shard_rows=shards.reset_index(drop=True), work_roots=roots, - output_root=Path(args.artifact_output_root).expanduser().resolve(), + output_root=artifact_output_root, ) + if not artifact_updates.empty: + artifact_updates = artifact_updates.drop_duplicates(subset=["_merge_key"], keep="last").set_index("_merge_key") + for column in artifact_updates.columns: + if column in preserve_master_columns: + continue + if column not in master.columns: + master[column] = None + mask = artifact_updates[column].notna() + if bool(mask.any()): + master.loc[artifact_updates.index[mask], column] = artifact_updates.loc[mask, column] + master = master.reset_index(drop=True).drop(columns=["_merge_key"], errors="ignore") + master.to_parquet(out_path, index=False) print(f"Merged {len(shards)} shard row(s) into {master_path} -> {out_path}; copied {copied} artifact file(s)") return 0 diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py index 06d7b58..e553382 100644 --- 
a/tests/test_deepseek_multi_gpu_runtime.py +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -190,6 +190,45 @@ def test_work_queue_marks_batch_failed_after_one_retry(tmp_path): assert item["last_error"] == "second failure" +def test_claim_additional_repair_batches_packs_multiple_items(tmp_path): + from glossapi.ocr.deepseek import run_pdf_ocr_vllm + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db(db_path, batches=[]) + inserted = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + {"queue_key": "repair:1:a", "batch_id": 10, "stem": "a", "repair_page_numbers": [1, 2], "pages": 2}, + {"queue_key": "repair:1:b", "batch_id": 11, "stem": "b", "repair_page_numbers": [3, 4], "pages": 2}, + {"queue_key": "repair:1:c", "batch_id": 12, "stem": "c", "repair_page_numbers": [5], "pages": 1}, + ], + ) + assert inserted == [10, 11, 12] + + first = work_queue.claim_next_batch( + db_path, + worker_id="worker-pack", + stale_after_sec=60.0, + queue_name=work_queue.QUEUE_REPAIR, + now_ts=10.0, + ) + packed = run_pdf_ocr_vllm._claim_additional_repair_batches( + db_path, + worker_id="worker-pack", + stale_after_sec=60.0, + first_batch=first, + target_pages=4, + target_items=8, + ) + + assert [int(batch["batch_id"]) for batch in packed] == [10, 11] + counts = work_queue.work_queue_counts(db_path) + assert counts["by_queue"][work_queue.QUEUE_REPAIR][work_queue.STATUS_RUNNING] == 2 + assert counts["by_queue"][work_queue.QUEUE_REPAIR][work_queue.STATUS_PENDING] == 1 + + def test_claim_next_phase_batch_switches_to_repair_after_main_drains(tmp_path): from glossapi.ocr.deepseek import run_pdf_ocr_vllm from glossapi.ocr.deepseek import work_queue @@ -353,6 +392,8 @@ def test_build_cli_command_includes_work_queue_flags(tmp_path): gpu_memory_utilization=0.9, disable_fp8_kv=False, repair_mode="auto", + repair_exec_batch_target_pages=48, + repair_exec_batch_target_items=32, work_db=tmp_path / 
"work.sqlite", worker_id="worker_00_gpu0", worker_runtime_file=tmp_path / "worker_00.runtime.json", @@ -368,6 +409,8 @@ def test_build_cli_command_includes_work_queue_flags(tmp_path): assert "--work-stale-after-sec" in cmd and "900.0" in cmd assert "--work-heartbeat-sec" in cmd and "10.0" in cmd assert "--work-max-attempts" in cmd and "2" in cmd + assert "--repair-exec-batch-target-pages" in cmd and "48" in cmd + assert "--repair-exec-batch-target-items" in cmd and "32" in cmd def test_launch_worker_process_uses_start_new_session(monkeypatch): diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index db30b9a..8a07c57 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -421,6 +421,135 @@ def test_repair_batch_updates_persisted_outputs_with_repeat_cutoff_drop(tmp_path assert metrics["repair_summary"]["pages_dropped_after_repeat_cutoff"] == 1 +def test_repair_batch_pack_updates_multiple_stems(tmp_path, monkeypatch): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _run_repair_batches_to_outputs + + output_dir = tmp_path / "output" + _write_outputs( + output_dir=output_dir, + stem="doc_a", + markdown=_join_page_outputs(["bad a", "page two a"]), + page_count=2, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 1.0, + "raw_chars": 10, + "final_chars": 5, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + }, + { + "page_number": 2, + "infer_sec": 0.5, + "raw_chars": 9, + "final_chars": 9, + "repair_strategy": "none", + "repair_reason": None, + "repair_attempted": False, + "repair_applied": False, + 
"page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + }, + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + _write_outputs( + output_dir=output_dir, + stem="doc_b", + markdown=_join_page_outputs(["bad b"]), + page_count=1, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 0.7, + "raw_chars": 8, + "final_chars": 5, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + } + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._iter_selected_rendered_pages", + lambda pdf_path, *, render_dpi, source_page_numbers: [ + (page_number, Image.new("RGB", (4, 4), "white")) for page_number in source_page_numbers + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._generate_batch_outputs", + lambda llm, *, jobs, prompt, batch_size, sampling_params: [ + {"item": job, "raw_text": f"fixed-{job['stem']}-{job['page_number']}", "infer_sec": 0.25} + for job in jobs + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._postprocess_page_text", + lambda text, *, prompt, content_debug: (text, {"early_stops": 0}), + ) + + result = _run_repair_batches_to_outputs( + SimpleNamespace(render_dpi=144, batch_size=8, content_debug=False, repair_mode="auto"), + batches=[ + { + "batch_id": 10, + "stem": "doc_a", + "pdf_path": str(tmp_path / "doc_a.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + "pages": 1, + }, + { + "batch_id": 11, + "stem": "doc_b", + "pdf_path": str(tmp_path / "doc_b.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + "pages": 1, + }, + ], + 
output_dir=output_dir, + llm=object(), + plain_prompt="plain prompt", + sampling_params=object(), + ) + + markdown_a = (output_dir / "markdown" / "doc_a.md").read_text(encoding="utf-8") + markdown_b = (output_dir / "markdown" / "doc_b.md").read_text(encoding="utf-8") + metrics_a = json.loads((output_dir / "json" / "metrics" / "doc_a.metrics.json").read_text(encoding="utf-8")) + metrics_b = json.loads((output_dir / "json" / "metrics" / "doc_b.metrics.json").read_text(encoding="utf-8")) + + assert result["pages"] == 2 + assert result["docs"] == 2 + assert set(result["per_batch_results"]) == {10, 11} + assert _split_page_outputs(markdown_a)[0] == "fixed-doc_a-1" + assert _split_page_outputs(markdown_b)[0] == "fixed-doc_b-1" + assert metrics_a["repair_summary"]["pages_repaired"] == 1 + assert metrics_b["repair_summary"]["pages_repaired"] == 1 + + def test_vllm_progress_sidecar_keeps_absolute_page_numbers(tmp_path): from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _emit_progress diff --git a/tests/test_openarchives_ocr_shards.py b/tests/test_openarchives_ocr_shards.py index f724c68..aa0a66e 100644 --- a/tests/test_openarchives_ocr_shards.py +++ b/tests/test_openarchives_ocr_shards.py @@ -1,5 +1,6 @@ from __future__ import annotations +import hashlib import json from pathlib import Path @@ -173,5 +174,59 @@ def test_openarchives_ocr_merge_copies_markdown_artifacts(tmp_path: Path) -> Non merged = pd.read_parquet(out_path).set_index("source_doc_id") assert merged.loc["doc-1", "filename"] == "a.html" assert bool(merged.loc["doc-1", "ocr_success"]) is True + assert merged.loc["doc-1", "text"] == "ocr text" + assert merged.loc["doc-1", "ocr_markdown_relpath"] == "markdown/A.md" + assert merged.loc["doc-1", "ocr_metrics_relpath"] == "json/metrics/A.metrics.json" + assert merged.loc["doc-1", "ocr_text_sha256"] == hashlib.sha256(b"ocr text").hexdigest() assert (tmp_path / "final" / "markdown" / "A.md").exists() assert (tmp_path / "final" / "json" / "metrics" / 
"A.metrics.json").exists() + + +def test_openarchives_ocr_merge_embeds_text_without_copy_root(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": "doc-1", + "filename": "A.pdf", + "filename_base": "A", + "md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + (work_root / "markdown").mkdir(parents=True) + (work_root / "json" / "metrics").mkdir(parents=True) + (work_root / "markdown" / "A.md").write_text("embedded text", encoding="utf-8") + (work_root / "json" / "metrics" / "A.metrics.json").write_text("{}", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--artifact-work-roots", + str(work_root), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "text"] == "embedded text" + assert pd.isna(merged.loc["doc-1", "ocr_markdown_relpath"]) From a1f0eba0a76dbee5536220f338345a579c468035 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 03:58:37 +0300 Subject: [PATCH 51/93] Auto-wire CUDA wheel libs for DeepSeek workers --- docs/multi_gpu.md | 1 + src/glossapi/ocr/deepseek/runner.py | 23 ++++++++++++++++++++++- tests/test_deepseek_multi_gpu_runtime.py | 20 ++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index b67f238..3853f50 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -59,6 +59,7 @@ c.ocr( - Workers may pack 
multiple pending repair items into one larger execution batch. Queue durability stays item-granular, but the runtime no longer has to execute the repair tail as one tiny origin-shard retry at a time. - Each worker writes `sidecars/ocr_runtime/worker_*.runtime.json` with heartbeat state and steady-state timing markers. The runner also emits `gpu_preflight.json`, `gpu_telemetry.jsonl`, and `runtime_summary.json`. - The runner checks GPU persistence mode before launch by default. Control it with `GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT=off|warn|ensure`. The default is `ensure`, which will try `sudo -n nvidia-smi -pm 1` and record the result in `gpu_preflight.json`. +- When the DeepSeek runtime is built from wheel-managed CUDA packages, the runner now auto-discovers the venv's `site-packages/nvidia/*/lib` directories and prepends them to `LD_LIBRARY_PATH`. `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH` still works as a manual override or supplement. - Worker reliability knobs are environment-driven: `GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP`, `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS`, `GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC`, `GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC`, and `GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC`. - The default `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS=2` means one retry after the first failed claim, then the batch is marked failed instead of retrying forever. - `workers_per_gpu=1` remains the safe default on A100 40GB nodes. Prefer increasing `target_batch_pages` before adding more workers per device. 
diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index fca7c2a..1924ef7 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -218,9 +218,30 @@ def _build_env( for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" break + ld_entries: List[str] = [] + if python_bin: + venv_root = Path(python_bin).expanduser().resolve().parent.parent + for site_packages in sorted((venv_root / "lib").glob("python*/site-packages")): + nvidia_root = site_packages / "nvidia" + if not nvidia_root.is_dir(): + continue + for lib_dir in sorted(nvidia_root.glob("*/lib")): + if lib_dir.is_dir(): + ld_entries.append(str(lib_dir)) ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: - env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH', '')}" + ld_entries.extend(entry for entry in str(ld_path).split(os.pathsep) if entry) + existing_ld = str(env.get("LD_LIBRARY_PATH", "")).strip() + if existing_ld: + ld_entries.extend(entry for entry in existing_ld.split(os.pathsep) if entry) + if ld_entries: + deduped: List[str] = [] + seen: Set[str] = set() + for entry in ld_entries: + if entry and entry not in seen: + seen.add(entry) + deduped.append(entry) + env["LD_LIBRARY_PATH"] = os.pathsep.join(deduped) return env diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py index e553382..d82a2d6 100644 --- a/tests/test_deepseek_multi_gpu_runtime.py +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -3,6 +3,26 @@ from types import SimpleNamespace +def test_build_env_adds_wheel_managed_cuda_lib_dirs(tmp_path): + from glossapi.ocr.deepseek import runner + + venv_root = tmp_path / "venv" + python_bin = venv_root / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("") + cuda_runtime_lib = venv_root / "lib" / "python3.11" / "site-packages" / 
"nvidia" / "cuda_runtime" / "lib" + cublas_lib = venv_root / "lib" / "python3.11" / "site-packages" / "nvidia" / "cublas" / "lib" + cuda_runtime_lib.mkdir(parents=True, exist_ok=True) + cublas_lib.mkdir(parents=True, exist_ok=True) + + env = runner._build_env(python_bin=python_bin, visible_device=1, script=None) + + assert env["CUDA_VISIBLE_DEVICES"] == "1" + ld_entries = env["LD_LIBRARY_PATH"].split(":") + assert str(cuda_runtime_lib) in ld_entries + assert str(cublas_lib) in ld_entries + + def test_work_queue_requeues_stale_running_batch(tmp_path): from glossapi.ocr.deepseek import work_queue From c84769af6cbc9ffd97ae28af13d3fd9f206c1eea Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 04:04:10 +0300 Subject: [PATCH 52/93] Fix DeepSeek CUDA wheel lib discovery for venv symlinks --- src/glossapi/ocr/deepseek/runner.py | 5 ++++- tests/test_deepseek_multi_gpu_runtime.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 1924ef7..fb91d60 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -220,7 +220,10 @@ def _build_env( break ld_entries: List[str] = [] if python_bin: - venv_root = Path(python_bin).expanduser().resolve().parent.parent + # Keep the venv path semantics instead of resolving the interpreter symlink + # back to `/usr/bin/python...`; the wheel-managed CUDA libs live under the + # virtualenv tree, not under the system interpreter location. 
+ venv_root = Path(python_bin).expanduser().parent.parent for site_packages in sorted((venv_root / "lib").glob("python*/site-packages")): nvidia_root = site_packages / "nvidia" if not nvidia_root.is_dir(): diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py index d82a2d6..b2a7c01 100644 --- a/tests/test_deepseek_multi_gpu_runtime.py +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -23,6 +23,22 @@ def test_build_env_adds_wheel_managed_cuda_lib_dirs(tmp_path): assert str(cublas_lib) in ld_entries +def test_build_env_uses_virtualenv_path_when_python_bin_is_symlink(tmp_path): + from glossapi.ocr.deepseek import runner + + venv_root = tmp_path / "venv" + python_bin = venv_root / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.symlink_to("/usr/bin/python3") + cuda_runtime_lib = venv_root / "lib" / "python3.11" / "site-packages" / "nvidia" / "cuda_runtime" / "lib" + cuda_runtime_lib.mkdir(parents=True, exist_ok=True) + + env = runner._build_env(python_bin=python_bin, visible_device=0, script=None) + + ld_entries = env["LD_LIBRARY_PATH"].split(":") + assert str(cuda_runtime_lib) in ld_entries + + def test_work_queue_requeues_stale_running_batch(tmp_path): from glossapi.ocr.deepseek import work_queue From c32079028300da2608e66d0e3fc7b9206800a516 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 12:20:05 +0300 Subject: [PATCH 53/93] Add DeepSeek setup report and OCR shard unification --- docs/multi_gpu.md | 1 + docs/operations/deepseek_gcp_a100_setup.md | 110 +++++++ .../openarchives_ocr_rollout_plan.md | 1 + .../scripts/deepseek_runtime_report.py | 283 ++++++++++++++++++ .../scripts/openarchives_ocr_merge.py | 87 +++++- tests/test_openarchives_ocr_shards.py | 55 ++++ 6 files changed, 523 insertions(+), 14 deletions(-) create mode 100644 docs/operations/deepseek_gcp_a100_setup.md create mode 100644 src/glossapi/scripts/deepseek_runtime_report.py diff --git a/docs/multi_gpu.md 
b/docs/multi_gpu.md index 3853f50..162e901 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -63,6 +63,7 @@ c.ocr( - Worker reliability knobs are environment-driven: `GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP`, `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS`, `GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC`, `GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC`, and `GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC`. - The default `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS=2` means one retry after the first failed claim, then the batch is marked failed instead of retrying forever. - `workers_per_gpu=1` remains the safe default on A100 40GB nodes. Prefer increasing `target_batch_pages` before adding more workers per device. +- For fresh GCP A100 nodes, run `python -m glossapi.scripts.deepseek_runtime_report --repo-root ` before applying ad hoc fixes. Treat that report as the baseline comparison against a known-good node. See [operations/deepseek_gcp_a100_setup.md](operations/deepseek_gcp_a100_setup.md). ## Provider & Device Checks diff --git a/docs/operations/deepseek_gcp_a100_setup.md b/docs/operations/deepseek_gcp_a100_setup.md new file mode 100644 index 0000000..0991790 --- /dev/null +++ b/docs/operations/deepseek_gcp_a100_setup.md @@ -0,0 +1,110 @@ +# DeepSeek GCP A100 Setup + +This note captures the current known-good baseline for bringing up GlossAPI +DeepSeek OCR on fresh GCP A100 nodes and the required diagnosis workflow when a +fresh node does not behave like the already-converged fleet. + +## Goal + +Treat a fresh OCR node as a reproducible setup target, not as a one-off machine +that is repaired interactively until it happens to work. + +The target is a clean path from: + +1. create instance +2. bootstrap machine +3. prepare GlossAPI runtime +4. 
run a normal GlossAPI OCR workflow + +## Known-good baseline + +This rollout has validated the following stack on working OCR fleet nodes: + +- Ubuntu `22.04.5` +- NVIDIA driver `590.48.01` +- `A100 40GB` GPUs +- host Python `3.10` +- DeepSeek venv Python `3.11` +- `torch 2.10.0+cu130` +- `vllm 0.18.0` +- `transformers 4.57.6` +- `workers_per_gpu=1` + +The runner also expects GPU persistence mode to be enabled and will record the +preflight result under `sidecars/ocr_runtime/gpu_preflight.json`. + +## First command on a fresh node + +Run the checked-in runtime report before changing code or applying ad hoc fixes: + +```bash +python -m glossapi.scripts.deepseek_runtime_report --repo-root /opt/glossapi/repo +``` + +The report prints: + +- OS and hostname +- repo revision +- GPU model, driver, and memory +- selected Python executable and venv root +- `torch` / `vllm` / `transformers` import details +- wheel-managed NVIDIA library directories +- a focused `pip freeze` subset +- selected runtime environment variables + +Prefer comparing this output against a known-good OCR node before modifying +GlossAPI itself. + +## Fresh-node diagnosis rule + +If a fresh node fails, classify the problem before patching code: + +1. instance creation choice + - wrong image + - wrong driver path + - wrong machine family or GPU shape +2. bootstrap incompleteness + - missing system packages + - missing wheel-managed CUDA libraries + - model / cache / filesystem layout mismatch + - missing env wiring +3. actual GlossAPI runtime assumption + - hidden dependency on a particular venv layout + - hidden dependency on a specific CUDA wheel layout + - hidden runner / vLLM startup assumption + +Write down which class the current failure belongs to before making broad code +changes. 
+ +## Current benchmark-node findings + +The fresh `a2-highgpu-2g` benchmark node used during the April 3, 2026 work +surfaced two setup classes: + +- early missing shared-library failure: + - `ImportError: libcudart.so.12: cannot open shared object file` +- later engine startup failure after bootstrap fixes: + - `RuntimeError: Engine core initialization failed. Failed core proc(s): {}` + +This means instance creation itself worked, but bootstrap/runtime reproducibility +was incomplete. + +## Current runner expectation + +`glossapi.ocr.deepseek.runner._build_env()` now auto-discovers +`site-packages/nvidia/*/lib` directories under the selected DeepSeek virtualenv +and prepends them to `LD_LIBRARY_PATH`. + +This is the right place to normalize wheel-managed CUDA library discovery. Do +not rely on manual shell-session exports as the primary contract. + +## Practical bring-up checklist + +1. confirm the node matches the OS / driver baseline +2. run `deepseek_runtime_report` +3. compare report output to a known-good node +4. fix bootstrap mismatches first +5. rerun the report +6. only then run a small OCR validation workload +7. 
if OCR still fails, inspect worker logs and decide whether the remaining gap + belongs in GlossAPI runtime code or external bootstrap diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md index 848fb72..56136fa 100644 --- a/docs/operations/openarchives_ocr_rollout_plan.md +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -20,6 +20,7 @@ The rollout is backed by concrete scripts in `src/glossapi/scripts/`: - `openarchives_ocr_merge.py` - merges shard-level OCR metadata back into the canonical parquet by `filename` - can also embed merged OCR `text` plus artifact linkage fields back into the canonical rows when OCR markdown artifacts are available + - unifies page-range shard markdown back into one canonical document-level markdown artifact per OCR row before downstream handoff These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. diff --git a/src/glossapi/scripts/deepseek_runtime_report.py b/src/glossapi/scripts/deepseek_runtime_report.py new file mode 100644 index 0000000..d069073 --- /dev/null +++ b/src/glossapi/scripts/deepseek_runtime_report.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +import argparse +import json +import os +import platform +import subprocess +import sys +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + + +PACKAGE_NAMES = ( + "torch", + "vllm", + "transformers", + "nvidia.cuda_runtime", + "nvidia.cuda_nvrtc", +) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_runtime_report", + description="Print a reproducible DeepSeek OCR runtime report for a GlossAPI checkout.", + ) + p.add_argument("--repo-root", default=".") + p.add_argument("--python-bin", default="") + p.add_argument("--json", action="store_true") + return p.parse_args(argv) + + +def 
_detect_python_bin(repo_root: Path, explicit: str) -> Path: + if str(explicit).strip(): + return Path(explicit).expanduser().resolve() + candidates = ( + repo_root / "dependency_setup" / ".venvs" / "deepseek" / "bin" / "python", + repo_root / "dependency_setup" / "deepseek_uv" / ".venv" / "bin" / "python", + ) + for candidate in candidates: + if candidate.exists(): + return candidate.resolve() + return Path(sys.executable).resolve() + + +def _read_os_release() -> Dict[str, str]: + path = Path("/etc/os-release") + if not path.exists(): + return {} + out: Dict[str, str] = {} + for line in path.read_text(encoding="utf-8").splitlines(): + if "=" not in line: + continue + key, value = line.split("=", 1) + out[key] = value.strip().strip('"') + return out + + +def _run_text(*cmd: str) -> str: + try: + completed = subprocess.run( + list(cmd), + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + except FileNotFoundError: + return "" + return completed.stdout.strip() + + +def _gpu_rows() -> List[Dict[str, str]]: + text = _run_text( + "nvidia-smi", + "--query-gpu=index,name,driver_version,memory.total", + "--format=csv,noheader,nounits", + ) + rows: List[Dict[str, str]] = [] + for line in text.splitlines(): + parts = [part.strip() for part in line.split(",")] + if len(parts) != 4: + continue + rows.append( + { + "index": parts[0], + "name": parts[1], + "driver_version": parts[2], + "memory_total_mib": parts[3], + } + ) + return rows + + +def _python_json(python_bin: Path, code: str) -> Dict[str, Any]: + completed = subprocess.run( + [str(python_bin), "-c", code], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if completed.returncode != 0: + return { + "ok": False, + "stdout": completed.stdout.strip(), + "stderr": completed.stderr.strip(), + } + try: + return {"ok": True, "data": json.loads(completed.stdout)} + except json.JSONDecodeError: + return { + "ok": False, + "stdout": completed.stdout.strip(), + 
"stderr": completed.stderr.strip(), + } + + +def _package_report(python_bin: Path) -> Dict[str, Any]: + code = """ +import importlib +import json +import os +import sys + +mods = {} +for name in %s: + try: + mod = importlib.import_module(name) + mods[name] = { + "version": getattr(mod, "__version__", None), + "file": getattr(mod, "__file__", None), + } + except Exception as exc: + mods[name] = {"error": repr(exc)} + +payload = { + "python_version": sys.version, + "executable": sys.executable, + "virtual_env": os.environ.get("VIRTUAL_ENV"), + "ld_library_path": os.environ.get("LD_LIBRARY_PATH"), + "packages": mods, +} +print(json.dumps(payload)) +""" % (repr(PACKAGE_NAMES),) + return _python_json(python_bin, code) + + +def _site_package_nvidia_libs(venv_root: Path) -> List[Path]: + libs: List[Path] = [] + for site_packages in sorted((venv_root / "lib").glob("python*/site-packages")): + for lib_dir in sorted((site_packages / "nvidia").glob("*/lib")): + if lib_dir.is_dir(): + libs.append(lib_dir) + return libs + + +def _interesting_libs(lib_dir: Path) -> List[str]: + names = [] + for child in sorted(lib_dir.iterdir()): + if not child.is_file(): + continue + name = child.name + if any(token in name for token in ("libcudart", "libnvrtc", "libcudnn", "libcuda")): + names.append(name) + return names + + +def _venv_root(python_bin: Path) -> Path: + return python_bin.parent.parent + + +def _pip_freeze_subset(python_bin: Path) -> List[str]: + text = _run_text(str(python_bin), "-m", "pip", "freeze") + prefixes = ( + "torch", + "vllm", + "transformers", + "nvidia-cuda", + "nvidia-cudnn", + "xformers", + "flash-attn", + ) + lines = [] + for line in text.splitlines(): + normalized = line.strip().lower() + if any(normalized.startswith(prefix) for prefix in prefixes): + lines.append(line.strip()) + return lines + + +def _report(repo_root: Path, python_bin: Path) -> Dict[str, Any]: + os_release = _read_os_release() + venv_root = _venv_root(python_bin) + lib_dirs = 
_site_package_nvidia_libs(venv_root) + return { + "repo_root": str(repo_root), + "repo_head": _run_text("git", "-C", str(repo_root), "rev-parse", "HEAD"), + "hostname": platform.node(), + "os_release": { + "PRETTY_NAME": os_release.get("PRETTY_NAME"), + "VERSION_ID": os_release.get("VERSION_ID"), + }, + "python_bin": str(python_bin), + "venv_root": str(venv_root), + "gpus": _gpu_rows(), + "python_env": _package_report(python_bin), + "nvidia_lib_dirs": [ + { + "path": str(lib_dir), + "interesting_libs": _interesting_libs(lib_dir), + } + for lib_dir in lib_dirs + ], + "pip_freeze_subset": _pip_freeze_subset(python_bin), + "selected_env": { + "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES"), + "LD_LIBRARY_PATH": os.environ.get("LD_LIBRARY_PATH"), + "VIRTUAL_ENV": os.environ.get("VIRTUAL_ENV"), + }, + } + + +def _print_text(report: Dict[str, Any]) -> None: + print(f"repo_root: {report['repo_root']}") + print(f"repo_head: {report['repo_head']}") + print(f"hostname: {report['hostname']}") + os_release = report["os_release"] + print(f"os: {os_release.get('PRETTY_NAME')} (VERSION_ID={os_release.get('VERSION_ID')})") + print(f"python_bin: {report['python_bin']}") + print(f"venv_root: {report['venv_root']}") + print() + print("gpus:") + for row in report["gpus"]: + print( + f" - index={row['index']} name={row['name']} " + f"driver={row['driver_version']} memory_mib={row['memory_total_mib']}" + ) + print() + print("python_env:") + py_env = report["python_env"] + print(f" ok: {py_env.get('ok')}") + if py_env.get("ok"): + data = py_env["data"] + print(f" executable: {data.get('executable')}") + print(f" python_version: {data.get('python_version')}") + print(f" virtual_env: {data.get('virtual_env')}") + print(f" ld_library_path: {data.get('ld_library_path')}") + for name, package in data.get("packages", {}).items(): + print(f" {name}: {package}") + else: + print(f" stdout: {py_env.get('stdout')}") + print(f" stderr: {py_env.get('stderr')}") + print() + 
print("nvidia_lib_dirs:") + for item in report["nvidia_lib_dirs"]: + print(f" - path: {item['path']}") + for lib in item["interesting_libs"]: + print(f" {lib}") + print() + print("pip_freeze_subset:") + for line in report["pip_freeze_subset"]: + print(f" - {line}") + print() + print("selected_env:") + for key, value in report["selected_env"].items(): + print(f" {key}={value}") + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + repo_root = Path(args.repo_root).expanduser().resolve() + python_bin = _detect_python_bin(repo_root, str(args.python_bin or "")) + report = _report(repo_root, python_bin) + if args.json: + print(json.dumps(report, indent=2, ensure_ascii=False)) + else: + _print_text(report) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_merge.py b/src/glossapi/scripts/openarchives_ocr_merge.py index 8b441be..b88b8c3 100644 --- a/src/glossapi/scripts/openarchives_ocr_merge.py +++ b/src/glossapi/scripts/openarchives_ocr_merge.py @@ -2,6 +2,7 @@ import argparse import hashlib +import re import shutil from pathlib import Path from typing import Dict, List, Optional @@ -9,6 +10,9 @@ import pandas as pd +_MARKDOWN_SHARD_RE = re.compile(r"^(?P<stem>.+)__p(?P<start>\d+)-(?P<end>\d+)\.md$") + + def _parse_args(argv: List[str] | None = None) -> argparse.Namespace: p = argparse.ArgumentParser( prog="python -m glossapi.scripts.openarchives_ocr_merge", @@ -30,6 +34,66 @@ def _normalize_key(df: pd.DataFrame, key: str) -> pd.Series: return df[key].astype(str).str.strip() +def _merge_markdown_parts(parts: List[str]) -> str: + merged: List[str] = [] + for part in parts: + if not part: + continue + if merged and not merged[-1].endswith("\n"): + merged[-1] = merged[-1] + "\n" + merged.append(part) + return "".join(merged) + + +def _copy_once(src: Path, dst: Path) -> None: + dst.parent.mkdir(parents=True, exist_ok=True) + if dst.exists(): + return + shutil.copy2(src, dst)
+ + +def _resolve_markdown_payload( + *, + stem: str, + md_name: str, + work_roots: List[Path], + output_root: Optional[Path], +) -> tuple[Optional[str], Optional[str]]: + markdown_out = output_root / "markdown" if output_root is not None else None + shard_out = output_root / "sidecars" / "ocr_shards" / "markdown" if output_root is not None else None + + for root in work_roots: + canonical_src = root / "markdown" / f"{stem}.md" + if canonical_src.exists(): + payload = canonical_src.read_text(encoding="utf-8") + if markdown_out is not None: + _copy_once(canonical_src, markdown_out / md_name) + return payload, str(Path("markdown") / md_name) + return payload, None + + shard_sources = [] + for candidate in sorted((root / "markdown").glob(f"{stem}__p*.md")): + match = _MARKDOWN_SHARD_RE.match(candidate.name) + if not match or match.group("stem") != stem: + continue + shard_sources.append((int(match.group("start")), candidate)) + if not shard_sources: + continue + + shard_sources.sort(key=lambda item: item[0]) + payload = _merge_markdown_parts([path.read_text(encoding="utf-8") for _, path in shard_sources]) + if markdown_out is not None: + destination = markdown_out / md_name + destination.parent.mkdir(parents=True, exist_ok=True) + destination.write_text(payload, encoding="utf-8") + if shard_out is not None: + for _, shard_path in shard_sources: + _copy_once(shard_path, shard_out / shard_path.name) + return payload, str(Path("markdown") / md_name) + return payload, None + return None, None + + def _collect_artifact_updates( *, shard_rows: pd.DataFrame, @@ -39,8 +103,6 @@ def _collect_artifact_updates( copied = 0 markdown_out = output_root / "markdown" if output_root is not None else None metrics_out = output_root / "json" / "metrics" if output_root is not None else None - if markdown_out is not None: - markdown_out.mkdir(parents=True, exist_ok=True) if metrics_out is not None: metrics_out.mkdir(parents=True, exist_ok=True) updates: List[Dict[str, object]] = [] @@ 
-50,24 +112,21 @@ def _collect_artifact_updates( if not stem: continue md_name = str(row.get("md_filename") or f"{stem}.md") - md_payload = None - md_relpath = None - for root in work_roots: - md_src = root / "markdown" / f"{stem}.md" - if md_src.exists(): - md_payload = md_src.read_text(encoding="utf-8") - if markdown_out is not None: - shutil.copy2(md_src, markdown_out / md_name) - copied += 1 - md_relpath = str(Path("markdown") / md_name) - break + md_payload, md_relpath = _resolve_markdown_payload( + stem=stem, + md_name=md_name, + work_roots=work_roots, + output_root=output_root, + ) + if md_payload is not None and markdown_out is not None: + copied += 1 metrics_relpath = None for suffix in (".metrics.json", ".per_page.metrics.json"): for root in work_roots: src = root / "json" / "metrics" / f"{stem}{suffix}" if src.exists(): if metrics_out is not None: - shutil.copy2(src, metrics_out / src.name) + _copy_once(src, metrics_out / src.name) copied += 1 metrics_relpath = str(Path("json") / "metrics" / src.name) break diff --git a/tests/test_openarchives_ocr_shards.py b/tests/test_openarchives_ocr_shards.py index aa0a66e..d616225 100644 --- a/tests/test_openarchives_ocr_shards.py +++ b/tests/test_openarchives_ocr_shards.py @@ -230,3 +230,58 @@ def test_openarchives_ocr_merge_embeds_text_without_copy_root(tmp_path: Path) -> merged = pd.read_parquet(out_path).set_index("source_doc_id") assert merged.loc["doc-1", "text"] == "embedded text" assert pd.isna(merged.loc["doc-1", "ocr_markdown_relpath"]) + + +def test_openarchives_ocr_merge_unifies_markdown_shards(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "md_filename": "a.md", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": "doc-1", + "filename": "A.pdf", + "filename_base": "A", + "md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + 
shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + markdown_dir = work_root / "markdown" + markdown_dir.mkdir(parents=True) + (markdown_dir / "A__p00001-00096.md").write_text("part one", encoding="utf-8") + (markdown_dir / "A__p00097-00179.md").write_text("part two\n", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--artifact-work-roots", + str(work_root), + "--artifact-output-root", + str(tmp_path / "final"), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "text"] == "part one\npart two\n" + assert merged.loc["doc-1", "ocr_markdown_relpath"] == "markdown/A.md" + assert (tmp_path / "final" / "markdown" / "A.md").read_text(encoding="utf-8") == "part one\npart two\n" + assert (tmp_path / "final" / "sidecars" / "ocr_shards" / "markdown" / "A__p00001-00096.md").exists() + assert (tmp_path / "final" / "sidecars" / "ocr_shards" / "markdown" / "A__p00097-00179.md").exists() From d5655543ca83ac38d632895e35fc91191b71a707 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 12:21:35 +0300 Subject: [PATCH 54/93] Keep runtime report inside the venv --- src/glossapi/scripts/deepseek_runtime_report.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/glossapi/scripts/deepseek_runtime_report.py b/src/glossapi/scripts/deepseek_runtime_report.py index d069073..cb93729 100644 --- a/src/glossapi/scripts/deepseek_runtime_report.py +++ b/src/glossapi/scripts/deepseek_runtime_report.py @@ -32,15 +32,18 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: def _detect_python_bin(repo_root: Path, explicit: str) -> Path: if 
str(explicit).strip(): - return Path(explicit).expanduser().resolve() + path = Path(explicit).expanduser() + if not path.is_absolute(): + path = repo_root / path + return path.absolute() candidates = ( repo_root / "dependency_setup" / ".venvs" / "deepseek" / "bin" / "python", repo_root / "dependency_setup" / "deepseek_uv" / ".venv" / "bin" / "python", ) for candidate in candidates: if candidate.exists(): - return candidate.resolve() - return Path(sys.executable).resolve() + return candidate.absolute() + return Path(sys.executable).absolute() def _read_os_release() -> Dict[str, str]: From e1745ececf0c95262b3c52339232faed875866fb Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 12:54:52 +0300 Subject: [PATCH 55/93] Harden fresh-node GlossAPI setup --- dependency_setup/setup_deepseek_uv.sh | 38 ++++++++++++++++++ dependency_setup/setup_glossapi.sh | 38 ++++++++++++++++++ docs/getting_started.md | 27 +++++++++++++ docs/operations/deepseek_gcp_a100_setup.md | 46 ++++++++++++++++++---- 4 files changed, 142 insertions(+), 7 deletions(-) diff --git a/dependency_setup/setup_deepseek_uv.sh b/dependency_setup/setup_deepseek_uv.sh index 04a21ba..87ad8b6 100755 --- a/dependency_setup/setup_deepseek_uv.sh +++ b/dependency_setup/setup_deepseek_uv.sh @@ -33,6 +33,38 @@ Options: EOF } +prepend_path_if_dir() { + local dir="$1" + if [[ -d "${dir}" ]]; then + case ":${PATH}:" in + *":${dir}:"*) ;; + *) export PATH="${dir}:${PATH}" ;; + esac + fi +} + +ensure_stable_python() { + local python_bin="$1" + local release_level + release_level="$("${python_bin}" - <<'PY' +import sys +print(sys.version_info.releaselevel) +PY +)" + if [[ "${release_level}" != "final" ]]; then + error "Python interpreter ${python_bin} is not a stable final release (releaselevel=${release_level}). Install a stable CPython (for example via 'uv python install 3.11.11') and rerun with --python." + fi +} + +check_rust_toolchain() { + if ! 
command -v cargo >/dev/null 2>&1; then + error "cargo is required to build the Rust extensions. Install Rust (for example via rustup) and ensure cargo is on PATH." + fi + if ! cargo metadata --format-version 1 --manifest-path "${REPO_ROOT}/rust/glossapi_rs_cleaner/Cargo.toml" >/dev/null 2>&1; then + error "Current cargo cannot parse the repo Rust metadata/Cargo.lock. Upgrade Rust (for example 'rustup toolchain install stable && rustup default stable') and rerun setup." + fi +} + while (( "$#" )); do case "$1" in --venv) @@ -69,7 +101,13 @@ while (( "$#" )); do shift || true done +prepend_path_if_dir "${HOME}/.local/bin" +prepend_path_if_dir "${HOME}/.cargo/bin" + command -v uv >/dev/null 2>&1 || error "uv is required. Install it first, e.g. 'python3 -m pip install --user uv'." +command -v "${PYTHON_BIN}" >/dev/null 2>&1 || error "Python executable not found: ${PYTHON_BIN}" +ensure_stable_python "${PYTHON_BIN}" +check_rust_toolchain MODEL_DIR="${MODEL_ROOT}/DeepSeek-OCR-2" diff --git a/dependency_setup/setup_glossapi.sh b/dependency_setup/setup_glossapi.sh index 70e9754..d8d081b 100755 --- a/dependency_setup/setup_glossapi.sh +++ b/dependency_setup/setup_glossapi.sh @@ -32,6 +32,38 @@ Options: EOF } +prepend_path_if_dir() { + local dir="$1" + if [[ -d "${dir}" ]]; then + case ":${PATH}:" in + *":${dir}:"*) ;; + *) export PATH="${dir}:${PATH}" ;; + esac + fi +} + +ensure_stable_python() { + local python_bin="$1" + local release_level + release_level="$("${python_bin}" - <<'PY' +import sys +print(sys.version_info.releaselevel) +PY +)" + if [[ "${release_level}" != "final" ]]; then + error "Python interpreter ${python_bin} is not a stable final release (releaselevel=${release_level}). Install a stable CPython and rerun with --python." + fi +} + +check_rust_toolchain() { + if ! command -v cargo >/dev/null 2>&1; then + error "cargo is required to build the Rust extensions. Install Rust (for example via rustup) and ensure cargo is on PATH." + fi + if ! 
cargo metadata --format-version 1 --manifest-path "${REPO_ROOT}/rust/glossapi_rs_cleaner/Cargo.toml" >/dev/null 2>&1; then + error "Current cargo cannot parse the repo Rust metadata/Cargo.lock. Upgrade Rust (for example 'rustup toolchain install stable && rustup default stable') and rerun setup." + fi +} + while (( "$#" )); do case "$1" in --mode) @@ -72,6 +104,12 @@ while (( "$#" )); do shift || true done +prepend_path_if_dir "${HOME}/.local/bin" +prepend_path_if_dir "${HOME}/.cargo/bin" +command -v "${PYTHON_BIN}" >/dev/null 2>&1 || error "Python executable not found: ${PYTHON_BIN}" +ensure_stable_python "${PYTHON_BIN}" +check_rust_toolchain + case "${MODE}" in vanilla) warn "Mode 'vanilla' is deprecated; using 'docling' instead." diff --git a/docs/getting_started.md b/docs/getting_started.md index d1557d3..97e3905 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -8,6 +8,12 @@ This guide gets a new GlossAPI contributor from clone → first extraction with - Recent `pip` (or `uv`) and a C/C++ toolchain for Rust wheels - Optional: NVIDIA GPU with CUDA drivers for Docling/DeepSeek acceleration +On fresh Linux hosts, make these assumptions explicit instead of relying on shell startup files: + +- prefer a stable final CPython, not a prerelease distro build +- keep `~/.local/bin` on `PATH` if `uv` was installed with `pip install --user uv` +- keep `~/.cargo/bin` on `PATH` if Rust was installed with `rustup` + ## Install GlossAPI ### Recommended setup @@ -20,6 +26,7 @@ Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `d # DeepSeek OCR on GPU (uv-managed, downloads DeepSeek-OCR-2 if requested) ./dependency_setup/setup_deepseek_uv.sh \ + --python /path/to/stable/python \ --venv dependency_setup/.venvs/deepseek \ --model-root /path/to/deepseek-ocr-2-model \ --download-model \ @@ -29,8 +36,28 @@ Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `d `setup_glossapi.sh --mode deepseek` delegates to the 
same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `glossapi[deepseek]` and leaves Docling in the main environment. +On fresh GPU nodes, prefer a `uv`-managed stable Python such as: + +```bash +~/.local/bin/uv python install 3.11.11 +``` + +Then pass that interpreter explicitly to the setup scripts: + +```bash +./dependency_setup/setup_glossapi.sh \ + --mode docling \ + --python /home/$USER/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/bin/python3.11 \ + --venv dependency_setup/.venvs/docling + +./dependency_setup/setup_deepseek_uv.sh \ + --python /home/$USER/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/bin/python3.11 \ + --venv dependency_setup/.venvs/deepseek +``` + **DeepSeek runtime checklist** - Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. +- Run `python -m glossapi.scripts.deepseek_runtime_report` from the DeepSeek venv on fresh GPU nodes before ad hoc fixes. That captures the interpreter, CUDA wheel layout, and package versions used by the node. 
- Force the real runtime and avoid stub fallback by setting: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` diff --git a/docs/operations/deepseek_gcp_a100_setup.md b/docs/operations/deepseek_gcp_a100_setup.md index 0991790..8f4192d 100644 --- a/docs/operations/deepseek_gcp_a100_setup.md +++ b/docs/operations/deepseek_gcp_a100_setup.md @@ -24,7 +24,7 @@ This rollout has validated the following stack on working OCR fleet nodes: - NVIDIA driver `590.48.01` - `A100 40GB` GPUs - host Python `3.10` -- DeepSeek venv Python `3.11` +- DeepSeek venv Python `3.11` from a stable final CPython, not a prerelease distro build - `torch 2.10.0+cu130` - `vllm 0.18.0` - `transformers 4.57.6` @@ -89,6 +89,22 @@ surfaced two setup classes: This means instance creation itself worked, but bootstrap/runtime reproducibility was incomplete. +The concrete bootstrap issues found on that node were: + +- `uv` existed only in `~/.local/bin`, which non-interactive shells were not using +- the default DeepSeek venv was created against `/usr/bin/python3.11`, which on + that node was `Python 3.11.0rc1` +- system cargo/rustc were too old to parse the repo `Cargo.lock` +- the DeepSeek venv still needed the cu12 runtime pair for `vllm._C` to import: + - `nvidia-cuda-runtime-cu12` + - `nvidia-cuda-nvrtc-cu12` + +After correcting those bootstrap defects, the same fresh node was able to: + +- import `vllm._C` +- initialize a direct one-GPU `LLM(...)` +- start a real `openarchives_ocr_run_node` workload with `runtime_backend=vllm` + ## Current runner expectation `glossapi.ocr.deepseek.runner._build_env()` now auto-discovers @@ -101,10 +117,26 @@ not rely on manual shell-session exports as the primary contract. ## Practical bring-up checklist 1. confirm the node matches the OS / driver baseline -2. run `deepseek_runtime_report` -3. compare report output to a known-good node -4. fix bootstrap mismatches first -5. rerun the report -6. only then run a small OCR validation workload -7. 
if OCR still fails, inspect worker logs and decide whether the remaining gap +2. export user-local tool paths explicitly for non-interactive shells: + - `export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"` +3. install a stable CPython explicitly, for example: + - `~/.local/bin/uv python install 3.11.11` +4. run `deepseek_runtime_report` +5. compare report output to a known-good node +6. fix bootstrap mismatches first +7. rerun the report +8. only then run a small OCR validation workload +9. if OCR still fails, inspect worker logs and decide whether the remaining gap belongs in GlossAPI runtime code or external bootstrap + +## Rust note + +If editable installs fail while building `glossapi_rs_cleaner` or +`glossapi_rs_noise`, prefer a user-local modern Rust toolchain: + +```bash +curl https://sh.rustup.rs -sSf | sh -s -- -y +export PATH="$HOME/.cargo/bin:$PATH" +rustup toolchain install stable +rustup default stable +``` From 02f9e93d8dea4ab581c6371240fc8950912419e5 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 13:25:44 +0300 Subject: [PATCH 56/93] Harden OCR stage continuity and Docling tuning --- docs/api/corpus.md | 3 +- docs/configuration.md | 1 + docs/multi_gpu.md | 1 + .../ocr_changes_2026-04-01_to_2026-04-03.md | 82 +++++++++++++ docs/stages/ocr.md | 4 + src/glossapi/corpus/phase_extract.py | 27 ++++- src/glossapi/corpus/phase_ocr_math.py | 108 +++++++++++++++--- src/glossapi/ocr/docling/pipeline.py | 58 +++++++--- tests/test_docling_pipeline_tuning.py | 14 +++ tests/test_ocr_backends_smoke.py | 9 ++ tests/test_phase_extract_tuning.py | 16 +++ 11 files changed, 285 insertions(+), 38 deletions(-) create mode 100644 docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md create mode 100644 tests/test_docling_pipeline_tuning.py create mode 100644 tests/test_phase_extract_tuning.py diff --git a/docs/api/corpus.md b/docs/api/corpus.md index a0d6d10..3155d99 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -57,6 +57,7 @@ extract( - 
`force_ocr`: deprecated no-op kept for compatibility; OCR remediation now lives in `Corpus.ocr(backend='deepseek')` - `use_gpus='multi'`: use all visible GPUs through a shared work queue - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput + - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: optional environment override for how many PDFs one Docling worker processes per extractor batch; GlossAPI keeps the default at `1` until a benchmark proves a larger batch is safe on the target node - `export_doc_json=True`: write `json/<stem>.docling.json(.zst)` - `emit_formula_index=True`: also write `json/<stem>.formula_index.jsonl` - Main outputs: @@ -126,7 +127,7 @@ ocr( - Main outputs: - refreshed `markdown/<stem>.md` - refreshed cleaner/parquet metadata after OCR reruns - - when metadata parquet is present, a canonical OCR parquet should preserve the same row identity and carry corrected `text` together with the updated metadata + - when metadata parquet is present, OCR now preserves the same row identity and embeds corrected `text` plus direct OCR sidecar pointers such as `ocr_markdown_relpath`, `ocr_metrics_relpath`, and `ocr_text_sha256` - `json/<stem>.latex_map.jsonl` when enrichment runs ## formula_enrich_from_json() diff --git a/docs/configuration.md b/docs/configuration.md index af8737a..f03c521 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -22,6 +22,7 @@ Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: +- `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: override the number of PDF documents a single Phase‑1 Docling worker processes per extractor batch. Defaults to `1` in GlossAPI for stability; raise it deliberately when benchmarking fresh A100 nodes. - `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. -
- `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. - `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 162e901..8f36628 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -13,6 +13,7 @@ c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', worker - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. - `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. +- `GLOSSAPI_DOCLING_MAX_BATCH_FILES` lets one Docling worker take more than one PDF per extractor batch; keep the default `1` for fresh-node stability and benchmark larger values explicitly. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. diff --git a/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md b/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md new file mode 100644 index 0000000..734b5d8 --- /dev/null +++ b/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md @@ -0,0 +1,82 @@ +# OCR Changes Merged To `development` (2026-04-01 to 2026-04-03) + +This note summarizes the OCR-facing changes already merged into +`development`, centered on commit `489698e` (`deepseek reliability hardening`). + +Use it as a short operator/developer changelog for the April 1-3 rollout. + +## Runtime reliability + +- DeepSeek multi-GPU OCR now runs through a durable SQLite work queue instead of + fragile fixed subprocess assignment. +- Work items heartbeat while running and are requeued if a worker dies or goes + stale. 
+- Failed work items now default to one retry (`max_attempts=2` total attempts), + then become terminal failures for operator follow-up instead of bouncing + forever. +- Repair work is durable too: first-pass batches populate a second repair queue + that workers drain after the main queue is empty. +- Workers are launched in their own process groups so respawn can clean up + orphaned runtime processes and recover GPU memory. + +## Throughput and observability + +- vLLM OCR now renders pages into memory and feeds a bounded render queue + directly into inference, removing the temporary image-file round trip. +- Rendering and inference overlap during the first pass. +- Empty pages are detected before inference and skipped early. +- Per-worker runtime JSON, GPU preflight output, GPU telemetry, durable queue + state, and the final runtime summary now live under `sidecars/ocr_runtime/`. +- Runtime summaries now expose steady-state inference timestamps so long-run + throughput can be measured without startup noise. + +## Output contract and repair behavior + +- Canonical OCR outputs remain one `markdown/.md` and one + `json/metrics/.metrics.json` per source PDF. +- Page boundaries are annotated with `` comments alongside the + page split markers. +- Internal shard markdown and shard metrics move under `sidecars/ocr_shards/` + so downstream stages do not mistake them for canonical outputs. +- If a repair retry hits the garbage cutoff again, GlossAPI now blanks that page + slot instead of preserving the failed garbage text. +- Repair queue durability and repair execution packing are separate concerns: + queue accounting stays item-granular, while workers are allowed to combine + multiple repair items into one larger execution batch. + +## Fresh-node setup implications + +- The runner now auto-discovers wheel-managed CUDA libraries inside the selected + DeepSeek virtualenv and prepends them to `LD_LIBRARY_PATH`. 
+- Fresh A100 nodes should be validated first with: + +```bash +python -m glossapi.scripts.deepseek_runtime_report --repo-root +``` + +- The currently validated fleet baseline is: + - Ubuntu `22.04.5` + - NVIDIA driver `590.48.01` + - A100 `40GB` + - `torch 2.10.0+cu130` + - `vllm 0.18.0` + - `transformers 4.57.6` + - `workers_per_gpu=1` + +## Test coverage added with the merge + +- durable queue requeue / retry behavior +- repair queue enqueue and phase switching +- repair execution packing +- worker runtime summaries and runner contracts + +## What this doc does not cover + +This note only summarizes OCR work already merged into `development`. + +It does not describe the still-in-progress branch work for: + +- fresh-node bootstrap hardening beyond `development` +- stronger OCR metadata continuity +- canonical text-bearing OCR parquet outputs +- additional extract-clean-ocr integration validation diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 83f260b..65454eb 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -44,6 +44,10 @@ OCR reruns should preserve: ## DeepSeek runtime contract +For the operator-facing summary of the OCR changes already merged into +`development` during the April 1-3 rollout, see +`../operations/ocr_changes_2026-04-01_to_2026-04-03.md`. + - `ocr()` may execute page-range shards internally when `use_gpus="multi"` and `scheduler="exact_fill"`, but the stage contract remains one canonical Markdown file and one canonical metrics file per source PDF. - When shard execution is used, the runner reassembles `markdown/.md` and `json/metrics/.metrics.json` after the CLI workers finish. - Execution-time shard artifacts are moved under `sidecars/ocr_shards/` so downstream stages do not mistake them for canonical stage outputs. 
diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index 296429a..e210803 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -37,6 +37,24 @@ def _maybe_import_torch(force: bool = False): return _maybe_import_torch_fallback(force=force) +def _resolve_docling_max_batch_files(default: int = 1) -> int: + """Resolve the per-worker Docling document batch size for Phase-1 extraction. + + GlossAPI keeps the default conservative because fresh GPU nodes have been + more sensitive to bootstrap/runtime drift than to raw scheduler limits. + Strong GPUs can still be benchmarked explicitly by raising this knob. + """ + + fallback = max(1, int(default)) + raw = os.getenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES") + if not raw: + return fallback + try: + return max(1, int(raw)) + except Exception: + return fallback + + class ExtractPhaseMixin: def prime_extractor( self, @@ -112,8 +130,13 @@ def prime_extractor( # Configure batch/backend policy based on resolved choice if backend_choice == "docling": - # Keep docling runs conservative: process one document per batch for stability - self.extractor.configure_batch_policy("docling", max_batch_files=1, prefer_safe_backend=False) + # Keep docling runs conservative by default, but expose an explicit + # Phase-1 tuning hook for benchmark nodes and strong GPUs. 
+ self.extractor.configure_batch_policy( + "docling", + max_batch_files=_resolve_docling_max_batch_files(), + prefer_safe_backend=False, + ) else: self.extractor.configure_batch_policy("safe", max_batch_files=1, prefer_safe_backend=True) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 552af09..125c289 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -1,6 +1,7 @@ """OCR and math enrichment helpers split from Corpus.""" from __future__ import annotations +import hashlib import json import logging import math @@ -27,6 +28,85 @@ from .corpus_utils import _maybe_import_torch +def _build_ocr_stage_artifact_update( + *, + markdown_dir: Path, + metrics_dir: Path, + stem: str, +) -> Optional[Dict[str, object]]: + """Return direct OCR-owned artifact fields for one canonical OCR document. + + The OCR stage should hand off the same row identity that upstream stages + produced, with corrected text embedded back into parquet. Markdown and + metrics remain sidecars, but detached markdown alone is not the full stage + contract. 
+ """ + + markdown_path = Path(markdown_dir) / f"{stem}.md" + if not markdown_path.exists(): + return None + text_payload = markdown_path.read_text(encoding="utf-8") + metrics_path = Path(metrics_dir) / f"{stem}.metrics.json" + return { + "text": text_payload, + "ocr_markdown_relpath": str(Path("markdown") / markdown_path.name), + "ocr_metrics_relpath": ( + str(Path("json") / "metrics" / metrics_path.name) if metrics_path.exists() else None + ), + "ocr_text_sha256": hashlib.sha256(text_payload.encode("utf-8")).hexdigest(), + } + + +def _apply_ocr_success_updates( + df_meta: pd.DataFrame, + *, + filenames: List[str], + markdown_dir: Path, + metrics_dir: Path, + backend_norm: str, +) -> pd.DataFrame: + """Apply only direct, obvious OCR-owned metadata updates to the parquet rows.""" + + if "filename" not in df_meta.columns: + return df_meta + + if "filter" not in df_meta.columns: + df_meta["filter"] = "ok" + if "needs_ocr" not in df_meta.columns: + df_meta["needs_ocr"] = False + if "ocr_success" not in df_meta.columns: + df_meta["ocr_success"] = False + if "extraction_mode" not in df_meta.columns: + df_meta["extraction_mode"] = None + + direct_columns = ("text", "ocr_markdown_relpath", "ocr_metrics_relpath", "ocr_text_sha256") + for column in direct_columns: + if column not in df_meta.columns: + df_meta[column] = None + + for fname in filenames: + mask = df_meta["filename"].astype(str) == str(fname) + if not bool(mask.any()): + continue + stem = canonical_stem(fname) + artifact_update = _build_ocr_stage_artifact_update( + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + stem=stem, + ) + df_meta.loc[mask, "filter"] = "ok" + df_meta.loc[mask, "needs_ocr"] = False + df_meta.loc[mask, "ocr_success"] = True + if backend_norm == "deepseek": + df_meta.loc[mask, "extraction_mode"] = "deepseek" + if artifact_update is None: + continue + for column, value in artifact_update.items(): + df_meta.loc[mask, column] = value + + return df_meta + + class OcrMathPhaseMixin: def 
ocr( self, @@ -674,25 +754,15 @@ def _run_math(stems: List[str]) -> None: import pandas as _pd df_meta = _pd.read_parquet(parquet_path) - if "filename" in df_meta.columns: - if "filter" not in df_meta.columns: - df_meta["filter"] = "ok" - if "needs_ocr" not in df_meta.columns: - df_meta["needs_ocr"] = False - if "ocr_success" not in df_meta.columns: - df_meta["ocr_success"] = False - if "extraction_mode" not in df_meta.columns: - df_meta["extraction_mode"] = None - for _fname in success_files: - mask = df_meta["filename"].astype(str) == str(_fname) - if mask.any(): - df_meta.loc[mask, "filter"] = "ok" - df_meta.loc[mask, "needs_ocr"] = False - df_meta.loc[mask, "ocr_success"] = True - if backend_norm == "deepseek": - df_meta.loc[mask, "extraction_mode"] = "deepseek" - self._cache_metadata_parquet(parquet_path) - parquet_schema.write_metadata_parquet(df_meta, parquet_path) + df_meta = _apply_ocr_success_updates( + df_meta, + filenames=success_files, + markdown_dir=self.markdown_dir, + metrics_dir=self.output_dir / "json" / "metrics", + backend_norm=backend_norm, + ) + self._cache_metadata_parquet(parquet_path) + parquet_schema.write_metadata_parquet(df_meta, parquet_path) # Keep sectioner in sync with newly recovered files try: stems = [canonical_stem(_f) for _f in success_files] diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py index 8162e60..34b4192 100644 --- a/src/glossapi/ocr/docling/pipeline.py +++ b/src/glossapi/ocr/docling/pipeline.py @@ -6,19 +6,35 @@ from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, - LayoutOptions, PdfPipelineOptions, - PictureDescriptionApiOptions, TableFormerMode, TableStructureOptions, ) +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.pipeline_options import LayoutOptions +except ImportError: # pragma: no cover - older Docling versions + LayoutOptions = None + +try: # pragma: no cover - depends on installed Docling 
version + from docling.datamodel.pipeline_options import PictureDescriptionApiOptions +except ImportError: # pragma: no cover - older Docling versions + PictureDescriptionApiOptions = None + +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions +except ImportError: # pragma: no cover - older Docling versions + ThreadedPdfPipelineOptions = None + def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: """Return accelerator options and whether CUDA was requested.""" dev = device or "cuda:0" if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) + try: + acc = AcceleratorOptions(device=dev) + except Exception: + acc = AcceleratorOptions(device=dev.split(":", 1)[0]) want_cuda = dev.lower().startswith("cuda") else: want_cuda = str(dev).lower().startswith("cuda") @@ -35,6 +51,13 @@ def _apply_common_pdf_options( formula_enrichment: bool, code_enrichment: bool, ) -> PdfPipelineOptions: + def _supports_kwarg(model_cls, field_name: str) -> bool: + fields = getattr(model_cls, "model_fields", None) or getattr(model_cls, "__fields__", None) + if fields is None: + return True + return field_name in fields + + options_cls = ThreadedPdfPipelineOptions or PdfPipelineOptions table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) try: if hasattr(table_opts, "do_cell_matching"): @@ -42,22 +65,25 @@ def _apply_common_pdf_options( except Exception: pass - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) + option_kwargs = { + "accelerator_options": acc, + "do_ocr": False, + "do_table_structure": True, + 
"do_formula_enrichment": bool(formula_enrichment), + "do_code_enrichment": bool(code_enrichment), + "force_backend_text": False, + "generate_parsed_pages": False, + "allow_external_plugins": True, + } + if LayoutOptions is not None and _supports_kwarg(options_cls, "layout_options"): + option_kwargs["layout_options"] = LayoutOptions() + if _supports_kwarg(options_cls, "table_structure_options"): + option_kwargs["table_structure_options"] = table_opts + opts = options_cls(**{key: value for key, value in option_kwargs.items() if _supports_kwarg(options_cls, key)}) try: if hasattr(opts, "do_picture_description"): opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: + if PictureDescriptionApiOptions is not None and getattr(opts, "picture_description_options", None) is None: opts.picture_description_options = PictureDescriptionApiOptions() if hasattr(opts, "enable_remote_services"): opts.enable_remote_services = False diff --git a/tests/test_docling_pipeline_tuning.py b/tests/test_docling_pipeline_tuning.py new file mode 100644 index 0000000..1978db2 --- /dev/null +++ b/tests/test_docling_pipeline_tuning.py @@ -0,0 +1,14 @@ +from glossapi.ocr.docling import pipeline as docling_pipeline + + +def test_apply_common_pdf_options_prefers_threaded_pipeline_options_when_available(): + acc, _ = docling_pipeline._resolve_accelerator("cuda:0") + opts = docling_pipeline._apply_common_pdf_options( + acc=acc, + images_scale=1.25, + formula_enrichment=False, + code_enrichment=False, + ) + + expected_cls = docling_pipeline.ThreadedPdfPipelineOptions or docling_pipeline.PdfPipelineOptions + assert isinstance(opts, expected_cls) diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index 6c410c5..e2c2133 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -1,3 +1,4 @@ +import hashlib from pathlib import Path import pandas as pd @@ -63,4 +64,12 @@ def fake_enrich(files=None, 
**kwargs): # Verify updated = pd.read_parquet(parquet_path).set_index("filename") assert bool(updated.loc["needs.pdf", "ocr_success"]) is True + assert bool(updated.loc["needs.pdf", "needs_ocr"]) is False + assert updated.loc["needs.pdf", "text"] == "ds md\n" + assert updated.loc["needs.pdf", "ocr_markdown_relpath"] == "markdown/needs.md" + assert updated.loc["needs.pdf", "ocr_metrics_relpath"] == "json/metrics/needs.metrics.json" + assert ( + updated.loc["needs.pdf", "ocr_text_sha256"] + == hashlib.sha256(b"ds md\n").hexdigest() + ) assert captured.get("files") == ["clean"], "Math-only should run for non-OCR stem only" diff --git a/tests/test_phase_extract_tuning.py b/tests/test_phase_extract_tuning.py new file mode 100644 index 0000000..7d79f39 --- /dev/null +++ b/tests/test_phase_extract_tuning.py @@ -0,0 +1,16 @@ +from glossapi.corpus.phase_extract import _resolve_docling_max_batch_files + + +def test_resolve_docling_max_batch_files_defaults_to_conservative_batch(monkeypatch): + monkeypatch.delenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", raising=False) + assert _resolve_docling_max_batch_files() == 1 + + +def test_resolve_docling_max_batch_files_accepts_explicit_override(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "4") + assert _resolve_docling_max_batch_files() == 4 + + +def test_resolve_docling_max_batch_files_ignores_invalid_values(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "not-an-int") + assert _resolve_docling_max_batch_files() == 1 From 85da5dbd97138887f77c6951bfbac4d4285ba689 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 13:35:58 +0300 Subject: [PATCH 57/93] Add extraction checkpoint benchmark harness --- .../scripts/extract_checkpoint_benchmark.py | 208 ++++++++++++++++++ tests/test_extract_checkpoint_benchmark.py | 46 ++++ 2 files changed, 254 insertions(+) create mode 100644 src/glossapi/scripts/extract_checkpoint_benchmark.py create mode 100644 
tests/test_extract_checkpoint_benchmark.py diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py new file mode 100644 index 0000000..53bb2c6 --- /dev/null +++ b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import shutil +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +from glossapi import Corpus + + +HEADER_RE = re.compile(r"(?m)^[ \t]{0,3}#{1,6}\s+\S") + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.extract_checkpoint_benchmark", + description=( + "Run a strict Phase-1 extraction benchmark on a fixed PDF set and audit " + "canonical markdown outputs for presence, byte size, header counts, and drift." + ), + ) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--report-path", required=True) + p.add_argument("--baseline-report", default="") + p.add_argument("--phase1-backend", default="docling", choices=["auto", "safe", "docling"]) + p.add_argument("--accel-type", default="CUDA") + p.add_argument("--num-threads", type=int, default=1) + p.add_argument("--use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--devices", nargs="*", type=int, default=None) + p.add_argument("--workers-per-device", type=int, default=1) + p.add_argument("--benchmark-mode", action="store_true") + p.add_argument("--filenames", nargs="*", default=[]) + p.add_argument("--clean-output-dir", action="store_true") + p.add_argument("--log-level", default="INFO") + return p.parse_args(argv) + + +def _count_pdf_pages(pdf_path: Path) -> int: + import fitz + + doc = fitz.open(pdf_path) + try: + return int(doc.page_count) + finally: + doc.close() + + +def _sha256_bytes(data: bytes) -> str: + return 
hashlib.sha256(data).hexdigest() + + +def _markdown_headers(text: str) -> int: + return int(len(HEADER_RE.findall(text or ""))) + + +def _inventory_markdown(markdown_dir: Path, *, pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]: + inventory: Dict[str, Dict[str, Any]] = {} + for pdf_path in pdf_paths: + stem = pdf_path.stem + md_path = markdown_dir / f"{stem}.md" + present = md_path.exists() + payload = md_path.read_bytes() if present else b"" + text = payload.decode("utf-8") if present else "" + inventory[stem] = { + "filename": pdf_path.name, + "markdown_path": str(md_path), + "present": bool(present), + "byte_size": int(len(payload)), + "header_count": _markdown_headers(text), + "sha256": _sha256_bytes(payload) if present else None, + } + return inventory + + +def _compare_inventory( + current_inventory: Dict[str, Dict[str, Any]], + baseline_inventory: Dict[str, Dict[str, Any]], +) -> Dict[str, Any]: + added = [] + missing = [] + byte_size_changed = [] + header_count_changed = [] + sha_changed = [] + for stem, current in sorted(current_inventory.items()): + baseline = baseline_inventory.get(stem) + if baseline is None: + added.append(stem) + continue + if bool(baseline.get("present")) and not bool(current.get("present")): + missing.append(stem) + if int(baseline.get("byte_size", 0)) != int(current.get("byte_size", 0)): + byte_size_changed.append(stem) + if int(baseline.get("header_count", 0)) != int(current.get("header_count", 0)): + header_count_changed.append(stem) + if baseline.get("sha256") != current.get("sha256"): + sha_changed.append(stem) + for stem, baseline in sorted(baseline_inventory.items()): + if stem in current_inventory: + continue + if bool(baseline.get("present")): + missing.append(stem) + return { + "added_markdown": added, + "missing_markdown": sorted(set(missing)), + "byte_size_changed": byte_size_changed, + "header_count_changed": header_count_changed, + "sha_changed": sha_changed, + } + + +def _load_baseline_inventory(path: Path) -> 
Dict[str, Dict[str, Any]]: + payload = json.loads(path.read_text(encoding="utf-8")) + return dict(payload.get("markdown_inventory") or {}) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + input_dir = Path(args.input_dir).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + report_path = Path(args.report_path).expanduser().resolve() + report_path.parent.mkdir(parents=True, exist_ok=True) + + pdf_paths = sorted(input_dir.glob("*.pdf")) + if args.filenames: + selected = {str(name) for name in args.filenames} + pdf_paths = [path for path in pdf_paths if path.name in selected] + if not pdf_paths: + raise SystemExit(f"No PDF files selected under {input_dir}") + + if bool(args.clean_output_dir) and output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + total_pages = int(sum(_count_pdf_pages(path) for path in pdf_paths)) + start_ts = time.time() + start_perf = time.perf_counter() + + corpus = Corpus(input_dir=input_dir, output_dir=output_dir) + corpus.extract( + input_format="pdf", + accel_type=str(args.accel_type), + num_threads=int(args.num_threads), + phase1_backend=str(args.phase1_backend), + use_gpus=str(args.use_gpus), + devices=list(args.devices) if args.devices else None, + workers_per_device=int(args.workers_per_device), + benchmark_mode=bool(args.benchmark_mode), + filenames=[path.name for path in pdf_paths], + ) + + elapsed_sec = float(time.perf_counter() - start_perf) + end_ts = time.time() + markdown_dir = output_dir / "markdown" + inventory = _inventory_markdown(markdown_dir, pdf_paths=pdf_paths) + markdown_present = int(sum(1 for item in inventory.values() if bool(item["present"]))) + + report: Dict[str, Any] = { + "input_dir": str(input_dir), + "output_dir": str(output_dir), + "started_at": int(start_ts), + "finished_at": int(end_ts), + "elapsed_sec": elapsed_sec, + "files_total": int(len(pdf_paths)), + "pages_total": int(total_pages), + 
"pages_per_sec": (float(total_pages) / elapsed_sec) if elapsed_sec > 0 else None, + "phase1_backend": str(args.phase1_backend), + "accel_type": str(args.accel_type), + "num_threads": int(args.num_threads), + "use_gpus": str(args.use_gpus), + "devices": list(args.devices) if args.devices else [], + "workers_per_device": int(args.workers_per_device), + "benchmark_mode": bool(args.benchmark_mode), + "markdown_present": markdown_present, + "markdown_missing": int(len(pdf_paths) - markdown_present), + "markdown_inventory": inventory, + } + + baseline_raw = str(args.baseline_report or "").strip() + if baseline_raw: + baseline_path = Path(baseline_raw).expanduser().resolve() + if baseline_path.exists(): + report["comparison"] = _compare_inventory( + inventory, + _load_baseline_inventory(baseline_path), + ) + else: + report["comparison_error"] = f"Baseline report not found: {baseline_path}" + + report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print(json.dumps({ + "files_total": report["files_total"], + "pages_total": report["pages_total"], + "elapsed_sec": round(report["elapsed_sec"], 3), + "pages_per_sec": round(report["pages_per_sec"], 4) if report["pages_per_sec"] is not None else None, + "markdown_present": report["markdown_present"], + "markdown_missing": report["markdown_missing"], + "report_path": str(report_path), + }, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/test_extract_checkpoint_benchmark.py b/tests/test_extract_checkpoint_benchmark.py new file mode 100644 index 0000000..e8ac27b --- /dev/null +++ b/tests/test_extract_checkpoint_benchmark.py @@ -0,0 +1,46 @@ +import json +from pathlib import Path + +from glossapi.scripts import extract_checkpoint_benchmark as benchmark + + +def test_markdown_headers_counts_markdown_headings(): + text = "# Title\n\ntext\n## Subtitle\n\nnot a header\n### Third\n" + assert 
benchmark._markdown_headers(text) == 3 + + +def test_compare_inventory_detects_presence_size_header_and_sha_changes(): + baseline = { + "a": {"present": True, "byte_size": 10, "header_count": 1, "sha256": "old"}, + "b": {"present": True, "byte_size": 20, "header_count": 0, "sha256": "same"}, + } + current = { + "a": {"present": True, "byte_size": 12, "header_count": 2, "sha256": "new"}, + "c": {"present": True, "byte_size": 5, "header_count": 0, "sha256": "other"}, + } + diff = benchmark._compare_inventory(current, baseline) + assert diff["added_markdown"] == ["c"] + assert diff["missing_markdown"] == ["b"] + assert diff["byte_size_changed"] == ["a"] + assert diff["header_count_changed"] == ["a"] + assert diff["sha_changed"] == ["a"] + + +def test_load_baseline_inventory_reads_report_payload(tmp_path): + report_path = tmp_path / "baseline.json" + report_path.write_text( + json.dumps({"markdown_inventory": {"doc": {"present": True, "byte_size": 1, "header_count": 0}}}), + encoding="utf-8", + ) + assert benchmark._load_baseline_inventory(report_path)["doc"]["present"] is True + + +def test_inventory_markdown_marks_missing_files(tmp_path): + input_pdf = tmp_path / "sample.pdf" + input_pdf.write_bytes(b"%PDF-1.4\n") + markdown_dir = tmp_path / "markdown" + markdown_dir.mkdir() + inventory = benchmark._inventory_markdown(markdown_dir, pdf_paths=[input_pdf]) + assert inventory["sample"]["present"] is False + assert inventory["sample"]["byte_size"] == 0 + assert inventory["sample"]["header_count"] == 0 From 5de066e2d882acaead1e76c9de54e84da6e2398c Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 13:38:45 +0300 Subject: [PATCH 58/93] Fallback PDF page counting in benchmark harness --- .../scripts/extract_checkpoint_benchmark.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py index 53bb2c6..c0e9de0 100644 --- 
a/src/glossapi/scripts/extract_checkpoint_benchmark.py +++ b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -41,13 +41,30 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: def _count_pdf_pages(pdf_path: Path) -> int: - import fitz + try: + import fitz + + doc = fitz.open(pdf_path) + try: + return int(doc.page_count) + finally: + doc.close() + except Exception: + pass + + try: + from pypdf import PdfReader + + return int(len(PdfReader(str(pdf_path)).pages)) + except Exception: + pass - doc = fitz.open(pdf_path) try: - return int(doc.page_count) - finally: - doc.close() + from PyPDF2 import PdfReader # type: ignore + + return int(len(PdfReader(str(pdf_path)).pages)) + except Exception as exc: + raise RuntimeError(f"Unable to count PDF pages for {pdf_path}: {exc}") from exc def _sha256_bytes(data: bytes) -> str: From 4d46285419e22711deecb172efbb18410b35526b Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 13:40:06 +0300 Subject: [PATCH 59/93] Use pypdfium2 for benchmark page counts --- .../scripts/extract_checkpoint_benchmark.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py index c0e9de0..72d6d6e 100644 --- a/src/glossapi/scripts/extract_checkpoint_benchmark.py +++ b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -53,18 +53,30 @@ def _count_pdf_pages(pdf_path: Path) -> int: pass try: - from pypdf import PdfReader + import pypdfium2 as pdfium - return int(len(PdfReader(str(pdf_path)).pages)) + pdf = pdfium.PdfDocument(str(pdf_path)) + try: + return int(len(pdf)) + finally: + try: + pdf.close() + except Exception: + pass except Exception: pass try: - from PyPDF2 import PdfReader # type: ignore + from pypdf import PdfReader return int(len(PdfReader(str(pdf_path)).pages)) except Exception as exc: - raise RuntimeError(f"Unable to count PDF pages for {pdf_path}: {exc}") 
from exc + try: + from PyPDF2 import PdfReader # type: ignore + + return int(len(PdfReader(str(pdf_path)).pages)) + except Exception as exc2: + raise RuntimeError(f"Unable to count PDF pages for {pdf_path}: {exc2}") from exc2 def _sha256_bytes(data: bytes) -> str: From 4e49582254ed861f93f53c201ab90b9ac91dec1a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 14:05:25 +0300 Subject: [PATCH 60/93] Pack multi-GPU extraction work by page budget --- src/glossapi/corpus/corpus_orchestrator.py | 162 ++++++++++----------- src/glossapi/corpus/phase_extract.py | 158 +++++++++++++++++++- tests/test_corpus_guards.py | 55 +++++++ tests/test_phase_extract_tuning.py | 58 +++++++- 4 files changed, 341 insertions(+), 92 deletions(-) diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index 7f254f1..276c060 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -354,7 +354,7 @@ def gpu_extract_worker_queue( worker_key: str, in_dir: str, out_dir: str, - work_q, # multiprocessing Queue of filename strings + work_q, # multiprocessing Queue of filename strings or bundled path lists force: bool, fe: bool, ce: bool, @@ -420,6 +420,21 @@ def _clear_current() -> None: _marker_path.unlink(missing_ok=True) except Exception: pass + + def _normalize_work_item(item: Any) -> List[str]: + if isinstance(item, str): + return [item] if item.strip() else [] + if isinstance(item, (list, tuple, set)): + normalized: List[str] = [] + for value in item: + try: + text = str(value).strip() + except Exception: + continue + if text: + normalized.append(text) + return normalized + return [] _worker_log_handle = None try: _log_dir = _os.environ.get("GLOSSAPI_WORKER_LOG_DIR") @@ -579,104 +594,79 @@ def _report_batch(ok_list, bad_list): last_progress = _time.time() processed = 0 exit_code = 0 + + def _run_batch(batch_items: List[str]) -> None: + nonlocal processed, exit_code + if not batch_items: + 
return + try: + _update_current(list(batch_items)) + c.extract( + input_format=input_fmt, + num_threads=threads, + accel_type="cuda:0", + force_ocr=force, + formula_enrichment=fe, + code_enrichment=ce, + file_paths=list(batch_items), + skip_existing=skip, + use_gpus="single", + use_cls=use_cls_w, + benchmark_mode=benchmark, + export_doc_json=bool(export_json), + emit_formula_index=bool(emit_index), + phase1_backend=backend, + _prepared=True, + ) + processed += len(batch_items) + _clear_current() + except Exception as _e: + exit_code = 1 + print(f"[GPU{device_id}] Batch failed ({len(batch_items)}): {_e}") + if result_q is not None: + try: + result_q.put( + { + "event": "batch", + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, + "processed": [], + "problematic": list(batch_items), + "pid": _os.getpid(), + "error": str(_e), + } + ) + except Exception: + pass + _clear_current() + try: while True: try: - nm = work_q.get_nowait() + work_item = work_q.get_nowait() except _queue.Empty: # queue.Empty or other -> flush any pending batch then exit if batch: - try: - _update_current(list(batch)) - c.extract( - input_format=input_fmt, - num_threads=threads, - accel_type="cuda:0", - force_ocr=force, - formula_enrichment=fe, - code_enrichment=ce, - file_paths=list(batch), - skip_existing=skip, - use_gpus="single", - use_cls=use_cls_w, - benchmark_mode=benchmark, - export_doc_json=bool(export_json), - emit_formula_index=bool(emit_index), - phase1_backend=backend, - _prepared=True, - ) - processed += len(batch) - _clear_current() - except Exception as _e: - exit_code = 1 - print(f"[GPU{device_id}] Batch failed ({len(batch)}): {_e}") - if result_q is not None: - try: - result_q.put( - { - "event": "batch", - "worker": _worker_label, - "device_id": device_id, - "worker_slot": worker_slot, - "processed": [], - "problematic": list(batch), - "pid": _os.getpid(), - "error": str(_e), - } - ) - except Exception: - pass - _clear_current() + 
_run_batch(batch) batch.clear() break except Exception as exc: exit_code = 1 print(f"[GPU{device_id}] Queue receive error: {exc}") break - if isinstance(nm, str) and nm.strip(): - batch.append(nm) + normalized = _normalize_work_item(work_item) + if not normalized: + continue + if len(normalized) > 1: + if batch: + _run_batch(batch) + batch.clear() + _run_batch(normalized) + continue + batch.extend(normalized) if len(batch) >= BATCH_SIZE: - try: - _update_current(list(batch)) - c.extract( - input_format=input_fmt, - num_threads=threads, - accel_type="cuda:0", - force_ocr=force, - formula_enrichment=fe, - code_enrichment=ce, - file_paths=list(batch), - skip_existing=skip, - use_gpus="single", - use_cls=use_cls_w, - benchmark_mode=benchmark, - export_doc_json=bool(export_json), - emit_formula_index=bool(emit_index), - phase1_backend=backend, - _prepared=True, - ) - processed += len(batch) - _clear_current() - except Exception as _e: - exit_code = 1 - print(f"[GPU{device_id}] Batch failed ({len(batch)}): {_e}") - if result_q is not None: - try: - result_q.put( - { - "event": "batch", - "worker": _worker_label, - "device_id": device_id, - "worker_slot": worker_slot, - "processed": [], - "problematic": list(batch), - "pid": _os.getpid(), - "error": str(_e), - } - ) - except Exception: - pass - _clear_current() + _run_batch(batch) batch.clear() # Occasional heartbeat if _time.time() - last_progress > 30: diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index e210803..dc825f6 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -13,7 +13,7 @@ import sys import time from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np import pandas as pd @@ -55,6 +55,109 @@ def _resolve_docling_max_batch_files(default: int = 1) -> int: return fallback +def 
_resolve_docling_batch_target_pages(default: int = 256) -> int: + """Resolve the target page budget per queued Docling extraction work item.""" + + fallback = max(1, int(default)) + raw = os.getenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES") + if not raw: + return fallback + try: + return max(1, int(raw)) + except Exception: + return fallback + + +def _estimate_extract_work_pages(path: Path) -> int: + """Best-effort PDF page estimate used for Phase-1 queue packing.""" + + suffix = path.suffix.lower() + if suffix != ".pdf": + return 1 + + try: + import pypdfium2 as pdfium # type: ignore + + pdf = pdfium.PdfDocument(str(path)) + try: + return max(1, int(len(pdf))) + finally: + close = getattr(pdf, "close", None) + if callable(close): + close() + except Exception: + pass + + for module_name, attr_name in ( + ("pypdf", "PdfReader"), + ("PyPDF2", "PdfReader"), + ): + try: + module = __import__(module_name, fromlist=[attr_name]) + reader_cls = getattr(module, attr_name) + reader = reader_cls(str(path)) + return max(1, int(len(reader.pages))) + except Exception: + continue + + return 1 + + +def _build_extract_work_items( + paths: Iterable[Path], + *, + max_batch_files: int, + target_batch_pages: int, + long_pdf_page_threshold: int = 600, + page_counter: Optional[Callable[[Path], int]] = None, +) -> List[List[Path]]: + """Pack extraction work into steadier page-budget batches for multi-GPU runs.""" + + files = [Path(path) for path in paths] + if not files: + return [] + + max_files = max(1, int(max_batch_files)) + target_pages = max(1, int(target_batch_pages)) + long_threshold = max(1, int(long_pdf_page_threshold)) + counter = page_counter or _estimate_extract_work_pages + + packed: List[Tuple[List[Path], int]] = [] + standalone: List[Tuple[List[Path], int]] = [] + + for path in files: + try: + est_pages = max(1, int(counter(path))) + except Exception: + est_pages = 1 + + if path.suffix.lower() == ".pdf" and est_pages > long_threshold: + standalone.append(([path], est_pages)) + 
continue + + best_idx: Optional[int] = None + best_leftover: Optional[int] = None + for idx, (bundle_paths, bundle_pages) in enumerate(packed): + if len(bundle_paths) >= max_files: + continue + new_pages = bundle_pages + est_pages + if bundle_paths and new_pages > target_pages: + continue + leftover = max(0, target_pages - new_pages) + if best_leftover is None or leftover < best_leftover: + best_idx = idx + best_leftover = leftover + if best_idx is None: + packed.append(([path], est_pages)) + else: + packed[best_idx][0].append(path) + packed[best_idx] = (packed[best_idx][0], packed[best_idx][1] + est_pages) + + work_items = standalone + packed + work_items.sort(key=lambda item: item[1], reverse=True) + return [bundle_paths for bundle_paths, _ in work_items] + + class ExtractPhaseMixin: def prime_extractor( self, @@ -488,9 +591,47 @@ def extract( task_q = ctx.Queue() result_q = ctx.Queue() status_map = manager.dict() - path_list = [str(p.resolve()) for p in pending_files] - for full_path in path_list: - task_q.put(full_path) + batch_target_pages = 1 + configured_max_batch_files = 1 + long_pdf_page_threshold = 600 + work_items: List[List[Path]] = [[Path(p)] for p in pending_files] + try: + extractor = getattr(self, "extractor", None) + if extractor is not None: + configured_max_batch_files = max( + 1, int(getattr(extractor, "max_batch_files", configured_max_batch_files)) + ) + long_pdf_page_threshold = max( + 1, int(getattr(extractor, "long_pdf_page_threshold", long_pdf_page_threshold)) + ) + except Exception: + configured_max_batch_files = 1 + long_pdf_page_threshold = 600 + if backend_choice == "docling": + batch_target_pages = _resolve_docling_batch_target_pages() + work_items = _build_extract_work_items( + pending_files, + max_batch_files=configured_max_batch_files, + target_batch_pages=batch_target_pages, + long_pdf_page_threshold=long_pdf_page_threshold, + ) + queue_items = [[str(path.resolve()) for path in item] for item in work_items] + for queue_item in 
queue_items: + task_q.put(queue_item) + total_estimated_pages = 0 + try: + total_estimated_pages = sum(_estimate_extract_work_pages(path) for path in pending_files) + except Exception: + total_estimated_pages = 0 + self.logger.info( + "Phase-1 dispatch: %d file(s) -> %d work item(s) (backend=%s max_batch_files=%d target_pages=%d est_pages=%d)", + len(pending_files), + len(queue_items), + backend_choice, + configured_max_batch_files, + batch_target_pages, + total_estimated_pages, + ) worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") worker_log_dir_to_use = worker_log_dir_env if not worker_log_dir_to_use: @@ -699,7 +840,7 @@ def extract( now = time.time() if now - last_summary > 30: try: - pending = result_q.qsize() + pending = task_q.qsize() except NotImplementedError: pending = -1 self.logger.info( @@ -740,6 +881,13 @@ def extract( pending_item = task_q.get_nowait() if isinstance(pending_item, str) and pending_item.strip(): remaining_after_failure.append(pending_item) + continue + if isinstance(pending_item, (list, tuple, set)): + remaining_after_failure.extend( + str(item).strip() + for item in pending_item + if str(item).strip() + ) except queue.Empty: pass if remaining_after_failure: diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index a5ea0b1..525db7a 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -228,4 +228,59 @@ def extract(self, *, file_paths=None, **kwargs): assert processed_batches == [["doc.pdf"]] assert work_q.empty() + + +def test_gpu_worker_accepts_bundled_work_items(tmp_path, monkeypatch): + import glossapi.corpus as corpus_mod + + processed_batches = [] + + class FakeCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + self.extractor = SimpleNamespace(max_batch_files=1) + + def prime_extractor(self, *args, **kwargs): + return None + + def extract(self, *, file_paths=None, **kwargs): + 
processed_batches.append(list(file_paths or [])) + return None + + monkeypatch.setattr(corpus_mod, "Corpus", FakeCorpus) + monkeypatch.setattr("glossapi.Corpus", FakeCorpus) + monkeypatch.delenv("GLOSSAPI_WORKER_LOG_DIR", raising=False) + + work_q = queue.Queue() + work_q.put(["doc-a.pdf", "doc-b.pdf"]) + result_q = queue.Queue() + status_map: dict = {} + + with pytest.raises(SystemExit) as exit_info: + corpus_mod.gpu_extract_worker_queue( + device_id=0, + worker_slot=0, + worker_key="gpu0-w0", + in_dir=str(tmp_path), + out_dir=str(tmp_path), + work_q=work_q, + force=False, + fe=False, + ce=False, + use_cls_w=False, + skip=False, + input_fmt="pdf", + threads=1, + benchmark=False, + export_json=False, + emit_index=False, + backend="safe", + result_q=result_q, + status_map=status_map, + marker_dir=None, + ) + + assert exit_info.value.code == 0 + assert processed_batches == [["doc-a.pdf", "doc-b.pdf"]] assert status_map == {} diff --git a/tests/test_phase_extract_tuning.py b/tests/test_phase_extract_tuning.py index 7d79f39..e5a3a35 100644 --- a/tests/test_phase_extract_tuning.py +++ b/tests/test_phase_extract_tuning.py @@ -1,4 +1,10 @@ -from glossapi.corpus.phase_extract import _resolve_docling_max_batch_files +from pathlib import Path + +from glossapi.corpus.phase_extract import ( + _build_extract_work_items, + _resolve_docling_batch_target_pages, + _resolve_docling_max_batch_files, +) def test_resolve_docling_max_batch_files_defaults_to_conservative_batch(monkeypatch): @@ -14,3 +20,53 @@ def test_resolve_docling_max_batch_files_accepts_explicit_override(monkeypatch): def test_resolve_docling_max_batch_files_ignores_invalid_values(monkeypatch): monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "not-an-int") assert _resolve_docling_max_batch_files() == 1 + + +def test_resolve_docling_batch_target_pages_defaults(monkeypatch): + monkeypatch.delenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", raising=False) + assert _resolve_docling_batch_target_pages() == 256 + + +def 
test_resolve_docling_batch_target_pages_accepts_override(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", "384") + assert _resolve_docling_batch_target_pages() == 384 + + +def test_build_extract_work_items_packs_smaller_files_by_page_budget(): + paths = [Path("a.pdf"), Path("b.pdf"), Path("c.pdf"), Path("d.pdf")] + pages = { + "a.pdf": 140, + "b.pdf": 120, + "c.pdf": 110, + "d.pdf": 90, + } + + items = _build_extract_work_items( + paths, + max_batch_files=2, + target_batch_pages=250, + long_pdf_page_threshold=600, + page_counter=lambda path: pages[path.name], + ) + + assert [[p.name for p in item] for item in items] == [["a.pdf", "c.pdf"], ["b.pdf", "d.pdf"]] + + +def test_build_extract_work_items_keeps_long_pdf_as_standalone_work_item(): + paths = [Path("huge.pdf"), Path("small-a.pdf"), Path("small-b.pdf")] + pages = { + "huge.pdf": 1200, + "small-a.pdf": 100, + "small-b.pdf": 80, + } + + items = _build_extract_work_items( + paths, + max_batch_files=3, + target_batch_pages=250, + long_pdf_page_threshold=600, + page_counter=lambda path: pages[path.name], + ) + + assert [p.name for p in items[0]] == ["huge.pdf"] + assert sorted(p.name for p in items[1]) == ["small-a.pdf", "small-b.pdf"] From 442fe467dd298947cadcc7d3a3159e3ae0ba9bb2 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 14:48:25 +0300 Subject: [PATCH 61/93] Document extraction queue tuning knobs --- docs/api/corpus.md | 1 + docs/configuration.md | 1 + docs/multi_gpu.md | 3 ++- src/glossapi/corpus/phase_extract.py | 15 ++++++++++++--- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 3155d99..22d0d21 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -58,6 +58,7 @@ extract( - `use_gpus='multi'`: use all visible GPUs through a shared work queue - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: optional 
environment override for how many PDFs one Docling worker processes per extractor batch; GlossAPI keeps the default at `1` until a benchmark proves a larger batch is safe on the target node + - `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: optional environment override for the page budget of each queued multi-GPU Docling work item; use it with benchmark checkpoints when long PDFs dominate the tail - `export_doc_json=True`: write `json/.docling.json(.zst)` - `emit_formula_index=True`: also write `json/.formula_index.jsonl` - Main outputs: diff --git a/docs/configuration.md b/docs/configuration.md index f03c521..e25bcf6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -23,6 +23,7 @@ Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: override the number of PDF documents a single Phase‑1 Docling worker processes per extractor batch. Defaults to `1` in GlossAPI for stability; raise it deliberately when benchmarking fresh A100 nodes. +- `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: target page budget for each queued multi‑GPU Docling work item. Defaults to `256`; lower it when a single worker hoards long PDFs, raise it when a strong GPU can keep larger mixed bundles resident. - `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. - `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. - `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 8f36628..29bca72 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -1,7 +1,7 @@ # Multi‑GPU & Benchmarking GlossAPI can scale across multiple visible GPUs. Faster GPUs drain more work from a shared queue of **absolute -file paths**, so no worker rescans directories. 
+file paths or pre-packed work items, so no worker rescans directories. ## Extract (Phase‑1) on Multiple GPUs @@ -14,6 +14,7 @@ c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', worker - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. - `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. - `GLOSSAPI_DOCLING_MAX_BATCH_FILES` lets one Docling worker take more than one PDF per extractor batch; keep the default `1` for fresh-node stability and benchmark larger values explicitly. +- `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES` controls the page budget per queued multi-GPU Docling work item. The controller now sorts heavier work first and packs smaller PDFs toward that page budget so workers do not immediately collapse into a long single-file tail. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. 
diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index dc825f6..69d50cd 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -544,11 +544,20 @@ def extract( threads_effective = int(num_threads) if isinstance(num_threads, int) else max(2, 2 * max(1, len(devs))) workers_per_device = max(1, int(workers_per_device or 1)) - batch_hint = 1 + configured_batch_hint = 1 + if backend_choice == "docling": + try: + extractor = getattr(self, "extractor", None) + if extractor is not None: + configured_batch_hint = max( + 1, int(getattr(extractor, "max_batch_files", configured_batch_hint)) + ) + except Exception: + configured_batch_hint = _resolve_docling_max_batch_files() self.logger.info( - "Phase-1 config: backend=%s batch_size=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", + "Phase-1 config: backend=%s max_batch_files=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, - batch_hint, + configured_batch_hint, threads_effective, workers_per_device, bool(skip_existing), From 91769e8925474fefa2a731d9015ed396ce555e5d Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 15:22:49 +0300 Subject: [PATCH 62/93] Tune Docling page batching for extraction --- docs/api/corpus.md | 1 + docs/configuration.md | 1 + docs/multi_gpu.md | 1 + src/glossapi/ocr/docling/pipeline.py | 19 +++++++++++++++++ .../scripts/extract_checkpoint_benchmark.py | 15 +++++++++++++ tests/test_docling_pipeline_tuning.py | 21 +++++++++++++++++++ tests/test_extract_checkpoint_benchmark.py | 12 +++++++++++ 7 files changed, 70 insertions(+) diff --git a/docs/api/corpus.md b/docs/api/corpus.md index 22d0d21..9fd73a4 100644 --- a/docs/api/corpus.md +++ b/docs/api/corpus.md @@ -59,6 +59,7 @@ extract( - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: optional environment override 
for how many PDFs one Docling worker processes per extractor batch; GlossAPI keeps the default at `1` until a benchmark proves a larger batch is safe on the target node - `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: optional environment override for the page budget of each queued multi-GPU Docling work item; use it with benchmark checkpoints when long PDFs dominate the tail + - `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE`: optional environment override for Docling's internal `settings.perf.page_batch_size`; use it when a GPU can hold more pages in flight than the default internal batch window - `export_doc_json=True`: write `json/.docling.json(.zst)` - `emit_formula_index=True`: also write `json/.formula_index.jsonl` - Main outputs: diff --git a/docs/configuration.md b/docs/configuration.md index e25bcf6..870babf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -25,6 +25,7 @@ These optional knobs map directly to current Docling `PdfPipelineOptions` fields - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: override the number of PDF documents a single Phase‑1 Docling worker processes per extractor batch. Defaults to `1` in GlossAPI for stability; raise it deliberately when benchmarking fresh A100 nodes. - `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: target page budget for each queued multi‑GPU Docling work item. Defaults to `256`; lower it when a single worker hoards long PDFs, raise it when a strong GPU can keep larger mixed bundles resident. - `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. +- `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE`: override Docling `settings.perf.page_batch_size` so Phase‑1 can raise or lower the number of pages each device keeps in flight internally without changing GlossAPI queue semantics. - `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. - `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. 
- `GLOSSAPI_DOCLING_QUEUE_MAX_SIZE`: override Docling `queue_max_size`. diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index 29bca72..c06efe8 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -15,6 +15,7 @@ c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', worker - `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100. - `GLOSSAPI_DOCLING_MAX_BATCH_FILES` lets one Docling worker take more than one PDF per extractor batch; keep the default `1` for fresh-node stability and benchmark larger values explicitly. - `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES` controls the page budget per queued multi-GPU Docling work item. The controller now sorts heavier work first and packs smaller PDFs toward that page budget so workers do not immediately collapse into a long single-file tail. +- `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE` controls Docling's internal per-device page window (`settings.perf.page_batch_size`). Use it together with the outer queue page budget when you want steadier GPU residency instead of just fatter file bundles. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. 
diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py index 34b4192..df23030 100644 --- a/src/glossapi/ocr/docling/pipeline.py +++ b/src/glossapi/ocr/docling/pipeline.py @@ -26,6 +26,11 @@ except ImportError: # pragma: no cover - older Docling versions ThreadedPdfPipelineOptions = None +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.settings import settings as docling_settings +except ImportError: # pragma: no cover - older Docling versions + docling_settings = None + def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: """Return accelerator options and whether CUDA was requested.""" @@ -141,6 +146,20 @@ def _apply_runtime_overrides(opts: PdfPipelineOptions) -> None: except Exception: pass + raw_page_batch_size = os.getenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE") + if raw_page_batch_size and docling_settings is not None: + try: + page_batch_size = int(raw_page_batch_size) + except ValueError: + page_batch_size = 0 + if page_batch_size > 0: + try: + perf_settings = getattr(docling_settings, "perf", None) + if perf_settings is not None and hasattr(perf_settings, "page_batch_size"): + setattr(perf_settings, "page_batch_size", page_batch_size) + except Exception: + pass + def build_layout_pipeline( *, diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py index 72d6d6e..ccbffa9 100644 --- a/src/glossapi/scripts/extract_checkpoint_benchmark.py +++ b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -3,6 +3,7 @@ import argparse import hashlib import json +import os import re import shutil import time @@ -14,6 +15,19 @@ HEADER_RE = re.compile(r"(?m)^[ \t]{0,3}#{1,6}\s+\S") +TUNING_ENV_VARS = ( + "GLOSSAPI_DOCLING_MAX_BATCH_FILES", + "GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", + "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE", + "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE", + "GLOSSAPI_DOCLING_OCR_BATCH_SIZE", + 
"GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", +) + + +def _runtime_env_snapshot() -> Dict[str, str]: + return {name: os.getenv(name, "") for name in TUNING_ENV_VARS} + def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p = argparse.ArgumentParser( @@ -204,6 +218,7 @@ def main(argv: Optional[List[str]] = None) -> int: "devices": list(args.devices) if args.devices else [], "workers_per_device": int(args.workers_per_device), "benchmark_mode": bool(args.benchmark_mode), + "runtime_env": _runtime_env_snapshot(), "markdown_present": markdown_present, "markdown_missing": int(len(pdf_paths) - markdown_present), "markdown_inventory": inventory, diff --git a/tests/test_docling_pipeline_tuning.py b/tests/test_docling_pipeline_tuning.py index 1978db2..d57aadb 100644 --- a/tests/test_docling_pipeline_tuning.py +++ b/tests/test_docling_pipeline_tuning.py @@ -12,3 +12,24 @@ def test_apply_common_pdf_options_prefers_threaded_pipeline_options_when_availab expected_cls = docling_pipeline.ThreadedPdfPipelineOptions or docling_pipeline.PdfPipelineOptions assert isinstance(opts, expected_cls) + + +def test_apply_runtime_overrides_updates_docling_page_batch_size(monkeypatch): + class Perf: + page_batch_size = 4 + + class Settings: + perf = Perf() + + monkeypatch.setenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", "8") + monkeypatch.setattr(docling_pipeline, "docling_settings", Settings(), raising=False) + + acc, _ = docling_pipeline._resolve_accelerator("cuda:0") + docling_pipeline._apply_common_pdf_options( + acc=acc, + images_scale=1.25, + formula_enrichment=False, + code_enrichment=False, + ) + + assert Settings.perf.page_batch_size == 8 diff --git a/tests/test_extract_checkpoint_benchmark.py b/tests/test_extract_checkpoint_benchmark.py index e8ac27b..b87a887 100644 --- a/tests/test_extract_checkpoint_benchmark.py +++ b/tests/test_extract_checkpoint_benchmark.py @@ -44,3 +44,15 @@ def test_inventory_markdown_marks_missing_files(tmp_path): assert inventory["sample"]["present"] is 
False assert inventory["sample"]["byte_size"] == 0 assert inventory["sample"]["header_count"] == 0 + + +def test_runtime_env_snapshot_captures_docling_batch_knobs(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + monkeypatch.setenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", "384") + monkeypatch.setenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", "8") + + snapshot = benchmark._runtime_env_snapshot() + + assert snapshot["GLOSSAPI_DOCLING_MAX_BATCH_FILES"] == "2" + assert snapshot["GLOSSAPI_DOCLING_BATCH_TARGET_PAGES"] == "384" + assert snapshot["GLOSSAPI_DOCLING_PAGE_BATCH_SIZE"] == "8" From 9c30ab895bb70aa9fb484232508bf9a434e23150 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 15:28:05 +0300 Subject: [PATCH 63/93] Add explicit extract benchmark tuning flags --- .../scripts/extract_checkpoint_benchmark.py | 24 +++++++++++++++++ tests/test_corpus_guards.py | 1 + tests/test_extract_checkpoint_benchmark.py | 27 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py index ccbffa9..ec5800d 100644 --- a/src/glossapi/scripts/extract_checkpoint_benchmark.py +++ b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -24,6 +24,15 @@ "GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", ) +TUNING_ARG_TO_ENV = { + "docling_max_batch_files": "GLOSSAPI_DOCLING_MAX_BATCH_FILES", + "docling_batch_target_pages": "GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", + "docling_layout_batch_size": "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE", + "docling_table_batch_size": "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE", + "docling_ocr_batch_size": "GLOSSAPI_DOCLING_OCR_BATCH_SIZE", + "docling_page_batch_size": "GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", +} + def _runtime_env_snapshot() -> Dict[str, str]: return {name: os.getenv(name, "") for name in TUNING_ENV_VARS} @@ -48,12 +57,26 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--devices", nargs="*", 
type=int, default=None) p.add_argument("--workers-per-device", type=int, default=1) p.add_argument("--benchmark-mode", action="store_true") + p.add_argument("--docling-max-batch-files", type=int, default=None) + p.add_argument("--docling-batch-target-pages", type=int, default=None) + p.add_argument("--docling-layout-batch-size", type=int, default=None) + p.add_argument("--docling-table-batch-size", type=int, default=None) + p.add_argument("--docling-ocr-batch-size", type=int, default=None) + p.add_argument("--docling-page-batch-size", type=int, default=None) p.add_argument("--filenames", nargs="*", default=[]) p.add_argument("--clean-output-dir", action="store_true") p.add_argument("--log-level", default="INFO") return p.parse_args(argv) +def _apply_cli_tuning_overrides(args: argparse.Namespace) -> None: + for arg_name, env_name in TUNING_ARG_TO_ENV.items(): + value = getattr(args, arg_name, None) + if value is None: + continue + os.environ[env_name] = str(int(value)) + + def _count_pdf_pages(pdf_path: Path) -> int: try: import fitz @@ -163,6 +186,7 @@ def _load_baseline_inventory(path: Path) -> Dict[str, Dict[str, Any]]: def main(argv: Optional[List[str]] = None) -> int: args = _parse_args(argv) + _apply_cli_tuning_overrides(args) input_dir = Path(args.input_dir).expanduser().resolve() output_dir = Path(args.output_dir).expanduser().resolve() report_path = Path(args.report_path).expanduser().resolve() diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 525db7a..8997a20 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -115,6 +115,7 @@ def test_prime_extractor_configures_docling_backend_explicitly(tmp_path, monkeyp corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() + monkeypatch.delenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", raising=False) set_torch_stub(monkeypatch, available=True, device_count=2) corpus.prime_extractor( input_format="pdf", diff --git a/tests/test_extract_checkpoint_benchmark.py 
b/tests/test_extract_checkpoint_benchmark.py index b87a887..aefa3e5 100644 --- a/tests/test_extract_checkpoint_benchmark.py +++ b/tests/test_extract_checkpoint_benchmark.py @@ -56,3 +56,30 @@ def test_runtime_env_snapshot_captures_docling_batch_knobs(monkeypatch): assert snapshot["GLOSSAPI_DOCLING_MAX_BATCH_FILES"] == "2" assert snapshot["GLOSSAPI_DOCLING_BATCH_TARGET_PAGES"] == "384" assert snapshot["GLOSSAPI_DOCLING_PAGE_BATCH_SIZE"] == "8" + + +def test_apply_cli_tuning_overrides_sets_docling_env(monkeypatch): + for env_name in benchmark.TUNING_ENV_VARS: + monkeypatch.delenv(env_name, raising=False) + + args = benchmark._parse_args( + [ + "--input-dir", + "/tmp/in", + "--output-dir", + "/tmp/out", + "--report-path", + "/tmp/report.json", + "--docling-max-batch-files", + "2", + "--docling-batch-target-pages", + "512", + "--docling-page-batch-size", + "8", + ] + ) + benchmark._apply_cli_tuning_overrides(args) + + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_MAX_BATCH_FILES"] == "2" + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_BATCH_TARGET_PAGES"] == "512" + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_PAGE_BATCH_SIZE"] == "8" From 1f8204a66774e5dff7e1bdb29b846b734391cf57 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 15:35:50 +0300 Subject: [PATCH 64/93] Respect extraction queue batch boundaries --- src/glossapi/corpus/corpus_orchestrator.py | 34 ++----------- tests/test_corpus_guards.py | 57 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index 276c060..3feb7ec 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -574,22 +574,9 @@ def _report_batch(ok_list, bad_list): c.extractor.batch_result_callback = _report_batch except Exception as _e: print(f"[GPU{device_id}] Unable to set batch callback: {_e}") - # Prepare persistent 
extractor in this worker on first call - # Process queue items in small batches to reduce function-call overhead - batch: list[str] = [] - try: - _batch_env = int(str(_os.environ.get("GLOSSAPI_GPU_BATCH_SIZE", "")).strip() or 0) - except Exception: - _batch_env = 0 - default_batch = 5 - try: - extractor = getattr(c, "extractor", None) - if extractor is not None: - configured = int(getattr(extractor, "max_batch_files", default_batch)) - default_batch = max(1, configured) - except Exception: - pass - BATCH_SIZE = max(1, _batch_env) if _batch_env else max(1, default_batch) + # The controller already shapes queue items for multi-GPU extraction. Workers + # should execute those queue items as-is rather than re-batching them locally, + # otherwise long PDFs can be accidentally merged back into tail-heavy bundles. import queue as _queue last_progress = _time.time() processed = 0 @@ -646,10 +633,6 @@ def _run_batch(batch_items: List[str]) -> None: try: work_item = work_q.get_nowait() except _queue.Empty: - # queue.Empty or other -> flush any pending batch then exit - if batch: - _run_batch(batch) - batch.clear() break except Exception as exc: exit_code = 1 @@ -658,16 +641,7 @@ def _run_batch(batch_items: List[str]) -> None: normalized = _normalize_work_item(work_item) if not normalized: continue - if len(normalized) > 1: - if batch: - _run_batch(batch) - batch.clear() - _run_batch(normalized) - continue - batch.extend(normalized) - if len(batch) >= BATCH_SIZE: - _run_batch(batch) - batch.clear() + _run_batch(normalized) # Occasional heartbeat if _time.time() - last_progress > 30: try: diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 8997a20..d6911ee 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -285,3 +285,60 @@ def extract(self, *, file_paths=None, **kwargs): assert exit_info.value.code == 0 assert processed_batches == [["doc-a.pdf", "doc-b.pdf"]] assert status_map == {} + + +def 
test_gpu_worker_keeps_singleton_queue_items_separate(tmp_path, monkeypatch): + import glossapi.corpus as corpus_mod + + processed_batches = [] + + class FakeCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + self.extractor = SimpleNamespace(max_batch_files=2) + + def prime_extractor(self, *args, **kwargs): + return None + + def extract(self, *, file_paths=None, **kwargs): + processed_batches.append(list(file_paths or [])) + return None + + monkeypatch.setattr(corpus_mod, "Corpus", FakeCorpus) + monkeypatch.setattr("glossapi.Corpus", FakeCorpus) + monkeypatch.delenv("GLOSSAPI_WORKER_LOG_DIR", raising=False) + + work_q = queue.Queue() + work_q.put("doc-a.pdf") + work_q.put("doc-b.pdf") + result_q = queue.Queue() + status_map: dict = {} + + with pytest.raises(SystemExit) as exit_info: + corpus_mod.gpu_extract_worker_queue( + device_id=0, + worker_slot=0, + worker_key="gpu0-w0", + in_dir=str(tmp_path), + out_dir=str(tmp_path), + work_q=work_q, + force=False, + fe=False, + ce=False, + use_cls_w=False, + skip=False, + input_fmt="pdf", + threads=1, + benchmark=False, + export_json=False, + emit_index=False, + backend="docling", + result_q=result_q, + status_map=status_map, + marker_dir=None, + ) + + assert exit_info.value.code == 0 + assert processed_batches == [["doc-a.pdf"], ["doc-b.pdf"]] + assert status_map == {} From f29bb442890a9c755188f4d2f64ee349729f2a0f Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 15:38:51 +0300 Subject: [PATCH 65/93] Apply Docling queue policy before worker startup --- src/glossapi/corpus/phase_extract.py | 44 +++++++++++++++------------- tests/test_phase_extract_tuning.py | 15 ++++++++++ 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index 69d50cd..476c3c6 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -158,6 +158,26 @@ def 
_build_extract_work_items( return [bundle_paths for bundle_paths, _ in work_items] +def _resolve_docling_queue_policy(extractor: Any | None = None) -> Tuple[int, int]: + """Return the Docling queue packing knobs the multi-GPU planner should use.""" + + max_batch_files = _resolve_docling_max_batch_files() + long_pdf_page_threshold = 600 + if extractor is None: + return max_batch_files, long_pdf_page_threshold + try: + max_batch_files = max(1, int(getattr(extractor, "max_batch_files", max_batch_files))) + except Exception: + max_batch_files = _resolve_docling_max_batch_files() + try: + long_pdf_page_threshold = max( + 1, int(getattr(extractor, "long_pdf_page_threshold", long_pdf_page_threshold)) + ) + except Exception: + long_pdf_page_threshold = 600 + return max_batch_files, long_pdf_page_threshold + + class ExtractPhaseMixin: def prime_extractor( self, @@ -546,14 +566,8 @@ def extract( workers_per_device = max(1, int(workers_per_device or 1)) configured_batch_hint = 1 if backend_choice == "docling": - try: - extractor = getattr(self, "extractor", None) - if extractor is not None: - configured_batch_hint = max( - 1, int(getattr(extractor, "max_batch_files", configured_batch_hint)) - ) - except Exception: - configured_batch_hint = _resolve_docling_max_batch_files() + extractor = getattr(self, "extractor", None) + configured_batch_hint, _ = _resolve_docling_queue_policy(extractor) self.logger.info( "Phase-1 config: backend=%s max_batch_files=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, @@ -604,18 +618,8 @@ def extract( configured_max_batch_files = 1 long_pdf_page_threshold = 600 work_items: List[List[Path]] = [[Path(p)] for p in pending_files] - try: - extractor = getattr(self, "extractor", None) - if extractor is not None: - configured_max_batch_files = max( - 1, int(getattr(extractor, "max_batch_files", configured_max_batch_files)) - ) - long_pdf_page_threshold = max( - 1, int(getattr(extractor, "long_pdf_page_threshold", 
long_pdf_page_threshold)) - ) - except Exception: - configured_max_batch_files = 1 - long_pdf_page_threshold = 600 + extractor = getattr(self, "extractor", None) + configured_max_batch_files, long_pdf_page_threshold = _resolve_docling_queue_policy(extractor) if backend_choice == "docling": batch_target_pages = _resolve_docling_batch_target_pages() work_items = _build_extract_work_items( diff --git a/tests/test_phase_extract_tuning.py b/tests/test_phase_extract_tuning.py index e5a3a35..3b32792 100644 --- a/tests/test_phase_extract_tuning.py +++ b/tests/test_phase_extract_tuning.py @@ -4,6 +4,7 @@ _build_extract_work_items, _resolve_docling_batch_target_pages, _resolve_docling_max_batch_files, + _resolve_docling_queue_policy, ) @@ -32,6 +33,20 @@ def test_resolve_docling_batch_target_pages_accepts_override(monkeypatch): assert _resolve_docling_batch_target_pages() == 384 +def test_resolve_docling_queue_policy_uses_env_when_extractor_is_unprimed(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + assert _resolve_docling_queue_policy(None) == (2, 600) + + +def test_resolve_docling_queue_policy_prefers_extractor_values(monkeypatch): + class Extractor: + max_batch_files = 3 + long_pdf_page_threshold = 900 + + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + assert _resolve_docling_queue_policy(Extractor()) == (3, 900) + + def test_build_extract_work_items_packs_smaller_files_by_page_budget(): paths = [Path("a.pdf"), Path("b.pdf"), Path("c.pdf"), Path("d.pdf")] pages = { From bc164547c52ed4253f5cf9035cd1e5740697f326 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 16:41:26 +0300 Subject: [PATCH 66/93] Add full pipeline sample checkpoint runner --- .../scripts/full_pipeline_checkpoint.py | 211 ++++++++++++++++++ tests/test_full_pipeline_checkpoint.py | 88 ++++++++ 2 files changed, 299 insertions(+) create mode 100644 src/glossapi/scripts/full_pipeline_checkpoint.py create mode 100644 tests/test_full_pipeline_checkpoint.py 
diff --git a/src/glossapi/scripts/full_pipeline_checkpoint.py b/src/glossapi/scripts/full_pipeline_checkpoint.py new file mode 100644 index 0000000..2fafff9 --- /dev/null +++ b/src/glossapi/scripts/full_pipeline_checkpoint.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import argparse +import json +import shutil +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from glossapi import Corpus +from glossapi.scripts.extract_checkpoint_benchmark import _apply_cli_tuning_overrides + + +def _parse_int_list(values: Optional[List[int]]) -> List[int]: + return list(values or []) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.full_pipeline_checkpoint", + description=( + "Run a sample GlossAPI pipeline checkpoint from extract through JSONL export " + "and write a compact timing/continuity report." + ), + ) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--export-path", required=True) + p.add_argument("--report-path", required=True) + p.add_argument("--clean-output-dir", action="store_true") + + p.add_argument("--phase1-backend", default="docling", choices=["auto", "safe", "docling"]) + p.add_argument("--accel-type", default="CUDA") + p.add_argument("--num-threads", type=int, default=1) + p.add_argument("--use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--devices", nargs="*", type=int, default=None) + p.add_argument("--workers-per-device", type=int, default=1) + p.add_argument("--benchmark-mode", action="store_true") + p.add_argument("--filenames", nargs="*", default=[]) + p.add_argument("--drop-bad", action="store_true") + + p.add_argument("--docling-max-batch-files", type=int, default=None) + p.add_argument("--docling-batch-target-pages", type=int, default=None) + p.add_argument("--docling-layout-batch-size", 
type=int, default=None) + p.add_argument("--docling-table-batch-size", type=int, default=None) + p.add_argument("--docling-ocr-batch-size", type=int, default=None) + p.add_argument("--docling-page-batch-size", type=int, default=None) + + p.add_argument("--ocr-backend", default="deepseek") + p.add_argument("--ocr-runtime-backend", default="vllm") + p.add_argument("--ocr-use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--ocr-devices", nargs="*", type=int, default=None) + p.add_argument("--ocr-workers-per-gpu", type=int, default=1) + p.add_argument("--ocr-vllm-batch-size", type=int, default=None) + p.add_argument("--ocr-target-batch-pages", type=int, default=160) + p.add_argument("--ocr-render-dpi", type=int, default=None) + p.add_argument("--ocr-scheduler", default="auto") + p.add_argument("--ocr-math-enhance", action="store_true") + + p.add_argument("--text-key", default="text") + p.add_argument("--metadata-key", default="pipeline_metadata") + return p.parse_args(argv) + + +def _read_metadata_counts(parquet_path: Path) -> Dict[str, int]: + if not parquet_path.exists(): + return { + "rows_total": 0, + "needs_ocr_true": 0, + "ocr_success_true": 0, + "text_nonempty": 0, + } + df = pd.read_parquet(parquet_path) + if df.empty: + return { + "rows_total": 0, + "needs_ocr_true": 0, + "ocr_success_true": 0, + "text_nonempty": 0, + } + text_series = df["text"] if "text" in df.columns else pd.Series([], dtype=object) + text_nonempty = int( + sum(bool(str(value).strip()) for value in text_series.fillna("").tolist()) + ) if len(text_series) else 0 + needs_ocr_true = int(df["needs_ocr"].fillna(False).astype(bool).sum()) if "needs_ocr" in df.columns else 0 + ocr_success_true = int(df["ocr_success"].fillna(False).astype(bool).sum()) if "ocr_success" in df.columns else 0 + return { + "rows_total": int(len(df)), + "needs_ocr_true": needs_ocr_true, + "ocr_success_true": ocr_success_true, + "text_nonempty": text_nonempty, + } + + +def 
_count_jsonl_records(path: Path) -> int: + if not path.exists(): + return 0 + with path.open("r", encoding="utf-8") as fp: + return sum(1 for line in fp if line.strip()) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + _apply_cli_tuning_overrides(args) + + input_dir = Path(args.input_dir).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + export_path = Path(args.export_path).expanduser().resolve() + report_path = Path(args.report_path).expanduser().resolve() + + if bool(args.clean_output_dir) and output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + report_path.parent.mkdir(parents=True, exist_ok=True) + export_path.parent.mkdir(parents=True, exist_ok=True) + + corpus = Corpus(input_dir=input_dir, output_dir=output_dir) + metadata_path = output_dir / "download_results" / "download_results.parquet" + + started_at = time.time() + + extract_start = time.perf_counter() + corpus.extract( + input_format="pdf", + accel_type=str(args.accel_type), + num_threads=int(args.num_threads), + phase1_backend=str(args.phase1_backend), + use_gpus=str(args.use_gpus), + devices=_parse_int_list(args.devices), + workers_per_device=int(args.workers_per_device), + benchmark_mode=bool(args.benchmark_mode), + filenames=list(args.filenames or []), + ) + extract_elapsed = float(time.perf_counter() - extract_start) + post_extract_counts = _read_metadata_counts(metadata_path) + + clean_start = time.perf_counter() + corpus.clean(drop_bad=bool(args.drop_bad)) + clean_elapsed = float(time.perf_counter() - clean_start) + post_clean_counts = _read_metadata_counts(metadata_path) + + ocr_start = time.perf_counter() + corpus.ocr( + backend=str(args.ocr_backend), + runtime_backend=str(args.ocr_runtime_backend), + use_gpus=str(args.ocr_use_gpus), + devices=_parse_int_list(args.ocr_devices), + workers_per_gpu=int(args.ocr_workers_per_gpu), + vllm_batch_size=args.ocr_vllm_batch_size, + 
target_batch_pages=int(args.ocr_target_batch_pages), + render_dpi=args.ocr_render_dpi, + scheduler=str(args.ocr_scheduler), + math_enhance=bool(args.ocr_math_enhance), + ) + ocr_elapsed = float(time.perf_counter() - ocr_start) + post_ocr_counts = _read_metadata_counts(metadata_path) + + export_start = time.perf_counter() + corpus.jsonl( + export_path, + text_key=str(args.text_key), + metadata_key=str(args.metadata_key), + include_remaining_metadata=False, + metadata_path=metadata_path, + ) + export_elapsed = float(time.perf_counter() - export_start) + export_records = _count_jsonl_records(export_path) + + finished_at = time.time() + report: Dict[str, Any] = { + "input_dir": str(input_dir), + "output_dir": str(output_dir), + "export_path": str(export_path), + "metadata_path": str(metadata_path), + "started_at": int(started_at), + "finished_at": int(finished_at), + "elapsed_total_sec": float(finished_at - started_at), + "extract_elapsed_sec": extract_elapsed, + "clean_elapsed_sec": clean_elapsed, + "ocr_elapsed_sec": ocr_elapsed, + "export_elapsed_sec": export_elapsed, + "post_extract_counts": post_extract_counts, + "post_clean_counts": post_clean_counts, + "post_ocr_counts": post_ocr_counts, + "export_records": int(export_records), + } + report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print( + json.dumps( + { + "extract_elapsed_sec": round(extract_elapsed, 3), + "clean_elapsed_sec": round(clean_elapsed, 3), + "ocr_elapsed_sec": round(ocr_elapsed, 3), + "export_elapsed_sec": round(export_elapsed, 3), + "rows_total": post_ocr_counts["rows_total"], + "needs_ocr_after_clean": post_clean_counts["needs_ocr_true"], + "ocr_success_after_ocr": post_ocr_counts["ocr_success_true"], + "export_records": int(export_records), + "report_path": str(report_path), + }, + indent=2, + sort_keys=True, + ) + ) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/tests/test_full_pipeline_checkpoint.py 
b/tests/test_full_pipeline_checkpoint.py new file mode 100644 index 0000000..7991879 --- /dev/null +++ b/tests/test_full_pipeline_checkpoint.py @@ -0,0 +1,88 @@ +import json + +import pandas as pd + +from glossapi.scripts import full_pipeline_checkpoint as checkpoint + + +def test_read_metadata_counts_handles_missing_and_populated_parquet(tmp_path): + missing = checkpoint._read_metadata_counts(tmp_path / "missing.parquet") + assert missing["rows_total"] == 0 + + parquet_path = tmp_path / "download_results.parquet" + pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}, + {"filename": "b.pdf", "needs_ocr": False, "ocr_success": True, "text": "hello"}, + ] + ).to_parquet(parquet_path, index=False) + + counts = checkpoint._read_metadata_counts(parquet_path) + assert counts == { + "rows_total": 2, + "needs_ocr_true": 1, + "ocr_success_true": 1, + "text_nonempty": 1, + } + + +def test_full_pipeline_checkpoint_main_writes_summary(tmp_path, monkeypatch): + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + md = self.output_dir / "markdown" + md.mkdir(parents=True, exist_ok=True) + (md / "doc.md").write_text("raw text", encoding="utf-8") + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def ocr(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def 
jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + (input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n") + + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + ] + ) + + assert rc == 0 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert report["post_clean_counts"]["needs_ocr_true"] == 1 + assert report["post_ocr_counts"]["ocr_success_true"] == 1 + assert report["export_records"] == 1 From 0ba41c708c1b01bd21e9d29d56375de62c2d4b83 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 17:24:50 +0300 Subject: [PATCH 67/93] Prefer repo DeepSeek runtime for OCR workers --- src/glossapi/ocr/deepseek/preflight.py | 9 +-- src/glossapi/ocr/deepseek/runner.py | 8 +-- src/glossapi/ocr/deepseek/runtime_paths.py | 83 ++++++++++++++++++++++ tests/test_deepseek_multi_gpu_runtime.py | 13 ++++ tests/test_deepseek_runner_contract.py | 34 +++++++++ 5 files changed, 135 insertions(+), 12 deletions(-) create mode 100644 src/glossapi/ocr/deepseek/runtime_paths.py diff --git a/src/glossapi/ocr/deepseek/preflight.py b/src/glossapi/ocr/deepseek/preflight.py index 6669707..b8638b1 100644 --- a/src/glossapi/ocr/deepseek/preflight.py +++ b/src/glossapi/ocr/deepseek/preflight.py @@ -4,10 +4,11 @@ import dataclasses import os -import sys from pathlib import Path from typing import Dict, Iterable, List, Optional +from .runtime_paths import resolve_deepseek_python + REPO_ROOT = Path(__file__).resolve().parents[4] DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" DEFAULT_MODEL_DIR = REPO_ROOT / 
"deepseek-ocr-2-model" / "DeepSeek-OCR-2" @@ -88,11 +89,7 @@ def check_deepseek_env( ) _ensure_path(script, "runner_script", errors) - python_bin = Path( - env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") - or env.get("GLOSSAPI_DEEPSEEK_PYTHON") - or sys.executable - ) + python_bin = resolve_deepseek_python(env=env) _ensure_path(python_bin, "deepseek_python", errors) model_dir = Path( diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index fb91d60..0731228 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -25,6 +25,7 @@ build_whole_document_slices, pack_slices_into_batches, ) +from glossapi.ocr.deepseek.runtime_paths import resolve_deepseek_python from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs from glossapi.ocr.deepseek.work_queue import ( STATUS_DONE, @@ -1433,12 +1434,7 @@ def run_for_files( if not script_path.exists(): raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") - python_exe = Path( - python_bin - or os.environ.get("GLOSSAPI_DEEPSEEK_PYTHON", "") - or os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", "") - or sys.executable - ) + python_exe = resolve_deepseek_python(explicit_python=python_bin) if not python_exe.exists(): raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") diff --git a/src/glossapi/ocr/deepseek/runtime_paths.py b/src/glossapi/ocr/deepseek/runtime_paths.py new file mode 100644 index 0000000..7451304 --- /dev/null +++ b/src/glossapi/ocr/deepseek/runtime_paths.py @@ -0,0 +1,83 @@ +"""Resolve DeepSeek runtime paths for split-runtime GlossAPI installs.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Dict, Iterable, List, Optional + +REPO_ROOT = Path(__file__).resolve().parents[4] + + +def _candidate_deepseek_pythons( + *, + explicit_python: Optional[Path | str] = None, + env: 
Optional[Dict[str, str]] = None, + repo_root: Optional[Path] = None, +) -> List[Path]: + resolved_env = dict(env or os.environ) + root = Path(repo_root) if repo_root is not None else REPO_ROOT + + candidates: List[Path] = [] + + def _append(candidate: Optional[Path | str]) -> None: + if not candidate: + return + path = Path(candidate).expanduser() + if path not in candidates: + candidates.append(path) + + _append(explicit_python) + _append(resolved_env.get("GLOSSAPI_DEEPSEEK_PYTHON")) + _append(resolved_env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON")) + + venv_root = root / "dependency_setup" / ".venvs" + preferred_names = ("deepseek", "deepseek31111") + for name in preferred_names: + _append(venv_root / name / "bin" / "python") + if venv_root.exists(): + for candidate in sorted(venv_root.glob("deepseek*/bin/python")): + _append(candidate) + + _append(sys.executable) + return candidates + + +def resolve_deepseek_python( + *, + explicit_python: Optional[Path | str] = None, + env: Optional[Dict[str, str]] = None, + repo_root: Optional[Path] = None, +) -> Path: + """Return the best available DeepSeek Python interpreter path. + + Preference order: + 1. explicit function argument + 2. explicit environment override + 3. validated repo-local DeepSeek venv(s) + 4. 
current process interpreter + """ + + resolved_env = dict(env or os.environ) + explicit_candidate = Path(explicit_python).expanduser() if explicit_python else None + if explicit_candidate is not None: + return explicit_candidate + + for key in ("GLOSSAPI_DEEPSEEK_PYTHON", "GLOSSAPI_DEEPSEEK_TEST_PYTHON"): + raw = resolved_env.get(key) + if raw: + return Path(raw).expanduser() + + candidates = _candidate_deepseek_pythons( + explicit_python=None, + env={}, + repo_root=repo_root, + ) + for candidate in candidates: + if candidate.exists(): + return candidate + return candidates[0] + + +__all__ = ["resolve_deepseek_python"] diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py index b2a7c01..ca001c8 100644 --- a/tests/test_deepseek_multi_gpu_runtime.py +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -39,6 +39,19 @@ def test_build_env_uses_virtualenv_path_when_python_bin_is_symlink(tmp_path): assert str(cuda_runtime_lib) in ld_entries +def test_resolve_deepseek_python_prefers_repo_local_runtime(tmp_path): + from glossapi.ocr.deepseek import runtime_paths + + repo_root = tmp_path / "repo" + python_bin = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("", encoding="utf-8") + + resolved = runtime_paths.resolve_deepseek_python(env={}, repo_root=repo_root) + + assert resolved == python_bin + + def test_work_queue_requeues_stale_running_batch(tmp_path): from glossapi.ocr.deepseek import work_queue diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py index 8a07c57..1c7fef6 100644 --- a/tests/test_deepseek_runner_contract.py +++ b/tests/test_deepseek_runner_contract.py @@ -615,6 +615,40 @@ def fake_run_cli(input_dir, output_dir, **kwargs): assert result["doc"]["page_count"] == 1 +def test_runner_prefers_repo_local_deepseek_runtime_when_env_missing(tmp_path, monkeypatch): + from 
glossapi.ocr.deepseek import runner, runtime_paths + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + repo_root = tmp_path / "repo" + python_bin = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("", encoding="utf-8") + monkeypatch.setattr(runtime_paths, "REPO_ROOT", repo_root) + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["python_bin"] = kwargs["python_bin"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.delenv("GLOSSAPI_DEEPSEEK_PYTHON", raising=False) + monkeypatch.delenv("GLOSSAPI_DEEPSEEK_TEST_PYTHON", raising=False) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["python_bin"] == python_bin + assert result["doc"]["page_count"] == 1 + + def test_runner_forwards_scheduler_controls_to_multi_cli(tmp_path, monkeypatch): from glossapi.ocr.deepseek import runner From e9f73c2bdec4f05fbf3a9139cf699b103db4c8d3 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 18:08:56 +0300 Subject: [PATCH 68/93] Prefer validated DeepSeek runtimes --- docs/configuration.md | 2 +- docs/getting_started.md | 1 + src/glossapi/ocr/deepseek/runtime_paths.py | 16 ++++++++++++---- tests/test_deepseek_multi_gpu_runtime.py | 16 ++++++++++++++++ 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 870babf..98f2687 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ 
-47,7 +47,7 @@ The dedicated uv profile is OCR-only and does not install the Docling extraction - `GLOSSAPI_DEEPSEEK_ALLOW_STUB`: must remain `0`; stub execution is rejected. - `GLOSSAPI_DEEPSEEK_ALLOW_CLI`: keep at `1` to require the real runtime. -- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. +- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. When this is unset, GlossAPI now prefers a repo-local version-pinned DeepSeek runtime under `dependency_setup/.venvs/deepseek*` before falling back to the generic `deepseek` alias and finally the current process interpreter. - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`: override path to the OCR runner script (defaults to `src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`). - `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. diff --git a/docs/getting_started.md b/docs/getting_started.md index 97e3905..a53518c 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -63,6 +63,7 @@ Then pass that interpreter explicitly to the setup scripts: - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- If `GLOSSAPI_DEEPSEEK_PYTHON` is unset, GlossAPI now searches for a repo-local version-pinned DeepSeek runtime under `dependency_setup/.venvs/deepseek*` before falling back to the generic `deepseek` alias and then the current process interpreter. Keep the env var set when you need an explicit override; broken explicit paths are treated as configuration errors, not silently ignored. 
- Standard OCR defaults after setup: - `runtime_backend='vllm'` - `ocr_profile='markdown_grounded'` diff --git a/src/glossapi/ocr/deepseek/runtime_paths.py b/src/glossapi/ocr/deepseek/runtime_paths.py index 7451304..a442010 100644 --- a/src/glossapi/ocr/deepseek/runtime_paths.py +++ b/src/glossapi/ocr/deepseek/runtime_paths.py @@ -10,6 +10,17 @@ REPO_ROOT = Path(__file__).resolve().parents[4] +def _runtime_sort_key(candidate: Path) -> tuple[int, int, str]: + name = candidate.parent.parent.name + if name == "deepseek": + return (1, 0, name) + if name.startswith("deepseek"): + suffix = name[len("deepseek") :] + if suffix.isdigit(): + return (0, -int(suffix), name) + return (2, 0, name) + + def _candidate_deepseek_pythons( *, explicit_python: Optional[Path | str] = None, @@ -33,11 +44,8 @@ def _append(candidate: Optional[Path | str]) -> None: _append(resolved_env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON")) venv_root = root / "dependency_setup" / ".venvs" - preferred_names = ("deepseek", "deepseek31111") - for name in preferred_names: - _append(venv_root / name / "bin" / "python") if venv_root.exists(): - for candidate in sorted(venv_root.glob("deepseek*/bin/python")): + for candidate in sorted(venv_root.glob("deepseek*/bin/python"), key=_runtime_sort_key): _append(candidate) _append(sys.executable) diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py index ca001c8..e465949 100644 --- a/tests/test_deepseek_multi_gpu_runtime.py +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -52,6 +52,22 @@ def test_resolve_deepseek_python_prefers_repo_local_runtime(tmp_path): assert resolved == python_bin +def test_resolve_deepseek_python_prefers_versioned_runtime_over_generic_alias(tmp_path): + from glossapi.ocr.deepseek import runtime_paths + + repo_root = tmp_path / "repo" + generic = repo_root / "dependency_setup" / ".venvs" / "deepseek" / "bin" / "python" + versioned = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / 
"python" + generic.parent.mkdir(parents=True, exist_ok=True) + versioned.parent.mkdir(parents=True, exist_ok=True) + generic.write_text("", encoding="utf-8") + versioned.write_text("", encoding="utf-8") + + resolved = runtime_paths.resolve_deepseek_python(env={}, repo_root=repo_root) + + assert resolved == versioned + + def test_work_queue_requeues_stale_running_batch(tmp_path): from glossapi.ocr.deepseek import work_queue From 3a3a401b83c70f43237ffd6bba0c619ad1b34992 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 18:12:41 +0300 Subject: [PATCH 69/93] Allow full pipeline checkpoints to resume --- .../scripts/full_pipeline_checkpoint.py | 79 +++++++++++-------- tests/test_full_pipeline_checkpoint.py | 62 +++++++++++++++ 2 files changed, 110 insertions(+), 31 deletions(-) diff --git a/src/glossapi/scripts/full_pipeline_checkpoint.py b/src/glossapi/scripts/full_pipeline_checkpoint.py index 2fafff9..eabf355 100644 --- a/src/glossapi/scripts/full_pipeline_checkpoint.py +++ b/src/glossapi/scripts/full_pipeline_checkpoint.py @@ -30,6 +30,9 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--export-path", required=True) p.add_argument("--report-path", required=True) p.add_argument("--clean-output-dir", action="store_true") + p.add_argument("--skip-extract", action="store_true") + p.add_argument("--skip-clean", action="store_true") + p.add_argument("--skip-ocr", action="store_true") p.add_argument("--phase1-backend", default="docling", choices=["auto", "safe", "docling"]) p.add_argument("--accel-type", default="CUDA") @@ -120,41 +123,54 @@ def main(argv: Optional[List[str]] = None) -> int: metadata_path = output_dir / "download_results" / "download_results.parquet" started_at = time.time() - - extract_start = time.perf_counter() - corpus.extract( - input_format="pdf", - accel_type=str(args.accel_type), - num_threads=int(args.num_threads), - phase1_backend=str(args.phase1_backend), - use_gpus=str(args.use_gpus), - 
devices=_parse_int_list(args.devices), - workers_per_device=int(args.workers_per_device), - benchmark_mode=bool(args.benchmark_mode), - filenames=list(args.filenames or []), - ) - extract_elapsed = float(time.perf_counter() - extract_start) + skipped_phases: List[str] = [] + + if bool(args.skip_extract): + skipped_phases.append("extract") + extract_elapsed = 0.0 + else: + extract_start = time.perf_counter() + corpus.extract( + input_format="pdf", + accel_type=str(args.accel_type), + num_threads=int(args.num_threads), + phase1_backend=str(args.phase1_backend), + use_gpus=str(args.use_gpus), + devices=_parse_int_list(args.devices), + workers_per_device=int(args.workers_per_device), + benchmark_mode=bool(args.benchmark_mode), + filenames=list(args.filenames or []), + ) + extract_elapsed = float(time.perf_counter() - extract_start) post_extract_counts = _read_metadata_counts(metadata_path) - clean_start = time.perf_counter() - corpus.clean(drop_bad=bool(args.drop_bad)) - clean_elapsed = float(time.perf_counter() - clean_start) + if bool(args.skip_clean): + skipped_phases.append("clean") + clean_elapsed = 0.0 + else: + clean_start = time.perf_counter() + corpus.clean(drop_bad=bool(args.drop_bad)) + clean_elapsed = float(time.perf_counter() - clean_start) post_clean_counts = _read_metadata_counts(metadata_path) - ocr_start = time.perf_counter() - corpus.ocr( - backend=str(args.ocr_backend), - runtime_backend=str(args.ocr_runtime_backend), - use_gpus=str(args.ocr_use_gpus), - devices=_parse_int_list(args.ocr_devices), - workers_per_gpu=int(args.ocr_workers_per_gpu), - vllm_batch_size=args.ocr_vllm_batch_size, - target_batch_pages=int(args.ocr_target_batch_pages), - render_dpi=args.ocr_render_dpi, - scheduler=str(args.ocr_scheduler), - math_enhance=bool(args.ocr_math_enhance), - ) - ocr_elapsed = float(time.perf_counter() - ocr_start) + if bool(args.skip_ocr): + skipped_phases.append("ocr") + ocr_elapsed = 0.0 + else: + ocr_start = time.perf_counter() + corpus.ocr( + 
backend=str(args.ocr_backend), + runtime_backend=str(args.ocr_runtime_backend), + use_gpus=str(args.ocr_use_gpus), + devices=_parse_int_list(args.ocr_devices), + workers_per_gpu=int(args.ocr_workers_per_gpu), + vllm_batch_size=args.ocr_vllm_batch_size, + target_batch_pages=int(args.ocr_target_batch_pages), + render_dpi=args.ocr_render_dpi, + scheduler=str(args.ocr_scheduler), + math_enhance=bool(args.ocr_math_enhance), + ) + ocr_elapsed = float(time.perf_counter() - ocr_start) post_ocr_counts = _read_metadata_counts(metadata_path) export_start = time.perf_counter() @@ -177,6 +193,7 @@ def main(argv: Optional[List[str]] = None) -> int: "started_at": int(started_at), "finished_at": int(finished_at), "elapsed_total_sec": float(finished_at - started_at), + "skipped_phases": list(skipped_phases), "extract_elapsed_sec": extract_elapsed, "clean_elapsed_sec": clean_elapsed, "ocr_elapsed_sec": ocr_elapsed, diff --git a/tests/test_full_pipeline_checkpoint.py b/tests/test_full_pipeline_checkpoint.py index 7991879..57cf540 100644 --- a/tests/test_full_pipeline_checkpoint.py +++ b/tests/test_full_pipeline_checkpoint.py @@ -86,3 +86,65 @@ def jsonl(self, output_path, **kwargs): assert report["post_clean_counts"]["needs_ocr_true"] == 1 assert report["post_ocr_counts"]["ocr_success_true"] == 1 assert report["export_records"] == 1 + + +def test_full_pipeline_checkpoint_can_resume_from_ocr_phase(tmp_path, monkeypatch): + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + raise AssertionError("extract should have been skipped") + + def clean(self, **kwargs): + raise AssertionError("clean should have been skipped") + + def ocr(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, 
"ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + metadata_path = output_dir / "download_results" / "download_results.parquet" + metadata_path.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(metadata_path, index=False) + + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + "--skip-extract", + "--skip-clean", + ] + ) + + assert rc == 0 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert report["skipped_phases"] == ["extract", "clean"] + assert report["post_extract_counts"]["needs_ocr_true"] == 1 + assert report["post_ocr_counts"]["ocr_success_true"] == 1 + assert report["export_records"] == 1 From a8d2b93a9022c10178d24f85a508bf3426849a4e Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 19:13:44 +0300 Subject: [PATCH 70/93] Normalize OCR targets and expose repair packing --- src/glossapi/_naming.py | 4 ++ src/glossapi/corpus/phase_ocr_math.py | 50 ++++++++++++++-- .../scripts/full_pipeline_checkpoint.py | 4 ++ .../scripts/openarchives_ocr_run_node.py | 4 ++ tests/test_full_pipeline_checkpoint.py | 60 +++++++++++++++++++ tests/test_metadata_fallback.py | 2 + tests/test_ocr_backends_smoke.py | 41 +++++++++++++ tests/test_ocr_dispatch_backends.py | 37 ++++++++++++ 8 files changed, 198 insertions(+), 4 deletions(-) diff --git a/src/glossapi/_naming.py b/src/glossapi/_naming.py index 068b195..5f28434 100644 --- 
a/src/glossapi/_naming.py +++ b/src/glossapi/_naming.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path +import re from typing import Union _KNOWN_SUFFIXES = ( @@ -19,6 +20,8 @@ ".htm", ) +_PAGE_CHUNK_SUFFIX_RE = re.compile(r"__p\d{4,5}-\d{4,5}$") + def canonical_stem(value: Union[str, Path]) -> str: """Return a normalised stem for any pipeline artefact.""" @@ -33,6 +36,7 @@ def canonical_stem(value: Union[str, Path]) -> str: working = working[: -len(suffix)] stripped = True break + working = _PAGE_CHUNK_SUFFIX_RE.sub("", working) if working: return working fallback = Path(name).stem diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 125c289..28030ff 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -84,11 +84,14 @@ def _apply_ocr_success_updates( if column not in df_meta.columns: df_meta[column] = None + filename_series = df_meta["filename"].astype(str) + stem_series = filename_series.map(canonical_stem) + for fname in filenames: - mask = df_meta["filename"].astype(str) == str(fname) + stem = canonical_stem(fname) + mask = stem_series == stem if not bool(mask.any()): continue - stem = canonical_stem(fname) artifact_update = _build_ocr_stage_artifact_update( markdown_dir=markdown_dir, metrics_dir=metrics_dir, @@ -107,6 +110,27 @@ def _apply_ocr_success_updates( return df_meta +def _normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]: + """Collapse chunk-like metadata rows back to real OCR source files when possible.""" + + source_by_stem: Dict[str, str] = {} + try: + for path in sorted(Path(input_dir).glob("*.pdf")): + source_by_stem.setdefault(canonical_stem(path.name), path.name) + except Exception: + source_by_stem = {} + + normalized: List[str] = [] + seen: Set[str] = set() + for fname in filenames: + resolved = source_by_stem.get(canonical_stem(fname), str(fname)) + if resolved in seen: + continue + 
normalized.append(resolved) + seen.add(resolved) + return normalized + + class OcrMathPhaseMixin: def ocr( self, @@ -137,6 +161,8 @@ def ocr( gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, repair_mode: str = "auto", + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, scheduler: str = "auto", target_batch_pages: int = 160, shard_pages: int = 0, @@ -196,8 +222,11 @@ def ocr( - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode: Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first repair pipeline (plain fallback for garbage pages, tiled fallback for - short coverage failures). These are ignored by the transformers runtime - except for ``prompt_override``. + short coverage failures). ``repair_exec_batch_target_pages`` and + ``repair_exec_batch_target_items`` control how many pending repair rows + a worker tries to execute together once the global repair phase begins. + These are ignored by the transformers runtime except for + ``prompt_override``. - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - reprocess_completed: when False, skip documents already flagged as successfully OCRed or math-enriched in metadata. Set True to force reprocessing. 
Defaults to False @@ -357,6 +386,17 @@ def ocr( removed, ) try: + normalized_bad_files = _normalize_ocr_target_filenames( + filenames=bad_files, + input_dir=Path(self.input_dir), + ) + if len(normalized_bad_files) != len(bad_files): + self.logger.info( + "OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by canonical stem.", + len(bad_files), + len(normalized_bad_files), + ) + bad_files = normalized_bad_files self.logger.info( "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d", ocr_candidates_initial, @@ -727,6 +767,8 @@ def _run_math(stems: List[str]) -> None: gpu_memory_utilization=gpu_memory_utilization, disable_fp8_kv=disable_fp8_kv, repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, scheduler=scheduler, target_batch_pages=int(max(1, target_batch_pages)), shard_pages=int(max(0, shard_pages)), diff --git a/src/glossapi/scripts/full_pipeline_checkpoint.py b/src/glossapi/scripts/full_pipeline_checkpoint.py index eabf355..d0feb5b 100644 --- a/src/glossapi/scripts/full_pipeline_checkpoint.py +++ b/src/glossapi/scripts/full_pipeline_checkpoint.py @@ -57,6 +57,8 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--ocr-devices", nargs="*", type=int, default=None) p.add_argument("--ocr-workers-per-gpu", type=int, default=1) p.add_argument("--ocr-vllm-batch-size", type=int, default=None) + p.add_argument("--ocr-repair-exec-batch-target-pages", type=int, default=None) + p.add_argument("--ocr-repair-exec-batch-target-items", type=int, default=None) p.add_argument("--ocr-target-batch-pages", type=int, default=160) p.add_argument("--ocr-render-dpi", type=int, default=None) p.add_argument("--ocr-scheduler", default="auto") @@ -165,6 +167,8 @@ def main(argv: Optional[List[str]] = None) -> int: devices=_parse_int_list(args.ocr_devices), workers_per_gpu=int(args.ocr_workers_per_gpu), 
vllm_batch_size=args.ocr_vllm_batch_size, + repair_exec_batch_target_pages=args.ocr_repair_exec_batch_target_pages, + repair_exec_batch_target_items=args.ocr_repair_exec_batch_target_items, target_batch_pages=int(args.ocr_target_batch_pages), render_dpi=args.ocr_render_dpi, scheduler=str(args.ocr_scheduler), diff --git a/src/glossapi/scripts/openarchives_ocr_run_node.py b/src/glossapi/scripts/openarchives_ocr_run_node.py index 4ffdf41..aeb2751 100644 --- a/src/glossapi/scripts/openarchives_ocr_run_node.py +++ b/src/glossapi/scripts/openarchives_ocr_run_node.py @@ -53,6 +53,8 @@ def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: p.add_argument("--max-new-tokens", type=int, default=2048) p.add_argument("--render-dpi", type=int, default=144) p.add_argument("--repair-mode", default="auto") + p.add_argument("--repair-exec-batch-target-pages", type=int, default=None) + p.add_argument("--repair-exec-batch-target-items", type=int, default=None) p.add_argument("--gpu-memory-utilization", type=float, default=0.9) return p.parse_args(argv) @@ -348,6 +350,8 @@ def main(argv: Optional[List[str]] = None) -> int: render_dpi=int(args.render_dpi), max_new_tokens=int(args.max_new_tokens), repair_mode=str(args.repair_mode), + repair_exec_batch_target_pages=args.repair_exec_batch_target_pages, + repair_exec_batch_target_items=args.repair_exec_batch_target_items, scheduler=str(args.scheduler), target_batch_pages=int(args.target_batch_pages), shard_pages=int(args.shard_pages), diff --git a/tests/test_full_pipeline_checkpoint.py b/tests/test_full_pipeline_checkpoint.py index 57cf540..fe87bd9 100644 --- a/tests/test_full_pipeline_checkpoint.py +++ b/tests/test_full_pipeline_checkpoint.py @@ -148,3 +148,63 @@ def jsonl(self, output_path, **kwargs): assert report["post_extract_counts"]["needs_ocr_true"] == 1 assert report["post_ocr_counts"]["ocr_success_true"] == 1 assert report["export_records"] == 1 + + +def 
test_full_pipeline_checkpoint_forwards_repair_exec_batch_controls(tmp_path, monkeypatch): + captured = {} + + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + return None + + def ocr(self, **kwargs): + captured.update(kwargs) + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + "--ocr-repair-exec-batch-target-pages", + "64", + "--ocr-repair-exec-batch-target-items", + "24", + ] + ) + + assert rc == 0 + assert captured["repair_exec_batch_target_pages"] == 64 + assert captured["repair_exec_batch_target_items"] == 24 diff --git a/tests/test_metadata_fallback.py b/tests/test_metadata_fallback.py index f899f17..53524eb 100644 --- a/tests/test_metadata_fallback.py +++ b/tests/test_metadata_fallback.py @@ -210,6 +210,8 @@ def test_canonical_stem_variants(): "beta.metrics.json": "beta", "gamma.per_page.metrics.json": "gamma", "delta.with.dots.pdf": "delta.with.dots", + "needs__p0001-0002.pdf": 
"needs", + "needs__p00001-00096.md": "needs", } for source, expected in cases.items(): assert canonical_stem(source) == expected diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index e2c2133..096bf73 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -73,3 +73,44 @@ def fake_enrich(files=None, **kwargs): == hashlib.sha256(b"ds md\n").hexdigest() ) assert captured.get("files") == ["clean"], "Math-only should run for non-OCR stem only" + + +def test_deepseek_ocr_normalizes_chunk_rows_to_real_source_pdf(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + (corpus.input_dir / "needs.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + parquet_path = dl_dir / "download_results.parquet" + pd.DataFrame( + [ + {"filename": "needs.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + ] + ).to_parquet(parquet_path, index=False) + + from glossapi.ocr.deepseek import runner + + captured = {} + + def fake_run_for_files(self_ref, files, **kwargs): + captured["files"] = list(files) + markdown_dir = corpus.output_dir / "markdown" + metrics_dir = corpus.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (markdown_dir / "needs.md").write_text("normalized md\n", encoding="utf-8") + (metrics_dir / "needs.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {"needs": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False, mode="ocr_bad") + + assert captured["files"] == ["needs.pdf"] + updated = pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc["needs.pdf", 
"ocr_success"]) is True + assert bool(updated.loc["needs__p0001-0002.pdf", "ocr_success"]) is True + assert updated.loc["needs.pdf", "text"] == "normalized md\n" + assert updated.loc["needs__p0001-0002.pdf", "text"] == "normalized md\n" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index e2198b7..3fb05eb 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -51,6 +51,43 @@ def fail_math(*args, **kwargs): assert calls.get("files") == [fname] +def test_deepseek_backend_forwards_repair_exec_batch_controls(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + df = pd.DataFrame([ + {"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False} + ]) + df.to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + mode="ocr_bad", + repair_exec_batch_target_pages=64, + repair_exec_batch_target_items=24, + ) + + assert calls.get("files") == [fname] + assert calls["kwargs"]["repair_exec_batch_target_pages"] == 64 + assert calls["kwargs"]["repair_exec_batch_target_items"] == 24 + + def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): From dbea9d15e1382a7baa42209aac0e589aa2ab7d63 Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 21:47:21 +0300 Subject: [PATCH 71/93] Harden full-pipeline export retries --- 
docs/operations/deepseek_gcp_a100_setup.md | 18 ++++++ .../scripts/full_pipeline_checkpoint.py | 41 +++++++++++-- tests/test_full_pipeline_checkpoint.py | 61 +++++++++++++++++++ 3 files changed, 115 insertions(+), 5 deletions(-) diff --git a/docs/operations/deepseek_gcp_a100_setup.md b/docs/operations/deepseek_gcp_a100_setup.md index 8f4192d..20d9209 100644 --- a/docs/operations/deepseek_gcp_a100_setup.md +++ b/docs/operations/deepseek_gcp_a100_setup.md @@ -105,6 +105,24 @@ After correcting those bootstrap defects, the same fresh node was able to: - initialize a direct one-GPU `LLM(...)` - start a real `openarchives_ocr_run_node` workload with `runtime_backend=vllm` +The same node was also used for a real `10`-PDF `extract -> clean -> ocr` +checkpoint: + +- the stable end-to-end shape on that node was: + - multi-GPU extraction + - `workers_per_device=1` + - multi-GPU DeepSeek OCR with `workers_per_gpu=1` +- an isolated extraction benchmark with `workers_per_device=2` was faster on the + same sample, but the first full-pipeline replay hit a Docling allocator crash: + - `malloc_consolidate(): unaligned fastbin chunk detected` +- treat `workers_per_device=2` as benchmark-only / experimental until it is + proven stable in the full Corpus pipeline, not just in extract-only tests + +The full-pipeline checkpoint harness also now retries the JSONL export when OCR +has already filled text into parquet rows but the first export pass still emits +zero records. This guards the observed end-of-run export race on the benchmark +node without changing the OCR output contract itself. 
+ ## Current runner expectation `glossapi.ocr.deepseek.runner._build_env()` now auto-discovers diff --git a/src/glossapi/scripts/full_pipeline_checkpoint.py b/src/glossapi/scripts/full_pipeline_checkpoint.py index d0feb5b..406b8ed 100644 --- a/src/glossapi/scripts/full_pipeline_checkpoint.py +++ b/src/glossapi/scripts/full_pipeline_checkpoint.py @@ -106,6 +106,37 @@ def _count_jsonl_records(path: Path) -> int: return sum(1 for line in fp if line.strip()) +def _export_jsonl_with_retry( + corpus: Corpus, + *, + export_path: Path, + metadata_path: Path, + text_key: str, + metadata_key: str, + post_ocr_counts: Dict[str, int], + max_attempts: int = 4, + retry_delay_sec: float = 1.0, +) -> int: + needs_retry = int(post_ocr_counts.get("text_nonempty", 0) or 0) > 0 + attempts = max_attempts if needs_retry else 1 + + for attempt in range(attempts): + if export_path.exists(): + export_path.unlink() + corpus.jsonl( + export_path, + text_key=text_key, + metadata_key=metadata_key, + include_remaining_metadata=False, + metadata_path=metadata_path, + ) + export_records = _count_jsonl_records(export_path) + if export_records > 0 or attempt == attempts - 1: + return export_records + time.sleep(retry_delay_sec) + return 0 + + def main(argv: Optional[List[str]] = None) -> int: args = _parse_args(argv) _apply_cli_tuning_overrides(args) @@ -178,15 +209,15 @@ def main(argv: Optional[List[str]] = None) -> int: post_ocr_counts = _read_metadata_counts(metadata_path) export_start = time.perf_counter() - corpus.jsonl( - export_path, + export_records = _export_jsonl_with_retry( + corpus, + export_path=export_path, + metadata_path=metadata_path, text_key=str(args.text_key), metadata_key=str(args.metadata_key), - include_remaining_metadata=False, - metadata_path=metadata_path, + post_ocr_counts=post_ocr_counts, ) export_elapsed = float(time.perf_counter() - export_start) - export_records = _count_jsonl_records(export_path) finished_at = time.time() report: Dict[str, Any] = { diff --git 
a/tests/test_full_pipeline_checkpoint.py b/tests/test_full_pipeline_checkpoint.py index fe87bd9..5c36250 100644 --- a/tests/test_full_pipeline_checkpoint.py +++ b/tests/test_full_pipeline_checkpoint.py @@ -208,3 +208,64 @@ def jsonl(self, output_path, **kwargs): assert rc == 0 assert captured["repair_exec_batch_target_pages"] == 64 assert captured["repair_exec_batch_target_items"] == 24 + + +def test_full_pipeline_checkpoint_retries_empty_export_when_ocr_text_exists(tmp_path, monkeypatch): + calls = {"jsonl": 0} + + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + return None + + def ocr(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + calls["jsonl"] += 1 + if calls["jsonl"] == 1: + output_path.write_text("", encoding="utf-8") + return + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + ] + ) + + assert rc == 0 + assert calls["jsonl"] == 2 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert 
report["post_ocr_counts"]["text_nonempty"] == 1 + assert report["export_records"] == 1 From 6f29a2825559c540ab342fc77ae4457cf3556f2a Mon Sep 17 00:00:00 2001 From: fffoivos Date: Fri, 3 Apr 2026 22:19:46 +0300 Subject: [PATCH 72/93] Fix MkDocs navigation for OCR docs --- mkdocs.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 43b70fa..c61882c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,7 +22,6 @@ nav: - Metadata, Artifacts, and Run Diagnostics: architecture/metadata_artifacts_and_run_diagnostics.md - Artifact Layout and Stage Handoffs: architecture/artifact_layout_and_stage_handoffs.md - Resumability, Recovery, and Retention: architecture/resumability_recovery_and_retention.md - - DeepSeek-Only Upgrade Roadmap: architecture/deepseek_only_upgrade_roadmap.md - Pipeline: - Pipeline Overview: pipeline.md - OCR & Math Enrichment: ocr_and_math_enhancement.md @@ -39,6 +38,9 @@ nav: - Configuration & Ops: - Configuration: configuration.md - AWS Job Distribution: aws_job_distribution.md + - DeepSeek GCP A100 Setup: operations/deepseek_gcp_a100_setup.md + - OCR Changes 2026-04-01 to 2026-04-03: operations/ocr_changes_2026-04-01_to_2026-04-03.md + - OpenArchives OCR Rollout Plan: operations/openarchives_ocr_rollout_plan.md - Troubleshooting: troubleshooting.md - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: From 2a9ac30012a429722661e6a1d48034276be1ee0e Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Tue, 14 Apr 2026 16:59:24 +0300 Subject: [PATCH 73/93] refactor: split corpus OCR orchestration from runtime --- src/glossapi/corpus/ocr/__init__.py | 6 + src/glossapi/corpus/ocr/artifacts.py | 138 +++ src/glossapi/corpus/ocr/config.py | 229 +++++ src/glossapi/corpus/ocr/context.py | 26 + src/glossapi/corpus/ocr/dispatch.py | 49 ++ src/glossapi/corpus/ocr/math_targets.py | 43 + src/glossapi/corpus/ocr/pipeline.py | 46 + src/glossapi/corpus/ocr/targets.py | 144 ++++ 
src/glossapi/corpus/phase_ocr_math.py | 1017 ++++++++--------------- src/glossapi/ocr/deepseek/defaults.py | 27 + tests/test_corpus_ocr_modules.py | 102 +++ 11 files changed, 1152 insertions(+), 675 deletions(-) create mode 100644 src/glossapi/corpus/ocr/__init__.py create mode 100644 src/glossapi/corpus/ocr/artifacts.py create mode 100644 src/glossapi/corpus/ocr/config.py create mode 100644 src/glossapi/corpus/ocr/context.py create mode 100644 src/glossapi/corpus/ocr/dispatch.py create mode 100644 src/glossapi/corpus/ocr/math_targets.py create mode 100644 src/glossapi/corpus/ocr/pipeline.py create mode 100644 src/glossapi/corpus/ocr/targets.py create mode 100644 src/glossapi/ocr/deepseek/defaults.py create mode 100644 tests/test_corpus_ocr_modules.py diff --git a/src/glossapi/corpus/ocr/__init__.py b/src/glossapi/corpus/ocr/__init__.py new file mode 100644 index 0000000..e8d5b32 --- /dev/null +++ b/src/glossapi/corpus/ocr/__init__.py @@ -0,0 +1,6 @@ +"""Readable OCR orchestration helpers for the corpus pipeline.""" + +from .config import OcrRequest, normalize_ocr_request +from .pipeline import run_ocr_phase + +__all__ = ["OcrRequest", "normalize_ocr_request", "run_ocr_phase"] diff --git a/src/glossapi/corpus/ocr/artifacts.py b/src/glossapi/corpus/ocr/artifacts.py new file mode 100644 index 0000000..11b7dbb --- /dev/null +++ b/src/glossapi/corpus/ocr/artifacts.py @@ -0,0 +1,138 @@ +"""OCR result persistence helpers.""" + +from __future__ import annotations + +import hashlib +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd + +from ..._naming import canonical_stem +from .context import CorpusOcrContext + + +def build_ocr_stage_artifact_update( + *, + markdown_dir: Path, + metrics_dir: Path, + stem: str, +) -> Optional[Dict[str, object]]: + """Return direct OCR-owned artifact fields for one canonical OCR document.""" + + markdown_path = Path(markdown_dir) / f"{stem}.md" + if not markdown_path.exists(): + return None + 
text_payload = markdown_path.read_text(encoding="utf-8") + metrics_path = Path(metrics_dir) / f"{stem}.metrics.json" + return { + "text": text_payload, + "ocr_markdown_relpath": str(Path("markdown") / markdown_path.name), + "ocr_metrics_relpath": ( + str(Path("json") / "metrics" / metrics_path.name) if metrics_path.exists() else None + ), + "ocr_text_sha256": hashlib.sha256(text_payload.encode("utf-8")).hexdigest(), + } + + +def apply_ocr_success_updates( + df_meta: pd.DataFrame, + *, + filenames: List[str], + markdown_dir: Path, + metrics_dir: Path, + backend_norm: str, +) -> pd.DataFrame: + """Apply direct OCR-owned metadata updates to parquet rows.""" + + if "filename" not in df_meta.columns: + return df_meta + + if "filter" not in df_meta.columns: + df_meta["filter"] = "ok" + if "needs_ocr" not in df_meta.columns: + df_meta["needs_ocr"] = False + if "ocr_success" not in df_meta.columns: + df_meta["ocr_success"] = False + if "extraction_mode" not in df_meta.columns: + df_meta["extraction_mode"] = None + + direct_columns = ("text", "ocr_markdown_relpath", "ocr_metrics_relpath", "ocr_text_sha256") + for column in direct_columns: + if column not in df_meta.columns: + df_meta[column] = None + + filename_series = df_meta["filename"].astype(str) + stem_series = filename_series.map(canonical_stem) + + for fname in filenames: + stem = canonical_stem(fname) + mask = stem_series == stem + if not bool(mask.any()): + continue + artifact_update = build_ocr_stage_artifact_update( + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + stem=stem, + ) + df_meta.loc[mask, "filter"] = "ok" + df_meta.loc[mask, "needs_ocr"] = False + df_meta.loc[mask, "ocr_success"] = True + if backend_norm == "deepseek": + df_meta.loc[mask, "extraction_mode"] = "deepseek" + if artifact_update is None: + continue + for column, value in artifact_update.items(): + df_meta.loc[mask, column] = value + + return df_meta + + +def persist_ocr_success( + context: CorpusOcrContext, + *, + filenames: 
List[str], + backend_norm: str, +) -> List[str]: + from ...parquet_schema import ParquetSchema + + success_files: List[str] = [] + for fname in filenames: + stem = canonical_stem(fname) + if (context.markdown_dir / f"{stem}.md").exists(): + success_files.append(fname) + + if not success_files: + return success_files + + parquet_schema = ParquetSchema({"url_column": context.url_column}) + parquet_path = context._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) + if parquet_path and parquet_path.exists(): + df_meta = pd.read_parquet(parquet_path) + df_meta = apply_ocr_success_updates( + df_meta, + filenames=success_files, + markdown_dir=context.markdown_dir, + metrics_dir=context.output_dir / "json" / "metrics", + backend_norm=backend_norm, + ) + context._cache_metadata_parquet(parquet_path) + parquet_schema.write_metadata_parquet(df_meta, parquet_path) + + stems = [canonical_stem(name) for name in success_files] + if hasattr(context, "good_files"): + for stem in stems: + if stem not in getattr(context, "good_files", []): + context.good_files.append(stem) + + return success_files + + +def refresh_cleaner_after_ocr(context: CorpusOcrContext) -> None: + """Refresh cleaner metrics after OCR reruns rewrite markdown outputs.""" + + context.logger.info("Re-running Rust cleaner after OCR rerun to refresh metrics") + context.clean( + input_dir=context.markdown_dir, + drop_bad=False, + ) diff --git a/src/glossapi/corpus/ocr/config.py b/src/glossapi/corpus/ocr/config.py new file mode 100644 index 0000000..f9da5fc --- /dev/null +++ b/src/glossapi/corpus/ocr/config.py @@ -0,0 +1,229 @@ +"""Request normalization for corpus OCR orchestration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from ...ocr.deepseek.defaults import ( + DEFAULT_ATTN_BACKEND, + DEFAULT_GPU_MEMORY_UTILIZATION, + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_OCR_PROFILE, + 
DEFAULT_RENDER_DPI, + DEFAULT_REPAIR_MODE, + DEFAULT_RUNTIME_BACKEND, + DEFAULT_TARGET_BATCH_PAGES, + DEFAULT_WORKERS_PER_GPU, + resolve_gpu_memory_utilization, + resolve_render_dpi, +) + + +@dataclass(slots=True) +class OcrRequest: + mode: str + backend: str + device: Optional[str] + model_dir: Optional[Path] + max_pages: Optional[int] + persist_engine: bool + precision: Optional[str] + workers_per_gpu: int + runtime_backend: str + ocr_profile: str + prompt_override: Optional[str] + attn_backend: str + base_size: Optional[int] + image_size: Optional[int] + crop_mode: Optional[bool] + render_dpi: int + max_new_tokens: int + repetition_penalty: Optional[float] + no_repeat_ngram_size: Optional[int] + vllm_batch_size: Optional[int] + gpu_memory_utilization: float + disable_fp8_kv: bool + repair_mode: str + repair_exec_batch_target_pages: Optional[int] + repair_exec_batch_target_items: Optional[int] + scheduler: str + target_batch_pages: int + shard_pages: int + shard_threshold_pages: int + math_enhance: bool + math_targets: Optional[Dict[str, List[Tuple[int, int]]]] + math_batch_size: int + math_dpi_base: int + use_gpus: str + devices: Optional[List[int]] + reprocess_completed: bool + content_debug: bool + + +def _resolve_mode( + *, + logger, + mode: Optional[str], + fix_bad: bool, + math_enhance: bool, +) -> Optional[str]: + mode_norm: Optional[str] = None + if mode: + candidate = str(mode).strip().lower() + if candidate in {"ocr_bad", "math_only", "ocr_bad_then_math"}: + mode_norm = candidate + else: + logger.warning("Unknown mode '%s'; falling back to legacy flags", mode) + if mode_norm is None: + if fix_bad and math_enhance: + mode_norm = "ocr_bad_then_math" + elif fix_bad: + mode_norm = "ocr_bad" + elif math_enhance: + mode_norm = "math_only" + return mode_norm + + +def normalize_ocr_request( + *, + logger, + fix_bad: bool, + mode: Optional[str], + backend: str, + device: Optional[str], + model_dir: Optional[str | Path], + max_pages: Optional[int], + 
persist_engine: bool, + precision: Optional[str], + workers_per_gpu: int = DEFAULT_WORKERS_PER_GPU, + runtime_backend: str = DEFAULT_RUNTIME_BACKEND, + ocr_profile: str = DEFAULT_OCR_PROFILE, + prompt_override: Optional[str] = None, + attn_backend: str = DEFAULT_ATTN_BACKEND, + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = DEFAULT_RENDER_DPI, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = DEFAULT_GPU_MEMORY_UTILIZATION, + disable_fp8_kv: bool = False, + repair_mode: str = DEFAULT_REPAIR_MODE, + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, + scheduler: str = "auto", + target_batch_pages: int = DEFAULT_TARGET_BATCH_PAGES, + shard_pages: int = 0, + shard_threshold_pages: int = 0, + math_enhance: bool = True, + math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = "single", + devices: Optional[List[int]] = None, + force: Optional[bool] = None, + reprocess_completed: Optional[bool] = None, + skip_existing: Optional[bool] = None, + content_debug: bool = False, + CONTENT_DEBUG: Optional[bool] = None, + internal_debug: bool = False, + INTERNAL_DEBUG: Optional[bool] = None, +) -> Optional[OcrRequest]: + backend_norm = str(backend or "deepseek").strip().lower() + if backend_norm != "deepseek": + raise ValueError("backend must be 'deepseek'") + + if CONTENT_DEBUG is not None: + content_debug = bool(CONTENT_DEBUG) + elif INTERNAL_DEBUG is not None: + content_debug = bool(INTERNAL_DEBUG) + elif internal_debug: + content_debug = True + + fix_bad_effective = bool(fix_bad) + if force is not None: + logger.warning("Corpus.ocr(force=...) 
is deprecated; use fix_bad=... instead") + fix_bad_effective = bool(force) + + mode_norm = _resolve_mode( + logger=logger, + mode=mode, + fix_bad=fix_bad_effective, + math_enhance=bool(math_enhance), + ) + if mode_norm is None: + logger.info( + "OCR: no operation requested (enable fix_bad and/or math_enhance or set mode='ocr_bad'|'math_only'|'ocr_bad_then_math')" + ) + return None + + if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: + logger.info( + "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." + ) + if mode_norm == "ocr_bad_then_math": + logger.info( + "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." + ) + mode_norm = "ocr_bad" + + reprocess_explicit = reprocess_completed is not None + reprocess_flag = bool(reprocess_completed) if reprocess_explicit else False + if skip_existing is not None: + skip_flag = bool(skip_existing) + logger.warning( + "Corpus.ocr(skip_existing=...) is deprecated; use reprocess_completed=... instead." 
+ ) + desired = not skip_flag + if reprocess_explicit and desired != reprocess_flag: + logger.info( + "Corpus.ocr(): skip_existing=%s overrides reprocess_completed=%s (effective reprocess_completed=%s).", + skip_flag, + reprocess_flag, + desired, + ) + reprocess_flag = desired + + return OcrRequest( + mode=mode_norm, + backend=backend_norm, + device=device, + model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=bool(persist_engine), + precision=precision, + workers_per_gpu=int(max(1, workers_per_gpu)), + runtime_backend=str(runtime_backend or DEFAULT_RUNTIME_BACKEND), + ocr_profile=str(ocr_profile or DEFAULT_OCR_PROFILE), + prompt_override=prompt_override, + attn_backend=str(attn_backend or DEFAULT_ATTN_BACKEND), + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=resolve_render_dpi(render_dpi), + max_new_tokens=int(DEFAULT_MAX_NEW_TOKENS if max_new_tokens is None else max_new_tokens), + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=resolve_gpu_memory_utilization(gpu_memory_utilization), + disable_fp8_kv=bool(disable_fp8_kv), + repair_mode=str(repair_mode or DEFAULT_REPAIR_MODE), + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + scheduler=str(scheduler or "auto"), + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), + math_enhance=bool(math_enhance), + math_targets=math_targets, + math_batch_size=int(math_batch_size), + math_dpi_base=int(math_dpi_base), + use_gpus=str(use_gpus or "single"), + devices=devices, + reprocess_completed=bool(reprocess_flag), + content_debug=bool(content_debug), + ) diff --git a/src/glossapi/corpus/ocr/context.py b/src/glossapi/corpus/ocr/context.py new file mode 100644 index 0000000..7c98795 
--- /dev/null +++ b/src/glossapi/corpus/ocr/context.py @@ -0,0 +1,26 @@ +"""Shared typing contracts for corpus OCR helpers.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Protocol + + +class CorpusOcrContext(Protocol): + logger: Any + input_dir: Path + output_dir: Path + markdown_dir: Path + logs_dir: Path + url_column: str + good_files: list[str] + + def _resolve_metadata_parquet(self, *args: Any, **kwargs: Any) -> Path | None: ... + + def _cache_metadata_parquet(self, path: Path | None) -> None: ... + + def _get_cached_metadata_parquet(self) -> Path | None: ... + + def clean(self, *args: Any, **kwargs: Any) -> None: ... + + def formula_enrich_from_json(self, *args: Any, **kwargs: Any) -> None: ... diff --git a/src/glossapi/corpus/ocr/dispatch.py b/src/glossapi/corpus/ocr/dispatch.py new file mode 100644 index 0000000..8e8efce --- /dev/null +++ b/src/glossapi/corpus/ocr/dispatch.py @@ -0,0 +1,49 @@ +"""Backend dispatch helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from ...ocr.deepseek import runner as _deepseek_runner +from .config import OcrRequest +from .context import CorpusOcrContext + + +def run_deepseek_ocr( + context: CorpusOcrContext, + *, + request: OcrRequest, + filenames: list[str], +) -> None: + _deepseek_runner.run_for_files( + context, + filenames, + model_dir=request.model_dir, + max_pages=request.max_pages, + persist_engine=request.persist_engine, + precision=request.precision, + device=request.device, + use_gpus=request.use_gpus, + devices=request.devices, + workers_per_gpu=request.workers_per_gpu, + runtime_backend=request.runtime_backend, + ocr_profile=request.ocr_profile, + prompt_override=request.prompt_override, + attn_backend=request.attn_backend, + base_size=request.base_size, + image_size=request.image_size, + crop_mode=request.crop_mode, + render_dpi=request.render_dpi, + max_new_tokens=request.max_new_tokens, + 
repetition_penalty=request.repetition_penalty, + no_repeat_ngram_size=request.no_repeat_ngram_size, + vllm_batch_size=request.vllm_batch_size, + gpu_memory_utilization=request.gpu_memory_utilization, + disable_fp8_kv=request.disable_fp8_kv, + repair_mode=request.repair_mode, + repair_exec_batch_target_pages=request.repair_exec_batch_target_pages, + repair_exec_batch_target_items=request.repair_exec_batch_target_items, + scheduler=request.scheduler, + target_batch_pages=request.target_batch_pages, + shard_pages=request.shard_pages, + shard_threshold_pages=request.shard_threshold_pages, + content_debug=request.content_debug, + ) diff --git a/src/glossapi/corpus/ocr/math_targets.py b/src/glossapi/corpus/ocr/math_targets.py new file mode 100644 index 0000000..0737d6f --- /dev/null +++ b/src/glossapi/corpus/ocr/math_targets.py @@ -0,0 +1,43 @@ +"""Math-target selection helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Sequence, Set + +from ..._naming import canonical_stem + + +def discover_docling_json_stems(output_dir: Path) -> List[str]: + json_dir = Path(output_dir) / "json" + if not json_dir.exists(): + return [] + return sorted({canonical_stem(path) for path in json_dir.glob("*.docling.json*")}) + + +def filter_math_only_stems( + *, + stems: Sequence[str], + bad_files: Sequence[str], + math_done_stems: Set[str], + reprocess_completed: bool, + logger, +) -> List[str]: + kept = list(stems) + if bad_files: + before = len(kept) + bad_set = {canonical_stem(name) for name in bad_files} + kept = [stem for stem in kept if stem not in bad_set] + removed = before - len(kept) + if removed: + logger.info("Math-only: skipping %d document(s) flagged for OCR", removed) + if not reprocess_completed and kept and math_done_stems: + before = len(kept) + kept = [stem for stem in kept if stem not in math_done_stems] + removed = before - len(kept) + if removed: + logger.info( + "Math enrichment: skipping 
%d already enriched document(s) (reprocess_completed=False).", + removed, + ) + return kept diff --git a/src/glossapi/corpus/ocr/pipeline.py b/src/glossapi/corpus/ocr/pipeline.py new file mode 100644 index 0000000..bee65e0 --- /dev/null +++ b/src/glossapi/corpus/ocr/pipeline.py @@ -0,0 +1,46 @@ +"""High-level OCR orchestration for corpus remediation.""" + +from __future__ import annotations + +from .artifacts import persist_ocr_success, refresh_cleaner_after_ocr +from .config import OcrRequest +from .context import CorpusOcrContext +from .dispatch import run_deepseek_ocr +from .targets import build_ocr_selection + + +def run_ocr_phase(context: CorpusOcrContext, request: OcrRequest) -> None: + """Run the OCR-remediation path while preserving the current runtime engine.""" + + if request.mode == "math_only": + raise ValueError("run_ocr_phase handles OCR remediation only") + + selection = build_ocr_selection( + context, + mode=request.mode, + reprocess_completed=request.reprocess_completed, + ) + + if not selection.bad_files: + context.logger.info("OCR: no bad documents flagged by cleaner; skipping OCR fix") + return + + run_deepseek_ocr( + context, + request=request, + filenames=selection.bad_files, + ) + + try: + persist_ocr_success( + context, + filenames=selection.bad_files, + backend_norm=request.backend, + ) + except Exception as exc: + context.logger.warning("Failed to update OCR success metadata: %s", exc) + + try: + refresh_cleaner_after_ocr(context) + except Exception as exc: + context.logger.warning("Cleaner refresh after OCR failed: %s", exc) diff --git a/src/glossapi/corpus/ocr/targets.py b/src/glossapi/corpus/ocr/targets.py new file mode 100644 index 0000000..2a393aa --- /dev/null +++ b/src/glossapi/corpus/ocr/targets.py @@ -0,0 +1,144 @@ +"""Target selection helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Set + +import pandas 
as pd + +from ..._naming import canonical_stem +from ...parquet_schema import ParquetSchema +from .context import CorpusOcrContext +from ..corpus_skiplist import _SkiplistManager, _resolve_skiplist_path + + +@dataclass(slots=True) +class OcrSelection: + bad_files: List[str] + ocr_candidates_initial: int + skipped_completed: int + skipped_skiplist: int + parquet_meta: Optional[pd.DataFrame] + ocr_done_files: List[str] + ocr_done_stems: Set[str] + math_done_stems: Set[str] + skip_mgr: _SkiplistManager + skiplist_path: Path + + +def normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]: + """Collapse chunk-like metadata rows back to real OCR source files when possible.""" + + source_by_stem = {} + try: + for path in sorted(Path(input_dir).glob("*.pdf")): + source_by_stem.setdefault(canonical_stem(path.name), path.name) + except Exception: + source_by_stem = {} + + normalized: List[str] = [] + seen: Set[str] = set() + for fname in filenames: + resolved = source_by_stem.get(canonical_stem(fname), str(fname)) + if resolved in seen: + continue + normalized.append(resolved) + seen.add(resolved) + return normalized + + +def build_ocr_selection( + context: CorpusOcrContext, + *, + mode: str, + reprocess_completed: bool, +) -> OcrSelection: + bad_files: List[str] = [] + skipped_completed = 0 + skipped_skiplist = 0 + parquet_meta: Optional[pd.DataFrame] = None + ocr_done_files: List[str] = [] + ocr_done_stems: Set[str] = set() + math_done_stems: Set[str] = set() + + parquet_schema = ParquetSchema({"url_column": context.url_column}) + parquet_path = context._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) + if parquet_path and parquet_path.exists(): + df = pd.read_parquet(parquet_path) + if "filename" in df.columns and "needs_ocr" in df.columns: + bad_files = df.loc[df["needs_ocr"] == True, "filename"].dropna().astype(str).tolist() + if "ocr_success" in df.columns: + ocr_done_files = 
df.loc[df["ocr_success"].fillna(False), "filename"].dropna().astype(str).tolist() + ocr_done_stems = {canonical_stem(name) for name in ocr_done_files} + math_done_files: List[str] = [] + if "math_enriched" in df.columns: + math_done_files = df.loc[df["math_enriched"].fillna(False), "filename"].dropna().astype(str).tolist() + elif "enriched_math" in df.columns: + math_done_files = df.loc[df["enriched_math"].fillna(False), "filename"].dropna().astype(str).tolist() + if math_done_files: + math_done_stems = {canonical_stem(name) for name in math_done_files} + if not reprocess_completed and ocr_done_stems: + before = len(bad_files) + bad_files = [name for name in bad_files if canonical_stem(name) not in ocr_done_stems] + skipped_completed = before - len(bad_files) + if skipped_completed: + context.logger.info( + "OCR: skipping %d already completed document(s) (reprocess_completed=False).", + skipped_completed, + ) + if reprocess_completed and mode in {"ocr_bad", "ocr_bad_then_math"} and ocr_done_files: + pending = {str(name) for name in bad_files} + for fname in ocr_done_files: + if fname not in pending: + bad_files.append(fname) + pending.add(fname) + parquet_meta = df + + ocr_candidates_initial = len(bad_files) + skiplist_path = _resolve_skiplist_path(context.output_dir, context.logger) + skip_mgr = _SkiplistManager(skiplist_path, context.logger) + skip_stems = skip_mgr.load() + if skip_stems: + before = len(bad_files) + bad_files = [name for name in bad_files if canonical_stem(name) not in skip_stems] + skipped_skiplist = before - len(bad_files) + if skipped_skiplist: + context.logger.warning( + "Skip-list %s filtered %d document(s) from Phase-3 OCR.", + skiplist_path, + skipped_skiplist, + ) + + normalized_bad_files = normalize_ocr_target_filenames( + filenames=bad_files, + input_dir=Path(context.input_dir), + ) + if len(normalized_bad_files) != len(bad_files): + context.logger.info( + "OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by 
canonical stem.", + len(bad_files), + len(normalized_bad_files), + ) + bad_files = normalized_bad_files + context.logger.info( + "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d", + ocr_candidates_initial, + len(bad_files), + skipped_completed, + skipped_skiplist, + ) + + return OcrSelection( + bad_files=bad_files, + ocr_candidates_initial=ocr_candidates_initial, + skipped_completed=skipped_completed, + skipped_skiplist=skipped_skiplist, + parquet_meta=parquet_meta, + ocr_done_files=ocr_done_files, + ocr_done_stems=ocr_done_stems, + math_done_stems=math_done_stems, + skip_mgr=skip_mgr, + skiplist_path=skiplist_path, + ) diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 28030ff..6881e4d 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -22,10 +22,25 @@ from .._naming import canonical_stem from ..gloss_downloader import GlossDownloader from ..gloss_section import GlossSection +from ..ocr.deepseek.defaults import ( + DEFAULT_ATTN_BACKEND, + DEFAULT_GPU_MEMORY_UTILIZATION, + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_OCR_PROFILE, + DEFAULT_RENDER_DPI, + DEFAULT_REPAIR_MODE, + DEFAULT_RUNTIME_BACKEND, + DEFAULT_TARGET_BATCH_PAGES, + DEFAULT_WORKERS_PER_GPU, +) # Avoid importing classifier here; OCR/math phase does not require it at import time. 
from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path from .corpus_state import _ProcessingStateManager from .corpus_utils import _maybe_import_torch +from .ocr.config import OcrRequest, normalize_ocr_request +from .ocr.math_targets import discover_docling_json_stems, filter_math_only_stems +from .ocr.pipeline import run_ocr_phase +from .ocr.targets import build_ocr_selection def _build_ocr_stage_artifact_update( @@ -143,31 +158,30 @@ def ocr( max_pages: Optional[int] = None, persist_engine: bool = True, limit: Optional[int] = None, - dpi: Optional[int] = None, # reserved for future use - precision: Optional[str] = None, # reserved for future use ("fp16","bf16") - workers_per_gpu: int = 1, - runtime_backend: str = "transformers", - ocr_profile: str = "markdown_grounded", + dpi: Optional[int] = None, + precision: Optional[str] = None, + workers_per_gpu: int = DEFAULT_WORKERS_PER_GPU, + runtime_backend: str = DEFAULT_RUNTIME_BACKEND, + ocr_profile: str = DEFAULT_OCR_PROFILE, prompt_override: Optional[str] = None, - attn_backend: str = "auto", + attn_backend: str = DEFAULT_ATTN_BACKEND, base_size: Optional[int] = None, image_size: Optional[int] = None, crop_mode: Optional[bool] = None, - render_dpi: Optional[int] = None, - max_new_tokens: Optional[int] = 2048, + render_dpi: Optional[int] = DEFAULT_RENDER_DPI, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, repetition_penalty: Optional[float] = None, no_repeat_ngram_size: Optional[int] = None, vllm_batch_size: Optional[int] = None, - gpu_memory_utilization: Optional[float] = None, + gpu_memory_utilization: Optional[float] = DEFAULT_GPU_MEMORY_UTILIZATION, disable_fp8_kv: bool = False, - repair_mode: str = "auto", + repair_mode: str = DEFAULT_REPAIR_MODE, repair_exec_batch_target_pages: Optional[int] = None, repair_exec_batch_target_items: Optional[int] = None, scheduler: str = "auto", - target_batch_pages: int = 160, + target_batch_pages: int = DEFAULT_TARGET_BATCH_PAGES, shard_pages: int = 0, 
shard_threshold_pages: int = 0, - # Integrated math enrichment controls math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, math_batch_size: int = 8, @@ -177,720 +191,373 @@ def ocr( force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, - # Content debug: keep page separators and truncation markers when True content_debug: bool = False, CONTENT_DEBUG: Optional[bool] = None, - # Back-compat aliases (deprecated): internal_debug: bool = False, INTERNAL_DEBUG: Optional[bool] = None, ) -> None: - """OCR and/or math enrichment with explicit mode control. + """OCR and/or math enrichment with explicit mode control.""" + + del limit, dpi + request = normalize_ocr_request( + logger=self.logger, + fix_bad=fix_bad, + mode=mode, + backend=backend, + device=device, + model_dir=model_dir, + max_pages=max_pages, + persist_engine=persist_engine, + precision=precision, + workers_per_gpu=workers_per_gpu, + runtime_backend=runtime_backend, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + scheduler=scheduler, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + math_enhance=math_enhance, + math_targets=math_targets, + math_batch_size=math_batch_size, + math_dpi_base=math_dpi_base, + use_gpus=use_gpus, + devices=devices, + force=force, + reprocess_completed=reprocess_completed, + 
skip_existing=skip_existing, + content_debug=content_debug, + CONTENT_DEBUG=CONTENT_DEBUG, + internal_debug=internal_debug, + INTERNAL_DEBUG=INTERNAL_DEBUG, + ) + if request is None: + return + if request.mode == "math_only": + self._run_math_only_request(request) + return + run_ocr_phase(self, request) - Parameters - - mode: one of - - 'ocr_bad': re‑OCR only documents flagged as bad by Rust cleaner (parquet 'filter' != 'ok'). - - 'math_only': run math enrichment from Docling JSON (generate JSON without OCR when missing). - - 'ocr_bad_then_math': re‑OCR bad documents, then run math enrichment on those. - If not provided, falls back to legacy flags (fix_bad, math_enhance): - fix_bad and math_enhance -> 'ocr_bad_then_math'; - fix_bad only -> 'ocr_bad'; - math_enhance only -> 'math_only'; - neither -> no‑op. - - backend: 'deepseek' (default) uses the DeepSeek OCR remediation path. - Docling layout/json remains Phase-1 infrastructure; OCR remediation itself is DeepSeek-only. - - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - - math_enhance: run math/code enrichment after OCR (default True). - - use_gpus/devices/workers_per_gpu: DeepSeek multi-worker controls. Use - ``use_gpus="multi"`` to shard OCR across detected or specified GPUs. - Increase ``workers_per_gpu`` above ``1`` to run multiple OCR workers - per visible GPU. - - scheduler/target_batch_pages/shard_pages/shard_threshold_pages: - Multi-GPU scheduling controls. ``scheduler='auto'`` resolves to - exact-fill page-range batching for multi-GPU vLLM runs and falls back - to whole-document scheduling elsewhere. ``target_batch_pages`` is the - per-lane page budget the scheduler tries to fill. ``fixed_shard`` uses - ``shard_pages`` and ``shard_threshold_pages`` when explicit shard-based - planning is requested. - - runtime_backend: ``transformers`` (default) or ``vllm``. 
- - ocr_profile/prompt_override/attn_backend/base_size/image_size/crop_mode/render_dpi: - DeepSeek rendering and attention controls used for throughput/quality - benchmarking. - - max_new_tokens/repetition_penalty/no_repeat_ngram_size: - Optional generation controls forwarded to DeepSeek. These are exposed - for runtime experiments; leave them unset unless a benchmark calls for - them explicitly. - - vllm_batch_size/gpu_memory_utilization/disable_fp8_kv/repair_mode: - Optional vLLM controls. ``repair_mode='auto'`` enables the markdown-first - repair pipeline (plain fallback for garbage pages, tiled fallback for - short coverage failures). ``repair_exec_batch_target_pages`` and - ``repair_exec_batch_target_items`` control how many pending repair rows - a worker tries to execute together once the global repair phase begins. - These are ignored by the transformers runtime except for - ``prompt_override``. - - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - - reprocess_completed: when False, skip documents already flagged as successfully - OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False - unless ``skip_existing`` overrides it. - - skip_existing: legacy alias for ``reprocess_completed`` (``skip_existing=True`` equals - ``reprocess_completed=False``). Prefer the explicit ``reprocess_completed`` toggle. 
- """ - # Normalize backend - backend_norm = str(backend or "deepseek").strip().lower() - if backend_norm != "deepseek": - raise ValueError("backend must be 'deepseek'") + def _run_math_only_request(self, request: OcrRequest) -> None: + selection = build_ocr_selection( + self, + mode=request.mode, + reprocess_completed=request.reprocess_completed, + ) + stems = discover_docling_json_stems(self.output_dir) + stems = filter_math_only_stems( + stems=stems, + bad_files=selection.bad_files, + math_done_stems=selection.math_done_stems, + reprocess_completed=request.reprocess_completed, + logger=self.logger, + ) + self._run_math_targets( + stems=stems, + request=request, + skip_mgr=selection.skip_mgr, + skiplist_path=selection.skiplist_path, + ) - # CONTENT_DEBUG override (preferred uppercase alias) - # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags - if CONTENT_DEBUG is not None: - content_debug = bool(CONTENT_DEBUG) - elif INTERNAL_DEBUG is not None: - content_debug = bool(INTERNAL_DEBUG) - elif internal_debug: - content_debug = True + def _run_math_targets( + self, + *, + stems: List[str], + request: OcrRequest, + skip_mgr: Optional[_SkiplistManager], + skiplist_path: Path, + ) -> None: + if not stems: + self.logger.info("No Docling JSON found for math enrichment.") + return - # Normalize mode from explicit value or legacy flags - mode_norm = None - fix_bad_effective = bool(fix_bad) - if force is not None: - try: - self.logger.warning("Corpus.ocr(force=...) is deprecated; use fix_bad=... 
instead") - except Exception: - pass - fix_bad_effective = bool(force) - if mode: - m = str(mode).strip().lower() - if m in {"ocr_bad", "math_only", "ocr_bad_then_math"}: - mode_norm = m - else: - self.logger.warning("Unknown mode '%s'; falling back to legacy flags", mode) - if mode_norm is None: - if fix_bad_effective and math_enhance: - mode_norm = "ocr_bad_then_math" - elif fix_bad_effective: - mode_norm = "ocr_bad" - elif math_enhance: - mode_norm = "math_only" - else: - self.logger.info( - "OCR: no operation requested (enable fix_bad and/or math_enhance or set mode='ocr_bad'|'math_only'|'ocr_bad_then_math')" + initial_math_targets = len(stems) + current_skips = skip_mgr.reload() if skip_mgr else set() + if current_skips: + before = len(stems) + stems = [stem for stem in stems if stem not in current_skips] + removed = before - len(stems) + if removed: + self.logger.warning( + "Skip-list %s filtered %d document(s) from Phase-2 math.", + skiplist_path, + removed, ) + if not stems: + self.logger.info("All math targets filtered by skip-list; nothing to do.") return - reprocess_explicit = reprocess_completed is not None - reprocess_flag = bool(reprocess_completed) if reprocess_explicit else False - if skip_existing is not None: - skip_flag = bool(skip_existing) + + self.logger.info( + "Math targets: total=%d kept=%d filtered_skiplist=%d", + initial_math_targets, + len(stems), + initial_math_targets - len(stems), + ) + + local_targets = None + if request.math_targets: + local_targets = {stem: request.math_targets.get(stem) for stem in stems if stem in request.math_targets} + + if str(request.use_gpus).lower() != "multi": + self.formula_enrich_from_json( + files=stems, + device=(request.device or "cuda"), + batch_size=int(request.math_batch_size), + dpi_base=int(request.math_dpi_base), + targets_by_stem=local_targets, + ) + return + + devs = list(request.devices or []) + if not devs: try: - self.logger.warning( - "Corpus.ocr(skip_existing=...) 
is deprecated; use reprocess_completed=... instead." + proc = subprocess.run( + ["nvidia-smi", "-L"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=5, ) + if proc.returncode == 0 and proc.stdout: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + try: + devs.append(int(line.split(":", 1)[0].split()[1])) + except Exception: + pass except Exception: pass - desired = not skip_flag - if reprocess_explicit and desired != reprocess_flag: + if not devs: + torch_mod = _maybe_import_torch() try: - self.logger.info( - "Corpus.ocr(): skip_existing=%s overrides reprocess_completed=%s (effective reprocess_completed=%s).", - skip_flag, - reprocess_flag, - desired, - ) + if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): + devs = list(range(torch_mod.cuda.device_count())) except Exception: pass - reprocess_flag = desired - reprocess_completed = reprocess_flag - # DeepSeek semantics note - if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: + if not devs: + msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement" + self.logger.error(msg) + raise RuntimeError(msg) + + from multiprocessing import get_context + + ctx = get_context("spawn") + work_q = ctx.Queue() + result_q = ctx.Queue() + manager = ctx.Manager() + status_map = manager.dict() + for stem in stems: + work_q.put(stem) + + worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") + worker_log_dir_to_use = worker_log_dir_env + if not worker_log_dir_to_use: + default_worker_log_dir = self.logs_dir / "math_workers" try: - self.logger.info( - "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." 
+ default_worker_log_dir.mkdir(parents=True, exist_ok=True) + worker_log_dir_to_use = str(default_worker_log_dir) + except Exception as exc: + self.logger.warning( + "Unable to prepare worker log directory %s: %s", + default_worker_log_dir, + exc, ) - except Exception: - pass - if mode_norm == "ocr_bad_then_math": - try: - self.logger.info( - "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." - ) - except Exception: - pass - mode_norm = "ocr_bad" - # Identify bad documents from parquet (Rust cleaner output) - bad_files: List[str] = [] - skipped_completed = 0 - skipped_skiplist = 0 - parquet_meta: Optional["pd.DataFrame"] = None - ocr_done_files: List[str] = [] - ocr_done_stems: Set[str] = set() - math_done_files: List[str] = [] - math_done_stems: Set[str] = set() + worker_log_dir_to_use = None + if worker_log_dir_to_use: + os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_to_use + marker_base = Path(worker_log_dir_to_use) if worker_log_dir_to_use else (self.logs_dir / "math_workers") try: - from glossapi.parquet_schema import ParquetSchema - parquet_schema = ParquetSchema({"url_column": self.url_column}) - parquet_path = self._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) - if parquet_path and parquet_path.exists(): - import pandas as _pd - df = _pd.read_parquet(parquet_path) - if "filename" in df.columns and "needs_ocr" in df.columns: - bad_files = df.loc[df["needs_ocr"] == True, "filename"].dropna().astype(str).tolist() - else: - # No fallback: selection relies strictly on the 'needs_ocr' flag - # populated by the cleaner. If missing, we skip OCR selection. 
- bad_files = [] - ocr_done: Set[str] = set() - if "ocr_success" in df.columns: - ocr_done_files = df.loc[df["ocr_success"].fillna(False), "filename"].dropna().astype(str).tolist() - ocr_done = {canonical_stem(str(name)) for name in ocr_done_files} - ocr_done_stems = set(ocr_done) - if "math_enriched" in df.columns: - math_done_files = df.loc[df["math_enriched"].fillna(False), "filename"].dropna().astype(str).tolist() - elif "enriched_math" in df.columns: - math_done_files = df.loc[df["enriched_math"].fillna(False), "filename"].dropna().astype(str).tolist() - if math_done_files: - math_done_stems = {canonical_stem(str(name)) for name in math_done_files} - if not reprocess_completed and ocr_done: - before = len(bad_files) - bad_files = [name for name in bad_files if canonical_stem(name) not in ocr_done] - removed = before - len(bad_files) - if removed: - skipped_completed = removed - self.logger.info( - "OCR: skipping %d already completed document(s) (reprocess_completed=False).", - removed, - ) - if reprocess_completed and mode_norm in {"ocr_bad", "ocr_bad_then_math"} and ocr_done_files: - pending = {str(f) for f in bad_files} - for fname in ocr_done_files: - if fname not in pending: - bad_files.append(fname) - pending.add(fname) - parquet_meta = df - else: - parquet_meta = None + marker_base.mkdir(parents=True, exist_ok=True) except Exception: pass + marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} - ocr_candidates_initial = len(bad_files) - skiplist_path = _resolve_skiplist_path(self.output_dir, self.logger) - skip_mgr = _SkiplistManager(skiplist_path, self.logger) - skip_stems = skip_mgr.load() - if skip_stems: - before = len(bad_files) - bad_files = [name for name in bad_files if canonical_stem(name) not in skip_stems] - removed = before - len(bad_files) - if removed: - skipped_skiplist = removed - self.logger.warning( - "Skip-list %s filtered %d document(s) from Phase-3 OCR.", - skiplist_path, - removed, - ) + 
procs: List[Any] = [] + active: List[Any] = [] + proc_gpu: Dict[int, int] = {} try: - normalized_bad_files = _normalize_ocr_target_filenames( - filenames=bad_files, - input_dir=Path(self.input_dir), - ) - if len(normalized_bad_files) != len(bad_files): - self.logger.info( - "OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by canonical stem.", - len(bad_files), - len(normalized_bad_files), - ) - bad_files = normalized_bad_files - self.logger.info( - "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d", - ocr_candidates_initial, - len(bad_files), - skipped_completed, - skipped_skiplist, - ) + respawn_cap = int(os.environ.get("GLOSSAPI_MATH_RESPAWN_CAP", "5")) except Exception: - pass - - # Helper to run Phase‑2 enrichment over stems - def _run_math(stems: List[str]) -> None: - if not stems: - self.logger.info("No Docling JSON found for math enrichment.") - return - initial_math_targets = len(stems) - current_skips = skip_mgr.reload() if skip_mgr else set() - if current_skips: - before = len(stems) - stems = [s for s in stems if s not in current_skips] - removed = before - len(stems) - if removed: - self.logger.warning( - "Skip-list %s filtered %d document(s) from Phase-2 math.", - skiplist_path, - removed, - ) - if not stems: - self.logger.info("All math targets filtered by skip-list; nothing to do.") - return - try: - self.logger.info( - "Math targets: total=%d kept=%d filtered_skiplist=%d", - initial_math_targets, - len(stems), - initial_math_targets - len(stems), - ) - except Exception: - pass - local_targets = None - if math_targets: - local_targets = {s: math_targets.get(s) for s in stems if s in math_targets} - if str(use_gpus).lower() == "multi": - # Detect GPU devices - devs = devices or [] - if not devs: - try: - import subprocess - p = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5) - if p.returncode == 0 and p.stdout: - for line in p.stdout.splitlines(): - 
if line.startswith("GPU "): - try: - idx = int(line.split(":", 1)[0].split()[1]) - devs.append(idx) - except Exception: - pass - except Exception: - pass - if not devs: - torch_mod = _maybe_import_torch() - try: - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - devs = list(range(torch_mod.cuda.device_count())) - except Exception: - pass - if not devs: - msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement" - self.logger.error(msg) - raise RuntimeError(msg) - else: - from multiprocessing import get_context - - ctx = get_context("spawn") - work_q = ctx.Queue() - result_q = ctx.Queue() - manager = ctx.Manager() - status_map = manager.dict() - for s in stems: - work_q.put(s) + respawn_cap = 5 + respawn_cap = max(0, respawn_cap) + respawn_counts: Dict[int, int] = {dev_id: 0 for dev_id in devs} + + for dev_id in devs: + proc = ctx.Process( + target=_gpu_math_worker, + args=( + dev_id, + str(self.input_dir), + str(self.output_dir), + work_q, + int(request.math_batch_size), + int(request.math_dpi_base), + request.device or "cuda", + local_targets or {}, + result_q, + status_map, + str(marker_base), + ), + ) + proc.start() + procs.append(proc) + active.append(proc) + if proc.pid is not None: + proc_gpu[proc.pid] = dev_id - worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") - worker_log_dir_to_use = worker_log_dir_env - if not worker_log_dir_to_use: - default_worker_log_dir = self.logs_dir / "math_workers" - try: - default_worker_log_dir.mkdir(parents=True, exist_ok=True) - worker_log_dir_to_use = str(default_worker_log_dir) - except Exception as exc: - self.logger.warning( - "Unable to prepare worker log directory %s: %s", - default_worker_log_dir, - exc, + try: + last_summary = time.time() + while active: + for proc in list(active): + proc.join(timeout=0.05) + if proc.is_alive(): + continue + active.remove(proc) + if proc in procs: + procs.remove(proc) + pid = proc.pid or -1 + gpu_id 
= proc_gpu.pop(pid, None) + exitcode = proc.exitcode + stems_for_skip: List[str] = [] + if gpu_id is not None: + current_entry = status_map.pop(gpu_id, None) + if current_entry: + if isinstance(current_entry, (list, tuple, set)): + entries = list(current_entry) + else: + entries = [current_entry] + stems_for_skip = [str(item) for item in entries if item] + marker_path = marker_files.get(gpu_id) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass + if exitcode not in (0, None) and gpu_id is not None: + if stems_for_skip and skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in stems_for_skip) + self.logger.warning("Math worker GPU%s exited with %s", gpu_id, exitcode) + respawn_counts[gpu_id] = respawn_counts.get(gpu_id, 0) + 1 + attempts = respawn_counts[gpu_id] + if respawn_cap and attempts > respawn_cap: + self.logger.error( + "Math worker GPU%s exceeded respawn cap (%s); not respawning", + gpu_id, + respawn_cap, ) - worker_log_dir_to_use = None - if worker_log_dir_to_use: - os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_to_use - marker_base = Path(worker_log_dir_to_use) if worker_log_dir_to_use else (self.logs_dir / "math_workers") - try: - marker_base.mkdir(parents=True, exist_ok=True) - except Exception: - pass - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} - - procs: List[Any] = [] - active: List[Any] = [] - proc_gpu: Dict[int, int] = {} - try: - respawn_cap = int(os.environ.get("GLOSSAPI_MATH_RESPAWN_CAP", "5")) - except Exception: - respawn_cap = 5 - respawn_cap = max(0, respawn_cap) - respawn_counts: Dict[int, int] = {dev_id: 0 for dev_id in devs} - - for dev_id in devs: - p = ctx.Process( + continue + replacement = ctx.Process( target=_gpu_math_worker, args=( - dev_id, + gpu_id, str(self.input_dir), str(self.output_dir), work_q, - int(math_batch_size), - int(math_dpi_base), - device or "cuda", + int(request.math_batch_size), + 
int(request.math_dpi_base), + request.device or "cuda", local_targets or {}, result_q, status_map, str(marker_base), ), ) - p.start() - procs.append(p) - active.append(p) - if p.pid is not None: - proc_gpu[p.pid] = dev_id + replacement.start() + procs.append(replacement) + active.append(replacement) + if replacement.pid is not None: + proc_gpu[replacement.pid] = gpu_id + continue + while True: try: - last_summary = time.time() - while active: - for p in list(active): - p.join(timeout=0.05) - if p.is_alive(): - continue - active.remove(p) - if p in procs: - procs.remove(p) - pid = p.pid or -1 - gpu_id = proc_gpu.pop(pid, None) - exitcode = p.exitcode - stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) - if current_entry: - if isinstance(current_entry, (list, tuple, set)): - entries = list(current_entry) - else: - entries = [current_entry] - stems_for_skip = [str(item) for item in entries if item] - marker_path = marker_files.get(gpu_id) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass - if exitcode not in (0, None) and gpu_id is not None: - if stems_for_skip: - skip_mgr.add(canonical_stem(s) for s in stems_for_skip) - self.logger.warning( - "Math worker GPU%s exited with %s", - gpu_id, - exitcode, - ) - respawn_counts[gpu_id] = respawn_counts.get(gpu_id, 0) + 1 - attempts = respawn_counts[gpu_id] - if respawn_cap and attempts > respawn_cap: - self.logger.error( - "Math worker GPU%s exceeded respawn cap (%s); not respawning", - gpu_id, - respawn_cap, - ) - continue - replacement = ctx.Process( - target=_gpu_math_worker, - args=( - gpu_id, - str(self.input_dir), - str(self.output_dir), - work_q, - int(math_batch_size), - int(math_dpi_base), - device or "cuda", - local_targets or {}, - result_q, - status_map, - str(marker_base), - ), - ) - replacement.start() - procs.append(replacement) - active.append(replacement) - if replacement.pid is not None: - proc_gpu[replacement.pid] 
= gpu_id - continue - - while True: - try: - event = result_q.get_nowait() - except queue.Empty: - break - if not event: - continue - if event.get("event") == "math_batch": - stems_bad = event.get("problematic", []) - if stems_bad: - skip_mgr.add(canonical_stem(s) for s in stems_bad) - worker = event.get("worker") - try: - worker_gpu = int(worker) - except Exception: - worker_gpu = None - if worker_gpu is not None: - status_map.pop(worker_gpu, None) - marker_path = marker_files.get(worker_gpu) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass - elif event.get("event") == "exit" and event.get("exitcode", 0) not in (0, None): - self.logger.warning( - "Math worker GPU%s reported exit code %s", - event.get("worker"), - event.get("exitcode"), - ) - - now = time.time() - if now - last_summary > 30: - try: - qsize = work_q.qsize() - except NotImplementedError: - qsize = -1 - self.logger.info( - "Math progress: queue=%d active_workers=%d", - qsize, - len(active), - ) - last_summary = now - - if not active: - break - remaining_after_cap: List[str] = [] - try: - while True: - item = work_q.get_nowait() - if isinstance(item, str) and item.strip(): - remaining_after_cap.append(item) - except queue.Empty: - pass - if remaining_after_cap: - skip_mgr.add(canonical_stem(s) for s in remaining_after_cap) - self.logger.error( - "No active math workers remain; skipped %d pending item(s)", - len(remaining_after_cap), - ) - finally: - for p in procs: - if p.is_alive(): - p.join() + event = result_q.get_nowait() + except queue.Empty: + break + if not event: + continue + if event.get("event") == "math_batch": + stems_bad = event.get("problematic", []) + if stems_bad and skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in stems_bad) + worker = event.get("worker") try: - manager.shutdown() + worker_gpu = int(worker) except Exception: - pass - if worker_log_dir_env is not None: - os.environ["GLOSSAPI_WORKER_LOG_DIR"] = 
worker_log_dir_env - else: - os.environ.pop("GLOSSAPI_WORKER_LOG_DIR", None) - return - # Single-GPU path - self.formula_enrich_from_json( - files=stems, - device=(device or "cuda"), - batch_size=int(math_batch_size), - dpi_base=int(math_dpi_base), - targets_by_stem=local_targets, - ) - - # Branches - if mode_norm == "math_only": - if not math_enhance: - self.logger.info("OCR: fix_bad=False and math_enhance=False → nothing to do") - return - # Math-only: ensure JSON exists; if not, generate without OCR - json_dir = self.output_dir / "json" - stems: List[str] = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - # Do not generate layout JSON here; Phase‑1 is responsible for JSON artifacts. - # Never run math on files that need OCR - if bad_files: - before = len(stems) - bad_set = {canonical_stem(s) for s in bad_files} - stems = [s for s in stems if s not in bad_set] - removed = before - len(stems) - if removed: - try: - self.logger.info( - "Math-only: skipping %d document(s) flagged for OCR", - removed, - ) - except Exception: - pass - if not reprocess_completed and stems and parquet_meta is not None: - if math_done_stems: - before = len(stems) - stems = [s for s in stems if s not in math_done_stems] - removed = before - len(stems) - if removed: - self.logger.info( - "Math enrichment: skipping %d already enriched document(s) (reprocess_completed=False).", - removed, + worker_gpu = None + if worker_gpu is not None: + status_map.pop(worker_gpu, None) + marker_path = marker_files.get(worker_gpu) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass + elif event.get("event") == "exit" and event.get("exitcode", 0) not in (0, None): + self.logger.warning( + "Math worker GPU%s reported exit code %s", + event.get("worker"), + event.get("exitcode"), ) - _run_math(stems) - return - - # 'ocr_bad' and 'ocr_bad_then_math' paths: OCR bad files first - if mode_norm in {"ocr_bad", 
"ocr_bad_then_math"} and not bad_files: - self.logger.info("OCR: no bad documents flagged by cleaner; skipping OCR fix") - if mode_norm == "ocr_bad_then_math": - json_dir = self.output_dir / "json" - stems = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - _run_math(stems) - return - - reran_ocr = False - if mode_norm in {"ocr_bad", "ocr_bad_then_math"}: - if backend_norm == "deepseek": - # DeepSeek path: run OCR via dedicated runner (no Docling JSON) - from glossapi.ocr.deepseek import runner as _deepseek_runner # type: ignore - - try: - _deepseek_runner.run_for_files( - self, - bad_files, - model_dir=Path(model_dir) if model_dir else None, - max_pages=max_pages, - persist_engine=bool(persist_engine), - precision=precision, - device=device, - use_gpus=use_gpus, - devices=devices, - workers_per_gpu=int(max(1, workers_per_gpu)), - runtime_backend=runtime_backend, - ocr_profile=ocr_profile, - prompt_override=prompt_override, - attn_backend=attn_backend, - base_size=base_size, - image_size=image_size, - crop_mode=crop_mode, - render_dpi=render_dpi, - max_new_tokens=max_new_tokens, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - vllm_batch_size=vllm_batch_size, - gpu_memory_utilization=gpu_memory_utilization, - disable_fp8_kv=disable_fp8_kv, - repair_mode=repair_mode, - repair_exec_batch_target_pages=repair_exec_batch_target_pages, - repair_exec_batch_target_items=repair_exec_batch_target_items, - scheduler=scheduler, - target_batch_pages=int(max(1, target_batch_pages)), - shard_pages=int(max(0, shard_pages)), - shard_threshold_pages=int(max(0, shard_threshold_pages)), - content_debug=bool(content_debug), + now = time.time() + if now - last_summary > 30: + try: + qsize = work_q.qsize() + except NotImplementedError: + qsize = -1 + self.logger.info( + "Math progress: queue=%d active_workers=%d", + qsize, + len(active), ) - except Exception as _e: - 
self.logger.error("DeepSeek OCR runner failed: %s", _e) - raise - reran_ocr = True - # Update metadata to reflect successful OCR reruns - try: - from glossapi.parquet_schema import ParquetSchema as _ParquetSchema - - success_files: List[str] = [] - for _fname in bad_files: - stem = canonical_stem(_fname) - if (self.markdown_dir / f"{stem}.md").exists(): - success_files.append(_fname) + last_summary = now - if success_files: - parquet_schema = _ParquetSchema({"url_column": self.url_column}) - parquet_path = self._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) - if parquet_path and parquet_path.exists(): - import pandas as _pd + if not active: + break - df_meta = _pd.read_parquet(parquet_path) - df_meta = _apply_ocr_success_updates( - df_meta, - filenames=success_files, - markdown_dir=self.markdown_dir, - metrics_dir=self.output_dir / "json" / "metrics", - backend_norm=backend_norm, - ) - self._cache_metadata_parquet(parquet_path) - parquet_schema.write_metadata_parquet(df_meta, parquet_path) - # Keep sectioner in sync with newly recovered files - try: - stems = [canonical_stem(_f) for _f in success_files] - if hasattr(self, "good_files"): - for _stem in stems: - if _stem not in getattr(self, "good_files", []): - self.good_files.append(_stem) - except Exception: - pass - except Exception as _e: - self.logger.warning("Failed to update OCR success metadata: %s", _e) - - if reran_ocr: + remaining_after_cap: List[str] = [] try: - self.logger.info("Re-running Rust cleaner after OCR rerun to refresh metrics") - self.clean( - input_dir=self.markdown_dir, - drop_bad=False, + while True: + item = work_q.get_nowait() + if isinstance(item, str) and item.strip(): + remaining_after_cap.append(item) + except queue.Empty: + pass + if remaining_after_cap: + if skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in remaining_after_cap) + self.logger.error( + "No active math workers remain; skipped %d pending item(s)", + 
len(remaining_after_cap), ) - except Exception as _e: - self.logger.warning("Cleaner refresh after OCR failed: %s", _e) - - if mode_norm == "ocr_bad_then_math": + finally: + for proc in procs: + if proc.is_alive(): + proc.join() try: - # Run math only on documents that do NOT require OCR - json_dir = self.output_dir / "json" - stems: List[str] = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - bad_set = {canonical_stem(f) for f in bad_files} - if stems: - # When OCR was rerun we now want math on all stems (bad_set included). - # Only skip bad_set when no rerun happened. - if not reran_ocr: - stems = [s for s in stems if s not in bad_set] - if not reprocess_completed: - if math_done_stems: - before = len(stems) - stems = [s for s in stems if s not in math_done_stems] - removed = before - len(stems) - if removed: - self.logger.info( - "Math enrichment: skipping %d already enriched document(s) (reprocess_completed=False).", - removed, - ) - if not stems: - self.logger.info("Math enrichment: no pending documents after filtering.") - return - # Best-effort: ensure placeholder sidecars for metadata-selected math targets - try: - from glossapi.parquet_schema import ParquetSchema as _ParquetSchema - _ps = _ParquetSchema({"url_column": self.url_column}) - _pq = self._resolve_metadata_parquet(_ps, ensure=True, search_input=True) - except Exception: - _pq = None - if _pq and _pq.exists(): - try: - import pandas as _pd, json as _json - _df = _pd.read_parquet(_pq) - if "filename" in _df.columns: - _df['stem'] = _df['filename'].astype(str).str.replace(r"\.pdf$", "", regex=True) - _phase = _df['phase_recommended'].astype(str) == '2A' if 'phase_recommended' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _ft = (_df['formula_total'].fillna(0).astype('float') > 0) if 'formula_total' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _med = 
(_df['math_equations_detected'].fillna(0).astype('float') > 0) if 'math_equations_detected' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _mask = _phase | _ft | _med - _parq_stems = set(_df.loc[_mask, 'stem'].dropna().astype(str).tolist()) - if _parq_stems: - sc_dir = self.output_dir / 'sidecars' / 'math' - sc_dir.mkdir(parents=True, exist_ok=True) - for _s in (set(stems) | _parq_stems): - _p = sc_dir / f"{_s}.json" - if not _p.exists(): - _p.write_text(_json.dumps({"items": 0, "accepted": 0, "time_sec": 0.0}, ensure_ascii=False), encoding='utf-8') - except Exception: - pass - try: - self.logger.info("OCR: invoking Phase-2 math for stems: %s", ",".join(stems)) - except Exception: - pass - _run_math(stems) - try: - self.logger.info("OCR: Phase-2 math completed for stems: %s", ",".join(stems)) - except Exception: - pass - except Exception as _e: - self.logger.warning("Phase‑2 enrichment after OCR failed: %s", _e) + manager.shutdown() + except Exception: + pass + if worker_log_dir_env is not None: + os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_env + else: + os.environ.pop("GLOSSAPI_WORKER_LOG_DIR", None) def formula_enrich_from_json( self, diff --git a/src/glossapi/ocr/deepseek/defaults.py b/src/glossapi/ocr/deepseek/defaults.py new file mode 100644 index 0000000..c36309c --- /dev/null +++ b/src/glossapi/ocr/deepseek/defaults.py @@ -0,0 +1,27 @@ +"""Canonical DeepSeek OCR defaults shared across orchestration and CLIs.""" + +from __future__ import annotations + +from typing import Optional + +DEFAULT_RUNTIME_BACKEND = "transformers" +DEFAULT_OCR_PROFILE = "markdown_grounded" +DEFAULT_ATTN_BACKEND = "auto" +DEFAULT_RENDER_DPI = 144 +DEFAULT_MAX_NEW_TOKENS = 2048 +DEFAULT_GPU_MEMORY_UTILIZATION = 0.9 +DEFAULT_REPAIR_MODE = "auto" +DEFAULT_WORKERS_PER_GPU = 1 +DEFAULT_TARGET_BATCH_PAGES = 160 + + +def resolve_render_dpi(value: Optional[int]) -> int: + """Return the canonical render DPI, even when callers pass ``None``.""" + + return 
DEFAULT_RENDER_DPI if value is None else int(value) + + +def resolve_gpu_memory_utilization(value: Optional[float]) -> float: + """Return the canonical vLLM memory target, even when callers pass ``None``.""" + + return DEFAULT_GPU_MEMORY_UTILIZATION if value is None else float(value) diff --git a/tests/test_corpus_ocr_modules.py b/tests/test_corpus_ocr_modules.py new file mode 100644 index 0000000..95561b9 --- /dev/null +++ b/tests/test_corpus_ocr_modules.py @@ -0,0 +1,102 @@ +from pathlib import Path + +import pandas as pd + +from glossapi import Corpus +from glossapi.corpus.ocr.artifacts import apply_ocr_success_updates +from glossapi.corpus.ocr.config import normalize_ocr_request +from glossapi.corpus.ocr.targets import build_ocr_selection +from glossapi.ocr.deepseek.defaults import DEFAULT_GPU_MEMORY_UTILIZATION, DEFAULT_RENDER_DPI + + +def _mk_corpus(tmp_path: Path) -> Corpus: + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_normalize_ocr_request_uses_shared_vllm_defaults(tmp_path): + corpus = _mk_corpus(tmp_path) + + request = normalize_ocr_request( + logger=corpus.logger, + fix_bad=True, + mode="ocr_bad", + backend="deepseek", + device=None, + model_dir=None, + max_pages=None, + persist_engine=True, + precision=None, + runtime_backend="vllm", + render_dpi=None, + gpu_memory_utilization=None, + math_enhance=False, + force=None, + reprocess_completed=None, + skip_existing=None, + ) + + assert request is not None + assert request.render_dpi == DEFAULT_RENDER_DPI + assert request.gpu_memory_utilization == DEFAULT_GPU_MEMORY_UTILIZATION + + +def test_build_ocr_selection_collapses_chunk_rows_and_skips_completed(tmp_path): + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "needs.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + {"filename": "needs.pdf", corpus.url_column: "", "needs_ocr": True, 
"ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + {"filename": "done.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": True}, + ] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + + selection = build_ocr_selection( + corpus, + mode="ocr_bad", + reprocess_completed=False, + ) + + assert selection.bad_files == ["needs.pdf"] + assert selection.ocr_candidates_initial == 2 + assert selection.skipped_completed == 1 + assert selection.skipped_skiplist == 0 + assert selection.ocr_done_stems == {"done"} + + +def test_apply_ocr_success_updates_maps_canonical_artifacts_by_stem(tmp_path): + markdown_dir = tmp_path / "markdown" + metrics_dir = tmp_path / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + + (markdown_dir / "needs.md").write_text("fixed markdown\n", encoding="utf-8") + (metrics_dir / "needs.metrics.json").write_text('{"page_count": 1}\n', encoding="utf-8") + + df = pd.DataFrame( + [ + {"filename": "needs.pdf", "needs_ocr": True, "ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", "needs_ocr": True, "ocr_success": False}, + ] + ) + + updated = apply_ocr_success_updates( + df, + filenames=["needs.pdf"], + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + backend_norm="deepseek", + ).set_index("filename") + + assert bool(updated.loc["needs.pdf", "ocr_success"]) is True + assert bool(updated.loc["needs__p0001-0002.pdf", "ocr_success"]) is True + assert updated.loc["needs.pdf", "text"] == "fixed markdown\n" + assert updated.loc["needs__p0001-0002.pdf", "text"] == "fixed markdown\n" + assert updated.loc["needs.pdf", "ocr_markdown_relpath"] == "markdown/needs.md" + assert updated.loc["needs__p0001-0002.pdf", "ocr_metrics_relpath"] == "json/metrics/needs.metrics.json" + assert updated.loc["needs.pdf", "extraction_mode"] == "deepseek" From c13ba21d4b9443587f1fdcbb6a350e35bd886301 
Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 20:24:55 +0300 Subject: [PATCH 74/93] unify OCR cleaner and speed up shared repeats --- docs/index.md | 1 + docs/ocr_noise_failure_modes.md | 118 + docs/ocr_repetition_policy.md | 35 + rust/glossapi_rs_cleaner/Cargo.lock | 5 + rust/glossapi_rs_cleaner/Cargo.toml | 1 + .../src/cleaning_module.rs | 39 +- rust/glossapi_rs_common/Cargo.lock | 7 + rust/glossapi_rs_common/Cargo.toml | 11 + rust/glossapi_rs_common/src/lib.rs | 159 + rust/glossapi_rs_noise/Cargo.lock | 82 + rust/glossapi_rs_noise/Cargo.toml | 2 + rust/glossapi_rs_noise/src/lib.rs | 366 ++- rust/glossapi_rs_noise/src/noise_metrics.rs | 2382 +++++++++++++- src/glossapi/corpus/phase_clean.py | 2775 ++++++++++++++++- .../scripts/build_ocr_golden_pages.py | 223 ++ .../scripts/review_manifest_materialize.py | 156 + src/glossapi/scripts/table_markdown_audit.py | 522 ++++ .../scripts/table_sentence_context_review.py | 256 ++ tests/test_corpus_clean_enhancements.py | 1348 ++++++++ tests/test_ocr_golden_pages.py | 75 + 20 files changed, 8360 insertions(+), 203 deletions(-) create mode 100644 docs/ocr_noise_failure_modes.md create mode 100644 docs/ocr_repetition_policy.md create mode 100644 rust/glossapi_rs_common/Cargo.lock create mode 100644 rust/glossapi_rs_common/Cargo.toml create mode 100644 rust/glossapi_rs_common/src/lib.rs create mode 100644 src/glossapi/scripts/build_ocr_golden_pages.py create mode 100644 src/glossapi/scripts/review_manifest_materialize.py create mode 100644 src/glossapi/scripts/table_markdown_audit.py create mode 100644 src/glossapi/scripts/table_sentence_context_review.py create mode 100644 tests/test_ocr_golden_pages.py diff --git a/docs/index.md b/docs/index.md index 997d2d8..cb15dca 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,6 +11,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Code Map](code_map.md) links the main documentation ideas to the classes and files that 
implement them. - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. +- [OCR Repetition Policy](ocr_repetition_policy.md) pins the default repetition thresholds for word and LaTeX cleaning. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. ## Configure and debug diff --git a/docs/ocr_noise_failure_modes.md b/docs/ocr_noise_failure_modes.md new file mode 100644 index 0000000..6017e9c --- /dev/null +++ b/docs/ocr_noise_failure_modes.md @@ -0,0 +1,118 @@ +# OCR Noise Failure Modes + +Status: example bank for future `Corpus.clean_ocr(...)` heuristics. These are notes only, not implemented cleaning rules. + +## Why This Exists + +The preserved OCR outputs contain several distinct failure modes that should not be collapsed into one generic `ocr_noise` rule. Some are page-local low-entropy collapses, some are encoding/control-character tails, and some are repetitive math-token artifacts that need math-aware handling. + +The examples below were reviewed on April 3, 2026 from the preserved OCR lane: + +- `/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown` + +## Group 1: Page-Local Low-Entropy Numeric Collapse + +Definition: +pages that collapse into highly repetitive short numeric lines, often immediately after a page split marker. 
+ +Examples: + +- `ABO_768__p00001-00096.md` + - around line 955 the page turns into repeated `0`, `0 0`, `0 0 0` + - the collapse begins directly after `<--- Page Split --->` +- `ACH_787__p00001-00096.md` + - around line 755 the page turns into repeated `1.1` and occasional `1` + - this also begins directly after `<--- Page Split --->` + +Anchored references: + +- [ABO_768__p00001-00096.md:955](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ABO_768__p00001-00096.md#L955) +- [ACH_787__p00001-00096.md:755](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ACH_787__p00001-00096.md#L755) + +Detection ideas: + +- page-level repeated-line detection, not just single-line run detection +- low token entropy on a page-sized region +- special weight if the collapse starts right after `<--- Page Split --->` +- repeated short numeric lines should be treated separately from legitimate tables or lists + +Important note: +the current OCR numeric-noise check is line-local and is better at catching long same-number or ascending sequences inside one line than these repeated-line page collapses. + +## Group 2: Control-Character / Encoding-Garbage Tails + +Definition: +pages that devolve into non-printable or control-like characters, often after otherwise valid text. 
+ +Example: + +- `ADQ_670.md` + - after a page split, the page contains `%` followed by C1/control-like junk such as `€`, ``, `‚`, ..., `°` + - this is not just numeric repetition; it looks like decoding/binary leakage or severe mojibake-like corruption + +Anchored references: + +- [ADQ_670.md:887](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L887) +- [ADQ_670.md:954](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L954) + +Detection ideas: + +- count non-printable/control codepoints +- count dense runs of extended control-like characters on a page +- flag abrupt transitions from valid prose to control-character tails +- keep this separate from ordinary mojibake and separate from numeric collapse + +## Group 3: Repetitive Math-Token Floods + +Definition: +pages or page segments that repeat the same LaTeX-like math atoms or malformed math atoms many times. + +Examples: + +- `ADS_856__p00001-00014.md` + - repeated `\( \gamma \)` sequence on one line +- `ADS_856__p00015-00082.md` + - repeated `\( \Delta_{v} \)` blocks + - malformed variants like `\( \Deltav \)` + - long concatenated runs like `\Delta_{v}\Delta_{v}\Delta_{v}...` + +Anchored references: + +- [ADS_856__p00001-00014.md:139](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00001-00014.md#L139) +- [ADS_856__p00015-00082.md:1](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00015-00082.md#L1) + +Detection ideas: + +- tokenize LaTeX-like math atoms and detect repeated-token floods +- distinguish valid repeated notation from pathological repetition +- score malformed math variants separately from valid math tokens +- this should remain an experimental detector, not a blunt drop rule + +Important note: +real mathematical texts can legitimately repeat symbols, so this class needs a math-aware heuristic rather than a general 
repetition penalty. + +## Grouping Recommendation + +Do not collapse all of the above into one rule. + +Recommended future flags: + +- `ocr_numeric_page_collapse` +- `ocr_control_char_tail` +- `ocr_math_repetition` + +Recommended future metadata: + +- page-local region counts +- page-split proximity flags +- repeated-line entropy or uniqueness ratio +- control-character density +- math-token repetition density + +## Current Examples To Keep Around + +- [ABO_768__p00001-00096.md:955](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ABO_768__p00001-00096.md#L955) +- [ACH_787__p00001-00096.md:755](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ACH_787__p00001-00096.md#L755) +- [ADQ_670.md:887](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L887) +- [ADS_856__p00001-00014.md:139](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00001-00014.md#L139) +- [ADS_856__p00015-00082.md:1](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00015-00082.md#L1) diff --git a/docs/ocr_repetition_policy.md b/docs/ocr_repetition_policy.md new file mode 100644 index 0000000..6b64ca7 --- /dev/null +++ b/docs/ocr_repetition_policy.md @@ -0,0 +1,35 @@ +# OCR Repetition Policy + +This document pins the intended default repetition thresholds for OCR-cleaner development so they do not drift silently. 
+ +## Defaults + +- Shared word repetition threshold: `4` +- Shared LaTeX repetition threshold: `4` +- Shared minimum repeat period: `3` +- Shared repeat window: `96` + +These defaults apply to the combined OCR debug annotator: +- `Corpus.clean_ocr_numeric_word_debug_docs(...)` + +In that pipeline: +- numeric detection runs first +- LaTeX detection runs second +- shared repeat detection runs last on the remaining untagged text + +## Scope + +These defaults are for: +- word repetition +- LaTeX repetition + +They do not override numeric-specific detectors, which have their own thresholds such as: +- ascending numeric progressions +- compact repeated numeric atoms +- same-digit numeric runs + +## Design Intent + +- A default of `4` is meant to reduce borderline `3`-repeat matches. +- Locality matters more than page-wide reuse, especially for LaTeX. +- Repeated symbols or notation used normally across a page should not be treated as cleaner targets by default. diff --git a/rust/glossapi_rs_cleaner/Cargo.lock b/rust/glossapi_rs_cleaner/Cargo.lock index a3aabd3..07298d7 100644 --- a/rust/glossapi_rs_cleaner/Cargo.lock +++ b/rust/glossapi_rs_cleaner/Cargo.lock @@ -614,6 +614,7 @@ dependencies = [ "chrono", "csv", "futures", + "glossapi_rs_common", "htmlentity", "lazy_static", "memchr", @@ -626,6 +627,10 @@ dependencies = [ "walkdir", ] +[[package]] +name = "glossapi_rs_common" +version = "0.1.0" + [[package]] name = "half" version = "2.6.0" diff --git a/rust/glossapi_rs_cleaner/Cargo.toml b/rust/glossapi_rs_cleaner/Cargo.toml index 7213bc7..e3974ce 100644 --- a/rust/glossapi_rs_cleaner/Cargo.toml +++ b/rust/glossapi_rs_cleaner/Cargo.toml @@ -26,6 +26,7 @@ memchr = "2" aho-corasick = "1" htmlentity = "~1.3.0" chrono = { version = "=0.4.33", features = ["serde"] } +glossapi_rs_common = { path = "../glossapi_rs_common" } [tool.maturin] bindings = "pyo3-abi3-py38" diff --git a/rust/glossapi_rs_cleaner/src/cleaning_module.rs b/rust/glossapi_rs_cleaner/src/cleaning_module.rs 
index 9b52551..823ab1c 100644 --- a/rust/glossapi_rs_cleaner/src/cleaning_module.rs +++ b/rust/glossapi_rs_cleaner/src/cleaning_module.rs @@ -1,4 +1,5 @@ use aho_corasick::AhoCorasick; +use glossapi_rs_common::scan_script_metrics; use htmlentity::entity::{decode, ICodedDataTrait}; use lazy_static::lazy_static; use memchr::memchr; // For Step 5.1 @@ -548,31 +549,21 @@ pub fn perform_text_analysis( // This block already calculates cleaned_non_whitespace_chars_val correctly after cleaning if calculate_specific_counts { - let mut current_greek_count = 0; - let mut current_latin_count = 0; - let mut current_cleaned_non_ws_count = 0; - - let greek_set = SCRIPT_SETS.get("greek").cloned().unwrap_or_default(); - let latin_set = SCRIPT_SETS.get("latin").cloned().unwrap_or_default(); - - for ch in cleaned_text.chars() { - if !ch.is_whitespace() { - current_cleaned_non_ws_count += 1; - } - if scripts_for_percentage_and_specific_counts.contains(&"greek".to_string()) - && greek_set.contains(&ch) - { - current_greek_count += 1; - } - if scripts_for_percentage_and_specific_counts.contains(&"latin".to_string()) - && latin_set.contains(&ch) - { - current_latin_count += 1; - } + let metrics = scan_script_metrics(&cleaned_text); + let include_greek = scripts_for_percentage_and_specific_counts + .iter() + .any(|script| script == "greek"); + let include_latin = scripts_for_percentage_and_specific_counts + .iter() + .any(|script| script == "latin"); + + if include_greek { + greek_char_count_cleaned = Some(metrics.greek_char_count as usize); + } + if include_latin { + latin_char_count_cleaned = Some(metrics.latin_char_count as usize); } - greek_char_count_cleaned = Some(current_greek_count); - latin_char_count_cleaned = Some(current_latin_count); - cleaned_non_whitespace_chars_val = Some(current_cleaned_non_ws_count); + cleaned_non_whitespace_chars_val = Some(metrics.non_whitespace_chars as usize); } else { cleaned_non_whitespace_chars_val = Some(cleaned_text.chars().filter(|c| 
!c.is_whitespace()).count()); diff --git a/rust/glossapi_rs_common/Cargo.lock b/rust/glossapi_rs_common/Cargo.lock new file mode 100644 index 0000000..4fc9d61 --- /dev/null +++ b/rust/glossapi_rs_common/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "glossapi_rs_common" +version = "0.1.0" diff --git a/rust/glossapi_rs_common/Cargo.toml b/rust/glossapi_rs_common/Cargo.toml new file mode 100644 index 0000000..594fc96 --- /dev/null +++ b/rust/glossapi_rs_common/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "glossapi_rs_common" +version = "0.1.0" +edition = "2021" +authors = ["GlossAPI Team "] +description = "Shared Rust script-analysis helpers for GlossAPI" +license = "EUPL-1.2" + +[lib] +name = "glossapi_rs_common" +path = "src/lib.rs" diff --git a/rust/glossapi_rs_common/src/lib.rs b/rust/glossapi_rs_common/src/lib.rs new file mode 100644 index 0000000..a34f2a9 --- /dev/null +++ b/rust/glossapi_rs_common/src/lib.rs @@ -0,0 +1,159 @@ +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct ScriptMetrics { + pub non_whitespace_chars: u64, + pub greek_char_count: u64, + pub latin_char_count: u64, + pub greek_word_count: u64, + pub polytonic_word_count: u64, +} + +impl ScriptMetrics { + #[inline] + pub fn percentage_greek(&self) -> f64 { + if self.non_whitespace_chars > 0 { + 100.0 * self.greek_char_count as f64 / self.non_whitespace_chars as f64 + } else { + 0.0 + } + } + + #[inline] + pub fn latin_percentage(&self) -> f64 { + if self.non_whitespace_chars > 0 { + 100.0 * self.latin_char_count as f64 / self.non_whitespace_chars as f64 + } else { + 0.0 + } + } + + #[inline] + pub fn polytonic_ratio(&self) -> f64 { + if self.greek_word_count > 0 { + self.polytonic_word_count as f64 / self.greek_word_count as f64 + } else { + 0.0 + } + } +} + +#[derive(Debug, Clone, Default)] +pub struct ScriptScanner { + metrics: ScriptMetrics, + token_has_greek: bool, + 
token_has_polytonic: bool, + in_token: bool, +} + +impl ScriptScanner { + #[inline] + pub fn new() -> Self { + Self::default() + } + + #[inline] + pub fn observe_char(&mut self, ch: char) { + if ch.is_whitespace() { + self.finish_token(); + return; + } + + self.in_token = true; + self.metrics.non_whitespace_chars += 1; + + let cp = ch as u32; + if is_greek(cp) { + self.metrics.greek_char_count += 1; + self.token_has_greek = true; + if is_polytonic_codepoint(cp) { + self.token_has_polytonic = true; + } + } else if is_ascii_latin(cp) { + self.metrics.latin_char_count += 1; + } else if is_combining_mark(cp) { + self.token_has_polytonic = true; + } + } + + #[inline] + pub fn observe_str(&mut self, text: &str) { + for ch in text.chars() { + self.observe_char(ch); + } + } + + #[inline] + pub fn finish_token(&mut self) { + if !self.in_token { + return; + } + if self.token_has_greek { + self.metrics.greek_word_count += 1; + if self.token_has_polytonic { + self.metrics.polytonic_word_count += 1; + } + } + self.in_token = false; + self.token_has_greek = false; + self.token_has_polytonic = false; + } + + #[inline] + pub fn finish(mut self) -> ScriptMetrics { + self.finish_token(); + self.metrics + } +} + +#[inline(always)] +pub fn is_greek(cp: u32) -> bool { + (0x0370..=0x03FF).contains(&cp) || (0x1F00..=0x1FFF).contains(&cp) +} + +#[inline(always)] +pub fn is_combining_mark(cp: u32) -> bool { + (0x0300..=0x036F).contains(&cp) + || (0x1DC0..=0x1DFF).contains(&cp) + || (0x20D0..=0x20FF).contains(&cp) +} + +#[inline(always)] +pub fn is_ascii_latin(cp: u32) -> bool { + (0x41..=0x5A).contains(&cp) || (0x61..=0x7A).contains(&cp) +} + +#[inline(always)] +pub fn is_polytonic_codepoint(cp: u32) -> bool { + (0x1F00..=0x1FFF).contains(&cp) +} + +#[inline] +pub fn scan_script_metrics(text: &str) -> ScriptMetrics { + let mut scanner = ScriptScanner::new(); + scanner.observe_str(text); + scanner.finish() +} + +#[cfg(test)] +mod tests { + use super::{scan_script_metrics, ScriptScanner}; + 
+ #[test] + fn scanner_counts_greek_latin_and_polytonic_words() { + let metrics = scan_script_metrics("Αυτή abc Καὶ"); + assert!(metrics.greek_char_count > 0); + assert_eq!(metrics.latin_char_count, 3); + assert_eq!(metrics.greek_word_count, 2); + assert_eq!(metrics.polytonic_word_count, 1); + assert!(metrics.percentage_greek() > metrics.latin_percentage()); + } + + #[test] + fn scanner_flushes_on_line_boundaries() { + let mut scanner = ScriptScanner::new(); + scanner.observe_str("Καὶ\n"); + scanner.observe_str("αὕτη"); + let metrics = scanner.finish(); + assert_eq!(metrics.greek_word_count, 2); + assert_eq!(metrics.polytonic_word_count, 2); + } +} diff --git a/rust/glossapi_rs_noise/Cargo.lock b/rust/glossapi_rs_noise/Cargo.lock index 3c09979..f38df1f 100644 --- a/rust/glossapi_rs_noise/Cargo.lock +++ b/rust/glossapi_rs_noise/Cargo.lock @@ -87,16 +87,33 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glossapi_rs_common" +version = "0.1.0" + [[package]] name = "glossapi_rs_noise" version = "0.1.0" dependencies = [ "anyhow", "csv", + "glossapi_rs_common", "lazy_static", "memmap2", "once_cell", "pyo3", + "rand", "rayon", "regex", "walkdir", @@ -189,6 +206,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -267,6 +293,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.10.0" @@ -422,6 +478,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "winapi-util" version = "0.1.9" @@ -503,3 +565,23 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] diff --git a/rust/glossapi_rs_noise/Cargo.toml b/rust/glossapi_rs_noise/Cargo.toml index 8dfa7bc..fd4cebc 100644 --- a/rust/glossapi_rs_noise/Cargo.toml +++ b/rust/glossapi_rs_noise/Cargo.toml @@ -20,3 
+20,5 @@ csv = "1.3.0" pyo3 = { version = "0.19.0", features = ["extension-module", "abi3-py38", "macros"] } anyhow = "1" regex = "1.10" +glossapi_rs_common = { path = "../glossapi_rs_common" } +rand = { version = "0.8", features = ["std_rng"] } diff --git a/rust/glossapi_rs_noise/src/lib.rs b/rust/glossapi_rs_noise/src/lib.rs index 33ae607..e6f09bf 100644 --- a/rust/glossapi_rs_noise/src/lib.rs +++ b/rust/glossapi_rs_noise/src/lib.rs @@ -2,47 +2,93 @@ mod noise_metrics; -use pyo3::prelude::*; -use pyo3::types::PyTuple; use noise_metrics::{ - score_markdown_file_internal, - score_markdown_directory_internal, - score_markdown_file_detailed_internal, + annotate_numeric_debug_page_internal, evaluate_page_character_noise_internal, + export_numeric_match_debug_pages_internal, export_ocr_match_debug_pages_internal, + find_numeric_debug_page_spans_internal, find_word_repeat_spans_internal, score_markdown_directory_detailed_internal, + score_markdown_directory_internal, score_markdown_directory_ocr_profile_internal, + score_markdown_file_detailed_internal, score_markdown_file_internal, }; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use pyo3::types::PyTuple; /// Compute the badness score for a single markdown file. /// Returns the numeric score as `float`. #[pyfunction] fn score_markdown_file(path: &str) -> PyResult { - score_markdown_file_internal(std::path::Path::new(path)).map_err(|e| PyErr::new::(e.to_string())) + score_markdown_file_internal(std::path::Path::new(path)) + .map_err(|e| PyErr::new::(e.to_string())) } /// Compute badness scores for all `.md` files under `input_dir` in parallel. /// The result is a list of `(file_path, score, latin_percentage)` tuples. 
#[pyfunction] -fn score_markdown_directory(input_dir: &str, n_threads: Option) -> PyResult> { - score_markdown_directory_internal(std::path::Path::new(input_dir), n_threads).map_err(|e| PyErr::new::(e.to_string())) +fn score_markdown_directory( + input_dir: &str, + n_threads: Option, +) -> PyResult> { + score_markdown_directory_internal(std::path::Path::new(input_dir), n_threads) + .map_err(|e| PyErr::new::(e.to_string())) } /// Detailed score for a single file: returns a Python tuple of all raw and derived metrics #[pyfunction] fn score_markdown_file_detailed(py: Python<'_>, path: &str) -> PyResult> { let ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, ) = score_markdown_file_detailed_internal(std::path::Path::new(path)) .map_err(|e| PyErr::new::(e.to_string()))?; let tup = PyTuple::new( py, vec![ - score.into_py(py), latin_pct.into_py(py), table_ratio.into_py(py), poly_ratio.into_py(py), - (len_greek as u128).into_py(py), (total_words as u128).into_py(py), - (v_pen as u128).into_py(py), (c_pen as u128).into_py(py), (bad_dbl as u128).into_py(py), (misplaced_sigma as u128).into_py(py), (invalid_bigram as u128).into_py(py), (long_word_count as u128).into_py(py), (longest_word as u128).into_py(py), (short_word_count as u128).into_py(py), (max_run as u128).into_py(py), - v_rate.into_py(py), c_rate.into_py(py), d_rate.into_py(py), sigma_end_rate.into_py(py), bigram_rate.into_py(py), 
long_word_rate.into_py(py), short_ratio.into_py(py), short_pen.into_py(py), + score.into_py(py), + latin_pct.into_py(py), + table_ratio.into_py(py), + poly_ratio.into_py(py), + (len_greek as u128).into_py(py), + (total_words as u128).into_py(py), + (v_pen as u128).into_py(py), + (c_pen as u128).into_py(py), + (bad_dbl as u128).into_py(py), + (misplaced_sigma as u128).into_py(py), + (invalid_bigram as u128).into_py(py), + (long_word_count as u128).into_py(py), + (longest_word as u128).into_py(py), + (short_word_count as u128).into_py(py), + (max_run as u128).into_py(py), + v_rate.into_py(py), + c_rate.into_py(py), + d_rate.into_py(py), + sigma_end_rate.into_py(py), + bigram_rate.into_py(py), + long_word_rate.into_py(py), + short_ratio.into_py(py), + short_pen.into_py(py), flags.into_py(py), ], ); @@ -51,25 +97,70 @@ fn score_markdown_file_detailed(py: Python<'_>, path: &str) -> PyResult, input_dir: &str, n_threads: Option) -> PyResult>> { - let rows = score_markdown_directory_detailed_internal(std::path::Path::new(input_dir), n_threads) - .map_err(|e| PyErr::new::(e.to_string()))?; +fn score_markdown_directory_detailed( + py: Python<'_>, + input_dir: &str, + n_threads: Option, +) -> PyResult>> { + let rows = + score_markdown_directory_detailed_internal(std::path::Path::new(input_dir), n_threads) + .map_err(|e| PyErr::new::(e.to_string()))?; let mut out: Vec> = Vec::with_capacity(rows.len()); for ( - path, score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) in rows.into_iter() { + path, + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + 
sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) in rows.into_iter() + { let tup = PyTuple::new( py, vec![ path.into_py(py), - score.into_py(py), latin_pct.into_py(py), table_ratio.into_py(py), poly_ratio.into_py(py), - (len_greek as u128).into_py(py), (total_words as u128).into_py(py), - (v_pen as u128).into_py(py), (c_pen as u128).into_py(py), (bad_dbl as u128).into_py(py), (misplaced_sigma as u128).into_py(py), (invalid_bigram as u128).into_py(py), (long_word_count as u128).into_py(py), (longest_word as u128).into_py(py), (short_word_count as u128).into_py(py), (max_run as u128).into_py(py), - v_rate.into_py(py), c_rate.into_py(py), d_rate.into_py(py), sigma_end_rate.into_py(py), bigram_rate.into_py(py), long_word_rate.into_py(py), short_ratio.into_py(py), short_pen.into_py(py), + score.into_py(py), + latin_pct.into_py(py), + table_ratio.into_py(py), + poly_ratio.into_py(py), + (len_greek as u128).into_py(py), + (total_words as u128).into_py(py), + (v_pen as u128).into_py(py), + (c_pen as u128).into_py(py), + (bad_dbl as u128).into_py(py), + (misplaced_sigma as u128).into_py(py), + (invalid_bigram as u128).into_py(py), + (long_word_count as u128).into_py(py), + (longest_word as u128).into_py(py), + (short_word_count as u128).into_py(py), + (max_run as u128).into_py(py), + v_rate.into_py(py), + c_rate.into_py(py), + d_rate.into_py(py), + sigma_end_rate.into_py(py), + bigram_rate.into_py(py), + long_word_rate.into_py(py), + short_ratio.into_py(py), + short_pen.into_py(py), flags.into_py(py), ], ); @@ -78,11 +169,226 @@ fn score_markdown_directory_detailed(py: Python<'_>, input_dir: &str, n_threads: Ok(out) } +#[pyfunction] +#[pyo3(signature = (input_dir, n_threads=None, min_repeat_run=6))] +fn score_markdown_directory_ocr_profile( + py: Python<'_>, + input_dir: &str, + n_threads: Option, + min_repeat_run: u64, +) -> PyResult>> { + let rows = score_markdown_directory_ocr_profile_internal( + std::path::Path::new(input_dir), + 
n_threads, + min_repeat_run, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("path", row.path)?; + item.set_item("percentage_greek", row.percentage_greek)?; + item.set_item("latin_percentage", row.latin_percentage)?; + item.set_item("polytonic_ratio", row.polytonic_ratio)?; + item.set_item("non_whitespace_chars", row.non_whitespace_chars)?; + item.set_item("greek_char_count", row.greek_char_count)?; + item.set_item("latin_char_count", row.latin_char_count)?; + item.set_item("ocr_repeat_phrase_run_max", row.ocr_repeat_phrase_run_max)?; + item.set_item("ocr_repeat_line_run_max", row.ocr_repeat_line_run_max)?; + item.set_item( + "ocr_repeat_suspicious_line_count", + row.ocr_repeat_suspicious_line_count, + )?; + item.set_item( + "ocr_repeat_suspicious_line_ratio", + row.ocr_repeat_suspicious_line_ratio, + )?; + item.set_item("ocr_noise_suspect", row.ocr_noise_suspect)?; + item.set_item("ocr_noise_flags", row.ocr_noise_flags)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (input_dir, output_dir, n_threads=None, min_repeat_run=6, max_pages=None, sample_seed=0))] +fn export_ocr_match_debug_pages( + py: Python<'_>, + input_dir: &str, + output_dir: &str, + n_threads: Option, + min_repeat_run: u64, + max_pages: Option, + sample_seed: u64, +) -> PyResult>> { + let rows = export_ocr_match_debug_pages_internal( + std::path::Path::new(input_dir), + std::path::Path::new(output_dir), + n_threads, + min_repeat_run, + max_pages, + sample_seed, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("source_path", row.source_path)?; + item.set_item("output_path", row.output_path)?; + item.set_item("source_stem", row.source_stem)?; + item.set_item("base_stem", row.base_stem)?; + item.set_item("page_number", 
row.page_number)?; + item.set_item("page_index_in_file", row.page_index_in_file)?; + item.set_item("match_types", row.match_types)?; + item.set_item("match_count", row.match_count)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (input_dir, output_dir, n_threads=None, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10, max_pages=None, sample_seed=0))] +fn export_numeric_match_debug_pages( + py: Python<'_>, + input_dir: &str, + output_dir: &str, + n_threads: Option, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, + max_pages: Option, + sample_seed: u64, +) -> PyResult>> { + let rows = export_numeric_match_debug_pages_internal( + std::path::Path::new(input_dir), + std::path::Path::new(output_dir), + n_threads, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + max_pages, + sample_seed, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("source_path", row.source_path)?; + item.set_item("output_path", row.output_path)?; + item.set_item("source_stem", row.source_stem)?; + item.set_item("base_stem", row.base_stem)?; + item.set_item("page_number", row.page_number)?; + item.set_item("page_index_in_file", row.page_index_in_file)?; + item.set_item("match_types", row.match_types)?; + item.set_item("match_count", row.match_count)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (page, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10))] +fn annotate_numeric_debug_page( + py: Python<'_>, + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> PyResult>> { + let Some((annotated_page, match_types, match_count)) = annotate_numeric_debug_page_internal( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) else { + return Ok(None); + }; + + let item = 
PyDict::new(py); + item.set_item("annotated_page", annotated_page)?; + item.set_item("match_types", match_types)?; + item.set_item("match_count", match_count)?; + Ok(Some(item.into())) +} + +#[pyfunction] +#[pyo3(signature = (page, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10))] +fn find_numeric_debug_page_spans( + py: Python<'_>, + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> PyResult>> { + let spans = find_numeric_debug_page_spans_internal( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("match_type", span.match_type)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (normalized_text, rep_threshold=4, min_period=3, window=96))] +fn find_word_repeat_spans( + py: Python<'_>, + normalized_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> PyResult>> { + let spans = find_word_repeat_spans_internal(normalized_text, rep_threshold, min_period, window); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("period", span.period)?; + item.set_item("repetitions", span.repetitions)?; + item.set_item("tail_chars", span.tail_chars)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +fn evaluate_page_character_noise(py: Python<'_>, page: &str) -> PyResult> { + let metrics = evaluate_page_character_noise_internal(page); + let item = PyDict::new(py); + item.set_item("total_chars", metrics.total_chars)?; + item.set_item("bad_char_count", metrics.bad_char_count)?; + item.set_item("bad_char_ratio", metrics.bad_char_ratio)?; + item.set_item("control_count", 
metrics.control_count)?; + item.set_item("private_use_count", metrics.private_use_count)?; + item.set_item("cjk_count", metrics.cjk_count)?; + item.set_item("replacement_count", metrics.replacement_count)?; + Ok(item.into()) +} + #[pymodule] fn glossapi_rs_noise(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(score_markdown_file, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_directory, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_file_detailed, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_directory_detailed, m)?)?; + m.add_function(wrap_pyfunction!(score_markdown_directory_ocr_profile, m)?)?; + m.add_function(wrap_pyfunction!(export_ocr_match_debug_pages, m)?)?; + m.add_function(wrap_pyfunction!(export_numeric_match_debug_pages, m)?)?; + m.add_function(wrap_pyfunction!(annotate_numeric_debug_page, m)?)?; + m.add_function(wrap_pyfunction!(find_numeric_debug_page_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_word_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(evaluate_page_character_noise, m)?)?; Ok(()) } diff --git a/rust/glossapi_rs_noise/src/noise_metrics.rs b/rust/glossapi_rs_noise/src/noise_metrics.rs index 105b823..ad305aa 100644 --- a/rust/glossapi_rs_noise/src/noise_metrics.rs +++ b/rust/glossapi_rs_noise/src/noise_metrics.rs @@ -69,27 +69,18 @@ Positions in detailed tuple (suggested append): Note: after adding these fields, bump the Python bindings accordingly and propagate polytonic_ratio (already computed here) into downstream parquet (already wired in Corpus.clean()). 
*/ - +use glossapi_rs_common::{is_combining_mark, is_greek, scan_script_metrics, ScriptScanner}; +use rand::rngs::StdRng; +use rand::seq::SliceRandom; +use rand::SeedableRng; use rayon::prelude::*; +use rayon::ThreadPoolBuilder; use std::fs::{self, File}; use std::io::Read; use std::path::{Path, PathBuf}; use walkdir::WalkDir; // Avoid heavy regex for table detection; use lightweight checks instead -const GREEK_BLOCK_1: std::ops::RangeInclusive = 0x0370..=0x03FF; // Greek & Coptic -const GREEK_BLOCK_2: std::ops::RangeInclusive = 0x1F00..=0x1FFF; // Greek Extended - -#[inline(always)] -fn is_greek(cp: u32) -> bool { - GREEK_BLOCK_1.contains(&cp) || GREEK_BLOCK_2.contains(&cp) -} - -#[inline(always)] -fn is_combining_mark(cp: u32) -> bool { - (0x0300..=0x036F).contains(&cp) || (0x1DC0..=0x1DFF).contains(&cp) || (0x20D0..=0x20FF).contains(&cp) -} - #[inline(always)] fn is_vowel(cp: u32) -> bool { matches!( @@ -105,13 +96,21 @@ fn is_vowel(cp: u32) -> bool { const LONG_WORD_LIMIT: u64 = 21; const SHORT_WORD_LIMIT: u64 = 3; +const PAGE_SPLIT_MARKER: &str = "<--- Page Split --->"; +const NUMERIC_PAGE_COLLAPSE_MIN_TOKENS: u64 = 64; +const NUMERIC_PAGE_COLLAPSE_MIN_ATOMS: u64 = 64; +const NUMERIC_BLOCK_SEED_MIN_ATOMS: usize = 8; // Baseline for short words per 1000 Greek characters (empirically ~26 on clean texts) const SHORT_BASELINE_PER_1000: f64 = 26.0; #[inline] fn to_lower_fast(cp: u32) -> u32 { // Fast path for basic Greek capitals: add 0x20; otherwise return as-is - if (0x0391..=0x03A9).contains(&cp) { cp + 0x20 } else { cp } + if (0x0391..=0x03A9).contains(&cp) { + cp + 0x20 + } else { + cp + } } #[inline] @@ -140,7 +139,9 @@ fn allowed_double(cp: u32) -> bool { fn is_table_line_trimmed(trimmed: &str) -> bool { // A simple check equivalent to /^\s*\|.*\|\s*$/ after trimming // i.e., line begins and ends with a '|' ignoring outer whitespace - !trimmed.is_empty() && trimmed.as_bytes()[0] == b'|' && trimmed.as_bytes()[trimmed.len()-1] == b'|' + !trimmed.is_empty() + 
&& trimmed.as_bytes()[0] == b'|' + && trimmed.as_bytes()[trimmed.len() - 1] == b'|' } fn table_line_ratio_and_filtered(text: &str) -> (f64, Option, usize, usize) { @@ -156,7 +157,11 @@ fn table_line_ratio_and_filtered(text: &str) -> (f64, Option, usize, usi } } } - let ratio = if non_empty > 0 { table_like as f64 / non_empty as f64 } else { 0.0 }; + let ratio = if non_empty > 0 { + table_like as f64 / non_empty as f64 + } else { + 0.0 + }; if table_like == 0 { return (ratio, None, non_empty, table_like); } @@ -179,24 +184,1564 @@ fn compute_latin_pct(buf: &[u8]) -> f64 { latin_chars as f64 / (buf.len() as f64) } -fn compute_polytonic_word_ratio(text: &str) -> (u64, u64, f64) { - let mut greek_words = 0u64; - let mut polytonic_words = 0u64; - for w in text.split_whitespace() { - let mut has_greek = false; - let mut has_poly = false; - for ch in w.chars() { - let cp = ch as u32; - if is_greek(cp) { has_greek = true; } - if (0x1F00..=0x1FFF).contains(&cp) || is_combining_mark(cp) { has_poly = true; } +#[derive(Debug, Clone)] +pub struct OcrProfileRow { + pub path: String, + pub percentage_greek: f64, + pub latin_percentage: f64, + pub polytonic_ratio: f64, + pub non_whitespace_chars: u64, + pub greek_char_count: u64, + pub latin_char_count: u64, + pub ocr_repeat_phrase_run_max: u64, + pub ocr_repeat_line_run_max: u64, + pub ocr_repeat_suspicious_line_count: u64, + pub ocr_repeat_suspicious_line_ratio: f64, + pub ocr_noise_suspect: bool, + pub ocr_noise_flags: String, +} + +#[derive(Debug, Clone)] +pub struct OcrDebugPageRow { + pub source_path: String, + pub output_path: String, + pub source_stem: String, + pub base_stem: String, + pub page_number: u64, + pub page_index_in_file: u64, + pub match_types: String, + pub match_count: u64, +} + +#[derive(Debug, Clone)] +struct OcrDebugPageCandidate { + source_path: String, + source_stem: String, + base_stem: String, + page_number: u64, + page_index_in_file: u64, +} + +#[derive(Debug, Clone)] +struct DebugMatchSpan { + 
start: usize, + end: usize, + match_type: &'static str, +} + +#[derive(Debug, Clone)] +pub struct NumericDebugSpan { + pub start: usize, + pub end: usize, + pub match_type: String, +} + +#[derive(Debug, Clone)] +pub struct WordRepeatSpan { + pub start: usize, + pub end: usize, + pub period: usize, + pub repetitions: usize, + pub tail_chars: usize, +} + +#[derive(Debug, Clone, Default)] +pub struct PageCharacterNoise { + pub total_chars: u64, + pub bad_char_count: u64, + pub bad_char_ratio: f64, + pub control_count: u64, + pub private_use_count: u64, + pub cjk_count: u64, + pub replacement_count: u64, +} + +const MERGE_SAME_CATEGORY_MAX_NONWHITESPACE_GAP: usize = 10; + +#[derive(Debug, Clone, Copy)] +struct TokenSpan { + start: usize, + end: usize, +} + +#[derive(Debug, Clone, Copy, Default)] +struct NumericLineSummary { + has_alpha: bool, + rejected_non_numeric: bool, + numeric_token_count: usize, + numeric_atom_count: usize, + is_blank: bool, +} + +#[inline] +fn is_trim_numeric_edge_char(ch: char) -> bool { + ch.is_ascii_punctuation() + || matches!( + ch, + '«' | '»' | '“' | '”' | '„' | '‟' | '‘' | '’' | '‚' | '‛' + ) +} + +#[inline] +fn is_numeric_page_ignored_token(token: &str) -> bool { + !token.is_empty() + && token + .chars() + .all(|ch| !ch.is_whitespace() && !ch.is_alphanumeric()) +} + +fn trim_numeric_token_bounds(token: &str) -> Option<(usize, usize)> { + if token.is_empty() { + return None; + } + + let mut start = 0usize; + let mut end = token.len(); + + while start < end { + let ch = token[start..].chars().next()?; + if ch.is_ascii_digit() { + break; + } + if is_trim_numeric_edge_char(ch) { + start += ch.len_utf8(); + } else { + return None; + } + } + + while start < end { + let ch = token[..end].chars().next_back()?; + if ch.is_ascii_digit() { + break; + } + if is_trim_numeric_edge_char(ch) { + end -= ch.len_utf8(); + } else { + return None; + } + } + + if start >= end { + None + } else { + Some((start, end)) + } +} + +#[inline] +fn 
is_numeric_page_token_body(text: &str) -> bool { + if text.is_empty() { + return false; + } + + if text.chars().all(|ch| ch.is_ascii_digit()) { + return (1..=4).contains(&text.len()); + } + + let mut saw_digit = false; + for ch in text.chars() { + if ch.is_ascii_digit() { + saw_digit = true; + continue; + } + if matches!(ch, '.' | ',' | ':' | ';' | '/' | '-') { + continue; + } + return false; + } + + saw_digit +} + +fn summarize_numeric_line(line: &str) -> NumericLineSummary { + let trimmed = line.trim(); + if trimmed.is_empty() { + return NumericLineSummary { + is_blank: true, + ..NumericLineSummary::default() + }; + } + + let tokens = extract_non_whitespace_tokens_with_spans(line); + let mut summary = NumericLineSummary::default(); + for token in tokens { + let raw = &line[token.start..token.end]; + if raw.chars().any(char::is_alphabetic) { + summary.has_alpha = true; + } + if is_numeric_page_ignored_token(raw) { + continue; + } + let Some((trim_start, trim_end)) = trim_numeric_token_bounds(raw) else { + summary.rejected_non_numeric = true; + continue; + }; + let trimmed = &raw[trim_start..trim_end]; + if !is_numeric_page_token_body(trimmed) { + summary.rejected_non_numeric = true; + continue; + } + summary.numeric_token_count += 1; + summary.numeric_atom_count += extract_digit_group_spans(trimmed).len(); + } + summary +} + +fn parse_simple_number(text: &str) -> Option { + if text.is_empty() { + return None; + } + + let mut normalized = String::with_capacity(text.len()); + let mut saw_digit = false; + let mut saw_separator = false; + + for ch in text.chars() { + if ch.is_ascii_digit() { + normalized.push(ch); + saw_digit = true; + } else if ch == '.' 
|| ch == ',' { + if saw_separator { + return None; + } + saw_separator = true; + normalized.push('.'); + } else { + return None; + } + } + + if !saw_digit || normalized.starts_with('.') || normalized.ends_with('.') { + return None; + } + + normalized.parse::().ok() +} + +fn repeated_digit_token(text: &str) -> Option { + let mut digit: Option = None; + for ch in text.chars() { + if !ch.is_ascii_digit() { + return None; + } + match digit { + Some(existing) if existing != ch => return None, + Some(_) => {} + None => digit = Some(ch), + } + } + digit +} + +#[inline] +fn is_private_use_codepoint(cp: u32) -> bool { + matches!( + cp, + 0xE000..=0xF8FF | 0xF0000..=0xFFFFD | 0x100000..=0x10FFFD + ) +} + +#[inline] +fn is_cjk_codepoint(cp: u32) -> bool { + matches!( + cp, + 0x3400..=0x4DBF + | 0x4E00..=0x9FFF + | 0xF900..=0xFAFF + | 0x20000..=0x2A6DF + | 0x2A700..=0x2B73F + | 0x2B740..=0x2B81F + | 0x2B820..=0x2CEAF + | 0x2F800..=0x2FA1F + ) +} + +pub fn evaluate_page_character_noise_internal(page: &str) -> PageCharacterNoise { + let mut metrics = PageCharacterNoise::default(); + for ch in page.chars() { + metrics.total_chars += 1; + let cp = ch as u32; + let mut is_bad = false; + if ch == '\u{FFFD}' { + metrics.replacement_count += 1; + is_bad = true; + } else if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') { + metrics.control_count += 1; + is_bad = true; + } else if is_private_use_codepoint(cp) { + metrics.private_use_count += 1; + is_bad = true; + } else if is_cjk_codepoint(cp) { + metrics.cjk_count += 1; + is_bad = true; + } + if is_bad { + metrics.bad_char_count += 1; + } + } + + metrics.bad_char_ratio = if metrics.total_chars > 0 { + metrics.bad_char_count as f64 / metrics.total_chars as f64 + } else { + 0.0 + }; + metrics +} + +fn extract_digit_group_spans(text: &str) -> Vec { + let mut spans = Vec::new(); + let mut current_start: Option = None; + + for (idx, ch) in text.char_indices() { + if ch.is_ascii_digit() { + if current_start.is_none() { + current_start 
= Some(idx); + } + } else if let Some(start) = current_start.take() { + spans.push(TokenSpan { start, end: idx }); + } + } + + if let Some(start) = current_start { + spans.push(TokenSpan { + start, + end: text.len(), + }); + } + + spans +} + +#[inline] +fn numeric_step_approx_eq(lhs: f64, rhs: f64) -> bool { + let scale = lhs.abs().max(rhs.abs()).max(1.0); + (lhs - rhs).abs() <= 1e-9 * scale +} + +#[derive(Debug, Clone, Copy, Default)] +struct OcrRepeatNoiseMetrics { + phrase_run_max: u64, + line_run_max: u64, + suspicious_line_count: u64, + suspicious_line_ratio: f64, + suspect: bool, +} + +fn extract_non_whitespace_tokens_with_spans(line: &str) -> Vec { + let mut tokens = Vec::new(); + let mut current_start: Option = None; + + for (idx, ch) in line.char_indices() { + if !ch.is_whitespace() { + if current_start.is_none() { + current_start = Some(idx); + } + } else if let Some(start) = current_start.take() { + tokens.push(TokenSpan { start, end: idx }); + } + } + + if let Some(start) = current_start { + tokens.push(TokenSpan { + start, + end: line.len(), + }); + } + + tokens +} + +fn normalize_line_for_repetition(line: &str) -> Option { + let trimmed = line.trim(); + if trimmed.is_empty() { + return None; + } + + let mut normalized = String::with_capacity(trimmed.len()); + let mut iter = trimmed.split_whitespace(); + if let Some(first) = iter.next() { + normalized.push_str(first); + for token in iter { + normalized.push(' '); + normalized.push_str(token); } - if has_greek { - greek_words += 1; - if has_poly { polytonic_words += 1; } + } + Some(normalized) +} + +fn phrase_tokens_equal( + line: &str, + tokens: &[TokenSpan], + lhs: usize, + rhs: usize, + len: usize, +) -> bool { + (0..len).all(|offset| { + let lhs_token = &line[tokens[lhs + offset].start..tokens[lhs + offset].end]; + let rhs_token = &line[tokens[rhs + offset].start..tokens[rhs + offset].end]; + lhs_token == rhs_token + }) +} + +fn collect_repeat_phrase_debug_matches( + line: &str, + tokens: 
&[TokenSpan], + min_repeat_run: u64, +) -> Vec { + let mut spans = Vec::new(); + let min_run = min_repeat_run as usize; + if min_run < 2 || tokens.len() < min_run { + return spans; + } + + let max_phrase_len = 4usize.min(tokens.len() / min_run); + for phrase_len in 1..=max_phrase_len { + let mut i = 0usize; + while i + phrase_len * min_run <= tokens.len() { + let mut repeats = 1usize; + while i + phrase_len * (repeats + 1) <= tokens.len() + && phrase_tokens_equal(line, tokens, i, i + repeats * phrase_len, phrase_len) + { + repeats += 1; + } + if repeats >= min_run { + spans.push(DebugMatchSpan { + start: tokens[i].start, + end: tokens[i + phrase_len * repeats - 1].end, + match_type: "repeat_phrase_run", + }); + i += phrase_len * repeats; + } else { + i += 1; + } } } - let ratio = if greek_words > 0 { polytonic_words as f64 / greek_words as f64 } else { 0.0 }; - (polytonic_words, greek_words, ratio) + + spans +} + +fn debug_match_merge_category(match_type: &'static str) -> Option<&'static str> { + match match_type { + "ascending_numeric_sequence" + | "repeat_numeric_run" + | "same_digit_numeric_run" + | "numeric_page_collapse" + | "numeric_block_collapse" => Some("numeric"), + "word_repeat" => Some("word"), + _ => None, + } +} + +fn gap_has_fewer_than_n_nonwhitespace_chars( + text: &str, + start: usize, + end: usize, + max_nonwhitespace: usize, +) -> bool { + if start >= end { + return true; + } + + let mut count = 0usize; + for ch in text[start..end].chars() { + if !ch.is_whitespace() { + count += 1; + if count >= max_nonwhitespace { + return false; + } + } + } + true +} + +fn merge_debug_spans( + text: &str, + spans: Vec, +) -> Vec<(usize, usize, Vec<&'static str>)> { + if spans.is_empty() { + return Vec::new(); + } + + let mut spans = spans; + spans.sort_by_key(|span| (span.start, span.end)); + + let mut merged: Vec<(usize, usize, Vec<&'static str>)> = Vec::new(); + for span in spans { + if let Some((start, end, types)) = merged.last_mut() { + let overlaps = 
span.start <= *end; + let same_category_gap_merge = !overlaps + && debug_match_merge_category(span.match_type).is_some() + && types.iter().any(|kind| { + debug_match_merge_category(*kind) == debug_match_merge_category(span.match_type) + }) + && gap_has_fewer_than_n_nonwhitespace_chars( + text, + *end, + span.start, + MERGE_SAME_CATEGORY_MAX_NONWHITESPACE_GAP, + ); + if overlaps || same_category_gap_merge { + *end = (*end).max(span.end); + if !types.contains(&span.match_type) { + types.push(span.match_type); + } + *start = (*start).min(span.start); + continue; + } + } + merged.push((span.start, span.end, vec![span.match_type])); + } + + for (_, _, types) in &mut merged { + types.sort_unstable(); + types.dedup(); + } + + merged +} + +fn annotate_text_with_debug_spans( + text: &str, + spans: Vec, +) -> Option<(String, Vec<&'static str>, u64)> { + let merged = merge_debug_spans(text, spans); + if merged.is_empty() { + return None; + } + + let mut annotated = String::with_capacity(text.len() + merged.len() * 48); + let mut pos = 0usize; + let mut match_types: Vec<&'static str> = Vec::new(); + for (start, end, types) in &merged { + if *start > pos { + annotated.push_str(&text[pos..*start]); + } + let type_attr = types.join(","); + annotated.push_str("'); + annotated.push_str(&text[*start..*end]); + annotated.push_str(""); + pos = *end; + for kind in types { + if !match_types.contains(kind) { + match_types.push(*kind); + } + } + } + if pos < text.len() { + annotated.push_str(&text[pos..]); + } + + Some((annotated, match_types, merged.len() as u64)) +} + +fn collect_numeric_page_collapse_span(page: &str, min_page_tokens: u64) -> Option { + let tokens = extract_non_whitespace_tokens_with_spans(page); + let mut page_start: Option = None; + let mut page_end: Option = None; + let mut first_start: Option = None; + let mut last_end: Option = None; + let mut numeric_token_count = 0usize; + let mut numeric_atom_count = 0usize; + for token in tokens { + let raw = 
&page[token.start..token.end]; + if page_start.is_none() { + page_start = Some(token.start); + } + page_end = Some(token.end); + if is_numeric_page_ignored_token(raw) { + continue; + } + let (trim_start, trim_end) = trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + if !is_numeric_page_token_body(trimmed) { + return None; + } + let abs_start = token.start + trim_start; + let abs_end = token.start + trim_end; + if first_start.is_none() { + first_start = Some(abs_start); + } + last_end = Some(abs_end); + numeric_token_count += 1; + numeric_atom_count += extract_digit_group_spans(trimmed).len(); + } + + if numeric_token_count < min_page_tokens as usize + && numeric_atom_count < NUMERIC_PAGE_COLLAPSE_MIN_ATOMS as usize + { + return None; + } + + Some(DebugMatchSpan { + start: page_start.or(first_start)?, + end: page_end.or(last_end)?, + match_type: "numeric_page_collapse", + }) +} + +fn collect_numeric_block_collapse_spans(page: &str) -> Vec { + let mut lines: Vec<(usize, usize, NumericLineSummary)> = Vec::new(); + let mut offset = 0usize; + for segment in page.split_inclusive('\n') { + let line = segment.strip_suffix('\n').unwrap_or(segment); + let summary = summarize_numeric_line(line); + lines.push((offset, offset + segment.len(), summary)); + offset += segment.len(); + } + if offset < page.len() { + let line = &page[offset..]; + lines.push((offset, page.len(), summarize_numeric_line(line))); + } + + let mut spans = Vec::new(); + let mut idx = 0usize; + while idx < lines.len() { + let (_, _, summary) = lines[idx]; + let is_seed = !summary.has_alpha + && !summary.rejected_non_numeric + && summary.numeric_atom_count >= NUMERIC_BLOCK_SEED_MIN_ATOMS; + if !is_seed { + idx += 1; + continue; + } + + let mut start_idx = idx; + let mut end_idx = idx; + let mut total_atoms = summary.numeric_atom_count; + + while start_idx > 0 { + let prev = lines[start_idx - 1].2; + let prev_ok = prev.is_blank + || (!prev.has_alpha && !prev.rejected_non_numeric && 
prev.numeric_token_count > 0); + if !prev_ok { + break; + } + start_idx -= 1; + total_atoms += prev.numeric_atom_count; + } + + while end_idx + 1 < lines.len() { + let next = lines[end_idx + 1].2; + let next_ok = next.is_blank + || (!next.has_alpha && !next.rejected_non_numeric && next.numeric_token_count > 0); + if !next_ok { + break; + } + end_idx += 1; + total_atoms += next.numeric_atom_count; + } + + if total_atoms >= NUMERIC_PAGE_COLLAPSE_MIN_ATOMS as usize { + let first_nonblank = (start_idx..=end_idx).find(|i| !lines[*i].2.is_blank); + let last_nonblank = (start_idx..=end_idx).rfind(|i| !lines[*i].2.is_blank); + if let (Some(first), Some(last)) = (first_nonblank, last_nonblank) { + spans.push(DebugMatchSpan { + start: lines[first].0, + end: lines[last].1, + match_type: "numeric_block_collapse", + }); + } + } + + idx = end_idx + 1; + } + + spans +} + +fn collect_numeric_progression_matches( + line: &str, + tokens: &[TokenSpan], + min_progress_steps: u64, +) -> Vec { + let min_steps = min_progress_steps as usize; + if min_steps < 2 || tokens.len() < min_steps { + return Vec::new(); + } + + let numeric_tokens: Vec> = tokens + .iter() + .map(|token| { + let raw = &line[token.start..token.end]; + let (trim_start, trim_end) = trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + let value = parse_simple_number(trimmed)?; + Some((token.start + trim_start, token.start + trim_end, value)) + }) + .collect(); + + let mut spans = Vec::new(); + let mut i = 0usize; + while i + min_steps <= numeric_tokens.len() { + let Some((start, _, first)) = numeric_tokens[i] else { + i += 1; + continue; + }; + let Some((_, _, second)) = numeric_tokens[i + 1] else { + i += 1; + continue; + }; + + let step = second - first; + if !step.is_finite() || step <= 0.0 { + i += 1; + continue; + } + + let mut j = i + 1; + while j + 1 < numeric_tokens.len() { + let Some((_, _, current)) = numeric_tokens[j] else { + break; + }; + let Some((_, _, next)) = numeric_tokens[j + 
1] else { + break; + }; + if numeric_step_approx_eq(next - current, step) { + j += 1; + } else { + break; + } + } + + let run_len = j - i + 1; + if run_len >= min_steps { + let (_, end, _) = numeric_tokens[j].expect("numeric run end"); + spans.push(DebugMatchSpan { + start, + end, + match_type: "ascending_numeric_sequence", + }); + i = j + 1; + } else { + i += 1; + } + } + + spans +} + +fn collect_compact_repeat_numeric_matches( + line: &str, + tokens: &[TokenSpan], + min_repeat_steps: u64, +) -> Vec { + let min_steps = min_repeat_steps as usize; + if min_steps < 2 { + return Vec::new(); + } + + let mut spans = Vec::new(); + for token in tokens { + let raw = &line[token.start..token.end]; + let Some((trim_start, trim_end)) = trim_numeric_token_bounds(raw) else { + continue; + }; + let trimmed = &raw[trim_start..trim_end]; + let digit_groups = extract_digit_group_spans(trimmed); + if digit_groups.len() < min_steps { + continue; + } + + let first_group = &trimmed[digit_groups[0].start..digit_groups[0].end]; + if digit_groups + .iter() + .any(|group| &trimmed[group.start..group.end] != first_group) + { + continue; + } + + let mut separators_ok = true; + for pair in digit_groups.windows(2) { + let separator = &trimmed[pair[0].end..pair[1].start]; + if separator.is_empty() + || separator + .chars() + .any(|ch| ch.is_ascii_alphanumeric() || ch.is_whitespace()) + { + separators_ok = false; + break; + } + } + if !separators_ok { + continue; + } + + let trailing = &trimmed[digit_groups.last().expect("digit group").end..]; + if trailing + .chars() + .any(|ch| ch.is_ascii_alphanumeric() || ch.is_whitespace()) + { + continue; + } + + spans.push(DebugMatchSpan { + start: token.start + trim_start, + end: token.start + trim_end, + match_type: "repeat_numeric_run", + }); + } + + spans +} + +fn collect_same_digit_numeric_matches( + line: &str, + tokens: &[TokenSpan], + min_same_digit_steps: u64, +) -> Vec { + let min_steps = min_same_digit_steps as usize; + if min_steps < 2 || 
tokens.len() < min_steps { + return Vec::new(); + } + + let signatures: Vec> = tokens + .iter() + .map(|token| { + let raw = &line[token.start..token.end]; + let (trim_start, trim_end) = trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + let digit = repeated_digit_token(trimmed)?; + Some((token.start + trim_start, token.start + trim_end, digit)) + }) + .collect(); + + let mut spans = Vec::new(); + let mut i = 0usize; + while i + min_steps <= signatures.len() { + let Some((start, _, digit)) = signatures[i] else { + i += 1; + continue; + }; + + let mut j = i + 1; + while j < signatures.len() && signatures[j].map(|(_, _, current)| current) == Some(digit) { + j += 1; + } + + let run_len = j - i; + if run_len >= min_steps { + let (_, end, _) = signatures[j - 1].expect("same-digit run end"); + spans.push(DebugMatchSpan { + start, + end, + match_type: "same_digit_numeric_run", + }); + i = j; + } else { + i += 1; + } + } + + spans +} + +fn annotate_line_with_numeric_debug_matches( + line: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let tokens = extract_non_whitespace_tokens_with_spans(line); + if tokens.is_empty() { + return None; + } + + let mut spans = Vec::new(); + spans.extend(collect_numeric_progression_matches( + line, + &tokens, + min_progress_steps, + )); + spans.extend(collect_compact_repeat_numeric_matches( + line, + &tokens, + min_repeat_steps, + )); + spans.extend(collect_same_digit_numeric_matches( + line, + &tokens, + min_same_digit_steps, + )); + annotate_text_with_debug_spans(line, spans) +} + +fn annotate_line_with_debug_matches( + line: &str, + min_repeat_run: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let tokens = extract_non_whitespace_tokens_with_spans(line); + if tokens.is_empty() { + return None; + } + + let spans = collect_repeat_phrase_debug_matches(line, &tokens, min_repeat_run); + let merged = 
merge_debug_spans(line, spans); + if merged.is_empty() { + return None; + } + + let mut annotated = String::with_capacity(line.len() + merged.len() * 48); + let mut pos = 0usize; + let mut line_types: Vec<&'static str> = Vec::new(); + for (start, end, types) in &merged { + if *start > pos { + annotated.push_str(&line[pos..*start]); + } + let type_attr = types.join(","); + annotated.push_str("'); + annotated.push_str(&line[*start..*end]); + annotated.push_str(""); + pos = *end; + for kind in types { + if !line_types.contains(kind) { + line_types.push(*kind); + } + } + } + if pos < line.len() { + annotated.push_str(&line[pos..]); + } + + Some((annotated, line_types, merged.len() as u64)) +} + +fn compute_repeat_phrase_run_max(trimmed: &str, min_repeat_run: u64) -> u64 { + let tokens = extract_non_whitespace_tokens_with_spans(trimmed); + let min_run = min_repeat_run as usize; + if min_run < 2 || tokens.len() < min_run { + return 0; + } + + let max_phrase_len = 4usize.min(tokens.len() / min_run); + let mut phrase_run_max = 0u64; + for phrase_len in 1..=max_phrase_len { + let mut i = 0usize; + while i + phrase_len * min_run <= tokens.len() { + let mut repeats = 1usize; + while i + phrase_len * (repeats + 1) <= tokens.len() + && phrase_tokens_equal(trimmed, &tokens, i, i + repeats * phrase_len, phrase_len) + { + repeats += 1; + } + if repeats >= min_run { + phrase_run_max = phrase_run_max.max(repeats as u64); + i += phrase_len * repeats; + } else { + i += 1; + } + } + } + + phrase_run_max +} + +fn collect_repeat_line_flags(lines: &[Option], min_repeat_run: u64) -> (Vec, u64) { + let min_run = min_repeat_run as usize; + let mut flags = vec![false; lines.len()]; + if min_run < 2 || lines.len() < min_run { + return (flags, 0); + } + + let mut run_max = 0u64; + let mut i = 0usize; + while i < lines.len() { + let Some(current) = lines[i].as_ref() else { + i += 1; + continue; + }; + + let mut j = i + 1; + while j < lines.len() && lines[j].as_ref() == Some(current) { + j += 1; 
+ } + let run_len = j - i; + if run_len >= min_run { + run_max = run_max.max(run_len as u64); + for flag in &mut flags[i..j] { + *flag = true; + } + } + i = j; + } + + (flags, run_max) +} + +fn finalize_ocr_repeat_noise( + phrase_run_max: u64, + line_run_max: u64, + suspicious_line_count: u64, + non_empty_lines: usize, +) -> OcrRepeatNoiseMetrics { + let suspicious_line_ratio = if non_empty_lines > 0 { + suspicious_line_count as f64 / non_empty_lines as f64 + } else { + 0.0 + }; + let suspect = suspicious_line_count > 0; + + OcrRepeatNoiseMetrics { + phrase_run_max, + line_run_max, + suspicious_line_count, + suspicious_line_ratio, + suspect, + } +} + +fn compute_ocr_profile( + text: &str, + min_repeat_run: u64, +) -> (glossapi_rs_common::ScriptMetrics, OcrRepeatNoiseMetrics) { + let mut scanner = ScriptScanner::new(); + let mut non_empty_lines = 0usize; + let mut phrase_run_max = 0u64; + let mut line_repeat_inputs: Vec> = Vec::new(); + let mut phrase_suspicious_lines: Vec = Vec::new(); + + for segment in text.split_inclusive('\n') { + let trimmed = segment.trim(); + if trimmed.is_empty() { + continue; + } + if trimmed == PAGE_SPLIT_MARKER || is_table_line_trimmed(trimmed) { + continue; + } + + non_empty_lines += 1; + scanner.observe_str(segment); + let line_phrase_run_max = compute_repeat_phrase_run_max(trimmed, min_repeat_run); + phrase_run_max = phrase_run_max.max(line_phrase_run_max); + phrase_suspicious_lines.push(line_phrase_run_max >= min_repeat_run); + line_repeat_inputs.push(normalize_line_for_repetition(trimmed)); + } + + let (repeat_line_flags, line_run_max) = + collect_repeat_line_flags(&line_repeat_inputs, min_repeat_run); + let suspicious_line_count = phrase_suspicious_lines + .iter() + .zip(repeat_line_flags.iter()) + .filter(|(phrase_flag, line_flag)| **phrase_flag || **line_flag) + .count() as u64; + + ( + scanner.finish(), + finalize_ocr_repeat_noise( + phrase_run_max, + line_run_max, + suspicious_line_count, + non_empty_lines, + ), + ) +} + +fn 
split_pages(text: &str) -> Vec { + let mut pages = Vec::new(); + let mut current = String::new(); + + for segment in text.split_inclusive('\n') { + if segment.trim() == PAGE_SPLIT_MARKER { + pages.push(current); + current = String::new(); + continue; + } + current.push_str(segment); + } + pages.push(current); + pages +} + +fn parse_source_stem(stem: &str) -> (String, u64) { + if let Some((base, suffix)) = stem.rsplit_once("__p") { + if let Some((start, _end)) = suffix.split_once('-') { + if let Ok(start_page) = start.parse::() { + return (base.to_string(), start_page); + } + } + } + (stem.to_string(), 1) +} + +fn annotate_page_for_debug( + page: &str, + min_repeat_run: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let mut segments: Vec<(&str, &str)> = Vec::new(); + let mut normalized_lines: Vec> = Vec::new(); + for segment in page.split_inclusive('\n') { + let (line, newline) = if let Some(body) = segment.strip_suffix('\n') { + (body, "\n") + } else { + (segment, "") + }; + segments.push((line, newline)); + let trimmed = line.trim(); + if trimmed.is_empty() || is_table_line_trimmed(trimmed) { + normalized_lines.push(None); + } else { + normalized_lines.push(normalize_line_for_repetition(trimmed)); + } + } + + let (repeat_line_flags, _line_run_max) = + collect_repeat_line_flags(&normalized_lines, min_repeat_run); + + let mut annotated = String::with_capacity(page.len()); + let mut page_types: Vec<&'static str> = Vec::new(); + let mut match_count = 0u64; + + for (idx, (line, newline)) in segments.iter().enumerate() { + let line_debug = annotate_line_with_debug_matches(line, min_repeat_run); + let line_repeat_flag = repeat_line_flags.get(idx).copied().unwrap_or(false); + + let mut line_content = + if let Some((annotated_line, line_types, line_match_count)) = line_debug { + match_count += line_match_count; + for kind in line_types { + if !page_types.contains(&kind) { + page_types.push(kind); + } + } + annotated_line + } else { + (*line).to_string() + }; + + 
if line_repeat_flag { + if !page_types.contains(&"repeat_line_run") { + page_types.push("repeat_line_run"); + } + match_count += 1; + line_content = format!("{}", line_content); + } + + annotated.push_str(&line_content); + annotated.push_str(newline); + } + + if match_count == 0 { + return None; + } + + page_types.sort_unstable(); + page_types.dedup(); + Some((annotated, page_types, match_count)) +} + +pub fn annotate_numeric_debug_page_internal( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Option<(String, Vec, u64)> { + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + let (annotated_page, match_types, match_count) = annotate_text_with_debug_spans(page, spans)?; + Some(( + annotated_page, + match_types.into_iter().map(str::to_string).collect(), + match_count, + )) +} + +pub fn find_numeric_debug_page_spans_internal( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + .into_iter() + .map(|span| NumericDebugSpan { + start: span.start, + end: span.end, + match_type: span.match_type.to_string(), + }) + .collect() +} + +const WORD_REPEAT_HASH_MASK: u64 = (1u64 << 63).wrapping_mul(2).wrapping_sub(1); +const WORD_REPEAT_HASH_BASE: u64 = 1469598103934665603u64; + +#[inline] +fn word_repeat_hash_slice(pref: &[u64], pw: &[u64], start: usize, end: usize) -> u64 { + pref[end].wrapping_sub(pref[start].wrapping_mul(pw[end - start])) & WORD_REPEAT_HASH_MASK +} + +#[inline] +fn word_repeat_blocks_equal( + codes: &[u32], + pref: &[u64], + pw: &[u64], + lhs: usize, + rhs: usize, + period: usize, +) -> bool { + word_repeat_hash_slice(pref, pw, lhs, lhs + period) + == word_repeat_hash_slice(pref, pw, rhs, rhs + period) + && codes[lhs..lhs + period] == codes[rhs..rhs + period] 
+} + +pub fn find_word_repeat_spans_internal( + normalized_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> Vec { + let codes: Vec = normalized_text.chars().map(|ch| ch as u32).collect(); + let n_chars = codes.len(); + if rep_threshold == 0 || min_period == 0 || n_chars < rep_threshold.saturating_mul(min_period) { + return Vec::new(); + } + + let mut pref = vec![0u64; n_chars + 1]; + let mut pw = vec![1u64; n_chars + 1]; + for (idx, code) in codes.iter().enumerate() { + pref[idx + 1] = + (pref[idx].wrapping_mul(WORD_REPEAT_HASH_BASE).wrapping_add(*code as u64)) + & WORD_REPEAT_HASH_MASK; + pw[idx + 1] = pw[idx].wrapping_mul(WORD_REPEAT_HASH_BASE) & WORD_REPEAT_HASH_MASK; + } + + let max_period = std::cmp::min( + std::cmp::max(min_period, window / rep_threshold), + n_chars / rep_threshold, + ); + let mut spans: Vec = Vec::new(); + + for period in min_period..=max_period { + let mut idx = 0usize; + while idx + rep_threshold * period <= n_chars { + let mut is_repeat = true; + for multiple in 1..rep_threshold { + if !word_repeat_blocks_equal(&codes, &pref, &pw, idx, idx + multiple * period, period) + { + is_repeat = false; + break; + } + } + if !is_repeat { + idx += 1; + continue; + } + + let mut left = idx; + while left >= period + && word_repeat_blocks_equal(&codes, &pref, &pw, left - period, left, period) + { + left -= period; + } + + let mut right = idx + rep_threshold * period; + while right + period <= n_chars + && word_repeat_blocks_equal(&codes, &pref, &pw, right - period, right, period) + { + right += period; + } + + let pattern = &codes[left..left + period]; + let mut tail_chars = 0usize; + while right + tail_chars < n_chars + && tail_chars < period + && codes[right + tail_chars] == pattern[tail_chars] + { + tail_chars += 1; + } + + spans.push(WordRepeatSpan { + start: left, + end: right + tail_chars, + period, + repetitions: (right - left) / period, + tail_chars, + }); + idx = right; + } + } + + spans.sort_by(|lhs, rhs| { + 
lhs.start + .cmp(&rhs.start) + .then((rhs.end - rhs.start).cmp(&(lhs.end - lhs.start))) + .then(lhs.period.cmp(&rhs.period)) + }); + + let mut deduped: Vec = Vec::new(); + for span in spans { + if let Some(previous) = deduped.last() { + if span.start >= previous.start && span.end <= previous.end { + continue; + } + } + deduped.push(span); + } + deduped +} + +fn collect_numeric_debug_spans_for_page( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + if let Some(page_span) = + collect_numeric_page_collapse_span(page, NUMERIC_PAGE_COLLAPSE_MIN_TOKENS) + { + return vec![page_span]; + } + + let block_spans = collect_numeric_block_collapse_spans(page); + if !block_spans.is_empty() { + return block_spans; + } + + let page_tokens = extract_non_whitespace_tokens_with_spans(page); + let mut spans = collect_numeric_progression_matches(page, &page_tokens, min_progress_steps); + let mut line_offset = 0usize; + + for segment in page.split_inclusive('\n') { + let (line, newline) = if let Some(body) = segment.strip_suffix('\n') { + (body, "\n") + } else { + (segment, "") + }; + + let line_tokens = extract_non_whitespace_tokens_with_spans(line); + spans.extend( + collect_compact_repeat_numeric_matches(line, &line_tokens, min_repeat_steps) + .into_iter() + .map(|span| DebugMatchSpan { + start: span.start + line_offset, + end: span.end + line_offset, + match_type: span.match_type, + }), + ); + spans.extend( + collect_same_digit_numeric_matches(line, &line_tokens, min_same_digit_steps) + .into_iter() + .map(|span| DebugMatchSpan { + start: span.start + line_offset, + end: span.end + line_offset, + match_type: span.match_type, + }), + ); + line_offset += line.len() + newline.len(); + } + + spans +} + +fn collect_ocr_debug_candidates_for_text( + source_path: &Path, + source_stem: &str, + base_stem: &str, + start_page: u64, + text: &str, + min_repeat_run: u64, +) -> Vec { + let mut candidates = Vec::new(); + let pages = 
split_pages(text); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if let Some((_annotated_page, _match_types, _match_count)) = + annotate_page_for_debug(page, min_repeat_run) + { + candidates.push(OcrDebugPageCandidate { + source_path: source_path.to_string_lossy().into_owned(), + source_stem: source_stem.to_string(), + base_stem: base_stem.to_string(), + page_number, + page_index_in_file, + }); + } + } + + candidates +} + +fn collect_numeric_debug_candidates_for_text( + source_path: &Path, + source_stem: &str, + base_stem: &str, + start_page: u64, + text: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + let mut candidates = Vec::new(); + let pages = split_pages(text); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if !collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + .is_empty() + { + candidates.push(OcrDebugPageCandidate { + source_path: source_path.to_string_lossy().into_owned(), + source_stem: source_stem.to_string(), + base_stem: base_stem.to_string(), + page_number, + page_index_in_file, + }); + } + } + + candidates +} + +fn render_ocr_debug_candidate( + candidate: &OcrDebugPageCandidate, + output_dir: &Path, + min_repeat_run: u64, +) -> anyhow::Result { + let source_path = PathBuf::from(&candidate.source_path); + let buf = fs::read(&source_path)?; + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let page_idx = candidate + .page_index_in_file + .checked_sub(1) + .ok_or_else(|| anyhow::anyhow!("invalid page index"))? 
as usize; + let page = pages + .get(page_idx) + .ok_or_else(|| anyhow::anyhow!("page index out of range for {}", candidate.source_path))?; + let (annotated_page, match_types, match_count) = annotate_page_for_debug(page, min_repeat_run) + .ok_or_else(|| { + anyhow::anyhow!( + "candidate page no longer matches: {}", + candidate.source_path + ) + })?; + let match_types_joined = match_types.join(","); + let output_name = format!( + "{}__debug_page_{:05}.md", + candidate.source_stem, candidate.page_number + ); + let output_path = output_dir.join(output_name); + + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content)?; + + Ok(OcrDebugPageRow { + source_path: candidate.source_path.clone(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: candidate.source_stem.clone(), + base_stem: candidate.base_stem.clone(), + page_number: candidate.page_number, + page_index_in_file: candidate.page_index_in_file, + match_types: match_types_joined, + match_count, + }) +} + +fn render_numeric_debug_candidate( + candidate: &OcrDebugPageCandidate, + output_dir: &Path, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> anyhow::Result { + let source_path = PathBuf::from(&candidate.source_path); + let buf = fs::read(&source_path)?; + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let page_idx = candidate + .page_index_in_file + .checked_sub(1) + .ok_or_else(|| anyhow::anyhow!("invalid page index"))? 
as usize; + let page = pages + .get(page_idx) + .ok_or_else(|| anyhow::anyhow!("page index out of range for {}", candidate.source_path))?; + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + let (annotated_page, match_types, match_count) = annotate_text_with_debug_spans(page, spans) + .ok_or_else(|| { + anyhow::anyhow!( + "candidate page no longer matches numeric detector: {}", + candidate.source_path + ) + })?; + let match_types_joined = match_types.join(","); + let output_name = format!( + "{}__debug_page_{:05}.md", + candidate.source_stem, candidate.page_number + ); + let output_path = output_dir.join(output_name); + + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content)?; + + Ok(OcrDebugPageRow { + source_path: candidate.source_path.clone(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: candidate.source_stem.clone(), + base_stem: candidate.base_stem.clone(), + page_number: candidate.page_number, + page_index_in_file: candidate.page_index_in_file, + match_types: match_types_joined, + match_count, + }) } /// Compute metrics for UTF-8 bytes; ported from original CLI. 
@@ -243,7 +1788,9 @@ fn analyse_bytes(buf: &[u8]) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u6 long_word_count += 1; let extra = (word_len - LONG_WORD_LIMIT) as u64; // >= 0 let mut weight = 1 + extra; // equals (len - 20) - if weight > 380 { weight = 380; } + if weight > 380 { + weight = 380; + } long_word_weight_sum += weight; } if word_len > longest_word { @@ -268,44 +1815,83 @@ fn analyse_bytes(buf: &[u8]) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u6 } else { if run_len >= 4 { let pen = run_len - 3; - if run_is_vowel { v_pen += pen; } else { c_pen += pen; } + if run_is_vowel { + v_pen += pen; + } else { + c_pen += pen; + } + } + if run_len > max_run { + max_run = run_len; } - if run_len > max_run { max_run = run_len; } run_is_vowel = vowel; run_len = 1; } if prev_cp != 0 { let pc_low = to_lower_fast(prev_cp); let cc_low = to_lower_fast(cp); - if is_invalid_bigram_pair(pc_low, cc_low) { invalid_bigram += 1; } + if is_invalid_bigram_pair(pc_low, cc_low) { + invalid_bigram += 1; + } + } + if prev_cp == cp && !allowed_double(cp) { + bad_double += 1; } - if prev_cp == cp && !allowed_double(cp) { bad_double += 1; } prev_cp = cp; } if run_len >= 4 { let pen = run_len - 3; - if run_is_vowel { v_pen += pen; } else { c_pen += pen; } + if run_is_vowel { + v_pen += pen; + } else { + c_pen += pen; + } + } + if run_len > max_run { + max_run = run_len; } - if run_len > max_run { max_run = run_len; } if word_len > 0 { total_word_count += 1; - if word_len < SHORT_WORD_LIMIT { short_word_count += 1; } + if word_len < SHORT_WORD_LIMIT { + short_word_count += 1; + } if word_len >= LONG_WORD_LIMIT { long_word_count += 1; let extra = (word_len - LONG_WORD_LIMIT) as u64; let mut weight = 1 + extra; // equals (len - 20) - if weight > 380 { weight = 380; } + if weight > 380 { + weight = 380; + } long_word_weight_sum += weight; } - if word_len > longest_word { longest_word = word_len; } - if prev_cp == 0x03C3 { misplaced_sigma += 1; } + if word_len > longest_word { + 
longest_word = word_len; + } + if prev_cp == 0x03C3 { + misplaced_sigma += 1; + } } - (len_greek, v_pen, c_pen, bad_double, max_run, long_word_count, long_word_weight_sum, longest_word, misplaced_sigma, invalid_bigram, short_word_count, total_word_count) + ( + len_greek, + v_pen, + c_pen, + bad_double, + max_run, + long_word_count, + long_word_weight_sum, + longest_word, + misplaced_sigma, + invalid_bigram, + short_word_count, + total_word_count, + ) } fn decode_utf8(slice: &[u8]) -> (u32, usize) { - if slice.is_empty() { return (0, 0); } + if slice.is_empty() { + return (0, 0); + } let c0 = slice[0]; if c0 < 0x80 { return (c0 as u32, 1); @@ -313,10 +1899,14 @@ fn decode_utf8(slice: &[u8]) -> (u32, usize) { let cp = ((c0 & 0x1F) as u32) << 6 | (slice[1] & 0x3F) as u32; return (cp, 2); } else if c0 & 0xF0 == 0xE0 && slice.len() >= 3 { - let cp = ((c0 & 0x0F) as u32) << 12 | ((slice[1] & 0x3F) as u32) << 6 | (slice[2] & 0x3F) as u32; + let cp = + ((c0 & 0x0F) as u32) << 12 | ((slice[1] & 0x3F) as u32) << 6 | (slice[2] & 0x3F) as u32; return (cp, 3); } else if c0 & 0xF8 == 0xF0 && slice.len() >= 4 { - let cp = ((c0 & 0x07) as u32) << 18 | ((slice[1] & 0x3F) as u32) << 12 | ((slice[2] & 0x3F) as u32) << 6 | (slice[3] & 0x3F) as u32; + let cp = ((c0 & 0x07) as u32) << 18 + | ((slice[1] & 0x3F) as u32) << 12 + | ((slice[2] & 0x3F) as u32) << 6 + | (slice[3] & 0x3F) as u32; return (cp, 4); } (0, 1) @@ -329,13 +1919,32 @@ fn decode_utf8(slice: &[u8]) -> (u32, usize) { /// v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, /// flags) fn compute_score_and_details( - buf: &[u8] + buf: &[u8], ) -> ( - f64, f64, f64, f64, - u64, u64, - u64, u64, u64, u64, u64, u64, u64, u64, u64, - f64, f64, f64, f64, f64, f64, f64, f64, - String + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, ) { let latin_pct = 
compute_latin_pct(buf); @@ -343,9 +1952,26 @@ fn compute_score_and_details( let text = String::from_utf8_lossy(buf); let (table_ratio, filtered_opt, _non_empty, table_like) = table_line_ratio_and_filtered(&text); let had_tables = table_like > 0; - let target: &[u8] = if let Some(ref s) = filtered_opt { s.as_bytes() } else { buf }; + let target: &[u8] = if let Some(ref s) = filtered_opt { + s.as_bytes() + } else { + buf + }; - let (len_greek, v_pen, c_pen, bad_dbl, max_run, long_word_count, long_word_weight_sum, longest_word, misplaced_sigma, invalid_bigram, short_word_count, total_word_count) = analyse_bytes(target); + let ( + len_greek, + v_pen, + c_pen, + bad_dbl, + max_run, + long_word_count, + long_word_weight_sum, + longest_word, + misplaced_sigma, + invalid_bigram, + short_word_count, + total_word_count, + ) = analyse_bytes(target); let mut flags: Vec<&str> = Vec::with_capacity(2); @@ -369,34 +1995,113 @@ fn compute_score_and_details( 0.0 }; // Normalized short words: per 1000 Greek chars, then excess over baseline - let short_per_1000 = if len > 0.0 { 1000.0 * (short_word_count as f64) / len } else { 0.0 }; - let short_excess_per_1000 = if short_per_1000 > SHORT_BASELINE_PER_1000 { short_per_1000 - SHORT_BASELINE_PER_1000 } else { 0.0 }; + let short_per_1000 = if len > 0.0 { + 1000.0 * (short_word_count as f64) / len + } else { + 0.0 + }; + let short_excess_per_1000 = if short_per_1000 > SHORT_BASELINE_PER_1000 { + short_per_1000 - SHORT_BASELINE_PER_1000 + } else { + 0.0 + }; // Halved sigma coefficient from 5.0 to 2.5; removed longest_word term - let score = v_rate + 1.5*c_rate + 2.0*d_rate + 2.5*sigma_end_rate + 2.0*bigram_rate + short_excess_per_1000 + long_word_rate; + let score = v_rate + + 1.5 * c_rate + + 2.0 * d_rate + + 2.5 * sigma_end_rate + + 2.0 * bigram_rate + + short_excess_per_1000 + + long_word_rate; - let (_poly_words, _greek_words, poly_ratio) = if len_greek == 0 { - (0, 0, 0.0) + let poly_ratio = if len_greek == 0 { + 0.0 } else { - 
compute_polytonic_word_ratio(if let Some(ref s) = filtered_opt { s } else { &text }) + let target_text: &str = if let Some(ref s) = filtered_opt { + s.as_str() + } else { + text.as_ref() + }; + scan_script_metrics(target_text).polytonic_ratio() }; - if poly_ratio > 0.0 { flags.push("polytonic"); } - if had_tables { flags.push("had_tables"); } + if poly_ratio > 0.0 { + flags.push("polytonic"); + } + if had_tables { + flags.push("had_tables"); + } ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_word_count, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_excess_per_1000, - flags.join(",") + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_word_count, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_excess_per_1000, + flags.join(","), ) } /// Compute noise score and latin percentage for a UTF-8 buffer. Backward-compatible API. 
fn compute_score(buf: &[u8]) -> (f64, f64) { - let (score, latin_pct, _t, _p, _lg, _tw, _v,_c,_bd,_ms,_ib,_lwc,_lw,_swc,_mr,_vr,_cr,_dr,_sr,_br,_lwr,_sr2,_sp,_f) = compute_score_and_details(buf); + let ( + score, + latin_pct, + _t, + _p, + _lg, + _tw, + _v, + _c, + _bd, + _ms, + _ib, + _lwc, + _lw, + _swc, + _mr, + _vr, + _cr, + _dr, + _sr, + _br, + _lwr, + _sr2, + _sp, + _f, + ) = compute_score_and_details(buf); (score, latin_pct) } +fn run_in_thread_pool(n_threads: Option, work: F) -> anyhow::Result +where + T: Send, + F: FnOnce() -> T + Send, +{ + let threads = n_threads + .filter(|count| *count > 0) + .unwrap_or_else(rayon::current_num_threads); + let pool = ThreadPoolBuilder::new().num_threads(threads).build()?; + Ok(pool.install(work)) +} + pub fn score_markdown_file_internal(path: &Path) -> anyhow::Result { let mut file = File::open(path)?; let mut buf = Vec::new(); @@ -405,57 +2110,508 @@ pub fn score_markdown_file_internal(path: &Path) -> anyhow::Result { Ok(score) } -pub fn score_markdown_directory_internal(root: &Path, n_threads: Option) -> anyhow::Result> { - if let Some(t) = n_threads { rayon::ThreadPoolBuilder::new().num_threads(t).build_global().ok(); } - let results: Vec<(String, f64, f64)> = WalkDir::new(root) - .into_iter() - .par_bridge() - .filter_map(Result::ok) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) - .map(|e| { - let path = e.path(); - let buf = fs::read(path).expect("read"); - let (score, latin_pct) = compute_score(&buf); - (path.to_string_lossy().into_owned(), score, latin_pct) - }) - .collect(); - Ok(results) +pub fn score_markdown_directory_internal( + root: &Path, + n_threads: Option, +) -> anyhow::Result> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let (score, latin_pct) = 
compute_score(&buf); + (path.to_string_lossy().into_owned(), score, latin_pct) + }) + .collect() + }) } // Detailed variants for analysis layer -pub fn score_markdown_file_detailed_internal(path: &Path) -> anyhow::Result<(f64, f64, f64, f64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, f64, f64, f64, f64, f64, f64, f64, f64, String)> { +pub fn score_markdown_file_detailed_internal( + path: &Path, +) -> anyhow::Result<( + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, +)> { let mut file = File::open(path)?; let mut buf = Vec::new(); file.read_to_end(&mut buf)?; Ok(compute_score_and_details(&buf)) } -pub fn score_markdown_directory_detailed_internal(root: &Path, n_threads: Option) -> anyhow::Result> { - if let Some(t) = n_threads { rayon::ThreadPoolBuilder::new().num_threads(t).build_global().ok(); } - let results: Vec<(String, f64, f64, f64, f64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, f64, f64, f64, f64, f64, f64, f64, f64, String)> = WalkDir::new(root) - .into_iter() - .par_bridge() - .filter_map(Result::ok) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) - .map(|e| { - let path = e.path(); - let buf = fs::read(path).expect("read"); - let ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) = compute_score_and_details(&buf); - ( - path.to_string_lossy().into_owned(), - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) - }) 
- .collect(); - Ok(results) +pub fn score_markdown_directory_detailed_internal( + root: &Path, + n_threads: Option, +) -> anyhow::Result< + Vec<( + String, + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, + )>, +> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let ( + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) = compute_score_and_details(&buf); + ( + path.to_string_lossy().into_owned(), + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) + }) + .collect() + }) +} + +pub fn score_markdown_directory_ocr_profile_internal( + root: &Path, + n_threads: Option, + min_repeat_run: u64, +) -> anyhow::Result> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let (script, noise) = compute_ocr_profile(&text, min_repeat_run); + let mut flags = Vec::new(); + if noise.phrase_run_max >= min_repeat_run { + 
flags.push("repeat_phrase_run"); + } + if noise.line_run_max >= min_repeat_run { + flags.push("repeat_line_run"); + } + + OcrProfileRow { + path: path.to_string_lossy().into_owned(), + percentage_greek: script.percentage_greek(), + latin_percentage: script.latin_percentage(), + polytonic_ratio: script.polytonic_ratio(), + non_whitespace_chars: script.non_whitespace_chars, + greek_char_count: script.greek_char_count, + latin_char_count: script.latin_char_count, + ocr_repeat_phrase_run_max: noise.phrase_run_max, + ocr_repeat_line_run_max: noise.line_run_max, + ocr_repeat_suspicious_line_count: noise.suspicious_line_count, + ocr_repeat_suspicious_line_ratio: noise.suspicious_line_ratio, + ocr_noise_suspect: noise.suspect, + ocr_noise_flags: flags.join(","), + } + }) + .collect() + }) +} + +pub fn export_ocr_match_debug_pages_internal( + root: &Path, + output_dir: &Path, + n_threads: Option, + min_repeat_run: u64, + max_pages: Option, + sample_seed: u64, +) -> anyhow::Result> { + fs::create_dir_all(output_dir)?; + if let Some(limit) = max_pages { + let mut candidates: Vec = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + collect_ocr_debug_candidates_for_text( + path, + &source_stem, + &base_stem, + start_page, + &text, + min_repeat_run, + ) + }) + .reduce(Vec::new, |mut acc, mut item| { + acc.append(&mut item); + acc + }) + })?; + + if candidates.len() > limit { + let mut rng = StdRng::seed_from_u64(sample_seed); + candidates.shuffle(&mut rng); + candidates.truncate(limit); + } + candidates.sort_by(|a, b| { + 
a.source_path + .cmp(&b.source_path) + .then(a.page_number.cmp(&b.page_number)) + }); + + let output_dir = output_dir.to_path_buf(); + let mut rows: Vec = run_in_thread_pool(n_threads, move || { + candidates + .into_par_iter() + .map(|candidate| { + render_ocr_debug_candidate(&candidate, &output_dir, min_repeat_run) + }) + .collect::>>() + })??; + rows.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + return Ok(rows); + } + + let rows: Vec> = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let mut page_rows = Vec::new(); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if let Some((annotated_page, match_types, match_count)) = + annotate_page_for_debug(page, min_repeat_run) + { + let match_types_joined = match_types.join(","); + let output_name = + format!("{}__debug_page_{:05}.md", source_stem, page_number); + let output_path = output_dir.join(output_name); + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content).expect("write debug page"); + + page_rows.push(OcrDebugPageRow { + source_path: path.to_string_lossy().into_owned(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: source_stem.clone(), + base_stem: base_stem.clone(), + page_number, + page_index_in_file, + match_types: match_types_joined, + 
match_count, + }); + } + } + + page_rows + }) + .collect() + })?; + + let mut flat = Vec::new(); + for mut group in rows { + flat.append(&mut group); + } + flat.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + Ok(flat) +} + +pub fn export_numeric_match_debug_pages_internal( + root: &Path, + output_dir: &Path, + n_threads: Option, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, + max_pages: Option, + sample_seed: u64, +) -> anyhow::Result> { + fs::create_dir_all(output_dir)?; + if let Some(limit) = max_pages { + let mut candidates: Vec = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + collect_numeric_debug_candidates_for_text( + path, + &source_stem, + &base_stem, + start_page, + &text, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }) + .reduce(Vec::new, |mut acc, mut item| { + acc.append(&mut item); + acc + }) + })?; + + if candidates.len() > limit { + let mut rng = StdRng::seed_from_u64(sample_seed); + candidates.shuffle(&mut rng); + candidates.truncate(limit); + } + candidates.sort_by(|a, b| { + a.source_path + .cmp(&b.source_path) + .then(a.page_number.cmp(&b.page_number)) + }); + + let output_dir = output_dir.to_path_buf(); + let mut rows: Vec = run_in_thread_pool(n_threads, move || { + candidates + .into_par_iter() + .map(|candidate| { + render_numeric_debug_candidate( + &candidate, + &output_dir, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }) + .collect::>>() + 
})??; + rows.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + return Ok(rows); + } + + let rows: Vec> = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let mut page_rows = Vec::new(); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + if let Some((annotated_page, match_types, match_count)) = + annotate_text_with_debug_spans(page, spans) + { + let match_types_joined = match_types.join(","); + let output_name = + format!("{}__debug_page_{:05}.md", source_stem, page_number); + let output_path = output_dir.join(output_name); + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content).expect("write numeric debug page"); + + page_rows.push(OcrDebugPageRow { + source_path: path.to_string_lossy().into_owned(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: source_stem.clone(), + base_stem: base_stem.clone(), + page_number, + page_index_in_file, + match_types: match_types_joined, + match_count, + }); + } + } + + page_rows + }) + .collect() + })?; + + let mut flat = Vec::new(); + for mut group in rows { + flat.append(&mut group); + } + flat.sort_by(|a, b| { + a.output_path + 
.cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + Ok(flat) } diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index e5a4329..7e4a75b 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1,6 +1,8 @@ """Cleaning and filtering helpers split from Corpus.""" from __future__ import annotations +import html +import importlib import json import logging import math @@ -12,6 +14,8 @@ import subprocess import sys import time +import unicodedata +from collections import Counter from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -20,11 +24,1880 @@ from .._naming import canonical_stem from ..gloss_downloader import GlossDownloader +from ..scripts.table_markdown_audit import ( + _expand_rows as _audit_expand_table_rows, + _parse_table_rows as _audit_parse_table_rows, + audit_table as _audit_table_html, +) # Avoid importing section/classifier here; cleaning phase does not use them. 
from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path from .corpus_state import _ProcessingStateManager from .corpus_utils import _maybe_import_torch +PAGE_SPLIT_MARKER = "<--- Page Split --->" +WORD_REPEAT_HASH_MASK = (1 << 64) - 1 +WORD_REPEAT_HASH_BASE = 1469598103934665603 +WORD_REPEAT_MERGE_NONWHITESPACE_GAP = 10 +HTML_TABLE_BLOCK_RE = re.compile(r"(?is)") +HTML_TABLE_LINE_RE = re.compile(r"(?i).*?") +HTML_TABLE_CELL_RE = re.compile(r"(?is)(.*?)") +HTML_TAG_RE = re.compile(r"(?is)<[^>]+>") +EXISTING_MATCH_BLOCK_RE = re.compile(r"(?is)]*>.*?") +LATEX_BLOCK_RE = re.compile(r"(?is)\$\$.*?\$\$") +LATEX_BRACKET_RE = re.compile(r"(?is)\\\[.*?\\\]") +LATEX_BEGIN_END_RE = re.compile(r"(?is)\\begin\{([^\n{}]+)\}.*?\\end\{\1\}") +LATEX_INLINE_PAREN_RE = re.compile(r"(?is)\\\(.*?\\\)") +LATEX_INLINE_DOLLAR_RE = re.compile(r"(?s)(?[^<]{1,16}|[^<]{1,16})){8,}" +) +WORD_CONFUSABLE_FOLD_MAP = { + "ο": "o", + "κ": "k", +} +LATEX_SEGMENT_PATTERNS = [ + ("begin_end", LATEX_BEGIN_END_RE), + ("display_dollar", LATEX_BLOCK_RE), + ("display_bracket", LATEX_BRACKET_RE), + ("inline_paren", LATEX_INLINE_PAREN_RE), + ("inline_dollar", LATEX_INLINE_DOLLAR_RE), +] +LATEX_TEXT_WRAPPER_MACROS = ( + r"\mathrm{", + r"\text{", + r"\operatorname{", + r"\mathit{", + r"\mathbf{", +) +LATEX_INTERNAL_REPEAT_COMMANDS = { + r"\frac", + r"\left", + r"\right", + r"\sqrt", + r"\begin", + r"\end", + r"\quad", + r"\qquad", + r"\cdots", + r"\ldots", + r"\mathrm", + r"\text", + r"\operatorname", + r"\mathit", + r"\mathbf", + r"\hat", + r"\tilde", + r"\bar", +} +LATEX_SHORT_REPEAT_ATOM_COMMANDS = { + r"\Delta", + r"\hat", + r"\tilde", + r"\bar", +} +LATEX_SEGMENT_LOCAL_NONWHITESPACE_GAP = 12 +LATEX_SEGMENT_EXACT_RUN_MIN = 4 +LATEX_SEGMENT_SKELETON_RUN_MIN = 4 +LATEX_SEGMENT_ALTERNATING_RUN_MIN = 6 +LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN = 4 +LATEX_SHORT_SEGMENT_MAX_NORM = 32 +LATEX_LONG_SEGMENT_MIN_NORM = 24 +LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP = 3 +LATEX_SMALL_DEFINITION_FAMILY_MAX_RUN = 6 
+HYBRID_PREFIX_RE = re.compile( + r"(?\d+\)|\d+\.(?:\d+\.)*\d*\.?)(?=\s*[^\W\d_])", + re.UNICODE, +) +HYBRID_MARKUP_BODY_RE = re.compile(r"(?i)(<[^>]+>|src=|alt=|image_|\.png\b|\.jpg\b|\.jpeg\b|\.gif\b)") +HYBRID_REPEAT_MIN_ITEMS = 4 +HYBRID_REPEAT_MIN_BODY_ALNUM = 6 +HYBRID_REPEAT_MAX_CYCLE = 6 +HYBRID_REPEAT_MIN_CYCLE_ITEMS = 8 +HYBRID_INLINE_CLAUSE_DELIMITER_RE = re.compile(r"[;\n]|,(?!\d)") +HYBRID_INLINE_TOKEN_RE = re.compile(r"[0-9]+(?:[.,/][0-9]+)*|[^\W\d_]+", re.UNICODE) +HYBRID_INLINE_CONTEXT_WORDS = 2 +HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS = 2 +HYBRID_INLINE_CONTEXT_MIN_CHARS = 8 +HYBRID_INLINE_REPEAT_MIN_ITEMS = 6 +LATEX_SYMBOL_SLOT_COMMANDS = ( + r"\mu", + r"\nu", + r"\alpha", + r"\beta", + r"\gamma", + r"\lambda", + r"\tau", + r"\omega", +) +TABLE_EMPTY_MIN_ROWS = 6 +TABLE_EMPTY_MIN_CELLS = 18 +TABLE_EMPTY_MAX_NONEMPTY_RATIO = 0.15 +TABLE_REPEAT_MIN_ROWS = 4 +TABLE_REPEAT_MIN_NONEMPTY_CELLS = 2 +TABLE_REPEAT_MIN_ROW_TEXT_CHARS = 6 +TABLE_REPEAT_MIN_DUPLICATE_ROWS = 2 +TABLE_SENTENCE_SHELL_MIN_WORDS = 6 +TABLE_SENTENCE_SHELL_MIN_CHARS = 40 +MATCH_CATEGORY_BY_TYPE = { + "ascending_numeric_sequence": "numeric", + "repeat_numeric_run": "numeric", + "same_digit_numeric_run": "numeric", + "numeric_page_collapse": "numeric", + "numeric_block_collapse": "numeric", + "numeric_repeat": "numeric", + "word_repeat": "word", + "latex_repeat": "latex", + "hybrid_repeat": "hybrid", + "table_repeat": "table", +} + +_WORD_REPEAT_RUST_MOD: Optional[Any] = None +_WORD_REPEAT_RUST_IMPORT_ATTEMPTED = False +_RUST_EXTENSION_PREBUILD_ATTEMPTED: Set[str] = set() + + +def _blank_non_newlines(text: str) -> str: + return "".join("\n" if ch == "\n" else " " for ch in text) + + +def _blank_regex_matches_preserve_layout(text: str, pattern: re.Pattern[str]) -> str: + return pattern.sub(lambda match: _blank_non_newlines(match.group(0)), text) + + +def _filter_tables_preserve_layout(text: str) -> str: + text = _blank_regex_matches_preserve_layout(text, HTML_TABLE_BLOCK_RE) + kept: 
List[str] = [] + for segment in text.splitlines(keepends=True): + trimmed = segment.strip() + if trimmed and trimmed.startswith("|") and trimmed.endswith("|"): + kept.append(_blank_non_newlines(segment)) + continue + if trimmed and HTML_TABLE_LINE_RE.search(trimmed): + kept.append(_blank_non_newlines(segment)) + continue + kept.append(segment) + return "".join(kept) + + +def _filter_latex_preserve_layout(text: str) -> str: + for pattern in ( + LATEX_BEGIN_END_RE, + LATEX_BLOCK_RE, + LATEX_BRACKET_RE, + LATEX_INLINE_PAREN_RE, + LATEX_INLINE_DOLLAR_RE, + ): + text = _blank_regex_matches_preserve_layout(text, pattern) + return text + + +def _blank_existing_match_regions_preserve_layout(text: str) -> str: + return _blank_regex_matches_preserve_layout(text, EXISTING_MATCH_BLOCK_RE) + + +def _blank_raw_spans_preserve_layout(text: str, spans: List[Dict[str, Any]]) -> str: + if not spans: + return text + + chars = list(text) + for span in spans: + start = max(0, int(span["start"])) + end = min(len(chars), int(span["end"])) + for idx in range(start, end): + if chars[idx] != "\n": + chars[idx] = " " + return "".join(chars) + + +def _extract_latex_segments(text: str) -> List[Dict[str, Any]]: + raw: List[Tuple[int, int, str, str]] = [] + for name, pattern in LATEX_SEGMENT_PATTERNS: + for match in pattern.finditer(text): + raw.append((match.start(), match.end(), name, match.group(0))) + + raw.sort(key=lambda item: (item[0], -(item[1] - item[0]), item[2])) + segments: List[Dict[str, Any]] = [] + last_end = -1 + for start, end, kind, body in raw: + if segments and start >= segments[-1]["start"] and end <= segments[-1]["end"]: + continue + if start < last_end: + continue + segments.append({"start": start, "end": end, "kind": kind, "text": body}) + last_end = end + return segments + + +def _normalize_table_cell_text(cell_html: str) -> str: + text = HTML_TAG_RE.sub(" ", cell_html) + text = html.unescape(text) + return " ".join(text.split()) + + +def 
_table_cell_has_content(cell_text: str) -> bool: + return any(ch.isalnum() for ch in cell_text) + + +def _extract_html_table_rows(table_text: str) -> List[List[str]]: + rows: List[List[str]] = [] + for row_match in HTML_TABLE_ROW_RE.finditer(table_text): + cells = [ + _normalize_table_cell_text(cell_match.group(1)) + for cell_match in HTML_TABLE_CELL_RE.finditer(row_match.group(0)) + ] + if cells: + rows.append(cells) + return rows + + +def _flatten_html_table_nonempty_cells(table_text: str) -> List[str]: + parsed_rows, _ = _audit_parse_table_rows(table_text) + grid, _ = _audit_expand_table_rows(parsed_rows) + if not grid: + return [] + nonempty: List[str] = [] + for row in grid: + for cell in row: + normalized = " ".join(cell.split()) + if any(ch.isalnum() for ch in normalized): + nonempty.append(normalized) + return nonempty + + +def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: + nonempty_cells = _flatten_html_table_nonempty_cells(table_text) + if len(nonempty_cells) != 1: + return None + candidate = nonempty_cells[0].strip() + if len(candidate) < TABLE_SENTENCE_SHELL_MIN_CHARS: + return None + if len(re.findall(r"[^\W\d_]+", candidate, re.UNICODE)) < TABLE_SENTENCE_SHELL_MIN_WORDS: + return None + return candidate + + +def _render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: + sentence_shell = _extract_sentence_shell_table_text(table_text) + if sentence_shell and match_kind == "sentence_shell_table": + return sentence_shell + + audit = _audit_table_html(Path("/tmp/table_fragment.md"), 0, 0, table_text) + if audit.markdown: + return audit.markdown + return table_text + + +def _replace_html_tables_with_markdown(text: str) -> str: + if " str: + if match_kind in {"sentence_shell_table", "empty_table_collapse", "repeated_rows"}: + return "" + return _render_table_html_for_output(table_text, match_kind=match_kind) + + +def _clean_fill_for_removed_span(page_text: str, start: int, end: int) -> str: + 
removed = page_text[start:end] + prev_char = page_text[start - 1] if start > 0 else "" + next_char = page_text[end] if end < len(page_text) else "" + if "\n" in removed: + if prev_char == "\n" or next_char == "\n": + return "" + return "\n" + if prev_char and next_char and not prev_char.isspace() and not next_char.isspace(): + return " " + return "" + + +def _find_table_repeat_spans(page_text: str) -> List[Dict[str, Any]]: + analysis_text = _blank_existing_match_regions_preserve_layout(page_text) + spans: List[Dict[str, Any]] = [] + for table_match in HTML_TABLE_BLOCK_RE.finditer(analysis_text): + raw_table = page_text[table_match.start() : table_match.end()] + sentence_shell = _extract_sentence_shell_table_text(raw_table) + if sentence_shell is not None: + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["table_repeat"], + "kind": "sentence_shell_table", + "word_count": len(re.findall(r"[^\W\d_]+", sentence_shell, re.UNICODE)), + "char_count": len(sentence_shell), + } + ) + continue + + rows = _extract_html_table_rows(raw_table) + if not rows: + continue + + row_count = len(rows) + cell_count = sum(len(row) for row in rows) + nonempty_cells = sum( + 1 for row in rows for cell in row if _table_cell_has_content(cell) + ) + nonempty_ratio = (nonempty_cells / cell_count) if cell_count else 0.0 + + if ( + row_count >= TABLE_EMPTY_MIN_ROWS + and cell_count >= TABLE_EMPTY_MIN_CELLS + and nonempty_ratio <= TABLE_EMPTY_MAX_NONEMPTY_RATIO + ): + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["table_repeat"], + "kind": "empty_table_collapse", + "row_count": row_count, + "cell_count": cell_count, + "nonempty_ratio": round(nonempty_ratio, 3), + } + ) + continue + + row_keys: List[Tuple[str, ...]] = [] + for row in rows: + nonempty_cells_in_row = [cell for cell in row if 
_table_cell_has_content(cell)] + if len(nonempty_cells_in_row) < TABLE_REPEAT_MIN_NONEMPTY_CELLS: + continue + row_text = " ".join(nonempty_cells_in_row) + if len(row_text) < TABLE_REPEAT_MIN_ROW_TEXT_CHARS: + continue + row_keys.append(tuple(cell.casefold() for cell in row)) + + if row_count < TABLE_REPEAT_MIN_ROWS or not row_keys: + continue + + row_counts = Counter(row_keys) + duplicate_rows = sum(freq - 1 for freq in row_counts.values() if freq >= 2) + if duplicate_rows >= TABLE_REPEAT_MIN_DUPLICATE_ROWS: + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["table_repeat"], + "kind": "repeated_rows", + "row_count": row_count, + "duplicate_rows": duplicate_rows, + } + ) + + return spans + + +def _normalize_latex_repeat_with_map(text: str) -> Tuple[str, List[int]]: + normalized: List[str] = [] + raw_map: List[int] = [] + for raw_idx, ch in enumerate(text): + if ch.isspace(): + continue + normalized.append(ch.casefold()) + raw_map.append(raw_idx) + return "".join(normalized), raw_map + + +def _normalize_latex_segment_exact(text: str) -> str: + return "".join(ch.casefold() for ch in text if not ch.isspace()) + + +def _normalize_latex_segment_skeleton(text: str) -> str: + normalized = _normalize_latex_segment_exact(text) + normalized = re.sub(r"\d+", "#", normalized) + for command in LATEX_SYMBOL_SLOT_COMMANDS: + normalized = normalized.replace(command.casefold(), r"\sym") + normalized = re.sub(r"dr(?:_?\*|_?\\ast)?", "dr@", normalized) + return normalized + + +def _is_short_latex_repeat_atom(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if len(normalized) > LATEX_SHORT_SEGMENT_MAX_NORM: + return False + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + if not command_tokens: + return False + return set(command_tokens).issubset(LATEX_SHORT_REPEAT_ATOM_COMMANDS) + + +def _is_suspicious_internal_latex_repeat(raw_segment: 
str) -> bool: + if not raw_segment: + return False + if "" in raw_segment or "" in raw_segment: + return True + + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + if any(wrapper in raw_segment for wrapper in LATEX_TEXT_WRAPPER_MACROS): + return len(command_tokens) >= 8 or len(raw_segment) >= 60 + + counts = Counter(command_tokens) + if any(command in LATEX_INTERNAL_REPEAT_COMMANDS for command in counts): + return max(counts.values(), default=0) >= LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP + + return False + + +def _extract_latex_lhs_key(raw_segment: str) -> Optional[str]: + normalized = _normalize_latex_segment_exact(raw_segment) + if "=" not in normalized: + return None + lhs = normalized.split("=", 1)[0] + return lhs or None + + +def _is_latex_symbol_inventory_segment(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if not normalized or len(normalized) > 96: + return False + if any(token in normalized for token in ("=", "+", "-", r"\sum", r"\prod", r"\int", r"\frac")): + return False + if _is_short_latex_repeat_atom(raw_segment): + return False + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + return bool(command_tokens) + + +def _is_small_parameterized_definition_family(run: List[Dict[str, Any]]) -> bool: + if len(run) > LATEX_SMALL_DEFINITION_FAMILY_MAX_RUN: + return False + lhs_keys = [_extract_latex_lhs_key(str(item["text"])) for item in run] + if any(key is None for key in lhs_keys): + return False + if any( + key is not None and any(token in key for token in (r"\frac", r"\sum", r"\prod", r"\int", "+", "-", "=")) + for key in lhs_keys + ): + return False + return len(set(lhs_keys)) == len(lhs_keys) + + +def _is_symbol_inventory_run(run: List[Dict[str, Any]]) -> bool: + return all(_is_latex_symbol_inventory_segment(str(item["text"])) for item in run) + + +def _short_atom_run_has_clean_gaps(page_text: str, run: List[Dict[str, Any]]) -> bool: + if len(run) < 2: + return True + for left, right in zip(run, 
run[1:]): + gap = page_text[int(left["end"]) : int(right["start"])] + if any(ch.isalnum() for ch in gap): + return False + return True + + +def _extract_latex_numeric_slots(raw_segment: str) -> Optional[List[float]]: + slots: List[float] = [] + for token in re.findall(r"[0-9]+(?:[.,/][0-9]+)*", raw_segment): + if "/" in token: + if token.count("/") != 1: + return None + lhs, rhs = token.split("/", 1) + if not lhs.isdigit() or not rhs.isdigit() or int(rhs) == 0: + return None + slots.append(float(int(lhs) / int(rhs))) + continue + if token.count(".") + token.count(",") > 1: + return None + normalized = token.replace(",", ".", 1) + if "." in normalized: + lhs, rhs = normalized.split(".", 1) + if not lhs.isdigit() or not rhs.isdigit(): + return None + slots.append(float(normalized)) + continue + if token.isdigit(): + slots.append(float(int(token))) + continue + return None + return slots or None + + +def _latex_slot_progress_position(values: List[float]) -> bool: + if len(values) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN: + return False + + diffs: List[float] = [] + tolerance = 1e-9 + for left, right in zip(values, values[1:]): + diff = right - left + if diff < -tolerance: + return False + if diff > tolerance: + diffs.append(diff) + + if not diffs: + return False + + baseline = diffs[0] + return all(abs(diff - baseline) <= max(tolerance, abs(baseline) * 1e-6) for diff in diffs[1:]) + + +def _is_latex_slot_progression_run(run: List[Dict[str, Any]]) -> bool: + if len(run) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN: + return False + if _is_small_parameterized_definition_family(run): + return False + if _is_symbol_inventory_run(run): + return False + if _is_short_latex_repeat_atom(str(run[0]["text"])): + return False + + slot_lists = [item.get("numeric_slots") for item in run] + if any(not slots for slots in slot_lists): + return False + slot_count = len(slot_lists[0] or []) + if slot_count == 0 or any(len(slots or []) != slot_count for slots in slot_lists): + return False + + 
varying_positions = 0 + for slot_idx in range(slot_count): + values = [float(slots[slot_idx]) for slots in slot_lists if slots is not None] + if len({round(value, 9) for value in values}) > 1: + varying_positions += 1 + if varying_positions == 0 or varying_positions > 2: + return False + + for slot_idx in range(slot_count): + values = [float(slots[slot_idx]) for slots in slot_lists if slots is not None] + if _latex_slot_progress_position(values): + return True + return False + + +def _normalize_alnum_with_map_skip_tags(text: str) -> Tuple[str, List[int]]: + norm_chars: List[str] = [] + raw_char_indices: List[int] = [] + in_tag = False + for raw_idx, ch in enumerate(text): + if in_tag: + if ch == ">": + in_tag = False + continue + if ch == "<": + in_tag = True + continue + folded = unicodedata.normalize("NFD", ch.casefold()) + for sub in folded: + category = unicodedata.category(sub) + if category.startswith("L") or category.startswith("N"): + sub = WORD_CONFUSABLE_FOLD_MAP.get(sub, sub) + norm_chars.append(sub) + raw_char_indices.append(raw_idx) + return "".join(norm_chars), raw_char_indices + + +def _normalize_hybrid_body(text: str) -> str: + norm_chars: List[str] = [] + for ch in text: + folded = unicodedata.normalize("NFD", ch.casefold()) + for sub in folded: + category = unicodedata.category(sub) + if category.startswith("L") or category.startswith("N"): + norm_chars.append(WORD_CONFUSABLE_FOLD_MAP.get(sub, sub)) + return "".join(norm_chars) + + +def _classify_hybrid_numeric_field(prefix: str) -> Optional[Dict[str, Any]]: + token = prefix.strip() + if not token: + return None + + trailing_paren = token.endswith(")") + trailing_dot = token.endswith(".") + stripped = token[:-1] if trailing_paren or trailing_dot else token + if not stripped: + return None + + if "/" in stripped: + return {"field_kind": "numeric_value", "raw": token} + + parts = stripped.split(".") + if not all(part.isdigit() for part in parts): + return None + + numbers = [int(part) for part in 
parts] + shape = ".".join("#" for _ in numbers) + if trailing_paren: + shape += ")" + elif trailing_dot: + shape += "." + + if trailing_paren or trailing_dot: + field_kind = "header_counter" + elif len(numbers) >= 3: + field_kind = "header_counter" + elif len(numbers) == 2 and len(parts[-1]) <= 2: + field_kind = "header_counter" + else: + field_kind = "numeric_value" + + return { + "field_kind": field_kind, + "numbers": numbers, + "shape": shape, + "raw": token, + } + + +def _classify_hybrid_inline_numeric_field(token: str) -> Optional[Dict[str, Any]]: + stripped = token.strip() + if not stripped: + return None + + if re.fullmatch(r"[0-9]+", stripped): + return {"field_kind": "numeric_value", "raw": stripped} + + if stripped.count("/") == 1: + lhs, rhs = stripped.split("/", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs) and int(rhs) != 0: + return {"field_kind": "numeric_value", "raw": stripped} + return None + + decimal_candidate = stripped.replace(",", ".", 1) + if decimal_candidate.count(".") == 1: + lhs, rhs = decimal_candidate.split(".", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs): + return {"field_kind": "numeric_value", "raw": stripped} + + return None + + +def _parse_hybrid_numeric_value(token: str) -> Optional[float]: + stripped = token.strip() + if not stripped: + return None + + if re.fullmatch(r"[0-9]+", stripped): + return float(int(stripped)) + + if stripped.count("/") == 1: + lhs, rhs = stripped.split("/", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs) and int(rhs) != 0: + return float(int(lhs) / int(rhs)) + return None + + decimal_candidate = stripped.replace(",", ".", 1) + if decimal_candidate.count(".") == 1: + lhs, rhs = decimal_candidate.split(".", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs): + return float(decimal_candidate) + + return None + + +def _extract_hybrid_numbered_items( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], +) -> 
List[Dict[str, Any]]: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _filter_latex_preserve_layout(analysis_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + candidates: List[Dict[str, Any]] = [] + for match in HYBRID_PREFIX_RE.finditer(analysis_text): + field = _classify_hybrid_numeric_field(match.group("prefix")) + if field is None: + continue + candidates.append( + { + "prefix_start": match.start("prefix"), + "prefix_end": match.end("prefix"), + **field, + } + ) + + items: List[Dict[str, Any]] = [] + for idx, candidate in enumerate(candidates): + next_start = ( + int(candidates[idx + 1]["prefix_start"]) if idx + 1 < len(candidates) else len(analysis_text) + ) + body_raw = analysis_text[int(candidate["prefix_end"]) : next_start].strip() + if HYBRID_MARKUP_BODY_RE.search(body_raw): + continue + body_key = _normalize_hybrid_body(body_raw) + has_alpha = any(ch.isalpha() for ch in body_key) + if not has_alpha: + continue + body_is_full = len(body_key) >= HYBRID_REPEAT_MIN_BODY_ALNUM + items.append( + { + "start": int(candidate["prefix_start"]), + "end": next_start, + "prefix_end": int(candidate["prefix_end"]), + "field_kind": str(candidate["field_kind"]), + "numbers": list(candidate.get("numbers", [])), + "shape": str(candidate.get("shape", "")), + "body_raw": body_raw, + "body_key": body_key, + "body_is_full": body_is_full, + } + ) + + return items + + +def _extract_hybrid_inline_numeric_items( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _filter_latex_preserve_layout(analysis_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + clause_ranges: List[Tuple[int, int]] = [] + 
clause_start = 0 + for match in HYBRID_INLINE_CLAUSE_DELIMITER_RE.finditer(analysis_text): + clause_ranges.append((clause_start, match.start())) + clause_start = match.end() + clause_ranges.append((clause_start, len(analysis_text))) + + items: List[Dict[str, Any]] = [] + for clause_index, (raw_start, raw_end) in enumerate(clause_ranges): + clause = analysis_text[raw_start:raw_end] + if not clause.strip(): + continue + + leading_ws = len(clause) - len(clause.lstrip()) + trailing_ws = len(clause) - len(clause.rstrip()) + clause_start_abs = raw_start + leading_ws + clause_end_abs = raw_end - trailing_ws + clause_text = analysis_text[clause_start_abs:clause_end_abs] + if not clause_text or HYBRID_MARKUP_BODY_RE.search(clause_text): + continue + + working_offset = clause_start_abs + working_text = clause_text + prefix_match = HYBRID_PREFIX_RE.match(working_text) + if prefix_match: + working_offset += prefix_match.end() + working_text = working_text[prefix_match.end() :].lstrip() + working_offset = clause_end_abs - len(working_text) + if not working_text: + continue + + tokens: List[Dict[str, Any]] = [] + numeric_token_positions: List[int] = [] + for match in HYBRID_INLINE_TOKEN_RE.finditer(working_text): + token = match.group(0) + abs_start = working_offset + match.start() + abs_end = working_offset + match.end() + if token and token[0].isdigit(): + numeric_info = _classify_hybrid_inline_numeric_field(token) + if numeric_info is None: + continue + parsed_value = _parse_hybrid_numeric_value(token) + if parsed_value is None: + continue + numeric_token_positions.append(len(tokens)) + tokens.append( + { + "kind": "numeric", + "start": abs_start, + "end": abs_end, + "raw": token, + "numeric_value": parsed_value, + } + ) + continue + token_key = _normalize_hybrid_body(token) + if not token_key: + continue + tokens.append( + { + "kind": "alpha", + "start": abs_start, + "end": abs_end, + "raw": token, + "token_key": token_key, + } + ) + + if len(numeric_token_positions) != 1: + 
continue + + numeric_pos = numeric_token_positions[0] + numeric_token = tokens[numeric_pos] + left_alpha = [token for token in tokens[:numeric_pos] if token.get("kind") == "alpha"] + right_alpha = [token for token in tokens[numeric_pos + 1 :] if token.get("kind") == "alpha"] + left_context = left_alpha[-HYBRID_INLINE_CONTEXT_WORDS:] + right_context = right_alpha[:HYBRID_INLINE_CONTEXT_WORDS] + alpha_word_count = len(left_context) + len(right_context) + if alpha_word_count < HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS: + continue + + context_parts = [str(token.get("token_key", "")) for token in left_context] + context_parts.append("num") + context_parts.extend(str(token.get("token_key", "")) for token in right_context) + context_key = _normalize_hybrid_body(" ".join(context_parts)) + if len(context_key) < HYBRID_INLINE_CONTEXT_MIN_CHARS: + continue + + item_start = int(left_context[0]["start"]) if left_context else int(numeric_token["start"]) + item_end = int(right_context[-1]["end"]) if right_context else int(numeric_token["end"]) + items.append( + { + "start": item_start, + "end": item_end, + "clause_index": clause_index, + "field_kind": "numeric_value", + "inline_context_key": context_key, + "numeric_value": float(numeric_token["numeric_value"]), + } + ) + + return items + + +def _hybrid_partial_body_matches(candidate_body_key: str, target_body_key: str) -> bool: + if not candidate_body_key or not target_body_key: + return False + if candidate_body_key == target_body_key: + return False + if not target_body_key.startswith(candidate_body_key): + return False + min_chars = min(4, len(target_body_key)) + min_ratio_chars = max(1, math.ceil(len(target_body_key) * 0.5)) + return len(candidate_body_key) >= min(min_chars, min_ratio_chars) + + +def _extend_hybrid_tail_span_end( + items: List[Dict[str, Any]], + *, + run_start: int, + run_end: int, + expected_body_key: str, +) -> int: + span_end = int(items[run_end - 1]["end"]) + if run_end >= len(items): + return span_end + + 
tail = items[run_end] + if tail.get("field_kind") != "header_counter": + return span_end + if str(tail.get("shape", "")) != str(items[run_start].get("shape", "")): + return span_end + if not _hybrid_header_progresses(items[run_end - 1], tail): + return span_end + if not _hybrid_partial_body_matches(str(tail.get("body_key", "")), expected_body_key): + return span_end + return int(tail["end"]) + + +def _hybrid_header_progresses(previous: Dict[str, Any], current: Dict[str, Any]) -> bool: + if previous.get("field_kind") != "header_counter" or current.get("field_kind") != "header_counter": + return False + prev_numbers = list(previous.get("numbers", [])) + curr_numbers = list(current.get("numbers", [])) + if len(prev_numbers) != len(curr_numbers) or not prev_numbers: + return False + return prev_numbers[:-1] == curr_numbers[:-1] and curr_numbers[-1] == prev_numbers[-1] + 1 + + +def _hybrid_header_is_parent(previous: Dict[str, Any], current: Dict[str, Any]) -> bool: + if previous.get("field_kind") != "header_counter" or current.get("field_kind") != "header_counter": + return False + prev_numbers = list(previous.get("numbers", [])) + curr_numbers = list(current.get("numbers", [])) + if not prev_numbers or len(prev_numbers) + 1 != len(curr_numbers): + return False + return curr_numbers[:-1] == prev_numbers + + +def _hybrid_inline_step(previous: Dict[str, Any], current: Dict[str, Any]) -> Optional[float]: + if previous.get("field_kind") != "numeric_value" or current.get("field_kind") != "numeric_value": + return None + if int(current.get("clause_index", -1)) != int(previous.get("clause_index", -1)) + 1: + return None + if str(previous.get("inline_context_key", "")) != str(current.get("inline_context_key", "")): + return None + + previous_value = float(previous.get("numeric_value", 0.0)) + current_value = float(current.get("numeric_value", 0.0)) + step = current_value - previous_value + if step <= 0: + return None + return step + + +def 
_hybrid_inline_step_matches(expected_step: float, actual_step: float) -> bool: + tolerance = max(1e-9, abs(expected_step) * 1e-6) + return abs(expected_step - actual_step) <= tolerance + + +def _find_hybrid_same_body_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + spans: List[Dict[str, Any]] = [] + idx = 0 + while idx < len(items): + item = items[idx] + if item.get("field_kind") != "header_counter" or not bool(item.get("body_is_full")): + idx += 1 + continue + + end_idx = idx + 1 + while ( + end_idx < len(items) + and items[end_idx].get("field_kind") == "header_counter" + and bool(items[end_idx].get("body_is_full")) + and str(items[end_idx].get("body_key", "")) == str(item.get("body_key", "")) + and str(items[end_idx].get("shape", "")) == str(item.get("shape", "")) + and _hybrid_header_progresses(items[end_idx - 1], items[end_idx]) + ): + end_idx += 1 + + run_length = end_idx - idx + if run_length >= HYBRID_REPEAT_MIN_ITEMS: + start_idx = idx + if idx > 0: + previous = items[idx - 1] + if ( + bool(previous.get("body_is_full")) + and + str(previous.get("body_key", "")) == str(item.get("body_key", "")) + and _hybrid_header_is_parent(previous, item) + ): + start_idx = idx - 1 + + span_end = _extend_hybrid_tail_span_end( + items, + run_start=idx, + run_end=end_idx, + expected_body_key=str(item.get("body_key", "")), + ) + spans.append( + { + "start": int(items[start_idx]["start"]), + "end": span_end, + "match_types": ["hybrid_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"], + "kind": "same_body_progression", + "item_count": end_idx - start_idx, + } + ) + idx = end_idx + continue + + idx += 1 + + return spans + + +def _find_hybrid_cycle_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + spans: List[Dict[str, Any]] = [] + n_items = len(items) + for cycle_len in range(2, HYBRID_REPEAT_MAX_CYCLE + 1): + idx = 0 + while idx + 2 * cycle_len <= n_items: + run = items[idx : idx + 2 * cycle_len] + if 
any(item.get("field_kind") != "header_counter" or not bool(item.get("body_is_full")) for item in run): + idx += 1 + continue + shapes = {str(item.get("shape", "")) for item in run} + if len(shapes) != 1: + idx += 1 + continue + if not all(_hybrid_header_progresses(run[pos - 1], run[pos]) for pos in range(1, len(run))): + idx += 1 + continue + + template = [str(item.get("body_key", "")) for item in run[:cycle_len]] + if len(set(template)) < 2: + idx += 1 + continue + + if any(str(run[pos].get("body_key", "")) != template[pos % cycle_len] for pos in range(cycle_len, len(run))): + idx += 1 + continue + + end_idx = idx + 2 * cycle_len + while ( + end_idx < n_items + and items[end_idx].get("field_kind") == "header_counter" + and bool(items[end_idx].get("body_is_full")) + and str(items[end_idx].get("shape", "")) == str(items[idx].get("shape", "")) + and _hybrid_header_progresses(items[end_idx - 1], items[end_idx]) + and str(items[end_idx].get("body_key", "")) == template[(end_idx - idx) % cycle_len] + ): + end_idx += 1 + + item_count = end_idx - idx + if item_count >= HYBRID_REPEAT_MIN_CYCLE_ITEMS: + span_end = _extend_hybrid_tail_span_end( + items, + run_start=idx, + run_end=end_idx, + expected_body_key=template[(end_idx - idx) % cycle_len], + ) + spans.append( + { + "start": int(items[idx]["start"]), + "end": span_end, + "match_types": ["hybrid_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"], + "kind": "body_cycle_progression", + "item_count": item_count, + "cycle_len": cycle_len, + } + ) + idx = end_idx + continue + + idx += 1 + + return spans + + +def _find_hybrid_inline_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + spans: List[Dict[str, Any]] = [] + idx = 0 + while idx + HYBRID_INLINE_REPEAT_MIN_ITEMS <= len(items): + first = items[idx] + second = items[idx + 1] + expected_step = _hybrid_inline_step(first, second) + if expected_step is None: + idx += 1 + continue + + end_idx = idx + 2 + while end_idx < len(items): + 
actual_step = _hybrid_inline_step(items[end_idx - 1], items[end_idx]) + if actual_step is None or not _hybrid_inline_step_matches(expected_step, actual_step): + break + end_idx += 1 + + item_count = end_idx - idx + if item_count >= HYBRID_INLINE_REPEAT_MIN_ITEMS: + spans.append( + { + "start": int(items[idx]["start"]), + "end": int(items[end_idx - 1]["end"]), + "match_types": ["hybrid_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"], + "kind": "inline_numeric_progression", + "item_count": item_count, + } + ) + idx = end_idx + continue + + idx += 1 + + return spans + + +def _find_hybrid_numbered_repeat_spans( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + items = _extract_hybrid_numbered_items(page_text, blocked_spans=blocked_spans) + spans = _find_hybrid_same_body_progression_spans(items) + spans.extend(_find_hybrid_cycle_progression_spans(items)) + inline_items = _extract_hybrid_inline_numeric_items(page_text, blocked_spans=blocked_spans) + spans.extend(_find_hybrid_inline_progression_spans(inline_items)) + spans.sort(key=lambda item: (int(item["start"]), -(int(item["end"]) - int(item["start"])))) + + deduped: List[Dict[str, Any]] = [] + for span in spans: + if deduped and int(span["start"]) >= int(deduped[-1]["start"]) and int(span["end"]) <= int(deduped[-1]["end"]): + continue + deduped.append(span) + return deduped + + +def _build_word_repeat_hash(text: str) -> Tuple[List[int], List[int]]: + pref = [0] * (len(text) + 1) + pw = [1] * (len(text) + 1) + for idx, ch in enumerate(text): + code = ord(ch) + 1 + pref[idx + 1] = (pref[idx] * WORD_REPEAT_HASH_BASE + code) & WORD_REPEAT_HASH_MASK + pw[idx + 1] = (pw[idx] * WORD_REPEAT_HASH_BASE) & WORD_REPEAT_HASH_MASK + return pref, pw + + +def _word_repeat_hash_slice(pref: List[int], pw: List[int], start: int, end: int) -> int: + return (pref[end] - ((pref[start] * pw[end - start]) & WORD_REPEAT_HASH_MASK)) & WORD_REPEAT_HASH_MASK + + +def 
_word_repeat_blocks_equal( + text: str, + pref: List[int], + pw: List[int], + lhs: int, + rhs: int, + period: int, +) -> bool: + return ( + _word_repeat_hash_slice(pref, pw, lhs, lhs + period) + == _word_repeat_hash_slice(pref, pw, rhs, rhs + period) + and text[lhs : lhs + period] == text[rhs : rhs + period] + ) + + +def _get_word_repeat_rust_module() -> Optional[Any]: + global _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED + if _WORD_REPEAT_RUST_IMPORT_ATTEMPTED: + return _WORD_REPEAT_RUST_MOD + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + try: + module = importlib.import_module("glossapi_rs_noise") + except Exception: + _WORD_REPEAT_RUST_MOD = None + return None + if hasattr(module, "find_word_repeat_spans"): + _WORD_REPEAT_RUST_MOD = module + else: + _WORD_REPEAT_RUST_MOD = None + return _WORD_REPEAT_RUST_MOD + + +def _find_word_repeat_spans_python( + normalized_text: str, + *, + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, int]]: + n_chars = len(normalized_text) + if n_chars < rep_threshold * min_period: + return [] + + pref, pw = _build_word_repeat_hash(normalized_text) + max_period = min(max(min_period, window // rep_threshold), n_chars // rep_threshold) + spans: List[Dict[str, int]] = [] + + for period in range(min_period, max_period + 1): + idx = 0 + while idx + rep_threshold * period <= n_chars: + is_repeat = True + for multiple in range(1, rep_threshold): + if not _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + idx, + idx + multiple * period, + period, + ): + is_repeat = False + break + if not is_repeat: + idx += 1 + continue + + left = idx + while left - period >= 0 and _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + left - period, + left, + period, + ): + left -= period + + right = idx + rep_threshold * period + while right + period <= n_chars and _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + right - period, + right, + period, + ): + right += period + + pattern = 
normalized_text[left : left + period] + tail_chars = 0 + while ( + right + tail_chars < n_chars + and tail_chars < period + and normalized_text[right + tail_chars] == pattern[tail_chars] + ): + tail_chars += 1 + + spans.append( + { + "start": left, + "end": right + tail_chars, + "period": period, + "repetitions": (right - left) // period, + "tail_chars": tail_chars, + } + ) + idx = right + + spans.sort(key=lambda item: (item["start"], -(item["end"] - item["start"]), item["period"])) + deduped: List[Dict[str, int]] = [] + for span in spans: + if deduped and span["start"] >= deduped[-1]["start"] and span["end"] <= deduped[-1]["end"]: + continue + deduped.append(span) + return deduped + + +def _find_word_repeat_spans( + normalized_text: str, + *, + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, int]]: + rust_mod = _get_word_repeat_rust_module() + if rust_mod is None: + return _find_word_repeat_spans_python( + normalized_text, + rep_threshold=rep_threshold, + min_period=min_period, + window=window, + ) + return [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "period": int(item["period"]), + "repetitions": int(item["repetitions"]), + "tail_chars": int(item["tail_chars"]), + } + for item in rust_mod.find_word_repeat_spans( + normalized_text, + int(rep_threshold), + int(min_period), + int(window), + ) + ] + + +def _gap_has_fewer_than_n_nonwhitespace_chars(text: str, start: int, end: int, limit: int) -> bool: + if start >= end: + return True + count = 0 + for ch in text[start:end]: + if not ch.isspace(): + count += 1 + if count >= limit: + return False + return True + + +def _latex_segments_are_local(page_text: str, left: Dict[str, Any], right: Dict[str, Any]) -> bool: + return _gap_has_fewer_than_n_nonwhitespace_chars( + page_text, + int(left["end"]), + int(right["start"]), + LATEX_SEGMENT_LOCAL_NONWHITESPACE_GAP, + ) + + +def _latex_local_groups(page_text: str, segments: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]: + 
if not segments: + return [] + + groups: List[List[Dict[str, Any]]] = [[segments[0]]] + for segment in segments[1:]: + if _latex_segments_are_local(page_text, groups[-1][-1], segment): + groups[-1].append(segment) + else: + groups.append([segment]) + return groups + + +def _find_local_latex_segment_block_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for group in _latex_local_groups(page_text, segments): + if len(group) < LATEX_SEGMENT_EXACT_RUN_MIN: + continue + + idx = 0 + while idx < len(group): + exact_key = str(group[idx]["exact_key"]) + end_idx = idx + 1 + while end_idx < len(group) and str(group[end_idx]["exact_key"]) == exact_key: + end_idx += 1 + + run_length = end_idx - idx + exact_run = group[idx:end_idx] + if run_length >= LATEX_SEGMENT_EXACT_RUN_MIN and ( + len(exact_key) >= LATEX_LONG_SEGMENT_MIN_NORM + or ( + _is_short_latex_repeat_atom(str(group[idx]["text"])) + and _short_atom_run_has_clean_gaps(page_text, exact_run) + ) + ): + labeled_spans.append( + { + "start": int(exact_run[0]["start"]), + "end": int(exact_run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + idx = end_idx + + idx = 0 + while idx < len(group): + skeleton_key = str(group[idx]["skeleton_key"]) + end_idx = idx + 1 + while end_idx < len(group) and str(group[end_idx]["skeleton_key"]) == skeleton_key: + end_idx += 1 + + run = group[idx:end_idx] + exact_vocab = {str(item["exact_key"]) for item in run} + if ( + len(run) >= LATEX_SEGMENT_SKELETON_RUN_MIN + and len(skeleton_key) >= LATEX_LONG_SEGMENT_MIN_NORM + and not _is_short_latex_repeat_atom(str(run[0]["text"])) + and len(exact_vocab) >= 2 + and not _is_small_parameterized_definition_family(run) + and not _is_symbol_inventory_run(run) + ): + labeled_spans.append( + { + "start": int(run[0]["start"]), + "end": int(run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": 
MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + idx = end_idx + + exact_sequence = [str(item["exact_key"]) for item in group] + exact_counts = Counter(exact_sequence) + if ( + len(group) >= LATEX_SEGMENT_ALTERNATING_RUN_MIN + and len(exact_counts) <= 2 + and min(exact_counts.values()) >= 2 + ): + avg_length = sum(len(item) for item in exact_sequence) / len(exact_sequence) + if avg_length >= LATEX_LONG_SEGMENT_MIN_NORM and not all( + _is_short_latex_repeat_atom(str(item["text"])) for item in group + ): + labeled_spans.append( + { + "start": int(group[0]["start"]), + "end": int(group[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + return labeled_spans + + +def _find_local_latex_slot_progression_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for group in _latex_local_groups(page_text, segments): + if len(group) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN: + continue + + idx = 0 + while idx < len(group): + skeleton_key = str(group[idx]["skeleton_key"]) + end_idx = idx + 1 + while end_idx < len(group) and str(group[end_idx]["skeleton_key"]) == skeleton_key: + end_idx += 1 + + run = group[idx:end_idx] + exact_vocab = {str(item["exact_key"]) for item in run} + if ( + len(run) >= LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN + and len(skeleton_key) >= LATEX_LONG_SEGMENT_MIN_NORM + and len(exact_vocab) >= 2 + and _is_latex_slot_progression_run(run) + ): + labeled_spans.append( + { + "start": int(run[0]["start"]), + "end": int(run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + "kind": "slot_progression", + "item_count": len(run), + } + ) + idx = end_idx + + return labeled_spans + + +def _find_latex_repeat_spans( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, Any]]: + analysis_text = 
_filter_tables_preserve_layout(page_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + labeled_spans: List[Dict[str, Any]] = [] + + for wrapper_pattern in (LATEX_TEXT_WRAPPER_BODY_RE, LATEX_TEXT_WRAPPER_OPEN_BODY_RE): + for match in wrapper_pattern.finditer(analysis_text): + body = match.group(1) + command_tokens = LATEX_COMMAND_RE.findall(body) + if len(command_tokens) < 16: + continue + if len(set(command_tokens)) > 4: + continue + labeled_spans.append( + { + "start": match.start(1), + "end": match.end(1), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + for match in HTML_MATH_MARKUP_CLUSTER_RE.finditer(analysis_text): + labeled_spans.append( + { + "start": match.start(), + "end": match.end(), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + segments = _extract_latex_segments(analysis_text) + for segment in segments: + segment["exact_key"] = _normalize_latex_segment_exact(str(segment["text"])) + segment["skeleton_key"] = _normalize_latex_segment_skeleton(str(segment["text"])) + + labeled_spans.extend(_find_local_latex_segment_block_spans(page_text, segments)) + + for segment in segments: + normalized_text, raw_map = _normalize_latex_repeat_with_map(segment["text"]) + normalized_spans = _find_word_repeat_spans( + normalized_text, + rep_threshold=rep_threshold, + min_period=min_period, + window=window, + ) + for span in normalized_spans: + if span["end"] <= span["start"] or span["start"] >= len(raw_map): + continue + start = segment["start"] + raw_map[span["start"]] + end = segment["start"] + raw_map[span["end"] - 1] + 1 + raw_span = page_text[start:end] + if not _is_suspicious_internal_latex_repeat(raw_span): + continue + labeled_spans.append( + { + "start": start, + "end": end, + "period": span["period"], + "repetitions": span["repetitions"], + 
"tail_chars": span["tail_chars"], + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + return labeled_spans + + +def _find_latex_slot_progression_spans( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + segments = _extract_latex_segments(analysis_text) + for segment in segments: + raw_text = str(segment["text"]) + segment["exact_key"] = _normalize_latex_segment_exact(raw_text) + segment["skeleton_key"] = _normalize_latex_segment_skeleton(raw_text) + segment["numeric_slots"] = _extract_latex_numeric_slots(raw_text) + + return _find_local_latex_slot_progression_spans(page_text, segments) + + +def _shared_repeat_match_type(segment: str) -> Optional[str]: + if not segment: + return None + has_letter = any(ch.isalpha() for ch in segment) + has_digit = any(ch.isdigit() for ch in segment) + if has_letter: + return "word_repeat" + if has_digit: + return "numeric_repeat" + return None + + +def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + if not spans: + return [] + + spans = sorted(spans, key=lambda item: (item["start"], item["end"])) + merged: List[Dict[str, Any]] = [] + for span in spans: + if not merged: + merged.append(dict(span)) + continue + + previous = merged[-1] + overlaps = span["start"] <= previous["end"] + close_gap = ( + not overlaps + and previous["category"] == span["category"] + and previous["category"] != "table" + and _gap_has_fewer_than_n_nonwhitespace_chars( + text, + previous["end"], + span["start"], + WORD_REPEAT_MERGE_NONWHITESPACE_GAP, + ) + ) + if overlaps or close_gap: + same_single_type = previous.get("match_types", []) == span.get("match_types", []) + same_kind = previous.get("kind") == 
span.get("kind") + previous["start"] = min(previous["start"], span["start"]) + previous["end"] = max(previous["end"], span["end"]) + previous["match_types"] = sorted( + set(previous.get("match_types", [])) | set(span.get("match_types", [])) + ) + if "period" in span: + previous["period"] = min(previous.get("period", span["period"]), span["period"]) + if "repetitions" in span: + previous["repetitions"] = max( + previous.get("repetitions", span["repetitions"]), + span["repetitions"], + ) + if "tail_chars" in span: + previous["tail_chars"] = max( + previous.get("tail_chars", 0), + span.get("tail_chars", 0), + ) + if ( + same_single_type + and same_kind + and previous.get("item_count") is not None + and span.get("item_count") is not None + ): + previous["item_count"] = int(previous["item_count"]) + int(span["item_count"]) + continue + merged.append(dict(span)) + return merged + + +def _render_page_with_labeled_spans( + page_text: str, + spans: List[Dict[str, Any]], + *, + mode: str = "debug", +) -> Tuple[str, List[str], int, int, int, int, int]: + if mode not in {"debug", "clean"}: + raise ValueError(f"Unsupported OCR render mode: {mode}") + merged_spans = _merge_labeled_raw_spans(page_text, spans) + if not merged_spans: + return _replace_html_tables_with_markdown(page_text), [], 0, 0, 0, 0, 0 + + parts: List[str] = [] + pos = 0 + seen_types: Set[str] = set() + numeric_count = 0 + word_count = 0 + latex_count = 0 + table_count = 0 + hybrid_count = 0 + for span in merged_spans: + start = span["start"] + end = span["end"] + if start > pos: + parts.append(_replace_html_tables_with_markdown(page_text[pos:start])) + match_types = list(span.get("match_types", [])) + if mode == "debug": + open_tag = f'") + else: + if match_types == ["table_repeat"]: + parts.append( + _render_table_html_for_clean( + page_text[start:end], + match_kind=span.get("kind"), + ) + ) + else: + parts.append(_clean_fill_for_removed_span(page_text, start, end)) + pos = end + 
seen_types.update(match_types) + if span["category"] == "numeric": + numeric_count += 1 + elif span["category"] == "word": + word_count += 1 + elif span["category"] == "latex": + latex_count += 1 + elif span["category"] == "table": + table_count += 1 + elif span["category"] == "hybrid": + hybrid_count += 1 + if pos < len(page_text): + parts.append(_replace_html_tables_with_markdown(page_text[pos:])) + return "".join(parts), sorted(seen_types), numeric_count, word_count, latex_count, table_count, hybrid_count + + +def _annotate_page_with_labeled_spans( + page_text: str, + spans: List[Dict[str, Any]], +) -> Tuple[str, List[str], int, int, int, int, int]: + return _render_page_with_labeled_spans(page_text, spans, mode="debug") + + +def _count_hybrid_matches_in_page(page_text: str, spans: List[Dict[str, Any]]) -> int: + merged_spans = _merge_labeled_raw_spans(page_text, spans) + return sum(1 for span in merged_spans if span.get("category") == "hybrid") + + +def _find_labeled_shared_repeat_spans( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, Any]]: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _filter_latex_preserve_layout(analysis_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + normalized_text, raw_map = _normalize_alnum_with_map_skip_tags(analysis_text) + normalized_spans = _find_word_repeat_spans( + normalized_text, + rep_threshold=rep_threshold, + min_period=min_period, + window=window, + ) + labeled_spans: List[Dict[str, Any]] = [] + for span in normalized_spans: + if span["end"] <= span["start"] or span["start"] >= len(raw_map): + continue + match_type = _shared_repeat_match_type(normalized_text[span["start"] : span["end"]]) + if match_type is None: + continue + start = raw_map[span["start"]] + end = raw_map[span["end"] - 
1] + 1 + labeled_spans.append( + { + "start": start, + "end": end, + "period": span["period"], + "repetitions": span["repetitions"], + "tail_chars": span["tail_chars"], + "match_types": [match_type], + "category": MATCH_CATEGORY_BY_TYPE[match_type], + } + ) + return labeled_spans + + +def _render_combined_ocr_page( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + mode: str = "debug", +) -> Dict[str, Any]: + page_start = time.perf_counter() + + char_eval_start = time.perf_counter() + page_noise_metrics = dict(noise_mod.evaluate_page_character_noise(page_text)) + char_eval_elapsed = time.perf_counter() - char_eval_start + + table_start = time.perf_counter() + table_spans = _find_table_repeat_spans(page_text) + table_elapsed = time.perf_counter() - table_start + + numeric_analysis_page = _filter_tables_preserve_layout(page_text) + numeric_analysis_page = _filter_latex_preserve_layout(numeric_analysis_page) + + numeric_start = time.perf_counter() + numeric_spans = [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "match_types": [str(item["match_type"])], + "category": MATCH_CATEGORY_BY_TYPE[str(item["match_type"])], + } + for item in noise_mod.find_numeric_debug_page_spans( + numeric_analysis_page, + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + ) + ] + numeric_elapsed = time.perf_counter() - numeric_start + + latex_start = time.perf_counter() + latex_spans = _find_latex_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans, + rep_threshold=int(word_rep_threshold), + min_period=int(word_min_period), + window=int(word_window), + ) + latex_elapsed = time.perf_counter() - latex_start + + hybrid_start = time.perf_counter() + hybrid_spans = _find_hybrid_numbered_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans + latex_spans, + ) + 
hybrid_elapsed = time.perf_counter() - hybrid_start + + shared_start = time.perf_counter() + shared_spans = _find_labeled_shared_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans + latex_spans + hybrid_spans, + rep_threshold=int(word_rep_threshold), + min_period=int(word_min_period), + window=int(word_window), + ) + shared_elapsed = time.perf_counter() - shared_start + + ( + annotated_page, + page_types, + page_numeric_count, + page_word_count, + page_latex_count, + page_table_count, + page_hybrid_count, + ) = _render_page_with_labeled_spans( + page_text, + table_spans + numeric_spans + latex_spans + hybrid_spans + shared_spans, + mode=mode, + ) + + page_total_time = time.perf_counter() - page_start + return { + "annotated_page": annotated_page, + "page_types": page_types, + "page_numeric_count": page_numeric_count, + "page_word_count": page_word_count, + "page_latex_count": page_latex_count, + "page_table_count": page_table_count, + "page_hybrid_count": page_hybrid_count, + "page_noise_metrics": page_noise_metrics, + "char_eval_seconds": char_eval_elapsed, + "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + + +def _render_combined_ocr_debug_page( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> Dict[str, Any]: + return _render_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + mode="debug", + ) + + +def _summarize_metric(values: List[float]) -> Dict[str, float]: + if not values: + return 
{"count": 0, "p50": 0.0, "p95": 0.0, "max": 0.0} + array = np.array(values, dtype=float) + return { + "count": int(array.size), + "p50": float(np.percentile(array, 50)), + "p95": float(np.percentile(array, 95)), + "max": float(array.max()), + } + class CleanPhaseMixin: @staticmethod @@ -37,27 +1910,45 @@ def _project_root() -> Path: return candidate return here.parents[2] - def _load_rust_extension(self, module_name: str, manifest_relative: str): + def _load_rust_extension( + self, + module_name: str, + manifest_relative: str, + *, + required_attrs: Optional[Iterable[str]] = None, + ): """Import a Rust extension, building it with maturin if necessary.""" import importlib - try: - return importlib.import_module(module_name) - except ModuleNotFoundError: - self.logger.warning( - "Rust extension %s missing; attempting in-place build via maturin …", - module_name, - ) + required = tuple(required_attrs or ()) + + def _missing_attrs(module: Any) -> List[str]: + return [attr for attr in required if not hasattr(module, attr)] + + def _build_extension_once() -> None: + if module_name in _RUST_EXTENSION_PREBUILD_ATTEMPTED: + return + _RUST_EXTENSION_PREBUILD_ATTEMPTED.add(module_name) root_dir = self._project_root() manifest = root_dir / manifest_relative if not manifest.exists(): - raise RuntimeError( - f"Cannot locate Cargo manifest for {module_name} at {manifest}" + return + build_env = os.environ.copy() + if sys.prefix != getattr(sys, "base_prefix", sys.prefix): + build_env.setdefault("VIRTUAL_ENV", sys.prefix) + venv_bin = str(Path(sys.prefix) / "bin") + build_env["PATH"] = ( + f"{venv_bin}:{build_env['PATH']}" + if build_env.get("PATH") + else venv_bin ) try: subprocess.run( [sys.executable, "-m", "pip", "install", "maturin>=1.5,<2.0"], check=True, + env=build_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) subprocess.run( [ @@ -70,12 +1961,106 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str): str(manifest), ], check=True, + 
env=build_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) - return importlib.import_module(module_name) + importlib.invalidate_caches() except Exception as build_err: + self.logger.debug( + "Rust prebuild for %s skipped or failed: %s", + module_name, + build_err, + ) + + def _import_module_with_fallback(): + candidates = [module_name] + if "." not in module_name: + candidates.append(f"{module_name}.{module_name}") + + last_error: Optional[Exception] = None + for candidate in candidates: + try: + return importlib.import_module(candidate) + except Exception as err: # pragma: no cover - import surface varies by wheel layout + last_error = err + if last_error is not None: + raise last_error + raise ModuleNotFoundError(module_name) + + _build_extension_once() + + needs_build = False + try: + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if not missing: + return module + self.logger.warning( + "Rust extension %s is missing required attributes %s; attempting in-place build via maturin …", + module_name, + ", ".join(missing), + ) + needs_build = True + except ModuleNotFoundError: + self.logger.warning( + "Rust extension %s missing; attempting in-place build via maturin …", + module_name, + ) + needs_build = True + + if not needs_build: + raise RuntimeError(f"Unexpected load state for Rust extension {module_name}") + + root_dir = self._project_root() + manifest = root_dir / manifest_relative + if not manifest.exists(): + raise RuntimeError( + f"Cannot locate Cargo manifest for {module_name} at {manifest}" + ) + try: + build_env = os.environ.copy() + if sys.prefix != getattr(sys, "base_prefix", sys.prefix): + build_env.setdefault("VIRTUAL_ENV", sys.prefix) + venv_bin = str(Path(sys.prefix) / "bin") + build_env["PATH"] = ( + f"{venv_bin}:{build_env['PATH']}" + if build_env.get("PATH") + else venv_bin + ) + subprocess.run( + [sys.executable, "-m", "pip", "install", "maturin>=1.5,<2.0"], + check=True, + env=build_env, + ) + 
subprocess.run( + [ + sys.executable, + "-m", + "maturin", + "develop", + "--release", + "--manifest-path", + str(manifest), + ], + check=True, + env=build_env, + ) + importlib.invalidate_caches() + sys.modules.pop(module_name, None) + if "." not in module_name: + sys.modules.pop(f"{module_name}.{module_name}", None) + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if missing: raise RuntimeError( - f"Automatic build of {module_name} failed: {build_err}" + f"Built {module_name} but it is still missing required attributes: {missing}" ) + return module + except Exception as build_err: + raise RuntimeError( + f"Automatic build of {module_name} failed: {build_err}" + ) def _load_metrics_dataframe( self, parquet_path: Path, filenames: Optional[Iterable[str]] = None @@ -115,6 +2100,30 @@ def _merge_metric_dataframe( base_idx.update(update_idx) return base_idx.reset_index(drop=True) + def _resolve_clean_metrics_parquet(self, parquet_schema) -> Path: + parquet_path: Optional[Path] = self._get_cached_metadata_parquet() + if parquet_path is None: + existing_metadata = parquet_schema.find_metadata_parquet(self.input_dir) + if existing_metadata is not None: + parquet_path = self._cache_metadata_parquet(existing_metadata) + if parquet_path is None: + ensured = parquet_schema.ensure_metadata_parquet(self.output_dir) + if ensured is not None: + parquet_path = self._cache_metadata_parquet(ensured) + if parquet_path is None: + ensured = parquet_schema.ensure_metadata_parquet(self.input_dir) + if ensured is not None: + parquet_path = self._cache_metadata_parquet(ensured) + if parquet_path is None: + metadata_target = self.output_dir / "download_results" / "download_results.parquet" + self.logger.info( + "Cleaner: no metadata parquet found; will bootstrap %s when metrics become available.", + metadata_target, + ) + else: + metadata_target = parquet_path + return self._cache_metadata_parquet(metadata_target) + def clean( self, input_dir: Union[str, Path] = 
None, @@ -156,7 +2165,9 @@ def clean( self.ocr_model_dir = Path(ocr_model_dir) self._load_rust_extension( - "glossapi_rs_cleaner", "rust/glossapi_rs_cleaner/Cargo.toml" + "glossapi_rs_cleaner", + "rust/glossapi_rs_cleaner/Cargo.toml", + required_attrs=("run_complete_pipeline",), ) self.logger.info("Using compiled glossapi_rs_cleaner extension for fast cleaning") @@ -168,28 +2179,7 @@ def clean( # Prepare parquet helper parquet_schema = ParquetSchema({"url_column": self.url_column}) - parquet_path: Optional[Path] = self._get_cached_metadata_parquet() - if parquet_path is None: - existing_metadata = parquet_schema.find_metadata_parquet(self.input_dir) - if existing_metadata is not None: - parquet_path = self._cache_metadata_parquet(existing_metadata) - if parquet_path is None: - ensured = parquet_schema.ensure_metadata_parquet(self.output_dir) - if ensured is not None: - parquet_path = self._cache_metadata_parquet(ensured) - if parquet_path is None: - ensured = parquet_schema.ensure_metadata_parquet(self.input_dir) - if ensured is not None: - parquet_path = self._cache_metadata_parquet(ensured) - if parquet_path is None: - metadata_target = self.output_dir / "download_results" / "download_results.parquet" - self.logger.info( - "Cleaner: no metadata parquet found; will bootstrap %s when metrics become available.", - metadata_target, - ) - else: - metadata_target = parquet_path - parquet_path = self._cache_metadata_parquet(metadata_target) + parquet_path = self._resolve_clean_metrics_parquet(parquet_schema) import os records: list = [] # will hold metrics for parquet merge @@ -447,7 +2437,9 @@ def finalize(self) -> None: try: self.logger.info("Scoring cleaned markdown files with glossapi_rs_noise …") noise_mod = self._load_rust_extension( - "glossapi_rs_noise", "rust/glossapi_rs_noise/Cargo.toml" + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("score_markdown_directory_detailed",), ) results = noise_mod.score_markdown_directory_detailed( 
str(self.cleaned_markdown_dir), os.cpu_count() @@ -702,6 +2694,717 @@ def _merge_reason(value: str) -> str: if write_cleaned_files: self.markdown_dir = self.cleaned_markdown_dir + def clean_ocr( + self, + input_dir: Union[str, Path] = None, + num_threads: int = None, + drop_bad: bool = False, + *, + min_repeat_run: int = 6, + write_cleaned_files: bool = True, + min_progress_steps: int = 10, + min_repeat_steps: int = 8, + min_same_digit_steps: int = 10, + word_rep_threshold: int = 4, + word_min_period: int = 3, + word_window: int = 96, + ) -> None: + """Clean OCR markdown with the shared page loop and update OCR-noise metrics. + + The OCR profile keeps the existing canonical script metrics columns + (`percentage_greek`, `latin_percentage`, `polytonic_ratio`) and adds + OCR-specific noise diagnostics. When ``write_cleaned_files`` is enabled, + the same combined page analyzer used by the debugger is applied in + ``mode="clean"`` and the cleaned markdown is written to + ``self.cleaned_markdown_dir``. 
+ """ + from glossapi.parquet_schema import ParquetSchema + + if input_dir is None: + input_dir = self.markdown_dir + else: + input_dir = Path(input_dir) + + parquet_schema = ParquetSchema({"url_column": self.url_column}) + parquet_path = self._resolve_clean_metrics_parquet(parquet_schema) + parquet_path.parent.mkdir(parents=True, exist_ok=True) + + noise_mod = self._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=( + "score_markdown_directory_ocr_profile", + "find_numeric_debug_page_spans", + "evaluate_page_character_noise", + ), + ) + n_threads = int(num_threads or os.cpu_count() or 4) + md_files = sorted(input_dir.glob("*.md")) + if write_cleaned_files: + if self.cleaned_markdown_dir.exists(): + shutil.rmtree(self.cleaned_markdown_dir) + self.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) + self.logger.info( + "Cleaning OCR markdown with shared combined loop into %s for %d markdown files…", + self.cleaned_markdown_dir, + len(md_files), + ) + for source_path in md_files: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + cleaned_pages: List[str] = [] + for page in pages: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="clean", + ) + cleaned_pages.append(str(page_result["annotated_page"])) + output_path = self.cleaned_markdown_dir / source_path.name + output_path.write_text( + PAGE_SPLIT_MARKER.join(cleaned_pages), + encoding="utf-8", + ) + + self.logger.info( + "Scoring OCR markdown files with glossapi_rs_noise OCR profile on %d markdown files…", + len(md_files), + ) + + results = noise_mod.score_markdown_directory_ocr_profile( + str(input_dir), + n_threads, + int(min_repeat_run), 
+ ) + df_updates = pd.DataFrame(list(results)) + if df_updates.empty: + self.good_files = [] + self.logger.info("OCR cleaning found no markdown files under %s", input_dir) + return + + df_updates["filename"] = df_updates["path"].apply( + lambda value: f"{Path(str(value)).stem}.pdf" + ) + df_updates["polytonic_ratio"] = pd.to_numeric( + df_updates["polytonic_ratio"], errors="coerce" + ).round(2) + df_updates["percentage_greek"] = pd.to_numeric( + df_updates["percentage_greek"], errors="coerce" + ).round(3) + df_updates["latin_percentage"] = pd.to_numeric( + df_updates["latin_percentage"], errors="coerce" + ).round(3) + df_updates["ocr_repeat_suspicious_line_ratio"] = pd.to_numeric( + df_updates["ocr_repeat_suspicious_line_ratio"], errors="coerce" + ).round(4) + df_updates["ocr_noise_flags"] = ( + df_updates["ocr_noise_flags"].fillna("").astype(str) + ) + + update_columns = [ + "filename", + "percentage_greek", + "latin_percentage", + "polytonic_ratio", + "ocr_noise_suspect", + "ocr_noise_flags", + "ocr_repeat_phrase_run_max", + "ocr_repeat_line_run_max", + "ocr_repeat_suspicious_line_count", + "ocr_repeat_suspicious_line_ratio", + ] + + df = self._load_metrics_dataframe(parquet_path, df_updates.get("filename")) + self._ensure_metric_columns( + df, + { + "filter": "ok", + "percentage_greek": pd.NA, + "latin_percentage": pd.NA, + "polytonic_ratio": pd.NA, + "ocr_noise_suspect": False, + "ocr_noise_flags": "", + "ocr_repeat_phrase_run_max": pd.NA, + "ocr_repeat_line_run_max": pd.NA, + "ocr_repeat_suspicious_line_count": pd.NA, + "ocr_repeat_suspicious_line_ratio": pd.NA, + }, + ) + df = self._merge_metric_dataframe(df, df_updates[update_columns]) + + if "filter" not in df.columns: + df["filter"] = "ok" + else: + df["filter"] = df["filter"].fillna("ok").astype(str) + + suspect_mask = df["ocr_noise_suspect"].fillna(False).astype(bool) + if bool(suspect_mask.any()): + current = df.loc[suspect_mask, "filter"].astype(str) + + def _append_ocr_noise(value: str) -> str: + if 
def clean_ocr_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Union[str, Path] = None,
    num_threads: int = None,
    *,
    min_repeat_run: int = 6,
    max_pages: Optional[int] = 1000,
    sample_seed: int = 0,
) -> List[Dict[str, Any]]:
    """Write debug copies of the pages on which OCR repetition was detected.

    The heavy lifting is delegated to ``export_ocr_match_debug_pages`` from the
    ``glossapi_rs_noise`` extension; pages without a match are not exported.

    Args:
        output_dir: Destination directory. Existing ``*.md`` files and any
            previous ``manifest.jsonl`` in it are removed first.
        input_dir: Directory to scan; defaults to ``self.markdown_dir``.
        num_threads: Worker count handed to the extension. Falls back to
            ``os.cpu_count()`` and then to 4.
        min_repeat_run: Forwarded to the extension as an int.
        max_pages: Forwarded to the extension as an int, or ``None`` unchanged.
        sample_seed: Forwarded to the extension as an int.

    Returns:
        One plain dict per exported page, in the order the extension
        produced them. The same rows are written to ``manifest.jsonl``.
    """
    source_dir = self.markdown_dir if input_dir is None else Path(input_dir)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Start from a clean slate so the manifest only describes this run.
    for leftover in target_dir.glob("*.md"):
        leftover.unlink()
    manifest_file = target_dir / "manifest.jsonl"
    if manifest_file.exists():
        manifest_file.unlink()

    extension = self._load_rust_extension(
        "glossapi_rs_noise",
        "rust/glossapi_rs_noise/Cargo.toml",
        required_attrs=("export_ocr_match_debug_pages",),
    )
    worker_count = int(num_threads or os.cpu_count() or 4)
    self.logger.info(
        "Exporting OCR debug matches from %s into %s with glossapi_rs_noise…",
        source_dir,
        target_dir,
    )

    exported = list(
        extension.export_ocr_match_debug_pages(
            str(source_dir),
            str(target_dir),
            worker_count,
            int(min_repeat_run),
            int(max_pages) if max_pages is not None else None,
            int(sample_seed),
        )
    )

    with manifest_file.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(dict(entry), ensure_ascii=False) + "\n" for entry in exported
        )

    self.logger.info(
        "Exported %d OCR debug pages with matches to %s",
        len(exported),
        target_dir,
    )
    return [dict(entry) for entry in exported]
def clean_ocr_numeric_word_debug_docs(
    self,
    output_dir: Union[str, Path],
    input_dir: Union[str, Path] = None,
    *,
    max_docs: Optional[int] = 100,
    doc_offset: int = 0,
    min_progress_steps: int = 10,
    min_repeat_steps: int = 8,
    min_same_digit_steps: int = 10,
    word_rep_threshold: int = 4,
    word_min_period: int = 3,
    word_window: int = 96,
) -> List[Dict[str, Any]]:
    """Annotate complete markdown documents with table, numeric, LaTeX, hybrid, then shared-repeat matches.

    Default repetition threshold for both word and LaTeX repeat detection is 4.

    Every selected ``*.md`` file is split on ``PAGE_SPLIT_MARKER``, each page is
    passed to ``_render_combined_ocr_debug_page`` and the annotated pages are
    re-joined and written to ``output_dir`` under the source file name.

    Files written to ``output_dir`` (previous versions are deleted first):
        * ``<name>.md`` -- annotated copy of every processed document.
        * ``manifest.jsonl`` -- one row per document (the return value).
        * ``page_metrics.jsonl`` -- one row per page: counts, noise metrics, timings.
        * ``summary.json`` -- totals plus ``_summarize_metric`` of each timing list.

    Args:
        output_dir: Destination directory; created if missing.
        input_dir: Directory holding the markdown files; defaults to
            ``self.markdown_dir``.
        max_docs: Maximum number of documents to process, ``None`` for no cap.
        doc_offset: Number of documents (in sorted order) to skip; negative
            values are clamped to 0.
        min_progress_steps, min_repeat_steps, min_same_digit_steps,
        word_rep_threshold, word_min_period, word_window: Cast to ``int`` and
            forwarded unchanged to ``_render_combined_ocr_debug_page``.

    Returns:
        The per-document manifest rows, in processing order.
    """
    if input_dir is None:
        input_dir = self.markdown_dir
    else:
        input_dir = Path(input_dir)

    # Reset the output directory so it only reflects this run.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    for stale in output_dir.glob("*.md"):
        stale.unlink()
    manifest_path = output_dir / "manifest.jsonl"
    if manifest_path.exists():
        manifest_path.unlink()
    page_metrics_path = output_dir / "page_metrics.jsonl"
    if page_metrics_path.exists():
        page_metrics_path.unlink()
    summary_path = output_dir / "summary.json"
    if summary_path.exists():
        summary_path.unlink()

    noise_mod = self._load_rust_extension(
        "glossapi_rs_noise",
        "rust/glossapi_rs_noise/Cargo.toml",
        required_attrs=("find_numeric_debug_page_spans", "evaluate_page_character_noise"),
    )

    # Sorted listing keeps the (offset, max_docs) window stable between runs.
    all_source_paths = sorted(input_dir.glob("*.md"))
    doc_offset = max(0, int(doc_offset))
    if max_docs is not None:
        source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)]
    else:
        source_paths = all_source_paths[doc_offset:]

    self.logger.info(
        "Exporting combined OCR table+numeric+latex+hybrid+word debug docs from %s into %s for %d documents (offset=%d)",
        input_dir,
        output_dir,
        len(source_paths),
        doc_offset,
    )

    rows: List[Dict[str, Any]] = []
    page_metric_rows: List[Dict[str, Any]] = []
    # Per-page samples collected across all documents; summarised at the end.
    total_page_times: List[float] = []
    table_page_times: List[float] = []
    numeric_page_times: List[float] = []
    latex_page_times: List[float] = []
    shared_page_times: List[float] = []
    hybrid_page_times: List[float] = []
    char_eval_times: List[float] = []
    bad_char_ratios: List[float] = []
    for source_path in source_paths:
        text = source_path.read_text(encoding="utf-8")
        pages = text.split(PAGE_SPLIT_MARKER)
        annotated_pages: List[str] = []
        # Per-document tallies, reset for every file.
        matched_page_count = 0
        table_match_count = 0
        numeric_match_count = 0
        latex_match_count = 0
        hybrid_match_count = 0
        word_match_count = 0
        doc_match_types: Set[str] = set()

        # Pages are numbered from 1 in all emitted rows.
        for page_index, page in enumerate(pages, start=1):
            page_result = _render_combined_ocr_debug_page(
                page,
                noise_mod=noise_mod,
                min_progress_steps=int(min_progress_steps),
                min_repeat_steps=int(min_repeat_steps),
                min_same_digit_steps=int(min_same_digit_steps),
                word_rep_threshold=int(word_rep_threshold),
                word_min_period=int(word_min_period),
                word_window=int(word_window),
            )
            # Coerce everything to plain Python types so the rows are JSON-serialisable.
            annotated_page = str(page_result["annotated_page"])
            page_types = list(page_result["page_types"])
            page_numeric_count = int(page_result["page_numeric_count"])
            page_word_count = int(page_result["page_word_count"])
            page_latex_count = int(page_result["page_latex_count"])
            page_table_count = int(page_result["page_table_count"])
            page_hybrid_count = int(page_result["page_hybrid_count"])
            page_noise_metrics = dict(page_result["page_noise_metrics"])
            char_eval_elapsed = float(page_result["char_eval_seconds"])
            table_elapsed = float(page_result["table_seconds"])
            numeric_elapsed = float(page_result["numeric_seconds"])
            latex_elapsed = float(page_result["latex_seconds"])
            hybrid_elapsed = float(page_result["hybrid_seconds"])
            shared_elapsed = float(page_result["shared_repeat_seconds"])
            page_total_time = float(page_result["total_page_seconds"])

            char_eval_times.append(char_eval_elapsed)
            bad_char_ratios.append(float(page_noise_metrics.get("bad_char_ratio", 0.0)))
            table_page_times.append(table_elapsed)
            numeric_page_times.append(numeric_elapsed)
            latex_page_times.append(latex_elapsed)
            hybrid_page_times.append(hybrid_elapsed)
            shared_page_times.append(shared_elapsed)
            total_page_times.append(page_total_time)

            page_match_total = (
                page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count
            )
            # A page counts as matched when any detector reported at least one hit.
            if page_match_total:
                matched_page_count += 1
            table_match_count += page_table_count
            numeric_match_count += page_numeric_count
            latex_match_count += page_latex_count
            hybrid_match_count += page_hybrid_count
            word_match_count += page_word_count
            doc_match_types.update(page_types)
            # Every page is kept, matched or not, so the output is a full document.
            annotated_pages.append(annotated_page)

            page_metric_rows.append(
                {
                    "source_path": str(source_path),
                    "source_stem": source_path.stem,
                    "page_number": page_index,
                    "page_index_in_file": page_index,
                    "total_chars": int(page_noise_metrics.get("total_chars", 0)),
                    "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)),
                    "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)),
                    "control_count": int(page_noise_metrics.get("control_count", 0)),
                    "private_use_count": int(page_noise_metrics.get("private_use_count", 0)),
                    "cjk_count": int(page_noise_metrics.get("cjk_count", 0)),
                    "replacement_count": int(page_noise_metrics.get("replacement_count", 0)),
                    "table_match_count": page_table_count,
                    "numeric_match_count": page_numeric_count,
                    "latex_match_count": page_latex_count,
                    "hybrid_match_count": page_hybrid_count,
                    "word_match_count": page_word_count,
                    "match_types": ",".join(page_types),
                    "char_eval_seconds": char_eval_elapsed,
                    "table_seconds": table_elapsed,
                    "numeric_seconds": numeric_elapsed,
                    "latex_seconds": latex_elapsed,
                    "hybrid_seconds": hybrid_elapsed,
                    "shared_repeat_seconds": shared_elapsed,
                    "total_page_seconds": page_total_time,
                }
            )

        # Re-join with the same marker so the output splits into the same pages.
        output_path = output_dir / source_path.name
        output_path.write_text(PAGE_SPLIT_MARKER.join(annotated_pages), encoding="utf-8")
        row = {
            "source_path": str(source_path),
            "output_path": str(output_path),
            "source_stem": source_path.stem,
            "base_stem": canonical_stem(source_path.stem),
            "page_count": len(pages),
            "matched_page_count": matched_page_count,
            "table_match_count": table_match_count,
            "numeric_match_count": numeric_match_count,
            "latex_match_count": latex_match_count,
            "hybrid_match_count": hybrid_match_count,
            "word_match_count": word_match_count,
            "match_types": ",".join(sorted(doc_match_types)),
        }
        rows.append(row)

    with manifest_path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    with page_metrics_path.open("w", encoding="utf-8") as handle:
        for row in page_metric_rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    # Corpus-level roll-up: totals from the manifest rows plus timing summaries.
    summary = {
        "doc_count": len(rows),
        "matched_doc_count": sum(1 for row in rows if int(row["matched_page_count"]) > 0),
        "matched_page_count": int(sum(int(row["matched_page_count"]) for row in rows)),
        "table_match_count": int(sum(int(row["table_match_count"]) for row in rows)),
        "numeric_match_count": int(sum(int(row["numeric_match_count"]) for row in rows)),
        "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)),
        "hybrid_match_count": int(sum(int(row["hybrid_match_count"]) for row in rows)),
        "word_match_count": int(sum(int(row["word_match_count"]) for row in rows)),
        "word_rep_threshold": int(word_rep_threshold),
        "word_min_period": int(word_min_period),
        "word_window": int(word_window),
        "total_page_seconds": _summarize_metric(total_page_times),
        "table_seconds": _summarize_metric(table_page_times),
        "numeric_seconds": _summarize_metric(numeric_page_times),
        "latex_seconds": _summarize_metric(latex_page_times),
        "hybrid_seconds": _summarize_metric(hybrid_page_times),
        "shared_repeat_seconds": _summarize_metric(shared_page_times),
        "char_eval_seconds": _summarize_metric(char_eval_times),
        "bad_char_ratio": _summarize_metric(bad_char_ratios),
    }
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    self.logger.info(
        "Exported %d combined OCR debug docs to %s",
        len(rows),
        output_dir,
    )
    return rows
Path(input_dir) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for stale in output_dir.glob("*.md"): + stale.unlink() + manifest_path = output_dir / "manifest.jsonl" + if manifest_path.exists(): + manifest_path.unlink() + summary_path = output_dir / "summary.json" + if summary_path.exists(): + summary_path.unlink() + + all_source_paths = sorted(list(Path(input_dir).glob("*.md")) + list(Path(input_dir).glob("*.txt"))) + doc_offset = max(0, int(doc_offset)) + if max_docs is not None: + source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] + else: + source_paths = all_source_paths[doc_offset:] + + self.logger.info( + "Exporting hybrid OCR debug pages from %s into %s for %d documents (offset=%d)", + input_dir, + output_dir, + len(source_paths), + doc_offset, + ) + + rows: List[Dict[str, Any]] = [] + page_times: List[float] = [] + + for source_path in source_paths: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + for page_index, page in enumerate(pages, start=1): + page_start = time.perf_counter() + hybrid_spans = _find_hybrid_numbered_repeat_spans(page, blocked_spans=[]) + page_elapsed = time.perf_counter() - page_start + page_times.append(page_elapsed) + if not hybrid_spans: + continue + + annotated_page, page_types, _, _, _, _, _ = _annotate_page_with_labeled_spans( + page, + hybrid_spans, + ) + hybrid_count = _count_hybrid_matches_in_page(page, hybrid_spans) + output_name = f"{source_path.stem}__debug_page_{page_index:05d}.md" + output_path = output_dir / output_name + output_path.write_text(annotated_page, encoding="utf-8") + rows.append( + { + "source_path": str(source_path), + "output_path": str(output_path), + "source_stem": source_path.stem, + "base_stem": canonical_stem(source_path.stem), + "page_number": page_index, + "page_index_in_file": page_index, + "hybrid_match_count": hybrid_count, + "match_types": ",".join(page_types), + "page_seconds": page_elapsed, + } 
def clean_ocr_latex_slot_progression_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Union[str, Path] = None,
    *,
    max_docs: Optional[int] = 1000,
    doc_offset: int = 0,
) -> List[Dict[str, Any]]:
    """Export the pages on which a LaTeX slot-progression run was found.

    ``*.md`` and ``*.txt`` files under the input directory are split on
    ``PAGE_SPLIT_MARKER``. Each page goes through
    ``_find_latex_slot_progression_spans``; pages with at least one span are
    annotated and written as ``<stem>__debug_page_<NNNNN>.md``.

    Args:
        output_dir: Destination directory. Existing ``*.md`` files,
            ``manifest.jsonl`` and ``summary.json`` in it are removed first.
        input_dir: Directory to scan; defaults to ``self.markdown_dir``.
        max_docs: Maximum number of documents to scan, ``None`` for no cap.
        doc_offset: Documents (in sorted order) to skip; clamped to >= 0.

    Returns:
        One manifest row per exported page (also written to ``manifest.jsonl``).
    """
    source_dir = self.markdown_dir if input_dir is None else Path(input_dir)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for leftover in target_dir.glob("*.md"):
        leftover.unlink()
    manifest_file = target_dir / "manifest.jsonl"
    summary_file = target_dir / "summary.json"
    for previous in (manifest_file, summary_file):
        if previous.exists():
            previous.unlink()

    scan_root = Path(source_dir)
    candidates = sorted([*scan_root.glob("*.md"), *scan_root.glob("*.txt")])
    doc_offset = max(0, int(doc_offset))
    window_end = None if max_docs is None else doc_offset + int(max_docs)
    selected = candidates[doc_offset:window_end]

    self.logger.info(
        "Exporting LaTeX slot-progression debug pages from %s into %s for %d documents (offset=%d)",
        source_dir,
        target_dir,
        len(selected),
        doc_offset,
    )

    records: List[Dict[str, Any]] = []
    timings: List[float] = []

    for document in selected:
        page_texts = document.read_text(encoding="utf-8").split(PAGE_SPLIT_MARKER)
        for page_no, page_text in enumerate(page_texts, start=1):
            # Only the detector itself is timed; every page contributes a sample.
            started = time.perf_counter()
            spans = _find_latex_slot_progression_spans(page_text, blocked_spans=[])
            elapsed = time.perf_counter() - started
            timings.append(elapsed)
            if spans:
                annotated_text, match_types, _, _, latex_total, _, _ = _annotate_page_with_labeled_spans(
                    page_text,
                    spans,
                )
                page_file = target_dir / f"{document.stem}__debug_page_{page_no:05d}.md"
                page_file.write_text(annotated_text, encoding="utf-8")
                records.append(
                    {
                        "source_path": str(document),
                        "output_path": str(page_file),
                        "source_stem": document.stem,
                        "base_stem": canonical_stem(document.stem),
                        "page_number": page_no,
                        "page_index_in_file": page_no,
                        "latex_match_count": latex_total,
                        "match_types": ",".join(match_types),
                        "page_seconds": elapsed,
                    }
                )

    with manifest_file.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

    latex_total_matches = int(sum(int(record["latex_match_count"]) for record in records))
    summary = {
        "doc_count": len(selected),
        "matched_page_count": len(records),
        "latex_match_count": latex_total_matches,
        "page_seconds": _summarize_metric(timings),
    }
    summary_file.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    self.logger.info(
        "Exported %d LaTeX slot-progression debug pages to %s",
        len(records),
        target_dir,
    )
    return records
def _read_jsonl(path: Path) -> List[Dict[str, object]]:
    """Load a JSON Lines file, skipping blank lines.

    The file is iterated line by line instead of using ``str.splitlines()``:
    ``splitlines`` also breaks on U+2028, U+2029 and U+0085, which
    ``json.dumps(..., ensure_ascii=False)`` writes unescaped inside strings,
    so a record containing one of them would be cut in half.
    """
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _stable_sort_rows(rows: Sequence[Dict[str, object]], seed: str) -> List[Dict[str, object]]:
    """Return *rows* in a pseudo-random but reproducible order.

    Rows are ordered by the SHA-1 of ``seed|source_stem|page_number``, so the
    same seed always yields the same order regardless of input order.
    """

    def _key(row: Dict[str, object]) -> str:
        basis = f"{seed}|{row['source_stem']}|{row['page_number']}"
        return hashlib.sha1(basis.encode("utf-8")).hexdigest()

    return sorted(rows, key=_key)


def _take_rows(
    rows: Sequence[Dict[str, object]],
    selected_keys: set[Tuple[str, int]],
    *,
    limit: int,
    seed: str,
) -> List[Dict[str, object]]:
    """Pick up to *limit* not-yet-selected rows in seed-stable order.

    Args:
        rows: Candidate rows; each needs ``source_stem`` and ``page_number``.
        selected_keys: ``(source_stem, page_number)`` pairs already taken by
            earlier buckets. Updated in place with the keys chosen here.
        limit: Maximum number of rows to return; values <= 0 select nothing.
        seed: Seed string for the stable ordering.

    Returns:
        The newly selected rows, at most *limit* of them.
    """
    out: List[Dict[str, object]] = []
    for row in _stable_sort_rows(rows, seed):
        # Check the budget before taking a row so that limit <= 0 yields nothing.
        if len(out) >= limit:
            break
        key = (str(row["source_stem"]), int(row["page_number"]))
        if key in selected_keys:
            continue
        out.append(row)
        selected_keys.add(key)
    return out


def _split_pages(path: Path) -> List[str]:
    """Read *path* (undecodable bytes dropped) and split it on ``PAGE_SPLIT_MARKER``."""
    return path.read_text(encoding="utf-8", errors="ignore").split(PAGE_SPLIT_MARKER)
{Path(str(row["output_path"])).stem: Path(str(row["output_path"])) for row in manifest_rows} + + for target in (output_dir / "inputs", output_dir / "expected"): + target.mkdir(parents=True, exist_ok=True) + for stale in target.iterdir(): + if stale.is_file(): + stale.unlink() + for stale_name in ("manifest.jsonl", "summary.json"): + stale = output_dir / stale_name + if stale.exists(): + stale.unlink() + + source_pages_cache: Dict[str, List[str]] = {} + output_pages_cache: Dict[str, List[str]] = {} + + rows_with_features: List[Dict[str, object]] = [] + for row in page_metrics: + stem = str(row["source_stem"]) + source_path = source_by_stem.get(stem) + output_path = output_by_stem.get(stem) + if source_path is None or output_path is None: + continue + if stem not in source_pages_cache: + source_pages_cache[stem] = _split_pages(source_path) + output_pages_cache[stem] = _split_pages(output_path) + page_idx = int(row["page_number"]) - 1 + source_page = source_pages_cache[stem][page_idx] + output_page = output_pages_cache[stem][page_idx] + feature_row = dict(row) + feature_row["has_table_html"] = " 0 + ] + feature_row["positive_categories"] = positive_categories + rows_with_features.append(feature_row) + + selected_keys: set[Tuple[str, int]] = set() + selected_rows: List[Tuple[str, Dict[str, object]]] = [] + + def add_bucket(label: str, candidates: Iterable[Dict[str, object]], limit: int) -> None: + bucket = _take_rows(list(candidates), selected_keys, limit=limit, seed=f"{seed}:{label}") + for item in bucket: + selected_rows.append((label, item)) + + add_bucket( + "hybrid_positive", + [row for row in rows_with_features if int(row.get("hybrid_match_count", 0)) > 0], + 9999, + ) + add_bucket( + "latex_positive", + [row for row in rows_with_features if int(row.get("latex_match_count", 0)) > 0], + 9999, + ) + add_bucket( + "mixed_positive", + [row for row in rows_with_features if len(list(row.get("positive_categories", []))) >= 2], + 120, + ) + add_bucket( + 
"numeric_positive", + [row for row in rows_with_features if int(row.get("numeric_match_count", 0)) > 0], + 140, + ) + add_bucket( + "word_positive", + [row for row in rows_with_features if int(row.get("word_match_count", 0)) > 0], + 140, + ) + add_bucket( + "table_positive", + [row for row in rows_with_features if int(row.get("table_match_count", 0)) > 0], + 180, + ) + add_bucket( + "table_kept_conversion", + [ + row + for row in rows_with_features + if row.get("has_table_html") + and all(int(row.get(f"{category}_match_count", 0)) == 0 for category in ("table", "numeric", "latex", "hybrid", "word")) + ], + 60, + ) + add_bucket( + "negative_plain", + [ + row + for row in rows_with_features + if not row.get("has_table_html") + and all(int(row.get(f"{category}_match_count", 0)) == 0 for category in ("table", "numeric", "latex", "hybrid", "word")) + ], + 60, + ) + + manifest_out = output_dir / "manifest.jsonl" + summary_out = output_dir / "summary.json" + written_rows: List[Dict[str, object]] = [] + category_counts: Dict[str, int] = {} + + for idx, (label, row) in enumerate(selected_rows, start=1): + stem = str(row["source_stem"]) + page_number = int(row["page_number"]) + base_name = f"{idx:04d}__{stem}__page_{page_number:05d}" + input_path = output_dir / "inputs" / f"{base_name}.md" + expected_path = output_dir / "expected" / f"{base_name}.md" + input_path.write_text(str(row["source_page"]), encoding="utf-8") + expected_path.write_text(str(row["expected_page"]), encoding="utf-8") + + category_counts[label] = category_counts.get(label, 0) + 1 + written_rows.append( + { + "case_id": base_name, + "label": label, + "source_stem": stem, + "page_number": page_number, + "input_path": str(input_path), + "expected_path": str(expected_path), + "source_path": str(source_by_stem[stem]), + "output_path": str(output_by_stem[stem]), + "match_counts": { + category: int(row.get(f"{category}_match_count", 0)) + for category in ("table", "numeric", "latex", "hybrid", "word") + }, + 
_SAFE_LABEL_RE = re.compile(r"[^a-z0-9._-]+")


def _slugify_label(value: object) -> str:
    """Turn an arbitrary label into a lowercase, filesystem-safe directory name.

    Spaces become underscores one-for-one, every other run of characters
    outside ``[a-z0-9._-]`` collapses to a single underscore, and leading or
    trailing ``.``, ``_`` and ``-`` are stripped. An empty result becomes
    ``"unlabeled"``.
    """
    text = str(value).strip().lower()
    text = text.replace(" ", "_")
    text = _SAFE_LABEL_RE.sub("_", text)
    text = text.strip("._-")
    return text or "unlabeled"


def _format_metadata_lines(row: Dict[str, object], source_field: str, label_field: str, category_name: str) -> List[str]:
    """Build the ``KEY: value`` header lines that precede a review copy.

    The category and label come first; every other manifest field follows
    upper-cased, with dicts and lists rendered as JSON. The source and label
    fields themselves are not repeated.
    """
    lines = [
        f"REVIEW_CATEGORY: {category_name}",
        f"REVIEW_LABEL: {row.get(label_field, '')}",
    ]
    for key, value in row.items():
        if key in {source_field, label_field}:
            continue
        if isinstance(value, (dict, list)):
            rendered = json.dumps(value, ensure_ascii=False)
        else:
            rendered = str(value)
        lines.append(f"{key.upper()}: {rendered}")
    return lines


def _read_manifest_rows(path: Path) -> List[Dict[str, object]]:
    """Load the JSONL manifest, skipping blank lines.

    The file is iterated line by line rather than via ``str.splitlines()``,
    which would also split on U+2028/U+2029/U+0085 inside JSON strings and
    corrupt rows that carry such characters.
    """
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _write_review_copy(
    src: Path,
    dest: Path,
    row: Dict[str, object],
    source_field: str,
    label_field: str,
    category_name: str,
) -> None:
    """Write *src* to *dest* prefixed with the metadata header for *row*."""
    body = src.read_text(encoding="utf-8", errors="ignore")
    header = "\n".join(_format_metadata_lines(row, source_field, label_field, category_name))
    dest.write_text(f"{header}\n\n=== REVIEW_SOURCE_CONTENT ===\n{body}", encoding="utf-8")


def _clear_previous_outputs(output_dir: Path, labels_dir: Path, keep: Path) -> None:
    """Remove artefacts of an earlier run from *output_dir*, never deleting *keep*."""
    if not output_dir.exists():
        return
    # Review copies keep their source file name (often ``.md``), so suffix-based
    # cleanup alone would leave them behind and the next run would pile up
    # ``__dupN`` files. Dropping the whole tree avoids that.
    if labels_dir.is_dir():
        shutil.rmtree(labels_dir)
    keep_resolved = keep.resolve()
    for pattern in ("*.txt", "*.json", "*.jsonl"):
        for stale in list(output_dir.rglob(pattern)):
            if stale.is_file() and stale.resolve() != keep_resolved:
                stale.unlink()


def materialize_manifest_categories(
    manifest_path: Path,
    output_dir: Path,
    *,
    source_field: str = "path",
    label_field: str = "label",
    category_name: str | None = None,
) -> Dict[str, object]:
    """Copy every manifest entry into ``output_dir/by_label/<label-slug>/``.

    Each copy is the source file's text prefixed with a metadata header.
    Output from a previous run is cleared first: the ``by_label`` tree is
    removed, as are ``*.txt``/``*.json``/``*.jsonl`` files under
    ``output_dir``. The input manifest is kept even when it is stored there.

    Args:
        manifest_path: JSONL file with one object per line.
        output_dir: Root directory for the materialized review set.
        source_field: Manifest key holding the path of the file to copy.
        label_field: Manifest key holding the label used for grouping.
        category_name: Name written to ``REVIEW_CATEGORY``; defaults to
            *label_field*.

    Returns:
        The summary dict that is also written to ``summary.json``.

    Raises:
        KeyError: If a manifest row lacks *source_field* or *label_field*.
    """
    rows = _read_manifest_rows(manifest_path)
    category_name = category_name or label_field

    labels_dir = output_dir / "by_label"
    _clear_previous_outputs(output_dir, labels_dir, manifest_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    labels_dir.mkdir(parents=True, exist_ok=True)

    label_counts: Counter[str] = Counter()
    written_rows: List[Dict[str, object]] = []

    for row in rows:
        if source_field not in row or label_field not in row:
            raise KeyError(f"Manifest row missing required fields: {source_field!r}, {label_field!r}")

        src = Path(str(row[source_field]))
        label = str(row[label_field])
        label_slug = _slugify_label(label)
        dest_dir = labels_dir / label_slug
        dest_dir.mkdir(parents=True, exist_ok=True)
        dest = dest_dir / src.name
        if dest.exists():
            # Two sources with the same file name under one label: keep both.
            stem = dest.stem
            suffix = dest.suffix
            counter = 2
            while True:
                candidate = dest_dir / f"{stem}__dup{counter}{suffix}"
                if not candidate.exists():
                    dest = candidate
                    break
                counter += 1

        _write_review_copy(src, dest, row, source_field, label_field, category_name)
        label_counts[label] += 1
        written_rows.append(
            {
                "label": label,
                "label_slug": label_slug,
                "source_path": str(src),
                "copied_path": str(dest),
            }
        )

    manifest_out = output_dir / "materialized_manifest.jsonl"
    with manifest_out.open("w", encoding="utf-8") as handle:
        for row in written_rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    summary = {
        "manifest_path": str(manifest_path),
        "output_dir": str(output_dir),
        "category_name": category_name,
        "source_field": source_field,
        "label_field": label_field,
        "row_count": len(rows),
        "label_counts": dict(label_counts),
        "label_dirs": {
            _slugify_label(label): str(labels_dir / _slugify_label(label))
            for label in sorted(label_counts)
        },
    }
    (output_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    return summary


def main() -> None:
    """Command-line entry point: parse arguments and materialize the manifest."""
    parser = argparse.ArgumentParser(description="Materialize categorized review copies from a JSONL manifest.")
    parser.add_argument("--manifest", required=True, type=Path)
    parser.add_argument("--output-dir", required=True, type=Path)
    parser.add_argument("--source-field", default="path")
    parser.add_argument("--label-field", default="label")
    parser.add_argument("--category-name", default=None)
    args = parser.parse_args()

    materialize_manifest_categories(
        args.manifest,
        args.output_dir,
        source_field=args.source_field,
        label_field=args.label_field,
        category_name=args.category_name,
    )


if __name__ == "__main__":
    main()
class _CellHTMLNormalizer(HTMLParser):
    """Flatten the inner HTML of a table cell into lightly marked-up text.

    Handling per tag:
        * ``<br>`` and the start/end of ``<p>``, ``<div>``, ``<li>`` insert a
          line break, unless the collected text already ends with one.
        * ``<li>`` additionally starts its line with ``"- "``.
        * ``<sub>`` / ``<sup>`` are kept as literal tags.
        * ``<img>`` is replaced by its whitespace-normalised ``alt`` text.
        * ``<a href=X>text</a>`` becomes ``[text](X)``; without a usable
          ``href`` it becomes ``[text]``.
        * Any other tag is dropped while its text content is kept.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Collected output fragments, joined by get_text().
        self.parts: List[str] = []
        # One entry per currently open <a>: its href, or None if absent/empty.
        self.link_stack: List[Optional[str]] = []

    def _append_break(self) -> None:
        """Add a newline unless the output is empty or already ends with one."""
        if self.parts and not self.parts[-1].endswith("\n"):
            self.parts.append("\n")

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        tag = tag.lower()
        # Valueless attributes arrive as None; normalise them to "".
        attr_map = {key.lower(): (value or "") for key, value in attrs}
        if tag == "br":
            self._append_break()
            return
        if tag in {"p", "div", "li"}:
            self._append_break()
            if tag == "li":
                self.parts.append("- ")
            return
        if tag in {"sub", "sup"}:
            self.parts.append(f"<{tag}>")
            return
        if tag == "img":
            alt = " ".join(attr_map.get("alt", "").split())
            if alt:
                self.parts.append(alt)
            return
        if tag == "a":
            href = attr_map.get("href", "").strip()
            self.link_stack.append(href or None)
            self.parts.append("[")
            return

    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        if tag in {"p", "div", "li"}:
            self._append_break()
            return
        if tag in {"sub", "sup"}:
            # Emit the matching closing tag; otherwise the tag opened in
            # handle_starttag would stay unclosed in the output.
            self.parts.append(f"</{tag}>")
            return
        if tag == "a":
            if not self.link_stack:
                # Unmatched </a>: no "[" was opened, so do not emit a stray "]".
                return
            href = self.link_stack.pop()
            self.parts.append(f"]({href})" if href else "]")

    def handle_data(self, data: str) -> None:
        self.parts.append(data)

    def get_text(self) -> str:
        """Return everything collected so far as a single string."""
        return "".join(self.parts)
fill_active_until_free() + for cell in parsed_row: + fill_active_until_free() + row.append(cell.text) + if cell.rowspan > 1: + active_rowspans[col_idx] = max(active_rowspans.get(col_idx, 0), cell.rowspan - 1) + start_col = col_idx + col_idx += 1 + for extra in range(1, cell.colspan): + row.append("") + if cell.rowspan > 1: + active_rowspans[start_col + extra] = max( + active_rowspans.get(start_col + extra, 0), cell.rowspan - 1 + ) + col_idx += 1 + fill_active_until_free() + + max_cols = max(max_cols, len(row)) + expanded_rows.append(row) + + while active_rowspans: + row: List[str] = [] + col_idx = 0 + max_active_col = max(active_rowspans) + while col_idx <= max_active_col: + if active_rowspans.get(col_idx, 0) > 0: + row.append("") + active_rowspans[col_idx] -= 1 + if active_rowspans[col_idx] <= 0: + del active_rowspans[col_idx] + else: + row.append("") + col_idx += 1 + max_cols = max(max_cols, len(row)) + expanded_rows.append(row) + + if max_cols == 0 or not expanded_rows: + reasons.append("empty_grid") + return None, reasons + + for row in expanded_rows: + if len(row) < max_cols: + row.extend([""] * (max_cols - len(row))) + return expanded_rows, reasons + + +def _markdown_escape(text: str) -> str: + text = text.replace("\\", "\\\\") + text = text.replace("|", "\\|") + text = text.replace("\n", "
") + return text + + +def _format_markdown_row(values: Sequence[str], widths: Sequence[int]) -> str: + padded = [value.ljust(width) for value, width in zip(values, widths)] + return "| " + " | ".join(padded) + " |" + + +def _should_infer_header_row(grid: Sequence[Sequence[str]]) -> bool: + if len(grid) < 2: + return False + first_row = grid[0] + if not first_row: + return False + return all(any(ch.isalnum() for ch in cell) for cell in first_row) + + +def _grid_to_markdown(grid: Sequence[Sequence[str]], header_mode: str) -> str: + if not grid: + return "" + cols = len(grid[0]) + if header_mode in {"explicit_first_row", "inferred_first_row"}: + header = [_markdown_escape(cell) for cell in grid[0]] + data_rows = list(grid[1:]) + else: + header = [""] * cols + data_rows = list(grid) + escaped_rows = [[_markdown_escape(cell) for cell in row] for row in data_rows] + sep = ["---"] * cols + widths = [ + max( + len(header[idx]), + len(sep[idx]), + *(len(row[idx]) for row in escaped_rows), + ) + for idx in range(cols) + ] + + lines = [ + _format_markdown_row(header, widths), + _format_markdown_row(sep, widths), + ] + for row in escaped_rows: + lines.append(_format_markdown_row(row, widths)) + return "\n".join(lines) + + +def _assess_content( + grid: Sequence[Sequence[str]], + *, + spans_present: bool, +) -> Tuple[bool, List[str], float, int]: + total_cells = sum(len(row) for row in grid) + nonempty_cells = sum(1 for row in grid for cell in row if any(ch.isalnum() for ch in cell)) + nonempty_ratio = (nonempty_cells / total_cells) if total_cells else 0.0 + + row_keys = [] + for row in grid: + normalized = tuple(" ".join(cell.split()).casefold() for cell in row) + nonempty_in_row = sum(1 for cell in normalized if any(ch.isalnum() for ch in cell)) + if nonempty_in_row >= 2: + row_keys.append(normalized) + duplicate_rows = sum(freq - 1 for freq in Counter(row_keys).values() if freq >= 2) + + reasons: List[str] = [] + broken = False + if total_cells >= 18 and nonempty_ratio <= 
0.15: + broken = True + reasons.append("near_empty_table") + if spans_present and total_cells >= 4 and nonempty_ratio <= 0.34: + broken = True + reasons.append("sparse_span_shell") + if len(grid) >= 4 and duplicate_rows >= 2: + broken = True + reasons.append("repeated_rows") + return broken, reasons, round(nonempty_ratio, 4), duplicate_rows + + +def audit_table(source_path: Path, table_index_in_doc: int, global_index: int, table_html: str) -> TableAudit: + parsed_rows, parse_reasons = _parse_table_rows(table_html) + spans_present = any(cell.rowspan > 1 or cell.colspan > 1 for row in parsed_rows for cell in row) + explicit_header = bool(parsed_rows and any(cell.tag == "th" for cell in parsed_rows[0])) + grid, expand_reasons = _expand_rows(parsed_rows) + reasons = list(dict.fromkeys(parse_reasons + expand_reasons)) + + if grid is None: + return TableAudit( + source_path=str(source_path), + source_stem=source_path.stem, + table_index_in_doc=table_index_in_doc, + global_index=global_index, + html=table_html, + status="broken_or_ambiguous", + convertible=False, + broken=True, + reasons=reasons or ["parse_failure"], + row_count=0, + col_count=0, + nonempty_ratio=0.0, + duplicate_rows=0, + header_mode="none", + spans_present=spans_present, + markdown=None, + ) + + broken, content_reasons, nonempty_ratio, duplicate_rows = _assess_content( + grid, + spans_present=spans_present, + ) + reasons = list(dict.fromkeys(reasons + content_reasons)) + if explicit_header: + header_mode = "explicit_first_row" + elif _should_infer_header_row(grid): + header_mode = "inferred_first_row" + else: + header_mode = "blank_first_row" + markdown = _grid_to_markdown(grid, header_mode=header_mode) + + if any(reason in {"nested_table", "invalid_rowspan", "invalid_colspan"} for reason in reasons): + status = "broken_or_ambiguous" + convertible = False + markdown = None + broken = True + else: + status = "convertible_but_broken" if broken else "convertible_clean" + convertible = True + + return 
TableAudit( + source_path=str(source_path), + source_stem=source_path.stem, + table_index_in_doc=table_index_in_doc, + global_index=global_index, + html=table_html, + status=status, + convertible=convertible, + broken=broken, + reasons=reasons, + row_count=len(grid), + col_count=len(grid[0]) if grid else 0, + nonempty_ratio=nonempty_ratio, + duplicate_rows=duplicate_rows, + header_mode=header_mode, + spans_present=spans_present, + markdown=markdown, + ) + + +def iter_tables(markdown_dir: Path): + global_index = 0 + for source_path in sorted(markdown_dir.glob("*.md")): + text = source_path.read_text(encoding="utf-8", errors="ignore") + table_index = 0 + for match in TABLE_BLOCK_RE.finditer(text): + table_index += 1 + global_index += 1 + yield source_path, table_index, global_index, match.group(0) + + +def write_review_file(output_dir: Path, audit: TableAudit) -> str: + filename = f"{audit.global_index:05d}__{audit.source_stem}__table_{audit.table_index_in_doc:03d}.txt" + output_path = output_dir / filename + lines = [ + f"SOURCE_PATH: {audit.source_path}", + f"SOURCE_STEM: {audit.source_stem}", + f"TABLE_INDEX_IN_DOC: {audit.table_index_in_doc}", + f"GLOBAL_INDEX: {audit.global_index}", + f"STATUS: {audit.status}", + f"CONVERTIBLE: {audit.convertible}", + f"BROKEN: {audit.broken}", + f"REASONS: {', '.join(audit.reasons) if audit.reasons else 'none'}", + f"ROWS: {audit.row_count}", + f"COLS: {audit.col_count}", + f"NONEMPTY_RATIO: {audit.nonempty_ratio}", + f"DUPLICATE_ROWS: {audit.duplicate_rows}", + f"HEADER_MODE: {audit.header_mode}", + f"SPANS_PRESENT: {audit.spans_present}", + "", + "=== HTML ===", + audit.html, + "", + "=== GITHUB_MD ===", + audit.markdown if audit.markdown is not None else "UNAVAILABLE", + "", + ] + output_path.write_text("\n".join(lines), encoding="utf-8") + return str(output_path) + + +def write_clean_markdown_file(output_dir: Path, audit: TableAudit) -> Optional[str]: + if audit.markdown is None: + return None + filename = 
f"{audit.global_index:05d}__{audit.source_stem}__table_{audit.table_index_in_doc:03d}.md" + output_path = output_dir / filename + output_path.write_text( + "\n".join( + [ + "## ORIGINAL_HTML", + "", + audit.html, + "", + "## GITHUB_MD", + "", + audit.markdown, + "", + ] + ), + encoding="utf-8", + ) + return str(output_path) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Audit HTML tables and export GitHub Markdown conversions.") + parser.add_argument("--input-dir", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--max-tables", type=int, default=1000) + args = parser.parse_args() + + output_dir = args.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + tables_dir = output_dir / "tables" + tables_dir.mkdir(parents=True, exist_ok=True) + clean_md_dir = output_dir / "github_md_tables" + clean_md_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = output_dir / "manifest.jsonl" + summary_path = output_dir / "summary.json" + if manifest_path.exists(): + manifest_path.unlink() + if summary_path.exists(): + summary_path.unlink() + for stale in tables_dir.glob("*.txt"): + stale.unlink() + for stale in clean_md_dir.glob("*.md"): + stale.unlink() + + rows = [] + audited = 0 + for source_path, table_index, global_index, table_html in iter_tables(args.input_dir): + audited += 1 + audit = audit_table(source_path, table_index, global_index, table_html) + output_path = write_review_file(tables_dir, audit) + markdown_path = write_clean_markdown_file(clean_md_dir, audit) + row = { + "source_path": audit.source_path, + "source_stem": audit.source_stem, + "table_index_in_doc": audit.table_index_in_doc, + "global_index": audit.global_index, + "status": audit.status, + "convertible": audit.convertible, + "broken": audit.broken, + "reasons": audit.reasons, + "row_count": audit.row_count, + "col_count": audit.col_count, + "nonempty_ratio": audit.nonempty_ratio, + "duplicate_rows": 
audit.duplicate_rows, + "header_mode": audit.header_mode, + "spans_present": audit.spans_present, + "output_path": output_path, + "markdown_output_path": markdown_path, + } + rows.append(row) + if audited >= args.max_tables: + break + + with manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + reason_counts = Counter(reason for row in rows for reason in row["reasons"]) + status_counts = Counter(row["status"] for row in rows) + summary = { + "input_dir": str(args.input_dir), + "output_dir": str(output_dir), + "github_md_dir": str(clean_md_dir), + "audited_table_count": len(rows), + "convertible_count": sum(1 for row in rows if row["convertible"]), + "broken_count": sum(1 for row in rows if row["broken"]), + "status_counts": dict(status_counts), + "reason_counts": dict(reason_counts), + } + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/src/glossapi/scripts/table_sentence_context_review.py b/src/glossapi/scripts/table_sentence_context_review.py new file mode 100644 index 0000000..6e2a074 --- /dev/null +++ b/src/glossapi/scripts/table_sentence_context_review.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +import argparse +import importlib.util +import json +import re +import sys +from collections import Counter +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +PAGE_SPLIT_MARKER = "<--- Page Split --->" +TABLE_BLOCK_RE = re.compile(r"(?is)") +WORD_RE = re.compile(r"[^\W\d_]+", re.UNICODE) + + +_TABLE_AUDIT_PATH = Path(__file__).with_name("table_markdown_audit.py") +_TABLE_AUDIT_SPEC = importlib.util.spec_from_file_location("table_markdown_audit_local", _TABLE_AUDIT_PATH) +assert _TABLE_AUDIT_SPEC and _TABLE_AUDIT_SPEC.loader +_TABLE_AUDIT_MODULE = importlib.util.module_from_spec(_TABLE_AUDIT_SPEC) 
+sys.modules[_TABLE_AUDIT_SPEC.name] = _TABLE_AUDIT_MODULE +_TABLE_AUDIT_SPEC.loader.exec_module(_TABLE_AUDIT_MODULE) +_expand_rows = _TABLE_AUDIT_MODULE._expand_rows +_parse_table_rows = _TABLE_AUDIT_MODULE._parse_table_rows + + +def _extract_review_html(review_text: str) -> str: + return review_text.split("=== HTML ===\n", 1)[1].split("\n\n=== GITHUB_MD ===", 1)[0] + + +def _flatten_nonempty_cells(table_html: str) -> List[str]: + parsed_rows, _ = _parse_table_rows(table_html) + grid, _ = _expand_rows(parsed_rows) + if not grid: + return [] + nonempty: List[str] = [] + for row in grid: + for cell in row: + normalized = " ".join(cell.split()) + if any(ch.isalnum() for ch in normalized): + nonempty.append(normalized) + return nonempty + + +def _is_sentence_shell_candidate(review_row: Dict[str, object], table_html: str) -> Tuple[bool, Dict[str, int]]: + nonempty_cells = _flatten_nonempty_cells(table_html) + word_count = sum(len(WORD_RE.findall(cell)) for cell in nonempty_cells) + max_cell_len = max((len(cell) for cell in nonempty_cells), default=0) + metrics = { + "nonempty_cell_count": len(nonempty_cells), + "word_count": word_count, + "max_cell_len": max_cell_len, + } + is_candidate = ( + bool(review_row.get("broken")) + and "sparse_span_shell" in list(review_row.get("reasons", [])) + and len(nonempty_cells) == 1 + and word_count >= 6 + and max_cell_len >= 40 + ) + return is_candidate, metrics + + +def _find_table_page_context( + source_path: Path, + table_index_in_doc: int, +) -> Tuple[int, int, int, int, str, str, str]: + text = source_path.read_text(encoding="utf-8", errors="ignore") + pages = text.split(PAGE_SPLIT_MARKER) + seen = 0 + for page_idx, page in enumerate(pages): + matches = list(TABLE_BLOCK_RE.finditer(page)) + if seen + len(matches) < table_index_in_doc: + seen += len(matches) + continue + local_idx = table_index_in_doc - seen - 1 + match = matches[local_idx] + prev_page = pages[page_idx - 1] if page_idx > 0 else "" + curr_page = page + next_page = 
pages[page_idx + 1] if page_idx + 1 < len(pages) else "" + return page_idx, match.start(), match.end(), len(pages), prev_page, curr_page, next_page + raise ValueError(f"Could not find table {table_index_in_doc} in {source_path}") + + +def _smart_join(before_text: str, inline_text: str, after_text: str) -> str: + left = before_text.rstrip() + right = after_text.lstrip() + insertion = inline_text.strip() + + if left and not left.endswith(("\n", " ", "(", "[", "{", "“", "\"", "'")): + if left[-1].isalnum() and insertion and insertion[0].isalnum(): + left += " " + if right and not right.startswith(("\n", " ", ".", ",", ";", ":", "!", "?", ")", "]", "}", "”", "\"", "'")): + if insertion and insertion[-1].isalnum() and right[0].isalnum(): + insertion += " " + return left + insertion + right + + +def _context_fit_guess(before_text: str, inline_text: str, after_text: str) -> Tuple[bool, List[str]]: + reasons: List[str] = [] + word_count = len(WORD_RE.findall(inline_text)) + if word_count < 6: + reasons.append("short_inline_text") + left_window = before_text[-4:] + right_window = after_text[:4] + left_blockish = (not before_text) or ("\n" in left_window) or before_text.endswith((" ", "\t")) + right_blockish = (not after_text) or ("\n" in right_window) or after_text.startswith((" ", "\t")) + if not left_blockish: + reasons.append("not_block_isolated_left") + if not right_blockish: + reasons.append("not_block_isolated_right") + fit = word_count >= 6 and left_blockish and right_blockish + return fit, reasons + + +def _format_three_page_context( + prev_page: str, + curr_page: str, + next_page: str, + start: int, + end: int, + inline_text: str, +) -> Tuple[str, str]: + tagged_current = curr_page[:start] + "[[[TABLE_START]]]" + curr_page[start:end] + "[[[TABLE_END]]]" + curr_page[end:] + replaced_current = ( + curr_page[:start] + + "[[[INLINE_TEXT_START]]]" + + inline_text + + "[[[INLINE_TEXT_END]]]" + + curr_page[end:] + ) + original_context = ( + f"=== PAGE -1 
===\n{prev_page}\n\n" + f"=== PAGE 0 ===\n{tagged_current}\n\n" + f"=== PAGE +1 ===\n{next_page}\n" + ) + replaced_context = ( + f"=== PAGE -1 ===\n{prev_page}\n\n" + f"=== PAGE 0 ===\n{replaced_current}\n\n" + f"=== PAGE +1 ===\n{next_page}\n" + ) + return original_context, replaced_context + + +def main() -> None: + parser = argparse.ArgumentParser(description="Export 3-page context review files for sentence-in-table shells.") + parser.add_argument("--audit-dir", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + args = parser.parse_args() + + audit_dir = args.audit_dir + manifest_path = audit_dir / "manifest.jsonl" + output_dir = args.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + contexts_dir = output_dir / "contexts" + contexts_dir.mkdir(parents=True, exist_ok=True) + + summary_path = output_dir / "summary.json" + review_manifest_path = output_dir / "manifest.jsonl" + if summary_path.exists(): + summary_path.unlink() + if review_manifest_path.exists(): + review_manifest_path.unlink() + for stale in contexts_dir.glob("*.txt"): + stale.unlink() + + rows = [json.loads(line) for line in manifest_path.read_text(encoding="utf-8").splitlines() if line.strip()] + review_rows: List[Dict[str, object]] = [] + + for row in rows: + review_text = Path(str(row["output_path"])).read_text(encoding="utf-8") + table_html = _extract_review_html(review_text) + is_candidate, metrics = _is_sentence_shell_candidate(row, table_html) + if not is_candidate: + continue + + inline_text = _flatten_nonempty_cells(table_html)[0] + page_idx, start, end, page_count, prev_page, curr_page, next_page = _find_table_page_context( + Path(str(row["source_path"])), + int(row["table_index_in_doc"]), + ) + fit_guess, fit_reasons = _context_fit_guess(curr_page[:start], inline_text, curr_page[end:]) + original_context, replaced_context = _format_three_page_context( + prev_page, + curr_page, + next_page, + start, + end, + inline_text, + ) + filename = 
f"{int(row['global_index']):05d}__{row['source_stem']}__table_{int(row['table_index_in_doc']):03d}.txt" + output_path = contexts_dir / filename + output_path.write_text( + "\n".join( + [ + f"SOURCE_PATH: {row['source_path']}", + f"SOURCE_STEM: {row['source_stem']}", + f"TABLE_INDEX_IN_DOC: {row['table_index_in_doc']}", + f"GLOBAL_INDEX: {row['global_index']}", + f"PAGE_INDEX_ZERO_BASED: {page_idx}", + f"PAGE_NUMBER_ONE_BASED: {page_idx + 1}", + f"PAGE_COUNT: {page_count}", + f"FIT_GUESS: {fit_guess}", + f"FIT_REASONS: {', '.join(fit_reasons) if fit_reasons else 'none'}", + f"INLINE_TEXT_WORDS: {metrics['word_count']}", + f"INLINE_TEXT_CHARS: {metrics['max_cell_len']}", + "", + "=== INLINE_TEXT ===", + inline_text, + "", + "=== ORIGINAL_CONTEXT_3P ===", + original_context, + "", + "=== REPLACED_CONTEXT_3P ===", + replaced_context, + "", + ] + ), + encoding="utf-8", + ) + review_rows.append( + { + "source_path": row["source_path"], + "source_stem": row["source_stem"], + "table_index_in_doc": row["table_index_in_doc"], + "global_index": row["global_index"], + "page_number": page_idx + 1, + "page_count": page_count, + "fit_guess": fit_guess, + "fit_reasons": fit_reasons, + "inline_text_words": metrics["word_count"], + "inline_text_chars": metrics["max_cell_len"], + "output_path": str(output_path), + } + ) + + with review_manifest_path.open("w", encoding="utf-8") as handle: + for row in review_rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + fit_counter = Counter(bool(row["fit_guess"]) for row in review_rows) + reason_counter = Counter(reason for row in review_rows for reason in row["fit_reasons"]) + summary = { + "audit_dir": str(audit_dir), + "output_dir": str(output_dir), + "candidate_count": len(review_rows), + "fit_guess_count": fit_counter.get(True, 0), + "fit_guess_rate": round((fit_counter.get(True, 0) / len(review_rows)), 4) if review_rows else 0.0, + "fit_reason_counts": dict(reason_counter), + } + 
summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index b876a20..95d5df3 100644 --- a/tests/test_corpus_clean_enhancements.py +++ b/tests/test_corpus_clean_enhancements.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os from pathlib import Path @@ -7,6 +8,13 @@ import pytest from glossapi import Corpus +from glossapi.corpus.phase_clean import ( + _find_word_repeat_spans, + _find_word_repeat_spans_python, + _normalize_alnum_with_map_skip_tags, +) +from glossapi.scripts.table_markdown_audit import audit_table, write_clean_markdown_file +from glossapi.scripts.review_manifest_materialize import materialize_manifest_categories LATEX_MOJIBAKE_MD = """# Sample Document @@ -55,6 +63,108 @@ def _run_clean_and_read_row( return row.iloc[0] +def _run_clean_ocr_and_read_row( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + drop_bad: bool = False, +) -> pd.Series: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + corpus.clean_ocr(drop_bad=drop_bad) + parquet = corpus.output_dir / "download_results" / "download_results.parquet" + df = pd.read_parquet(parquet) + row = df[df["filename"] == f"{stem}.pdf"] + assert not row.empty, "Expected OCR metrics entry for generated markdown" + return row.iloc[0] + + +def _run_clean_ocr_and_read_cleaned_text( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + write_cleaned_files: bool = True, +) -> str: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + corpus.clean_ocr(write_cleaned_files=write_cleaned_files) + cleaned_path = corpus.cleaned_markdown_dir / f"{stem}.md" + assert cleaned_path.exists(), f"Expected cleaned markdown output at {cleaned_path}" + return 
cleaned_path.read_text(encoding="utf-8") + + +def _run_clean_ocr_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_pages: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_debug" + rows = corpus.clean_ocr_debug(debug_dir, max_pages=max_pages) + return rows, debug_dir + + +def _run_clean_ocr_numeric_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_pages: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_numeric_debug" + rows = corpus.clean_ocr_numeric_debug(debug_dir, max_pages=max_pages) + return rows, debug_dir + + +def _run_clean_ocr_numeric_word_debug_docs( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 100, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_numeric_word_debug" + rows = corpus.clean_ocr_numeric_word_debug_docs(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def _run_clean_ocr_hybrid_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 100, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_hybrid_debug" + rows = corpus.clean_ocr_hybrid_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def _run_clean_ocr_latex_slot_progression_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + 
md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_latex_slot_progression_debug" + rows = corpus.clean_ocr_latex_slot_progression_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + def test_clean_skips_latex_blocks_for_mojibake(tmp_path: Path) -> None: corpus = _build_corpus(tmp_path) row = _run_clean_and_read_row(corpus, LATEX_MOJIBAKE_MD, stem="latex-case") @@ -88,3 +198,1241 @@ def test_clean_flags_uppercase_glyph_noise(tmp_path: Path) -> None: filter_value = row.get("filter") or "" assert "mojibake>0.1" in filter_value or "non_greek_text" in filter_value assert bool(row.get("needs_ocr", False)) + + +def test_clean_ocr_populates_script_metrics(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "Αυτή είναι η πρώτη σελίδα.\n<--- Page Split --->\nΚαὶ αὕτη εἶναι ἡ δευτέρα.", + stem="ocr-script-metrics", + ) + assert float(row.get("percentage_greek") or 0.0) > 70.0 + assert float(row.get("latin_percentage") or 0.0) < 5.0 + assert float(row.get("polytonic_ratio") or 0.0) > 0.0 + assert not bool(row.get("ocr_noise_suspect", False)) + assert (row.get("filter") or "") == "ok" + + +def test_clean_ocr_writes_cleaned_markdown_with_combined_loop(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + content = _run_clean_ocr_and_read_cleaned_text( + corpus, + ( + "1111 1 1 1 1 1 1 1 1 1 1\n" + "<--- Page Split --->\n" + "1. Από το 2020, η αγορά των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + "
NameScore
Alice10
\n" + ), + stem="ocr-clean-shared-loop", + ) + assert "<--- Page Split --->" in content + assert "" not in content + assert "| Name" in content + assert "| Alice" in content + assert corpus.markdown_dir == corpus.cleaned_markdown_dir + + +def test_clean_ocr_drops_sentence_shell_and_repeated_row_tables(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + content = _run_clean_ocr_and_read_cleaned_text( + corpus, + ( + "Πρόλογος\n" + "
Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας
\n" + "
StateValue
Alpha10
Beta20
Alpha10
Beta20
\n" + "Επίλογος\n" + ), + stem="ocr-clean-drop-tables", + ) + assert "" not in content + assert "Η οινοφόρος άμπελος" not in content + assert "| Alpha" not in content + assert "Πρόλογος" in content + assert "Επίλογος" in content + + +def test_clean_ocr_supports_score_only_mode(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + md_path = corpus.markdown_dir / "ocr-clean-score-only.md" + md_path.write_text("Κανονικό περιεχόμενο.\n", encoding="utf-8") + corpus.clean_ocr(write_cleaned_files=False) + assert not any(corpus.cleaned_markdown_dir.glob("*.md")) + assert corpus.markdown_dir == corpus.output_dir / "markdown" + + +def test_clean_ocr_ignores_numeric_lists_and_dotted_values(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "1. 2. 3. 4. 5. 6. 7.\n9.9.9.9.9\n", + stem="ocr-non-repeat-noise", + drop_bad=True, + ) + assert not bool(row.get("ocr_noise_suspect", False)) + assert int(row.get("ocr_repeat_phrase_run_max") or 0) == 0 + assert int(row.get("ocr_repeat_line_run_max") or 0) == 0 + flags = row.get("ocr_noise_flags") or "" + assert flags == "" + assert "ocr_noise" not in (row.get("filter") or "") + assert "ocr-non-repeat-noise" in corpus.good_files + + +def test_clean_ocr_flags_repeated_phrase_noise(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "0 0 0 0 0 0\n1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n", + stem="ocr-repeat-noise", + drop_bad=True, + ) + assert bool(row.get("ocr_noise_suspect", False)) + assert int(row.get("ocr_repeat_phrase_run_max") or 0) >= 6 + assert int(row.get("ocr_repeat_line_run_max") or 0) >= 6 + flags = row.get("ocr_noise_flags") or "" + assert "repeat_phrase_run" in flags + assert "repeat_line_run" in flags + assert "ocr_noise" in (row.get("filter") or "") + assert "ocr-repeat-noise" not in corpus.good_files + + +def test_clean_ocr_debug_exports_annotated_pages(tmp_path: Path) -> None: + corpus = 
_build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_debug_export( + corpus, + ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 7.\n" + "0 0 0 0 0 0\n" + "1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n" + ), + stem="ocr-debug-source", + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert row["page_index_in_file"] == 2 + assert row["match_count"] >= 2 + assert "repeat_phrase_run" in row["match_types"] + assert "repeat_line_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "1. 2. 3. 4. 5. 6. 7." in content + assert "0 0 0 0 0 0" in content + assert "1.1" in content + + manifest = debug_dir / "manifest.jsonl" + lines = manifest.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 1 + + +def test_clean_ocr_debug_respects_sample_limit(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + md_path = corpus.markdown_dir / "ocr-debug-many.md" + md_path.write_text( + ( + "0 0 0 0 0 0\n" + "<--- Page Split --->\n" + "0 0 0 0 0 0\n" + "<--- Page Split --->\n" + "0 0 0 0 0 0\n" + ), + encoding="utf-8", + ) + debug_dir = corpus.output_dir / "ocr_debug" + rows = corpus.clean_ocr_debug(debug_dir, max_pages=2, sample_seed=0) + assert len(rows) == 2 + manifest = debug_dir / "manifest.jsonl" + lines = manifest.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 2 + + +def test_clean_ocr_numeric_debug_flags_ascending_sequences(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + ( + "Κανονικό κείμενο.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 7. 8. 9. 
10.\n" + ), + stem="ocr-numeric-progress", + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert "ascending_numeric_sequence" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert ( + "1. 2. 3. 4. 5. 6. 7. 8. 9. 10" + in content + ) + + +def test_clean_ocr_numeric_debug_flags_compact_repeated_numbers(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "2.2.2.2.2.2.2.2.\n", + stem="ocr-numeric-compact-repeat", + ) + assert len(rows) == 1 + row = rows[0] + assert "repeat_numeric_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "2.2.2.2.2.2.2.2" in content + + +def test_clean_ocr_numeric_debug_flags_same_digit_runs(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "1111 1 1 1 111 11 1 111 1 11\n", + stem="ocr-numeric-same-digit", + ) + assert len(rows) == 1 + row = rows[0] + assert "same_digit_numeric_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert ( + "1111 1 1 1 111 11 1 111 1 11" + in content + ) + + +def test_clean_ocr_numeric_debug_merges_close_same_category_spans(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "1111 1 1 1 1 1 1 1 1 1 1 xy 1111 1 1 1 1 1 1 1 1 1 1\n", + stem="ocr-numeric-gap-merge", + ) + assert len(rows) == 1 + exported = Path(rows[0]["output_path"]) + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert content.count("") == 1 + assert 
( + "" + "1111 1 1 1 1 1 1 1 1 1 1 xy 1111 1 1 1 1 1 1 1 1 1 1" + "" + in content + ) + + +def test_clean_ocr_numeric_debug_flags_numeric_page_collapse(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + tokens = ("22 2 22 6 22 8 22 1 22 7 22 5 " * 12).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + tokens + "\n", + stem="ocr-numeric-page-collapse", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_ignores_punctuation_only_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + tokens = ("1 1 . 1 1 . 2 2 . 2 2 . " * 16).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + tokens + "\n", + stem="ocr-numeric-page-collapse-punct", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_ignores_container_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + numeric_body = ("11 11 11 22 22 22 33 33 33 44 44 44 " * 8).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + f"```\n( {numeric_body} )\n```\n", + stem="ocr-numeric-page-collapse-fenced", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content 
= exported.read_text(encoding="utf-8") + assert "" in content + assert numeric_body in content + + +def test_clean_ocr_numeric_debug_page_collapse_accepts_dotted_numeric_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + dotted_tokens = " ".join(f"{major}.{minor}." for major in range(1, 6) for minor in range(1, 21)) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + dotted_tokens + "\n", + stem="ocr-numeric-page-collapse-dotted", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert dotted_tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_accepts_compact_numeric_atom_pages( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + compact_tokens = " ".join(["1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1."] * 20) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + compact_tokens + "\n", + stem="ocr-numeric-page-collapse-compact-atoms", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert compact_tokens in content + + +def test_clean_ocr_numeric_debug_flags_numeric_block_after_heading(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + numeric_block = "\n\n".join( + f"{i}.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.{i}.1.1.1.1.1.1.1.1.1.1.1.1.1.1" + for i in range(1, 27) + ) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + f"1\n\n## ΑΠΡΙΛΙΟΣ\n\n1\n\n{numeric_block}\n", + stem="ocr-numeric-block-heading", + ) + assert len(rows) == 1 + row = rows[0] + assert 
"numeric_block_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "## ΑΠΡΙΛΙΟΣ" in content + assert "" in content + assert numeric_block in content + + +def test_clean_ocr_numeric_word_debug_docs_runs_numeric_then_word(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "1111 1 1 1 1 1 1 1 1 1 1\n" + "<--- Page Split --->\n" + "1. Από το 2020, η αγορά των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + "
Standard nameStandard nameStandard name
\n" + ), + stem="ocr-number-word-doc", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_count"] == 2 + assert row["matched_page_count"] == 2 + assert row["numeric_match_count"] >= 1 + assert row["word_match_count"] >= 1 + assert "word_repeat" in row["match_types"] + + exported = debug_dir / "ocr-number-word-doc.md" + content = exported.read_text(encoding="utf-8") + assert "<--- Page Split --->" in content + assert content.count("") == 1 + assert "" in content + assert "Standard name" not in content + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["doc_count"] == 1 + assert summary["numeric_match_count"] >= 1 + assert summary["word_match_count"] >= 1 + + page_metrics = (debug_dir / "page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert len(page_metrics) == 2 + + +def test_rust_word_repeat_spans_match_python_reference(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + corpus._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("find_word_repeat_spans",), + ) + cases = [ + "των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ", + "1.1 Hypergeometric function 1.1.1 Hypergeometric function 1.1.2 Hypergeometric function 1.1.3 Hypergeometric function", + r"\Delta \Delta \Delta \Delta \Delta", + "το σημείο 1, το σημείο 2, το σημείο 3, το σημείο 4, το σημείο 5, το σημείο 6", + ] + for text in cases: + normalized, _ = _normalize_alnum_with_map_skip_tags(text) + assert _find_word_repeat_spans( + normalized, + rep_threshold=4, + min_period=3, + window=96, + ) == _find_word_repeat_spans_python( + normalized, + rep_threshold=4, + min_period=3, + window=96, + ) + + +def test_clean_ocr_numeric_word_debug_docs_flags_empty_html_table_collapse(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + empty_table = ( + "" + "" + "" + "" + "" + "" + "" + "
\n" + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + empty_table, + stem="ocr-empty-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-empty-table.md").read_text(encoding="utf-8") + assert "" not in content + assert "|" in content + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["table_match_count"] == 1 + + +def test_clean_ocr_numeric_word_debug_docs_flags_repeated_html_table_rows(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + repeated_table = ( + "" + "" + "" + "" + "" + "" + "
StateValue
Alpha10
Beta20
Alpha10
Beta20
\n" + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated_table, + stem="ocr-repeated-table-rows", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-repeated-table-rows.md").read_text(encoding="utf-8") + assert "" not in content + assert "| Alpha" in content or "| Beta" in content + + +def test_clean_ocr_numeric_word_debug_docs_ignores_small_distinct_html_table(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "" + "" + "" + "" + "
NameScore
Alice10
Bob11
\n" + ), + stem="ocr-distinct-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 0 + assert "table_repeat" not in row["match_types"] + + content = (debug_dir / "ocr-distinct-table.md").read_text(encoding="utf-8") + assert "" not in content + assert "| Name" in content + assert "| Alice" in content + + +def test_clean_ocr_numeric_word_debug_docs_flags_sentence_shell_table(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "
" + "Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας" + "
\n" + ), + stem="ocr-sentence-shell-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-sentence-shell-table.md").read_text(encoding="utf-8") + assert "" not in content + + +def test_clean_ocr_numeric_word_debug_docs_transfers_pure_numeric_repeats_to_numeric( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "12 12 12 12 12 12 12 12 12 12 12 12\n", + stem="ocr-number-transfer", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["numeric_match_count"] >= 1 + assert row["word_match_count"] == 0 + assert "numeric_repeat" in row["match_types"] + assert "word_repeat" not in row["match_types"] + + content = (debug_dir / "ocr-number-transfer.md").read_text(encoding="utf-8") + assert "12 12 12 12 12 12 12 12 12 12 12 12" in content + + +def test_clean_ocr_numeric_word_debug_docs_flags_hybrid_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "1.1 Hypergeometric function 1.1.1 Hypergeometric function 1.1.2 Hypergeometric function 1.1.3 Hypergeometric function 1.1.4 Hypergeometric function\n", + stem="ocr-combined-hybrid", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["hybrid_match_count"] >= 1 + assert "hybrid_repeat" in row["match_types"] + + content = (debug_dir / "ocr-combined-hybrid.md").read_text(encoding="utf-8") + assert "= 1 + + +def test_clean_ocr_numeric_word_debug_docs_ignores_latex_in_shared_repeat(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ S=\frac{1}{16\pi}\int\sqrt{-g}d^{4}x\left[\phi R-\frac{\omega(\phi)}{\phi}\phi_{,a}\phi^{,a}+2\phi\lambda(\phi)\right]+S_{M} \quad (149) \]" + "\n", + stem="ocr-latex-ignore", + 
max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["word_match_count"] == 0 + assert row["latex_match_count"] == 0 + assert "word_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\( \varepsilon_{H} = \frac{1}{2} \left( \frac{1}{2} \left( \frac{1}{2} \left( \frac{1}{2} \left( x \right) \right) \right) \right) \)" + + "\n", + stem="ocr-latex-structural-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-structural-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ uαuαuαuαuαuαuαuαuα \]" + + "\n", + stem="ocr-latex-markup-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-markup-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ K:\mathrm{\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa} \]" + + "\n", + stem="ocr-latex-text-wrapper-noise", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-text-wrapper-noise.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ 
K:\mathrm{\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa \]" + + "\n", + stem="ocr-latex-unclosed-text-wrapper-noise", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-unclosed-text-wrapper-noise.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ \delta R^{\mu\nu}=g^{\mu\alpha}g^{\nu\beta}\left(\nabla_{\kappa}\left(\delta g_{\nu\alpha}\right)\right). \]" + + "\n", + stem="ocr-latex-bookkeeping-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + assert "latex_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-bookkeeping-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-consecutive-exact", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-consecutive-exact.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,02m}{1,5} = 10,05KN \)", + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,03m}{1,5} = 15,07KN \)", + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,04m}{1,5} = 20,10KN \)", + r"\( 
N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,05m}{1,5} = 25,12KN \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-consecutive-template", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-consecutive-template.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join([r"\( \Delta \)", r"\( \Delta \)", r"\( \Delta \)", r"\( \Delta \)"]) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-delta-run", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-delta-run.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + r"\( Q^{I} \) : \( \uparrow\uparrow\uparrow \) + \( \uparrow\downarrow\downarrow \) + ..." + "\n\n" + r"\( Q^{IV} \) : \( \uparrow\uparrow\uparrow \) + \( \downarrow\downarrow\downarrow \) + ..." 
+ "\n" + ), + stem="ocr-latex-diagram-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + assert "latex_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-diagram-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr_*^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr_*^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^5\Psi}{dr_*^5} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^5\Psi}{dr^5} + (\omega^2 - V(r))\Psi = 0 \]", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-derivative-ladder", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-derivative-ladder.md").read_text(encoding="utf-8") + assert content.count("") + + +def test_clean_ocr_numeric_word_debug_docs_ignores_small_parameterized_formula_family( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( f_{11}(k) = (1 - 0.0561)^{k-1}0.0561 \)", + r"\( f_{12}(k) = (1 - 0.0617)^{k-1}0.0617 \)", + r"\( f_{21}(k) = (1 - 0.1057)^{k-1}0.1057 \)", + r"\( f_{22}(k) = (1 - 0.1724)^{k-1}0.1724 \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-parameter-family-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-parameter-family-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + 
r"\( \tilde{p}_{(1,1)(1,2)}^{\prime} \)", + r"\( \tilde{p}_{(1,1)(2,0)}^{\prime} \)", + r"\( \tilde{p}_{(1,1)(1,0)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(1,0)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(2,1)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(2,0)}^{\prime} \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-symbol-inventory-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-symbol-inventory-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"where \( \Delta \) CFF = \( \Delta \) CFF(t) - \( \Delta \) CFF(t-1)." + "\n", + stem="ocr-latex-delta-definition-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-delta-definition-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + vertical_numbers = "\n\n".join(str(i) for i in range(0, 121)) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + vertical_numbers + "\n", + stem="ocr-vertical-numeric-page", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + + content = Path(row["output_path"]).read_text(encoding="utf-8") + assert "" in content + assert "100" in content + assert "120" in content + + +def test_clean_ocr_numeric_word_debug_docs_records_bad_char_metrics(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "Κανονική γραμμή\n<--- Page Split --->\n## \x01\x02\x00 漢 \uf0b7\n", + stem="ocr-bad-char-metrics", + max_docs=1, + ) + assert len(rows) == 1 + + page_metric_rows = [ + json.loads(line) + for line in (debug_dir / 
"page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert len(page_metric_rows) == 2 + second_page = page_metric_rows[1] + assert second_page["bad_char_count"] >= 4 + assert second_page["bad_char_ratio"] > 0.0 + assert second_page["control_count"] >= 3 + assert second_page["cjk_count"] >= 1 + assert second_page["private_use_count"] >= 1 + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["bad_char_ratio"]["max"] > 0.0 + + +def test_clean_ocr_numeric_word_debug_docs_respects_doc_offset(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + (corpus.markdown_dir / "a-first.md").write_text("χωρίς επανάληψη\n", encoding="utf-8") + (corpus.markdown_dir / "b-second.md").write_text( + r"\( \Delta \)" + "\n\n" + r"\( \Delta \)" + "\n\n" + r"\( \Delta \)" + "\n\n" + r"\( \Delta \)" + "\n", + encoding="utf-8", + ) + + debug_dir = corpus.output_dir / "ocr_numeric_word_debug" + rows = corpus.clean_ocr_numeric_word_debug_docs(debug_dir, max_docs=1, doc_offset=1) + + assert len(rows) == 1 + row = rows[0] + assert row["source_stem"] == "b-second" + assert row["latex_match_count"] >= 1 + assert not (debug_dir / "a-first.md").exists() + assert (debug_dir / "b-second.md").exists() + + +def test_clean_ocr_hybrid_debug_flags_same_body_numbered_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Απόκτηση της αξίας του αξιώματος. " + "2. Απόκτηση της αξίας του αξιώματος. " + "3. Απόκτηση της αξίας του αξιώματος. " + "4. Απόκτηση της αξίας του αξιώματος. " + "5. 
Απόκτηση της αξίας του αξιώματος.\n" + ), + stem="ocr-hybrid-same-body", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-same-body__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1.1 Hypergeometric function " + "1.1.1 Hypergeometric function " + "1.1.2 Hypergeometric function " + "1.1.3 Hypergeometric function " + "1.1.4 Hypergeometric function " + "1.1.5 Hypergeometric function\n" + ), + stem="ocr-hybrid-hierarchical", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-hierarchical__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Σχεδία 1.1. Σχεδία 1.2. Σχεδία 1.3. Σχεδία 1.4. Σχεδία 1.5. Σχεδ\n" + ), + stem="ocr-hybrid-partial-tail", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-partial-tail__debug_page_00001.md").read_text(encoding="utf-8") + assert "1.5. Σχεδ" in content + assert content.index("1.5. Σχεδ") < content.index("") + + +def test_clean_ocr_hybrid_debug_flags_body_cycle_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Εισαγωγή 2. Φυσικοχημικές ιδιότητες 3. Φάσεις 4. Επιπλοκές " + "5. Εισαγωγή 6. Φυσικοχημικές ιδιότητες 7. Φάσεις 8. Επιπλοκές " + "9. Εισαγωγή 10. Φυσικοχημικές ιδιότητες 11. Φάσεις 12. Επιπλοκές\n" + ), + stem="ocr-hybrid-cycle", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-cycle__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. 
Από το σημείο 1, το σημείο 2, το σημείο 3, " + "το σημείο 4, το σημείο 5, το σημείο 6, το σημείο 7.\n" + ), + stem="ocr-hybrid-inline-progress", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-inline-progress__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Από το σημείο 1, το σημείο 2, το σημείο 3, " + "το σημείο 4, το σημείο 5.\n" + ), + stem="ocr-hybrid-inline-short-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_hybrid_debug_ignores_diverse_numbered_list(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Εισαγωγή 2. Μέθοδοι 3. Αποτελέσματα 4. Συζήτηση 5. Συμπεράσματα\n" + ), + stem="ocr-hybrid-diverse-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_hybrid_debug_ignores_markup_number_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + 'Π ' + 'Π ' + 'Π ' + 'Π\n' + ), + stem="ocr-hybrid-markup-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_latex_slot_progression_debug_flags_derivative_ladder( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^2\Psi}{dr^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr_*^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr_*^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr^4} + (\omega^2 - V(r))\Psi = 0 \]", + ] + ) + rows, debug_dir = _run_clean_ocr_latex_slot_progression_debug_export( + corpus, + 
repeated + "\n", + stem="ocr-latex-slot-derivative", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-latex-slot-derivative__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( f_{11}(k) = (1 - 0.0561)^{k-1}0.0561 \)", + r"\( f_{12}(k) = (1 - 0.0617)^{k-1}0.0617 \)", + r"\( f_{21}(k) = (1 - 0.1057)^{k-1}0.1057 \)", + r"\( f_{22}(k) = (1 - 0.1724)^{k-1}0.1724 \)", + ] + ) + rows, debug_dir = _run_clean_ocr_latex_slot_progression_debug_export( + corpus, + repeated + "\n", + stem="ocr-latex-slot-parameter-family-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_review_manifest_materialize_creates_labeled_copies(tmp_path: Path) -> None: + source_dir = tmp_path / "contexts" + source_dir.mkdir() + first = source_dir / "case_001.txt" + second = source_dir / "case_002.txt" + first.write_text("alpha body\n", encoding="utf-8") + second.write_text("beta body\n", encoding="utf-8") + + manifest = tmp_path / "semantic_review_manifest.jsonl" + manifest.write_text( + "\n".join( + [ + json.dumps( + { + "path": str(first), + "label": "fits_semantically", + "confidence": "high", + "notes": "complete", + }, + ensure_ascii=False, + ), + json.dumps( + { + "path": str(second), + "label": "fits_but_truncated_or_incomplete", + "confidence": "medium", + "notes": "cut off", + }, + ensure_ascii=False, + ), + ] + ) + + "\n", + encoding="utf-8", + ) + + output_dir = tmp_path / "categorized" + summary = materialize_manifest_categories( + manifest, + output_dir, + category_name="semantic_fit", + ) + + assert summary["row_count"] == 2 + fit_copy = output_dir / "by_label" / "fits_semantically" / "case_001.txt" + trunc_copy = output_dir / "by_label" / "fits_but_truncated_or_incomplete" / "case_002.txt" + assert fit_copy.exists() + assert trunc_copy.exists() + + fit_text = fit_copy.read_text(encoding="utf-8") + assert "REVIEW_CATEGORY: 
semantic_fit" in fit_text + assert "REVIEW_LABEL: fits_semantically" in fit_text + assert "=== REVIEW_SOURCE_CONTENT ===" in fit_text + assert "alpha body" in fit_text + + +def test_table_markdown_audit_preserves_semantic_inline_html() -> None: + audit = audit_table( + Path("/tmp/demo.md"), + 1, + 1, + ( + "" + "" + "" + "
Line A
Line B
xi2source\"diagram\"
" + ), + ) + assert audit.convertible is True + assert audit.markdown is not None + assert "Line A
Line B" in audit.markdown + assert "xi2" in audit.markdown + assert "[source](https://example.com)" in audit.markdown + assert "diagram" in audit.markdown + + +def test_table_markdown_audit_writes_clean_markdown_file(tmp_path: Path) -> None: + audit = audit_table( + Path("/tmp/demo.md"), + 1, + 7, + "
ΑΒ
12
", + ) + output = write_clean_markdown_file(tmp_path, audit) + assert output is not None + path = Path(output) + assert path.exists() + text = path.read_text(encoding="utf-8") + assert text.startswith("## ORIGINAL_HTML") + assert "## GITHUB_MD" in text + assert "" in text + assert "Α" in text + assert "1" in text diff --git a/tests/test_ocr_golden_pages.py b/tests/test_ocr_golden_pages.py new file mode 100644 index 0000000..6274d96 --- /dev/null +++ b/tests/test_ocr_golden_pages.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import difflib +import json +from pathlib import Path + +from glossapi import Corpus +from glossapi.corpus.phase_clean import _render_combined_ocr_debug_page + + +GOLDEN_DIR = Path( + "/home/foivos/data/openarchives_ocr_ingest_20260403/debug/ocr_golden_pages_first300_20260410" +) + + +def _load_manifest_rows() -> list[dict]: + manifest_path = GOLDEN_DIR / "manifest.jsonl" + assert manifest_path.exists(), f"Missing OCR golden manifest: {manifest_path}" + return [ + json.loads(line) + for line in manifest_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def _format_diff(case_id: str, expected: str, actual: str) -> str: + diff = list( + difflib.unified_diff( + expected.splitlines(), + actual.splitlines(), + fromfile=f"{case_id}:expected", + tofile=f"{case_id}:actual", + lineterm="", + n=3, + ) + ) + return "\n".join(diff[:120]) + + +def test_combined_ocr_real_goldens_match_exact_output(tmp_path: Path) -> None: + rows = _load_manifest_rows() + assert len(rows) >= 300, f"Expected hundreds of real OCR golden cases, got {len(rows)}" + + corpus = Corpus(input_dir=tmp_path / "input", output_dir=tmp_path / "output") + corpus.input_dir.mkdir(parents=True, exist_ok=True) + corpus.output_dir.mkdir(parents=True, exist_ok=True) + noise_mod = corpus._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("find_numeric_debug_page_spans", "evaluate_page_character_noise"), + ) + + 
mismatches: list[str] = [] + for row in rows: + case_id = str(row["case_id"]) + input_path = Path(str(row["input_path"])) + expected_path = Path(str(row["expected_path"])) + page_text = input_path.read_text(encoding="utf-8") + expected = expected_path.read_text(encoding="utf-8") + actual = _render_combined_ocr_debug_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=10, + min_repeat_steps=8, + min_same_digit_steps=10, + word_rep_threshold=4, + word_min_period=3, + word_window=96, + )["annotated_page"] + if actual != expected: + mismatches.append(_format_diff(case_id, expected, actual)) + if len(mismatches) >= 5: + break + + assert not mismatches, "\n\n".join(mismatches) From 586a543b017d6ae1c8fdbb6873e13078c8bca2bd Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 20:30:46 +0300 Subject: [PATCH 75/93] reuse filtered page views in OCR analyzer --- src/glossapi/corpus/phase_clean.py | 60 ++++++++++++++++++------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 7e4a75b..6d44551 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -703,16 +703,19 @@ def _parse_hybrid_numeric_value(token: str) -> Optional[float]: return None -def _extract_hybrid_numbered_items( +def _prepare_hybrid_analysis_text( page_text: str, *, blocked_spans: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: +) -> str: analysis_text = _filter_tables_preserve_layout(page_text) analysis_text = _filter_latex_preserve_layout(analysis_text) analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + return analysis_text + +def _extract_hybrid_numbered_items_from_analysis_text(analysis_text: str) -> List[Dict[str, Any]]: candidates: List[Dict[str, Any]] = [] for match in HYBRID_PREFIX_RE.finditer(analysis_text): field = 
_classify_hybrid_numeric_field(match.group("prefix")) @@ -756,16 +759,7 @@ def _extract_hybrid_numbered_items( return items -def _extract_hybrid_inline_numeric_items( - page_text: str, - *, - blocked_spans: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - analysis_text = _filter_tables_preserve_layout(page_text) - analysis_text = _filter_latex_preserve_layout(analysis_text) - analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) - analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) - +def _extract_hybrid_inline_numeric_items_from_analysis_text(analysis_text: str) -> List[Dict[str, Any]]: clause_ranges: List[Tuple[int, int]] = [] clause_start = 0 for match in HYBRID_INLINE_CLAUSE_DELIMITER_RE.finditer(analysis_text): @@ -1110,11 +1104,16 @@ def _find_hybrid_numbered_repeat_spans( page_text: str, *, blocked_spans: List[Dict[str, Any]], + analysis_text: Optional[str] = None, ) -> List[Dict[str, Any]]: - items = _extract_hybrid_numbered_items(page_text, blocked_spans=blocked_spans) + if analysis_text is None: + analysis_text = _prepare_hybrid_analysis_text(page_text, blocked_spans=blocked_spans) + else: + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + items = _extract_hybrid_numbered_items_from_analysis_text(analysis_text) spans = _find_hybrid_same_body_progression_spans(items) spans.extend(_find_hybrid_cycle_progression_spans(items)) - inline_items = _extract_hybrid_inline_numeric_items(page_text, blocked_spans=blocked_spans) + inline_items = _extract_hybrid_inline_numeric_items_from_analysis_text(analysis_text) spans.extend(_find_hybrid_inline_progression_spans(inline_items)) spans.sort(key=lambda item: (int(item["start"]), -(int(item["end"]) - int(item["start"])))) @@ -1454,9 +1453,11 @@ def _find_latex_repeat_spans( rep_threshold: int, min_period: int, window: int, + analysis_text: Optional[str] = None, ) -> List[Dict[str, Any]]: - analysis_text = 
_filter_tables_preserve_layout(page_text) - analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + if analysis_text is None: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) labeled_spans: List[Dict[str, Any]] = [] @@ -1529,9 +1530,11 @@ def _find_latex_slot_progression_spans( page_text: str, *, blocked_spans: List[Dict[str, Any]], + analysis_text: Optional[str] = None, ) -> List[Dict[str, Any]]: - analysis_text = _filter_tables_preserve_layout(page_text) - analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + if analysis_text is None: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) segments = _extract_latex_segments(analysis_text) @@ -1725,10 +1728,12 @@ def _find_labeled_shared_repeat_spans( rep_threshold: int, min_period: int, window: int, + analysis_text: Optional[str] = None, ) -> List[Dict[str, Any]]: - analysis_text = _filter_tables_preserve_layout(page_text) - analysis_text = _filter_latex_preserve_layout(analysis_text) - analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + if analysis_text is None: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _filter_latex_preserve_layout(analysis_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) normalized_text, raw_map = _normalize_alnum_with_map_skip_tags(analysis_text) normalized_spans = _find_word_repeat_spans( @@ -1782,8 +1787,12 @@ def _render_combined_ocr_page( table_spans = _find_table_repeat_spans(page_text) table_elapsed = time.perf_counter() - table_start - 
numeric_analysis_page = _filter_tables_preserve_layout(page_text) - numeric_analysis_page = _filter_latex_preserve_layout(numeric_analysis_page) + page_without_tables = _filter_tables_preserve_layout(page_text) + page_without_tables_existing = _blank_existing_match_regions_preserve_layout(page_without_tables) + page_without_tables_latex = _filter_latex_preserve_layout(page_without_tables) + page_without_tables_latex_existing = _blank_existing_match_regions_preserve_layout( + page_without_tables_latex + ) numeric_start = time.perf_counter() numeric_spans = [ @@ -1794,7 +1803,7 @@ def _render_combined_ocr_page( "category": MATCH_CATEGORY_BY_TYPE[str(item["match_type"])], } for item in noise_mod.find_numeric_debug_page_spans( - numeric_analysis_page, + page_without_tables_latex, int(min_progress_steps), int(min_repeat_steps), int(min_same_digit_steps), @@ -1809,6 +1818,7 @@ def _render_combined_ocr_page( rep_threshold=int(word_rep_threshold), min_period=int(word_min_period), window=int(word_window), + analysis_text=page_without_tables_existing, ) latex_elapsed = time.perf_counter() - latex_start @@ -1816,6 +1826,7 @@ def _render_combined_ocr_page( hybrid_spans = _find_hybrid_numbered_repeat_spans( page_text, blocked_spans=table_spans + numeric_spans + latex_spans, + analysis_text=page_without_tables_latex_existing, ) hybrid_elapsed = time.perf_counter() - hybrid_start @@ -1826,6 +1837,7 @@ def _render_combined_ocr_page( rep_threshold=int(word_rep_threshold), min_period=int(word_min_period), window=int(word_window), + analysis_text=page_without_tables_latex_existing, ) shared_elapsed = time.perf_counter() - shared_start From bbc643b6ddf0bc9cd992b5428bc5b7076a1b5f26 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 21:53:53 +0300 Subject: [PATCH 76/93] speed up hybrid OCR matching in Rust --- rust/glossapi_rs_noise/Cargo.lock | 25 + rust/glossapi_rs_noise/Cargo.toml | 1 + rust/glossapi_rs_noise/src/lib.rs | 24 +- 
rust/glossapi_rs_noise/src/noise_metrics.rs | 868 ++++++++++++++++++++ src/glossapi/corpus/phase_clean.py | 14 + 5 files changed, 931 insertions(+), 1 deletion(-) diff --git a/rust/glossapi_rs_noise/Cargo.lock b/rust/glossapi_rs_noise/Cargo.lock index f38df1f..f68e0a8 100644 --- a/rust/glossapi_rs_noise/Cargo.lock +++ b/rust/glossapi_rs_noise/Cargo.lock @@ -116,6 +116,7 @@ dependencies = [ "rand", "rayon", "regex", + "unicode-normalization", "walkdir", ] @@ -456,12 +457,36 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unindent" version = "0.1.11" diff --git a/rust/glossapi_rs_noise/Cargo.toml b/rust/glossapi_rs_noise/Cargo.toml index fd4cebc..463e884 100644 --- a/rust/glossapi_rs_noise/Cargo.toml +++ b/rust/glossapi_rs_noise/Cargo.toml @@ -22,3 +22,4 @@ anyhow = "1" regex = "1.10" glossapi_rs_common = { path = "../glossapi_rs_common" } rand = { version = "0.8", features = ["std_rng"] } +unicode-normalization = "0.1" diff --git 
a/rust/glossapi_rs_noise/src/lib.rs b/rust/glossapi_rs_noise/src/lib.rs index e6f09bf..6d06ce5 100644 --- a/rust/glossapi_rs_noise/src/lib.rs +++ b/rust/glossapi_rs_noise/src/lib.rs @@ -5,7 +5,8 @@ mod noise_metrics; use noise_metrics::{ annotate_numeric_debug_page_internal, evaluate_page_character_noise_internal, export_numeric_match_debug_pages_internal, export_ocr_match_debug_pages_internal, - find_numeric_debug_page_spans_internal, find_word_repeat_spans_internal, + find_hybrid_repeat_spans_internal, find_numeric_debug_page_spans_internal, + find_word_repeat_spans_internal, score_markdown_directory_detailed_internal, score_markdown_directory_internal, score_markdown_directory_ocr_profile_internal, score_markdown_file_detailed_internal, score_markdown_file_internal, @@ -363,6 +364,26 @@ fn find_word_repeat_spans( Ok(out) } +#[pyfunction] +fn find_hybrid_repeat_spans(py: Python<'_>, analysis_text: &str) -> PyResult>> { + let spans = find_hybrid_repeat_spans_internal(analysis_text); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("match_types", vec!["hybrid_repeat"])?; + item.set_item("category", "hybrid")?; + item.set_item("kind", span.kind)?; + item.set_item("item_count", span.item_count)?; + if let Some(cycle_len) = span.cycle_len { + item.set_item("cycle_len", cycle_len)?; + } + out.push(item.into()); + } + Ok(out) +} + #[pyfunction] fn evaluate_page_character_noise(py: Python<'_>, page: &str) -> PyResult> { let metrics = evaluate_page_character_noise_internal(page); @@ -389,6 +410,7 @@ fn glossapi_rs_noise(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(annotate_numeric_debug_page, m)?)?; m.add_function(wrap_pyfunction!(find_numeric_debug_page_spans, m)?)?; m.add_function(wrap_pyfunction!(find_word_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_hybrid_repeat_spans, m)?)?; 
m.add_function(wrap_pyfunction!(evaluate_page_character_noise, m)?)?; Ok(()) } diff --git a/rust/glossapi_rs_noise/src/noise_metrics.rs b/rust/glossapi_rs_noise/src/noise_metrics.rs index ad305aa..8127810 100644 --- a/rust/glossapi_rs_noise/src/noise_metrics.rs +++ b/rust/glossapi_rs_noise/src/noise_metrics.rs @@ -78,6 +78,7 @@ use rayon::ThreadPoolBuilder; use std::fs::{self, File}; use std::io::Read; use std::path::{Path, PathBuf}; +use unicode_normalization::UnicodeNormalization; use walkdir::WalkDir; // Avoid heavy regex for table detection; use lightweight checks instead @@ -245,6 +246,15 @@ pub struct WordRepeatSpan { pub tail_chars: usize, } +#[derive(Debug, Clone)] +pub struct HybridRepeatSpan { + pub start: usize, + pub end: usize, + pub kind: &'static str, + pub item_count: usize, + pub cycle_len: Option, +} + #[derive(Debug, Clone, Default)] pub struct PageCharacterNoise { pub total_chars: u64, @@ -257,6 +267,64 @@ pub struct PageCharacterNoise { } const MERGE_SAME_CATEGORY_MAX_NONWHITESPACE_GAP: usize = 10; +const HYBRID_REPEAT_MIN_ITEMS: usize = 4; +const HYBRID_REPEAT_MIN_BODY_ALNUM: usize = 6; +const HYBRID_REPEAT_MAX_CYCLE: usize = 6; +const HYBRID_REPEAT_MIN_CYCLE_ITEMS: usize = 8; +const HYBRID_INLINE_CONTEXT_WORDS: usize = 2; +const HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS: usize = 2; +const HYBRID_INLINE_CONTEXT_MIN_CHARS: usize = 8; +const HYBRID_INLINE_REPEAT_MIN_ITEMS: usize = 6; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HybridFieldKind { + HeaderCounter, + NumericValue, +} + +#[derive(Debug, Clone)] +struct HybridNumberedItem { + start: usize, + end: usize, + field_kind: HybridFieldKind, + numbers: Vec, + shape: String, + body_key: String, + body_is_full: bool, +} + +#[derive(Debug, Clone)] +struct HybridInlineItem { + start: usize, + end: usize, + clause_index: usize, + inline_context_key: String, + numeric_value: f64, +} + +#[derive(Debug, Clone)] +struct HybridCandidate { + prefix_start_byte: usize, + prefix_end_byte: usize, + 
field_kind: HybridFieldKind, + numbers: Vec, + shape: String, +} + +#[derive(Debug, Clone)] +struct HybridToken { + kind: HybridTokenKind, + start: usize, + end: usize, + token_key: Option, + numeric_value: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HybridTokenKind { + Numeric, + Alpha, +} #[derive(Debug, Clone, Copy)] struct TokenSpan { @@ -1378,6 +1446,806 @@ const WORD_REPEAT_HASH_MASK: u64 = (1u64 << 63).wrapping_mul(2).wrapping_sub(1); const WORD_REPEAT_HASH_BASE: u64 = 1469598103934665603u64; #[inline] +fn hybrid_text_char_boundaries(text: &str) -> Vec { + let mut boundaries = Vec::with_capacity(text.chars().count() + 1); + for (byte_idx, _) in text.char_indices() { + boundaries.push(byte_idx); + } + boundaries.push(text.len()); + boundaries +} + +fn hybrid_byte_to_char_idx(boundaries: &[usize], byte_idx: usize) -> usize { + match boundaries.binary_search(&byte_idx) { + Ok(idx) => idx, + Err(idx) => idx, + } +} + +fn hybrid_normalize_body(text: &str) -> String { + let mut out = String::with_capacity(text.len()); + for ch in text.chars() { + for lower in ch.to_lowercase() { + let lower = if lower == 'ς' { 'σ' } else { lower }; + for sub in lower.to_string().nfd() { + if sub.is_alphanumeric() { + let mapped = match sub { + 'ο' => 'o', + 'κ' => 'k', + _ => sub, + }; + out.push(mapped); + } + } + } + } + out +} + +fn hybrid_has_markup_body(text: &str) -> bool { + if text.is_empty() { + return false; + } + let lower = text.to_lowercase(); + if lower.contains("src=") + || lower.contains("alt=") + || lower.contains("image_") + || lower.contains(".png") + || lower.contains(".jpg") + || lower.contains(".jpeg") + || lower.contains(".gif") + { + return true; + } + + let bytes = text.as_bytes(); + for (idx, byte) in bytes.iter().enumerate() { + if *byte == b'<' && idx + 2 <= bytes.len() && bytes[idx + 1..].contains(&b'>') { + return true; + } + } + false +} + +fn hybrid_classify_numeric_field(token: &str) -> Option<(HybridFieldKind, Vec, String)> { 
+ let token = token.trim(); + if token.is_empty() { + return None; + } + + let trailing_paren = token.ends_with(')'); + let trailing_dot = token.ends_with('.'); + let stripped = if trailing_paren || trailing_dot { + &token[..token.len() - 1] + } else { + token + }; + if stripped.is_empty() { + return None; + } + + if stripped.contains('/') { + return Some((HybridFieldKind::NumericValue, Vec::new(), String::new())); + } + + let parts: Vec<&str> = stripped.split('.').collect(); + if parts.is_empty() || parts.iter().any(|part| part.is_empty() || !part.chars().all(|ch| ch.is_ascii_digit())) { + return None; + } + + let mut numbers = Vec::with_capacity(parts.len()); + for part in &parts { + numbers.push(part.parse::().ok()?); + } + + let mut shape = std::iter::repeat("#") + .take(numbers.len()) + .collect::>() + .join("."); + if trailing_paren { + shape.push(')'); + } else if trailing_dot { + shape.push('.'); + } + + let field_kind = if trailing_paren || trailing_dot { + HybridFieldKind::HeaderCounter + } else if numbers.len() >= 3 { + HybridFieldKind::HeaderCounter + } else if numbers.len() == 2 && parts.last().map(|part| part.len()).unwrap_or(0) <= 2 { + HybridFieldKind::HeaderCounter + } else { + HybridFieldKind::NumericValue + }; + + Some((field_kind, numbers, shape)) +} + +fn hybrid_classify_inline_numeric_field(token: &str) -> bool { + let stripped = token.trim(); + if stripped.is_empty() { + return false; + } + + if stripped.chars().all(|ch| ch.is_ascii_digit()) { + return true; + } + + if stripped.matches('/').count() == 1 { + let mut parts = stripped.split('/'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + return !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs != "0"; + } + + let decimal_candidate = stripped.replacen(',', ".", 1); + if decimal_candidate.matches('.').count() == 1 { + let mut parts = decimal_candidate.split('.'); + let 
lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + return !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()); + } + + false +} + +fn hybrid_parse_numeric_value(token: &str) -> Option { + let stripped = token.trim(); + if stripped.is_empty() { + return None; + } + + if stripped.chars().all(|ch| ch.is_ascii_digit()) { + return stripped.parse::().ok().map(|value| value as f64); + } + + if stripped.matches('/').count() == 1 { + let mut parts = stripped.split('/'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + if !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + { + let lhs_value = lhs.parse::().ok()?; + let rhs_value = rhs.parse::().ok()?; + if rhs_value != 0.0 { + return Some(lhs_value / rhs_value); + } + } + return None; + } + + let decimal_candidate = stripped.replacen(',', ".", 1); + if decimal_candidate.matches('.').count() == 1 { + let mut parts = decimal_candidate.split('.'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + if !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + { + return decimal_candidate.parse::().ok(); + } + } + + None +} + +fn hybrid_next_char(text: &str, byte_idx: usize) -> Option<(char, usize)> { + let ch = text[byte_idx..].chars().next()?; + Some((ch, byte_idx + ch.len_utf8())) +} + +fn hybrid_previous_char(text: &str, byte_idx: usize) -> Option { + text[..byte_idx].chars().next_back() +} + +fn hybrid_parse_prefix_at(text: &str, start: usize) -> Option { + if start >= text.len() { + return None; + } + if let Some(prev) = hybrid_previous_char(text, start) { + if prev.is_ascii_digit() { + return None; + } + } + + let (first, mut idx) = hybrid_next_char(text, start)?; + if !first.is_ascii_digit() { + return 
None; + } + while idx < text.len() { + let (ch, next_idx) = hybrid_next_char(text, idx)?; + if !ch.is_ascii_digit() { + break; + } + idx = next_idx; + } + + if idx >= text.len() { + return None; + } + let (delimiter, mut end_idx) = hybrid_next_char(text, idx)?; + match delimiter { + ')' => {} + '.' => { + loop { + let mut cursor = end_idx; + let mut saw_digit = false; + while cursor < text.len() { + let (ch, next_cursor) = hybrid_next_char(text, cursor)?; + if !ch.is_ascii_digit() { + break; + } + saw_digit = true; + cursor = next_cursor; + } + if saw_digit { + if cursor < text.len() { + let (ch, next_cursor) = hybrid_next_char(text, cursor)?; + if ch == '.' { + end_idx = next_cursor; + continue; + } + } + end_idx = cursor; + } + break; + } + } + _ => return None, + } + + let mut lookahead = end_idx; + while lookahead < text.len() { + let (ch, next_idx) = hybrid_next_char(text, lookahead)?; + if !ch.is_whitespace() { + return ch.is_alphabetic().then_some(end_idx); + } + lookahead = next_idx; + } + None +} + +fn hybrid_extract_numbered_items(analysis_text: &str) -> Vec { + let boundaries = hybrid_text_char_boundaries(analysis_text); + let mut candidates: Vec = Vec::new(); + let mut byte_idx = 0usize; + while byte_idx < analysis_text.len() { + let (ch, next_idx) = match hybrid_next_char(analysis_text, byte_idx) { + Some(value) => value, + None => break, + }; + if ch.is_ascii_digit() { + if let Some(prefix_end_byte) = hybrid_parse_prefix_at(analysis_text, byte_idx) { + let prefix = &analysis_text[byte_idx..prefix_end_byte]; + if let Some((field_kind, numbers, shape)) = hybrid_classify_numeric_field(prefix) { + candidates.push(HybridCandidate { + prefix_start_byte: byte_idx, + prefix_end_byte, + field_kind, + numbers, + shape, + }); + } + byte_idx = prefix_end_byte; + continue; + } + } + byte_idx = next_idx; + } + + let mut items: Vec = Vec::new(); + for (idx, candidate) in candidates.iter().enumerate() { + let next_start = candidates + .get(idx + 1) + .map(|item| 
item.prefix_start_byte) + .unwrap_or_else(|| analysis_text.len()); + let body_raw = analysis_text[candidate.prefix_end_byte..next_start].trim(); + if hybrid_has_markup_body(body_raw) { + continue; + } + let body_key = hybrid_normalize_body(body_raw); + let has_alpha = body_key.chars().any(|ch| ch.is_alphabetic()); + if !has_alpha { + continue; + } + let body_is_full = body_key.chars().count() >= HYBRID_REPEAT_MIN_BODY_ALNUM; + items.push(HybridNumberedItem { + start: hybrid_byte_to_char_idx(&boundaries, candidate.prefix_start_byte), + end: hybrid_byte_to_char_idx(&boundaries, next_start), + field_kind: candidate.field_kind, + numbers: candidate.numbers.clone(), + shape: candidate.shape.clone(), + body_key, + body_is_full, + }); + } + + items +} + +fn hybrid_clause_ranges(text: &str) -> Vec<(usize, usize)> { + let mut ranges: Vec<(usize, usize)> = Vec::new(); + let mut clause_start = 0usize; + let mut iter = text.char_indices().peekable(); + while let Some((idx, ch)) = iter.next() { + let is_delimiter = match ch { + ';' | '\n' => true, + ',' => match iter.peek() { + Some((_, next_ch)) => !next_ch.is_ascii_digit(), + None => true, + }, + _ => false, + }; + if is_delimiter { + ranges.push((clause_start, idx)); + clause_start = idx + ch.len_utf8(); + } + } + ranges.push((clause_start, text.len())); + ranges +} + +fn hybrid_extract_inline_items(analysis_text: &str) -> Vec { + let boundaries = hybrid_text_char_boundaries(analysis_text); + let clause_ranges = hybrid_clause_ranges(analysis_text); + let mut items: Vec = Vec::new(); + + for (clause_index, (raw_start, raw_end)) in clause_ranges.iter().enumerate() { + let clause = &analysis_text[*raw_start..*raw_end]; + if clause.trim().is_empty() { + continue; + } + + let leading_ws = clause.len() - clause.trim_start().len(); + let trailing_ws = clause.len() - clause.trim_end().len(); + let clause_start_abs = raw_start + leading_ws; + let clause_end_abs = raw_end - trailing_ws; + if clause_start_abs >= clause_end_abs { + 
continue; + } + + let clause_text = &analysis_text[clause_start_abs..clause_end_abs]; + if clause_text.is_empty() || hybrid_has_markup_body(clause_text) { + continue; + } + + let mut working_offset = clause_start_abs; + let mut working_text = clause_text; + if let Some(prefix_end) = hybrid_parse_prefix_at(working_text, 0) { + let trimmed = working_text[prefix_end..].trim_start(); + let trimmed_leading = working_text[prefix_end..].len() - trimmed.len(); + working_offset += prefix_end + trimmed_leading; + working_text = trimmed; + } + if working_text.is_empty() { + continue; + } + + let mut tokens: Vec = Vec::new(); + let mut numeric_positions: Vec = Vec::new(); + let mut token_byte = 0usize; + while token_byte < working_text.len() { + let (ch, next_idx) = match hybrid_next_char(working_text, token_byte) { + Some(value) => value, + None => break, + }; + if ch.is_ascii_digit() { + let mut end = next_idx; + loop { + let mut cursor = end; + while cursor < working_text.len() { + let (digit_ch, digit_next) = match hybrid_next_char(working_text, cursor) { + Some(value) => value, + None => break, + }; + if !digit_ch.is_ascii_digit() { + break; + } + cursor = digit_next; + } + end = cursor; + if end >= working_text.len() { + break; + } + let (sep, sep_next) = match hybrid_next_char(working_text, end) { + Some(value) => value, + None => break, + }; + if !matches!(sep, '.' 
| ',' | '/') { + break; + } + if sep_next >= working_text.len() { + break; + } + let (after_sep, _) = match hybrid_next_char(working_text, sep_next) { + Some(value) => value, + None => break, + }; + if !after_sep.is_ascii_digit() { + break; + } + end = sep_next; + } + let token = &working_text[token_byte..end]; + if hybrid_classify_inline_numeric_field(token) { + if let Some(parsed_value) = hybrid_parse_numeric_value(token) { + numeric_positions.push(tokens.len()); + tokens.push(HybridToken { + kind: HybridTokenKind::Numeric, + start: hybrid_byte_to_char_idx(&boundaries, working_offset + token_byte), + end: hybrid_byte_to_char_idx(&boundaries, working_offset + end), + token_key: None, + numeric_value: Some(parsed_value), + }); + } + } + token_byte = end; + continue; + } + if ch.is_alphabetic() { + let mut end = next_idx; + while end < working_text.len() { + let (next_ch, next_end) = match hybrid_next_char(working_text, end) { + Some(value) => value, + None => break, + }; + if !next_ch.is_alphabetic() { + break; + } + end = next_end; + } + let token = &working_text[token_byte..end]; + let token_key = hybrid_normalize_body(token); + if !token_key.is_empty() { + tokens.push(HybridToken { + kind: HybridTokenKind::Alpha, + start: hybrid_byte_to_char_idx(&boundaries, working_offset + token_byte), + end: hybrid_byte_to_char_idx(&boundaries, working_offset + end), + token_key: Some(token_key), + numeric_value: None, + }); + } + token_byte = end; + continue; + } + token_byte = next_idx; + } + + if numeric_positions.len() != 1 { + continue; + } + let numeric_pos = numeric_positions[0]; + let numeric_token = &tokens[numeric_pos]; + let left_alpha: Vec<&HybridToken> = tokens[..numeric_pos] + .iter() + .filter(|token| token.kind == HybridTokenKind::Alpha) + .collect(); + let right_alpha: Vec<&HybridToken> = tokens[numeric_pos + 1..] 
+ .iter() + .filter(|token| token.kind == HybridTokenKind::Alpha) + .collect(); + + let left_start = left_alpha.len().saturating_sub(HYBRID_INLINE_CONTEXT_WORDS); + let left_context = &left_alpha[left_start..]; + let right_limit = std::cmp::min(HYBRID_INLINE_CONTEXT_WORDS, right_alpha.len()); + let right_context = &right_alpha[..right_limit]; + let alpha_word_count = left_context.len() + right_context.len(); + if alpha_word_count < HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS { + continue; + } + + let mut context_parts: Vec = + Vec::with_capacity(left_context.len() + 1 + right_context.len()); + for token in left_context { + if let Some(token_key) = &token.token_key { + context_parts.push(token_key.clone()); + } + } + context_parts.push("num".to_string()); + for token in right_context { + if let Some(token_key) = &token.token_key { + context_parts.push(token_key.clone()); + } + } + let context_key = hybrid_normalize_body(&context_parts.join(" ")); + if context_key.chars().count() < HYBRID_INLINE_CONTEXT_MIN_CHARS { + continue; + } + + let item_start = left_context + .first() + .map(|token| token.start) + .unwrap_or(numeric_token.start); + let item_end = right_context + .last() + .map(|token| token.end) + .unwrap_or(numeric_token.end); + items.push(HybridInlineItem { + start: item_start, + end: item_end, + clause_index, + inline_context_key: context_key, + numeric_value: numeric_token.numeric_value.unwrap_or(0.0), + }); + } + + items +} + +fn hybrid_partial_body_matches(candidate_body_key: &str, target_body_key: &str) -> bool { + if candidate_body_key.is_empty() || target_body_key.is_empty() || candidate_body_key == target_body_key { + return false; + } + if !target_body_key.starts_with(candidate_body_key) { + return false; + } + let target_len = target_body_key.chars().count(); + let candidate_len = candidate_body_key.chars().count(); + let min_chars = std::cmp::min(4usize, target_len); + let min_ratio_chars = std::cmp::max(1usize, (target_len + 1) / 2); + candidate_len >= 
std::cmp::min(min_chars, min_ratio_chars) +} + +fn hybrid_header_progresses(previous: &HybridNumberedItem, current: &HybridNumberedItem) -> bool { + previous.field_kind == HybridFieldKind::HeaderCounter + && current.field_kind == HybridFieldKind::HeaderCounter + && !previous.numbers.is_empty() + && previous.numbers.len() == current.numbers.len() + && previous.numbers[..previous.numbers.len() - 1] == current.numbers[..current.numbers.len() - 1] + && current.numbers.last().copied() == previous.numbers.last().copied().map(|value| value + 1) +} + +fn hybrid_header_is_parent(previous: &HybridNumberedItem, current: &HybridNumberedItem) -> bool { + previous.field_kind == HybridFieldKind::HeaderCounter + && current.field_kind == HybridFieldKind::HeaderCounter + && !previous.numbers.is_empty() + && previous.numbers.len() + 1 == current.numbers.len() + && current.numbers[..current.numbers.len() - 1] == previous.numbers[..] +} + +fn hybrid_extend_tail_span_end( + items: &[HybridNumberedItem], + run_start: usize, + run_end: usize, + expected_body_key: &str, +) -> usize { + let span_end = items[run_end - 1].end; + if run_end >= items.len() { + return span_end; + } + let tail = &items[run_end]; + if tail.field_kind != HybridFieldKind::HeaderCounter + || tail.shape != items[run_start].shape + || !hybrid_header_progresses(&items[run_end - 1], tail) + || !hybrid_partial_body_matches(&tail.body_key, expected_body_key) + { + return span_end; + } + tail.end +} + +fn hybrid_inline_step(previous: &HybridInlineItem, current: &HybridInlineItem) -> Option { + if current.clause_index != previous.clause_index + 1 + || current.inline_context_key != previous.inline_context_key + { + return None; + } + let step = current.numeric_value - previous.numeric_value; + (step > 0.0).then_some(step) +} + +fn hybrid_inline_step_matches(expected_step: f64, actual_step: f64) -> bool { + let tolerance = f64::max(1e-9, expected_step.abs() * 1e-6); + (expected_step - actual_step).abs() <= tolerance +} + +fn 
hybrid_find_same_body_progression_spans(items: &[HybridNumberedItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let mut idx = 0usize; + while idx < items.len() { + let item = &items[idx]; + if item.field_kind != HybridFieldKind::HeaderCounter || !item.body_is_full { + idx += 1; + continue; + } + + let mut end_idx = idx + 1; + while end_idx < items.len() + && items[end_idx].field_kind == HybridFieldKind::HeaderCounter + && items[end_idx].body_is_full + && items[end_idx].body_key == item.body_key + && items[end_idx].shape == item.shape + && hybrid_header_progresses(&items[end_idx - 1], &items[end_idx]) + { + end_idx += 1; + } + + let run_length = end_idx - idx; + if run_length >= HYBRID_REPEAT_MIN_ITEMS { + let mut start_idx = idx; + if idx > 0 { + let previous = &items[idx - 1]; + if previous.body_is_full + && previous.body_key == item.body_key + && hybrid_header_is_parent(previous, item) + { + start_idx = idx - 1; + } + } + let span_end = hybrid_extend_tail_span_end(items, idx, end_idx, &item.body_key); + spans.push(HybridRepeatSpan { + start: items[start_idx].start, + end: span_end, + kind: "same_body_progression", + item_count: end_idx - start_idx, + cycle_len: None, + }); + idx = end_idx; + continue; + } + + idx += 1; + } + spans +} + +fn hybrid_find_cycle_progression_spans(items: &[HybridNumberedItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let n_items = items.len(); + for cycle_len in 2..=HYBRID_REPEAT_MAX_CYCLE { + let mut idx = 0usize; + while idx + 2 * cycle_len <= n_items { + let run = &items[idx..idx + 2 * cycle_len]; + if run + .iter() + .any(|item| item.field_kind != HybridFieldKind::HeaderCounter || !item.body_is_full) + { + idx += 1; + continue; + } + let first_shape = &run[0].shape; + if run.iter().any(|item| item.shape != *first_shape) { + idx += 1; + continue; + } + if !(1..run.len()).all(|pos| hybrid_header_progresses(&run[pos - 1], &run[pos])) { + idx += 1; + continue; + } + + let template: Vec<&str> = run[..cycle_len] + .iter() + 
.map(|item| item.body_key.as_str()) + .collect(); + let unique_template_count = template + .iter() + .copied() + .collect::>() + .len(); + if unique_template_count < 2 { + idx += 1; + continue; + } + + if (cycle_len..run.len()).any(|pos| run[pos].body_key != template[pos % cycle_len]) { + idx += 1; + continue; + } + + let mut end_idx = idx + 2 * cycle_len; + while end_idx < n_items + && items[end_idx].field_kind == HybridFieldKind::HeaderCounter + && items[end_idx].body_is_full + && items[end_idx].shape == items[idx].shape + && hybrid_header_progresses(&items[end_idx - 1], &items[end_idx]) + && items[end_idx].body_key == template[(end_idx - idx) % cycle_len] + { + end_idx += 1; + } + + let item_count = end_idx - idx; + if item_count >= HYBRID_REPEAT_MIN_CYCLE_ITEMS { + let span_end = hybrid_extend_tail_span_end( + items, + idx, + end_idx, + template[(end_idx - idx) % cycle_len], + ); + spans.push(HybridRepeatSpan { + start: items[idx].start, + end: span_end, + kind: "body_cycle_progression", + item_count, + cycle_len: Some(cycle_len), + }); + idx = end_idx; + continue; + } + idx += 1; + } + } + spans +} + +fn hybrid_find_inline_progression_spans(items: &[HybridInlineItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let mut idx = 0usize; + while idx + HYBRID_INLINE_REPEAT_MIN_ITEMS <= items.len() { + let first = &items[idx]; + let second = &items[idx + 1]; + let expected_step = match hybrid_inline_step(first, second) { + Some(step) => step, + None => { + idx += 1; + continue; + } + }; + + let mut end_idx = idx + 2; + while end_idx < items.len() { + let actual_step = match hybrid_inline_step(&items[end_idx - 1], &items[end_idx]) { + Some(step) => step, + None => break, + }; + if !hybrid_inline_step_matches(expected_step, actual_step) { + break; + } + end_idx += 1; + } + + let item_count = end_idx - idx; + if item_count >= HYBRID_INLINE_REPEAT_MIN_ITEMS { + spans.push(HybridRepeatSpan { + start: items[idx].start, + end: items[end_idx - 1].end, + kind: 
"inline_numeric_progression", + item_count, + cycle_len: None, + }); + idx = end_idx; + continue; + } + idx += 1; + } + spans +} + +pub fn find_hybrid_repeat_spans_internal(analysis_text: &str) -> Vec { + let items = hybrid_extract_numbered_items(analysis_text); + let mut spans = hybrid_find_same_body_progression_spans(&items); + spans.extend(hybrid_find_cycle_progression_spans(&items)); + let inline_items = hybrid_extract_inline_items(analysis_text); + spans.extend(hybrid_find_inline_progression_spans(&inline_items)); + spans.sort_by(|lhs, rhs| { + lhs.start + .cmp(&rhs.start) + .then_with(|| (rhs.end - rhs.start).cmp(&(lhs.end - lhs.start))) + }); + + let mut deduped: Vec = Vec::new(); + for span in spans { + if let Some(previous) = deduped.last() { + if span.start >= previous.start && span.end <= previous.end { + continue; + } + } + deduped.push(span); + } + deduped +} + fn word_repeat_hash_slice(pref: &[u64], pw: &[u64], start: usize, end: usize) -> u64 { pref[end].wrapping_sub(pref[start].wrapping_mul(pw[end - start])) & WORD_REPEAT_HASH_MASK } diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 6d44551..644532e 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1110,6 +1110,20 @@ def _find_hybrid_numbered_repeat_spans( analysis_text = _prepare_hybrid_analysis_text(page_text, blocked_spans=blocked_spans) else: analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + rust_mod = _get_word_repeat_rust_module() + if rust_mod is not None and hasattr(rust_mod, "find_hybrid_repeat_spans"): + return [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "match_types": list(item["match_types"]), + "category": str(item["category"]), + "kind": str(item["kind"]), + "item_count": int(item["item_count"]), + **({"cycle_len": int(item["cycle_len"])} if "cycle_len" in item else {}), + } + for item in rust_mod.find_hybrid_repeat_spans(analysis_text) + ] items = 
_extract_hybrid_numbered_items_from_analysis_text(analysis_text) spans = _find_hybrid_same_body_progression_spans(items) spans.extend(_find_hybrid_cycle_progression_spans(items)) From 880fbe24a56c2cc4b6f38216153d5f49e00459bc Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 21:58:13 +0300 Subject: [PATCH 77/93] move shared OCR text normalization into Rust --- rust/glossapi_rs_noise/src/lib.rs | 29 +++++- rust/glossapi_rs_noise/src/noise_metrics.rs | 99 +++++++++++++++++++++ src/glossapi/corpus/phase_clean.py | 19 ++++ 3 files changed, 146 insertions(+), 1 deletion(-) diff --git a/rust/glossapi_rs_noise/src/lib.rs b/rust/glossapi_rs_noise/src/lib.rs index 6d06ce5..8d990d3 100644 --- a/rust/glossapi_rs_noise/src/lib.rs +++ b/rust/glossapi_rs_noise/src/lib.rs @@ -5,7 +5,8 @@ mod noise_metrics; use noise_metrics::{ annotate_numeric_debug_page_internal, evaluate_page_character_noise_internal, export_numeric_match_debug_pages_internal, export_ocr_match_debug_pages_internal, - find_hybrid_repeat_spans_internal, find_numeric_debug_page_spans_internal, + find_hybrid_repeat_spans_internal, find_labeled_shared_repeat_spans_internal, + find_numeric_debug_page_spans_internal, find_word_repeat_spans_internal, score_markdown_directory_detailed_internal, score_markdown_directory_internal, score_markdown_directory_ocr_profile_internal, @@ -384,6 +385,31 @@ fn find_hybrid_repeat_spans(py: Python<'_>, analysis_text: &str) -> PyResult, + analysis_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> PyResult>> { + let spans = + find_labeled_shared_repeat_spans_internal(analysis_text, rep_threshold, min_period, window); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("period", span.period)?; + item.set_item("repetitions", span.repetitions)?; + item.set_item("tail_chars", span.tail_chars)?; + 
item.set_item("match_type", span.match_type)?; + out.push(item.into()); + } + Ok(out) +} + #[pyfunction] fn evaluate_page_character_noise(py: Python<'_>, page: &str) -> PyResult> { let metrics = evaluate_page_character_noise_internal(page); @@ -411,6 +437,7 @@ fn glossapi_rs_noise(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(find_numeric_debug_page_spans, m)?)?; m.add_function(wrap_pyfunction!(find_word_repeat_spans, m)?)?; m.add_function(wrap_pyfunction!(find_hybrid_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_labeled_shared_repeat_spans, m)?)?; m.add_function(wrap_pyfunction!(evaluate_page_character_noise, m)?)?; Ok(()) } diff --git a/rust/glossapi_rs_noise/src/noise_metrics.rs b/rust/glossapi_rs_noise/src/noise_metrics.rs index 8127810..90245f7 100644 --- a/rust/glossapi_rs_noise/src/noise_metrics.rs +++ b/rust/glossapi_rs_noise/src/noise_metrics.rs @@ -255,6 +255,16 @@ pub struct HybridRepeatSpan { pub cycle_len: Option, } +#[derive(Debug, Clone)] +pub struct LabeledSharedRepeatSpan { + pub start: usize, + pub end: usize, + pub period: usize, + pub repetitions: usize, + pub tail_chars: usize, + pub match_type: &'static str, +} + #[derive(Debug, Clone, Default)] pub struct PageCharacterNoise { pub total_chars: u64, @@ -2246,6 +2256,95 @@ pub fn find_hybrid_repeat_spans_internal(analysis_text: &str) -> Vec (String, Vec) { + let mut normalized = String::with_capacity(text.len()); + let mut raw_char_indices: Vec = Vec::with_capacity(text.len()); + let mut in_tag = false; + + for (raw_idx, ch) in text.chars().enumerate() { + if in_tag { + if ch == '>' { + in_tag = false; + } + continue; + } + if ch == '<' { + in_tag = true; + continue; + } + let mut casefolded = String::new(); + for lower in ch.to_lowercase() { + match lower { + 'ς' => casefolded.push('σ'), + 'ß' => { + casefolded.push('s'); + casefolded.push('s'); + } + 'ſ' => casefolded.push('s'), + _ => casefolded.push(lower), + } + } + for sub in casefolded.nfd() { 
+ if sub.is_alphanumeric() { + let mapped = match sub { + 'ο' => 'o', + 'κ' => 'k', + _ => sub, + }; + normalized.push(mapped); + raw_char_indices.push(raw_idx); + } + } + } + + (normalized, raw_char_indices) +} + +pub fn find_labeled_shared_repeat_spans_internal( + text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> Vec { + let (normalized_text, raw_map) = normalize_alnum_with_map_skip_tags_internal(text); + let normalized_chars: Vec = normalized_text.chars().collect(); + let spans = find_word_repeat_spans_internal(&normalized_text, rep_threshold, min_period, window); + let mut labeled: Vec = Vec::new(); + + for span in spans { + if span.end <= span.start || span.start >= raw_map.len() { + continue; + } + let mut has_letter = false; + let mut has_digit = false; + for ch in &normalized_chars[span.start..span.end] { + if ch.is_alphabetic() { + has_letter = true; + } + if ch.is_ascii_digit() { + has_digit = true; + } + } + let match_type = if has_letter { + "word_repeat" + } else if has_digit { + "numeric_repeat" + } else { + continue; + }; + labeled.push(LabeledSharedRepeatSpan { + start: raw_map[span.start], + end: raw_map[span.end - 1] + 1, + period: span.period, + repetitions: span.repetitions, + tail_chars: span.tail_chars, + match_type, + }); + } + + labeled +} + fn word_repeat_hash_slice(pref: &[u64], pw: &[u64], start: usize, end: usize) -> u64 { pref[end].wrapping_sub(pref[start].wrapping_mul(pw[end - start])) & WORD_REPEAT_HASH_MASK } diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 644532e..117893b 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1749,6 +1749,25 @@ def _find_labeled_shared_repeat_spans( analysis_text = _filter_latex_preserve_layout(analysis_text) analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + rust_mod = 
_get_word_repeat_rust_module() + if rust_mod is not None and hasattr(rust_mod, "find_labeled_shared_repeat_spans"): + return [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "period": int(item["period"]), + "repetitions": int(item["repetitions"]), + "tail_chars": int(item["tail_chars"]), + "match_types": [str(item["match_type"])], + "category": MATCH_CATEGORY_BY_TYPE[str(item["match_type"])], + } + for item in rust_mod.find_labeled_shared_repeat_spans( + analysis_text, + int(rep_threshold), + int(min_period), + int(window), + ) + ] normalized_text, raw_map = _normalize_alnum_with_map_skip_tags(analysis_text) normalized_spans = _find_word_repeat_spans( normalized_text, From 8b052f8f4586e1d4d6d81bb164b8ebf8d13d8401 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 22:15:42 +0300 Subject: [PATCH 78/93] parallelize combined OCR document rendering --- rust/glossapi_rs_noise/src/lib.rs | 26 ++- src/glossapi/corpus/phase_clean.py | 345 ++++++++++++++++++----------- 2 files changed, 231 insertions(+), 140 deletions(-) diff --git a/rust/glossapi_rs_noise/src/lib.rs b/rust/glossapi_rs_noise/src/lib.rs index 8d990d3..e3dc334 100644 --- a/rust/glossapi_rs_noise/src/lib.rs +++ b/rust/glossapi_rs_noise/src/lib.rs @@ -325,12 +325,14 @@ fn find_numeric_debug_page_spans( min_repeat_steps: u64, min_same_digit_steps: u64, ) -> PyResult>> { - let spans = find_numeric_debug_page_spans_internal( - page, - min_progress_steps, - min_repeat_steps, - min_same_digit_steps, - ); + let spans = py.allow_threads(|| { + find_numeric_debug_page_spans_internal( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }); let mut out: Vec> = Vec::with_capacity(spans.len()); for span in spans { let item = PyDict::new(py); @@ -351,7 +353,8 @@ fn find_word_repeat_spans( min_period: usize, window: usize, ) -> PyResult>> { - let spans = find_word_repeat_spans_internal(normalized_text, rep_threshold, min_period, window); + let spans = + 
py.allow_threads(|| find_word_repeat_spans_internal(normalized_text, rep_threshold, min_period, window)); let mut out: Vec> = Vec::with_capacity(spans.len()); for span in spans { let item = PyDict::new(py); @@ -367,7 +370,7 @@ fn find_word_repeat_spans( #[pyfunction] fn find_hybrid_repeat_spans(py: Python<'_>, analysis_text: &str) -> PyResult>> { - let spans = find_hybrid_repeat_spans_internal(analysis_text); + let spans = py.allow_threads(|| find_hybrid_repeat_spans_internal(analysis_text)); let mut out: Vec> = Vec::with_capacity(spans.len()); for span in spans { let item = PyDict::new(py); @@ -394,8 +397,9 @@ fn find_labeled_shared_repeat_spans( min_period: usize, window: usize, ) -> PyResult>> { - let spans = - find_labeled_shared_repeat_spans_internal(analysis_text, rep_threshold, min_period, window); + let spans = py.allow_threads(|| { + find_labeled_shared_repeat_spans_internal(analysis_text, rep_threshold, min_period, window) + }); let mut out: Vec> = Vec::with_capacity(spans.len()); for span in spans { let item = PyDict::new(py); @@ -412,7 +416,7 @@ fn find_labeled_shared_repeat_spans( #[pyfunction] fn evaluate_page_character_noise(py: Python<'_>, page: &str) -> PyResult> { - let metrics = evaluate_page_character_noise_internal(page); + let metrics = py.allow_threads(|| evaluate_page_character_noise_internal(page)); let item = PyDict::new(py); item.set_item("total_chars", metrics.total_chars)?; item.set_item("bad_char_count", metrics.bad_char_count)?; diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 117893b..9f870f5 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -16,6 +16,7 @@ import time import unicodedata from collections import Counter +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -1932,6 +1933,176 @@ def _render_combined_ocr_debug_page( ) +def 
_process_combined_ocr_debug_document( + source_path: Path, + output_path: Path, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> Dict[str, Any]: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + annotated_pages: List[str] = [] + matched_page_count = 0 + table_match_count = 0 + numeric_match_count = 0 + latex_match_count = 0 + hybrid_match_count = 0 + word_match_count = 0 + doc_match_types: Set[str] = set() + page_metric_rows: List[Dict[str, Any]] = [] + total_page_times: List[float] = [] + table_page_times: List[float] = [] + numeric_page_times: List[float] = [] + latex_page_times: List[float] = [] + shared_page_times: List[float] = [] + hybrid_page_times: List[float] = [] + char_eval_times: List[float] = [] + bad_char_ratios: List[float] = [] + + for page_index, page in enumerate(pages, start=1): + page_result = _render_combined_ocr_debug_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + annotated_page = str(page_result["annotated_page"]) + page_types = list(page_result["page_types"]) + page_numeric_count = int(page_result["page_numeric_count"]) + page_word_count = int(page_result["page_word_count"]) + page_latex_count = int(page_result["page_latex_count"]) + page_table_count = int(page_result["page_table_count"]) + page_hybrid_count = int(page_result["page_hybrid_count"]) + page_noise_metrics = dict(page_result["page_noise_metrics"]) + char_eval_elapsed = float(page_result["char_eval_seconds"]) + table_elapsed = float(page_result["table_seconds"]) + numeric_elapsed = float(page_result["numeric_seconds"]) + latex_elapsed = 
float(page_result["latex_seconds"]) + hybrid_elapsed = float(page_result["hybrid_seconds"]) + shared_elapsed = float(page_result["shared_repeat_seconds"]) + page_total_time = float(page_result["total_page_seconds"]) + + char_eval_times.append(char_eval_elapsed) + bad_char_ratios.append(float(page_noise_metrics.get("bad_char_ratio", 0.0))) + table_page_times.append(table_elapsed) + numeric_page_times.append(numeric_elapsed) + latex_page_times.append(latex_elapsed) + hybrid_page_times.append(hybrid_elapsed) + shared_page_times.append(shared_elapsed) + total_page_times.append(page_total_time) + + page_match_total = ( + page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count + ) + if page_match_total: + matched_page_count += 1 + table_match_count += page_table_count + numeric_match_count += page_numeric_count + latex_match_count += page_latex_count + hybrid_match_count += page_hybrid_count + word_match_count += page_word_count + doc_match_types.update(page_types) + annotated_pages.append(annotated_page) + + page_metric_rows.append( + { + "source_path": str(source_path), + "source_stem": source_path.stem, + "page_number": page_index, + "page_index_in_file": page_index, + "total_chars": int(page_noise_metrics.get("total_chars", 0)), + "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)), + "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)), + "control_count": int(page_noise_metrics.get("control_count", 0)), + "private_use_count": int(page_noise_metrics.get("private_use_count", 0)), + "cjk_count": int(page_noise_metrics.get("cjk_count", 0)), + "replacement_count": int(page_noise_metrics.get("replacement_count", 0)), + "table_match_count": page_table_count, + "numeric_match_count": page_numeric_count, + "latex_match_count": page_latex_count, + "hybrid_match_count": page_hybrid_count, + "word_match_count": page_word_count, + "match_types": ",".join(page_types), + "char_eval_seconds": char_eval_elapsed, 
+ "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + ) + + output_path.write_text(PAGE_SPLIT_MARKER.join(annotated_pages), encoding="utf-8") + row = { + "source_path": str(source_path), + "output_path": str(output_path), + "source_stem": source_path.stem, + "base_stem": canonical_stem(source_path.stem), + "page_count": len(pages), + "matched_page_count": matched_page_count, + "table_match_count": table_match_count, + "numeric_match_count": numeric_match_count, + "latex_match_count": latex_match_count, + "hybrid_match_count": hybrid_match_count, + "word_match_count": word_match_count, + "match_types": ",".join(sorted(doc_match_types)), + } + return { + "row": row, + "page_metric_rows": page_metric_rows, + "total_page_times": total_page_times, + "table_page_times": table_page_times, + "numeric_page_times": numeric_page_times, + "latex_page_times": latex_page_times, + "shared_page_times": shared_page_times, + "hybrid_page_times": hybrid_page_times, + "char_eval_times": char_eval_times, + "bad_char_ratios": bad_char_ratios, + } + + +def _process_combined_ocr_clean_document( + source_path: Path, + output_path: Path, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> None: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + cleaned_pages: List[str] = [] + for page in pages: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="clean", + ) + 
cleaned_pages.append(str(page_result["annotated_page"])) + output_path.write_text(PAGE_SPLIT_MARKER.join(cleaned_pages), encoding="utf-8") + + def _summarize_metric(values: List[float]) -> Dict[str, float]: if not values: return {"count": 0, "p50": 0.0, "p95": 0.0, "max": 0.0} @@ -2784,39 +2955,34 @@ def clean_ocr( ), ) n_threads = int(num_threads or os.cpu_count() or 4) + render_workers = max(1, min(4, n_threads)) md_files = sorted(input_dir.glob("*.md")) if write_cleaned_files: if self.cleaned_markdown_dir.exists(): shutil.rmtree(self.cleaned_markdown_dir) self.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) self.logger.info( - "Cleaning OCR markdown with shared combined loop into %s for %d markdown files…", + "Cleaning OCR markdown with shared combined loop into %s for %d markdown files (workers=%d)…", self.cleaned_markdown_dir, len(md_files), + render_workers, ) - for source_path in md_files: - text = source_path.read_text(encoding="utf-8") - pages = text.split(PAGE_SPLIT_MARKER) - cleaned_pages: List[str] = [] - for page in pages: - page_result = _render_combined_ocr_page( - page, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - mode="clean", - ) - cleaned_pages.append(str(page_result["annotated_page"])) - output_path = self.cleaned_markdown_dir / source_path.name - output_path.write_text( - PAGE_SPLIT_MARKER.join(cleaned_pages), - encoding="utf-8", + def _run_clean_doc(source_path: Path) -> None: + _process_combined_ocr_clean_document( + source_path, + self.cleaned_markdown_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + 
word_min_period=int(word_min_period), + word_window=int(word_window), ) + with ThreadPoolExecutor(max_workers=render_workers) as executor: + list(executor.map(_run_clean_doc, md_files)) + self.logger.info( "Scoring OCR markdown files with glossapi_rs_noise OCR profile on %d markdown files…", len(md_files), @@ -3050,6 +3216,7 @@ def clean_ocr_numeric_word_debug_docs( *, max_docs: Optional[int] = 100, doc_offset: int = 0, + doc_workers: Optional[int] = None, min_progress_steps: int = 10, min_repeat_steps: int = 8, min_same_digit_steps: int = 10, @@ -3092,13 +3259,15 @@ def clean_ocr_numeric_word_debug_docs( source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] else: source_paths = all_source_paths[doc_offset:] + render_workers = max(1, int(doc_workers or min(4, os.cpu_count() or 1))) self.logger.info( - "Exporting combined OCR table+numeric+latex+hybrid+word debug docs from %s into %s for %d documents (offset=%d)", + "Exporting combined OCR table+numeric+latex+hybrid+word debug docs from %s into %s for %d documents (offset=%d, workers=%d)", input_dir, output_dir, len(source_paths), doc_offset, + render_workers, ) rows: List[Dict[str, Any]] = [] @@ -3111,113 +3280,31 @@ def clean_ocr_numeric_word_debug_docs( hybrid_page_times: List[float] = [] char_eval_times: List[float] = [] bad_char_ratios: List[float] = [] - for source_path in source_paths: - text = source_path.read_text(encoding="utf-8") - pages = text.split(PAGE_SPLIT_MARKER) - annotated_pages: List[str] = [] - matched_page_count = 0 - table_match_count = 0 - numeric_match_count = 0 - latex_match_count = 0 - hybrid_match_count = 0 - word_match_count = 0 - doc_match_types: Set[str] = set() - - for page_index, page in enumerate(pages, start=1): - page_result = _render_combined_ocr_debug_page( - page, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - 
word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - ) - annotated_page = str(page_result["annotated_page"]) - page_types = list(page_result["page_types"]) - page_numeric_count = int(page_result["page_numeric_count"]) - page_word_count = int(page_result["page_word_count"]) - page_latex_count = int(page_result["page_latex_count"]) - page_table_count = int(page_result["page_table_count"]) - page_hybrid_count = int(page_result["page_hybrid_count"]) - page_noise_metrics = dict(page_result["page_noise_metrics"]) - char_eval_elapsed = float(page_result["char_eval_seconds"]) - table_elapsed = float(page_result["table_seconds"]) - numeric_elapsed = float(page_result["numeric_seconds"]) - latex_elapsed = float(page_result["latex_seconds"]) - hybrid_elapsed = float(page_result["hybrid_seconds"]) - shared_elapsed = float(page_result["shared_repeat_seconds"]) - page_total_time = float(page_result["total_page_seconds"]) - - char_eval_times.append(char_eval_elapsed) - bad_char_ratios.append(float(page_noise_metrics.get("bad_char_ratio", 0.0))) - table_page_times.append(table_elapsed) - numeric_page_times.append(numeric_elapsed) - latex_page_times.append(latex_elapsed) - hybrid_page_times.append(hybrid_elapsed) - shared_page_times.append(shared_elapsed) - total_page_times.append(page_total_time) - - page_match_total = ( - page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count - ) - if page_match_total: - matched_page_count += 1 - table_match_count += page_table_count - numeric_match_count += page_numeric_count - latex_match_count += page_latex_count - hybrid_match_count += page_hybrid_count - word_match_count += page_word_count - doc_match_types.update(page_types) - annotated_pages.append(annotated_page) - - page_metric_rows.append( - { - "source_path": str(source_path), - "source_stem": source_path.stem, - "page_number": page_index, - "page_index_in_file": page_index, - 
"total_chars": int(page_noise_metrics.get("total_chars", 0)), - "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)), - "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)), - "control_count": int(page_noise_metrics.get("control_count", 0)), - "private_use_count": int(page_noise_metrics.get("private_use_count", 0)), - "cjk_count": int(page_noise_metrics.get("cjk_count", 0)), - "replacement_count": int(page_noise_metrics.get("replacement_count", 0)), - "table_match_count": page_table_count, - "numeric_match_count": page_numeric_count, - "latex_match_count": page_latex_count, - "hybrid_match_count": page_hybrid_count, - "word_match_count": page_word_count, - "match_types": ",".join(page_types), - "char_eval_seconds": char_eval_elapsed, - "table_seconds": table_elapsed, - "numeric_seconds": numeric_elapsed, - "latex_seconds": latex_elapsed, - "hybrid_seconds": hybrid_elapsed, - "shared_repeat_seconds": shared_elapsed, - "total_page_seconds": page_total_time, - } - ) + def _run_debug_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_debug_document( + source_path, + output_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) - output_path = output_dir / source_path.name - output_path.write_text(PAGE_SPLIT_MARKER.join(annotated_pages), encoding="utf-8") - row = { - "source_path": str(source_path), - "output_path": str(output_path), - "source_stem": source_path.stem, - "base_stem": canonical_stem(source_path.stem), - "page_count": len(pages), - "matched_page_count": matched_page_count, - "table_match_count": table_match_count, - "numeric_match_count": numeric_match_count, - "latex_match_count": latex_match_count, - "hybrid_match_count": hybrid_match_count, - 
"word_match_count": word_match_count, - "match_types": ",".join(sorted(doc_match_types)), - } - rows.append(row) + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(_run_debug_doc, source_paths): + rows.append(dict(doc_result["row"])) + page_metric_rows.extend(list(doc_result["page_metric_rows"])) + total_page_times.extend(list(doc_result["total_page_times"])) + table_page_times.extend(list(doc_result["table_page_times"])) + numeric_page_times.extend(list(doc_result["numeric_page_times"])) + latex_page_times.extend(list(doc_result["latex_page_times"])) + hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) + shared_page_times.extend(list(doc_result["shared_page_times"])) + char_eval_times.extend(list(doc_result["char_eval_times"])) + bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) with manifest_path.open("w", encoding="utf-8") as handle: for row in rows: From ad55b880cb9041ce6c0c218276f5dd710fa9627c Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 22:37:57 +0300 Subject: [PATCH 79/93] speed up OCR document rendering with process pools --- src/glossapi/corpus/phase_clean.py | 269 ++++++++++++++++++++++++----- 1 file changed, 226 insertions(+), 43 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 9f870f5..d29456f 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -6,6 +6,7 @@ import json import logging import math +import multiprocessing as mp import os import queue import random @@ -15,8 +16,10 @@ import sys import time import unicodedata +import warnings from collections import Counter -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from contextlib import contextmanager from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -163,12 +166,72 @@ 
_WORD_REPEAT_RUST_MOD: Optional[Any] = None _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = False _RUST_EXTENSION_PREBUILD_ATTEMPTED: Set[str] = set() +_COMBINED_OCR_WORKER_NOISE_MOD: Optional[Any] = None +_COMBINED_OCR_WORKER_REQUIRED_ATTRS = ( + "find_numeric_debug_page_spans", + "evaluate_page_character_noise", +) def _blank_non_newlines(text: str) -> str: return "".join("\n" if ch == "\n" else " " for ch in text) +def _init_combined_ocr_worker() -> None: + global _COMBINED_OCR_WORKER_NOISE_MOD + noise_mod = importlib.import_module("glossapi_rs_noise") + missing = [ + attr for attr in _COMBINED_OCR_WORKER_REQUIRED_ATTRS if not hasattr(noise_mod, attr) + ] + if missing: + raise ImportError( + "glossapi_rs_noise missing required attrs for OCR worker: " + + ", ".join(missing) + ) + _COMBINED_OCR_WORKER_NOISE_MOD = noise_mod + + +def _get_combined_ocr_worker_noise_mod() -> Any: + global _COMBINED_OCR_WORKER_NOISE_MOD + if _COMBINED_OCR_WORKER_NOISE_MOD is None: + _init_combined_ocr_worker() + return _COMBINED_OCR_WORKER_NOISE_MOD + + +def _can_use_combined_ocr_process_pool(noise_mod: Any, render_workers: int) -> bool: + return ( + render_workers > 1 + and os.name != "nt" + and getattr(noise_mod, "__name__", "") == "glossapi_rs_noise" + ) + + +def _default_combined_ocr_render_workers( + *, + noise_mod: Any, + requested_workers: Optional[int], + max_workers: int, +) -> int: + if requested_workers is not None: + return max(1, int(requested_workers)) + host_workers = max(1, int(max_workers)) + if _can_use_combined_ocr_process_pool(noise_mod, host_workers): + return host_workers + return min(4, host_workers) + + +@contextmanager +def _combined_ocr_process_pool_warning_ctx() -> Iterable[None]: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r"This process .* is multi-threaded, use of fork\(\) may lead to deadlocks in the child\.", + category=DeprecationWarning, + module=r"multiprocessing\.popen_fork", + ) + yield + + def 
_blank_regex_matches_preserve_layout(text: str, pattern: re.Pattern[str]) -> str: return pattern.sub(lambda match: _blank_non_newlines(match.group(0)), text) @@ -1937,7 +2000,7 @@ def _process_combined_ocr_debug_document( source_path: Path, output_path: Path, *, - noise_mod: Any, + noise_mod: Optional[Any], min_progress_steps: int, min_repeat_steps: int, min_same_digit_steps: int, @@ -1945,6 +2008,8 @@ def _process_combined_ocr_debug_document( word_min_period: int, word_window: int, ) -> Dict[str, Any]: + if noise_mod is None: + noise_mod = _get_combined_ocr_worker_noise_mod() text = source_path.read_text(encoding="utf-8") pages = text.split(PAGE_SPLIT_MARKER) annotated_pages: List[str] = [] @@ -2076,7 +2141,7 @@ def _process_combined_ocr_clean_document( source_path: Path, output_path: Path, *, - noise_mod: Any, + noise_mod: Optional[Any], min_progress_steps: int, min_repeat_steps: int, min_same_digit_steps: int, @@ -2084,6 +2149,8 @@ def _process_combined_ocr_clean_document( word_min_period: int, word_window: int, ) -> None: + if noise_mod is None: + noise_mod = _get_combined_ocr_worker_noise_mod() text = source_path.read_text(encoding="utf-8") pages = text.split(PAGE_SPLIT_MARKER) cleaned_pages: List[str] = [] @@ -2103,6 +2170,58 @@ def _process_combined_ocr_clean_document( output_path.write_text(PAGE_SPLIT_MARKER.join(cleaned_pages), encoding="utf-8") +def _process_combined_ocr_debug_document_job( + job: Tuple[str, str, int, int, int, int, int, int] +) -> Dict[str, Any]: + ( + source_path_str, + output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + return _process_combined_ocr_debug_document( + Path(source_path_str), + Path(output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + 
word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + +def _process_combined_ocr_clean_document_job( + job: Tuple[str, str, int, int, int, int, int, int] +) -> None: + ( + source_path_str, + output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + _process_combined_ocr_clean_document( + Path(source_path_str), + Path(output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + def _summarize_metric(values: List[float]) -> Dict[str, float]: if not values: return {"count": 0, "p50": 0.0, "p95": 0.0, "max": 0.0} @@ -2955,7 +3074,11 @@ def clean_ocr( ), ) n_threads = int(num_threads or os.cpu_count() or 4) - render_workers = max(1, min(4, n_threads)) + render_workers = _default_combined_ocr_render_workers( + noise_mod=noise_mod, + requested_workers=None, + max_workers=n_threads, + ) md_files = sorted(input_dir.glob("*.md")) if write_cleaned_files: if self.cleaned_markdown_dir.exists(): @@ -2967,21 +3090,43 @@ def clean_ocr( len(md_files), render_workers, ) - def _run_clean_doc(source_path: Path) -> None: - _process_combined_ocr_clean_document( - source_path, - self.cleaned_markdown_dir / source_path.name, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - ) + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + jobs = [ + ( + str(source_path), + str(self.cleaned_markdown_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + 
int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + list(executor.map(_process_combined_ocr_clean_document_job, jobs)) + else: + def _run_clean_doc(source_path: Path) -> None: + _process_combined_ocr_clean_document( + source_path, + self.cleaned_markdown_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) - with ThreadPoolExecutor(max_workers=render_workers) as executor: - list(executor.map(_run_clean_doc, md_files)) + with ThreadPoolExecutor(max_workers=render_workers) as executor: + list(executor.map(_run_clean_doc, md_files)) self.logger.info( "Scoring OCR markdown files with glossapi_rs_noise OCR profile on %d markdown files…", @@ -3259,7 +3404,11 @@ def clean_ocr_numeric_word_debug_docs( source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] else: source_paths = all_source_paths[doc_offset:] - render_workers = max(1, int(doc_workers or min(4, os.cpu_count() or 1))) + render_workers = _default_combined_ocr_render_workers( + noise_mod=noise_mod, + requested_workers=doc_workers, + max_workers=int(os.cpu_count() or 1), + ) self.logger.info( "Exporting combined OCR table+numeric+latex+hybrid+word debug docs from %s into %s for %d documents (offset=%d, workers=%d)", @@ -3280,31 +3429,65 @@ def clean_ocr_numeric_word_debug_docs( hybrid_page_times: List[float] = [] char_eval_times: List[float] = [] bad_char_ratios: List[float] = [] - def _run_debug_doc(source_path: Path) -> Dict[str, Any]: - return _process_combined_ocr_debug_document( - 
source_path, - output_dir / source_path.name, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - ) + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + jobs = [ + ( + str(source_path), + str(output_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in source_paths + ] + iterator: Iterable[Dict[str, Any]] + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) + for doc_result in iterator: + rows.append(dict(doc_result["row"])) + page_metric_rows.extend(list(doc_result["page_metric_rows"])) + total_page_times.extend(list(doc_result["total_page_times"])) + table_page_times.extend(list(doc_result["table_page_times"])) + numeric_page_times.extend(list(doc_result["numeric_page_times"])) + latex_page_times.extend(list(doc_result["latex_page_times"])) + hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) + shared_page_times.extend(list(doc_result["shared_page_times"])) + char_eval_times.extend(list(doc_result["char_eval_times"])) + bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) + else: + def _run_debug_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_debug_document( + source_path, + output_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + 
word_min_period=int(word_min_period), + word_window=int(word_window), + ) - with ThreadPoolExecutor(max_workers=render_workers) as executor: - for doc_result in executor.map(_run_debug_doc, source_paths): - rows.append(dict(doc_result["row"])) - page_metric_rows.extend(list(doc_result["page_metric_rows"])) - total_page_times.extend(list(doc_result["total_page_times"])) - table_page_times.extend(list(doc_result["table_page_times"])) - numeric_page_times.extend(list(doc_result["numeric_page_times"])) - latex_page_times.extend(list(doc_result["latex_page_times"])) - hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) - shared_page_times.extend(list(doc_result["shared_page_times"])) - char_eval_times.extend(list(doc_result["char_eval_times"])) - bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(_run_debug_doc, source_paths): + rows.append(dict(doc_result["row"])) + page_metric_rows.extend(list(doc_result["page_metric_rows"])) + total_page_times.extend(list(doc_result["total_page_times"])) + table_page_times.extend(list(doc_result["table_page_times"])) + numeric_page_times.extend(list(doc_result["numeric_page_times"])) + latex_page_times.extend(list(doc_result["latex_page_times"])) + hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) + shared_page_times.extend(list(doc_result["shared_page_times"])) + char_eval_times.extend(list(doc_result["char_eval_times"])) + bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) with manifest_path.open("w", encoding="utf-8") as handle: for row in rows: From f405195ab89236ecb2b7e94c36876f4f523ffa1d Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 22:41:05 +0300 Subject: [PATCH 80/93] reduce OCR process-pool overhead and tune defaults --- src/glossapi/corpus/phase_clean.py | 88 +++++++++--------------------- 1 file changed, 27 insertions(+), 61 deletions(-) diff --git 
a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index d29456f..a03ae38 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -2021,14 +2021,6 @@ def _process_combined_ocr_debug_document( word_match_count = 0 doc_match_types: Set[str] = set() page_metric_rows: List[Dict[str, Any]] = [] - total_page_times: List[float] = [] - table_page_times: List[float] = [] - numeric_page_times: List[float] = [] - latex_page_times: List[float] = [] - shared_page_times: List[float] = [] - hybrid_page_times: List[float] = [] - char_eval_times: List[float] = [] - bad_char_ratios: List[float] = [] for page_index, page in enumerate(pages, start=1): page_result = _render_combined_ocr_debug_page( @@ -2057,15 +2049,6 @@ def _process_combined_ocr_debug_document( shared_elapsed = float(page_result["shared_repeat_seconds"]) page_total_time = float(page_result["total_page_seconds"]) - char_eval_times.append(char_eval_elapsed) - bad_char_ratios.append(float(page_noise_metrics.get("bad_char_ratio", 0.0))) - table_page_times.append(table_elapsed) - numeric_page_times.append(numeric_elapsed) - latex_page_times.append(latex_elapsed) - hybrid_page_times.append(hybrid_elapsed) - shared_page_times.append(shared_elapsed) - total_page_times.append(page_total_time) - page_match_total = ( page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count ) @@ -2126,14 +2109,6 @@ def _process_combined_ocr_debug_document( return { "row": row, "page_metric_rows": page_metric_rows, - "total_page_times": total_page_times, - "table_page_times": table_page_times, - "numeric_page_times": numeric_page_times, - "latex_page_times": latex_page_times, - "shared_page_times": shared_page_times, - "hybrid_page_times": hybrid_page_times, - "char_eval_times": char_eval_times, - "bad_char_ratios": bad_char_ratios, } @@ -3420,7 +3395,6 @@ def clean_ocr_numeric_word_debug_docs( ) rows: List[Dict[str, Any]] = [] - page_metric_rows: 
List[Dict[str, Any]] = [] total_page_times: List[float] = [] table_page_times: List[float] = [] numeric_page_times: List[float] = [] @@ -3429,6 +3403,19 @@ def clean_ocr_numeric_word_debug_docs( hybrid_page_times: List[float] = [] char_eval_times: List[float] = [] bad_char_ratios: List[float] = [] + def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) -> None: + rows.append(dict(doc_result["row"])) + for page_row in doc_result["page_metric_rows"]: + page_metrics_handle.write(json.dumps(page_row, ensure_ascii=False)) + page_metrics_handle.write("\n") + total_page_times.append(float(page_row["total_page_seconds"])) + table_page_times.append(float(page_row["table_seconds"])) + numeric_page_times.append(float(page_row["numeric_seconds"])) + latex_page_times.append(float(page_row["latex_seconds"])) + hybrid_page_times.append(float(page_row["hybrid_seconds"])) + shared_page_times.append(float(page_row["shared_repeat_seconds"])) + char_eval_times.append(float(page_row["char_eval_seconds"])) + bad_char_ratios.append(float(page_row["bad_char_ratio"])) if _can_use_combined_ocr_process_pool(noise_mod, render_workers): jobs = [ ( @@ -3444,24 +3431,16 @@ def clean_ocr_numeric_word_debug_docs( for source_path in source_paths ] iterator: Iterable[Dict[str, Any]] - with _combined_ocr_process_pool_warning_ctx(): - with ProcessPoolExecutor( - max_workers=render_workers, - mp_context=mp.get_context("fork"), - initializer=_init_combined_ocr_worker, - ) as executor: - iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) - for doc_result in iterator: - rows.append(dict(doc_result["row"])) - page_metric_rows.extend(list(doc_result["page_metric_rows"])) - total_page_times.extend(list(doc_result["total_page_times"])) - table_page_times.extend(list(doc_result["table_page_times"])) - numeric_page_times.extend(list(doc_result["numeric_page_times"])) - latex_page_times.extend(list(doc_result["latex_page_times"])) - 
hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) - shared_page_times.extend(list(doc_result["shared_page_times"])) - char_eval_times.extend(list(doc_result["char_eval_times"])) - bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle: + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) + for doc_result in iterator: + _consume_doc_result(doc_result, page_metrics_handle=page_metrics_handle) else: def _run_debug_doc(source_path: Path) -> Dict[str, Any]: return _process_combined_ocr_debug_document( @@ -3476,29 +3455,16 @@ def _run_debug_doc(source_path: Path) -> Dict[str, Any]: word_window=int(word_window), ) - with ThreadPoolExecutor(max_workers=render_workers) as executor: - for doc_result in executor.map(_run_debug_doc, source_paths): - rows.append(dict(doc_result["row"])) - page_metric_rows.extend(list(doc_result["page_metric_rows"])) - total_page_times.extend(list(doc_result["total_page_times"])) - table_page_times.extend(list(doc_result["table_page_times"])) - numeric_page_times.extend(list(doc_result["numeric_page_times"])) - latex_page_times.extend(list(doc_result["latex_page_times"])) - hybrid_page_times.extend(list(doc_result["hybrid_page_times"])) - shared_page_times.extend(list(doc_result["shared_page_times"])) - char_eval_times.extend(list(doc_result["char_eval_times"])) - bad_char_ratios.extend(list(doc_result["bad_char_ratios"])) + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle: + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(_run_debug_doc, source_paths): + _consume_doc_result(doc_result, page_metrics_handle=page_metrics_handle) with manifest_path.open("w", 
encoding="utf-8") as handle: for row in rows: handle.write(json.dumps(row, ensure_ascii=False)) handle.write("\n") - with page_metrics_path.open("w", encoding="utf-8") as handle: - for row in page_metric_rows: - handle.write(json.dumps(row, ensure_ascii=False)) - handle.write("\n") - summary = { "doc_count": len(rows), "matched_doc_count": sum(1 for row in rows if int(row["matched_page_count"]) > 0), From d00af17aa6dc10b35d738387749ca8860b539357 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 23:03:16 +0300 Subject: [PATCH 81/93] speed up OCR table handling and page fast paths --- src/glossapi/corpus/phase_clean.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index a03ae38..7e52d34 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -20,6 +20,7 @@ from collections import Counter from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from contextlib import contextmanager +from functools import lru_cache from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -237,6 +238,9 @@ def _blank_regex_matches_preserve_layout(text: str, pattern: re.Pattern[str]) -> def _filter_tables_preserve_layout(text: str) -> str: + lowered = text.lower() + if " str: def _filter_latex_preserve_layout(text: str) -> str: + if "$" not in text and "\\" not in text: + return text for pattern in ( LATEX_BEGIN_END_RE, LATEX_BLOCK_RE, @@ -264,6 +270,8 @@ def _filter_latex_preserve_layout(text: str) -> str: def _blank_existing_match_regions_preserve_layout(text: str) -> str: + if " List[List[str]]: return rows +@lru_cache(maxsize=2048) +def _extract_html_table_rows_cached(table_text: str) -> Tuple[Tuple[str, ...], ...]: + return tuple(tuple(row) for row in _extract_html_table_rows(table_text)) + + def _flatten_html_table_nonempty_cells(table_text: str) -> 
List[str]: parsed_rows, _ = _audit_parse_table_rows(table_text) grid, _ = _audit_expand_table_rows(parsed_rows) @@ -336,8 +349,13 @@ def _flatten_html_table_nonempty_cells(table_text: str) -> List[str]: return nonempty +@lru_cache(maxsize=2048) +def _flatten_html_table_nonempty_cells_cached(table_text: str) -> Tuple[str, ...]: + return tuple(_flatten_html_table_nonempty_cells(table_text)) + + def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: - nonempty_cells = _flatten_html_table_nonempty_cells(table_text) + nonempty_cells = _flatten_html_table_nonempty_cells_cached(table_text) if len(nonempty_cells) != 1: return None candidate = nonempty_cells[0].strip() @@ -348,7 +366,8 @@ def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: return candidate -def _render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: +@lru_cache(maxsize=2048) +def _render_table_html_for_output_cached(table_text: str, match_kind: Optional[str]) -> str: sentence_shell = _extract_sentence_shell_table_text(table_text) if sentence_shell and match_kind == "sentence_shell_table": return sentence_shell @@ -359,6 +378,10 @@ def _render_table_html_for_output(table_text: str, *, match_kind: Optional[str] return table_text +def _render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: + return _render_table_html_for_output_cached(table_text, match_kind) + + def _replace_html_tables_with_markdown(text: str) -> str: if " List[Dict[str, Any]]: ) continue - rows = _extract_html_table_rows(raw_table) + rows = _extract_html_table_rows_cached(raw_table) if not rows: continue From d5fafeb206fb0c5db648360bb6e673c9240edee6 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 23:04:43 +0300 Subject: [PATCH 82/93] skip irrelevant OCR passes on marker-free pages --- src/glossapi/corpus/phase_clean.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 7e52d34..0ef54f0 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -411,6 +411,8 @@ def _clean_fill_for_removed_span(page_text: str, start: int, end: int) -> str: def _find_table_repeat_spans(page_text: str) -> List[Dict[str, Any]]: + if " List[Dict[str, Any]]: + if not any(ch.isdigit() for ch in page_text): + return [] if analysis_text is None: analysis_text = _prepare_hybrid_analysis_text(page_text, blocked_spans=blocked_spans) else: @@ -1559,6 +1563,13 @@ def _find_latex_repeat_spans( if analysis_text is None: analysis_text = _filter_tables_preserve_layout(page_text) analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + if ( + "$" not in analysis_text + and "\\" not in analysis_text + and "" not in analysis_text + and "" not in analysis_text + ): + return [] analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) labeled_spans: List[Dict[str, Any]] = [] From 7cebe12d891185c6b8e06dcfa6edbbeebe374e26 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 23:10:15 +0300 Subject: [PATCH 83/93] avoid redundant Rust prebuilds on OCR startup --- src/glossapi/corpus/phase_clean.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 0ef54f0..6885cf5 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -179,7 +179,7 @@ def _blank_non_newlines(text: str) -> str: def _init_combined_ocr_worker() -> None: - global _COMBINED_OCR_WORKER_NOISE_MOD + global _COMBINED_OCR_WORKER_NOISE_MOD, _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED noise_mod = importlib.import_module("glossapi_rs_noise") missing = [ attr for attr in _COMBINED_OCR_WORKER_REQUIRED_ATTRS if not hasattr(noise_mod, attr) @@ -190,6 +190,8 @@ def 
_init_combined_ocr_worker() -> None: + ", ".join(missing) ) _COMBINED_OCR_WORKER_NOISE_MOD = noise_mod + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + _WORD_REPEAT_RUST_MOD = noise_mod if hasattr(noise_mod, "find_word_repeat_spans") else None def _get_combined_ocr_worker_noise_mod() -> Any: @@ -199,6 +201,14 @@ def _get_combined_ocr_worker_noise_mod() -> Any: return _COMBINED_OCR_WORKER_NOISE_MOD +def _prime_word_repeat_rust_module(module_name: str, module: Any) -> Any: + global _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED + if module_name == "glossapi_rs_noise": + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + _WORD_REPEAT_RUST_MOD = module if hasattr(module, "find_word_repeat_spans") else None + return module + + def _can_use_combined_ocr_process_pool(noise_mod: Any, render_workers: int) -> bool: return ( render_workers > 1 @@ -2332,14 +2342,12 @@ def _import_module_with_fallback(): raise last_error raise ModuleNotFoundError(module_name) - _build_extension_once() - needs_build = False try: module = _import_module_with_fallback() missing = _missing_attrs(module) if not missing: - return module + return _prime_word_repeat_rust_module(module_name, module) self.logger.warning( "Rust extension %s is missing required attributes %s; attempting in-place build via maturin …", module_name, @@ -2353,6 +2361,16 @@ def _import_module_with_fallback(): ) needs_build = True + if needs_build: + _build_extension_once() + try: + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if not missing: + return _prime_word_repeat_rust_module(module_name, module) + except ModuleNotFoundError: + pass + if not needs_build: raise RuntimeError(f"Unexpected load state for Rust extension {module_name}") @@ -2400,7 +2418,7 @@ def _import_module_with_fallback(): raise RuntimeError( f"Built {module_name} but it is still missing required attributes: {missing}" ) - return module + return _prime_word_repeat_rust_module(module_name, module) except Exception as 
build_err: raise RuntimeError( f"Automatic build of {module_name} failed: {build_err}" From c15f4b38e50165c936533a81a50ff3de6221880e Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Fri, 10 Apr 2026 23:22:01 +0300 Subject: [PATCH 84/93] refactor OCR table policy and document cleaner runtime --- docs/architecture/index.md | 1 + docs/architecture/ocr_cleaning_runtime.md | 118 ++++++++++ docs/index.md | 1 + docs/ocr_repetition_policy.md | 10 +- docs/stages/clean.md | 49 ++++- src/glossapi/corpus/ocr_table.py | 240 ++++++++++++++++++++ src/glossapi/corpus/phase_clean.py | 255 ++++++---------------- 7 files changed, 480 insertions(+), 194 deletions(-) create mode 100644 docs/architecture/ocr_cleaning_runtime.md create mode 100644 src/glossapi/corpus/ocr_table.py diff --git a/docs/architecture/index.md b/docs/architecture/index.md index f6e1c85..7f3d113 100644 --- a/docs/architecture/index.md +++ b/docs/architecture/index.md @@ -172,4 +172,5 @@ The current architecture is effective but has important tradeoffs: These pressure points are documented separately in: - [Artifact Layout and Stage Handoffs](artifact_layout_and_stage_handoffs.md) +- [OCR Cleaning Runtime](ocr_cleaning_runtime.md) - [Resumability, Recovery, and Retention](resumability_recovery_and_retention.md) diff --git a/docs/architecture/ocr_cleaning_runtime.md b/docs/architecture/ocr_cleaning_runtime.md new file mode 100644 index 0000000..6b780b9 --- /dev/null +++ b/docs/architecture/ocr_cleaning_runtime.md @@ -0,0 +1,118 @@ +# OCR Cleaning Runtime + +This document explains how the current OCR cleaner is organized, why the +matcher families are separated, and why the clean/debug behavior is driven by +one shared page analyzer. 
+ +## One Analyzer, Two Render Modes + +The OCR cleaner now works in two modes over the same span plan: + +- `debug` + - preserves the source page surface + - inserts `` tags around the matched regions +- `clean` + - applies the removal/rewrite policy directly + - writes the cleaned page text with no debug tags + +This is deliberate. The project previously had a tendency for the reviewer-facing +debug logic to evolve faster than the real cleaner. Sharing one analyzer avoids +that drift: if the debug page is right, the clean page is operating on the same +decisions. + +## Why The Cleaner Is Not One Generic Matcher + +The cleaner is trying to remove OCR- or VLM-induced garbage, not every repeated +pattern in a page. A single fuzzy matcher over the whole page overgeneralizes +quickly: + +- numbers steal matches that should belong to numeric progression logic +- repeated notation in LaTeX looks like corruption even when it is legitimate +- HTML tables distort text surfaces and cause spurious word matches + +So the runtime uses ownership by surface type and structure instead of one broad +"repetition" rule. + +## Page Ownership Order + +The current analyzer order is: + +1. tables +2. numeric +3. LaTeX +4. hybrid numbered repetition +5. shared text repetition + +Why this order: + +- Tables run first because HTML table shells can dominate a page and confuse + every later pass. +- Numeric runs before generic text because `1, 2, 3, ...` style progressions + are real OCR-collapse signals and should not be absorbed by `word_repeat`. +- LaTeX and hybrid passes run before generic text because they depend on local + structure, not just repeated tokens. +- Shared text repetition runs last on the remaining visible surface only. + +This ordering is the main false-positive control mechanism. + +## Table Cleaning Is Broader Than Repetition + +Table handling is intentionally separated into `src/glossapi/corpus/ocr_table.py` +because it is not just another repetition matcher. 
+ +Current table classes: + +- `sentence_shell_table` + - a table with one prose-like filled cell + - treated as layout noise around content + - dropped in clean mode +- `empty_table_collapse` + - a large sparse shell with almost no real cell content + - dropped in clean mode +- `repeated_rows` + - an actually repetition-oriented table problem + - dropped in clean mode +- unmatched kept tables + - converted from HTML to GitHub-style Markdown + +The important design point is that sentence-shell and empty-shell tables are +structural cleanup decisions, not repetition decisions. + +## LaTeX And Hybrid Generalization Strategy + +LaTeX and hybrid numbered matching both follow the same conservative pattern: + +- prefer local runs +- abstract slot fields +- require mechanical progression or stable low-diversity cycles +- avoid page-wide reuse as evidence on its own + +That is why the cleaner does not treat "same symbol appears many times on a +page" as enough evidence. The goal is to catch degenerate local collapse, not +normal scholarly notation reuse. + +## Why Rust Is Used Selectively + +The hot-path detection work is in Rust because page-scale scanning dominates run +time. Python still owns: + +- orchestration +- filesystem I/O +- debug/clean rendering +- policy composition across matcher families + +This split is intentional: + +- Rust is best for large repeated scans and token-normalization hot loops +- Python is still easier for mode-aware rendering and pipeline integration + +## Performance And Correctness Contract + +Performance work is allowed only if exact debug output stays stable. + +The correctness lock is: + +- `tests/test_ocr_golden_pages.py` + +That suite uses hundreds of real pages and compares exact output bytes. The +speed work therefore optimizes implementation, not semantics. 
diff --git a/docs/index.md b/docs/index.md index cb15dca..13cef9d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. - [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. - [OCR Repetition Policy](ocr_repetition_policy.md) pins the default repetition thresholds for word and LaTeX cleaning. +- [OCR Cleaning Runtime](architecture/ocr_cleaning_runtime.md) explains the shared clean/debug analyzer, ordering, and why the cleaner separates tables, numeric, LaTeX, hybrid, and text ownership. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. ## Configure and debug diff --git a/docs/ocr_repetition_policy.md b/docs/ocr_repetition_policy.md index 6b64ca7..e6c6fde 100644 --- a/docs/ocr_repetition_policy.md +++ b/docs/ocr_repetition_policy.md @@ -12,9 +12,13 @@ This document pins the intended default repetition thresholds for OCR-cleaner de These defaults apply to the combined OCR debug annotator: - `Corpus.clean_ocr_numeric_word_debug_docs(...)` +The same analyzer now also drives real clean-mode rendering in `clean_ocr()`; +debug and clean differ only in rendering, not in span discovery. + In that pipeline: -- numeric detection runs first -- LaTeX detection runs second +- tables are handled first +- numeric detection runs before generic text ownership +- LaTeX and hybrid structural detection run before shared text repetition - shared repeat detection runs last on the remaining untagged text ## Scope @@ -33,3 +37,5 @@ They do not override numeric-specific detectors, which have their own thresholds - A default of `4` is meant to reduce borderline `3`-repeat matches. - Locality matters more than page-wide reuse, especially for LaTeX. 
- Repeated symbols or notation used normally across a page should not be treated as cleaner targets by default. +- Numeric progression should be handled by numeric or hybrid logic before text repetition sees it. +- Table cleanup includes structural cases that are not repetition problems, so table policy is documented separately in `docs/architecture/ocr_cleaning_runtime.md`. diff --git a/docs/stages/clean.md b/docs/stages/clean.md index ae3c735..63bab08 100644 --- a/docs/stages/clean.md +++ b/docs/stages/clean.md @@ -6,7 +6,7 @@ The clean stage normalizes extracted Markdown and evaluates its quality. ## Main responsibilities -- run Rust-backed cleaning +- run the shared OCR analyzer in either clean or debug rendering mode - compute text quality and badness metrics - detect documents that require OCR reruns - update metadata for downstream stage selection @@ -20,6 +20,7 @@ The clean stage normalizes extracted Markdown and evaluates its quality. ## Main outputs - cleaned Markdown in `clean_markdown/` +- debug-marked Markdown when using the debug helpers - quality metrics and reports - metadata updates including OCR-related decisions @@ -32,6 +33,22 @@ It is especially important for Greek corpora because it distinguishes: - technically extracted text - actually usable Greek text +It also separates two different responsibilities that are easy to conflate: + +- structural cleanup + - tables, numeric runs, LaTeX collapse, hybrid numbered loops, word repetition +- quality scoring + - bad-character metrics + - suspicious-line metrics + - OCR rerun recommendations + +The stage now uses one shared analyzer for both: + +- `debug` mode + - shows exact match placement with `` tags +- `clean` mode + - removes or rewrites those exact same matched regions + ## Important operational outputs This stage may contribute or update: @@ -42,6 +59,32 @@ This stage may contribute or update: - character-count-based diagnostics - processing-stage status +## Current cleaning policy + +The 
cleaner does not use one generic fuzzy matcher over the whole page. +Instead it applies ownership in a fixed order: + +1. tables +2. numeric +3. LaTeX +4. hybrid numbered repetition +5. shared word repetition + +Why this matters: + +- tables can distort the visible text surface for every later pass +- numeric progressions are often valid cleaner targets but should not be + consumed by generic text repetition +- LaTeX and hybrid passes rely on more specific local structure +- shared text repetition is therefore safest on the remaining surface only + +Table handling is intentionally broader than repetition: + +- `sentence_shell_table` is dropped +- `empty_table_collapse` is dropped +- `repeated_rows` is dropped +- unmatched tables are converted from HTML to GitHub-style Markdown + ## Failure concerns Typical issues include: @@ -53,3 +96,7 @@ Typical issues include: ## Contributor note Changes here affect OCR routing and post-run quality analysis. Treat score and flag semantics as contract-level behavior. + +For content-cleaning changes, the exact-output benchmark in +`tests/test_ocr_golden_pages.py` is the main regression lock. Speed work is only +acceptable if those outputs remain stable. diff --git a/src/glossapi/corpus/ocr_table.py b/src/glossapi/corpus/ocr_table.py new file mode 100644 index 0000000..63756ed --- /dev/null +++ b/src/glossapi/corpus/ocr_table.py @@ -0,0 +1,240 @@ +"""Table-specific OCR cleaning helpers. + +This module isolates HTML-table handling from the broader OCR repetition logic. + +That separation is intentional: +- some table decisions are repetition-based, like repeated rows +- others are structural cleanups, like sentence-shell tables or near-empty shells + +Keeping table logic together makes the policy easier to understand and keeps the +main OCR page pipeline focused on ordering and span ownership. 
+""" +from __future__ import annotations + +import html +import re +from collections import Counter +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from ..scripts.table_markdown_audit import ( + _expand_rows as _audit_expand_table_rows, + _parse_table_rows as _audit_parse_table_rows, + audit_table as _audit_table_html, +) + +HTML_TABLE_BLOCK_RE = re.compile(r"(?is)") +HTML_TABLE_LINE_RE = re.compile(r"(?i).*?") +HTML_TABLE_CELL_RE = re.compile(r"(?is)(.*?)") +HTML_TAG_RE = re.compile(r"(?is)<[^>]+>") + +TABLE_EMPTY_MIN_ROWS = 6 +TABLE_EMPTY_MIN_CELLS = 18 +TABLE_EMPTY_MAX_NONEMPTY_RATIO = 0.15 +TABLE_REPEAT_MIN_ROWS = 4 +TABLE_REPEAT_MIN_NONEMPTY_CELLS = 2 +TABLE_REPEAT_MIN_ROW_TEXT_CHARS = 6 +TABLE_REPEAT_MIN_DUPLICATE_ROWS = 2 +TABLE_SENTENCE_SHELL_MIN_WORDS = 6 +TABLE_SENTENCE_SHELL_MIN_CHARS = 40 + + +def _normalize_table_cell_text(cell_html: str) -> str: + text = HTML_TAG_RE.sub(" ", cell_html) + text = html.unescape(text) + return " ".join(text.split()) + + +def _table_cell_has_content(cell_text: str) -> bool: + return any(ch.isalnum() for ch in cell_text) + + +def _extract_html_table_rows(table_text: str) -> List[List[str]]: + rows: List[List[str]] = [] + for row_match in HTML_TABLE_ROW_RE.finditer(table_text): + cells = [ + _normalize_table_cell_text(cell_match.group(1)) + for cell_match in HTML_TABLE_CELL_RE.finditer(row_match.group(0)) + ] + if cells: + rows.append(cells) + return rows + + +@lru_cache(maxsize=2048) +def _extract_html_table_rows_cached(table_text: str) -> Tuple[Tuple[str, ...], ...]: + """Cache repeated table shells by exact HTML string. + + The OCR corpus contains many duplicated HTML fragments, so exact-string + memoization pays off without changing behavior. 
+ """ + return tuple(tuple(row) for row in _extract_html_table_rows(table_text)) + + +def _flatten_html_table_nonempty_cells(table_text: str) -> List[str]: + parsed_rows, _ = _audit_parse_table_rows(table_text) + grid, _ = _audit_expand_table_rows(parsed_rows) + if not grid: + return [] + nonempty: List[str] = [] + for row in grid: + for cell in row: + normalized = " ".join(cell.split()) + if any(ch.isalnum() for ch in normalized): + nonempty.append(normalized) + return nonempty + + +@lru_cache(maxsize=2048) +def _flatten_html_table_nonempty_cells_cached(table_text: str) -> Tuple[str, ...]: + return tuple(_flatten_html_table_nonempty_cells(table_text)) + + +def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: + """Return prose text when a table is only a layout shell around one cell. + + This is intentionally not a repetition rule. OCR and VLM extraction often + emit a normal sentence inside a tiny one-cell table shell; when that + happens, the table structure is noise and the prose cell is the content. 
+ """ + nonempty_cells = _flatten_html_table_nonempty_cells_cached(table_text) + if len(nonempty_cells) != 1: + return None + candidate = nonempty_cells[0].strip() + if len(candidate) < TABLE_SENTENCE_SHELL_MIN_CHARS: + return None + if len(re.findall(r"[^\W\d_]+", candidate, re.UNICODE)) < TABLE_SENTENCE_SHELL_MIN_WORDS: + return None + return candidate + + +@lru_cache(maxsize=2048) +def _render_table_html_for_output_cached(table_text: str, match_kind: Optional[str]) -> str: + sentence_shell = _extract_sentence_shell_table_text(table_text) + if sentence_shell and match_kind == "sentence_shell_table": + return sentence_shell + + audit = _audit_table_html(Path("/tmp/table_fragment.md"), 0, 0, table_text) + if audit.markdown: + return audit.markdown + return table_text + + +def render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: + """Render one HTML table for human review/debug output.""" + return _render_table_html_for_output_cached(table_text, match_kind) + + +def replace_html_tables_with_markdown(text: str) -> str: + """Normalize kept HTML tables into GitHub-style Markdown in page text.""" + if " str: + """Render a table in clean mode. + + Clean mode drops tables whose structure is the problem: + - sentence-shell tables + - empty shell tables + - repeated-row tables + """ + if match_kind in {"sentence_shell_table", "empty_table_collapse", "repeated_rows"}: + return "" + return render_table_html_for_output(table_text, match_kind=match_kind) + + +def find_table_repeat_spans(page_text: str, *, match_category: str) -> List[Dict[str, Any]]: + """Classify OCR table problems on a page. 
+ + Table handling is intentionally broader than repetition: + - sentence-shell tables are removed because they are layout shells around prose + - empty table collapse removes sparse structural noise + - repeated rows is the actual repetition-oriented table rule + """ + if "= TABLE_EMPTY_MIN_ROWS + and cell_count >= TABLE_EMPTY_MIN_CELLS + and nonempty_ratio <= TABLE_EMPTY_MAX_NONEMPTY_RATIO + ): + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": match_category, + "kind": "empty_table_collapse", + "row_count": row_count, + "cell_count": cell_count, + "nonempty_ratio": round(nonempty_ratio, 3), + } + ) + continue + + row_keys: List[Tuple[str, ...]] = [] + for row in rows: + nonempty_cells_in_row = [cell for cell in row if _table_cell_has_content(cell)] + if len(nonempty_cells_in_row) < TABLE_REPEAT_MIN_NONEMPTY_CELLS: + continue + row_text = " ".join(nonempty_cells_in_row) + if len(row_text) < TABLE_REPEAT_MIN_ROW_TEXT_CHARS: + continue + row_keys.append(tuple(cell.casefold() for cell in row)) + + if row_count < TABLE_REPEAT_MIN_ROWS or not row_keys: + continue + + row_counts = Counter(row_keys) + duplicate_rows = sum(freq - 1 for freq in row_counts.values() if freq >= 2) + if duplicate_rows >= TABLE_REPEAT_MIN_DUPLICATE_ROWS: + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": match_category, + "kind": "repeated_rows", + "row_count": row_count, + "duplicate_rows": duplicate_rows, + } + ) + + return spans diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 6885cf5..06839cf 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1,7 +1,15 @@ -"""Cleaning and filtering helpers split from Corpus.""" +"""Cleaning and filtering helpers split from Corpus. 
+ +This module now primarily owns OCR orchestration: +- page-level analyzer ordering +- shared clean/debug rendering +- worker/process orchestration + +Specialized policy modules, like HTML-table handling, live alongside it so the +main pipeline can stay focused on span ownership and mode selection. +""" from __future__ import annotations -import html import importlib import json import logging @@ -29,12 +37,15 @@ from .._naming import canonical_stem from ..gloss_downloader import GlossDownloader -from ..scripts.table_markdown_audit import ( - _expand_rows as _audit_expand_table_rows, - _parse_table_rows as _audit_parse_table_rows, - audit_table as _audit_table_html, -) # Avoid importing section/classifier here; cleaning phase does not use them. +from .ocr_table import ( + HTML_TABLE_BLOCK_RE, + HTML_TABLE_LINE_RE, + find_table_repeat_spans as _find_table_repeat_spans_impl, + render_table_html_for_clean as _render_table_html_for_clean, + render_table_html_for_output as _render_table_html_for_output, + replace_html_tables_with_markdown as _replace_html_tables_with_markdown, +) from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path from .corpus_state import _ProcessingStateManager from .corpus_utils import _maybe_import_torch @@ -43,11 +54,6 @@ WORD_REPEAT_HASH_MASK = (1 << 64) - 1 WORD_REPEAT_HASH_BASE = 1469598103934665603 WORD_REPEAT_MERGE_NONWHITESPACE_GAP = 10 -HTML_TABLE_BLOCK_RE = re.compile(r"(?is)") -HTML_TABLE_LINE_RE = re.compile(r"(?i).*?") -HTML_TABLE_CELL_RE = re.compile(r"(?is)(.*?)") -HTML_TAG_RE = re.compile(r"(?is)<[^>]+>") EXISTING_MATCH_BLOCK_RE = re.compile(r"(?is)]*>.*?") LATEX_BLOCK_RE = re.compile(r"(?is)\$\$.*?\$\$") LATEX_BRACKET_RE = re.compile(r"(?is)\\\[.*?\\\]") @@ -142,15 +148,6 @@ r"\tau", r"\omega", ) -TABLE_EMPTY_MIN_ROWS = 6 -TABLE_EMPTY_MIN_CELLS = 18 -TABLE_EMPTY_MAX_NONEMPTY_RATIO = 0.15 -TABLE_REPEAT_MIN_ROWS = 4 -TABLE_REPEAT_MIN_NONEMPTY_CELLS = 2 -TABLE_REPEAT_MIN_ROW_TEXT_CHARS = 6 
-TABLE_REPEAT_MIN_DUPLICATE_ROWS = 2 -TABLE_SENTENCE_SHELL_MIN_WORDS = 6 -TABLE_SENTENCE_SHELL_MIN_CHARS = 40 MATCH_CATEGORY_BY_TYPE = { "ascending_numeric_sequence": "numeric", "repeat_numeric_run": "numeric", @@ -318,95 +315,6 @@ def _extract_latex_segments(text: str) -> List[Dict[str, Any]]: return segments -def _normalize_table_cell_text(cell_html: str) -> str: - text = HTML_TAG_RE.sub(" ", cell_html) - text = html.unescape(text) - return " ".join(text.split()) - - -def _table_cell_has_content(cell_text: str) -> bool: - return any(ch.isalnum() for ch in cell_text) - - -def _extract_html_table_rows(table_text: str) -> List[List[str]]: - rows: List[List[str]] = [] - for row_match in HTML_TABLE_ROW_RE.finditer(table_text): - cells = [ - _normalize_table_cell_text(cell_match.group(1)) - for cell_match in HTML_TABLE_CELL_RE.finditer(row_match.group(0)) - ] - if cells: - rows.append(cells) - return rows - - -@lru_cache(maxsize=2048) -def _extract_html_table_rows_cached(table_text: str) -> Tuple[Tuple[str, ...], ...]: - return tuple(tuple(row) for row in _extract_html_table_rows(table_text)) - - -def _flatten_html_table_nonempty_cells(table_text: str) -> List[str]: - parsed_rows, _ = _audit_parse_table_rows(table_text) - grid, _ = _audit_expand_table_rows(parsed_rows) - if not grid: - return [] - nonempty: List[str] = [] - for row in grid: - for cell in row: - normalized = " ".join(cell.split()) - if any(ch.isalnum() for ch in normalized): - nonempty.append(normalized) - return nonempty - - -@lru_cache(maxsize=2048) -def _flatten_html_table_nonempty_cells_cached(table_text: str) -> Tuple[str, ...]: - return tuple(_flatten_html_table_nonempty_cells(table_text)) - - -def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: - nonempty_cells = _flatten_html_table_nonempty_cells_cached(table_text) - if len(nonempty_cells) != 1: - return None - candidate = nonempty_cells[0].strip() - if len(candidate) < TABLE_SENTENCE_SHELL_MIN_CHARS: - return None - if 
len(re.findall(r"[^\W\d_]+", candidate, re.UNICODE)) < TABLE_SENTENCE_SHELL_MIN_WORDS: - return None - return candidate - - -@lru_cache(maxsize=2048) -def _render_table_html_for_output_cached(table_text: str, match_kind: Optional[str]) -> str: - sentence_shell = _extract_sentence_shell_table_text(table_text) - if sentence_shell and match_kind == "sentence_shell_table": - return sentence_shell - - audit = _audit_table_html(Path("/tmp/table_fragment.md"), 0, 0, table_text) - if audit.markdown: - return audit.markdown - return table_text - - -def _render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: - return _render_table_html_for_output_cached(table_text, match_kind) - - -def _replace_html_tables_with_markdown(text: str) -> str: - if " str: - if match_kind in {"sentence_shell_table", "empty_table_collapse", "repeated_rows"}: - return "" - return _render_table_html_for_output(table_text, match_kind=match_kind) - - def _clean_fill_for_removed_span(page_text: str, start: int, end: int) -> str: removed = page_text[start:end] prev_char = page_text[start - 1] if start > 0 else "" @@ -421,86 +329,11 @@ def _clean_fill_for_removed_span(page_text: str, start: int, end: int) -> str: def _find_table_repeat_spans(page_text: str) -> List[Dict[str, Any]]: - if "= TABLE_EMPTY_MIN_ROWS - and cell_count >= TABLE_EMPTY_MIN_CELLS - and nonempty_ratio <= TABLE_EMPTY_MAX_NONEMPTY_RATIO - ): - spans.append( - { - "start": table_match.start(), - "end": table_match.end(), - "match_types": ["table_repeat"], - "category": MATCH_CATEGORY_BY_TYPE["table_repeat"], - "kind": "empty_table_collapse", - "row_count": row_count, - "cell_count": cell_count, - "nonempty_ratio": round(nonempty_ratio, 3), - } - ) - continue - - row_keys: List[Tuple[str, ...]] = [] - for row in rows: - nonempty_cells_in_row = [cell for cell in row if _table_cell_has_content(cell)] - if len(nonempty_cells_in_row) < TABLE_REPEAT_MIN_NONEMPTY_CELLS: - continue - row_text = " 
".join(nonempty_cells_in_row) - if len(row_text) < TABLE_REPEAT_MIN_ROW_TEXT_CHARS: - continue - row_keys.append(tuple(cell.casefold() for cell in row)) - - if row_count < TABLE_REPEAT_MIN_ROWS or not row_keys: - continue - - row_counts = Counter(row_keys) - duplicate_rows = sum(freq - 1 for freq in row_counts.values() if freq >= 2) - if duplicate_rows >= TABLE_REPEAT_MIN_DUPLICATE_ROWS: - spans.append( - { - "start": table_match.start(), - "end": table_match.end(), - "match_types": ["table_repeat"], - "category": MATCH_CATEGORY_BY_TYPE["table_repeat"], - "kind": "repeated_rows", - "row_count": row_count, - "duplicate_rows": duplicate_rows, - } - ) - - return spans + """Keep phase_clean's old call shape while table policy lives in ocr_table.""" + return _find_table_repeat_spans_impl( + page_text, + match_category=MATCH_CATEGORY_BY_TYPE["table_repeat"], + ) def _normalize_latex_repeat_with_map(text: str) -> Tuple[str, List[int]]: @@ -1743,6 +1576,16 @@ def _render_page_with_labeled_spans( *, mode: str = "debug", ) -> Tuple[str, List[str], int, int, int, int, int]: + """Render one page from a shared span plan. + + `debug` and `clean` intentionally share the exact same merged span plan. + The only difference is how that plan is rendered: + - debug wraps the matched source surface in `` tags + - clean removes or rewrites the matched surface according to policy + + Keeping both modes on one renderer prevents the real cleaner from drifting + away from the reviewed debug output. + """ if mode not in {"debug", "clean"}: raise ValueError(f"Unsupported OCR render mode: {mode}") merged_spans = _merge_labeled_raw_spans(page_text, spans) @@ -1918,6 +1761,20 @@ def _render_combined_ocr_page( word_window: int, mode: str = "debug", ) -> Dict[str, Any]: + """Analyze one OCR page in the shared ownership order. + + The ordering is a policy decision, not an implementation accident: + 1. tables first, because table shells distort every later text pass + 2. 
numeric second, because numeric progressions should not be stolen by + generic word repetition + 3. LaTeX and hybrid structural passes next, because they operate on more + specialized local structure + 4. shared text repetition last, on the remaining visible surface only + + That ownership model keeps the matcher family specific and reduces the + false positives that appear when a single fuzzy text matcher sees + everything at once. + """ page_start = time.perf_counter() char_eval_start = time.perf_counter() @@ -1928,6 +1785,8 @@ def _render_combined_ocr_page( table_spans = _find_table_repeat_spans(page_text) table_elapsed = time.perf_counter() - table_start + # Reuse progressively filtered page views so later passes do not rebuild the + # same blanked surfaces repeatedly. page_without_tables = _filter_tables_preserve_layout(page_text) page_without_tables_existing = _blank_existing_match_regions_preserve_layout(page_without_tables) page_without_tables_latex = _filter_latex_preserve_layout(page_without_tables) @@ -2271,7 +2130,16 @@ def _load_rust_extension( *, required_attrs: Optional[Iterable[str]] = None, ): - """Import a Rust extension, building it with maturin if necessary.""" + """Import a Rust extension, building it with maturin if necessary. + + The load path is intentionally import-first: + - fast path: import an already-built extension and return immediately + - fallback: build in place only if the module is missing or incomplete + + That keeps ordinary OCR runs from paying a `maturin develop` startup tax + in every fresh process while still letting a developer bootstrap a local + checkout without separate setup steps. 
+ """ import importlib required = tuple(required_attrs or ()) @@ -3134,6 +3002,9 @@ def clean_ocr( with _combined_ocr_process_pool_warning_ctx(): with ProcessPoolExecutor( max_workers=render_workers, + # Linux workers inherit the already-imported Rust + # extension cheaply under `fork`, which keeps the + # document-level renderer fast without changing output. mp_context=mp.get_context("fork"), initializer=_init_combined_ocr_worker, ) as executor: @@ -3487,6 +3358,8 @@ def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) with _combined_ocr_process_pool_warning_ctx(): with ProcessPoolExecutor( max_workers=render_workers, + # Match the clean-mode executor policy so debug and + # clean keep the same performance shape and worker init. mp_context=mp.get_context("fork"), initializer=_init_combined_ocr_worker, ) as executor: From bc51980c1bb04f12dcac25a8d04d03c8967a16f4 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 16:54:51 +0300 Subject: [PATCH 85/93] widen same-type OCR span merge gap --- docs/ocr_repetition_policy.md | 1 + src/glossapi/corpus/phase_clean.py | 21 ++++++++++++++++++--- tests/test_corpus_clean_enhancements.py | 23 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/docs/ocr_repetition_policy.md b/docs/ocr_repetition_policy.md index e6c6fde..eccd446 100644 --- a/docs/ocr_repetition_policy.md +++ b/docs/ocr_repetition_policy.md @@ -34,6 +34,7 @@ They do not override numeric-specific detectors, which have their own thresholds ## Design Intent +- Neighboring same-type spans may merge when their separator has `40` non-whitespace characters or less; this keeps fragmented OCR loops from being split into multiple tiny matches. - A default of `4` is meant to reduce borderline `3`-repeat matches. - Locality matters more than page-wide reuse, especially for LaTeX. - Repeated symbols or notation used normally across a page should not be treated as cleaner targets by default. 
diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 06839cf..bf0dd65 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -53,7 +53,10 @@ PAGE_SPLIT_MARKER = "<--- Page Split --->" WORD_REPEAT_HASH_MASK = (1 << 64) - 1 WORD_REPEAT_HASH_BASE = 1469598103934665603 -WORD_REPEAT_MERGE_NONWHITESPACE_GAP = 10 +# Neighboring same-category spans may be merged when the visible separator is +# still small enough to read as one corrupted region rather than two separate +# failures. This is intentionally more permissive than the older 10-char rule. +WORD_REPEAT_MERGE_MAX_NONWHITESPACE_GAP = 40 EXISTING_MATCH_BLOCK_RE = re.compile(r"(?is)]*>.*?") LATEX_BLOCK_RE = re.compile(r"(?is)\$\$.*?\$\$") LATEX_BRACKET_RE = re.compile(r"(?is)\\\[.*?\\\]") @@ -1248,6 +1251,18 @@ def _gap_has_fewer_than_n_nonwhitespace_chars(text: str, start: int, end: int, l return True +def _gap_has_at_most_n_nonwhitespace_chars(text: str, start: int, end: int, limit: int) -> bool: + if start >= end: + return True + count = 0 + for ch in text[start:end]: + if not ch.isspace(): + count += 1 + if count > limit: + return False + return True + + def _latex_segments_are_local(page_text: str, left: Dict[str, Any], right: Dict[str, Any]) -> bool: return _gap_has_fewer_than_n_nonwhitespace_chars( page_text, @@ -1531,11 +1546,11 @@ def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dic not overlaps and previous["category"] == span["category"] and previous["category"] != "table" - and _gap_has_fewer_than_n_nonwhitespace_chars( + and _gap_has_at_most_n_nonwhitespace_chars( text, previous["end"], span["start"], - WORD_REPEAT_MERGE_NONWHITESPACE_GAP, + WORD_REPEAT_MERGE_MAX_NONWHITESPACE_GAP, ) ) if overlaps or close_gap: diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index 95d5df3..929f1be 100644 --- a/tests/test_corpus_clean_enhancements.py +++ 
b/tests/test_corpus_clean_enhancements.py @@ -11,6 +11,7 @@ from glossapi.corpus.phase_clean import ( _find_word_repeat_spans, _find_word_repeat_spans_python, + _merge_labeled_raw_spans, _normalize_alnum_with_map_skip_tags, ) from glossapi.scripts.table_markdown_audit import audit_table, write_clean_markdown_file @@ -165,6 +166,28 @@ def _run_clean_ocr_latex_slot_progression_debug_export( return rows, debug_dir +def test_merge_labeled_raw_spans_merges_same_type_with_gap_of_40() -> None: + text = "A" * 10 + ("x" * 40) + "B" * 10 + spans = [ + {"start": 0, "end": 10, "match_types": ["word_repeat"], "category": "word"}, + {"start": 50, "end": 60, "match_types": ["word_repeat"], "category": "word"}, + ] + merged = _merge_labeled_raw_spans(text, spans) + assert len(merged) == 1 + assert merged[0]["start"] == 0 + assert merged[0]["end"] == 60 + + +def test_merge_labeled_raw_spans_does_not_merge_same_type_with_gap_of_41() -> None: + text = "A" * 10 + ("x" * 41) + "B" * 10 + spans = [ + {"start": 0, "end": 10, "match_types": ["word_repeat"], "category": "word"}, + {"start": 51, "end": 61, "match_types": ["word_repeat"], "category": "word"}, + ] + merged = _merge_labeled_raw_spans(text, spans) + assert len(merged) == 2 + + def test_clean_skips_latex_blocks_for_mojibake(tmp_path: Path) -> None: corpus = _build_corpus(tmp_path) row = _run_clean_and_read_row(corpus, LATEX_MOJIBAKE_MD, stem="latex-case") From 51db020a38aaaf59ab198a42b6072ba1e37b5548 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 17:10:18 +0300 Subject: [PATCH 86/93] add latex short atom block matching --- src/glossapi/corpus/phase_clean.py | 262 +++++++++++++++++++++++- tests/test_corpus_clean_enhancements.py | 122 +++++++++++ 2 files changed, 382 insertions(+), 2 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index bf0dd65..1248f12 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -117,11 +117,30 
@@ r"\tilde", r"\bar", } +LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS = { + r"\alpha", + r"\beta", + r"\gamma", + r"\delta", + r"\epsilon", + r"\varepsilon", + r"\lambda", + r"\mu", + r"\nu", + r"\omega", + r"\Delta", +} +LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS = { + r"\hat", + r"\tilde", + r"\bar", +} LATEX_SEGMENT_LOCAL_NONWHITESPACE_GAP = 12 LATEX_SEGMENT_EXACT_RUN_MIN = 4 LATEX_SEGMENT_SKELETON_RUN_MIN = 4 LATEX_SEGMENT_ALTERNATING_RUN_MIN = 6 LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN = 4 +LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS = 12 LATEX_SHORT_SEGMENT_MAX_NORM = 32 LATEX_LONG_SEGMENT_MIN_NORM = 24 LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP = 3 @@ -373,6 +392,53 @@ def _is_short_latex_repeat_atom(raw_segment: str) -> bool: return set(command_tokens).issubset(LATEX_SHORT_REPEAT_ATOM_COMMANDS) +def _strip_latex_outer_delimiters(raw_segment: str) -> str: + stripped = raw_segment.strip() + wrappers = ( + (r"\(", r"\)"), + (r"\[", r"\]"), + ("$$", "$$"), + ("$", "$"), + ) + for left, right in wrappers: + if stripped.startswith(left) and stripped.endswith(right) and len(stripped) >= len(left) + len(right): + return stripped[len(left) : len(stripped) - len(right)].strip() + return stripped + + +def _latex_short_atom_block_key(raw_segment: str) -> Optional[str]: + body = "".join(ch for ch in _strip_latex_outer_delimiters(raw_segment) if not ch.isspace()) + if not body or len(body) > LATEX_SHORT_SEGMENT_MAX_NORM: + return None + + plain_pattern = ( + r"^(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS)) + + r")(?P\'+)?$" + ) + match = re.fullmatch(plain_pattern, body) + if match: + base = match.group("base") or "" + primes = match.group("primes") or "" + return f"{base}{primes}" + + decorated_pattern = ( + r"^(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS)) + + r")\{(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS)) + + r")\}(?P\'+)?$" + ) + match = 
re.fullmatch(decorated_pattern, body) + if match: + decorator = match.group("decorator") or "" + base = match.group("base") or "" + primes = match.group("primes") or "" + return f"{decorator}{{{base}}}{primes}" + + return None + + def _is_suspicious_internal_latex_repeat(raw_segment: str) -> bool: if not raw_segment: return False @@ -1370,6 +1436,88 @@ def _find_local_latex_segment_block_spans( return labeled_spans +def _find_short_atom_block_repeat_bounds( + atom_keys: List[str], +) -> Optional[Tuple[int, int, int, int]]: + n_items = len(atom_keys) + if n_items < LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS: + return None + + best: Optional[Tuple[int, int, int, int]] = None + for period in range(n_items // 2, 1, -1): + for start in range(0, n_items - (2 * period) + 1): + pattern = atom_keys[start : start + period] + if atom_keys[start + period : start + (2 * period)] != pattern: + continue + if len(set(pattern)) < 2: + continue + + left = start + while left - period >= 0 and atom_keys[left - period : left] == pattern: + left -= period + + right = start + (2 * period) + while right + period <= n_items and atom_keys[right : right + period] == pattern: + right += period + + repeated_items = right - left + repetitions = repeated_items // period + if repeated_items < LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS or repetitions < 2: + continue + + candidate = (left, right, period, repetitions) + if best is None: + best = candidate + continue + + best_span_len = best[1] - best[0] + candidate_span_len = candidate[1] - candidate[0] + if candidate_span_len > best_span_len: + best = candidate + continue + if candidate_span_len == best_span_len and candidate[2] > best[2]: + best = candidate + return best + + +def _find_local_latex_short_atom_block_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for group in _latex_local_groups(page_text, segments): + idx = 0 + while idx < len(group): + if not 
group[idx].get("short_atom_block_key"): + idx += 1 + continue + + end_idx = idx + 1 + while end_idx < len(group) and group[end_idx].get("short_atom_block_key"): + end_idx += 1 + + run = group[idx:end_idx] + atom_keys = [str(item["short_atom_block_key"]) for item in run] + repeated_bounds = _find_short_atom_block_repeat_bounds(atom_keys) + if repeated_bounds is not None: + _, _, period_items, repetitions = repeated_bounds + labeled_spans.append( + { + "start": int(run[0]["start"]), + "end": int(run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + "kind": "short_atom_block_repeat", + "item_count": len(run), + "period_items": int(period_items), + "repetitions": int(repetitions), + } + ) + + idx = end_idx + return labeled_spans + + def _find_local_latex_slot_progression_spans( page_text: str, segments: List[Dict[str, Any]], @@ -1461,10 +1609,13 @@ def _find_latex_repeat_spans( segments = _extract_latex_segments(analysis_text) for segment in segments: - segment["exact_key"] = _normalize_latex_segment_exact(str(segment["text"])) - segment["skeleton_key"] = _normalize_latex_segment_skeleton(str(segment["text"])) + raw_text = str(segment["text"]) + segment["exact_key"] = _normalize_latex_segment_exact(raw_text) + segment["skeleton_key"] = _normalize_latex_segment_skeleton(raw_text) + segment["short_atom_block_key"] = _latex_short_atom_block_key(raw_text) labeled_spans.extend(_find_local_latex_segment_block_spans(page_text, segments)) + labeled_spans.extend(_find_local_latex_short_atom_block_spans(page_text, segments)) for segment in segments: normalized_text, raw_map = _normalize_latex_repeat_with_map(segment["text"]) @@ -3626,6 +3777,113 @@ def clean_ocr_latex_slot_progression_debug( ) return rows + def clean_ocr_latex_debug( + self, + output_dir: Union[str, Path], + input_dir: Union[str, Path] = None, + *, + max_docs: Optional[int] = 1000, + doc_offset: int = 0, + word_rep_threshold: int = 4, + word_min_period: int = 
3, + word_window: int = 96, + ) -> List[Dict[str, Any]]: + """Export only matched pages for all LaTeX repeat classes.""" + if input_dir is None: + input_dir = self.markdown_dir + else: + input_dir = Path(input_dir) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for stale in output_dir.glob("*.md"): + stale.unlink() + manifest_path = output_dir / "manifest.jsonl" + if manifest_path.exists(): + manifest_path.unlink() + summary_path = output_dir / "summary.json" + if summary_path.exists(): + summary_path.unlink() + + all_source_paths = sorted(list(Path(input_dir).glob("*.md")) + list(Path(input_dir).glob("*.txt"))) + doc_offset = max(0, int(doc_offset)) + if max_docs is not None: + source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] + else: + source_paths = all_source_paths[doc_offset:] + + self.logger.info( + "Exporting LaTeX debug pages from %s into %s for %d documents (offset=%d)", + input_dir, + output_dir, + len(source_paths), + doc_offset, + ) + + rows: List[Dict[str, Any]] = [] + page_times: List[float] = [] + + for source_path in source_paths: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + for page_index, page in enumerate(pages, start=1): + page_start = time.perf_counter() + latex_spans = _find_latex_repeat_spans( + page, + blocked_spans=[], + rep_threshold=int(word_rep_threshold), + min_period=int(word_min_period), + window=int(word_window), + ) + page_elapsed = time.perf_counter() - page_start + page_times.append(page_elapsed) + if not latex_spans: + continue + + annotated_page, page_types, _, _, latex_count, _, _ = _annotate_page_with_labeled_spans( + page, + latex_spans, + ) + output_name = f"{source_path.stem}__debug_page_{page_index:05d}.md" + output_path = output_dir / output_name + output_path.write_text(annotated_page, encoding="utf-8") + rows.append( + { + "source_path": str(source_path), + "output_path": str(output_path), + "source_stem": 
source_path.stem, + "base_stem": canonical_stem(source_path.stem), + "page_number": page_index, + "page_index_in_file": page_index, + "latex_match_count": latex_count, + "match_types": ",".join(page_types), + "page_seconds": page_elapsed, + } + ) + + with manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + summary = { + "doc_count": len(source_paths), + "matched_page_count": len(rows), + "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)), + "word_rep_threshold": int(word_rep_threshold), + "word_min_period": int(word_min_period), + "word_window": int(word_window), + "page_seconds": _summarize_metric(page_times), + } + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + self.logger.info( + "Exported %d LaTeX debug pages to %s", + len(rows), + output_dir, + ) + return rows + def filter(self, *args, **kwargs): # type: ignore[override] """Deprecated: use :py:meth:`clean` instead. 
Retained for one release.""" self.logger.warning("Corpus.filter() is deprecated – calling clean() instead") diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index 929f1be..a2e1e93 100644 --- a/tests/test_corpus_clean_enhancements.py +++ b/tests/test_corpus_clean_enhancements.py @@ -166,6 +166,20 @@ def _run_clean_ocr_latex_slot_progression_debug_export( return rows, debug_dir +def _run_clean_ocr_latex_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_latex_debug" + rows = corpus.clean_ocr_latex_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + def test_merge_labeled_raw_spans_merges_same_type_with_gap_of_40() -> None: text = "A" * 10 + ("x" * 40) + "B" * 10 spans = [ @@ -1016,6 +1030,71 @@ def test_clean_ocr_numeric_word_debug_docs_ignores_diagrammatic_short_latex_symb assert " None: + corpus = _build_corpus(tmp_path) + warmup = [r"\( \alpha \)", r"\( \beta \)", r"\( \gamma \)", r"\( \gamma \)"] + block = [ + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \beta \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \gamma \)", + ] + repeated = " ".join(warmup + block + block) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-block", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-block.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + r"\( \alpha \)", + r"\( \beta \)", + r"\( \gamma \)", + r"\( \delta \)", + r"\( \omega \)", + 
r"\( \mu \)", + r"\( \nu \)", + r"\( \lambda \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-inventory-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-short-atom-inventory-ignore.md").read_text(encoding="utf-8") + assert " None: @@ -1318,6 +1397,49 @@ def test_clean_ocr_hybrid_debug_ignores_markup_number_progression(tmp_path: Path assert not any(debug_dir.glob("*.md")) +def test_clean_ocr_latex_debug_exports_short_atom_block_pages( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + warmup = [r"\( \alpha \)", r"\( \beta \)", r"\( \gamma \)", r"\( \gamma \)"] + block = [ + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \beta \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \gamma \)", + ] + markdown_text = ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + + " ".join(warmup + block + block) + + "\n" + "<--- Page Split --->\n" + "Κανονική τρίτη σελίδα.\n" + ) + rows, debug_dir = _run_clean_ocr_latex_debug_export( + corpus, + markdown_text, + stem="ocr-latex-debug-short-atom", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-debug-short-atom__debug_page_00002.md").read_text(encoding="utf-8") + assert " None: From ba80f1e76aa174e0078c7228ce96f369bfd25350 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 17:22:51 +0300 Subject: [PATCH 87/93] extend latex short atom tails --- src/glossapi/corpus/phase_clean.py | 40 ++++++++++++++++++++++++- tests/test_corpus_clean_enhancements.py | 7 ++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py 
b/src/glossapi/corpus/phase_clean.py index 1248f12..ac2548a 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1480,6 +1480,43 @@ def _find_short_atom_block_repeat_bounds( return best +def _extend_latex_short_atom_block_partial_tail( + page_text: str, + run: List[Dict[str, Any]], + repeated_bounds: Tuple[int, int, int, int], +) -> int: + if not run: + return 0 + + left, _, period, _ = repeated_bounds + if period <= 0 or left >= len(run): + return int(run[-1]["end"]) + + expected_idx = left + ((len(run) - left) % period) + if expected_idx >= len(run): + return int(run[-1]["end"]) + + expected_text = str(run[expected_idx]["text"]) + segment_end = int(run[-1]["end"]) + cursor = segment_end + while cursor < len(page_text) and page_text[cursor].isspace(): + cursor += 1 + if cursor >= len(page_text): + return segment_end + + prefix_len = 0 + while ( + cursor + prefix_len < len(page_text) + and prefix_len < len(expected_text) + and page_text[cursor + prefix_len] == expected_text[prefix_len] + ): + prefix_len += 1 + + if prefix_len == 0 or prefix_len >= len(expected_text): + return segment_end + return cursor + prefix_len + + def _find_local_latex_short_atom_block_spans( page_text: str, segments: List[Dict[str, Any]], @@ -1501,10 +1538,11 @@ def _find_local_latex_short_atom_block_spans( repeated_bounds = _find_short_atom_block_repeat_bounds(atom_keys) if repeated_bounds is not None: _, _, period_items, repetitions = repeated_bounds + span_end = _extend_latex_short_atom_block_partial_tail(page_text, run, repeated_bounds) labeled_spans.append( { "start": int(run[0]["start"]), - "end": int(run[-1]["end"]), + "end": int(span_end), "match_types": ["latex_repeat"], "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], "kind": "short_atom_block_repeat", diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index a2e1e93..2d168f1 100644 --- a/tests/test_corpus_clean_enhancements.py +++ 
b/tests/test_corpus_clean_enhancements.py @@ -1050,7 +1050,7 @@ def test_clean_ocr_numeric_word_debug_docs_flags_latex_short_atom_block_repeat( r"\( \alpha \)", r"\( \gamma \)", ] - repeated = " ".join(warmup + block + block) + repeated = " ".join(warmup + block + block) + " \\( \\alpha" rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( corpus, repeated + "\n", @@ -1064,6 +1064,8 @@ def test_clean_ocr_numeric_word_debug_docs_flags_latex_short_atom_block_repeat( assert " \( \alpha" not in content + assert r"\( \alpha" in content def test_clean_ocr_numeric_word_debug_docs_ignores_nonrepeating_short_atom_inventory( @@ -1421,6 +1423,7 @@ def test_clean_ocr_latex_debug_exports_short_atom_block_pages( "Κανονική πρώτη σελίδα.\n" "<--- Page Split --->\n" + " ".join(warmup + block + block) + + " \\( \\alpha" + "\n" "<--- Page Split --->\n" "Κανονική τρίτη σελίδα.\n" @@ -1438,6 +1441,8 @@ def test_clean_ocr_latex_debug_exports_short_atom_block_pages( content = (debug_dir / "ocr-latex-debug-short-atom__debug_page_00002.md").read_text(encoding="utf-8") assert " \( \alpha" not in content + assert r"\( \alpha" in content def test_clean_ocr_latex_slot_progression_debug_flags_derivative_ladder( From 2b78f456de18758b90ece0689f850650c0187aee Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 18:50:50 +0300 Subject: [PATCH 88/93] expand latex structural repeat coverage --- src/glossapi/corpus/phase_clean.py | 285 ++++++++++++++++++++++-- tests/test_corpus_clean_enhancements.py | 72 ++++++ 2 files changed, 334 insertions(+), 23 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index ac2548a..7e22117 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -111,6 +111,12 @@ r"\tilde", r"\bar", } +LATEX_INTERNAL_SMALL_VOCAB_COMMANDS = { + r"\cdots", + r"\ldots", + r"\vdots", + r"\ddots", +} LATEX_SHORT_REPEAT_ATOM_COMMANDS = { r"\Delta", r"\hat", @@ -141,6 +147,9 @@ 
LATEX_SEGMENT_ALTERNATING_RUN_MIN = 6 LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN = 4 LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS = 12 +LATEX_SHORT_ATOM_EXACT_SEGMENT_MIN_TOKENS = 2 +LATEX_SHORT_ATOM_CHAIN_MIN_TOKENS = 6 +LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS = 24 LATEX_SHORT_SEGMENT_MAX_NORM = 32 LATEX_LONG_SEGMENT_MIN_NORM = 24 LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP = 3 @@ -439,17 +448,126 @@ def _latex_short_atom_block_key(raw_segment: str) -> Optional[str]: return None +def _consume_latex_short_atom_script(body: str, pos: int) -> Optional[int]: + while pos < len(body) and body[pos] in "_^": + pos += 1 + if pos >= len(body): + return None + if body[pos] == "{": + end = body.find("}", pos + 1) + if end == -1 or end == pos + 1: + return None + content = body[pos + 1 : end] + if any(ch.isspace() for ch in content) or "{" in content or "}" in content: + return None + pos = end + 1 + continue + if body[pos] == "\\": + match = re.match(r"\\[A-Za-z]+", body[pos:]) + if match is None: + return None + pos += len(match.group(0)) + continue + if body[pos].isalnum(): + pos += 1 + continue + return None + return pos + + +def _latex_short_atom_sequence_tokens( + raw_segment: str, + *, + allow_truncated_tail: bool = False, +) -> Optional[List[str]]: + body = "".join(ch for ch in _strip_latex_outer_delimiters(raw_segment) if not ch.isspace()) + if not body: + return None + + base_commands = sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS, key=len, reverse=True) + decorator_commands = sorted(LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS, key=len, reverse=True) + tokens: List[str] = [] + pos = 0 + while pos < len(body): + token: Optional[str] = None + for decorator in decorator_commands: + prefix = decorator + "{" + if not body.startswith(prefix, pos): + continue + inner_pos = pos + len(prefix) + base = next((candidate for candidate in base_commands if body.startswith(candidate, inner_pos)), None) + if base is None: + continue + end_pos = inner_pos + len(base) + if end_pos >= len(body) or 
body[end_pos] != "}": + continue + token = f"{decorator}{{{base}}}" + pos = end_pos + 1 + break + + if token is None: + base = next((candidate for candidate in base_commands if body.startswith(candidate, pos)), None) + if base is not None: + token = base + pos += len(base) + + if token is None: + remaining = body[pos:] + if allow_truncated_tail and tokens and len(remaining) >= 4 and any(command.startswith(remaining) for command in base_commands): + break + return None + + while pos < len(body) and body[pos] == "'": + token += "'" + pos += 1 + + script_end = _consume_latex_short_atom_script(body, pos) + if script_end is None: + return None + token += body[pos:script_end] + pos = script_end + + while pos < len(body) and body[pos] == "'": + token += "'" + pos += 1 + + tokens.append(token) + + return tokens or None + + +def _is_short_latex_whitelist_segment(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if len(normalized) > LATEX_SHORT_SEGMENT_MAX_NORM: + return False + tokens = _latex_short_atom_sequence_tokens(raw_segment) + return tokens is not None and len(tokens) >= LATEX_SHORT_ATOM_EXACT_SEGMENT_MIN_TOKENS + + +def _is_latex_short_atom_chain_segment(raw_segment: str) -> bool: + tokens = _latex_short_atom_sequence_tokens(raw_segment, allow_truncated_tail=True) + if tokens is None or len(tokens) < LATEX_SHORT_ATOM_CHAIN_MIN_TOKENS: + return False + counts = Counter(tokens) + return max(counts.values(), default=0) >= LATEX_SEGMENT_EXACT_RUN_MIN and len(counts) <= 3 + + def _is_suspicious_internal_latex_repeat(raw_segment: str) -> bool: if not raw_segment: return False if "" in raw_segment or "" in raw_segment: return True + if _is_latex_short_atom_chain_segment(raw_segment): + return True command_tokens = LATEX_COMMAND_RE.findall(raw_segment) if any(wrapper in raw_segment for wrapper in LATEX_TEXT_WRAPPER_MACROS): return len(command_tokens) >= 8 or len(raw_segment) >= 60 counts = Counter(command_tokens) + if 
set(command_tokens).issubset(LATEX_INTERNAL_SMALL_VOCAB_COMMANDS): + if len(command_tokens) >= LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS and len(counts) <= 3: + return True if any(command in LATEX_INTERNAL_REPEAT_COMMANDS for command in counts): return max(counts.values(), default=0) >= LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP @@ -1369,21 +1487,23 @@ def _find_local_latex_segment_block_spans( run_length = end_idx - idx exact_run = group[idx:end_idx] + is_short_repeat_atom = _is_short_latex_repeat_atom(str(group[idx]["text"])) + is_short_whitelist_segment = _is_short_latex_whitelist_segment(str(group[idx]["text"])) if run_length >= LATEX_SEGMENT_EXACT_RUN_MIN and ( len(exact_key) >= LATEX_LONG_SEGMENT_MIN_NORM - or ( - _is_short_latex_repeat_atom(str(group[idx]["text"])) - and _short_atom_run_has_clean_gaps(page_text, exact_run) - ) + or (is_short_repeat_atom and _short_atom_run_has_clean_gaps(page_text, exact_run)) + or (is_short_whitelist_segment and _short_atom_run_has_clean_gaps(page_text, exact_run)) ): - labeled_spans.append( - { - "start": int(exact_run[0]["start"]), - "end": int(exact_run[-1]["end"]), - "match_types": ["latex_repeat"], - "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], - } - ) + span = { + "start": int(exact_run[0]["start"]), + "end": int(exact_run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + if is_short_whitelist_segment and not is_short_repeat_atom: + span["kind"] = "short_atom_segment_repeat" + span["item_count"] = len(exact_run) + labeled_spans.append(span) idx = end_idx idx = 0 @@ -1556,6 +1676,115 @@ def _find_local_latex_short_atom_block_spans( return labeled_spans +def _find_raw_latex_small_vocab_command_spans(page_text: str) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + command_matches = list(LATEX_COMMAND_RE.finditer(page_text)) + run_start: Optional[int] = None + run_end: Optional[int] = None + run_commands: List[str] = [] + previous_end = 0 + 
+ def flush_run() -> None: + if run_start is None or run_end is None or not run_commands: + return + counts = Counter(run_commands) + if ( + len(run_commands) >= LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS + and len(counts) <= 3 + and max(counts.values(), default=0) >= LATEX_SEGMENT_EXACT_RUN_MIN + ): + labeled_spans.append( + { + "start": run_start, + "end": run_end, + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + "kind": "internal_small_vocab_command_run", + "item_count": len(run_commands), + } + ) + + for command_match in command_matches: + command = command_match.group(0) + gap = page_text[previous_end : command_match.start()] + can_extend_run = not any(ch.isalnum() for ch in gap) + if command in LATEX_INTERNAL_SMALL_VOCAB_COMMANDS and (not run_commands or can_extend_run): + if not run_commands: + run_start = command_match.start() + run_end = command_match.end() + run_commands.append(command) + else: + flush_run() + run_start = None + run_end = None + run_commands = [] + if command in LATEX_INTERNAL_SMALL_VOCAB_COMMANDS: + run_start = command_match.start() + run_end = command_match.end() + run_commands = [command] + previous_end = command_match.end() + flush_run() + + return labeled_spans + + +def _find_internal_latex_small_vocab_command_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for segment in segments: + raw_text = str(segment["text"]) + command_matches = list(LATEX_COMMAND_RE.finditer(raw_text)) + run_start: Optional[int] = None + run_end: Optional[int] = None + run_commands: List[str] = [] + previous_end = 0 + + def flush_run() -> None: + if run_start is None or run_end is None or not run_commands: + return + counts = Counter(run_commands) + if ( + len(run_commands) >= LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS + and len(counts) <= 3 + and max(counts.values(), default=0) >= LATEX_SEGMENT_EXACT_RUN_MIN + ): + 
labeled_spans.append( + { + "start": int(segment["start"]) + run_start, + "end": int(segment["start"]) + run_end, + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + "kind": "internal_small_vocab_command_run", + "item_count": len(run_commands), + } + ) + + for command_match in command_matches: + command = command_match.group(0) + gap = raw_text[previous_end : command_match.start()] + can_extend_run = not any(ch.isalnum() for ch in gap) + if command in LATEX_INTERNAL_SMALL_VOCAB_COMMANDS and (not run_commands or can_extend_run): + if not run_commands: + run_start = command_match.start() + run_end = command_match.end() + run_commands.append(command) + else: + flush_run() + run_start = None + run_end = None + run_commands = [] + if command in LATEX_INTERNAL_SMALL_VOCAB_COMMANDS: + run_start = command_match.start() + run_end = command_match.end() + run_commands = [command] + previous_end = command_match.end() + flush_run() + + return labeled_spans + + def _find_local_latex_slot_progression_spans( page_text: str, segments: List[Dict[str, Any]], @@ -1645,6 +1874,8 @@ def _find_latex_repeat_spans( } ) + labeled_spans.extend(_find_raw_latex_small_vocab_command_spans(analysis_text)) + segments = _extract_latex_segments(analysis_text) for segment in segments: raw_text = str(segment["text"]) @@ -1654,6 +1885,7 @@ def _find_latex_repeat_spans( labeled_spans.extend(_find_local_latex_segment_block_spans(page_text, segments)) labeled_spans.extend(_find_local_latex_short_atom_block_spans(page_text, segments)) + labeled_spans.extend(_find_internal_latex_small_vocab_command_spans(page_text, segments)) for segment in segments: normalized_text, raw_map = _normalize_latex_repeat_with_map(segment["text"]) @@ -1671,17 +1903,18 @@ def _find_latex_repeat_spans( raw_span = page_text[start:end] if not _is_suspicious_internal_latex_repeat(raw_span): continue - labeled_spans.append( - { - "start": start, - "end": end, - "period": span["period"], - 
"repetitions": span["repetitions"], - "tail_chars": span["tail_chars"], - "match_types": ["latex_repeat"], - "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], - } - ) + labeled_span = { + "start": start, + "end": end, + "period": span["period"], + "repetitions": span["repetitions"], + "tail_chars": span["tail_chars"], + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + if _is_latex_short_atom_chain_segment(raw_span): + labeled_span["kind"] = "short_atom_chain_segment" + labeled_spans.append(labeled_span) return labeled_spans @@ -1750,6 +1983,12 @@ def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dic previous["match_types"] = sorted( set(previous.get("match_types", [])) | set(span.get("match_types", [])) ) + if ( + previous.get("kind") is None + and span.get("kind") is not None + and previous.get("match_types", []) == span.get("match_types", []) + ): + previous["kind"] = span.get("kind") if "period" in span: previous["period"] = min(previous.get("period", span["period"]), span["period"]) if "repetitions" in span: diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index 2d168f1..d69e7de 100644 --- a/tests/test_corpus_clean_enhancements.py +++ b/tests/test_corpus_clean_enhancements.py @@ -1097,6 +1097,78 @@ def test_clean_ocr_numeric_word_debug_docs_ignores_nonrepeating_short_atom_inven assert " None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-segment-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + 
assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-segment-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = r"\( \Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta \)" + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-chain-segment", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-chain-segment.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + command_run = (r"\cdots" * 18) + (r"\vdots") + (r"\cdots" * 18) + (r"\ddots") + (r"\cdots" * 18) + repeated = rf"\[ \begin{{aligned}}{command_run}\end{{aligned}} \]" + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-internal-small-vocab-command-run", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-internal-small-vocab-command-run.md").read_text(encoding="utf-8") + assert " None: From f208ae1515fa17fb6563bbb8bc757c7e70d53ef6 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 19:13:00 +0300 Subject: [PATCH 89/93] add OCR clean+debug match indexing --- docs/stages/clean.md | 10 +- src/glossapi/corpus/phase_clean.py | 872 +++++++++++++++++++----- tests/test_corpus_clean_enhancements.py | 64 ++ 3 files changed, 763 insertions(+), 183 deletions(-) diff --git a/docs/stages/clean.md b/docs/stages/clean.md index 63bab08..0528f05 100644 --- a/docs/stages/clean.md +++ b/docs/stages/clean.md @@ -20,7 +20,12 @@ The clean stage normalizes extracted Markdown and evaluates its quality. 
## Main outputs - cleaned Markdown in `clean_markdown/` -- debug-marked Markdown when using the debug helpers +- debug-marked Markdown under `debug/` when debug output is requested +- debug manifests under `debug/`: + - `manifest.jsonl` + - `page_metrics.jsonl` + - `match_index.jsonl` + - `summary.json` - quality metrics and reports - metadata updates including OCR-related decisions @@ -46,8 +51,11 @@ The stage now uses one shared analyzer for both: - `debug` mode - shows exact match placement with `` tags + - records merged-span match metadata in `match_index.jsonl` - `clean` mode - removes or rewrites those exact same matched regions +- `clean + debug` + - writes pipeline-ready cleaned Markdown and the parallel debug artifacts from the same span plan in one run ## Important operational outputs diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 7e22117..9a60ca8 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -2013,36 +2013,48 @@ def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dic return merged -def _render_page_with_labeled_spans( - page_text: str, - spans: List[Dict[str, Any]], - *, - mode: str = "debug", -) -> Tuple[str, List[str], int, int, int, int, int]: - """Render one page from a shared span plan. - - `debug` and `clean` intentionally share the exact same merged span plan. - The only difference is how that plan is rendered: - - debug wraps the matched source surface in `` tags - - clean removes or rewrites the matched surface according to policy - - Keeping both modes on one renderer prevents the real cleaner from drifting - away from the reviewed debug output. 
- """ - if mode not in {"debug", "clean"}: - raise ValueError(f"Unsupported OCR render mode: {mode}") - merged_spans = _merge_labeled_raw_spans(page_text, spans) - if not merged_spans: - return _replace_html_tables_with_markdown(page_text), [], 0, 0, 0, 0, 0 - - parts: List[str] = [] - pos = 0 +def _summarize_merged_labeled_spans( + merged_spans: List[Dict[str, Any]], +) -> Tuple[List[str], int, int, int, int, int]: seen_types: Set[str] = set() numeric_count = 0 word_count = 0 latex_count = 0 table_count = 0 hybrid_count = 0 + for span in merged_spans: + seen_types.update(span.get("match_types", [])) + if span["category"] == "numeric": + numeric_count += 1 + elif span["category"] == "word": + word_count += 1 + elif span["category"] == "latex": + latex_count += 1 + elif span["category"] == "table": + table_count += 1 + elif span["category"] == "hybrid": + hybrid_count += 1 + return ( + sorted(seen_types), + numeric_count, + word_count, + latex_count, + table_count, + hybrid_count, + ) + + +def _render_page_from_merged_labeled_spans( + page_text: str, + merged_spans: List[Dict[str, Any]], + *, + mode: str, +) -> str: + if not merged_spans: + return _replace_html_tables_with_markdown(page_text) + + parts: List[str] = [] + pos = 0 for span in merged_spans: start = span["start"] end = span["end"] @@ -2101,20 +2113,71 @@ def _render_page_with_labeled_spans( else: parts.append(_clean_fill_for_removed_span(page_text, start, end)) pos = end - seen_types.update(match_types) - if span["category"] == "numeric": - numeric_count += 1 - elif span["category"] == "word": - word_count += 1 - elif span["category"] == "latex": - latex_count += 1 - elif span["category"] == "table": - table_count += 1 - elif span["category"] == "hybrid": - hybrid_count += 1 if pos < len(page_text): parts.append(_replace_html_tables_with_markdown(page_text[pos:])) - return "".join(parts), sorted(seen_types), numeric_count, word_count, latex_count, table_count, hybrid_count + return "".join(parts) + + 
+def _render_page_with_labeled_spans_result( + page_text: str, + spans: List[Dict[str, Any]], + *, + mode: str = "debug", +) -> Dict[str, Any]: + if mode not in {"debug", "clean"}: + raise ValueError(f"Unsupported OCR render mode: {mode}") + merged_spans = _merge_labeled_raw_spans(page_text, spans) + ( + page_types, + numeric_count, + word_count, + latex_count, + table_count, + hybrid_count, + ) = _summarize_merged_labeled_spans(merged_spans) + rendered_page = _render_page_from_merged_labeled_spans( + page_text, + merged_spans, + mode=mode, + ) + return { + "rendered_page": rendered_page, + "merged_spans": merged_spans, + "page_types": page_types, + "page_numeric_count": numeric_count, + "page_word_count": word_count, + "page_latex_count": latex_count, + "page_table_count": table_count, + "page_hybrid_count": hybrid_count, + } + + +def _render_page_with_labeled_spans( + page_text: str, + spans: List[Dict[str, Any]], + *, + mode: str = "debug", +) -> Tuple[str, List[str], int, int, int, int, int]: + """Render one page from a shared span plan. + + `debug` and `clean` intentionally share the exact same merged span plan. + The only difference is how that plan is rendered: + - debug wraps the matched source surface in `` tags + - clean removes or rewrites the matched surface according to policy + + Keeping both modes on one renderer prevents the real cleaner from drifting + away from the reviewed debug output. 
+ """ + result = _render_page_with_labeled_spans_result(page_text, spans, mode=mode) + return ( + str(result["rendered_page"]), + list(result["page_types"]), + int(result["page_numeric_count"]), + int(result["page_word_count"]), + int(result["page_latex_count"]), + int(result["page_table_count"]), + int(result["page_hybrid_count"]), + ) def _annotate_page_with_labeled_spans( @@ -2129,6 +2192,77 @@ def _count_hybrid_matches_in_page(page_text: str, spans: List[Dict[str, Any]]) - return sum(1 for span in merged_spans if span.get("category") == "hybrid") +def _utf8_prefix_byte_offsets(text: str) -> List[int]: + offsets = [0] + total = 0 + for char in text: + total += len(char.encode("utf-8")) + offsets.append(total) + return offsets + + +def _span_repeat_count(span: Dict[str, Any]) -> Optional[int]: + if span.get("repetitions") is not None: + return int(span["repetitions"]) + if span.get("item_count") is not None: + return int(span["item_count"]) + if span.get("duplicate_rows") is not None: + return int(span["duplicate_rows"]) + return None + + +def _build_match_index_rows( + page_text: str, + merged_spans: List[Dict[str, Any]], + *, + source_path: Path, + page_number: int, + debug_output_path: Optional[Path] = None, +) -> List[Dict[str, Any]]: + if not merged_spans: + return [] + byte_offsets = _utf8_prefix_byte_offsets(page_text) + rows: List[Dict[str, Any]] = [] + for match_index, span in enumerate(merged_spans, start=1): + start = int(span["start"]) + end = int(span["end"]) + match_text = page_text[start:end] + rows.append( + { + "match_id": f"{source_path.stem}:page:{page_number}:match:{match_index}", + "source_path": str(source_path), + "source_stem": source_path.stem, + "debug_output_path": None if debug_output_path is None else str(debug_output_path), + "page_number": int(page_number), + "page_index_in_file": int(page_number), + "match_index_in_page": int(match_index), + "start_char": start, + "end_char": end, + "start_byte": int(byte_offsets[start]), + 
"end_byte": int(byte_offsets[end]), + "match_length_chars": int(end - start), + "match_length_bytes": int(byte_offsets[end] - byte_offsets[start]), + "match_types": list(span.get("match_types", [])), + "match_type": ",".join(span.get("match_types", [])), + "category": str(span.get("category", "")), + "kind": span.get("kind"), + "repeat_count": _span_repeat_count(span), + "period": span.get("period"), + "repetitions": span.get("repetitions"), + "tail_chars": span.get("tail_chars"), + "item_count": span.get("item_count"), + "cycle_len": span.get("cycle_len"), + "row_count": span.get("row_count"), + "duplicate_rows": span.get("duplicate_rows"), + "nonempty_ratio": span.get("nonempty_ratio"), + "word_count": span.get("word_count"), + "char_count": span.get("char_count"), + "matched_text": match_text, + } + ) + return rows + + def _find_labeled_shared_repeat_spans( page_text: str, *, @@ -2192,7 +2326,7 @@ def _find_labeled_shared_repeat_spans( return labeled_spans -def _render_combined_ocr_page( +def _analyze_combined_ocr_page( page_text: str, *, noise_mod: Any, @@ -2202,22 +2336,7 @@ def _render_combined_ocr_page( word_rep_threshold: int, word_min_period: int, word_window: int, - mode: str = "debug", ) -> Dict[str, Any]: - """Analyze one OCR page in the shared ownership order. - - The ordering is a policy decision, not an implementation accident: - 1. tables first, because table shells distort every later text pass - 2. numeric second, because numeric progressions should not be stolen by - generic word repetition - 3. LaTeX and hybrid structural passes next, because they operate on more - specialized local structure - 4. shared text repetition last, on the remaining visible surface only - - That ownership model keeps the matcher family specific and reduces the - false positives that appear when a single fuzzy text matcher sees - everything at once. 
- """ page_start = time.perf_counter() char_eval_start = time.perf_counter() @@ -2284,37 +2403,119 @@ def _render_combined_ocr_page( ) shared_elapsed = time.perf_counter() - shared_start + page_total_time = time.perf_counter() - page_start + return { + "spans": table_spans + numeric_spans + latex_spans + hybrid_spans + shared_spans, + "page_noise_metrics": page_noise_metrics, + "char_eval_seconds": char_eval_elapsed, + "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + + +def _render_combined_ocr_page( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + mode: str = "debug", +) -> Dict[str, Any]: + """Analyze one OCR page in the shared ownership order. + + The ordering is a policy decision, not an implementation accident: + 1. tables first, because table shells distort every later text pass + 2. numeric second, because numeric progressions should not be stolen by + generic word repetition + 3. LaTeX and hybrid structural passes next, because they operate on more + specialized local structure + 4. shared text repetition last, on the remaining visible surface only + + That ownership model keeps the matcher family specific and reduces the + false positives that appear when a single fuzzy text matcher sees + everything at once. 
+ """ + analysis = _analyze_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + ) + render_result = _render_page_with_labeled_spans_result( + page_text, + list(analysis["spans"]), + mode=mode, + ) + return { + "annotated_page": render_result["rendered_page"], + "merged_spans": render_result["merged_spans"], + "page_types": render_result["page_types"], + "page_numeric_count": render_result["page_numeric_count"], + "page_word_count": render_result["page_word_count"], + "page_latex_count": render_result["page_latex_count"], + "page_table_count": render_result["page_table_count"], + "page_hybrid_count": render_result["page_hybrid_count"], + **analysis, + } + + +def _render_combined_ocr_page_modes( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + modes: Iterable[str], +) -> Dict[str, Any]: + analysis = _analyze_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + ) + merged_spans = _merge_labeled_raw_spans(page_text, list(analysis["spans"])) ( - annotated_page, page_types, page_numeric_count, page_word_count, page_latex_count, page_table_count, page_hybrid_count, - ) = _render_page_with_labeled_spans( - page_text, - table_spans + numeric_spans + latex_spans + hybrid_spans + shared_spans, - mode=mode, - ) - - page_total_time = time.perf_counter() - page_start + ) = _summarize_merged_labeled_spans(merged_spans) + rendered_pages = { + str(mode): 
_render_page_from_merged_labeled_spans(page_text, merged_spans, mode=str(mode)) + for mode in modes + } return { - "annotated_page": annotated_page, + "rendered_pages": rendered_pages, + "merged_spans": merged_spans, "page_types": page_types, "page_numeric_count": page_numeric_count, "page_word_count": page_word_count, "page_latex_count": page_latex_count, "page_table_count": page_table_count, "page_hybrid_count": page_hybrid_count, - "page_noise_metrics": page_noise_metrics, - "char_eval_seconds": char_eval_elapsed, - "table_seconds": table_elapsed, - "numeric_seconds": numeric_elapsed, - "latex_seconds": latex_elapsed, - "hybrid_seconds": hybrid_elapsed, - "shared_repeat_seconds": shared_elapsed, - "total_page_seconds": page_total_time, + **analysis, } @@ -2342,10 +2543,11 @@ def _render_combined_ocr_debug_page( ) -def _process_combined_ocr_debug_document( +def _process_combined_ocr_document( source_path: Path, - output_path: Path, *, + clean_output_path: Optional[Path], + debug_output_path: Optional[Path], noise_mod: Optional[Any], min_progress_steps: int, min_repeat_steps: int, @@ -2353,12 +2555,15 @@ def _process_combined_ocr_debug_document( word_rep_threshold: int, word_min_period: int, word_window: int, + include_page_metrics: bool, + include_match_index: bool, ) -> Dict[str, Any]: if noise_mod is None: noise_mod = _get_combined_ocr_worker_noise_mod() text = source_path.read_text(encoding="utf-8") pages = text.split(PAGE_SPLIT_MARKER) - annotated_pages: List[str] = [] + cleaned_pages: List[str] = [] + debug_pages: List[str] = [] matched_page_count = 0 table_match_count = 0 numeric_match_count = 0 @@ -2367,19 +2572,53 @@ def _process_combined_ocr_debug_document( word_match_count = 0 doc_match_types: Set[str] = set() page_metric_rows: List[Dict[str, Any]] = [] + match_index_rows: List[Dict[str, Any]] = [] for page_index, page in enumerate(pages, start=1): - page_result = _render_combined_ocr_debug_page( - page, - noise_mod=noise_mod, - 
min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - ) - annotated_page = str(page_result["annotated_page"]) + if clean_output_path is not None and debug_output_path is not None: + page_result = _render_combined_ocr_page_modes( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + modes=("clean", "debug"), + ) + cleaned_page = str(page_result["rendered_pages"]["clean"]) + debug_page = str(page_result["rendered_pages"]["debug"]) + elif debug_output_path is not None: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="debug", + ) + cleaned_page = "" + debug_page = str(page_result["annotated_page"]) + else: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="clean", + ) + cleaned_page = str(page_result["annotated_page"]) + debug_page = "" + + merged_spans = list(page_result.get("merged_spans", [])) page_types = list(page_result["page_types"]) page_numeric_count = int(page_result["page_numeric_count"]) page_word_count = int(page_result["page_word_count"]) @@ -2395,6 +2634,11 @@ 
def _process_combined_ocr_debug_document( shared_elapsed = float(page_result["shared_repeat_seconds"]) page_total_time = float(page_result["total_page_seconds"]) + if clean_output_path is not None: + cleaned_pages.append(cleaned_page) + if debug_output_path is not None: + debug_pages.append(debug_page) + page_match_total = ( page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count ) @@ -2406,41 +2650,59 @@ def _process_combined_ocr_debug_document( hybrid_match_count += page_hybrid_count word_match_count += page_word_count doc_match_types.update(page_types) - annotated_pages.append(annotated_page) - page_metric_rows.append( - { - "source_path": str(source_path), - "source_stem": source_path.stem, - "page_number": page_index, - "page_index_in_file": page_index, - "total_chars": int(page_noise_metrics.get("total_chars", 0)), - "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)), - "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)), - "control_count": int(page_noise_metrics.get("control_count", 0)), - "private_use_count": int(page_noise_metrics.get("private_use_count", 0)), - "cjk_count": int(page_noise_metrics.get("cjk_count", 0)), - "replacement_count": int(page_noise_metrics.get("replacement_count", 0)), - "table_match_count": page_table_count, - "numeric_match_count": page_numeric_count, - "latex_match_count": page_latex_count, - "hybrid_match_count": page_hybrid_count, - "word_match_count": page_word_count, - "match_types": ",".join(page_types), - "char_eval_seconds": char_eval_elapsed, - "table_seconds": table_elapsed, - "numeric_seconds": numeric_elapsed, - "latex_seconds": latex_elapsed, - "hybrid_seconds": hybrid_elapsed, - "shared_repeat_seconds": shared_elapsed, - "total_page_seconds": page_total_time, - } - ) + if include_page_metrics: + page_metric_rows.append( + { + "source_path": str(source_path), + "source_stem": source_path.stem, + "page_number": page_index, + 
"page_index_in_file": page_index, + "total_chars": int(page_noise_metrics.get("total_chars", 0)), + "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)), + "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)), + "control_count": int(page_noise_metrics.get("control_count", 0)), + "private_use_count": int(page_noise_metrics.get("private_use_count", 0)), + "cjk_count": int(page_noise_metrics.get("cjk_count", 0)), + "replacement_count": int(page_noise_metrics.get("replacement_count", 0)), + "table_match_count": page_table_count, + "numeric_match_count": page_numeric_count, + "latex_match_count": page_latex_count, + "hybrid_match_count": page_hybrid_count, + "word_match_count": page_word_count, + "match_types": ",".join(page_types), + "char_eval_seconds": char_eval_elapsed, + "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + ) + + if include_match_index: + match_index_rows.extend( + _build_match_index_rows( + page, + merged_spans, + source_path=source_path, + page_number=page_index, + debug_output_path=debug_output_path, + ) + ) - output_path.write_text(PAGE_SPLIT_MARKER.join(annotated_pages), encoding="utf-8") + if clean_output_path is not None: + clean_output_path.write_text(PAGE_SPLIT_MARKER.join(cleaned_pages), encoding="utf-8") + if debug_output_path is not None: + debug_output_path.write_text(PAGE_SPLIT_MARKER.join(debug_pages), encoding="utf-8") + + output_path = debug_output_path or clean_output_path row = { "source_path": str(source_path), - "output_path": str(output_path), + "output_path": None if output_path is None else str(output_path), + "clean_output_path": None if clean_output_path is None else str(clean_output_path), + "debug_output_path": None if debug_output_path is None else str(debug_output_path), "source_stem": source_path.stem, 
"base_stem": canonical_stem(source_path.stem), "page_count": len(pages), @@ -2450,14 +2712,44 @@ def _process_combined_ocr_debug_document( "latex_match_count": latex_match_count, "hybrid_match_count": hybrid_match_count, "word_match_count": word_match_count, + "match_count": int(len(match_index_rows)), "match_types": ",".join(sorted(doc_match_types)), } return { "row": row, "page_metric_rows": page_metric_rows, + "match_index_rows": match_index_rows, } +def _process_combined_ocr_debug_document( + source_path: Path, + output_path: Path, + *, + noise_mod: Optional[Any], + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> Dict[str, Any]: + return _process_combined_ocr_document( + source_path, + clean_output_path=None, + debug_output_path=output_path, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + include_page_metrics=True, + include_match_index=True, + ) + + def _process_combined_ocr_clean_document( source_path: Path, output_path: Path, @@ -2470,25 +2762,20 @@ def _process_combined_ocr_clean_document( word_min_period: int, word_window: int, ) -> None: - if noise_mod is None: - noise_mod = _get_combined_ocr_worker_noise_mod() - text = source_path.read_text(encoding="utf-8") - pages = text.split(PAGE_SPLIT_MARKER) - cleaned_pages: List[str] = [] - for page in pages: - page_result = _render_combined_ocr_page( - page, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - mode="clean", - ) - 
cleaned_pages.append(str(page_result["annotated_page"])) - output_path.write_text(PAGE_SPLIT_MARKER.join(cleaned_pages), encoding="utf-8") + _process_combined_ocr_document( + source_path, + clean_output_path=output_path, + debug_output_path=None, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + include_page_metrics=False, + include_match_index=False, + ) def _process_combined_ocr_debug_document_job( @@ -2543,6 +2830,36 @@ def _process_combined_ocr_clean_document_job( ) +def _process_combined_ocr_dual_document_job( + job: Tuple[str, str, str, int, int, int, int, int, int] +) -> Dict[str, Any]: + ( + source_path_str, + clean_output_path_str, + debug_output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + return _process_combined_ocr_document( + Path(source_path_str), + clean_output_path=Path(clean_output_path_str), + debug_output_path=Path(debug_output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + include_page_metrics=True, + include_match_index=True, + ) + + def _summarize_metric(values: List[float]) -> Dict[str, float]: if not values: return {"count": 0, "p50": 0.0, "p95": 0.0, "max": 0.0} @@ -3375,6 +3692,8 @@ def clean_ocr( *, min_repeat_run: int = 6, write_cleaned_files: bool = True, + write_debug_files: bool = False, + debug_output_dir: Union[str, Path, None] = None, min_progress_steps: int = 10, min_repeat_steps: int = 8, min_same_digit_steps: int = 10, @@ -3386,10 +3705,11 @@ def clean_ocr( The OCR profile keeps the existing 
canonical script metrics columns (`percentage_greek`, `latin_percentage`, `polytonic_ratio`) and adds - OCR-specific noise diagnostics. When ``write_cleaned_files`` is enabled, - the same combined page analyzer used by the debugger is applied in - ``mode="clean"`` and the cleaned markdown is written to - ``self.cleaned_markdown_dir``. + OCR-specific noise diagnostics. The same combined page analyzer drives + both clean and debug outputs: + - clean mode writes pipeline-ready markdown to ``self.cleaned_markdown_dir`` + - debug mode writes annotated markdown and a structured match index under + ``debug_output_dir`` (default: ``self.output_dir / "debug"``) """ from glossapi.parquet_schema import ParquetSchema @@ -3418,56 +3738,224 @@ def clean_ocr( max_workers=n_threads, ) md_files = sorted(input_dir.glob("*.md")) + debug_dir: Optional[Path] = None + debug_manifest_path: Optional[Path] = None + debug_page_metrics_path: Optional[Path] = None + debug_match_index_path: Optional[Path] = None + debug_summary_path: Optional[Path] = None + if write_debug_files: + debug_dir = Path(debug_output_dir) if debug_output_dir is not None else (self.output_dir / "debug") + if debug_dir.exists(): + shutil.rmtree(debug_dir) + debug_dir.mkdir(parents=True, exist_ok=True) + debug_manifest_path = debug_dir / "manifest.jsonl" + debug_page_metrics_path = debug_dir / "page_metrics.jsonl" + debug_match_index_path = debug_dir / "match_index.jsonl" + debug_summary_path = debug_dir / "summary.json" + if write_cleaned_files: if self.cleaned_markdown_dir.exists(): shutil.rmtree(self.cleaned_markdown_dir) self.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) + + if write_cleaned_files or write_debug_files: + mode_label = "clean+debug" if write_cleaned_files and write_debug_files else ("debug" if write_debug_files else "clean") self.logger.info( - "Cleaning OCR markdown with shared combined loop into %s for %d markdown files (workers=%d)…", - self.cleaned_markdown_dir, + "Running shared OCR %s 
loop over %d markdown files (workers=%d)…", + mode_label, len(md_files), render_workers, ) - if _can_use_combined_ocr_process_pool(noise_mod, render_workers): - jobs = [ - ( - str(source_path), - str(self.cleaned_markdown_dir / source_path.name), - int(min_progress_steps), - int(min_repeat_steps), - int(min_same_digit_steps), - int(word_rep_threshold), - int(word_min_period), - int(word_window), - ) - for source_path in md_files - ] - with _combined_ocr_process_pool_warning_ctx(): - with ProcessPoolExecutor( - max_workers=render_workers, - # Linux workers inherit the already-imported Rust - # extension cheaply under `fork`, which keeps the - # document-level renderer fast without changing output. - mp_context=mp.get_context("fork"), - initializer=_init_combined_ocr_worker, - ) as executor: - list(executor.map(_process_combined_ocr_clean_document_job, jobs)) + + if write_debug_files: + rows: List[Dict[str, Any]] = [] + total_page_times: List[float] = [] + table_page_times: List[float] = [] + numeric_page_times: List[float] = [] + latex_page_times: List[float] = [] + shared_page_times: List[float] = [] + hybrid_page_times: List[float] = [] + char_eval_times: List[float] = [] + bad_char_ratios: List[float] = [] + + def _consume_debug_doc_result( + doc_result: Dict[str, Any], + *, + page_metrics_handle: Any, + match_index_handle: Any, + ) -> None: + rows.append(dict(doc_result["row"])) + for page_row in doc_result["page_metric_rows"]: + page_metrics_handle.write(json.dumps(page_row, ensure_ascii=False)) + page_metrics_handle.write("\n") + total_page_times.append(float(page_row["total_page_seconds"])) + table_page_times.append(float(page_row["table_seconds"])) + numeric_page_times.append(float(page_row["numeric_seconds"])) + latex_page_times.append(float(page_row["latex_seconds"])) + hybrid_page_times.append(float(page_row["hybrid_seconds"])) + shared_page_times.append(float(page_row["shared_repeat_seconds"])) + 
char_eval_times.append(float(page_row["char_eval_seconds"])) + bad_char_ratios.append(float(page_row["bad_char_ratio"])) + for match_row in doc_result["match_index_rows"]: + match_index_handle.write(json.dumps(match_row, ensure_ascii=False)) + match_index_handle.write("\n") + + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + if write_cleaned_files: + jobs = [ + ( + str(source_path), + str(self.cleaned_markdown_dir / source_path.name), + str(debug_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + else: + jobs = [ + ( + str(source_path), + str(debug_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + with debug_page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, debug_match_index_path.open("w", encoding="utf-8") as match_index_handle: + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + if write_cleaned_files: + iterator = executor.map(_process_combined_ocr_dual_document_job, jobs) + else: + iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) + for doc_result in iterator: + _consume_debug_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + else: + if write_cleaned_files: + def _run_dual_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_document( + source_path, + clean_output_path=self.cleaned_markdown_dir / source_path.name, + debug_output_path=debug_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + 
min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + include_page_metrics=True, + include_match_index=True, + ) + run_doc = _run_dual_doc + else: + def _run_debug_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_debug_document( + source_path, + debug_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + run_doc = _run_debug_doc + + with debug_page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, debug_match_index_path.open("w", encoding="utf-8") as match_index_handle: + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(run_doc, md_files): + _consume_debug_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + + with debug_manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + debug_summary = { + "doc_count": len(rows), + "matched_doc_count": sum(1 for row in rows if int(row["matched_page_count"]) > 0), + "matched_page_count": int(sum(int(row["matched_page_count"]) for row in rows)), + "match_count": int(sum(int(row.get("match_count", 0)) for row in rows)), + "table_match_count": int(sum(int(row["table_match_count"]) for row in rows)), + "numeric_match_count": int(sum(int(row["numeric_match_count"]) for row in rows)), + "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)), + "hybrid_match_count": int(sum(int(row["hybrid_match_count"]) for row in rows)), + "word_match_count": 
int(sum(int(row["word_match_count"]) for row in rows)), + "word_rep_threshold": int(word_rep_threshold), + "word_min_period": int(word_min_period), + "word_window": int(word_window), + "total_page_seconds": _summarize_metric(total_page_times), + "table_seconds": _summarize_metric(table_page_times), + "numeric_seconds": _summarize_metric(numeric_page_times), + "latex_seconds": _summarize_metric(latex_page_times), + "hybrid_seconds": _summarize_metric(hybrid_page_times), + "shared_repeat_seconds": _summarize_metric(shared_page_times), + "char_eval_seconds": _summarize_metric(char_eval_times), + "bad_char_ratio": _summarize_metric(bad_char_ratios), + } + debug_summary_path.write_text(json.dumps(debug_summary, ensure_ascii=False, indent=2), encoding="utf-8") else: - def _run_clean_doc(source_path: Path) -> None: - _process_combined_ocr_clean_document( - source_path, - self.cleaned_markdown_dir / source_path.name, - noise_mod=noise_mod, - min_progress_steps=int(min_progress_steps), - min_repeat_steps=int(min_repeat_steps), - min_same_digit_steps=int(min_same_digit_steps), - word_rep_threshold=int(word_rep_threshold), - word_min_period=int(word_min_period), - word_window=int(word_window), - ) + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + jobs = [ + ( + str(source_path), + str(self.cleaned_markdown_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + list(executor.map(_process_combined_ocr_clean_document_job, jobs)) + else: + def _run_clean_doc(source_path: Path) -> None: + _process_combined_ocr_clean_document( + source_path, + self.cleaned_markdown_dir / source_path.name, + 
noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + with ThreadPoolExecutor(max_workers=render_workers) as executor: + list(executor.map(_run_clean_doc, md_files)) - with ThreadPoolExecutor(max_workers=render_workers) as executor: - list(executor.map(_run_clean_doc, md_files)) self.logger.info( "Scoring OCR markdown files with glossapi_rs_noise OCR profile on %d markdown files…", @@ -3729,6 +4217,9 @@ def clean_ocr_numeric_word_debug_docs( page_metrics_path = output_dir / "page_metrics.jsonl" if page_metrics_path.exists(): page_metrics_path.unlink() + match_index_path = output_dir / "match_index.jsonl" + if match_index_path.exists(): + match_index_path.unlink() summary_path = output_dir / "summary.json" if summary_path.exists(): summary_path.unlink() @@ -3769,7 +4260,12 @@ def clean_ocr_numeric_word_debug_docs( hybrid_page_times: List[float] = [] char_eval_times: List[float] = [] bad_char_ratios: List[float] = [] - def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) -> None: + def _consume_doc_result( + doc_result: Dict[str, Any], + *, + page_metrics_handle: Any, + match_index_handle: Any, + ) -> None: rows.append(dict(doc_result["row"])) for page_row in doc_result["page_metric_rows"]: page_metrics_handle.write(json.dumps(page_row, ensure_ascii=False)) @@ -3782,6 +4278,9 @@ def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) shared_page_times.append(float(page_row["shared_repeat_seconds"])) char_eval_times.append(float(page_row["char_eval_seconds"])) bad_char_ratios.append(float(page_row["bad_char_ratio"])) + for match_row in doc_result["match_index_rows"]: + match_index_handle.write(json.dumps(match_row, ensure_ascii=False)) + match_index_handle.write("\n") if 
_can_use_combined_ocr_process_pool(noise_mod, render_workers): jobs = [ ( @@ -3797,7 +4296,7 @@ def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) for source_path in source_paths ] iterator: Iterable[Dict[str, Any]] - with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle: + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, match_index_path.open("w", encoding="utf-8") as match_index_handle: with _combined_ocr_process_pool_warning_ctx(): with ProcessPoolExecutor( max_workers=render_workers, @@ -3808,7 +4307,11 @@ def _consume_doc_result(doc_result: Dict[str, Any], *, page_metrics_handle: Any) ) as executor: iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) for doc_result in iterator: - _consume_doc_result(doc_result, page_metrics_handle=page_metrics_handle) + _consume_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) else: def _run_debug_doc(source_path: Path) -> Dict[str, Any]: return _process_combined_ocr_debug_document( @@ -3823,10 +4326,14 @@ def _run_debug_doc(source_path: Path) -> Dict[str, Any]: word_window=int(word_window), ) - with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle: + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, match_index_path.open("w", encoding="utf-8") as match_index_handle: with ThreadPoolExecutor(max_workers=render_workers) as executor: for doc_result in executor.map(_run_debug_doc, source_paths): - _consume_doc_result(doc_result, page_metrics_handle=page_metrics_handle) + _consume_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) with manifest_path.open("w", encoding="utf-8") as handle: for row in rows: @@ -3837,6 +4344,7 @@ def _run_debug_doc(source_path: Path) -> Dict[str, Any]: "doc_count": len(rows), "matched_doc_count": sum(1 for row in rows if 
int(row["matched_page_count"]) > 0), "matched_page_count": int(sum(int(row["matched_page_count"]) for row in rows)), + "match_count": int(sum(int(row.get("match_count", 0)) for row in rows)), "table_match_count": int(sum(int(row["table_match_count"]) for row in rows)), "numeric_match_count": int(sum(int(row["numeric_match_count"]) for row in rows)), "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)), diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index d69e7de..8ef8926 100644 --- a/tests/test_corpus_clean_enhancements.py +++ b/tests/test_corpus_clean_enhancements.py @@ -301,6 +301,63 @@ def test_clean_ocr_supports_score_only_mode(tmp_path: Path) -> None: assert corpus.markdown_dir == corpus.output_dir / "markdown" +def test_clean_ocr_supports_combined_clean_and_debug_outputs(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + stem = "ocr-clean-debug" + source_text = ( + "Πρόλογος\n" + "
Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας
\n" + "<--- Page Split --->\n" + "των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + ) + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(source_text, encoding="utf-8") + + corpus.clean_ocr(write_cleaned_files=True, write_debug_files=True) + + cleaned_path = corpus.cleaned_markdown_dir / f"{stem}.md" + debug_dir = corpus.output_dir / "debug" + debug_path = debug_dir / f"{stem}.md" + assert cleaned_path.exists() + assert debug_path.exists() + + cleaned_text = cleaned_path.read_text(encoding="utf-8") + debug_text = debug_path.read_text(encoding="utf-8") + assert "= 2 + + match_rows = [ + json.loads(line) + for line in (debug_dir / "match_index.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert len(match_rows) >= 2 + source_pages = source_text.split("<--- Page Split --->") + for row in match_rows: + page_text = source_pages[int(row["page_number"]) - 1] + assert page_text[int(row["start_char"]):int(row["end_char"])] == row["matched_text"] + word_row = next(row for row in match_rows if row["match_type"] == "word_repeat") + assert int(word_row["repeat_count"]) >= 3 + assert int(word_row["period"]) > 0 + + page_metrics_rows = (debug_dir / "page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert len(page_metrics_rows) == 2 + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["doc_count"] == 1 + assert summary["match_count"] >= 2 + + def test_clean_ocr_ignores_numeric_lists_and_dotted_values(tmp_path: Path) -> None: corpus = _build_corpus(tmp_path) row = _run_clean_ocr_and_read_row( @@ -645,11 +702,18 @@ def test_clean_ocr_numeric_word_debug_docs_runs_numeric_then_word(tmp_path: Path summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) assert summary["doc_count"] == 1 + assert summary["match_count"] >= 2 assert summary["numeric_match_count"] >= 1 assert summary["word_match_count"] >= 1 page_metrics = (debug_dir / 
"page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() assert len(page_metrics) == 2 + match_index = [ + json.loads(line) + for line in (debug_dir / "match_index.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert any(row["match_type"] == "same_digit_numeric_run" for row in match_index) + assert any(row["match_type"] == "word_repeat" for row in match_index) def test_rust_word_repeat_spans_match_python_reference(tmp_path: Path) -> None: From fae39636ca23902fa05b2f73f235f577674523b6 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Sat, 11 Apr 2026 19:24:38 +0300 Subject: [PATCH 90/93] sanitize OCR spans against page bounds --- src/glossapi/corpus/phase_clean.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index 9a60ca8..f3b5d43 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1955,7 +1955,21 @@ def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dic if not spans: return [] - spans = sorted(spans, key=lambda item: (item["start"], item["end"])) + text_len = len(text) + sanitized_spans: List[Dict[str, Any]] = [] + for span in spans: + start = max(0, int(span["start"])) + end = min(text_len, int(span["end"])) + if start >= text_len or end <= start: + continue + sanitized = dict(span) + sanitized["start"] = start + sanitized["end"] = end + sanitized_spans.append(sanitized) + if not sanitized_spans: + return [] + + spans = sorted(sanitized_spans, key=lambda item: (item["start"], item["end"])) merged: List[Dict[str, Any]] = [] for span in spans: if not merged: From 5d7a0bab851452b27a8ed68071d534f517ef39ef Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Tue, 14 Apr 2026 18:04:43 +0300 Subject: [PATCH 91/93] ocr: refresh cleaner metrics after reruns --- src/glossapi/corpus/ocr/artifacts.py | 5 ++ src/glossapi/corpus/phase_ocr_math.py | 29 +++++++ 
tests/test_ocr_dispatch_backends.py | 112 ++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) diff --git a/src/glossapi/corpus/ocr/artifacts.py b/src/glossapi/corpus/ocr/artifacts.py index 11b7dbb..3e91906 100644 --- a/src/glossapi/corpus/ocr/artifacts.py +++ b/src/glossapi/corpus/ocr/artifacts.py @@ -131,6 +131,11 @@ def persist_ocr_success( def refresh_cleaner_after_ocr(context: CorpusOcrContext) -> None: """Refresh cleaner metrics after OCR reruns rewrite markdown outputs.""" + refresh = getattr(context, "_refresh_metrics_after_ocr_rerun", None) + if callable(refresh): + refresh() + return + context.logger.info("Re-running Rust cleaner after OCR rerun to refresh metrics") context.clean( input_dir=context.markdown_dir, diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 6881e4d..0d86861 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -147,6 +147,35 @@ def _normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> class OcrMathPhaseMixin: + def _refresh_metrics_after_ocr_rerun(self) -> None: + """Refresh OCR-owned and export-owned metrics after OCR remediation. + + `clean_ocr()` and `clean()` remain separate stages on purpose: + + - `clean_ocr()` owns OCR artifact removal and OCR-specific metrics. + - `clean()` owns the broader export-facing clean metrics. + + After OCR reruns we intentionally execute both stages in sequence on the + OCR-cleaned text surface instead of treating one stage as a synonym for + the other. 
+ """ + + self.logger.info( + "Re-running OCR cleaner after OCR rerun to refresh cleaned text and OCR metrics" + ) + self.clean_ocr( + input_dir=self.markdown_dir, + drop_bad=False, + ) + self.logger.info( + "Re-running Rust cleaner in score-only mode on OCR-cleaned markdown to refresh export metrics" + ) + self.clean( + input_dir=self.cleaned_markdown_dir, + drop_bad=False, + write_cleaned_files=False, + ) + def ocr( self, *, diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 3fb05eb..08eb326 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -92,3 +92,115 @@ def test_invalid_backend_is_rejected(tmp_path): corpus = _mk_corpus(tmp_path) with pytest.raises(ValueError, match="backend must be 'deepseek'"): corpus.ocr(backend="bogus", fix_bad=True, math_enhance=False) + + +def test_deepseek_backend_forwards_parallelism_controls(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + use_gpus="multi", + devices=[1, 3], + workers_per_gpu=2, + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=True, + render_dpi=120, + max_pages=7, + max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, + ) + + assert calls["files"] == [fname] + 
assert calls["kwargs"]["use_gpus"] == "multi" + assert calls["kwargs"]["devices"] == [1, 3] + assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["ocr_profile"] == "plain_ocr" + assert calls["kwargs"]["attn_backend"] == "sdpa" + assert calls["kwargs"]["base_size"] == 640 + assert calls["kwargs"]["image_size"] == 448 + assert calls["kwargs"]["crop_mode"] is True + assert calls["kwargs"]["render_dpi"] == 120 + assert calls["kwargs"]["max_pages"] == 7 + assert calls["kwargs"]["max_new_tokens"] == 2048 + assert calls["kwargs"]["repetition_penalty"] == 1.08 + assert calls["kwargs"]["no_repeat_ngram_size"] == 12 + + +def test_deepseek_rerun_refreshes_with_clean_ocr_then_score_only_clean(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for current in files: + stem = Path(current).stem + (markdown_dir / f"{stem}.md").write_text("ocr text\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + "{\"page_count\": 1}\n", + encoding="utf-8", + ) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + calls = [] + original_markdown_dir = corpus.markdown_dir + + def fake_clean_ocr(*args, **kwargs): + calls.append( + ("clean_ocr", kwargs.get("input_dir"), kwargs.get("write_cleaned_files", True)) + ) + 
corpus.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) + corpus.markdown_dir = corpus.cleaned_markdown_dir + + def fake_clean(*args, **kwargs): + calls.append(("clean", kwargs.get("input_dir"), kwargs.get("write_cleaned_files", True))) + + monkeypatch.setattr(corpus, "clean_ocr", fake_clean_ocr) + monkeypatch.setattr(corpus, "clean", fake_clean) + + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + assert calls[0][0] == "clean_ocr" + assert Path(str(calls[0][1])) == original_markdown_dir + assert calls[0][2] is True + assert calls[1][0] == "clean" + assert Path(str(calls[1][1])) == corpus.cleaned_markdown_dir + assert calls[1][2] is False From f6e711b50510130330e1326a26a1e807700252fc Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Tue, 14 Apr 2026 18:18:21 +0300 Subject: [PATCH 92/93] test: cover OCR clean and export handoff --- tests/test_corpus_ocr_modules.py | 109 +++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/tests/test_corpus_ocr_modules.py b/tests/test_corpus_ocr_modules.py index 95561b9..4d5fedd 100644 --- a/tests/test_corpus_ocr_modules.py +++ b/tests/test_corpus_ocr_modules.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pandas as pd @@ -100,3 +101,111 @@ def test_apply_ocr_success_updates_maps_canonical_artifacts_by_stem(tmp_path): assert updated.loc["needs.pdf", "ocr_markdown_relpath"] == "markdown/needs.md" assert updated.loc["needs__p0001-0002.pdf", "ocr_metrics_relpath"] == "json/metrics/needs.metrics.json" assert updated.loc["needs.pdf", "extraction_mode"] == "deepseek" + + +def test_ocr_pipeline_exports_cleaned_and_raw_text_side_by_side(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + { + "filename": "doc.pdf", + corpus.url_column: "https://example.com/doc.pdf", + "needs_ocr": True, + "ocr_success": False, + } + ] + ).to_parquet(dl_dir / 
"download_results.parquet", index=False) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + raw_markdown = ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 7.\n" + "0 0 0 0 0 0\n" + "1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n" + ) + + from glossapi.ocr.deepseek import runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for current in files: + stem = Path(current).stem + (markdown_dir / f"{stem}.md").write_text(raw_markdown, encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + json.dumps( + { + "page_count": 2, + "pages": [ + {"page_no": 1, "formula_count": 0, "code_count": 0}, + {"page_no": 2, "formula_count": 0, "code_count": 0}, + ], + } + ) + + "\n", + encoding="utf-8", + ) + return {"doc": {"page_count": 2}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + calls = [] + original_clean_ocr = corpus.clean_ocr + original_clean = corpus.clean + original_markdown_dir = corpus.markdown_dir + original_cleaned_markdown_dir = corpus.cleaned_markdown_dir + + def record_clean_ocr(*args, **kwargs): + calls.append( + ( + "clean_ocr", + Path(str(kwargs.get("input_dir"))), + kwargs.get("write_cleaned_files", True), + ) + ) + return original_clean_ocr(*args, **kwargs) + + def record_clean(*args, **kwargs): + calls.append( + ( + "clean", + Path(str(kwargs.get("input_dir"))), + kwargs.get("write_cleaned_files", True), + ) + ) + return original_clean(*args, **kwargs) + + monkeypatch.setattr(corpus, "clean_ocr", record_clean_ocr) + monkeypatch.setattr(corpus, "clean", record_clean) + + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + assert calls[0] == ("clean_ocr", original_markdown_dir, True) + assert calls[1] == ("clean", original_cleaned_markdown_dir, False) + + 
raw_text = (original_markdown_dir / "doc.md").read_text(encoding="utf-8") + cleaned_text = (original_cleaned_markdown_dir / "doc.md").read_text(encoding="utf-8") + assert raw_text == raw_markdown + assert cleaned_text != raw_text + assert "1.1\n1.1" in raw_text + assert "1.1\n1.1" not in cleaned_text + + out_path = corpus.output_dir / "export.jsonl" + corpus.jsonl(out_path) + records = [json.loads(line) for line in out_path.read_text(encoding="utf-8").splitlines() if line] + assert len(records) == 1 + record = records[0] + + assert record["document"] == cleaned_text + assert record["text"] == raw_text + assert record["filename"] == "doc" + assert record["url"] == "https://example.com/doc.pdf" + assert record["ocr_success"] is True + assert record["extraction_mode"] == "deepseek" + assert record["page_count"] == 2 From 0874623331db0990c3fdf825162357f36d02d906 Mon Sep 17 00:00:00 2001 From: Foivos Karounos Date: Tue, 14 Apr 2026 19:04:28 +0300 Subject: [PATCH 93/93] chore: bump version to 0.1.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a296c9b..9dd211a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "glossapi" -version = "0.1.3" +version = "0.1.4" description = "Academic document processing pipeline with Rust-powered markdown cleaning" authors = [ {name = "GlossAPI Team", email = "glossapi.team@eellak.gr"}