diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..4719481 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,40 @@ +name: Build and Deploy Docs + +on: + push: + branches: + - development + - main + - master + workflow_dispatch: + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Install MkDocs + run: | + python -m pip install --upgrade pip + pip install 'mkdocs<2' 'mkdocs-material<10' + + - name: Build site + run: mkdocs build --strict + + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./site + publish_branch: gh-pages + force_orphan: true diff --git a/.gitignore b/.gitignore index 8c98a88..74f3edc 100644 --- a/.gitignore +++ b/.gitignore @@ -58,10 +58,13 @@ htmlcov/ # OCR test outputs test_ocr_*_output/ *_demo_output/ +artifacts/ # OCR model weights (if downloaded locally) nanonets/ ocr_models/ +deepseek-ocr-2-model/ +models/ # Noise analysis reports glossapi_noise_analysis_report.md @@ -78,4 +81,4 @@ dependency_setup/.venvs/ deepseek-ocr/DeepSeek-OCR-empty/ # Local DeepSeek checkout and repro scripts (keep out of master) deepseek-ocr/ -repro_rapidocr_onnx/ +deepseek-ocr-2/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..979e757 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,20 @@ +# Contributing to GlossAPI + +## Working branches and PR flow +- Open PRs are pushed against the `development` branch. +- Development is merged with master when a) everything has been effectively used a few times and b) we reach a clear checkpoint. + +## Some design principles +- Corpus methods should be easy to use and descriptive. +- Python files should be readable and well organized (check folder structure). 
+- Metadata should be written to two distinct parquet files depending on their relevance to the end user ("metadata") or debugging during pipeline runs. The principle of reading/writing to these parquet files should be maintained throughout. The rest of the metadata is implicitly encoded in the output folders at each stage of the pipeline. + +## Pipeline awareness and folder layout +- Tie any pipeline change to the artifacts it produces. Common touchpoints: + - `Corpus.extract()` writes source PDFs under `downloads/` and a manifest at `download_results/download_results.parquet` (fields like `needs_ocr`). + - `Corpus.clean()` emits `markdown/` and `clean_markdown/`, keeping `.processing_state.pkl` plus `problematic_files/` and `timeout_files/` subfolders. + - `Corpus.ocr()` and `Corpus.section()` populate `json/` (Docling JSON, formula index, metrics) and `sections/sections_for_annotation.parquet`. +- When relocating outputs or adding new ones, update assertions in `tests/test_pipeline_smoke.py` and the folder references in `docs/pipeline.md` so the layout stays discoverable. + +## Keep changes small +- Avoid large refactors or sweeping interface changes; aim for narrowly scoped PRs and discuss big shifts before starting. diff --git a/README.md b/README.md index ebc6baf..04be81a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ GlossAPI is a GPU-ready document processing pipeline from [GFOSS](https://gfoss. ## Why GlossAPI - Handles download → extraction → cleaning → sectioning in one pipeline. -- Ships safe PyPDFium extraction plus Docling/RapidOCR for high-throughput OCR. +- Ships safe PyPDFium extraction plus Docling for structured extraction and DeepSeek-OCR-2 for OCR remediation. - Rust-powered cleaner/noise metrics keep Markdown quality predictable. - Greek-first metadata and section classification tuned for academic corpora. - Modular Corpus API lets you resume from any stage or plug into existing flows. 
@@ -40,56 +40,128 @@ PY ## Automated Environment Profiles -Use `dependency_setup/setup_glossapi.sh` to provision a virtualenv with the right dependency stack for the three supported modes: +Use `dependency_setup/setup_glossapi.sh` for the Docling environment, or `dependency_setup/setup_deepseek_uv.sh` for the dedicated DeepSeek OCR runtime: ```bash -# Vanilla pipeline (no GPU OCR extras) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests +# Docling / main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# Docling + RapidOCR mode -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests - -# DeepSeek OCR mode (requires weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR runtime (uv-managed) +./dependency_setup/setup_deepseek_uv.sh \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Pass `--download-deepseek` if you need the script to fetch weights automatically; otherwise it looks for `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Check `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation history. The script also installs the Rust extensions in editable mode so local changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` now delegates to the same uv-based installer. `setup_deepseek_uv.sh` uses `uv venv` + `uv sync`, installs the Rust extensions in editable mode, and can download `deepseek-ai/DeepSeek-OCR-2` with `huggingface_hub`. +The uv-managed DeepSeek runtime is OCR-only on purpose: it installs `glossapi[deepseek]` and does not carry the Docling layout stack. 
+ +If you want a guided install that asks which phases you plan to use, run: + +```bash +python install_glossapi.py +``` + +That wizard keeps browser-gated download support (`playwright`) and the dedicated DeepSeek OCR runtime out of the main environment unless you explicitly select them. + +## Browser-Gated Download Mode + +`Corpus.download(...)` now supports three high-level routes for file acquisition: + +- `download_mode="standard"`: direct HTTP downloader only +- `download_mode="auto"`: direct HTTP first, then browser-assisted recovery when the response is a recoverable browser-gated interstitial +- `download_mode="browser"`: go straight to browser-assisted acquisition for known browser-gated file endpoints + +Use `browser_mode=True` as a legacy alias for `download_mode="browser"`. + +### Policy-driven routing + +If you know which domains require browser bootstrap, route them with a policy file instead of probing every URL: + +```yaml +default: + downloader: standard + +rules: + - match: + domains: [eur-lex.europa.eu] + downloader: browser + + - match: + url_regex: "https://example.org/protected/.*" + downloader: auto +``` + +```python +from glossapi import Corpus + +corpus = Corpus(input_dir="out", output_dir="out") +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +### Operational notes + +- Browser mode is for browser-gated file endpoints, not viewer-only sources. +- Browser sessions are cached per domain so a successful bootstrap can be reused across multiple files. +- Successful downloads still land in `downloads/`; extraction continues to consume only real files from that directory. +- Viewer-style sources still fail cleanly in `download_results/*.parquet` and do not create fake files. + +### Regression strategy + +The checked-in browser download tests use mocked browser/session flows and fake PDF bytes rather than hard-coded live URLs. 
+ +For manual smoke checks against live browser-gated sources, build an ad hoc parquet locally and run it outside the committed test suite. **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` (from your DeepSeek venv) to fail fast if the CLI would fall back to the stub. -- Export these to force the real CLI and avoid silent stub output: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to fail fast before OCR. +- Export these to force the real runtime and avoid silent stub output: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- CUDA toolkit with `nvcc` available (FlashInfer/vLLM JIT falls back poorly without it); set `CUDA_HOME` and prepend `$CUDA_HOME/bin` to `PATH`. -- If FlashInfer is problematic, disable with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- To avoid FP8 KV cache issues, export `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1` (propagates `--no-fp8-kv`). -- Tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- The default fallback locations already point at the in-repo Transformers runner and `${REPO_ROOT}/deepseek-ocr-2-model/DeepSeek-OCR-2`. +- `flash-attn` is optional. The runner uses `flash_attention_2` when available and falls back to `eager` otherwise. ## Choose Your Install Path | Scenario | Commands | Notes | | --- | --- | --- | | Pip users | `pip install glossapi` | Fast vanilla evaluation with minimal dependencies. 
| -| Mode automation (recommended) | `./dependency_setup/setup_glossapi.sh --mode {vanilla\|rapidocr\|deepseek}` | Creates an isolated venv per mode, installs Rust crates, and can run the relevant pytest subset. | +| Docling environment | `./dependency_setup/setup_glossapi.sh --mode docling` | Creates the main GlossAPI venv for extraction, cleaning, sectioning, and enrichment. | +| DeepSeek environment | `./dependency_setup/setup_deepseek_uv.sh` | Creates a separate uv-managed OCR runtime pinned to the tested Transformers/Torch stack. | | Manual editable install | `pip install -e .` after cloning | Keep this if you prefer to manage dependencies by hand. | | Conda-based stacks | `scripts/setup_conda.sh` | Provisions Python 3.10 env + Rust + editable install for Amazon Linux/SageMaker. | See the refreshed docs (`docs/index.md`) for detailed environment notes, CUDA/ORT combinations, and troubleshooting tips. ## Repo Landmarks +- `docs/code_map.md`: fast map from pipeline ideas to implementing classes and files. +- `docs/pipeline.md`: stage contracts, key parameters, and artifact outputs. - `samples/lightweight_pdf_corpus/`: 20 one-page PDFs with manifest + expected Markdown. - `src/glossapi/`: Corpus pipeline, cleaners, and orchestration logic. - `tests/test_pipeline_smoke.py`: Minimal regression entry point (uses the lightweight corpus). - `docs/`: MkDocs site with onboarding, pipeline recipes, and configuration guides. +## Pipeline map + +Use this as the shortest path from a documentation concept to the public call that implements it. 
+ +| Stage | Main call | Important parameters | Writes | +| --- | --- | --- | --- | +| Download | `Corpus.download(...)` | `input_parquet`, `links_column`, `parallelize_by`, `download_mode="standard"\|"auto"\|"browser"`, `download_policy_file`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase-1) | `Corpus.extract(...)` | `input_format`, `phase1_backend`, `use_gpus`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean(...)` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, updated parquet metrics/flags | +| OCR / math follow-up | `Corpus.ocr(...)` | `mode`, `fix_bad`, `math_enhance`, `use_gpus`, `devices` | refreshed `markdown/.md`, optional `json/.latex_map.jsonl` | +| Section | `Corpus.section()` | uses cleaner/parquet outputs to choose inputs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate(...)` | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage math density | `Corpus.triage_math()` | no required args | updated `download_results/*.parquet` routing columns | +| JSONL export | `Corpus.jsonl(...)` | `output_path` | merged training/export JSONL | + ## Contributing - Run `pytest tests/test_pipeline_smoke.py` for a fast end-to-end check. - Regenerate the lightweight corpus via `generate_pdfs.py` and commit the updated PDFs + manifest together. diff --git a/dependency_setup/deepseek_gpu_smoke.py b/dependency_setup/deepseek_gpu_smoke.py index e85d202..ddfb314 100644 --- a/dependency_setup/deepseek_gpu_smoke.py +++ b/dependency_setup/deepseek_gpu_smoke.py @@ -3,9 +3,9 @@ Minimal DeepSeek OCR integration smoke test. This script runs the GlossAPI DeepSeek backend on a tiny sample PDF and -verifies that real Markdown output is produced. 
It requires the DeepSeek-OCR -weights to be available under ``../deepseek-ocr/DeepSeek-OCR`` relative to -the repository root (override via ``DEEPSEEK_MODEL_DIR``). +verifies that real Markdown output is produced. It requires the DeepSeek-OCR-2 +weights to be available under ``deepseek-ocr-2-model/DeepSeek-OCR-2`` relative to the +repository root (override via ``DEEPSEEK_MODEL_DIR``). """ from __future__ import annotations @@ -20,15 +20,16 @@ REPO_ROOT = Path(__file__).resolve().parents[1] SAMPLES_DIR = REPO_ROOT / "samples" / "lightweight_pdf_corpus" / "pdfs" -DEFAULT_MODEL_ROOT = (REPO_ROOT / ".." / "deepseek-ocr").resolve() +DEFAULT_MODEL_ROOT = (REPO_ROOT / "deepseek-ocr-2-model").resolve() def ensure_model_available(model_root: Path) -> None: - expected = model_root / "DeepSeek-OCR" / "model-00001-of-000001.safetensors" + direct_root = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + expected = direct_root / "model-00001-of-000001.safetensors" if not expected.exists() or expected.stat().st_size < 1_000_000: raise FileNotFoundError( - f"Expected DeepSeek-OCR weights at {expected}. " - "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR) " + f"Expected DeepSeek-OCR-2 weights at {expected}. " + "Download the checkpoint (huggingface.co/deepseek-ai/DeepSeek-OCR-2) " "or set DEEPSEEK_MODEL_DIR to the directory that contains them." 
) @@ -37,7 +38,8 @@ def run_smoke(model_root: Path) -> None: from glossapi import Corpus ensure_model_available(model_root) - sample_pdf = SAMPLES_DIR / "sample01_plain.pdf" + model_dir = model_root if (model_root / "config.json").exists() else (model_root / "DeepSeek-OCR-2") + sample_pdf = SAMPLES_DIR / "alpha.pdf" if not sample_pdf.exists(): raise FileNotFoundError(f"Sample PDF not found: {sample_pdf}") @@ -67,22 +69,17 @@ def run_smoke(model_root: Path) -> None: parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") os.environ.setdefault("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") os.environ.setdefault( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - str(model_root / "run_pdf_ocr_vllm.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py"), ) os.environ.setdefault( "GLOSSAPI_DEEPSEEK_PYTHON", sys.executable, ) - ld_extra = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") or str( - model_root / "libjpeg-turbo" / "lib" - ) - os.environ["GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH"] = ld_extra - os.environ["LD_LIBRARY_PATH"] = ( - f"{ld_extra}:{os.environ.get('LD_LIBRARY_PATH','')}".rstrip(":") - ) + os.environ.setdefault("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) corpus = Corpus(input_dir=input_dir, output_dir=output_dir) corpus.ocr( @@ -100,7 +97,7 @@ def run_smoke(model_root: Path) -> None: def main() -> None: - model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") + model_dir_env = os.environ.get("DEEPSEEK_MODEL_DIR") or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR") if model_dir_env: model_root = Path(model_dir_env).expanduser().resolve() else: diff --git a/dependency_setup/deepseek_uv/pyproject.toml b/dependency_setup/deepseek_uv/pyproject.toml new file mode 100644 index 0000000..6f7ffe0 --- /dev/null +++ b/dependency_setup/deepseek_uv/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = 
"glossapi-deepseek-runtime" +version = "0.1.0" +description = "UV-managed runtime for GlossAPI DeepSeek-OCR-2 execution" +requires-python = ">=3.11,<3.13" +dependencies = [ + "glossapi[deepseek]", + "torch==2.10.0", + "torchvision==0.25.0", + "torchaudio==2.10.0", +] + +[dependency-groups] +test = [ + "pytest", + "fpdf2", +] + +[tool.uv.sources] +glossapi = { path = "../..", editable = true } +torch = { index = "pytorch-cu130" } +torchvision = { index = "pytorch-cu130" } +torchaudio = { index = "pytorch-cu130" } + +[[tool.uv.index]] +name = "pytorch-cu130" +url = "https://download.pytorch.org/whl/cu130" +explicit = true diff --git a/dependency_setup/deepseek_uv/uv.lock b/dependency_setup/deepseek_uv/uv.lock new file mode 100644 index 0000000..a136794 --- /dev/null +++ b/dependency_setup/deepseek_uv/uv.lock @@ -0,0 +1,1771 @@ +version = 1 +revision = 3 +requires-python = ">=3.11, <3.13" +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] + +[[package]] +name = "accelerate" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ca/14/787e5498cd062640f0f3d92ef4ae4063174f76f9afd29d13fc52a319daae/accelerate-1.13.0.tar.gz", hash = "sha256:d631b4e0f5b3de4aff2d7e9e6857d164810dfc3237d54d017f075122d057b236", size = 402835, upload-time = "2026-03-04T19:34:12.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/46/02ac5e262d4af18054b3e922b2baedbb2a03289ee792162de60a865defc5/accelerate-1.13.0-py3-none-any.whl", hash = "sha256:cf1a3efb96c18f7b152eb0fa7490f3710b19c3f395699358f08decca2b8b62e0", size = 383744, upload-time = "2026-03-04T19:34:10.313Z" }, +] + +[[package]] +name = "addict" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/ef/fd7649da8af11d93979831e8f1f8097e85e82d5bfeabc8c68b39175d8e75/addict-2.4.0.tar.gz", hash = "sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494", size = 9186, upload-time = "2020-11-21T16:21:31.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832, upload-time = "2020-11-21T16:21:29.588Z" }, +] + +[[package]] +name = "aiofiles" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/c3/534eac40372d8ee36ef40df62ec129bee4fdb5ad9706e58a29be53b2c970/aiofiles-25.1.0.tar.gz", hash = "sha256:a8d728f0a29de45dc521f18f07297428d56992a742f0cd2701ba86e44d23d5b2", size = 46354, upload-time = "2025-10-09T20:51:04.358Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/8a/340a1555ae33d7354dbca4faa54948d76d89a27ceef032c8c3bc661d003e/aiofiles-25.1.0-py3-none-any.whl", hash = "sha256:abe311e527c862958650f9438e859c1fa7568a141b22abcd015e120e86a85695", size = 14668, upload-time = "2025-10-09T20:51:03.174Z" }, +] + +[[package]] +name 
= "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556, upload-time = "2026-01-03T17:33:05.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/4c/a164164834f03924d9a29dc3acd9e7ee58f95857e0b467f6d04298594ebb/aiohttp-3.13.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5b6073099fb654e0a068ae678b10feff95c5cae95bbfcbfa7af669d361a8aa6b", size = 746051, upload-time = "2026-01-03T17:29:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/82/71/d5c31390d18d4f58115037c432b7e0348c60f6f53b727cad33172144a112/aiohttp-3.13.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cb93e166e6c28716c8c6aeb5f99dfb6d5ccf482d29fe9bf9a794110e6d0ab64", size = 499234, upload-time = "2026-01-03T17:29:44.822Z" }, + { url = 
"https://files.pythonhosted.org/packages/0e/c9/741f8ac91e14b1d2e7100690425a5b2b919a87a5075406582991fb7de920/aiohttp-3.13.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28e027cf2f6b641693a09f631759b4d9ce9165099d2b5d92af9bd4e197690eea", size = 494979, upload-time = "2026-01-03T17:29:46.405Z" }, + { url = "https://files.pythonhosted.org/packages/75/b5/31d4d2e802dfd59f74ed47eba48869c1c21552c586d5e81a9d0d5c2ad640/aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b61b7169ababd7802f9568ed96142616a9118dd2be0d1866e920e77ec8fa92a", size = 1748297, upload-time = "2026-01-03T17:29:48.083Z" }, + { url = "https://files.pythonhosted.org/packages/1a/3e/eefad0ad42959f226bb79664826883f2687d602a9ae2941a18e0484a74d3/aiohttp-3.13.3-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:80dd4c21b0f6237676449c6baaa1039abae86b91636b6c91a7f8e61c87f89540", size = 1707172, upload-time = "2026-01-03T17:29:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3a/54a64299fac2891c346cdcf2aa6803f994a2e4beeaf2e5a09dcc54acc842/aiohttp-3.13.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:65d2ccb7eabee90ce0503c17716fc77226be026dcc3e65cce859a30db715025b", size = 1805405, upload-time = "2026-01-03T17:29:51.244Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/ddc1b7169cf64075e864f64595a14b147a895a868394a48f6a8031979038/aiohttp-3.13.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5b179331a481cb5529fca8b432d8d3c7001cb217513c94cd72d668d1248688a3", size = 1899449, upload-time = "2026-01-03T17:29:53.938Z" }, + { url = "https://files.pythonhosted.org/packages/a1/7e/6815aab7d3a56610891c76ef79095677b8b5be6646aaf00f69b221765021/aiohttp-3.13.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9d4c940f02f49483b18b079d1c27ab948721852b281f8b015c058100e9421dd1", size = 1748444, upload-time = "2026-01-03T17:29:55.484Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f2/073b145c4100da5511f457dc0f7558e99b2987cf72600d42b559db856fbc/aiohttp-3.13.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f9444f105664c4ce47a2a7171a2418bce5b7bae45fb610f4e2c36045d85911d3", size = 1606038, upload-time = "2026-01-03T17:29:57.179Z" }, + { url = "https://files.pythonhosted.org/packages/0a/c1/778d011920cae03ae01424ec202c513dc69243cf2db303965615b81deeea/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:694976222c711d1d00ba131904beb60534f93966562f64440d0c9d41b8cdb440", size = 1724156, upload-time = "2026-01-03T17:29:58.914Z" }, + { url = "https://files.pythonhosted.org/packages/0e/cb/3419eabf4ec1e9ec6f242c32b689248365a1cf621891f6f0386632525494/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f33ed1a2bf1997a36661874b017f5c4b760f41266341af36febaf271d179f6d7", size = 1722340, upload-time = "2026-01-03T17:30:01.962Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e5/76cf77bdbc435bf233c1f114edad39ed4177ccbfab7c329482b179cff4f4/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e636b3c5f61da31a92bf0d91da83e58fdfa96f178ba682f11d24f31944cdd28c", size = 1783041, upload-time = "2026-01-03T17:30:03.609Z" }, + { url = "https://files.pythonhosted.org/packages/9d/d4/dd1ca234c794fd29c057ce8c0566b8ef7fd6a51069de5f06fa84b9a1971c/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5d2d94f1f5fcbe40838ac51a6ab5704a6f9ea42e72ceda48de5e6b898521da51", size = 1596024, upload-time = "2026-01-03T17:30:05.132Z" }, + { url = "https://files.pythonhosted.org/packages/55/58/4345b5f26661a6180afa686c473620c30a66afdf120ed3dd545bbc809e85/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2be0e9ccf23e8a94f6f0650ce06042cefc6ac703d0d7ab6c7a917289f2539ad4", size = 
1804590, upload-time = "2026-01-03T17:30:07.135Z" }, + { url = "https://files.pythonhosted.org/packages/7b/06/05950619af6c2df7e0a431d889ba2813c9f0129cec76f663e547a5ad56f2/aiohttp-3.13.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9af5e68ee47d6534d36791bbe9b646d2a7c7deb6fc24d7943628edfbb3581f29", size = 1740355, upload-time = "2026-01-03T17:30:09.083Z" }, + { url = "https://files.pythonhosted.org/packages/3e/80/958f16de79ba0422d7c1e284b2abd0c84bc03394fbe631d0a39ffa10e1eb/aiohttp-3.13.3-cp311-cp311-win32.whl", hash = "sha256:a2212ad43c0833a873d0fb3c63fa1bacedd4cf6af2fee62bf4b739ceec3ab239", size = 433701, upload-time = "2026-01-03T17:30:10.869Z" }, + { url = "https://files.pythonhosted.org/packages/dc/f2/27cdf04c9851712d6c1b99df6821a6623c3c9e55956d4b1e318c337b5a48/aiohttp-3.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:642f752c3eb117b105acbd87e2c143de710987e09860d674e068c4c2c441034f", size = 457678, upload-time = "2026-01-03T17:30:12.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732, upload-time = "2026-01-03T17:30:14.23Z" }, + { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293, upload-time = "2026-01-03T17:30:15.96Z" }, + { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533, upload-time = "2026-01-03T17:30:17.431Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839, upload-time = "2026-01-03T17:30:19.422Z" }, + { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932, upload-time = "2026-01-03T17:30:21.756Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906, upload-time = "2026-01-03T17:30:23.932Z" }, + { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020, upload-time = "2026-01-03T17:30:26Z" }, + { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181, upload-time = "2026-01-03T17:30:27.554Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794, upload-time = "2026-01-03T17:30:29.254Z" }, + { url = "https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900, upload-time = "2026-01-03T17:30:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239, upload-time = "2026-01-03T17:30:32.703Z" }, + { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527, upload-time = "2026-01-03T17:30:34.695Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489, upload-time = "2026-01-03T17:30:36.864Z" }, + { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852, upload-time = "2026-01-03T17:30:39.433Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379, upload-time = 
"2026-01-03T17:30:41.081Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253, upload-time = "2026-01-03T17:30:42.644Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407, upload-time = "2026-01-03T17:30:44.195Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = 
"sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/35/02daf95b9cd686320bb622eb148792655c9412dbb9b67abb5694e5910a24/charset_normalizer-3.4.5.tar.gz", hash = "sha256:95adae7b6c42a6c5b5b559b1a99149f090a57128155daeea91732c8d970d8644", size = 134804, upload-time = "2026-03-06T06:03:19.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/9e/bcec3b22c64ecec47d39bf5167c2613efd41898c019dccd4183f6aa5d6a7/charset_normalizer-3.4.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:610f72c0ee565dfb8ae1241b666119582fdbfe7c0975c175be719f940e110694", size = 279531, upload-time = "2026-03-06T06:00:52.252Z" }, + { url = "https://files.pythonhosted.org/packages/58/12/81fd25f7e7078ab5d1eedbb0fac44be4904ae3370a3bf4533c8f2d159acd/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60d68e820af339df4ae8358c7a2e7596badeb61e544438e489035f9fbf3246a5", size = 188006, upload-time = "2026-03-06T06:00:53.8Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/6e/f2d30e8c27c1b0736a6520311982cf5286cfc7f6cac77d7bc1325e3a23f2/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:10b473fc8dca1c3ad8559985794815f06ca3fc71942c969129070f2c3cdf7281", size = 205085, upload-time = "2026-03-06T06:00:55.311Z" }, + { url = "https://files.pythonhosted.org/packages/d0/90/d12cefcb53b5931e2cf792a33718d7126efb116a320eaa0742c7059a95e4/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d4eb8ac7469b2a5d64b5b8c04f84d8bf3ad340f4514b98523805cbf46e3b3923", size = 200545, upload-time = "2026-03-06T06:00:56.532Z" }, + { url = "https://files.pythonhosted.org/packages/03/f4/44d3b830a20e89ff82a3134912d9a1cf6084d64f3b95dcad40f74449a654/charset_normalizer-3.4.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bcb3227c3d9aaf73eaaab1db7ccd80a8995c509ee9941e2aae060ca6e4e5d81", size = 193863, upload-time = "2026-03-06T06:00:57.823Z" }, + { url = "https://files.pythonhosted.org/packages/25/4b/f212119c18a6320a9d4a730d1b4057875cdeabf21b3614f76549042ef8a8/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:75ee9c1cce2911581a70a3c0919d8bccf5b1cbc9b0e5171400ec736b4b569497", size = 181827, upload-time = "2026-03-06T06:00:59.323Z" }, + { url = "https://files.pythonhosted.org/packages/74/00/b26158e48b425a202a92965f8069e8a63d9af1481dfa206825d7f74d2a3c/charset_normalizer-3.4.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d1401945cb77787dbd3af2446ff2d75912327c4c3a1526ab7955ecf8600687c", size = 191085, upload-time = "2026-03-06T06:01:00.546Z" }, + { url = "https://files.pythonhosted.org/packages/c4/c2/1c1737bf6fd40335fe53d28fe49afd99ee4143cc57a845e99635ce0b9b6d/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:0a45e504f5e1be0bd385935a8e1507c442349ca36f511a47057a71c9d1d6ea9e", size = 190688, upload-time = "2026-03-06T06:01:02.479Z" }, + { url = "https://files.pythonhosted.org/packages/5a/3d/abb5c22dc2ef493cd56522f811246a63c5427c08f3e3e50ab663de27fcf4/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e09f671a54ce70b79a1fc1dc6da3072b7ef7251fadb894ed92d9aa8218465a5f", size = 183077, upload-time = "2026-03-06T06:01:04.231Z" }, + { url = "https://files.pythonhosted.org/packages/44/33/5298ad4d419a58e25b3508e87f2758d1442ff00c2471f8e0403dab8edad5/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d01de5e768328646e6a3fa9e562706f8f6641708c115c62588aef2b941a4f88e", size = 206706, upload-time = "2026-03-06T06:01:05.773Z" }, + { url = "https://files.pythonhosted.org/packages/7b/17/51e7895ac0f87c3b91d276a449ef09f5532a7529818f59646d7a55089432/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:131716d6786ad5e3dc542f5cc6f397ba3339dc0fb87f87ac30e550e8987756af", size = 191665, upload-time = "2026-03-06T06:01:07.473Z" }, + { url = "https://files.pythonhosted.org/packages/90/8f/cce9adf1883e98906dbae380d769b4852bb0fa0004bc7d7a2243418d3ea8/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a374cc0b88aa710e8865dc1bd6edb3743c59f27830f0293ab101e4cf3ce9f85", size = 201950, upload-time = "2026-03-06T06:01:08.973Z" }, + { url = "https://files.pythonhosted.org/packages/08/ca/bce99cd5c397a52919e2769d126723f27a4c037130374c051c00470bcd38/charset_normalizer-3.4.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d31f0d1671e1534e395f9eb84a68e0fb670e1edb1fe819a9d7f564ae3bc4e53f", size = 195830, upload-time = "2026-03-06T06:01:10.155Z" }, + { url = "https://files.pythonhosted.org/packages/87/4f/2e3d023a06911f1281f97b8f036edc9872167036ca6f55cc874a0be6c12c/charset_normalizer-3.4.5-cp311-cp311-win32.whl", hash = "sha256:cace89841c0599d736d3d74a27bc5821288bb47c5441923277afc6059d7fbcb4", 
size = 132029, upload-time = "2026-03-06T06:01:11.706Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1f/a853b73d386521fd44b7f67ded6b17b7b2367067d9106a5c4b44f9a34274/charset_normalizer-3.4.5-cp311-cp311-win_amd64.whl", hash = "sha256:f8102ae93c0bc863b1d41ea0f4499c20a83229f52ed870850892df555187154a", size = 142404, upload-time = "2026-03-06T06:01:12.865Z" }, + { url = "https://files.pythonhosted.org/packages/b4/10/dba36f76b71c38e9d391abe0fd8a5b818790e053c431adecfc98c35cd2a9/charset_normalizer-3.4.5-cp311-cp311-win_arm64.whl", hash = "sha256:ed98364e1c262cf5f9363c3eca8c2df37024f52a8fa1180a3610014f26eac51c", size = 132796, upload-time = "2026-03-06T06:01:14.106Z" }, + { url = "https://files.pythonhosted.org/packages/9c/b6/9ee9c1a608916ca5feae81a344dffbaa53b26b90be58cc2159e3332d44ec/charset_normalizer-3.4.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed97c282ee4f994ef814042423a529df9497e3c666dca19be1d4cd1129dc7ade", size = 280976, upload-time = "2026-03-06T06:01:15.276Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d8/a54f7c0b96f1df3563e9190f04daf981e365a9b397eedfdfb5dbef7e5c6c/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0294916d6ccf2d069727d65973c3a1ca477d68708db25fd758dd28b0827cff54", size = 189356, upload-time = "2026-03-06T06:01:16.511Z" }, + { url = "https://files.pythonhosted.org/packages/42/69/2bf7f76ce1446759a5787cb87d38f6a61eb47dbbdf035cfebf6347292a65/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dc57a0baa3eeedd99fafaef7511b5a6ef4581494e8168ee086031744e2679467", size = 206369, upload-time = "2026-03-06T06:01:17.853Z" }, + { url = "https://files.pythonhosted.org/packages/10/9c/949d1a46dab56b959d9a87272482195f1840b515a3380e39986989a893ae/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:ed1a9a204f317ef879b32f9af507d47e49cd5e7f8e8d5d96358c98373314fc60", size = 203285, upload-time = "2026-03-06T06:01:19.473Z" }, + { url = "https://files.pythonhosted.org/packages/67/5c/ae30362a88b4da237d71ea214a8c7eb915db3eec941adda511729ac25fa2/charset_normalizer-3.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad83b8f9379176c841f8865884f3514d905bcd2a9a3b210eaa446e7d2223e4d", size = 196274, upload-time = "2026-03-06T06:01:20.728Z" }, + { url = "https://files.pythonhosted.org/packages/b2/07/c9f2cb0e46cb6d64fdcc4f95953747b843bb2181bda678dc4e699b8f0f9a/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:a118e2e0b5ae6b0120d5efa5f866e58f2bb826067a646431da4d6a2bdae7950e", size = 184715, upload-time = "2026-03-06T06:01:22.194Z" }, + { url = "https://files.pythonhosted.org/packages/36/64/6b0ca95c44fddf692cd06d642b28f63009d0ce325fad6e9b2b4d0ef86a52/charset_normalizer-3.4.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:754f96058e61a5e22e91483f823e07df16416ce76afa4ebf306f8e1d1296d43f", size = 193426, upload-time = "2026-03-06T06:01:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/50/bc/a730690d726403743795ca3f5bb2baf67838c5fea78236098f324b965e40/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0c300cefd9b0970381a46394902cd18eaf2aa00163f999590ace991989dcd0fc", size = 191780, upload-time = "2026-03-06T06:01:25.053Z" }, + { url = "https://files.pythonhosted.org/packages/97/4f/6c0bc9af68222b22951552d73df4532b5be6447cee32d58e7e8c74ecbb7b/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c108f8619e504140569ee7de3f97d234f0fbae338a7f9f360455071ef9855a95", size = 185805, upload-time = "2026-03-06T06:01:26.294Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/b9/a523fb9b0ee90814b503452b2600e4cbc118cd68714d57041564886e7325/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d1028de43596a315e2720a9849ee79007ab742c06ad8b45a50db8cdb7ed4a82a", size = 208342, upload-time = "2026-03-06T06:01:27.55Z" }, + { url = "https://files.pythonhosted.org/packages/4d/61/c59e761dee4464050713e50e27b58266cc8e209e518c0b378c1580c959ba/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:19092dde50335accf365cce21998a1c6dd8eafd42c7b226eb54b2747cdce2fac", size = 193661, upload-time = "2026-03-06T06:01:29.051Z" }, + { url = "https://files.pythonhosted.org/packages/1c/43/729fa30aad69783f755c5ad8649da17ee095311ca42024742701e202dc59/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4354e401eb6dab9aed3c7b4030514328a6c748d05e1c3e19175008ca7de84fb1", size = 204819, upload-time = "2026-03-06T06:01:30.298Z" }, + { url = "https://files.pythonhosted.org/packages/87/33/d9b442ce5a91b96fc0840455a9e49a611bbadae6122778d0a6a79683dd31/charset_normalizer-3.4.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a68766a3c58fde7f9aaa22b3786276f62ab2f594efb02d0a1421b6282e852e98", size = 198080, upload-time = "2026-03-06T06:01:31.478Z" }, + { url = "https://files.pythonhosted.org/packages/56/5a/b8b5a23134978ee9885cee2d6995f4c27cc41f9baded0a9685eabc5338f0/charset_normalizer-3.4.5-cp312-cp312-win32.whl", hash = "sha256:1827734a5b308b65ac54e86a618de66f935a4f63a8a462ff1e19a6788d6c2262", size = 132630, upload-time = "2026-03-06T06:01:33.056Z" }, + { url = "https://files.pythonhosted.org/packages/70/53/e44a4c07e8904500aec95865dc3f6464dc3586a039ef0df606eb3ac38e35/charset_normalizer-3.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:728c6a963dfab66ef865f49286e45239384249672cd598576765acc2a640a636", size = 142856, upload-time = "2026-03-06T06:01:34.489Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/aa/c5628f7cad591b1cf45790b7a61483c3e36cf41349c98af7813c483fd6e8/charset_normalizer-3.4.5-cp312-cp312-win_arm64.whl", hash = "sha256:75dfd1afe0b1647449e852f4fb428195a7ed0588947218f7ba929f6538487f02", size = 132982, upload-time = "2026-03-06T06:01:35.641Z" }, + { url = "https://files.pythonhosted.org/packages/c5/60/3a621758945513adfd4db86827a5bafcc615f913dbd0b4c2ed64a65731be/charset_normalizer-3.4.5-py3-none-any.whl", hash = "sha256:9db5e3fcdcee89a78c04dffb3fe33c79f77bd741a624946db2591c81b2fc85b0", size = 55455, upload-time = "2026-03-06T06:03:17.827Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = 
"sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "dask" +version = "2026.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "cloudpickle" }, + { name = "fsspec" }, + { name = "importlib-metadata", marker = "python_full_version < '3.12'" }, + { name = "packaging" }, + { name = "partd" }, + { name = "pyyaml" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/52/b0f9172b22778def907db1ff173249e4eb41f054b46a9c83b1528aaf811f/dask-2026.1.2.tar.gz", hash = "sha256:1136683de2750d98ea792670f7434e6c1cfce90cab2cc2f2495a9e60fd25a4fc", size = 10997838, upload-time = "2026-01-30T21:04:20.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/23/d39ccc4ed76222db31530b0a7d38876fdb7673e23f838e8d8f0ed4651a4f/dask-2026.1.2-py3-none-any.whl", hash = "sha256:46a0cf3b8d87f78a3d2e6b145aea4418a6d6d606fe6a16c79bd8ca2bb862bc91", size = 1482084, upload-time = "2026-01-30T21:04:18.363Z" }, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + +[[package]] +name = "deprecated" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, +] + +[[package]] +name = "easydict" +version = "1.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/9f/d18d6b5e19244788a6d09c14a8406376b4f4bfcc008e6d17a4f4c15362e8/easydict-1.13.tar.gz", hash = "sha256:b1135dedbc41c8010e2bc1f77ec9744c7faa42bce1a1c87416791449d6c87780", size = 6809, upload-time = "2024-03-04T12:04:41.251Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/ec/fa6963f1198172c2b75c9ab6ecefb3045991f92f75f5eb41b6621b198123/easydict-1.13-py3-none-any.whl", hash = "sha256:6b787daf4dcaf6377b4ad9403a5cee5a86adbc0ca9a5bcf5410e9902002aeac2", size = 6804, upload-time = 
"2024-03-04T12:04:39.508Z" }, +] + +[[package]] +name = "einops" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/77/850bef8d72ffb9219f0b1aac23fbc1bf7d038ee6ea666f331fa273031aa2/einops-0.8.2.tar.gz", hash = "sha256:609da665570e5e265e27283aab09e7f279ade90c4f01bcfca111f3d3e13f2827", size = 56261, upload-time = "2026-01-26T04:13:17.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/09/f8d8f8f31e4483c10a906437b4ce31bdf3d6d417b73fe33f1a8b59e34228/einops-0.8.2-py3-none-any.whl", hash = "sha256:54058201ac7087911181bfec4af6091bb59380360f069276601256a76af08193", size = 65638, upload-time = "2026-01-26T04:13:18.546Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + +[[package]] +name = "fonttools" +version = "4.61.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, + { url = "https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, + { url = "https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, + { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, + { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, + { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, + { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, + { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, + { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, + { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, +] + +[[package]] +name = "fpdf2" +version = "2.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "defusedxml" }, + { name = "fonttools" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/f2/72feae0b2827ed38013e4307b14f95bf0b3d124adfef4d38a7d57533f7be/fpdf2-2.8.7.tar.gz", hash = "sha256:7060ccee5a9c7ab0a271fb765a36a23639f83ef8996c34e3d46af0a17ede57f9", size = 362351, upload-time = 
"2026-02-28T05:39:16.456Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/66/0a/cf50ecffa1e3747ed9380a3adfc829259f1f86b3fdbd9e505af789003141/fpdf2-2.8.7-py3-none-any.whl", hash = "sha256:d391fc508a3ce02fc43a577c830cda4fe6f37646f2d143d489839940932fbc19", size = 327056, upload-time = "2026-02-28T05:39:14.619Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" }, + { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" }, + { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, 
upload-time = "2025-10-06T05:35:49.97Z" }, + { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" }, + { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" }, + { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" }, + { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = 
"sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" }, + { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" }, + { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" }, + { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" }, + { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411, upload-time = "2025-10-06T05:36:09.801Z" }, + { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014, upload-time = "2025-10-06T05:36:11.394Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909, upload-time = 
"2025-10-06T05:36:12.598Z" }, + { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049, upload-time = "2025-10-06T05:36:14.065Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485, upload-time = "2025-10-06T05:36:15.39Z" }, + { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619, upload-time = "2025-10-06T05:36:16.558Z" }, + { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320, upload-time = "2025-10-06T05:36:17.821Z" }, + { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820, upload-time = "2025-10-06T05:36:19.046Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518, upload-time = 
"2025-10-06T05:36:20.763Z" }, + { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096, upload-time = "2025-10-06T05:36:22.129Z" }, + { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985, upload-time = "2025-10-06T05:36:23.661Z" }, + { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591, upload-time = "2025-10-06T05:36:24.958Z" }, + { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102, upload-time = "2025-10-06T05:36:26.333Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + +[[package]] +name = "glossapi" +version = "0.1.3" +source = { editable = "../../" } +dependencies = [ + { name = "aiofiles" }, + { name = "aiohttp" }, + { name = "dask" }, + { name = "ftfy" }, + { name = "joblib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pypdfium2" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "zstandard" }, +] + +[package.optional-dependencies] +deepseek = [ + { name = "accelerate" }, + { name = "addict" }, + { name = "easydict" }, + { name = "einops" }, + { name = "img2pdf" }, + { name = "pillow" }, + { name = "pymupdf" }, + { name = "tokenizers" }, + { name = "transformers" }, +] + +[package.metadata] +requires-dist = [ + { name = "accelerate", marker = "extra == 'deepseek'", specifier = ">=1.2.1,<2" }, + { name = "addict", marker = "extra == 'deepseek'" }, + { name = "aiofiles", specifier = ">=23.0.0" }, + { name = "aiohttp", specifier 
= ">=3.8.0" }, + { name = "dask", specifier = ">=2022.1.0" }, + { name = "docling", marker = "extra == 'docling'", specifier = "==2.81.0" }, + { name = "easydict", marker = "extra == 'deepseek'" }, + { name = "einops", marker = "extra == 'deepseek'" }, + { name = "ftfy", specifier = ">=6.0.0" }, + { name = "img2pdf", marker = "extra == 'deepseek'", specifier = ">=0.5.1" }, + { name = "joblib", specifier = ">=1.0.0" }, + { name = "mkdocs", marker = "extra == 'docs'", specifier = ">=1.5" }, + { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.5" }, + { name = "numpy", specifier = ">=1.26,<3" }, + { name = "pandas", specifier = ">=1.3.0" }, + { name = "pillow", marker = "extra == 'deepseek'", specifier = "==10.4.0" }, + { name = "playwright", marker = "extra == 'browser'", specifier = ">=1.52,<2" }, + { name = "pyarrow", specifier = ">=7.0.0" }, + { name = "pymupdf", marker = "extra == 'deepseek'", specifier = "==1.24.10" }, + { name = "pypdfium2", specifier = ">=4.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = "==1.6.1" }, + { name = "tenacity", specifier = ">=8.0.0" }, + { name = "tokenizers", marker = "extra == 'deepseek'", specifier = "==0.20.3" }, + { name = "torch", marker = "extra == 'cuda'", specifier = "==2.5.1" }, + { name = "torchvision", marker = "extra == 'cuda'", specifier = "==0.20.1" }, + { name = "tqdm", specifier = ">=4.67.0" }, + { name = "transformers", marker = "extra == 'deepseek'", specifier = "==4.46.3" }, + { name = "zstandard", specifier = ">=0.22.0" }, +] +provides-extras = ["browser", "docling", "cuda", "deepseek", "docs"] + +[[package]] +name = "glossapi-deepseek-runtime" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "glossapi", extra = ["deepseek"] }, + { name = "torch" }, + { name = "torchaudio", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchaudio", version = "2.9.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torchvision", version = "0.24.1", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.24.1+cu130", source = { registry = "https://download.pytorch.org/whl/cu130" }, marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] + +[package.dev-dependencies] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "glossapi", extras = ["deepseek"], editable = "../../" }, + { name = "torch", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchaudio", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu130" }, + { name = "torchvision", specifier = "==0.24.1", index = "https://download.pytorch.org/whl/cu130" }, +] + +[package.metadata.requires-dev] +test = [ + { name = "fpdf2" }, + { name = "pytest" }, +] + +[[package]] +name = "hf-xet" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/cb/9bb543bd987ffa1ee48202cc96a756951b734b79a542335c566148ade36c/hf_xet-1.3.2.tar.gz", hash = "sha256:e130ee08984783d12717444e538587fa2119385e5bd8fc2bb9f930419b73a7af", size = 643646, upload-time = "2026-02-27T17:26:08.051Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/28/dbb024e2e3907f6f3052847ca7d1a2f7a3972fafcd53ff79018977fcb3e4/hf_xet-1.3.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:f93b7595f1d8fefddfede775c18b5c9256757824f7f6832930b49858483cd56f", size = 3763961, upload-time = "2026-02-27T17:25:52.537Z" }, + { url = "https://files.pythonhosted.org/packages/e4/71/b99aed3823c9d1795e4865cf437d651097356a3f38c7d5877e4ac544b8e4/hf_xet-1.3.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:a85d3d43743174393afe27835bde0cd146e652b5fcfdbcd624602daef2ef3259", size = 3526171, upload-time = "2026-02-27T17:25:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/907890ce6ef5598b5920514f255ed0a65f558f820515b18db75a51b2f878/hf_xet-1.3.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7c2a054a97c44e136b1f7f5a78f12b3efffdf2eed3abc6746fc5ea4b39511633", size = 4180750, upload-time = "2026-02-27T17:25:43.125Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ad/bc7f41f87173d51d0bce497b171c4ee0cbde1eed2d7b4216db5d0ada9f50/hf_xet-1.3.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:06b724a361f670ae557836e57801b82c75b534812e351a87a2c739f77d1e0635", size = 3961035, upload-time = "2026-02-27T17:25:41.837Z" }, + { url = "https://files.pythonhosted.org/packages/73/38/600f4dda40c4a33133404d9fe644f1d35ff2d9babb4d0435c646c63dd107/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:305f5489d7241a47e0458ef49334be02411d1d0f480846363c1c8084ed9916f7", size = 4161378, upload-time = "2026-02-27T17:26:00.365Z" }, + { url = "https://files.pythonhosted.org/packages/00/b3/7bc1ff91d1ac18420b7ad1e169b618b27c00001b96310a89f8a9294fe509/hf_xet-1.3.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:06cdbde243c85f39a63b28e9034321399c507bcd5e7befdd17ed2ccc06dfe14e", size = 4398020, upload-time = "2026-02-27T17:26:03.977Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0b/99bfd948a3ed3620ab709276df3ad3710dcea61976918cce8706502927af/hf_xet-1.3.2-cp37-abi3-win_amd64.whl", hash = "sha256:9298b47cce6037b7045ae41482e703c471ce36b52e73e49f71226d2e8e5685a1", size = 3641624, upload-time = 
"2026-02-27T17:26:13.542Z" }, + { url = "https://files.pythonhosted.org/packages/cc/02/9a6e4ca1f3f73a164c0cd48e41b3cc56585dcc37e809250de443d673266f/hf_xet-1.3.2-cp37-abi3-win_arm64.whl", hash = "sha256:83d8ec273136171431833a6957e8f3af496bee227a0fe47c7b8b39c106d1749a", size = 3503976, upload-time = "2026-02-27T17:26:12.123Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = 
"sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "img2pdf" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pikepdf" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/97/ca44c467131b93fda82d2a2f21b738c8bcf63b5259e3b8250e928b8dd52a/img2pdf-0.6.3.tar.gz", hash = "sha256:219518020f5bd242bdc46493941ea3f756f664c2e86f2454721e74353f58cd95", size = 120350, upload-time = "2025-11-05T20:51:57.558Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/dc/91e3a4a11c25ae183bd5a71b84ecb298db76405ff70013f76b10877bdfe3/img2pdf-0.6.3-py3-none-any.whl", hash = "sha256:44d12d235752edd17c43c04ff39952cdc5dd4c6aba90569c4902bd445085266b", size = 49701, upload-time = "2025-11-05T20:51:55.469Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = 
"sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "joblib" +version = "1.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, +] + +[[package]] +name = "locket" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/2f/83/97b29fe05cb6ae28d2dbd30b81e2e402a3eed5f460c26e9eaa5895ceacf5/locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632", size = 4350, upload-time = "2022-04-20T22:04:44.312Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/bc/83e112abc66cd466c6b83f99118035867cecd41802f8d044638aa78a106e/locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3", size = 4398, upload-time = "2022-04-20T22:04:42.23Z" }, +] + +[[package]] +name = "lxml" +version = "5.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/3d/14e82fc7c8fb1b7761f7e748fd47e2ec8276d137b6acfe5a4bb73853e08f/lxml-5.4.0.tar.gz", hash = "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", size = 3679479, upload-time = "2025-04-23T01:50:29.322Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/2d/67693cc8a605a12e5975380d7ff83020dcc759351b5a066e1cced04f797b/lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9", size = 8083240, upload-time = "2025-04-23T01:45:18.566Z" }, + { url = "https://files.pythonhosted.org/packages/73/53/b5a05ab300a808b72e848efd152fe9c022c0181b0a70b8bca1199f1bed26/lxml-5.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7", size = 4387685, upload-time = "2025-04-23T01:45:21.387Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cb/1a3879c5f512bdcd32995c301886fe082b2edd83c87d41b6d42d89b4ea4d/lxml-5.4.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa", size = 4991164, upload-time = "2025-04-23T01:45:23.849Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/94/bbc66e42559f9d04857071e3b3d0c9abd88579367fd2588a4042f641f57e/lxml-5.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df", size = 4746206, upload-time = "2025-04-23T01:45:26.361Z" }, + { url = "https://files.pythonhosted.org/packages/66/95/34b0679bee435da2d7cae895731700e519a8dfcab499c21662ebe671603e/lxml-5.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e", size = 5342144, upload-time = "2025-04-23T01:45:28.939Z" }, + { url = "https://files.pythonhosted.org/packages/e0/5d/abfcc6ab2fa0be72b2ba938abdae1f7cad4c632f8d552683ea295d55adfb/lxml-5.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44", size = 4825124, upload-time = "2025-04-23T01:45:31.361Z" }, + { url = "https://files.pythonhosted.org/packages/5a/78/6bd33186c8863b36e084f294fc0a5e5eefe77af95f0663ef33809cc1c8aa/lxml-5.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba", size = 4876520, upload-time = "2025-04-23T01:45:34.191Z" }, + { url = "https://files.pythonhosted.org/packages/3b/74/4d7ad4839bd0fc64e3d12da74fc9a193febb0fae0ba6ebd5149d4c23176a/lxml-5.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba", size = 4765016, upload-time = "2025-04-23T01:45:36.7Z" }, + { url = "https://files.pythonhosted.org/packages/24/0d/0a98ed1f2471911dadfc541003ac6dd6879fc87b15e1143743ca20f3e973/lxml-5.4.0-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c", size = 5362884, upload-time = "2025-04-23T01:45:39.291Z" }, + { url = 
"https://files.pythonhosted.org/packages/48/de/d4f7e4c39740a6610f0f6959052b547478107967362e8424e1163ec37ae8/lxml-5.4.0-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8", size = 4902690, upload-time = "2025-04-23T01:45:42.386Z" }, + { url = "https://files.pythonhosted.org/packages/07/8c/61763abd242af84f355ca4ef1ee096d3c1b7514819564cce70fd18c22e9a/lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86", size = 4944418, upload-time = "2025-04-23T01:45:46.051Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/6d7e3b63e7e282619193961a570c0a4c8a57fe820f07ca3fe2f6bd86608a/lxml-5.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056", size = 4827092, upload-time = "2025-04-23T01:45:48.943Z" }, + { url = "https://files.pythonhosted.org/packages/71/4a/e60a306df54680b103348545706a98a7514a42c8b4fbfdcaa608567bb065/lxml-5.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7", size = 5418231, upload-time = "2025-04-23T01:45:51.481Z" }, + { url = "https://files.pythonhosted.org/packages/27/f2/9754aacd6016c930875854f08ac4b192a47fe19565f776a64004aa167521/lxml-5.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd", size = 5261798, upload-time = "2025-04-23T01:45:54.146Z" }, + { url = "https://files.pythonhosted.org/packages/38/a2/0c49ec6941428b1bd4f280650d7b11a0f91ace9db7de32eb7aa23bcb39ff/lxml-5.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751", size = 4988195, upload-time = "2025-04-23T01:45:56.685Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/75/87a3963a08eafc46a86c1131c6e28a4de103ba30b5ae903114177352a3d7/lxml-5.4.0-cp311-cp311-win32.whl", hash = "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4", size = 3474243, upload-time = "2025-04-23T01:45:58.863Z" }, + { url = "https://files.pythonhosted.org/packages/fa/f9/1f0964c4f6c2be861c50db380c554fb8befbea98c6404744ce243a3c87ef/lxml-5.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539", size = 3815197, upload-time = "2025-04-23T01:46:01.096Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4c/d101ace719ca6a4ec043eb516fcfcb1b396a9fccc4fcd9ef593df34ba0d5/lxml-5.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4", size = 8127392, upload-time = "2025-04-23T01:46:04.09Z" }, + { url = "https://files.pythonhosted.org/packages/11/84/beddae0cec4dd9ddf46abf156f0af451c13019a0fa25d7445b655ba5ccb7/lxml-5.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d", size = 4415103, upload-time = "2025-04-23T01:46:07.227Z" }, + { url = "https://files.pythonhosted.org/packages/d0/25/d0d93a4e763f0462cccd2b8a665bf1e4343dd788c76dcfefa289d46a38a9/lxml-5.4.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779", size = 5024224, upload-time = "2025-04-23T01:46:10.237Z" }, + { url = "https://files.pythonhosted.org/packages/31/ce/1df18fb8f7946e7f3388af378b1f34fcf253b94b9feedb2cec5969da8012/lxml-5.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e", size = 4769913, upload-time = "2025-04-23T01:46:12.757Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/62/f4a6c60ae7c40d43657f552f3045df05118636be1165b906d3423790447f/lxml-5.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9", size = 5290441, upload-time = "2025-04-23T01:46:16.037Z" }, + { url = "https://files.pythonhosted.org/packages/9e/aa/04f00009e1e3a77838c7fc948f161b5d2d5de1136b2b81c712a263829ea4/lxml-5.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5", size = 4820165, upload-time = "2025-04-23T01:46:19.137Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/e0b2f61fa2404bf0f1fdf1898377e5bd1b74cc9b2cf2c6ba8509b8f27990/lxml-5.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5", size = 4932580, upload-time = "2025-04-23T01:46:21.963Z" }, + { url = "https://files.pythonhosted.org/packages/24/a2/8263f351b4ffe0ed3e32ea7b7830f845c795349034f912f490180d88a877/lxml-5.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4", size = 4759493, upload-time = "2025-04-23T01:46:24.316Z" }, + { url = "https://files.pythonhosted.org/packages/05/00/41db052f279995c0e35c79d0f0fc9f8122d5b5e9630139c592a0b58c71b4/lxml-5.4.0-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e", size = 5324679, upload-time = "2025-04-23T01:46:27.097Z" }, + { url = "https://files.pythonhosted.org/packages/1d/be/ee99e6314cdef4587617d3b3b745f9356d9b7dd12a9663c5f3b5734b64ba/lxml-5.4.0-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7", size = 4890691, upload-time = "2025-04-23T01:46:30.009Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/36/239820114bf1d71f38f12208b9c58dec033cbcf80101cde006b9bde5cffd/lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079", size = 4955075, upload-time = "2025-04-23T01:46:32.33Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e1/1b795cc0b174efc9e13dbd078a9ff79a58728a033142bc6d70a1ee8fc34d/lxml-5.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20", size = 4838680, upload-time = "2025-04-23T01:46:34.852Z" }, + { url = "https://files.pythonhosted.org/packages/72/48/3c198455ca108cec5ae3662ae8acd7fd99476812fd712bb17f1b39a0b589/lxml-5.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8", size = 5391253, upload-time = "2025-04-23T01:46:37.608Z" }, + { url = "https://files.pythonhosted.org/packages/d6/10/5bf51858971c51ec96cfc13e800a9951f3fd501686f4c18d7d84fe2d6352/lxml-5.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f", size = 5261651, upload-time = "2025-04-23T01:46:40.183Z" }, + { url = "https://files.pythonhosted.org/packages/2b/11/06710dd809205377da380546f91d2ac94bad9ff735a72b64ec029f706c85/lxml-5.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc", size = 5024315, upload-time = "2025-04-23T01:46:43.333Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b0/15b6217834b5e3a59ebf7f53125e08e318030e8cc0d7310355e6edac98ef/lxml-5.4.0-cp312-cp312-win32.whl", hash = "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f", size = 3486149, upload-time = "2025-04-23T01:46:45.684Z" }, + { url = "https://files.pythonhosted.org/packages/91/1e/05ddcb57ad2f3069101611bd5f5084157d90861a2ef460bf42f45cced944/lxml-5.4.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", size = 3817095, upload-time = "2025-04-23T01:46:48.521Z" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" }, + { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, + { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" }, + { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" }, + { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" }, + { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" }, + { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" }, + { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" }, + { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" }, + { url = 
"https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" }, + { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" }, + { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" }, + { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" }, + { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" }, + { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" }, + { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" }, + { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" }, + { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = 
"2026-01-26T02:43:47.054Z" }, + { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" }, + { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" }, + { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" }, + { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018, upload-time = "2026-01-26T02:43:56.198Z" }, + { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883, upload-time = "2026-01-26T02:43:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413, upload-time = "2026-01-26T02:43:58.755Z" }, + { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404, upload-time = "2026-01-26T02:44:00.216Z" }, + { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456, upload-time = "2026-01-26T02:44:02.202Z" }, + { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322, upload-time = "2026-01-26T02:44:03.56Z" }, + { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955, upload-time = "2026-01-26T02:44:04.845Z" }, + { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254, upload-time = "2026-01-26T02:44:06.133Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059, upload-time = "2026-01-26T02:44:07.518Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588, upload-time = "2026-01-26T02:44:09.382Z" }, + { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642, upload-time = "2026-01-26T02:44:10.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377, upload-time = 
"2026-01-26T02:44:12.042Z" }, + { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887, upload-time = "2026-01-26T02:44:14.245Z" }, + { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053, upload-time = "2026-01-26T02:44:15.371Z" }, + { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307, upload-time = "2026-01-26T02:44:16.852Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "numpy" +version = "1.26.4" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, + { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, + { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, + { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, + { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, +] + +[[package]] +name = "nvidia-cublas" +version = "13.0.0.19" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/99/8447b9ee9f070522ee66604ee819d632ab4568c68b3134cebd3837a015cd/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:381b1a0ca636fdcb6920a871e8fc89dbfd1f6157f421ed0a6f2673e14cffd3bd", size = 539001158, upload-time = "2025-08-04T10:19:50.761Z" 
}, + { url = "https://files.pythonhosted.org/packages/5a/99/210e113dde53955e97042bd76dc4ad927eca04c5b4645ec157cc59f4f3ae/nvidia_cublas-13.0.0.19-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:f6723af2e8e2600a11dc384037d90d9bf93070e346c24ef2e8f9001658c99896", size = 419392356, upload-time = "2025-08-04T10:20:19.449Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti" +version = "13.0.48" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/63/e9c12c3ae07c1f3a0821536bc188d7bf76e1b633b3bcd2bd393b00bb3426/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_aarch64.whl", hash = "sha256:67c22627ef436afcf080b48e4ad17b3f83d9e7c0d990ad0c6c0627b01fb92ccc", size = 10171189, upload-time = "2025-08-04T10:16:24.39Z" }, + { url = "https://files.pythonhosted.org/packages/ba/28/e37d62ff27b4462953fdd5713d8a78760578dfa12685c30b71b55fab57b1/nvidia_cuda_cupti-13.0.48-py3-none-manylinux_2_25_x86_64.whl", hash = "sha256:417699e216b23d81bc0bbcb7032352f81b9c5372ef73c097a01abb83125a3d09", size = 10718148, upload-time = "2025-08-04T10:16:33.605Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc" +version = "13.0.48" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/5b/f7636b3d66caefade6a0a0dc5b705c259a2062c20ad18b432b3129d348e0/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:87e13d186905a35e7c04ad553a2abded0fba22f93b43d02e5da6f6cf73fb4d0a", size = 90214268, upload-time = "2025-08-04T10:18:09.305Z" }, + { url = "https://files.pythonhosted.org/packages/c0/bd/eb18593b43dae42312612ffbac24b8e68149e590102c3b6cc2e3d3792069/nvidia_cuda_nvrtc-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6ccf1ef1b90a0763ac7536f3c17046659d89869d76b98ac358efc2e09b348365", size = 43013627, upload-time = "2025-08-04T10:17:57.338Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime" +version = "13.0.48" 
+source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/3b/c5e5d8aafd355e2ff9922472ba71251331af6cc866e5b04a3b1dc8f58977/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b807c0bb925a307bfa667a24f24d253aef8eda3ac4be66b333f2c9d357557008", size = 2260687, upload-time = "2025-08-04T10:15:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/cc/78/edb119083ca2ff0f09ab0cd597e97775ac3f575b8aa0caf10d68ed49e032/nvidia_cuda_runtime-13.0.48-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b54d12087a1abff81a4cbfa6556876e3afea1fc60da2e0816da374619810c89", size = 2242632, upload-time = "2025-08-04T10:15:49.339Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu13" +version = "9.13.0.50" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/9c/9e99c00dc23db324244ec257d1e84d79539202ee2f185dee2c1fa97c9549/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:33f0aa0b64230101b348648fd0693342188071d3f8a137c0cf50051c24b3584b", size = 412337597, upload-time = "2025-09-04T20:22:31.535Z" }, + { url = "https://files.pythonhosted.org/packages/cf/68/2712854561170b2a81bea7b6b35cc1ae264d9794c0c218986e5c685d45f7/nvidia_cudnn_cu13-9.13.0.50-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:2150b4850725d30653ec3e365f0732e3e2e3eb8633cf3bd2d3117628dea8b4f9", size = 348571624, upload-time = "2025-09-04T20:23:26.544Z" }, +] + +[[package]] +name = "nvidia-cufft" +version = "12.0.0.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9b/e9/4e49b1baf6899e42eeec324a49d7aa2219fec42076327c4e468000dd375a/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1885731254835797572ff075f3daf43a2a0a2801210dea26971940dae7e1a367", size = 214053580, upload-time = "2025-08-04T10:20:45.781Z" }, + { url = "https://files.pythonhosted.org/packages/9b/9f/e298b66e584ad25bd78ad4a45b061fe7bb57a1ec011128089404ce3fcc7d/nvidia_cufft-12.0.0.15-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9f160b1f018e80bcb0d7c0fa50564b042fa26b13edc1b1ff14b6375a9edd2812", size = 214085489, upload-time = "2025-08-04T10:21:02.975Z" }, +] + +[[package]] +name = "nvidia-cufile" +version = "1.15.0.42" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/0a/4adf0c9bb1241cd1314fc923fde00f3749c7fc785b1e3b3f4a104cd3090c/nvidia_cufile-1.15.0.42-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8f9813eff24d61586699c615e39817e2b4e4f642cace32733c2ab6f663a7eab", size = 1223104, upload-time = "2025-08-04T10:21:31.131Z" }, + { url = "https://files.pythonhosted.org/packages/bf/a5/636baa43399ea10d22b63e7454f22a92ace4a7eaa3c45b94607250857e2d/nvidia_cufile-1.15.0.42-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:bced4036b5a8dbf57e4d78cd4fafefec58ad754b784a9eaa272b011896754c62", size = 1136527, upload-time = "2025-08-04T10:21:22.441Z" }, +] + +[[package]] +name = "nvidia-curand" +version = "10.4.0.35" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/72/7c2ae24fb6b63a32e6ae5d241cc65263ea18d08802aaae087d9f013335a2/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:133df5a7509c3e292aaa2b477afd0194f06ce4ea24d714d616ff36439cee349a", size = 61962106, upload-time = "2025-08-04T10:21:41.128Z" }, + { url = 
"https://files.pythonhosted.org/packages/a5/9f/be0a41ca4a4917abf5cb9ae0daff1a6060cc5de950aec0396de9f3b52bc5/nvidia_curand-10.4.0.35-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:1aee33a5da6e1db083fe2b90082def8915f30f3248d5896bcec36a579d941bfc", size = 59544258, upload-time = "2025-08-04T10:22:03.992Z" }, +] + +[[package]] +name = "nvidia-cusolver" +version = "12.0.3.29" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-cusparse", marker = "sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/bb/2e60de9bb1f0c3395eabd91ccad00f4ba3ef736dc9190a158a9d268419f5/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:3bb6e65ce0beaeafdd069b320246e8f17c1cd30ddb27a0539143a3706733a4d8", size = 193104180, upload-time = "2025-08-04T10:22:19.821Z" }, + { url = "https://files.pythonhosted.org/packages/a5/87/e3c9ee227b750e5b61572e7509f586cc8d494a4f7874b5163e734ed852c2/nvidia_cusolver-12.0.3.29-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:6f54c2eed5edab54c224dd1852dde80ba76b2b78e6d3ce7344fef5dfc66d16ab", size = 193474165, upload-time = "2025-08-04T10:22:47.976Z" }, +] + +[[package]] +name = "nvidia-cusparse" +version = "12.6.2.49" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink", marker = "sys_platform != 'darwin'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/30/f32023427f2ef4ec27e8293dfddb5068de566912cd0a45eccfd400017a62/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d3269c19283a0057fb5ebfb003ae2a10c97a28a6958f4238354826b055827c7", size = 155888587, upload-time = "2025-08-04T10:23:04.091Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/e8/b3f7a87cc719dca926c7baee92f2544de8909573a4126c85a9f1625431e8/nvidia_cusparse-12.6.2.49-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efcf0b01e3a0827c144feff5391456b8a06e9ce63dcd51c0943e32e605251952", size = 140247612, upload-time = "2025-08-04T10:23:29.844Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu13" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/10/8dcd1175260706a2fc92a16a52e306b71d4c1ea0b0cc4a9484183399818a/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:400c6ed1cf6780fc6efedd64ec9f1345871767e6a1a0a552a1ea0578117ea77c", size = 220791277, upload-time = "2025-08-13T19:22:40.982Z" }, + { url = "https://files.pythonhosted.org/packages/fd/53/43b0d71f4e702fa9733f8b4571fdca50a8813f1e450b656c239beff12315/nvidia_cusparselt_cu13-0.8.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:25e30a8a7323935d4ad0340b95a0b69926eee755767e8e0b1cf8dd85b197d3fd", size = 169884119, upload-time = "2025-08-13T19:23:41.967Z" }, +] + +[[package]] +name = "nvidia-nccl-cu13" +version = "2.27.7" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/61/2c7762da6febee96341ea17d1f7309ac7559ac3cab00f3f7e1e7bd0e5d00/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5e3cc863e52bf9dd1e3ab1941bddb414098f489ae7342f6b3a274602303da123", size = 194014855, upload-time = "2025-09-23T16:30:27.56Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/dabb10684e60edfaf1a1c9984d12a668bc1091582099d4e03ac5b9983b51/nvidia_nccl_cu13-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b28a524abd8389b76a4a3f133c76a7aaa7005e47fcaa9d9603b90103927a3f93", size = 193901479, upload-time = "2025-09-23T16:30:41.165Z" }, +] + +[[package]] +name = "nvidia-nvjitlink" +version = 
"13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/39/726edebeb76f3efc25c79f885429fa1227c9d200e20ea219bf724b382e19/nvidia_nvjitlink-13.0.39-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:bc3179be558329ef9687884c6faa27cdc0659bdbc642432ec8cc6cc00d182627", size = 40709605, upload-time = "2025-08-04T10:25:04.129Z" }, + { url = "https://files.pythonhosted.org/packages/bc/7a/0fb4c4413b3b14519f8934edd4dcd9f411c4e14e2a2c0ae58709e4dda255/nvidia_nvjitlink-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce0d63fa5ebedf542056e7491c49feed2297c900980aa6269b6a55f478056ad7", size = 38767126, upload-time = "2025-08-04T10:24:53.05Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu13" +version = "3.3.24" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/7e/b8797780e442eabd9046cd6eb54100b8d0cb047ebc2f70931710cb03bcfe/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:28ae82a4d14b322b93409535de62df6b7b83f4f7672ca97fc89107c2d40ce2c2", size = 60168129, upload-time = "2025-08-22T19:56:28.818Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e9/8530afb8ed38d16bbc89cec80a4dd6a52dbf59bc93e546c3658cfa8b1f9b/nvidia_nvshmem_cu13-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c14d09571697d2e57cb079c8daec88ab1c68cb3586532bfbd4886125a08339b7", size = 60390470, upload-time = "2025-08-22T19:56:49.848Z" }, +] + +[[package]] +name = "nvidia-nvtx" +version = "13.0.39" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/37/0d103c84e7884382a79a569b720965141f83dd1c5df9e3e00cbc02d7099c/nvidia_nvtx-13.0.39-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc113127785c96db8a0fe715df92db9788777b4b3d1bd713d42f75969201b5ce", size = 147197, 
upload-time = "2025-08-04T10:18:39.829Z" }, + { url = "https://files.pythonhosted.org/packages/86/91/8b486ba85f71a2859dd705a4ec6aab38c37a389b8b7f94343db027732999/nvidia_nvtx-13.0.39-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cddd2e08b35144f1000631c3880c9ebbcb8a2863d762e76f92d47d30ecaf87cc", size = 148037, upload-time = "2025-08-04T10:18:31.763Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = 
"https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, +] + +[[package]] +name = "partd" +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "locket" }, + { name = "toolz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/3a/3f06f34820a31257ddcabdfafc2672c5816be79c7e353b02c1f318daa7d4/partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c", size = 21029, upload-time = "2024-05-06T19:51:41.945Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, +] + +[[package]] +name = "pikepdf" +version = "10.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "lxml" }, + { name = "packaging" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/ba/7635a5f4259a2a91ed4f094e358dec3068ecedc891d70b8e76a02904ca0c/pikepdf-10.3.0.tar.gz", hash = "sha256:e2a64a5f1ebf8c411193126b9eeff7faf5739a40bce7441e579531422469fbb1", size = 4575749, upload-time = "2026-01-30T07:33:53.317Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/a9/0d2107a3c796ab2fa7d379ee801190c95c4132f0bb5cfc1fd8d2e3ac74af/pikepdf-10.3.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:99fb21d20dc02f9828d477d2c549ee3f6e191801f84a2a2505d21baacb731745", size = 4753016, upload-time = "2026-01-30T07:32:51.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/2b/f634a0956aa15074db6c62309ec3d08bd158ddbdea8bd2081cea8b6eb3ed/pikepdf-10.3.0-cp311-cp311-macosx_15_0_x86_64.whl", hash = "sha256:c8a4b6862d7e0e69dd3f57efd362826966d1f341e0d052f7f23f0fe3a2375a36", size = 5063869, upload-time = "2026-01-30T07:32:54.418Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/d5ba1febacde805e7ec75a3df0888e53212f8e5f82fa1fc09c0fa981c7f9/pikepdf-10.3.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b86d42e66004ffaf5284aae0d9814bb3d19f048a45943479db5ca3d02d46bfb", size = 2445530, upload-time = "2026-01-30T07:32:56.117Z" }, + { url = "https://files.pythonhosted.org/packages/b9/ba/196351a049a7a9d255140a414f586779b3ad77f0d09091e639d9f85c4131/pikepdf-10.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7021b31eddd5aa611f6941a2c171b7ce321c7763263ff658368f5f40bda1d4", size = 2673622, upload-time = "2026-01-30T07:32:57.85Z" }, + { url = "https://files.pythonhosted.org/packages/7c/cf/1315759de9dc66f769f84067da2127046e46489100f6e2be614fcb6c8394/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b653b1d0c5f17efb080ef68b65d3fcc8909f22128b75e0479775a35cd8d9fe6e", size = 3644910, upload-time = "2026-01-30T07:33:00.182Z" }, + { url = "https://files.pythonhosted.org/packages/80/6f/578ee7b53d06267f6c489fb7734792f6fa670a3a7d0b55db20b084e0957d/pikepdf-10.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fa3e4b32a2c1d15bb57e91ee3896c19b3c8145d46c26fbac8747efe7cb5ce3bd", size = 3835871, upload-time = "2026-01-30T07:33:02.804Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0f/980dbfb5ab9231d30e44d9285e8a7509f0871fc6fe438559e1eed16e683d/pikepdf-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:3233da668d665d301a4a4fd1481867e688336fdb410e9bc9d4e5b0cd62e334eb", size = 3756976, upload-time = "2026-01-30T07:33:05.596Z" }, + { url = 
"https://files.pythonhosted.org/packages/f9/22/d6ca7f6066d7f3b61b56bffeca1069c0ded635ba316aa1df54fcc0e2104f/pikepdf-10.3.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:d1a6646def3fc47f763eab0dcb11341a7205cef1b7dc5c62f1dee435a89472b9", size = 4762039, upload-time = "2026-01-30T07:33:08.626Z" }, + { url = "https://files.pythonhosted.org/packages/9c/dc/d0db713a34a493eedf4eded566668762aee5acfad958bdf374a450df931c/pikepdf-10.3.0-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:e968e4e81d6c05d8e4b24594b27a64cb9be3c7a4371bf0635f6b669559171e6b", size = 5078640, upload-time = "2026-01-30T07:33:10.478Z" }, + { url = "https://files.pythonhosted.org/packages/21/c0/e0a1f1afb99ecac5f7f21313b47c174178f85df0f1ec7080e0d431324099/pikepdf-10.3.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dfad0e4e6bc268ca041d639b232d76c25c9ad7023b7189d14869ef4446cabda2", size = 2450284, upload-time = "2026-01-30T07:33:12.215Z" }, + { url = "https://files.pythonhosted.org/packages/db/3a/2f0e8bd70cf57896a85b1d7f7ca3ce79d91a17222e1b23b607860ea52a5d/pikepdf-10.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cf7ab25f1e9063de320d2edecb2cd2960329cc25bac645c7938390f6538d9bf", size = 2699411, upload-time = "2026-01-30T07:33:13.878Z" }, + { url = "https://files.pythonhosted.org/packages/fd/10/da5f244aa14b845cd835f34b6a7a217493952f2532d2e00957ed3bd79aea/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3904353137e5b0cb2a316d84057e1e5301a65e6b1810d4763348ae8919ba20f4", size = 3649524, upload-time = "2026-01-30T07:33:15.641Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ef/3efb78a16d9c702dfd64fdeaee6a1ac6af95c41d4ec60b784e9171f20753/pikepdf-10.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4335ec70a659b5be1dfc7094a67db7f9c017c9c1cf9049b56d0e35ad24a46ff0", size = 3861320, upload-time = "2026-01-30T07:33:17.466Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/63/b0243fe62cf5d4d9da49010a15e0177b9629b8183092b3bd804f59a1529a/pikepdf-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac5befc1e991e28b16be104c219bdb1f6cf62a8371f4019ce7bab64ec5ec5745", size = 3763570, upload-time = "2026-01-30T07:33:19.863Z" }, +] + +[[package]] +name = "pillow" +version = "10.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/62/c9449f9c3043c37f73e7487ec4ef0c03eb9c9afc91a92b977a67b3c0bbc5/pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c", size = 3509265, upload-time = "2024-07-01T09:45:49.812Z" }, + { url = "https://files.pythonhosted.org/packages/f4/5f/491dafc7bbf5a3cc1845dc0430872e8096eb9e2b6f8161509d124594ec2d/pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be", size = 3375655, upload-time = "2024-07-01T09:45:52.462Z" }, + { url = "https://files.pythonhosted.org/packages/73/d5/c4011a76f4207a3c151134cd22a1415741e42fa5ddecec7c0182887deb3d/pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3", size = 4340304, upload-time = "2024-07-01T09:45:55.006Z" }, + { url = "https://files.pythonhosted.org/packages/ac/10/c67e20445a707f7a610699bba4fe050583b688d8cd2d202572b257f46600/pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6", size = 4452804, upload-time = "2024-07-01T09:45:58.437Z" }, + { 
url = "https://files.pythonhosted.org/packages/a9/83/6523837906d1da2b269dee787e31df3b0acb12e3d08f024965a3e7f64665/pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe", size = 4365126, upload-time = "2024-07-01T09:46:00.713Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e5/8c68ff608a4203085158cff5cc2a3c534ec384536d9438c405ed6370d080/pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319", size = 4533541, upload-time = "2024-07-01T09:46:03.235Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7c/01b8dbdca5bc6785573f4cee96e2358b0918b7b2c7b60d8b6f3abf87a070/pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d", size = 4471616, upload-time = "2024-07-01T09:46:05.356Z" }, + { url = "https://files.pythonhosted.org/packages/c8/57/2899b82394a35a0fbfd352e290945440e3b3785655a03365c0ca8279f351/pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696", size = 4600802, upload-time = "2024-07-01T09:46:08.145Z" }, + { url = "https://files.pythonhosted.org/packages/4d/d7/a44f193d4c26e58ee5d2d9db3d4854b2cfb5b5e08d360a5e03fe987c0086/pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496", size = 2235213, upload-time = "2024-07-01T09:46:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/c1/d0/5866318eec2b801cdb8c82abf190c8343d8a1cd8bf5a0c17444a6f268291/pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91", size = 2554498, upload-time = "2024-07-01T09:46:12.685Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/c8/310ac16ac2b97e902d9eb438688de0d961660a87703ad1561fd3dfbd2aa0/pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22", size = 2243219, upload-time = "2024-07-01T09:46:14.83Z" }, + { url = "https://files.pythonhosted.org/packages/05/cb/0353013dc30c02a8be34eb91d25e4e4cf594b59e5a55ea1128fde1e5f8ea/pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94", size = 3509350, upload-time = "2024-07-01T09:46:17.177Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5c558a0f247e0bf9cec92bff9b46ae6474dd736f6d906315e60e4075f737/pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597", size = 3374980, upload-time = "2024-07-01T09:46:19.169Z" }, + { url = "https://files.pythonhosted.org/packages/84/48/6e394b86369a4eb68b8a1382c78dc092245af517385c086c5094e3b34428/pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80", size = 4343799, upload-time = "2024-07-01T09:46:21.883Z" }, + { url = "https://files.pythonhosted.org/packages/3b/f3/a8c6c11fa84b59b9df0cd5694492da8c039a24cd159f0f6918690105c3be/pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca", size = 4459973, upload-time = "2024-07-01T09:46:24.321Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1b/c14b4197b80150fb64453585247e6fb2e1d93761fa0fa9cf63b102fde822/pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef", size = 4370054, upload-time = "2024-07-01T09:46:26.825Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/77/40daddf677897a923d5d33329acd52a2144d54a9644f2a5422c028c6bf2d/pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a", size = 4539484, upload-time = "2024-07-01T09:46:29.355Z" }, + { url = "https://files.pythonhosted.org/packages/40/54/90de3e4256b1207300fb2b1d7168dd912a2fb4b2401e439ba23c2b2cabde/pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b", size = 4477375, upload-time = "2024-07-01T09:46:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/13/24/1bfba52f44193860918ff7c93d03d95e3f8748ca1de3ceaf11157a14cf16/pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9", size = 4608773, upload-time = "2024-07-01T09:46:33.73Z" }, + { url = "https://files.pythonhosted.org/packages/55/04/5e6de6e6120451ec0c24516c41dbaf80cce1b6451f96561235ef2429da2e/pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42", size = 2235690, upload-time = "2024-07-01T09:46:36.587Z" }, + { url = "https://files.pythonhosted.org/packages/74/0a/d4ce3c44bca8635bd29a2eab5aa181b654a734a29b263ca8efe013beea98/pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a", size = 2554951, upload-time = "2024-07-01T09:46:38.777Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/184349ee40f2e92439be9b3502ae6cfc43ac4b50bc4fc6b3de7957563894/pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9", size = 2243427, upload-time = "2024-07-01T09:46:43.15Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" }, + { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" }, + { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" }, + { url = "https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" }, + { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size 
= 202064, upload-time = "2025-10-08T19:46:36.993Z" }, + { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" }, + { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" }, + { url = "https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" }, + { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" }, + { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec17c65562a827bba85e3872ead335f95405ea1674860d96483a02f5c698fa72", size = 225505, upload-time = "2025-10-08T19:46:50.055Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/1c53f4563490b1d06a684742cc6076ef944bc6457df6051b7d1a877c057b/propcache-0.4.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:405aac25c6394ef275dee4c709be43745d36674b223ba4eb7144bf4d691b7367", size = 230242, upload-time = "2025-10-08T19:46:51.815Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/ce4620633b0e2422207c3cb774a0ee61cac13abc6217763a7b9e2e3f4a12/propcache-0.4.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0013cb6f8dde4b2a2f66903b8ba740bdfe378c943c4377a200551ceb27f379e4", size = 238474, upload-time = 
"2025-10-08T19:46:53.208Z" }, + { url = "https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15932ab57837c3368b024473a525e25d316d8353016e7cc0e5ba9eb343fbb1cf", size = 221575, upload-time = "2025-10-08T19:46:54.511Z" }, + { url = "https://files.pythonhosted.org/packages/6e/a5/8a5e8678bcc9d3a1a15b9a29165640d64762d424a16af543f00629c87338/propcache-0.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:031dce78b9dc099f4c29785d9cf5577a3faf9ebf74ecbd3c856a7b92768c3df3", size = 216736, upload-time = "2025-10-08T19:46:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/f1/63/b7b215eddeac83ca1c6b934f89d09a625aa9ee4ba158338854c87210cc36/propcache-0.4.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ab08df6c9a035bee56e31af99be621526bd237bea9f32def431c656b29e41778", size = 213019, upload-time = "2025-10-08T19:46:57.595Z" }, + { url = "https://files.pythonhosted.org/packages/57/74/f580099a58c8af587cac7ba19ee7cb418506342fbbe2d4a4401661cca886/propcache-0.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4d7af63f9f93fe593afbf104c21b3b15868efb2c21d07d8732c0c4287e66b6a6", size = 220376, upload-time = "2025-10-08T19:46:59.067Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ee/542f1313aff7eaf19c2bb758c5d0560d2683dac001a1c96d0774af799843/propcache-0.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:cfc27c945f422e8b5071b6e93169679e4eb5bf73bbcbf1ba3ae3a83d2f78ebd9", size = 226988, upload-time = "2025-10-08T19:47:00.544Z" }, + { url = "https://files.pythonhosted.org/packages/8f/18/9c6b015dd9c6930f6ce2229e1f02fb35298b847f2087ea2b436a5bfa7287/propcache-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:35c3277624a080cc6ec6f847cbbbb5b49affa3598c4535a0a4682a697aaa5c75", size = 215615, upload-time = "2025-10-08T19:47:01.968Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/9e/e7b85720b98c45a45e1fca6a177024934dc9bc5f4d5dd04207f216fc33ed/propcache-0.4.1-cp312-cp312-win32.whl", hash = "sha256:671538c2262dadb5ba6395e26c1731e1d52534bfe9ae56d0b5573ce539266aa8", size = 38066, upload-time = "2025-10-08T19:47:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:cb2d222e72399fcf5890d1d5cc1060857b9b236adff2792ff48ca2dfd46c81db", size = 41655, upload-time = "2025-10-08T19:47:04.973Z" }, + { url = "https://files.pythonhosted.org/packages/68/ab/6b5c191bb5de08036a8c697b265d4ca76148efb10fa162f14af14fb5f076/propcache-0.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:204483131fb222bdaaeeea9f9e6c6ed0cac32731f75dfc1d4a567fc1926477c1", size = 37789, upload-time = "2025-10-08T19:47:06.077Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" }, + { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" }, + { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" }, + { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pymupdf" +version = 
"1.24.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pymupdfb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/57/da06ca4886afc71a624e4b463d05f45c8a822596ede939957295e229eb4e/PyMuPDF-1.24.10.tar.gz", hash = "sha256:bd3ebd6d3fb8a845582098362f885bfb0a31ae4272587efc2c55c5e29fe7327a", size = 46988085, upload-time = "2024-09-02T16:28:45.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/35/6af0bb4bafe9d54893a04d9639f73b1b754efe0235997052d75fb6b7edc1/PyMuPDF-1.24.10-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:5fbd67cce759fc0126902137409cf9da6313b776c4d5ff0d5200f336350f86a3", size = 3194012, upload-time = "2024-09-02T16:27:14.019Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2b/c254cf49dfcf2469a674407a680f5b2b174b866e84d322f5767baf4d3ad3/PyMuPDF-1.24.10-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:2b14dbdf7c415bb0fa849527abbe7b4f1f55ae23b9355d132951f634438c59ac", size = 2974781, upload-time = "2024-09-02T16:27:17.213Z" }, + { url = "https://files.pythonhosted.org/packages/1c/77/78800d3a711f92060f8e338a5df9330ffb5950f4fb3beeba01e15c03c4c6/PyMuPDF-1.24.10-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:1a87440a6cbc0d5ad513425baa0f4747841898fca6e37350ca3e6b29e5f40c01", size = 3210393, upload-time = "2024-09-02T22:17:05.788Z" }, + { url = "https://files.pythonhosted.org/packages/c5/39/3aaa1e8822c55c71bb37911b5b1c3157ef38d731581224b29a682d80a17b/PyMuPDF-1.24.10-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:c0d1ccdc062ea9961063790831e838bc43fcf9a8436a8b9f55898addf97c0f86", size = 3482650, upload-time = "2024-09-02T16:27:21.101Z" }, + { url = "https://files.pythonhosted.org/packages/5b/73/6b5c2dc59539b79cb9430ff946d7dff308af146f7c8bc7b96c963e12970d/PyMuPDF-1.24.10-cp311-none-musllinux_1_2_x86_64.whl", hash = "sha256:f68671363be5a2ba104ab7d3bad821d2994cbe3f3408538bbc27d32e6dc9f923", size = 3600588, upload-time = "2024-09-02T16:27:25.022Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/e9/d3bf062325b4821726a2f9ce9d75b63f594ae24bc38c31f55b4285f1f5e1/PyMuPDF-1.24.10-cp311-none-win32.whl", hash = "sha256:49f83556cd1a7d05b36a54ccc01fce324da8a4e6854e36cc5cd94d321e428565", size = 2694768, upload-time = "2024-09-02T16:27:33.318Z" }, + { url = "https://files.pythonhosted.org/packages/30/3f/356a70c105d4410c29529f1ca8c53b5d176b448a4409238b4dcd133507a4/PyMuPDF-1.24.10-cp311-none-win_amd64.whl", hash = "sha256:05b8d360766b87f4abd186eba16a56b92bae513b2361b13f633fe6256329292e", size = 3214889, upload-time = "2024-09-02T16:27:28.174Z" }, + { url = "https://files.pythonhosted.org/packages/75/84/7231344d98355a40fb57c4025391dfb4116e2c3e9d98d5cc83f80c5ea942/PyMuPDF-1.24.10-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:f323aa7bb55e0214e632bfe24fa140bd5dcfeac2d3977bdce46e760385140513", size = 3230169, upload-time = "2024-09-02T16:27:37.842Z" }, + { url = "https://files.pythonhosted.org/packages/b2/bc/975b4fe4400b00c912dad1874c43d31486150e6f39d7dae758751c27e2dd/PyMuPDF-1.24.10-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:50d2972558d25ce46a8634b58787b28dbeff9b3fe4299530fc9c8c9921061e83", size = 2980118, upload-time = "2024-09-02T16:27:41.534Z" }, + { url = "https://files.pythonhosted.org/packages/5b/dc/0f22c77ac4f8e6b8316072519513d5f0111fffe96d357051db0ddf043032/PyMuPDF-1.24.10-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:0e3969c2fdff682b3b2c6a2b463adde068d6d8e20e2133ef6c8503469259646a", size = 3216830, upload-time = "2024-09-02T22:17:09.193Z" }, + { url = "https://files.pythonhosted.org/packages/a3/1b/1b41b27aab571b835f8d983492b80ed64548e3b5c4d169e23c639727d43b/PyMuPDF-1.24.10-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:cd78ee1ebefdfe72bc36fd4b731cc8c694eb8ef5337d8ea956b0e94cd88751fc", size = 3491118, upload-time = "2024-09-02T16:27:50.098Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/3c/f1ffbc6e13ab37900c2aa71e434bbba922770091242e2b059acdb14f779e/PyMuPDF-1.24.10-cp312-none-musllinux_1_2_x86_64.whl", hash = "sha256:696eed91d2ee44e76277dfeb6bd904c84ae005378588949df6ed9be9e03b9817", size = 3612589, upload-time = "2024-09-02T16:27:54.185Z" }, + { url = "https://files.pythonhosted.org/packages/53/fb/158909af75c84968ea7e6659a75fd67bd462103c599033b23ffd6bc173be/PyMuPDF-1.24.10-cp312-none-win32.whl", hash = "sha256:1e5413e1aeab2f18e1ca1b3ff17057a4a7c5cbf4ff14abc93203da88fc1a1dd8", size = 2701190, upload-time = "2024-09-02T16:27:57.74Z" }, + { url = "https://files.pythonhosted.org/packages/91/4a/4a54d3f6a779ac5eed92e82fe3c1bb426bc40f9ea57c8656839198944a82/PyMuPDF-1.24.10-cp312-none-win_amd64.whl", hash = "sha256:227a4473fce8fa32b9268da68781048795503b67dc045867fc201e1334204bf1", size = 3228084, upload-time = "2024-09-02T16:27:45.749Z" }, +] + +[[package]] +name = "pymupdfb" +version = "1.24.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/ff/ecfcb41414b51976974d74c8e35fef0a0e5b47c7046a11c860553f5dccf0/PyMuPDFb-1.24.10.tar.gz", hash = "sha256:007b91fa9b528c5c0eecea2e49c486ac02e878274f9e31522bdd948adc5f8327", size = 37502, upload-time = "2024-09-02T16:28:48.343Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/94/b217dc987b4ac0e3793984427112d6032563b741e27763f7761c2231d022/PyMuPDFb-1.24.10-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:cd6b24630d90dce9ab3e59d06c5e616686f8d7ec626be1311721fcb062aa0078", size = 15536229, upload-time = "2024-09-02T16:25:19.4Z" }, + { url = "https://files.pythonhosted.org/packages/16/7a/f634c76d8331cb8dedcfaced17424cc469ee20b7f53cf29c9ef17a01b461/PyMuPDFb-1.24.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fda2c34b206f724b1b5685b67188e2a57bcaa5c99bc40a0a5bc62057514c5cdf", size = 15149482, upload-time = "2024-09-02T16:25:34.352Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/97/67b5da2edd034e66dadd0ec530e277afb14fe866a3b3b01d9fad154bc6f8/PyMuPDFb-1.24.10-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4f50a7472f9bb10cbc7a1cd589ee4626ca030b8a4a02749f9a29eb6f00c0e0db", size = 15711338, upload-time = "2024-09-02T22:17:01.592Z" }, + { url = "https://files.pythonhosted.org/packages/62/b9/ad3f076e86328880797fe7e98c43b2879df56cf6cb75ac3058da06d6e6cb/PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:409f1270ef2e70d845e80149ff3db9cfed578274042316cba55cc3e3882421ea", size = 15921939, upload-time = "2024-09-02T16:26:00.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/e7/02160ea905a7ba16d6e1ca51759ae1c1045785ebebae57ba30e82617f934/PyMuPDFb-1.24.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:aca96b6e9ee3096a26810592f4d899f4d3cf3cf0c902ae7e8cca09bce4d946c4", size = 17076991, upload-time = "2024-09-02T16:25:46.703Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c0/e1ed840440131f71b068cdb3b620a69ec27543b1012a6bd855d8d05f1629/PyMuPDFb-1.24.10-py3-none-win32.whl", hash = "sha256:2d231b42fe3bf79837df235e7fbdf7ff8b46bf4ca1346d0f0124fb1cdd343ce8", size = 11731706, upload-time = "2024-09-02T16:26:19.131Z" }, + { url = "https://files.pythonhosted.org/packages/70/cb/8459d6c179befd7c6eee555334f054e9a6dcdd9f8671891e1da19e0ce526/PyMuPDFb-1.24.10-py3-none-win_amd64.whl", hash = "sha256:27ea65c701608b6b7632703339ca33ea6d513843b26dbe9bdefb2f56f7b9b196", size = 13186168, upload-time = "2024-09-02T16:26:10.503Z" }, +] + +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform 
== 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "pytz" +version = "2026.1.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = 
"2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, +] + +[[package]] +name = "regex" +version = "2026.2.28" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = "sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/db/8cbfd0ba3f302f2d09dd0019a9fcab74b63fee77a76c937d0e33161fb8c1/regex-2026.2.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e621fb7c8dc147419b28e1702f58a0177ff8308a76fa295c71f3e7827849f5d9", size = 488462, upload-time = "2026-02-28T02:16:22.616Z" }, + { url = "https://files.pythonhosted.org/packages/5d/10/ccc22c52802223f2368731964ddd117799e1390ffc39dbb31634a83022ee/regex-2026.2.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0d5bef2031cbf38757a0b0bc4298bb4824b6332d28edc16b39247228fbdbad97", size = 290774, upload-time = "2026-02-28T02:16:23.993Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/b9/6796b3bf3101e64117201aaa3a5a030ec677ecf34b3cd6141b5d5c6c67d5/regex-2026.2.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcb399ed84eabf4282587ba151f2732ad8168e66f1d3f85b1d038868fe547703", size = 288724, upload-time = "2026-02-28T02:16:25.403Z" }, + { url = "https://files.pythonhosted.org/packages/9c/02/291c0ae3f3a10cea941d0f5366da1843d8d1fa8a25b0671e20a0e454bb38/regex-2026.2.28-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1b34dfa72f826f535b20712afa9bb3ba580020e834f3c69866c5bddbf10098", size = 791924, upload-time = "2026-02-28T02:16:26.863Z" }, + { url = "https://files.pythonhosted.org/packages/0f/57/f0235cc520d9672742196c5c15098f8f703f2758d48d5a7465a56333e496/regex-2026.2.28-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:851fa70df44325e1e4cdb79c5e676e91a78147b1b543db2aec8734d2add30ec2", size = 860095, upload-time = "2026-02-28T02:16:28.772Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7c/393c94cbedda79a0f5f2435ebd01644aba0b338d327eb24b4aa5b8d6c07f/regex-2026.2.28-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:516604edd17b1c2c3e579cf4e9b25a53bf8fa6e7cedddf1127804d3e0140ca64", size = 906583, upload-time = "2026-02-28T02:16:30.977Z" }, + { url = "https://files.pythonhosted.org/packages/2c/73/a72820f47ca5abf2b5d911d0407ba5178fc52cf9780191ed3a54f5f419a2/regex-2026.2.28-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e7ce83654d1ab701cb619285a18a8e5a889c1216d746ddc710c914ca5fd71022", size = 800234, upload-time = "2026-02-28T02:16:32.55Z" }, + { url = "https://files.pythonhosted.org/packages/34/b3/6e6a4b7b31fa998c4cf159a12cbeaf356386fbd1a8be743b1e80a3da51e4/regex-2026.2.28-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f2791948f7c70bb9335a9102df45e93d428f4b8128020d85920223925d73b9e1", size = 772803, upload-time = "2026-02-28T02:16:34.029Z" }, + { url = "https://files.pythonhosted.org/packages/10/e7/5da0280c765d5a92af5e1cd324b3fe8464303189cbaa449de9a71910e273/regex-2026.2.28-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:03a83cc26aa2acda6b8b9dfe748cf9e84cbd390c424a1de34fdcef58961a297a", size = 781117, upload-time = "2026-02-28T02:16:36.253Z" }, + { url = "https://files.pythonhosted.org/packages/76/39/0b8d7efb256ae34e1b8157acc1afd8758048a1cf0196e1aec2e71fd99f4b/regex-2026.2.28-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ec6f5674c5dc836994f50f1186dd1fafde4be0666aae201ae2fcc3d29d8adf27", size = 854224, upload-time = "2026-02-28T02:16:38.119Z" }, + { url = "https://files.pythonhosted.org/packages/21/ff/a96d483ebe8fe6d1c67907729202313895d8de8495569ec319c6f29d0438/regex-2026.2.28-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:50c2fc924749543e0eacc93ada6aeeb3ea5f6715825624baa0dccaec771668ae", size = 761898, upload-time = "2026-02-28T02:16:40.333Z" }, + { url = "https://files.pythonhosted.org/packages/89/bd/d4f2e75cb4a54b484e796017e37c0d09d8a0a837de43d17e238adf163f4e/regex-2026.2.28-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ba55c50f408fb5c346a3a02d2ce0ebc839784e24f7c9684fde328ff063c3cdea", size = 844832, upload-time = "2026-02-28T02:16:41.875Z" }, + { url = "https://files.pythonhosted.org/packages/8a/a7/428a135cf5e15e4e11d1e696eb2bf968362f8ea8a5f237122e96bc2ae950/regex-2026.2.28-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edb1b1b3a5576c56f08ac46f108c40333f222ebfd5cf63afdfa3aab0791ebe5b", size = 788347, upload-time = "2026-02-28T02:16:43.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/59/68691428851cf9c9c3707217ab1d9b47cfeec9d153a49919e6c368b9e926/regex-2026.2.28-cp311-cp311-win32.whl", hash = "sha256:948c12ef30ecedb128903c2c2678b339746eb7c689c5c21957c4a23950c96d15", size = 266033, upload-time = 
"2026-02-28T02:16:45.094Z" }, + { url = "https://files.pythonhosted.org/packages/42/8b/1483de1c57024e89296cbcceb9cccb3f625d416ddb46e570be185c9b05a9/regex-2026.2.28-cp311-cp311-win_amd64.whl", hash = "sha256:fd63453f10d29097cc3dc62d070746523973fb5aa1c66d25f8558bebd47fed61", size = 277978, upload-time = "2026-02-28T02:16:46.75Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/abec45dc6e7252e3dbc797120496e43bb5730a7abf0d9cb69340696a2f2d/regex-2026.2.28-cp311-cp311-win_arm64.whl", hash = "sha256:00f2b8d9615aa165fdff0a13f1a92049bfad555ee91e20d246a51aa0b556c60a", size = 270340, upload-time = "2026-02-28T02:16:48.626Z" }, + { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" }, + { url = "https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" }, + { url = "https://files.pythonhosted.org/packages/9e/06/3ef1ac6910dc3295ebd71b1f9bfa737e82cfead211a18b319d45f85ddd09/regex-2026.2.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b65d33a17101569f86d9c5966a8b1d7fbf8afdda5a8aa219301b0a80f58cf7d", size = 289200, upload-time = "2026-02-28T02:16:54.08Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c9/8cc8d850b35ab5650ff6756a1cb85286e2000b66c97520b29c1587455344/regex-2026.2.28-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e71dcecaa113eebcc96622c17692672c2d104b1d71ddf7adeda90da7ddeb26fc", size = 796765, upload-time = "2026-02-28T02:16:55.905Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/5d/57702597627fc23278ebf36fbb497ac91c0ce7fec89ac6c81e420ca3e38c/regex-2026.2.28-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:481df4623fa4969c8b11f3433ed7d5e3dc9cec0f008356c3212b3933fb77e3d8", size = 863093, upload-time = "2026-02-28T02:16:58.094Z" }, + { url = "https://files.pythonhosted.org/packages/02/6d/f3ecad537ca2811b4d26b54ca848cf70e04fcfc138667c146a9f3157779c/regex-2026.2.28-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:64e7c6ad614573e0640f271e811a408d79a9e1fe62a46adb602f598df42a818d", size = 909455, upload-time = "2026-02-28T02:17:00.918Z" }, + { url = "https://files.pythonhosted.org/packages/9e/40/bb226f203caa22c1043c1ca79b36340156eca0f6a6742b46c3bb222a3a57/regex-2026.2.28-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6b08a06976ff4fb0d83077022fde3eca06c55432bb997d8c0495b9a4e9872f4", size = 802037, upload-time = "2026-02-28T02:17:02.842Z" }, + { url = "https://files.pythonhosted.org/packages/44/7c/c6d91d8911ac6803b45ca968e8e500c46934e58c0903cbc6d760ee817a0a/regex-2026.2.28-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:864cdd1a2ef5716b0ab468af40139e62ede1b3a53386b375ec0786bb6783fc05", size = 775113, upload-time = "2026-02-28T02:17:04.506Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8d/4a9368d168d47abd4158580b8c848709667b1cd293ff0c0c277279543bd0/regex-2026.2.28-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:511f7419f7afab475fd4d639d4aedfc54205bcb0800066753ef68a59f0f330b5", size = 784194, upload-time = "2026-02-28T02:17:06.888Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bf/2c72ab5d8b7be462cb1651b5cc333da1d0068740342f350fcca3bca31947/regex-2026.2.28-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b42f7466e32bf15a961cf09f35fa6323cc72e64d3d2c990b10de1274a5da0a59", size = 856846, 
upload-time = "2026-02-28T02:17:09.11Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f4/6b65c979bb6d09f51bb2d2a7bc85de73c01ec73335d7ddd202dcb8cd1c8f/regex-2026.2.28-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8710d61737b0c0ce6836b1da7109f20d495e49b3809f30e27e9560be67a257bf", size = 763516, upload-time = "2026-02-28T02:17:11.004Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/29ea5e27400ee86d2cc2b4e80aa059df04eaf78b4f0c18576ae077aeff68/regex-2026.2.28-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4390c365fd2d45278f45afd4673cb90f7285f5701607e3ad4274df08e36140ae", size = 849278, upload-time = "2026-02-28T02:17:12.693Z" }, + { url = "https://files.pythonhosted.org/packages/1d/91/3233d03b5f865111cd517e1c95ee8b43e8b428d61fa73764a80c9bb6f537/regex-2026.2.28-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb3b1db8ff6c7b8bf838ab05583ea15230cb2f678e569ab0e3a24d1e8320940b", size = 790068, upload-time = "2026-02-28T02:17:14.9Z" }, + { url = "https://files.pythonhosted.org/packages/76/92/abc706c1fb03b4580a09645b206a3fc032f5a9f457bc1a8038ac555658ab/regex-2026.2.28-cp312-cp312-win32.whl", hash = "sha256:f8ed9a5d4612df9d4de15878f0bc6aa7a268afbe5af21a3fdd97fa19516e978c", size = 266416, upload-time = "2026-02-28T02:17:17.15Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/2a6f7dff190e5fa9df9fb4acf2fdf17a1aa0f7f54596cba8de608db56b3a/regex-2026.2.28-cp312-cp312-win_amd64.whl", hash = "sha256:01d65fd24206c8e1e97e2e31b286c59009636c022eb5d003f52760b0f42155d4", size = 277297, upload-time = "2026-02-28T02:17:18.723Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f0/58a2484851fadf284458fdbd728f580d55c1abac059ae9f048c63b92f427/regex-2026.2.28-cp312-cp312-win_arm64.whl", hash = "sha256:c0b5ccbb8ffb433939d248707d4a8b31993cb76ab1a0187ca886bf50e96df952", size = 270408, upload-time = "2026-02-28T02:17:20.328Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "safetensors" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = 
"sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, +] + +[[package]] +name = "scikit-learn" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e", size = 7068312, upload-time = "2025-01-10T08:07:55.348Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/2a/e291c29670795406a824567d1dfc91db7b699799a002fdaa452bceea8f6e/scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33", size = 12102620, upload-time = "2025-01-10T08:06:16.675Z" }, + { url = "https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d", size = 11116234, upload-time = "2025-01-10T08:06:21.83Z" }, + { url = "https://files.pythonhosted.org/packages/30/cd/ed4399485ef364bb25f388ab438e3724e60dc218c547a407b6e90ccccaef/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2", size = 12592155, upload-time = "2025-01-10T08:06:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/62fc9a5a659bb58a03cdd7e258956a5824bdc9b4bb3c5d932f55880be569/scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8", size = 13497069, upload-time = "2025-01-10T08:06:32.515Z" }, + { url = "https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415", size = 11139809, upload-time = "2025-01-10T08:06:35.514Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/18/c797c9b8c10380d05616db3bfb48e2a3358c767affd0857d56c2eb501caa/scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b", size = 12104516, upload-time = "2025-01-10T08:06:40.009Z" }, + { url = "https://files.pythonhosted.org/packages/c4/b7/2e35f8e289ab70108f8cbb2e7a2208f0575dc704749721286519dcf35f6f/scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2", size = 11167837, upload-time = "2025-01-10T08:06:43.305Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f6/ff7beaeb644bcad72bcfd5a03ff36d32ee4e53a8b29a639f11bcb65d06cd/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f", size = 12253728, upload-time = "2025-01-10T08:06:47.618Z" }, + { url = "https://files.pythonhosted.org/packages/29/7a/8bce8968883e9465de20be15542f4c7e221952441727c4dad24d534c6d99/scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86", size = 13147700, upload-time = "2025-01-10T08:06:50.888Z" }, + { url = "https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52", size = 11110613, upload-time = "2025-01-10T08:06:54.115Z" }, +] + +[[package]] +name = "scipy" +version = "1.17.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 
30573822, upload-time = "2026-02-23T00:26:24.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" }, + { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" }, + { url = "https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" }, + { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" }, + { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" }, + { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" }, + { url = "https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" }, + { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" }, + { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" }, + { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017, upload-time = "2026-02-23T00:18:21.502Z" }, + { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842, upload-time = "2026-02-23T00:18:35.367Z" }, + { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890, upload-time = "2026-02-23T00:18:49.188Z" }, + { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557, upload-time = "2026-02-23T00:18:54.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856, upload-time = "2026-02-23T00:19:00.307Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682, upload-time = "2026-02-23T00:19:07.67Z" }, + { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340, upload-time = "2026-02-23T00:19:12.024Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/82/f3/748f4d6f65d1756b9ae577f329c951cda23fb900e4de9f70900ced962085/setuptools-82.0.0.tar.gz", hash = "sha256:22e0a2d69474c6ae4feb01951cb69d515ed23728cf96d05513d36e42b62b37cb", size = 1144893, upload-time = "2026-02-08T15:08:40.206Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/c6/76dc613121b793286a3f91621d7b75a2b493e0390ddca50f11993eadf192/setuptools-82.0.0-py3-none-any.whl", hash = "sha256:70b18734b607bd1da571d097d236cfcfacaf01de45717d59e6e04b96877532e0", size = 1003468, upload-time = "2026-02-08T15:08:38.723Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] 
+ +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "threadpoolctl" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, +] + +[[package]] +name = "tokenizers" +version = "0.20.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/25/b1681c1c30ea3ea6e584ae3fffd552430b12faa599b558c4c4783f56d7ff/tokenizers-0.20.3.tar.gz", hash = "sha256:2278b34c5d0dd78e087e1ca7f9b1dcbf129d80211afa645f214bd6e051037539", size = 340513, upload-time = "2024-11-05T17:34:10.403Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/93/6742ef9206409d5ce1fdf44d5ca1687cdc3847ba0485424e2c731e6bcf67/tokenizers-0.20.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:585b51e06ca1f4839ce7759941e66766d7b060dccfdc57c4ca1e5b9a33013a90", size = 2674224, upload-time = "2024-11-05T17:30:49.972Z" }, + { url = "https://files.pythonhosted.org/packages/aa/14/e75ece72e99f6ef9ae07777ca9fdd78608f69466a5cecf636e9bd2f25d5c/tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:61cbf11954f3b481d08723ebd048ba4b11e582986f9be74d2c3bdd9293a4538d", size = 2558991, upload-time = "2024-11-05T17:30:51.666Z" }, + { url = "https://files.pythonhosted.org/packages/46/54/033b5b2ba0c3ae01e026c6f7ced147d41a2fa1c573d00a66cb97f6d7f9b3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef820880d5e4e8484e2fa54ff8d297bb32519eaa7815694dc835ace9130a3eea", size = 2892476, upload-time = "2024-11-05T17:30:53.505Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/cc369fb3297d61f3311cab523d16d48c869dc2f0ba32985dbf03ff811041/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:67ef4dcb8841a4988cd00dd288fb95dfc8e22ed021f01f37348fd51c2b055ba9", size = 2802775, upload-time = "2024-11-05T17:30:55.229Z" }, + { url = "https://files.pythonhosted.org/packages/1a/74/62ad983e8ea6a63e04ed9c5be0b605056bf8aac2f0125f9b5e0b3e2b89fa/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb", size = 3086138, upload-time = "2024-11-05T17:30:57.332Z" }, + { url = "https://files.pythonhosted.org/packages/6b/ac/4637ba619db25094998523f9e6f5b456e1db1f8faa770a3d925d436db0c3/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:444d188186eab3148baf0615b522461b41b1f0cd58cd57b862ec94b6ac9780f1", size = 3098076, upload-time = "2024-11-05T17:30:59.455Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/9793f2dc2ce529369807c9c74e42722b05034af411d60f5730b720388c7d/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37c04c032c1442740b2c2d925f1857885c07619224a533123ac7ea71ca5713da", size = 3379650, upload-time = "2024-11-05T17:31:01.264Z" }, + { url = "https://files.pythonhosted.org/packages/50/f6/2841de926bc4118af996eaf0bdf0ea5b012245044766ffc0347e6c968e63/tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453c7769d22231960ee0e883d1005c93c68015025a5e4ae56275406d94a3c907", size = 2994005, upload-time = "2024-11-05T17:31:02.985Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b2/00915c4fed08e9505d37cf6eaab45b12b4bff8f6719d459abcb9ead86a4b/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4bb31f7b2847e439766aaa9cc7bccf7ac7088052deccdb2275c952d96f691c6a", size = 8977488, upload-time = "2024-11-05T17:31:04.424Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ac/1c069e7808181ff57bcf2d39e9b6fbee9133a55410e6ebdaa89f67c32e83/tokenizers-0.20.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:843729bf0f991b29655a069a2ff58a4c24375a553c70955e15e37a90dd4e045c", size = 9294935, upload-time = "2024-11-05T17:31:06.882Z" }, + { url = "https://files.pythonhosted.org/packages/50/47/722feb70ee68d1c4412b12d0ea4acc2713179fd63f054913990f9e259492/tokenizers-0.20.3-cp311-none-win32.whl", hash = "sha256:efcce3a927b1e20ca694ba13f7a68c59b0bd859ef71e441db68ee42cf20c2442", size = 2197175, upload-time = "2024-11-05T17:31:09.385Z" }, + { url = "https://files.pythonhosted.org/packages/75/68/1b4f928b15a36ed278332ac75d66d7eb65d865bf344d049c452c18447bf9/tokenizers-0.20.3-cp311-none-win_amd64.whl", hash = "sha256:88301aa0801f225725b6df5dea3d77c80365ff2362ca7e252583f2b4809c4cc0", size = 2381616, upload-time = "2024-11-05T17:31:10.685Z" }, + { url = "https://files.pythonhosted.org/packages/07/00/92a08af2a6b0c88c50f1ab47d7189e695722ad9714b0ee78ea5e1e2e1def/tokenizers-0.20.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:49d12a32e190fad0e79e5bdb788d05da2f20d8e006b13a70859ac47fecf6ab2f", size = 2667951, upload-time = "2024-11-05T17:31:12.356Z" }, + { url = "https://files.pythonhosted.org/packages/ec/9a/e17a352f0bffbf415cf7d73756f5c73a3219225fc5957bc2f39d52c61684/tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:282848cacfb9c06d5e51489f38ec5aa0b3cd1e247a023061945f71f41d949d73", size = 2555167, upload-time = "2024-11-05T17:31:13.839Z" }, + { url = "https://files.pythonhosted.org/packages/27/37/d108df55daf4f0fcf1f58554692ff71687c273d870a34693066f0847be96/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe4e08c7d0cd6154c795deb5bf81d2122f36daf075e0c12a8b050d824ef0a64", size = 2898389, upload-time = "2024-11-05T17:31:15.12Z" }, + { url = "https://files.pythonhosted.org/packages/b2/27/32f29da16d28f59472fa7fb38e7782069748c7e9ab9854522db20341624c/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca94fc1b73b3883c98f0c88c77700b13d55b49f1071dfd57df2b06f3ff7afd64", size = 
2795866, upload-time = "2024-11-05T17:31:16.857Z" }, + { url = "https://files.pythonhosted.org/packages/29/4e/8a9a3c89e128c4a40f247b501c10279d2d7ade685953407c4d94c8c0f7a7/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef279c7e239f95c8bdd6ff319d9870f30f0d24915b04895f55b1adcf96d6c60d", size = 3085446, upload-time = "2024-11-05T17:31:18.392Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3b/a2a7962c496ebcd95860ca99e423254f760f382cd4bd376f8895783afaf5/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16384073973f6ccbde9852157a4fdfe632bb65208139c9d0c0bd0176a71fd67f", size = 3094378, upload-time = "2024-11-05T17:31:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f4/a8a33f0192a1629a3bd0afcad17d4d221bbf9276da4b95d226364208d5eb/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:312d522caeb8a1a42ebdec87118d99b22667782b67898a76c963c058a7e41d4f", size = 3385755, upload-time = "2024-11-05T17:31:21.778Z" }, + { url = "https://files.pythonhosted.org/packages/9e/65/c83cb3545a65a9eaa2e13b22c93d5e00bd7624b354a44adbdc93d5d9bd91/tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b7cb962564785a83dafbba0144ecb7f579f1d57d8c406cdaa7f32fe32f18ad", size = 2997679, upload-time = "2024-11-05T17:31:23.134Z" }, + { url = "https://files.pythonhosted.org/packages/55/e9/a80d4e592307688a67c7c59ab77e03687b6a8bd92eb5db763a2c80f93f57/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:124c5882ebb88dadae1fc788a582299fcd3a8bd84fc3e260b9918cf28b8751f5", size = 8989296, upload-time = "2024-11-05T17:31:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/60c957af8d2244321124e893828f1a4817cde1a2d08d09d423b73f19bd2f/tokenizers-0.20.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2b6e54e71f84c4202111a489879005cb14b92616a87417f6c102c833af961ea2", size = 9303621, 
upload-time = "2024-11-05T17:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/be/a9/96172310ee141009646d63a1ca267c099c462d747fe5ef7e33f74e27a683/tokenizers-0.20.3-cp312-none-win32.whl", hash = "sha256:83d9bfbe9af86f2d9df4833c22e94d94750f1d0cd9bfb22a7bb90a86f61cdb1c", size = 2188979, upload-time = "2024-11-05T17:31:29.483Z" }, + { url = "https://files.pythonhosted.org/packages/bd/68/61d85ae7ae96dde7d0974ff3538db75d5cdc29be2e4329cd7fc51a283e22/tokenizers-0.20.3-cp312-none-win_amd64.whl", hash = "sha256:44def74cee574d609a36e17c8914311d1b5dbcfe37c55fd29369d42591b91cf2", size = 2380725, upload-time = "2024-11-05T17:31:31.315Z" }, +] + +[[package]] +name = "toolz" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/d6/114b492226588d6ff54579d95847662fc69196bdeec318eb45393b24c192/toolz-1.1.0.tar.gz", hash = "sha256:27a5c770d068c110d9ed9323f24f1543e83b2f300a687b7891c1a6d56b697b5b", size = 52613, upload-time = "2025-10-17T04:03:21.661Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/12/5911ae3eeec47800503a238d971e51722ccea5feb8569b735184d5fcdbc0/toolz-1.1.0-py3-none-any.whl", hash = "sha256:15ccc861ac51c53696de0a5d6d4607f99c210739caf987b5d2054f3efed429d8", size = 58093, upload-time = "2025-10-17T04:03:20.435Z" }, +] + +[[package]] +name = "torch" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx" }, + { name = "nvidia-cublas", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft", marker = "sys_platform == 'linux'" }, + { 
name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fd6c7d297e21758a7fa07624f2b5bb15607ee3b1dcc52519e8e796c6d4fcf960" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f40778951ca1533dc634b3842392641fa0b641181ff2f71d62728ef33cc36a5c" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:8db2814e63f2b365bda88526587ca75a6083a0b957a24b2b0d45ddc5ee350176" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e7f84cb10c7e7d9f862c318f056d64840544ab4f0bcbf8cf7ed6047fe04051f" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e70e1b18881e6b3c1ce402d0a989da39f956a3a057526e03c354df23d704ce9b" }, + { url = "https://download.pytorch.org/whl/cu130/torch-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:cd3232a562ad2a2699d48130255e1b24c07dfe694a40dcd24fad683c752de121" }, +] + +[[package]] +name = "torchaudio" 
+version = "2.9.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:493421d061375074ce84840ca619605f625892e16dead63ec97181ef02da3357" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b3c75f87e325946276c952864dbce2c8fabc88a00d86730c3d5bc0999ebf7789" }, +] + +[[package]] +name = "torchaudio" +version = "2.9.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1023bb6598fa6312e1990fdc78660f4b4ef128d8942a1f10c5827aea23d6bd7e" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:817e2660d35a3c9a2638dd80d63c7a488cbbe87446ddbb564a5cf88b9de632f7" }, 
+ { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6c58d5e846da5a90d50bd425e2c24368747cd04297d95c6dd51d3f7f85fea26" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchaudio-2.9.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7533a17bed21e5b86b8c49fd79656779779f2c991aef2804af6f318d2022ea6a" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "numpy", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "pillow", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d4ba2532440a93c23a99c41423a765a0cdd47556afa3acf7c318dd1d3d6793e9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:89743dcee13e943f58b37c7647aff14b5bb24c11c84826376d457acf97586fec" }, +] + +[[package]] +name = "torchvision" +version = "0.24.1+cu130" +source = { registry = "https://download.pytorch.org/whl/cu130" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'darwin'", + "(python_full_version >= '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform 
== 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", +] +dependencies = [ + { name = "numpy", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "pillow", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, + { name = "torch", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b0cc84c57c1fd54644698a70a74d1ea1eddfa44ee2df3354b7bb2c619a5d2923" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp311-cp311-win_amd64.whl", hash = "sha256:f564b9fdbc336ac187780931331fb4253f8511deae914dde12dca5bf17b3045f" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6939dd403cc28ab0a46f53e6c86e2e852cf65771c1b0ddd09c44c541a1cdbad9" }, + { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.24.1%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:d31ceaded0d9b737471fa680ccd9e1acb6d5f0f70f03ef3a8d786a99c79da7cf" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] 
+name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944, upload-time = "2024-11-18T22:13:01.012Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536, upload-time = "2024-11-18T22:12:57.024Z" }, +] + +[[package]] +name = "triton" +version = "3.5.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" }, + { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" }, + { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" }, + { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] + +[[package]] +name = "wrapt" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/81/60c4471fce95afa5922ca09b88a25f03c93343f759aae0f31fb4412a85c7/wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb", size = 60666, upload-time = 
"2026-03-06T02:52:58.934Z" }, + { url = "https://files.pythonhosted.org/packages/6b/be/80e80e39e7cb90b006a0eaf11c73ac3a62bbfb3068469aec15cc0bc795de/wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d", size = 61601, upload-time = "2026-03-06T02:53:00.487Z" }, + { url = "https://files.pythonhosted.org/packages/b0/be/d7c88cd9293c859fc74b232abdc65a229bb953997995d6912fc85af18323/wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894", size = 114057, upload-time = "2026-03-06T02:52:44.08Z" }, + { url = "https://files.pythonhosted.org/packages/ea/25/36c04602831a4d685d45a93b3abea61eca7fe35dab6c842d6f5d570ef94a/wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842", size = 116099, upload-time = "2026-03-06T02:54:56.74Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4e/98a6eb417ef551dc277bec1253d5246b25003cf36fdf3913b65cb7657a56/wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8", size = 112457, upload-time = "2026-03-06T02:53:52.842Z" }, + { url = "https://files.pythonhosted.org/packages/cb/a6/a6f7186a5297cad8ec53fd7578533b28f795fdf5372368c74bd7e6e9841c/wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6", size = 115351, upload-time = "2026-03-06T02:53:32.684Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/06e66189e721dbebd5cf20e138acc4d1150288ce118462f2fcbff92d38db/wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9", size = 111748, upload-time = "2026-03-06T02:53:08.455Z" }, + 
{ url = "https://files.pythonhosted.org/packages/ef/43/4808b86f499a51370fbdbdfa6cb91e9b9169e762716456471b619fca7a70/wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15", size = 113783, upload-time = "2026-03-06T02:53:02.02Z" }, + { url = "https://files.pythonhosted.org/packages/91/2c/a3f28b8fa7ac2cefa01cfcaca3471f9b0460608d012b693998cd61ef43df/wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b", size = 57977, upload-time = "2026-03-06T02:53:27.844Z" }, + { url = "https://files.pythonhosted.org/packages/3f/c3/2b1c7bd07a27b1db885a2fab469b707bdd35bddf30a113b4917a7e2139d2/wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1", size = 60336, upload-time = "2026-03-06T02:54:28.104Z" }, + { url = "https://files.pythonhosted.org/packages/ec/5c/76ece7b401b088daa6503d6264dd80f9a727df3e6042802de9a223084ea2/wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a", size = 58756, upload-time = "2026-03-06T02:53:16.319Z" }, + { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255, upload-time = "2026-03-06T02:52:45.663Z" }, + { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848, upload-time = "2026-03-06T02:53:48.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433, upload-time = "2026-03-06T02:54:40.328Z" }, + { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013, upload-time = "2026-03-06T02:53:26.58Z" }, + { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326, upload-time = "2026-03-06T02:53:11.547Z" }, + { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444, upload-time = "2026-03-06T02:54:09.5Z" }, + { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237, upload-time = "2026-03-06T02:54:03.884Z" }, + { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563, upload-time = "2026-03-06T02:53:20.412Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198, upload-time = "2026-03-06T02:53:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441, upload-time = "2026-03-06T02:52:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836, upload-time = "2026-03-06T02:53:22.053Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, +] + +[[package]] +name = "yarl" +version = "1.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" }, + { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" }, + { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" }, + { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = "2026-03-01T22:04:48.925Z" }, + { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = 
"2026-03-01T22:04:52.499Z" }, + { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" }, + { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" }, + { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 93859, upload-time = "2026-03-01T22:05:00.268Z" }, + { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" }, + { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" }, + { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" }, + { url = "https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" }, + { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" }, + { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587, upload-time = "2026-03-01T22:05:17.384Z" }, + { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528, upload-time = "2026-03-01T22:05:18.804Z" }, + { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339, upload-time = "2026-03-01T22:05:20.235Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061, upload-time = "2026-03-01T22:05:22.268Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132, upload-time = "2026-03-01T22:05:23.638Z" }, + { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289, upload-time = "2026-03-01T22:05:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950, upload-time = "2026-03-01T22:05:27.318Z" }, + { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960, upload-time = "2026-03-01T22:05:28.738Z" }, + { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703, upload-time = "2026-03-01T22:05:30.438Z" }, + { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325, upload-time = "2026-03-01T22:05:31.835Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067, upload-time = "2026-03-01T22:05:33.358Z" }, + { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285, upload-time = "2026-03-01T22:05:35.4Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359, upload-time = "2026-03-01T22:05:36.811Z" }, + { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674, upload-time = "2026-03-01T22:05:38.171Z" }, + { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879, upload-time = "2026-03-01T22:05:40.006Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288, upload-time = "2026-03-01T22:07:51.388Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" }, + { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" }, + { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = "2025-09-14T22:16:37.141Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" }, + { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" }, + { url = "https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" }, + { url = 
"https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" }, + { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" }, + { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = 
"https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, +] diff --git a/dependency_setup/dependency_notes.md b/dependency_setup/dependency_notes.md index b85460e..e0f7707 100644 --- a/dependency_setup/dependency_notes.md +++ b/dependency_setup/dependency_notes.md @@ -1,67 +1,57 @@ # GlossAPI Dependency Profiles & Test Notes ## Environment Profiles -- **Vanilla** – core GlossAPI pipeline without GPU OCR add-ons. Uses `requirements-glossapi-vanilla.txt`. -- **RapidOCR** – Docling + RapidOCR GPU stack. Builds on vanilla requirements and adds ONNX runtime (`requirements-glossapi-rapidocr.txt`). -- **DeepSeek** – GPU OCR via DeepSeek/vLLM. Extends vanilla requirements with torch/cu128, nightly vLLM and supporting CUDA libs (`requirements-glossapi-deepseek.txt`). `xformers` was dropped because the published wheels still pin Torch 2.8; the rest of the stack now installs cleanly on Torch 2.9. +- **Docling** – main GlossAPI environment for extraction, cleaning, sectioning, annotation, and math/code enrichment. Uses `requirements-glossapi-docling.txt`. +- **DeepSeek** – dedicated OCR runtime managed with `uv`. Pins the tested Torch/Transformers stack in `dependency_setup/deepseek_uv/pyproject.toml` and intentionally excludes the Docling layout stack. 
-Each profile is installed through `dependency_setup/setup_glossapi.sh`: +Recommended installation commands: ```bash -# Examples (venv path optional) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests -./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests ``` Key flags: -- `--download-deepseek` optionally fetches DeepSeek weights (skipped by default; set `--weights-dir` if they live elsewhere). +- `--download-model` optionally fetches DeepSeek weights (set `--model-root` if they live elsewhere). - `--smoke-test` (DeepSeek only) runs `dependency_setup/deepseek_gpu_smoke.py`. ## Test Segmentation Pytest markers were added so suites can be run per profile: -- `rapidocr` – GPU Docling/RapidOCR integration tests. - `deepseek` – DeepSeek execution paths. -- Unmarked tests cover the vanilla footprint. +- Unmarked tests cover the Docling/core footprint. -`setup_glossapi.sh` now chooses marker expressions automatically: +Suggested commands: -| Mode | Command run by script | -|-----------|---------------------------------------------------------| -| vanilla | `pytest -q -m "not rapidocr and not deepseek" tests` | -| rapidocr | `pytest -q -m "not deepseek" tests` | -| deepseek | `pytest -q -m "not rapidocr" tests` | +| Profile | Command | +|-----------|---------| +| Docling | `pytest -q -m "not deepseek" tests` | +| DeepSeek | `pytest -q -m "deepseek" tests` | -Heavy GPU tests in `tests/test_pipeline_smoke.py` were guarded with `pytest.importorskip("onnxruntime")` so vanilla installs skip them cleanly. 
Helper PDFs now embed DejaVuSans with Unicode support and insert spacing to keep OCR-friendly glyphs. +## Validation Runs (2026-03-08) +- `./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --run-tests` +- `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek --smoke-test` -## Validation Runs (2025-10-30) -- `./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests` -- `./dependency_setup/setup_glossapi.sh --mode deepseek --venv dependency_setup/.venvs/deepseek --run-tests` - -All three completed successfully after the following adjustments: -1. **Rust extensions** – switched to `pip install -e rust/glossapi_rs_{cleaner,noise}` because `maturin develop` left the wheel unregistered. -2. **Parquet locking** – `_parquet_lock` now creates parent directories before attempting the file lock (fixes `FileNotFoundError` in concurrent metadata tests). -3. **RapidOCR pipeline** – fixed `GlossExtract.create_extractor()` to build the Docling converter regardless of import path and added UTF-8 PDF generation improvements; smoke tests now pass on CUDA. -4. **DeepSeek stack** – updated nightly vLLM pin (`0.11.1rc5.dev58+g60f76baa6.cu129`) and removed `xformers` to resolve Torch 2.9 dependency conflicts. +These completed successfully after the following adjustments: +1. **Rust extensions** – use editable installs for `rust/glossapi_rs_{cleaner,noise}` so local changes are picked up immediately. +2. **DeepSeek stack** – moved to a uv-managed runtime pinned to the `transformers`-based OCR-2 path. +3. **Attention fallback** – the DeepSeek runner falls back to `eager` attention if `flash-attn` is unavailable. 
## Known Follow-ups -- **DeepSeek weights** – installer warns if weights are absent. Set `--download-deepseek` or populate `${DEEPSEEK_ROOT}/DeepSeek-OCR` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). -- **xformers kernels** – removed pending compatible Torch 2.9 wheels. Reintroduce once upstream publishes matching builds. +- **DeepSeek weights** – installer warns if weights are absent. Set `--download-model` or populate `${MODEL_ROOT}/DeepSeek-OCR-2` before running the real CLI tests (`GLOSSAPI_RUN_DEEPSEEK_CLI=1`). +- **flash-attn** – optional. Reintroduce into the pinned flow once wheel availability is stable across target hosts. - **Patchelf warnings** – maturin emits rpath hints if `patchelf` is missing; they are benign but install `patchelf` if cleaner logs are desired. -- **Deprecation noise** – Docling emits future warnings (Pydantic) and RapidOCR font deprecation notices; currently harmless but worth tracking for future upgrades. +- **Deprecation noise** – Docling and Transformers emit some warnings on current pins; currently harmless but worth tracking for future upgrades. 
## Quick Reference -- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` +- Activate an environment: `source dependency_setup/.venvs/<profile>/bin/activate` - Re-run tests manually: - - Vanilla: `pytest -m "not rapidocr and not deepseek" tests` - - RapidOCR: `pytest -m "not deepseek" tests` - - DeepSeek: `pytest -m "not rapidocr" tests` + - Docling: `pytest -m "not deepseek" tests` + - DeepSeek: `pytest -m "deepseek" tests` - DeepSeek runtime exports: ```bash export GLOSSAPI_DEEPSEEK_PYTHON="dependency_setup/.venvs/deepseek/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="/mnt/data/glossAPI/deepseek-ocr/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="/mnt/data/glossAPI/deepseek-ocr/libjpeg-turbo/lib" - export LD_LIBRARY_PATH="$GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH:${LD_LIBRARY_PATH:-}" + export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT="/mnt/data/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py" + export GLOSSAPI_DEEPSEEK_MODEL_DIR="/mnt/data/glossAPI/deepseek-ocr-2-model/DeepSeek-OCR-2" ``` These notes capture the current dependency state, the rationale behind constraint changes, and the validation steps used to exercise each profile. 
diff --git a/dependency_setup/requirements-glossapi-deepseek.txt b/dependency_setup/requirements-glossapi-deepseek.txt index 5cc685a..8185d9c 100644 --- a/dependency_setup/requirements-glossapi-deepseek.txt +++ b/dependency_setup/requirements-glossapi-deepseek.txt @@ -1,16 +1,13 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 ---extra-index-url https://wheels.vllm.ai/nightly --r requirements-glossapi-vanilla.txt -# CUDA Torch stack aligned with NVIDIA L4 (CUDA 12.8 wheels) -torch==2.9.0+cu128 -torchvision==0.24.0+cu128 -torchaudio==2.9.0+cu128 -# DeepSeek via nightly vLLM -vllm==0.11.1rc5.dev58+g60f76baa6.cu129 -flashinfer-python==0.4.1 -compressed-tensors==0.12.2 -depyf==0.20.0 -# Auxiliary CUDA libs -nvidia-nvshmem-cu12==3.3.20 -nvidia-nccl-cu12==2.27.5 -triton==3.5.0 +--extra-index-url https://download.pytorch.org/whl/cu118 +-r requirements-glossapi-docling.txt +torch==2.6.0 +torchvision==0.21.0 +torchaudio==2.6.0 +transformers==4.46.3 +tokenizers==0.20.3 +accelerate>=1.2.1,<2 +pymupdf==1.24.10 +Pillow==10.4.0 +img2pdf>=0.5.1 +easydict +addict diff --git a/dependency_setup/requirements-glossapi-docling.txt b/dependency_setup/requirements-glossapi-docling.txt new file mode 100644 index 0000000..73cb17f --- /dev/null +++ b/dependency_setup/requirements-glossapi-docling.txt @@ -0,0 +1,38 @@ +# Core GlossAPI runtime (Docling extraction/layout) +maturin>=1.5,<2.0 +numpy>=1.26,<3 +pandas>=1.3.0 +python-dateutil>=2.8.2 +pytz>=2021.1 +scikit-learn==1.6.1 +joblib>=1.0.0 +dask>=2022.1.0 +pyarrow>=7.0.0 +aiohttp>=3.8.0 +aiofiles>=23.0.0 +ftfy>=6.0.0 +tenacity>=8.0.0 +tqdm>=4.67.0 +pyyaml>=6.0 +pypdfium2>=4.0.0 +zstandard>=0.22.0 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 +msgspec>=0.18.6 +fpdf2>=2.7.0 +cachetools +cbor2 +einops +tiktoken +diskcache==5.6.3 +lark==1.2.2 +numba==0.61.2 +# Tooling / tests +pytest>=8.0 +pytest-mock>=3.14 +psutil>=5.9 +rich>=14.0 +safetensors>=0.4 +huggingface-hub>=0.22 diff --git 
a/dependency_setup/requirements-glossapi-rapidocr.txt b/dependency_setup/requirements-glossapi-rapidocr.txt deleted file mode 100644 index f5c5839..0000000 --- a/dependency_setup/requirements-glossapi-rapidocr.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements-glossapi-vanilla.txt -rapidocr>=3.3.0 -opencv-python-headless>=4.8.0 -onnxruntime-gpu==1.18.1 diff --git a/dependency_setup/requirements-glossapi-vanilla.txt b/dependency_setup/requirements-glossapi-vanilla.txt index b13df49..eca76ba 100644 --- a/dependency_setup/requirements-glossapi-vanilla.txt +++ b/dependency_setup/requirements-glossapi-vanilla.txt @@ -1,6 +1,6 @@ # Core GlossAPI runtime (Docling without GPU OCR extras) maturin>=1.5,<2.0 -numpy<2 +numpy>=1.26,<3 pandas>=1.3.0 python-dateutil>=2.8.2 pytz>=2021.1 @@ -16,10 +16,10 @@ tqdm>=4.67.0 pyyaml>=6.0 pypdfium2>=4.0.0 zstandard>=0.22.0 -docling==2.48.0 -docling-core==2.47.0 -docling-parse==4.4.0 -docling-ibm-models==3.9.1 +docling==2.81.0 +docling-core==2.70.2 +docling-parse==5.6.0 +docling-ibm-models==3.12.0 msgspec>=0.18.6 fpdf2>=2.7.0 cachetools diff --git a/dependency_setup/setup_deepseek_uv.sh b/dependency_setup/setup_deepseek_uv.sh new file mode 100755 index 0000000..87ad8b6 --- /dev/null +++ b/dependency_setup/setup_deepseek_uv.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +PROJECT_DIR="${SCRIPT_DIR}/deepseek_uv" + +PYTHON_BIN="${PYTHON:-python3}" +VENV_PATH="${GLOSSAPI_DEEPSEEK_VENV:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" +MODEL_ROOT="${DEEPSEEK_ROOT:-${REPO_ROOT}/deepseek-ocr-2-model}" +DOWNLOAD_MODEL=0 +RUN_SMOKE=0 +RUN_TESTS=0 + +info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + +SYNC_ARGS=(--no-dev) + +usage() { + cat <<'EOF' +Usage: setup_deepseek_uv.sh [options] + +Options: + --venv PATH Target virtual environment path + --python PATH Python executable to use for uv venv + --model-root PATH Destination root for the DeepSeek-OCR-2 model + --download-model Download DeepSeek-OCR-2 via huggingface_hub + --run-tests Run the DeepSeek pytest subset after installation + --smoke-test Run dependency_setup/deepseek_gpu_smoke.py + --help Show this help message +EOF +} + +prepend_path_if_dir() { + local dir="$1" + if [[ -d "${dir}" ]]; then + case ":${PATH}:" in + *":${dir}:"*) ;; + *) export PATH="${dir}:${PATH}" ;; + esac + fi +} + +ensure_stable_python() { + local python_bin="$1" + local release_level + release_level="$("${python_bin}" - <<'PY' +import sys +print(sys.version_info.releaselevel) +PY +)" + if [[ "${release_level}" != "final" ]]; then + error "Python interpreter ${python_bin} is not a stable final release (releaselevel=${release_level}). Install a stable CPython (for example via 'uv python install 3.11.11') and rerun with --python." + fi +} + +check_rust_toolchain() { + if ! command -v cargo >/dev/null 2>&1; then + error "cargo is required to build the Rust extensions. Install Rust (for example via rustup) and ensure cargo is on PATH." + fi + if ! cargo metadata --format-version 1 --manifest-path "${REPO_ROOT}/rust/glossapi_rs_cleaner/Cargo.toml" >/dev/null 2>&1; then + error "Current cargo cannot parse the repo Rust metadata/Cargo.lock. 
Upgrade Rust (for example 'rustup toolchain install stable && rustup default stable') and rerun setup." + fi +} + +while (( "$#" )); do + case "$1" in + --venv) + shift || { echo "--venv requires a path" >&2; exit 1; } + VENV_PATH="${1:-}" + ;; + --python) + shift || { echo "--python requires a path" >&2; exit 1; } + PYTHON_BIN="${1:-}" + ;; + --model-root|--weights-dir) + shift || { echo "--model-root requires a path" >&2; exit 1; } + MODEL_ROOT="${1:-}" + ;; + --download-model|--download-deepseek) + DOWNLOAD_MODEL=1 + ;; + --run-tests) + RUN_TESTS=1 + ;; + --smoke-test) + RUN_SMOKE=1 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift || true +done + +prepend_path_if_dir "${HOME}/.local/bin" +prepend_path_if_dir "${HOME}/.cargo/bin" + +command -v uv >/dev/null 2>&1 || error "uv is required. Install it first, e.g. 'python3 -m pip install --user uv'." +command -v "${PYTHON_BIN}" >/dev/null 2>&1 || error "Python executable not found: ${PYTHON_BIN}" +ensure_stable_python "${PYTHON_BIN}" +check_rust_toolchain + +MODEL_DIR="${MODEL_ROOT}/DeepSeek-OCR-2" + +if [[ -x "${VENV_PATH}/bin/python" ]]; then + info "Reusing uv environment at ${VENV_PATH}" +else + info "Creating uv environment at ${VENV_PATH}" + uv venv --python "${PYTHON_BIN}" "${VENV_PATH}" +fi + +if [[ "${RUN_TESTS}" -eq 1 ]]; then + SYNC_ARGS+=(--group test) +fi + +info "Syncing DeepSeek runtime from ${PROJECT_DIR}" +UV_PROJECT_ENVIRONMENT="${VENV_PATH}" uv sync --project "${PROJECT_DIR}" --python "${VENV_PATH}/bin/python" "${SYNC_ARGS[@]}" + +info "Installing Rust extensions in editable mode" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" +uv pip install --python "${VENV_PATH}/bin/python" -e "${REPO_ROOT}/rust/glossapi_rs_noise" + +if [[ "${DOWNLOAD_MODEL}" -eq 1 ]]; then + info "Downloading DeepSeek-OCR-2 model to ${MODEL_DIR}" + 
HUGGINGFACE_HUB_TOKEN="${HUGGINGFACE_HUB_TOKEN:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-${HUGGINGFACE_TOKEN:-}}}}" \ + "${VENV_PATH}/bin/python" - <\033[0m %s\n" "$*"; } +warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } +error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } + usage() { cat <<'EOF' Usage: setup_glossapi.sh [options] Options: - --mode MODE Environment profile: vanilla, rapidocr, deepseek (default: vanilla) + --mode MODE Environment profile: docling or deepseek (default: docling) --venv PATH Target virtual environment path --python PATH Python executable to use when creating the venv - --download-deepseek Fetch DeepSeek-OCR weights (only meaningful for --mode deepseek) - --weights-dir PATH Destination directory for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr) + --download-deepseek Fetch DeepSeek-OCR-2 weights (DeepSeek mode only) + --weights-dir PATH Destination directory root for DeepSeek weights (default: $REPO_ROOT/deepseek-ocr-2-model) --run-tests Run pytest -q after installation --smoke-test Run dependency_setup/deepseek_gpu_smoke.py (deepseek mode only) --help Show this help message EOF } +prepend_path_if_dir() { + local dir="$1" + if [[ -d "${dir}" ]]; then + case ":${PATH}:" in + *":${dir}:"*) ;; + *) export PATH="${dir}:${PATH}" ;; + esac + fi +} + +ensure_stable_python() { + local python_bin="$1" + local release_level + release_level="$("${python_bin}" - <<'PY' +import sys +print(sys.version_info.releaselevel) +PY +)" + if [[ "${release_level}" != "final" ]]; then + error "Python interpreter ${python_bin} is not a stable final release (releaselevel=${release_level}). Install a stable CPython and rerun with --python." + fi +} + +check_rust_toolchain() { + if ! command -v cargo >/dev/null 2>&1; then + error "cargo is required to build the Rust extensions. Install Rust (for example via rustup) and ensure cargo is on PATH." + fi + if ! 
cargo metadata --format-version 1 --manifest-path "${REPO_ROOT}/rust/glossapi_rs_cleaner/Cargo.toml" >/dev/null 2>&1; then + error "Current cargo cannot parse the repo Rust metadata/Cargo.lock. Upgrade Rust (for example 'rustup toolchain install stable && rustup default stable') and rerun setup." + fi +} + while (( "$#" )); do case "$1" in --mode) @@ -68,14 +104,34 @@ while (( "$#" )); do shift || true done +prepend_path_if_dir "${HOME}/.local/bin" +prepend_path_if_dir "${HOME}/.cargo/bin" +command -v "${PYTHON_BIN}" >/dev/null 2>&1 || error "Python executable not found: ${PYTHON_BIN}" +ensure_stable_python "${PYTHON_BIN}" +check_rust_toolchain + case "${MODE}" in - vanilla|rapidocr|deepseek) ;; + vanilla) + warn "Mode 'vanilla' is deprecated; using 'docling' instead." + MODE="docling" + ;; + docling|deepseek) ;; *) - echo "Invalid mode '${MODE}'. Expected vanilla, rapidocr, or deepseek." >&2 + echo "Invalid mode '${MODE}'. Expected docling or deepseek." >&2 exit 1 ;; esac +if [[ "${MODE}" == "deepseek" ]]; then + exec "${SCRIPT_DIR}/setup_deepseek_uv.sh" \ + --python "${PYTHON_BIN}" \ + --venv "${VENV_PATH:-${REPO_ROOT}/dependency_setup/.venvs/deepseek}" \ + --model-root "${DEEPSEEK_ROOT}" \ + $([[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]] && printf '%s' "--download-model") \ + $([[ "${RUN_TESTS}" -eq 1 ]] && printf '%s' "--run-tests") \ + $([[ "${RUN_SMOKE}" -eq 1 ]] && printf '%s' "--smoke-test") +fi + if [[ -z "${VENV_PATH}" ]]; then VENV_PATH="${REPO_ROOT}/.venv_glossapi_${MODE}" fi @@ -86,10 +142,6 @@ if [[ ! -f "${REQUIREMENTS_FILE}" ]]; then exit 1 fi -info() { printf "\033[1;32m==>\033[0m %s\n" "$*"; } -warn() { printf "\033[1;33m[warn]\033[0m %s\n" "$*"; } -error() { printf "\033[1;31m[err]\033[0m %s\n" "$*" >&2; exit 1; } - ensure_venv() { if [[ ! 
-d "${VENV_PATH}" ]]; then info "Creating virtual environment at ${VENV_PATH}" @@ -107,44 +159,6 @@ python_run() { "${VENV_PATH}/bin/python" "$@" } -download_deepseek_weights() { - local root="$1" - local target="${root}/DeepSeek-OCR" - - if [[ -d "${target}" ]]; then - info "DeepSeek-OCR weights already present at ${target}" - return 0 - fi - - mkdir -p "${root}" - if command -v huggingface-cli >/dev/null 2>&1; then - info "Downloading DeepSeek weights with huggingface-cli (this may take a while)" - huggingface-cli download deepseek-ai/DeepSeek-OCR \ - --repo-type model \ - --include "DeepSeek-OCR/*" \ - --local-dir "${target}" \ - --local-dir-use-symlinks False || warn "huggingface-cli download failed; falling back to git-lfs" - fi - - if [[ ! -d "${target}" ]]; then - if command -v git >/dev/null 2>&1; then - if ! command -v git-lfs >/dev/null 2>&1; then - warn "git-lfs not available; install git-lfs to clone DeepSeek weights via git." - else - info "Cloning DeepSeek weights via git-lfs" - git lfs install --skip-repo >/dev/null 2>&1 || true - git clone https://huggingface.co/deepseek-ai/DeepSeek-OCR "${target}" - fi - else - warn "Neither huggingface-cli nor git found; skipping DeepSeek weight download." - fi - fi - - if [[ ! -d "${target}" ]]; then - warn "DeepSeek weights were not downloaded. Set DEEPSEEK_ROOT manually once acquired." 
- fi -} - ensure_venv info "Upgrading pip tooling" pip_run install --upgrade pip wheel setuptools @@ -159,43 +173,18 @@ info "Building Rust extensions via editable installs" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_cleaner" pip_run install -e "${REPO_ROOT}/rust/glossapi_rs_noise" -if [[ "${MODE}" == "deepseek" ]]; then - export GLOSSAPI_DEEPSEEK_PYTHON="${VENV_PATH}/bin/python" - export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT="${DEEPSEEK_ROOT}/run_pdf_ocr_vllm.py" - export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH="${DEEPSEEK_ROOT}/libjpeg-turbo/lib" - export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 - export LD_LIBRARY_PATH="${GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH:-}" - - if [[ "${DOWNLOAD_DEEPSEEK}" -eq 1 ]]; then - download_deepseek_weights "${DEEPSEEK_ROOT}" - else - warn "DeepSeek weights not downloaded (use --download-deepseek to fetch automatically)." - fi -fi - if [[ "${RUN_TESTS}" -eq 1 ]]; then pytest_args=("-q") case "${MODE}" in - vanilla) - pytest_args+=("-m" "not rapidocr and not deepseek") - ;; - rapidocr) + docling) pytest_args+=("-m" "not deepseek") ;; - deepseek) - pytest_args+=("-m" "not rapidocr") - ;; esac info "Running pytest ${pytest_args[*]} tests" python_run -m pytest "${pytest_args[@]}" tests fi -if [[ "${MODE}" == "deepseek" && "${RUN_SMOKE}" -eq 1 ]]; then - info "Running DeepSeek smoke test" - python_run "${SCRIPT_DIR}/deepseek_gpu_smoke.py" -fi - cat < None +``` + +- Purpose: Phase‑1 extraction from source files into markdown plus optional JSON intermediates. 
+- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'`: PyPDFium for stability vs Docling for native layout extraction + - `force_ocr`: deprecated no-op kept for compatibility; OCR remediation now lives in `Corpus.ocr(backend='deepseek')` + - `use_gpus='multi'`: use all visible GPUs through a shared work queue + - `workers_per_device`: fan out more than one extraction worker onto a single visible GPU when measuring throughput + - `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: optional environment override for how many PDFs one Docling worker processes per extractor batch; GlossAPI keeps the default at `1` until a benchmark proves a larger batch is safe on the target node + - `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: optional environment override for the page budget of each queued multi-GPU Docling work item; use it with benchmark checkpoints when long PDFs dominate the tail + - `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE`: optional environment override for Docling's internal `settings.perf.page_batch_size`; use it when a GPU can hold more pages in flight than the default internal batch window + - `export_doc_json=True`: write `json/<stem>.docling.json(.zst)` + - `emit_formula_index=True`: also write `json/<stem>.formula_index.jsonl` +- Main outputs: + - `markdown/<stem>.md` + - `json/<stem>.docling.json(.zst)` when enabled + - `json/metrics/<stem>.metrics.json` + - `json/metrics/<stem>.per_page.metrics.json` + +## clean() + +```python +clean( + input_dir: str | Path | None = None, + threshold: float = 0.10, + num_threads: int | None = None, + drop_bad: bool = True, +) -> None +``` + +- Purpose: run the Rust cleaner/noise pipeline and decide which documents are safe for downstream processing. 
+- Typical inputs: + - `markdown/*.md` + - metadata parquet if present +- Important parameters: + - `threshold`: badness threshold + - `drop_bad`: whether to remove bad files from downstream selection + - `empty_char_threshold`, `empty_min_pages`: heuristics for OCR rerun recommendation +- Main outputs: + - `clean_markdown/<stem>.md` + - cleaner report parquet + - updated parquet columns such as `filter`, `needs_ocr`, and metrics fields +- Operational note: this stage is the quality gate that drives `section()` and `ocr()`. + +## ocr() + +```python +ocr( + *, + fix_bad: bool = True, + mode: str | None = None, + device: str | None = None, + model_dir: str | Path | None = None, + max_pages: int | None = None, + persist_engine: bool = True, + limit: int | None = None, + dpi: int | None = None, + precision: str | None = None, + math_enhance: bool = True, + math_targets: dict[str, list[tuple[int,int]]] | None = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = 'single', + devices: list[int] | None = None, + force: bool | None = None, +) -> None +``` + +- Purpose: selective OCR retry and optional Phase‑2 math/code enrichment. 
+- Mode selection: + - `ocr_bad`: rerun OCR only for cleaner-flagged docs + - `math_only`: run enrichment from existing Docling JSON + - `ocr_bad_then_math`: OCR flagged docs, then enrich them +- Important parameters: + - `mode`, `fix_bad`, `math_enhance` + - `use_gpus`, `devices` + - `math_targets` to restrict enrichment to specific items +- Main outputs: + - refreshed `markdown/<stem>.md` + - refreshed cleaner/parquet metadata after OCR reruns + - when metadata parquet is present, OCR now preserves the same row identity and embeds corrected `text` plus direct OCR sidecar pointers such as `ocr_markdown_relpath`, `ocr_metrics_relpath`, and `ocr_text_sha256` + - `json/<stem>.latex_map.jsonl` when enrichment runs + +## formula_enrich_from_json() + +```python +formula_enrich_from_json( + files: list[str] | None = None, + *, + device: str = 'cuda', + batch_size: int = 8, + dpi_base: int = 220, + targets_by_stem: dict[str, list[tuple[int,int]]] | None = None, +) -> None +``` + +- Purpose: Phase‑2 GPU enrichment from previously exported Docling JSON. 
+- Typical inputs: + - `json/<stem>.docling.json(.zst)` + - optional formula/code index data +- Important parameters: + - `files`: restrict to specific stems + - `device`, `batch_size`, `dpi_base` + - `targets_by_stem`: target specific `(page_no, item_index)` tuples +- Main outputs: + - enriched markdown back into `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` + +## section(), annotate() + +```python +section() -> None +annotate(annotation_type: str = 'text', fully_annotate: bool = True) -> None +``` + +- `section()`: + - purpose: convert markdown into one row per section with structural flags + - inputs: markdown selected by cleaner/parquet metadata + - outputs: `sections/sections_for_annotation.parquet` +- `annotate()`: + - purpose: classify sections and optionally expand them into full document structure + - important parameters: `annotation_type='text'|'chapter'|'auto'`, `fully_annotate` + - outputs: `classified_sections.parquet` and `fully_annotated_sections.parquet` + +## download() + +```python +download( + input_parquet: str | Path, + *, + links_column: str | None = None, + parallelize_by: str | None = None, + verbose: bool | None = None, + **kwargs, +) -> pd.DataFrame +``` + +- Purpose: fetch source files described in a parquet dataset. +- Typical inputs: + - an explicit `input_parquet` + - or the first parquet file found in `input_dir` +- Important parameters: + - `links_column`: override URL column name + - `parallelize_by`: choose grouping for the scheduler + - `download_mode`: one of `standard`, `auto`, or `browser` + - `browser_mode=True`: alias for `download_mode="browser"` + - `download_policy_file`: route specific domains/URL patterns to `standard`, `auto`, or `browser` + - downloader kwargs via `**kwargs` for concurrency, SSL, cookies, retries, checkpoints, etc. 
+- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` + - returned `pd.DataFrame` with download status and metadata + +Browser-capable download mode is intended for browser-gated file endpoints where a real file still exists behind session/bootstrap checks. It is not a general viewer extractor. Viewer-only sources should still fail cleanly with a recorded error and no local file artifact. + +Example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_mode="browser", +) +``` + +Policy-routed example: + +```python +corpus.download( + input_parquet="input_urls.parquet", + download_policy_file="download_policy.yml", +) +``` + +## triage_math() + +- Purpose: summarize per-page metrics and recommend Phase‑2 for math-dense docs. +- Inputs: `json/metrics/.per_page.metrics.json` +- Outputs: updated `download_results` parquet with routing fields such as formula totals and phase recommendation + +## Suggested Reading Order + +1. `download()` if you start from URLs. +2. `extract()` for Phase‑1 layout/markdown. +3. `clean()` to decide what needs OCR. +4. `ocr()` if you need OCR retry or Phase‑2 enrichment. +5. `section()` and `annotate()` for structured downstream outputs. + +--- + +See also: +- Code map: ../code_map.md +- Pipeline overview and artifacts: ../pipeline.md +- Configuration and environment variables: ../configuration.md +- OCR and math enrichment details: ../ocr_and_math_enhancement.md diff --git a/docs/api_corpus_tmp.md b/docs/api_corpus_tmp.md index 4181094..e584308 100644 --- a/docs/api_corpus_tmp.md +++ b/docs/api_corpus_tmp.md @@ -44,7 +44,7 @@ extract( ) -> None ``` -- Phase‑1 extraction; set `force_ocr=True` for OCR. +- Phase‑1 extraction; `force_ocr` is deprecated and ignored. - Docling layout JSON now writes by default (`json/.docling.json(.zst)`); set `emit_formula_index=True` to also produce `json/.formula_index.jsonl`. 
- Set `use_gpus='multi'` to use all visible GPUs (shared queue). @@ -85,7 +85,7 @@ ocr( ) -> None ``` -- Convenience shim that re‑runs `extract(force_ocr=True)` on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. +- Convenience shim that re-runs OCR on cleaner-flagged documents and, by default, performs math/code enrichment unless `math_enhance=False`. ## formula_enrich_from_json() diff --git a/docs/architecture/artifact_layout_and_stage_handoffs.md b/docs/architecture/artifact_layout_and_stage_handoffs.md index f3b5b6d..53cbcec 100644 --- a/docs/architecture/artifact_layout_and_stage_handoffs.md +++ b/docs/architecture/artifact_layout_and_stage_handoffs.md @@ -92,6 +92,38 @@ That affects: Chunk suffix behavior is therefore part of the current contract. +For DeepSeek OCR, there is an important distinction between execution-time shards and stage handoff artifacts: + +- Multi-GPU `exact_fill` may execute shards such as `doc__p00001-00096` internally to keep GPU lanes full. +- Those shard names are operational artifacts, not the downstream contract for OCR outputs. +- After worker completion, the runner reassembles canonical `markdown/.md` and `json/metrics/.metrics.json` files for each source PDF. +- If OCR started from canonical corpus metadata, the authoritative OCR handoff should also include a canonical parquet where corrected `text` is embedded back into the same document rows. Detached markdown alone is not the full stage handoff in that case. +- Canonical OCR markdown page boundaries are annotated with `` comments next to the page-split marker, and the parser remains backward-compatible with legacy unnumbered separators. +- Original shard markdown and shard metrics are moved under `sidecars/ocr_shards/` for debugging and audit trails. +- If a repair retry trips the garbage cutoff again, the canonical markdown keeps the page slot but blanks the page content rather than preserving the bad first-pass OCR. 
+ +For multi-GPU vLLM OCR, there is now a second class of operational artifacts under `sidecars/ocr_runtime/`: + +- `work_queue.sqlite`: durable batch queue state for the current OCR run +- `worker_*.runtime.json`: per-worker heartbeat and timing state +- `gpu_preflight.json`: GPU readiness checks such as persistence mode +- `gpu_telemetry.jsonl`: sampled GPU utilization and process telemetry +- `runtime_summary.json`: queue completion state plus steady-state timing windows + +The runtime queue now has two phases inside the same operational state: + +- first-pass shard batches +- repair shard batches published after first pass completes + +Repair queue durability and repair execution batching are intentionally separate concerns: + +- the durable queue records individual repair work items so retries, failure accounting, and resume logic stay precise +- workers may pack multiple pending repair items into one larger execution batch to keep GPUs busy during the repair tail + +These runtime artifacts are operational state, not downstream stage inputs. They are intended for monitoring, debugging, and safe resumption logic. + +Downstream stages should therefore consume canonical OCR outputs, not shard artifacts. + ## Authoritative state vs derived artifacts Not every file has equal semantic importance. 
diff --git a/docs/architecture/index.md b/docs/architecture/index.md index a8d8621..7f3d113 100644 --- a/docs/architecture/index.md +++ b/docs/architecture/index.md @@ -103,7 +103,7 @@ Purpose: Important characteristics: -- can use RapidOCR via Docling or DeepSeek OCR +- uses DeepSeek OCR for remediation while keeping Docling in the surrounding extraction/layout flow - reads metadata to find OCR candidates - skiplist-aware - designed as a corrective stage, not the default for every document @@ -172,4 +172,5 @@ The current architecture is effective but has important tradeoffs: These pressure points are documented separately in: - [Artifact Layout and Stage Handoffs](artifact_layout_and_stage_handoffs.md) +- [OCR Cleaning Runtime](ocr_cleaning_runtime.md) - [Resumability, Recovery, and Retention](resumability_recovery_and_retention.md) diff --git a/docs/architecture/ocr_cleaning_runtime.md b/docs/architecture/ocr_cleaning_runtime.md new file mode 100644 index 0000000..6b780b9 --- /dev/null +++ b/docs/architecture/ocr_cleaning_runtime.md @@ -0,0 +1,118 @@ +# OCR Cleaning Runtime + +This document explains how the current OCR cleaner is organized, why the +matcher families are separated, and why the clean/debug behavior is driven by +one shared page analyzer. + +## One Analyzer, Two Render Modes + +The OCR cleaner now works in two modes over the same span plan: + +- `debug` + - preserves the source page surface + - inserts `` tags around the matched regions +- `clean` + - applies the removal/rewrite policy directly + - writes the cleaned page text with no debug tags + +This is deliberate. The project previously had a tendency for the reviewer-facing +debug logic to evolve faster than the real cleaner. Sharing one analyzer avoids +that drift: if the debug page is right, the clean page is operating on the same +decisions. + +## Why The Cleaner Is Not One Generic Matcher + +The cleaner is trying to remove OCR- or VLM-induced garbage, not every repeated +pattern in a page. 
A single fuzzy matcher over the whole page overgeneralizes +quickly: + +- numbers steal matches that should belong to numeric progression logic +- repeated notation in LaTeX looks like corruption even when it is legitimate +- HTML tables distort text surfaces and cause spurious word matches + +So the runtime uses ownership by surface type and structure instead of one broad +"repetition" rule. + +## Page Ownership Order + +The current analyzer order is: + +1. tables +2. numeric +3. LaTeX +4. hybrid numbered repetition +5. shared text repetition + +Why this order: + +- Tables run first because HTML table shells can dominate a page and confuse + every later pass. +- Numeric runs before generic text because `1, 2, 3, ...` style progressions + are real OCR-collapse signals and should not be absorbed by `word_repeat`. +- LaTeX and hybrid passes run before generic text because they depend on local + structure, not just repeated tokens. +- Shared text repetition runs last on the remaining visible surface only. + +This ordering is the main false-positive control mechanism. + +## Table Cleaning Is Broader Than Repetition + +Table handling is intentionally separated into `src/glossapi/corpus/ocr_table.py` +because it is not just another repetition matcher. + +Current table classes: + +- `sentence_shell_table` + - a table with one prose-like filled cell + - treated as layout noise around content + - dropped in clean mode +- `empty_table_collapse` + - a large sparse shell with almost no real cell content + - dropped in clean mode +- `repeated_rows` + - an actually repetition-oriented table problem + - dropped in clean mode +- unmatched kept tables + - converted from HTML to GitHub-style Markdown + +The important design point is that sentence-shell and empty-shell tables are +structural cleanup decisions, not repetition decisions. 
+ +## LaTeX And Hybrid Generalization Strategy + +LaTeX and hybrid numbered matching both follow the same conservative pattern: + +- prefer local runs +- abstract slot fields +- require mechanical progression or stable low-diversity cycles +- avoid page-wide reuse as evidence on its own + +That is why the cleaner does not treat "same symbol appears many times on a +page" as enough evidence. The goal is to catch degenerate local collapse, not +normal scholarly notation reuse. + +## Why Rust Is Used Selectively + +The hot-path detection work is in Rust because page-scale scanning dominates run +time. Python still owns: + +- orchestration +- filesystem I/O +- debug/clean rendering +- policy composition across matcher families + +This split is intentional: + +- Rust is best for large repeated scans and token-normalization hot loops +- Python is still easier for mode-aware rendering and pipeline integration + +## Performance And Correctness Contract + +Performance work is allowed only if exact debug output stays stable. + +The correctness lock is: + +- `tests/test_ocr_golden_pages.py` + +That suite uses hundreds of real pages and compares exact output bytes. The +speed work therefore optimizes implementation, not semantics. diff --git a/docs/code_map.md b/docs/code_map.md new file mode 100644 index 0000000..8616def --- /dev/null +++ b/docs/code_map.md @@ -0,0 +1,60 @@ +# Code Map + +This page maps the main documentation ideas to the code that implements them. It is +meant to help you move from "what does GlossAPI do?" to "where do I change it?" +without reading the entire repo. + +## Top-Level Entry Points + +| Area | Main code | Responsibility | +| --- | --- | --- | +| Public package entry | `src/glossapi/__init__.py` | Lazy-exports `Corpus`, `GlossSectionClassifier`, `GlossDownloader`, and related classes without pulling heavy runtime dependencies at import time. 
| +| High-level orchestration | `src/glossapi/corpus/corpus_orchestrator.py` | Coordinates the end-to-end pipeline and owns the main folder/artifact conventions. | +| Phase-1 extraction engine | `src/glossapi/gloss_extract.py` | Builds/reuses Docling converters, handles safe vs Docling backend selection, batching, timeouts, resumption, and artifact export. | + +## Pipeline Stages + +| Stage | Main methods/classes | Notes | +| --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | Supports URL expansion, deduplication, checkpoints, per-domain scheduling, and resume. | +| Extract | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` | Handles backend choice, GPU preflight, and single- vs multi-GPU dispatch. | +| Clean / quality gate | `Corpus.clean()` | Runs the Rust cleaner and merges quality metrics back into parquet metadata. | +| OCR retry / math follow-up | `Corpus.ocr()`, `Corpus.formula_enrich_from_json()` | Re-runs OCR only for flagged documents and optionally performs Phase-2 math/code enrichment from JSON. | +| Sectioning | `Corpus.section()`, `GlossSection.to_parquet()` | Converts markdown documents into section rows for later classification. | +| Classification / annotation | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | Runs the SVM classifier and post-processes section labels into final document structure. | +| Export / triage | `Corpus.jsonl()`, `Corpus.triage_math()` | Produces training/export JSONL and computes routing hints for math-dense documents. | + +## Backend and Runtime Helpers + +| File | Responsibility | +| --- | --- | +| `src/glossapi/ocr/docling/pipeline.py` | Canonical builder for the layout-only Docling Phase-1 pipeline, including runtime tuning knobs for the current Docling API. 
| +| `src/glossapi/ocr/docling_pipeline.py` | Compatibility re-export for the canonical Docling pipeline builder. | +| `src/glossapi/ocr/deepseek/runner.py` | Launches the DeepSeek OCR remediation path from `Corpus.ocr()`. | +| `src/glossapi/ocr/utils/json_io.py` | Writes and reads compressed Docling JSON artifacts. | +| `src/glossapi/corpus/phase_ocr_math.py` | Runs DeepSeek OCR remediation, math/code enrichment, and parquet status updates. | +| `src/glossapi/metrics.py` | Computes per-page parse/OCR/formula metrics from Docling conversions. | + +## Rust Extensions + +| Crate | Path | Purpose | +| --- | --- | --- | +| Cleaner | `rust/glossapi_rs_cleaner` | Markdown cleaning, script/noise filtering, and report generation used by `Corpus.clean()`. | +| Noise metrics | `rust/glossapi_rs_noise` | Fast quality metrics used by the broader pipeline and package build configuration. | + +## Tests To Read First + +| Test | Why it matters | +| --- | --- | +| `tests/test_pipeline_smoke.py` | Best high-level example of the intended artifact flow through extract -> clean -> OCR -> section. | +| `tests/test_corpus_guards.py` | Shows the contract around backend selection and GPU preflight. | +| `tests/test_jsonl_export.py` | Shows how final JSONL export merges cleaned markdown, parquet metadata, and math metrics. | +| `tests/test_ocr_dispatch_backends.py` | Covers the DeepSeek-only OCR dispatch contract and backend validation. | + +## If You Need To Change... + +- Download scheduling or resume behavior: start in `src/glossapi/gloss_downloader.py`. +- Phase-1 parsing, worker fanout, or artifact generation: start in `src/glossapi/corpus/phase_extract.py`, `src/glossapi/corpus/corpus_orchestrator.py`, and `src/glossapi/gloss_extract.py`. +- Docling pipeline wiring or runtime tuning: start in `src/glossapi/ocr/docling/pipeline.py` and `src/glossapi/gloss_extract.py`. +- Section labels or section-annotation rules: start in `src/glossapi/gloss_section_classifier.py`. 
+- Output folder contracts or stage sequencing: start in `src/glossapi/corpus/corpus_orchestrator.py`. diff --git a/docs/configuration.md b/docs/configuration.md index 659d65c..98f2687 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -18,30 +18,50 @@ GlossAPI exposes two Phase‑1 profiles. Use `Corpus.extract(..., phase1_backend Regardless of backend, the extractor clamps OMP/OpenBLAS/MKL pools to one thread per worker so multi‑GPU runs do not explode thread counts. -### DeepSeek optional dependencies +### Docling Runtime Tuning -Install DeepSeek backend extras to enable the DeepSeek OCR path (imports remain lazy, so the package is optional). Use the CUDA 12.1 wheels for both vLLM and Torch: +These optional knobs map directly to current Docling `PdfPipelineOptions` fields and are mainly useful for benchmarking on strong GPUs: -```bash -pip install '.[deepseek]' +- `GLOSSAPI_DOCLING_MAX_BATCH_FILES`: override the number of PDF documents a single Phase‑1 Docling worker processes per extractor batch. Defaults to `1` in GlossAPI for stability; raise it deliberately when benchmarking fresh A100 nodes. +- `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES`: target page budget for each queued multi‑GPU Docling work item. Defaults to `256`; lower it when a single worker hoards long PDFs, raise it when a strong GPU can keep larger mixed bundles resident. +- `GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE`: override Docling `layout_batch_size`. +- `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE`: override Docling `settings.perf.page_batch_size` so Phase‑1 can raise or lower the number of pages each device keeps in flight internally without changing GlossAPI queue semantics. +- `GLOSSAPI_DOCLING_TABLE_BATCH_SIZE`: override Docling `table_batch_size`. +- `GLOSSAPI_DOCLING_OCR_BATCH_SIZE`: override Docling `ocr_batch_size` even though Phase‑1 OCR stays disabled. +- `GLOSSAPI_DOCLING_QUEUE_MAX_SIZE`: override Docling `queue_max_size`. 
+- `GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT`: override Docling `document_timeout`. +- `GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL`: override Docling `batch_polling_interval_seconds`. -# Install Torch CUDA 12.1 wheels (required by the DeepSeek script) -pip install --extra-index-url https://download.pytorch.org/whl/cu121 \ - 'torch==2.5.1+cu121' 'torchvision==0.20.1+cu121' +### DeepSeek optional dependencies -# Alternatively, use the requirements file (edit to uncomment torch lines): -pip install -r deepseek-ocr/requirements-deepseek.txt +Install DeepSeek backend extras to enable the DeepSeek OCR path. The recommended path is the dedicated `uv` environment: + +```bash +./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek ``` When using `backend='deepseek'`, equations are included inline in the OCR output; Phase‑2 math flags are accepted but skipped. +The dedicated uv profile is OCR-only and does not install the Docling extraction stack. ### DeepSeek runtime controls -- `GLOSSAPI_DEEPSEEK_ALLOW_STUB` (`1` by default): allow the builtin stub runner for tests and lightweight environments. -- `GLOSSAPI_DEEPSEEK_ALLOW_CLI` (`0` by default): flip to `1` to force the real vLLM CLI even when the stub is allowed. -- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs `run_pdf_ocr_vllm.py` (defaults to the current interpreter). -- `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT`: override path to the DeepSeek CLI script (defaults to `deepseek-ocr/run_pdf_ocr_vllm.py` under the repo). -- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths (e.g., for `libjpeg-turbo`) when launching the CLI. +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB`: must remain `0`; stub execution is rejected. +- `GLOSSAPI_DEEPSEEK_ALLOW_CLI`: keep at `1` to require the real runtime. +- `GLOSSAPI_DEEPSEEK_PYTHON`: absolute path to the Python interpreter that runs the DeepSeek OCR runner. 
When this is unset, GlossAPI now prefers a repo-local version-pinned DeepSeek runtime under `dependency_setup/.venvs/deepseek*` before falling back to the generic `deepseek` alias and finally the current process interpreter. +- `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`: override path to the OCR runner script (defaults to `src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py`). +- `GLOSSAPI_DEEPSEEK_MODEL_DIR`: path to the downloaded `DeepSeek-OCR-2` snapshot. +- `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH`: prepend extra library search paths when launching the OCR runner. + +Standard OCR defaults: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` +- `repair_mode='auto'` +- `scheduler='auto'` +- `target_batch_pages=160` + +The DeepSeek runners now default to `max_new_tokens=2048`. Do not leave the token cap implicit in one environment and explicit in another when comparing benchmarks. ## Math Enrichment (Phase‑2) @@ -71,10 +91,6 @@ All LaTeX policy knobs are loaded via `glossapi.text_sanitize.load_latex_policy( - `GLOSSAPI_WORKER_LOG_DIR`: override the directory used for per-worker logs and `gpu.current` markers (defaults to `logs/ocr_workers/` or `logs/math_workers/` under the output directory). - `GLOSSAPI_WORKER_LOG_VERBOSE` = `1|0` (default `1`): emit (or suppress) the GPU binding banner each worker prints on startup. -## RapidOCR Model Paths - -- `GLOSSAPI_RAPIDOCR_ONNX_DIR`: directory containing `det/rec/cls` ONNX models and keys. 
- ## Triage & Parquet - Triage always writes both: diff --git a/docs/getting_started.md b/docs/getting_started.md index f6bf4ce..a53518c 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -4,46 +4,75 @@ This guide gets a new GlossAPI contributor from clone → first extraction with ## Checklist -- Python 3.8+ (3.10 recommended) +- Python 3.10+ (`3.12` recommended for the DeepSeek runtime) - Recent `pip` (or `uv`) and a C/C++ toolchain for Rust wheels -- Optional: NVIDIA GPU with CUDA 12.x drivers for Docling/RapidOCR acceleration +- Optional: NVIDIA GPU with CUDA drivers for Docling/DeepSeek acceleration + +On fresh Linux hosts, make these assumptions explicit instead of relying on shell startup files: + +- prefer a stable final CPython, not a prerelease distro build +- keep `~/.local/bin` on `PATH` if `uv` was installed with `pip install --user uv` +- keep `~/.cargo/bin` on `PATH` if Rust was installed with `rustup` ## Install GlossAPI -### Recommended — mode-aware setup script +### Recommended setup -Use `dependency_setup/setup_glossapi.sh` to build an isolated virtualenv with the correct dependency set for vanilla, RapidOCR, or DeepSeek runs. Examples: +Use `dependency_setup/setup_glossapi.sh` for the main Docling environment and `dependency_setup/setup_deepseek_uv.sh` for the OCR runtime. 
Examples: ```bash -# Vanilla pipeline (CPU-only OCR) -./dependency_setup/setup_glossapi.sh --mode vanilla --venv dependency_setup/.venvs/vanilla --run-tests - -# RapidOCR GPU stack -./dependency_setup/setup_glossapi.sh --mode rapidocr --venv dependency_setup/.venvs/rapidocr --run-tests +# Main GlossAPI environment +./dependency_setup/setup_glossapi.sh --mode docling --venv dependency_setup/.venvs/docling --run-tests -# DeepSeek OCR on GPU (expects weights under /path/to/deepseek-ocr/DeepSeek-OCR) -./dependency_setup/setup_glossapi.sh \ - --mode deepseek \ +# DeepSeek OCR on GPU (uv-managed, downloads DeepSeek-OCR-2 if requested) +./dependency_setup/setup_deepseek_uv.sh \ + --python /path/to/stable/python \ --venv dependency_setup/.venvs/deepseek \ - --weights-dir /path/to/deepseek-ocr \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ --run-tests --smoke-test ``` -Add `--download-deepseek` if you need the script to fetch weights via Hugging Face; otherwise it searches `${REPO_ROOT}/deepseek-ocr/DeepSeek-OCR` unless you override `--weights-dir`. Inspect `dependency_setup/dependency_notes.md` for the latest pins, caveats, and validation runs. The script installs GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +`setup_glossapi.sh --mode deepseek` delegates to the same uv-based installer. Inspect `dependency_setup/dependency_notes.md` for the current pins and validation runs. Both setup paths install GlossAPI and its Rust crates in editable mode so source changes are picked up immediately. +The dedicated DeepSeek uv environment is intentionally OCR-only: it installs `glossapi[deepseek]` and leaves Docling in the main environment. 
+ +On fresh GPU nodes, prefer a `uv`-managed stable Python such as: + +```bash +~/.local/bin/uv python install 3.11.11 +``` + +Then pass that interpreter explicitly to the setup scripts: + +```bash +./dependency_setup/setup_glossapi.sh \ + --mode docling \ + --python /home/$USER/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/bin/python3.11 \ + --venv dependency_setup/.venvs/docling + +./dependency_setup/setup_deepseek_uv.sh \ + --python /home/$USER/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/bin/python3.11 \ + --venv dependency_setup/.venvs/deepseek +``` **DeepSeek runtime checklist** -- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the CLI can run (env vars, model dir, flashinfer, cc1plus, libjpeg). -- Force the real CLI and avoid stub fallback by setting: +- Run `python -m glossapi.ocr.deepseek.preflight` from the DeepSeek venv to assert the real runtime is reachable. +- Run `python -m glossapi.scripts.deepseek_runtime_report` from the DeepSeek venv on fresh GPU nodes before ad hoc fixes. That captures the interpreter, CUDA wheel layout, and package versions used by the node. +- Force the real runtime and avoid stub fallback by setting: - `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` - `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` - - `GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py` - - `GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek/venv/bin/python` - - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR` - - `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib` -- Install a CUDA toolkit with `nvcc` and set `CUDA_HOME` / prepend `$CUDA_HOME/bin` to `PATH` (FlashInfer/vLLM JIT expects it). -- If FlashInfer is unstable on your stack, disable it with `VLLM_USE_FLASHINFER=0` and `FLASHINFER_DISABLE=1`. -- Avoid FP8 KV cache issues by exporting `GLOSSAPI_DEEPSEEK_NO_FP8_KV=1`; tune VRAM use via `GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION=<0.5–0.9>`. 
-- Keep `LD_LIBRARY_PATH` pointing at the toolkit lib64 (e.g. `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`). + - `GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek/venv/bin/python` + - `GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2` +- If `GLOSSAPI_DEEPSEEK_PYTHON` is unset, GlossAPI now searches for a repo-local version-pinned DeepSeek runtime under `dependency_setup/.venvs/deepseek*` before falling back to the generic `deepseek` alias and then the current process interpreter. Keep the env var set when you need an explicit override; broken explicit paths are treated as configuration errors, not silently ignored. +- Standard OCR defaults after setup: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `scheduler='auto'` + - `target_batch_pages=160` +- `flash-attn` is optional. The runner uses it when available and otherwise falls back to the Transformers `eager` attention implementation. +- Do not benchmark against an ad hoc DeepSeek venv and compare it to the validated `dependency_setup/.venvs/deepseek` results as if they were the same stack. ### Option 1 — pip (evaluate quickly) @@ -74,30 +103,19 @@ chmod +x scripts/setup_conda.sh conda activate glossapi ``` -The helper script provisions Python 3.10, installs Rust + `maturin`, performs an editable install, and applies the Docling RapidOCR patch automatically. +The helper script provisions Python 3.10, installs Rust + `maturin`, and performs an editable install. ## GPU prerequisites (optional but recommended) -`setup_glossapi.sh` pulls the right CUDA/Torch/ONNX wheels for the RapidOCR and DeepSeek profiles. If you are curating dependencies manually, make sure you: +`setup_glossapi.sh` and `setup_deepseek_uv.sh` pull the required Torch wheels for the supported Docling and DeepSeek flows. 
If you are curating dependencies manually, make sure you: -- Install the GPU build of ONNX Runtime (`onnxruntime-gpu`) and uninstall the CPU wheel. -- Select the PyTorch build that matches your driver/toolkit (the repository currently targets CUDA 12.8 for DeepSeek). +- Select the PyTorch build that matches your driver/toolkit. - Verify the providers with: ```bash - python -c "import onnxruntime as ort; print(ort.get_available_providers())" python -c "import torch; print(torch.cuda.is_available())" ``` -## RapidOCR models & keys - -GlossAPI ships the required ONNX models and Greek keys under `glossapi/models/rapidocr/{onnx,keys}`. To override them, set `GLOSSAPI_RAPIDOCR_ONNX_DIR` to a directory containing: - -- `det/inference.onnx` -- `rec/inference.onnx` -- `cls/ch_ppocr_mobile_v2.0_cls_infer.onnx` -- `greek_ppocrv5_keys.txt` - ## First run (lightweight corpus) ```bash diff --git a/docs/index.md b/docs/index.md index d696c8d..13cef9d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,21 +7,13 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [Quickstart Recipes](quickstart.md) — common extraction/OCR flows in copy-paste form. - [Lightweight PDF Corpus](lightweight_corpus.md) — 20 one-page PDFs for smoke testing without Docling or GPUs. -## Understand the architecture -- [Architecture Overview](architecture/index.md) — the end-to-end staged model and why it exists. -- [Core Design Principles](architecture/core_design_principles.md) — the design constraints that shape the pipeline. -- [Docling Throughput and Batching](architecture/docling_throughput_and_batching.md) — how throughput and stability trade off. -- [Failure Recovery and Skiplist](architecture/docling_failure_recovery_and_skiplist.md) — how the pipeline survives problematic PDFs. -- [Greek Text Validation](architecture/greek_text_validation.md) — why extraction success is not enough for Greek corpora. 
-- [Metadata, Artifacts, and Run Diagnostics](architecture/metadata_artifacts_and_run_diagnostics.md) — how provenance and operational state are retained. -- [Artifact Layout and Stage Handoffs](architecture/artifact_layout_and_stage_handoffs.md) — how folders, filenames, and metadata glue the stages together. -- [Resumability, Recovery, and Retention](architecture/resumability_recovery_and_retention.md) — how the current design supports reruns and where storage pressure appears. - ## Learn the pipeline +- [Code Map](code_map.md) links the main documentation ideas to the classes and files that implement them. - [Pipeline Overview](pipeline.md) explains each stage and the emitted artifacts. -- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers Docling + RapidOCR usage. +- [OCR & Math Enrichment](ocr_and_math_enhancement.md) covers DeepSeek OCR remediation and Docling-based enrichment. +- [OCR Repetition Policy](ocr_repetition_policy.md) pins the default repetition thresholds for word and LaTeX cleaning. +- [OCR Cleaning Runtime](architecture/ocr_cleaning_runtime.md) explains the shared clean/debug analyzer, ordering, and why the cleaner separates tables, numeric, LaTeX, hybrid, and text ownership. - [Multi-GPU & Benchmarking](multi_gpu.md) shares scaling and scheduling tips. -- [Stage Reference](stages/index.md) breaks down each pipeline stage as a contract. ## Configure and debug - [Configuration](configuration.md) lists all environment knobs. @@ -29,5 +21,5 @@ Welcome to the refreshed docs for GlossAPI, the GFOSS pipeline for turning acade - [AWS Job Distribution](aws_job_distribution.md) describes large-scale scheduling. ## Reference -- [Corpus API](api/corpus.md) details public methods and parameters. -- `docs/divio/` contains placeholder pages for the upcoming Divio restructuring—feel free to open PRs fleshing them out. +- [Corpus API](api/corpus.md) gives the compact contract view of the main public methods. 
+- [Legacy Corpus API Notes](api_corpus_tmp.md) remains available while the docs are being consolidated. diff --git a/docs/math_enrichment_runtime.md b/docs/math_enrichment_runtime.md index 21d8617..096209c 100644 --- a/docs/math_enrichment_runtime.md +++ b/docs/math_enrichment_runtime.md @@ -68,9 +68,8 @@ c.ocr(math_targets=targets, math_batch_size=4) ## OCR/Model Constraints (recap) -- ORT GPU only: uninstall `onnxruntime` CPU; use `onnxruntime-gpu`. -- RapidOCR keys: Docling 2.48.0 needs `Rec.rec_keys_path` patch (see README). -- Model discovery: set `GLOSSAPI_RAPIDOCR_ONNX_DIR` or package models under `glossapi/models/rapidocr/`. +- DeepSeek OCR runs in its own pinned runtime; set `GLOSSAPI_DEEPSEEK_PYTHON`, `GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT`, and `GLOSSAPI_DEEPSEEK_MODEL_DIR`. +- Keep `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` and `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1`. - Optional Torch CUDA: needed for GPU layout/enrichment; see README for the CUDA wheels. ## Multi‑GPU diff --git a/docs/multi_gpu.md b/docs/multi_gpu.md index b1b8956..c06efe8 100644 --- a/docs/multi_gpu.md +++ b/docs/multi_gpu.md @@ -1,17 +1,21 @@ # Multi‑GPU & Benchmarking GlossAPI can scale across multiple visible GPUs. Faster GPUs drain more work from a shared queue of **absolute -file paths**, so no worker rescans directories. +file paths** or pre-packed work items, so no worker rescans directories. ## Extract (Phase‑1) on Multiple GPUs ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` - Workers are bound using `CUDA_VISIBLE_DEVICES=` and run Docling on `cuda:0` relative to each worker. +- `workers_per_device` defaults to `1`; raise it only when benchmarking a strong GPU such as an A100.
+- `GLOSSAPI_DOCLING_MAX_BATCH_FILES` lets one Docling worker take more than one PDF per extractor batch; keep the default `1` for fresh-node stability and benchmark larger values explicitly. +- `GLOSSAPI_DOCLING_BATCH_TARGET_PAGES` controls the page budget per queued multi-GPU Docling work item. The controller now sorts heavier work first and packs smaller PDFs toward that page budget so workers do not immediately collapse into a long single-file tail. +- `GLOSSAPI_DOCLING_PAGE_BATCH_SIZE` controls Docling's internal per-device page window (`settings.perf.page_batch_size`). Use it together with the outer queue page budget when you want steadier GPU residency instead of just fatter file bundles. - Threads auto‑tune when `num_threads=None` (roughly `min(cpu_count, 2 * #GPUs)`). Override explicitly if needed. - The controller persists extraction progress in `download_results/download_results.parquet` after each reported batch, so interrupted runs can resume cleanly without ad-hoc checkpoint files. @@ -31,6 +35,39 @@ c.ocr(use_gpus='multi', math_batch_size=12) - Crashed workers are respawned automatically; control the retry budget per GPU with `GLOSSAPI_MATH_RESPAWN_CAP` (default `5`). Use `GLOSSAPI_WORKER_LOG_VERBOSE=0` to silence the banner that prints the binding info. - When a device exceeds the respawn cap, remaining stems are added to the fatal skip-list and their artifacts are quarantined under `downloads/problematic_math/` and `json/problematic_math/` for follow-up. +## DeepSeek OCR on Multiple GPUs + +```python +from glossapi import Corpus +c = Corpus("OUT", "OUT") +c.ocr( + use_gpus="multi", + runtime_backend="vllm", + workers_per_gpu=1, + scheduler="exact_fill", + target_batch_pages=96, +) +``` + +- `scheduler="exact_fill"` is the preferred multi-GPU vLLM scheduler when PDFs vary widely in length. It shards large documents into page ranges and keeps GPU lanes filled more evenly. +- Internal shard runs now preserve the public `Corpus.ocr()` contract. 
Canonical outputs are reassembled back into `markdown/<stem>.md` and `json/metrics/<stem>.metrics.json` for each source PDF. +- When OCR starts from canonical corpus rows, the preferred stage handoff is also a canonical parquet where corrected `text` is embedded back into the same row identity. Markdown and metrics remain sidecars for inspection and audit. +- Shard markdown and shard metrics are retained for debugging under `sidecars/ocr_shards/` instead of remaining in the canonical handoff directories. +- The vLLM path now renders pages into memory and feeds a bounded queue directly into inference, which removes the temporary PNG round-trip and overlaps rendering with generation. +- Empty-page detection still happens before inference, and repair retries reuse the in-memory page image instead of reopening a file from disk. +- Final OCR markdown now tags each page split with `` so page images, markdown, and metrics stay aligned during inspection. +- If a repair retry hits the garbage cutoff again, the page is blanked rather than keeping the failed first-pass garbage. +- Multi-GPU vLLM workers now pull from a durable shared batch queue in `sidecars/ocr_runtime/work_queue.sqlite`, so finished batches survive worker crashes and respawned workers can continue without rescanning completed work. +- Repair work now runs as a second global queue phase. First-pass batches finish and persist shard outputs first; then any worker can claim the queued repair shards. This keeps repair tails balanced across GPUs without mixing worker-local repair state into the controller. +- Workers may pack multiple pending repair items into one larger execution batch. Queue durability stays item-granular, but the runtime no longer has to execute the repair tail as one tiny origin-shard retry at a time. +- Each worker writes `sidecars/ocr_runtime/worker_*.runtime.json` with heartbeat state and steady-state timing markers.
The runner also emits `gpu_preflight.json`, `gpu_telemetry.jsonl`, and `runtime_summary.json`. +- The runner checks GPU persistence mode before launch by default. Control it with `GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT=off|warn|ensure`. The default is `ensure`, which will try `sudo -n nvidia-smi -pm 1` and record the result in `gpu_preflight.json`. +- When the DeepSeek runtime is built from wheel-managed CUDA packages, the runner now auto-discovers the venv's `site-packages/nvidia/*/lib` directories and prepends them to `LD_LIBRARY_PATH`. `GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH` still works as a manual override or supplement. +- Worker reliability knobs are environment-driven: `GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP`, `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS`, `GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC`, `GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC`, and `GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC`. +- The default `GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS=2` means one retry after the first failed claim, then the batch is marked failed instead of retrying forever. +- `workers_per_gpu=1` remains the safe default on A100 40GB nodes. Prefer increasing `target_batch_pages` before adding more workers per device. +- For fresh GCP A100 nodes, run `python -m glossapi.scripts.deepseek_runtime_report --repo-root ` before applying ad hoc fixes. Treat that report as the baseline comparison against a known-good node. See [operations/deepseek_gcp_a100_setup.md](operations/deepseek_gcp_a100_setup.md). + ## Provider & Device Checks - ONNXRuntime providers must include `CUDAExecutionProvider`. diff --git a/docs/ocr_and_math_enhancement.md b/docs/ocr_and_math_enhancement.md index 197bb0a..b013dd3 100644 --- a/docs/ocr_and_math_enhancement.md +++ b/docs/ocr_and_math_enhancement.md @@ -1,15 +1,14 @@ # GPU OCR and Math Enrichment -This document summarizes how GlossAPI uses the GPU for OCR and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. 
+This document summarizes how GlossAPI uses the GPU for OCR remediation and formula/code enrichment, how to run each phase efficiently, and where artifacts are written. ## Overview -- Phase‑1 (Extract): PDF → Markdown via Docling; optional GPU OCR via RapidOCR (ONNXRuntime). Optionally emit JSON + formula index for Phase‑2. +- Phase‑1 (Extract): PDF → Markdown via Docling or the safe backend. Optionally emit JSON + formula index for Phase‑2. - Phase‑2 (Enrich): From Docling JSON, decode math/code on the GPU (CodeFormula) and re‑emit enriched Markdown. Backends -- `backend='rapidocr'` (default): Docling + RapidOCR; Phase‑2 math runs from Docling JSON. -- `backend='deepseek'`: DeepSeek‑OCR; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. +- `backend='deepseek'`: DeepSeek-OCR-2; equations are included inline in OCR output, so Phase‑2 math is not required and is treated as a no‑op. Policy: never OCR and math on the same file - If a file needs OCR, GlossAPI runs OCR only (no Phase‑2 on that file in the same pass). @@ -18,24 +17,43 @@ Policy: never OCR and math on the same file ### Python API layout - DeepSeek entry point: `glossapi.ocr.deepseek.runner.run_for_files(...)` -- RapidOCR dispatcher: `glossapi.ocr.rapidocr.dispatch.run_via_extract(...)` - Math enrichment: `glossapi.ocr.math.enrich.enrich_from_docling_json(...)` - Utility helpers (Docling JSON / cleaning): `glossapi.ocr.utils.*` ## Prerequisites -- RapidOCR/Docling stack: `pip install '.[rapidocr]'` -- DeepSeek CLI stack (in a dedicated venv recommended): `pip install '.[deepseek]'` -- ONNXRuntime GPU installed (no CPU ORT): `onnxruntime-gpu==1.18.1` -- Torch CUDA installed: e.g., `torch==2.5.1+cu121` -- Packaged RapidOCR models/keys found under `glossapi/models/rapidocr/{onnx,keys}` or via `GLOSSAPI_RAPIDOCR_ONNX_DIR`. 
+- Main GlossAPI stack: `./dependency_setup/setup_glossapi.sh --mode docling` +- DeepSeek runtime: `./dependency_setup/setup_deepseek_uv.sh --venv dependency_setup/.venvs/deepseek` +- Torch CUDA installed in the DeepSeek env (the uv setup pins the tested stack). - Optional helpers for Phase‑2 JSON: `pypdfium2`, `zstandard`. +### Standard DeepSeek venv + +Use a dedicated OCR runtime and treat it as the source of truth for DeepSeek runs: + +```bash +./dependency_setup/setup_deepseek_uv.sh \ + --venv dependency_setup/.venvs/deepseek \ + --model-root /path/to/deepseek-ocr-2-model \ + --download-model \ + --run-tests --smoke-test +``` + +Recommended environment variables after setup: + +```bash +export GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 +export GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 +export GLOSSAPI_DEEPSEEK_PYTHON="$PWD/dependency_setup/.venvs/deepseek/bin/python" +export GLOSSAPI_DEEPSEEK_MODEL_DIR="/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2" +``` + +The OCR runtime should not silently drift between ad hoc virtual environments during benchmarking. If a benchmark uses a different DeepSeek venv, treat the result as a different runtime stack. 
+ Verify GPU readiness before forcing OCR or math: ```bash python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())" # expects True, >=1 -python -c "import onnxruntime as ort; print(ort.get_available_providers())" # must include CUDAExecutionProvider ``` ## Running Phase‑1 (Extract) @@ -44,17 +62,14 @@ python -c "import onnxruntime as ort; print(ort.get_available_providers())" from glossapi import Corpus c = Corpus('IN','OUT') -# GPU OCR on PDFs; emit JSON + formula index for Phase‑2 +# Emit JSON + formula index for Phase‑2 c.extract( input_format='pdf', - accel_type='CUDA', # or use_gpus='multi' for multi‑GPU - force_ocr=True, # OCR always on for PDFs + accel_type='CUDA', emit_formula_index=True, # request json/.formula_index.jsonl alongside the default JSON ) ``` -When `force_ocr=True` (or when math/code enrichment is enabled), GlossAPI automatically switches to the Docling backend and aborts if CUDA‑enabled torch/ONNXRuntime providers are not available. - Outputs: - `markdown/.md` - `json/.docling.json(.zst)` and `json/.formula_index.jsonl` @@ -88,20 +103,64 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → runs OCR only for bad files; equations are included inline; Phase‑2 is skipped ``` -If you need Phase‑2 math on files that do not require OCR, use RapidOCR/Docling and math‑only (expects Docling JSON from Phase‑1): +If you need Phase‑2 math on files that do not require OCR, run `math_only` after Docling extraction with JSON enabled. 
+ +### DeepSeek fast path + +The current recommended high-throughput DeepSeek configuration is: + +- `runtime_backend='vllm'` +- `ocr_profile='markdown_grounded'` +- `max_new_tokens=2048` as the standard default ceiling +- `repair_mode='auto'` to keep markdown as the primary output while selectively rerunning suspicious pages +- `scheduler='auto'` so multi-GPU vLLM runs resolve to exact-fill page-range batching +- `target_batch_pages=160` +- large `vllm_batch_size` chosen to keep `sec/page/GPU` at or below the best validated floor for the target hardware + +Example: ```python -c.ocr(backend='rapidocr', fix_bad=False, math_enhance=True, mode='math_only') -# → runs Phase‑2 on non‑OCR files only (requires Docling JSON) +c.ocr( + backend='deepseek', + fix_bad=True, + math_enhance=False, + runtime_backend='vllm', + ocr_profile='markdown_grounded', + max_new_tokens=2048, + vllm_batch_size=160, + gpu_memory_utilization=0.9, + repair_mode='auto', + scheduler='auto', + target_batch_pages=160, + use_gpus='multi', +) ``` +`repair_mode='auto'` runs the pipeline in distinct phases inside the vLLM runner: + +1. markdown first pass over all rendered pages +2. cheap per-page triage using output quality plus simple image density statistics +3. plain-text rerun bucket for garbage markdown pages +4. tiled markdown rerun bucket for short coverage failures + +This keeps the fast path batched while avoiding per-page sequential fallback overhead. 
+ +### What is now implemented + +- Empty-page skipping before OCR dispatch +- Streaming garbage early-stop during markdown generation +- Plain-text retry for pages that hit the garbage early-stop +- Multi-GPU exact-fill page-range scheduling for the DeepSeek runner +- Benchmark harness support for `whole_doc`, `fixed_shard`, and `exact_fill` +- Corpus API forwarding for the scheduler controls + ## Multi‑GPU Phase‑1 (extract): ```python -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.extract(input_format='pdf', use_gpus='multi', phase1_backend='docling', workers_per_device=2) ``` -Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. OCR uses ORT GPU under the same process. +Workers set `CUDA_VISIBLE_DEVICES` per process; Docling runs on `cuda:0` relative to each worker. Phase‑2 (enrich): ```python @@ -118,9 +177,73 @@ Spawns math workers; each binds to its GPU using `CUDA_VISIBLE_DEVICES` and runs ## Performance & Tuning +### Validated benchmark floor + +The current non-regression metric is `sec/page/GPU`. 
+ +Validated on 2026-03-30: + +- Host: AWS `g7e.48xlarge` +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Render DPI: `144` +- GPU memory utilization: `0.9` +- Best large-batch single-GPU floor observed: `0.3109 sec/page/GPU` + +Production markdown+repair benchmark on the same host: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Max new tokens: `2048` +- GPUs: `8` +- Static sharding (`1` shard/GPU), validated rerun after classifier hardening: `558.88s` wall, `0.0733 sec/page` overall, `0.4912` to `0.5475 sec/page/GPU` +- Streaming admission (`stream_batch_pages=160`): `928.81s` wall, `0.1218 sec/page` overall, `0.5469` to `0.6856 sec/page/GPU` +- Peak VRAM in both runs stayed at about `88,953 MiB` per active GPU +- Static active-lane GPU utilization averaged about `65%` to `75%`; streaming active-lane utilization stayed similar while whole-run occupancy got worse because more lanes sat idle between batches + +Validated on 2026-03-31 after standardizing the DeepSeek runtime ceiling back to `2048` and restoring the persistent one-process-per-lane architecture: + +- Corpus: `43` OA PDFs, `7,624` pages +- Runtime: `vllm` +- Profile: `markdown_grounded` +- Repair mode: `auto` +- Scheduler: `whole_doc` +- Max new tokens: `2048` +- GPUs: `8` +- Clean rebuilt whole-document rerun: about `541s` wall, `0.0710 sec/page` overall, and `0.3927` to `0.5000 sec/page/GPU` + +Interpretation: + +- The rebuilt stack is back near the validated March 30 throughput once the silent `8192` ceiling regression is removed. +- The remaining performance problem is not raw inference speed; it is whole-document tail imbalance, where one oversized PDF can keep a single GPU busy after the other lanes finish. +- Multi-GPU `exact_fill` must therefore be benchmarked only on the persistent lane-worker architecture. 
The earlier exact-fill regression was caused by spawning a fresh OCR CLI per batch, not by the scheduling idea itself. + +Decision: + +- Keep static sharding as the default large-run pipeline shape for now +- Do not enable streaming admission by default yet; on this benchmark it regressed badly versus static sharding +- Treat the earlier `0.3109 sec/page/GPU` result as the raw floor, and the static repaired-markdown result above as the current production-like baseline on this hardware +- Treat the 2026-03-31 clean whole-document rerun as the restored benchmark sanity check for the standardized `2048` ceiling on the rebuilt runtime + +Attention/runtime note: + +- The production fast path is `vllm`; logs on this stack show `flashinfer` autotuning plus CUDA graph capture +- Transformers remain the fallback path; prefer `flash_attention_2` there and do not optimize around `sdpa` + +That number is the floor to preserve or beat when tuning the full markdown pipeline. Faster raw runs that change the effective output mode or bypass repair logic do not replace it as the production baseline. + +Default policy note: + +- The standard DeepSeek OCR default is now `max_new_tokens=2048` for both the Transformers and vLLM runners. +- Leaving the flag unset must not silently expand to a larger ceiling such as `8192`. +- When comparing benchmark runs, treat a different token ceiling or a different DeepSeek venv as a different runtime/configuration. + - Batch sizes - - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula docling side throughput. + - Inline (Phase‑1): `GLOSSAPI_FORMULA_BATCH` (default 16) sets CodeFormula throughput. - Phase‑2: `batch_size` / `math_batch_size` parameter (typ. 8–16) balances VRAM and speed. + - DeepSeek vLLM: push `vllm_batch_size` as high as the hardware allows while tracking `sec/page/GPU`; on the validated `g7e.48xlarge` path, larger batches continued improving throughput through `batch_size=160`. 
- Images scale for OCR: `GLOSSAPI_IMAGES_SCALE` (~1.1–1.25) can improve detection on thin glyphs. - CPU threads: cap `OMP_NUM_THREADS` / `MKL_NUM_THREADS` to avoid CPU oversubscription on multi‑GPU nodes. @@ -159,11 +282,7 @@ OUT/ ## Troubleshooting -- Missing CUDAExecutionProvider - - Ensure `onnxruntime-gpu` is installed and `onnxruntime` CPU is uninstalled. - Torch reports no CUDA - Check `nvidia-smi` and match Torch CUDA build to your driver. -- OCR is slow or falls back to CPU - - Confirm ORT providers include CUDAExecutionProvider and that `accel_type='CUDA'` is used. - Out of memory - Lower `batch_size` for Phase‑2, reduce `GLOSSAPI_IMAGES_SCALE`, or split inputs. diff --git a/docs/ocr_noise_failure_modes.md b/docs/ocr_noise_failure_modes.md new file mode 100644 index 0000000..6017e9c --- /dev/null +++ b/docs/ocr_noise_failure_modes.md @@ -0,0 +1,118 @@ +# OCR Noise Failure Modes + +Status: example bank for future `Corpus.clean_ocr(...)` heuristics. These are notes only, not implemented cleaning rules. + +## Why This Exists + +The preserved OCR outputs contain several distinct failure modes that should not be collapsed into one generic `ocr_noise` rule. Some are page-local low-entropy collapses, some are encoding/control-character tails, and some are repetitive math-token artifacts that need math-aware handling. + +The examples below were reviewed on April 3, 2026 from the preserved OCR lane: + +- `/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown` + +## Group 1: Page-Local Low-Entropy Numeric Collapse + +Definition: +pages that collapse into highly repetitive short numeric lines, often immediately after a page split marker. 
+ +Examples: + +- `ABO_768__p00001-00096.md` + - around line 955 the page turns into repeated `0`, `0 0`, `0 0 0` + - the collapse begins directly after `<--- Page Split --->` +- `ACH_787__p00001-00096.md` + - around line 755 the page turns into repeated `1.1` and occasional `1` + - this also begins directly after `<--- Page Split --->` + +Anchored references: + +- [ABO_768__p00001-00096.md:955](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ABO_768__p00001-00096.md#L955) +- [ACH_787__p00001-00096.md:755](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ACH_787__p00001-00096.md#L755) + +Detection ideas: + +- page-level repeated-line detection, not just single-line run detection +- low token entropy on a page-sized region +- special weight if the collapse starts right after `<--- Page Split --->` +- repeated short numeric lines should be treated separately from legitimate tables or lists + +Important note: +the current OCR numeric-noise check is line-local and is better at catching long same-number or ascending sequences inside one line than these repeated-line page collapses. + +## Group 2: Control-Character / Encoding-Garbage Tails + +Definition: +pages that devolve into non-printable or control-like characters, often after otherwise valid text. 
+ +Example: + +- `ADQ_670.md` + - after a page split, the page contains `%` followed by C1/control-like junk such as `€`, ``, `‚`, ..., `°` + - this is not just numeric repetition; it looks like decoding/binary leakage or severe mojibake-like corruption + +Anchored references: + +- [ADQ_670.md:887](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L887) +- [ADQ_670.md:954](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L954) + +Detection ideas: + +- count non-printable/control codepoints +- count dense runs of extended control-like characters on a page +- flag abrupt transitions from valid prose to control-character tails +- keep this separate from ordinary mojibake and separate from numeric collapse + +## Group 3: Repetitive Math-Token Floods + +Definition: +pages or page segments that repeat the same LaTeX-like math atoms or malformed math atoms many times. + +Examples: + +- `ADS_856__p00001-00014.md` + - repeated `\( \gamma \)` sequence on one line +- `ADS_856__p00015-00082.md` + - repeated `\( \Delta_{v} \)` blocks + - malformed variants like `\( \Deltav \)` + - long concatenated runs like `\Delta_{v}\Delta_{v}\Delta_{v}...` + +Anchored references: + +- [ADS_856__p00001-00014.md:139](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00001-00014.md#L139) +- [ADS_856__p00015-00082.md:1](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00015-00082.md#L1) + +Detection ideas: + +- tokenize LaTeX-like math atoms and detect repeated-token floods +- distinguish valid repeated notation from pathological repetition +- score malformed math variants separately from valid math tokens +- this should remain an experimental detector, not a blunt drop rule + +Important note: +real mathematical texts can legitimately repeat symbols, so this class needs a math-aware heuristic rather than a general 
repetition penalty. + +## Grouping Recommendation + +Do not collapse all of the above into one rule. + +Recommended future flags: + +- `ocr_numeric_page_collapse` +- `ocr_control_char_tail` +- `ocr_math_repetition` + +Recommended future metadata: + +- page-local region counts +- page-split proximity flags +- repeated-line entropy or uniqueness ratio +- control-character density +- math-token repetition density + +## Current Examples To Keep Around + +- [ABO_768__p00001-00096.md:955](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ABO_768__p00001-00096.md#L955) +- [ACH_787__p00001-00096.md:755](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ACH_787__p00001-00096.md#L755) +- [ADQ_670.md:887](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADQ_670.md#L887) +- [ADS_856__p00001-00014.md:139](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00001-00014.md#L139) +- [ADS_856__p00015-00082.md:1](/home/foivos/data/openarchives_ocr_ingest_20260403/lanes/eu_node01_full_v1/markdown/ADS_856__p00015-00082.md#L1) diff --git a/docs/ocr_repetition_policy.md b/docs/ocr_repetition_policy.md new file mode 100644 index 0000000..eccd446 --- /dev/null +++ b/docs/ocr_repetition_policy.md @@ -0,0 +1,42 @@ +# OCR Repetition Policy + +This document pins the intended default repetition thresholds for OCR-cleaner development so they do not drift silently. + +## Defaults + +- Shared word repetition threshold: `4` +- Shared LaTeX repetition threshold: `4` +- Shared minimum repeat period: `3` +- Shared repeat window: `96` + +These defaults apply to the combined OCR debug annotator: +- `Corpus.clean_ocr_numeric_word_debug_docs(...)` + +The same analyzer now also drives real clean-mode rendering in `clean_ocr()`; +debug and clean differ only in rendering, not in span discovery. 
+ +In that pipeline: +- tables are handled first +- numeric detection runs before generic text ownership +- LaTeX and hybrid structural detection run before shared text repetition +- shared repeat detection runs last on the remaining untagged text + +## Scope + +These defaults are for: +- word repetition +- LaTeX repetition + +They do not override numeric-specific detectors, which have their own thresholds such as: +- ascending numeric progressions +- compact repeated numeric atoms +- same-digit numeric runs + +## Design Intent + +- Neighboring same-type spans may merge when their separator has at most `40` non-whitespace characters; this keeps fragmented OCR loops from being split into multiple tiny matches. +- A default of `4` is meant to reduce borderline `3`-repeat matches. +- Locality matters more than page-wide reuse, especially for LaTeX. +- Repeated symbols or notation used normally across a page should not be treated as cleaner targets by default. +- Numeric progression should be handled by numeric or hybrid logic before text repetition sees it. +- Table cleanup includes structural cases that are not repetition problems, so table policy is documented separately in `docs/architecture/ocr_cleaning_runtime.md`. diff --git a/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md b/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md new file mode 100644 index 0000000..2e6605d --- /dev/null +++ b/docs/operations/DELETE_ME_deepseek_reliability_pending_2026-04-02.md @@ -0,0 +1,39 @@ +# DELETE ME: DeepSeek Reliability Pending Work + +This note is temporary. Delete it after the first production soak confirms the +merged reliability path is stable and the follow-up items below are either done +or explicitly discarded. 
+ +## What shipped in this merge + +- durable multi-GPU DeepSeek work queue with separate main and repair phases +- worker respawn with process-group teardown so orphaned `VLLM::EngineCore` + processes do not pin VRAM after a crash +- GPU preflight and telemetry sidecars under `sidecars/ocr_runtime/` +- steady-state timing in the runtime summary +- default work-item retry ceiling of two total attempts + - first failure: retry once + - second failure: mark the batch failed and stop retrying it + +## Pending follow-up + +1. Capture and archive one clean fault-injection receipt on the merged + `development` branch. + - Goal: preserve one explicit production-like run where a worker is killed + mid-run, the supervisor respawns it, the in-flight batch is retried once, + and the run still completes. + +2. Add operator-facing handling for terminally failed batches. + - The durable queue already marks them `failed`. + - The remaining work is a cleaner operator handoff, for example a dedicated + quarantine/export path or a documented replay workflow. + +3. Replace the current image-content stats implementation in + `run_pdf_ocr_vllm.py`. + - It still uses a CPU-heavy PIL pixel scan and currently emits a Pillow + deprecation warning. + +4. Run a longer unattended soak after merge. + - The current validation covers targeted tests, full end-to-end runs, and + reliability-path implementation, but production confidence still benefits + from a longer multi-hour burn-in on the merged branch. diff --git a/docs/operations/deepseek_gcp_a100_setup.md b/docs/operations/deepseek_gcp_a100_setup.md new file mode 100644 index 0000000..20d9209 --- /dev/null +++ b/docs/operations/deepseek_gcp_a100_setup.md @@ -0,0 +1,160 @@ +# DeepSeek GCP A100 Setup + +This note captures the current known-good baseline for bringing up GlossAPI +DeepSeek OCR on fresh GCP A100 nodes and the required diagnosis workflow when a +fresh node does not behave like the already-converged fleet. 
+ +## Goal + +Treat a fresh OCR node as a reproducible setup target, not as a one-off machine +that is repaired interactively until it happens to work. + +The target is a clean path from: + +1. create instance +2. bootstrap machine +3. prepare GlossAPI runtime +4. run a normal GlossAPI OCR workflow + +## Known-good baseline + +This rollout has validated the following stack on working OCR fleet nodes: + +- Ubuntu `22.04.5` +- NVIDIA driver `590.48.01` +- `A100 40GB` GPUs +- host Python `3.10` +- DeepSeek venv Python `3.11` from a stable final CPython, not a prerelease distro build +- `torch 2.10.0+cu130` +- `vllm 0.18.0` +- `transformers 4.57.6` +- `workers_per_gpu=1` + +The runner also expects GPU persistence mode to be enabled and will record the +preflight result under `sidecars/ocr_runtime/gpu_preflight.json`. + +## First command on a fresh node + +Run the checked-in runtime report before changing code or applying ad hoc fixes: + +```bash +python -m glossapi.scripts.deepseek_runtime_report --repo-root /opt/glossapi/repo +``` + +The report prints: + +- OS and hostname +- repo revision +- GPU model, driver, and memory +- selected Python executable and venv root +- `torch` / `vllm` / `transformers` import details +- wheel-managed NVIDIA library directories +- a focused `pip freeze` subset +- selected runtime environment variables + +Prefer comparing this output against a known-good OCR node before modifying +GlossAPI itself. + +## Fresh-node diagnosis rule + +If a fresh node fails, classify the problem before patching code: + +1. instance creation choice + - wrong image + - wrong driver path + - wrong machine family or GPU shape +2. bootstrap incompleteness + - missing system packages + - missing wheel-managed CUDA libraries + - model / cache / filesystem layout mismatch + - missing env wiring +3. 
actual GlossAPI runtime assumption + - hidden dependency on a particular venv layout + - hidden dependency on a specific CUDA wheel layout + - hidden runner / vLLM startup assumption + +Write down which class the current failure belongs to before making broad code +changes. + +## Current benchmark-node findings + +The fresh `a2-highgpu-2g` benchmark node used during the April 3, 2026 work +surfaced two setup classes: + +- early missing shared-library failure: + - `ImportError: libcudart.so.12: cannot open shared object file` +- later engine startup failure after bootstrap fixes: + - `RuntimeError: Engine core initialization failed. Failed core proc(s): {}` + +This means instance creation itself worked, but bootstrap/runtime reproducibility +was incomplete. + +The concrete bootstrap issues found on that node were: + +- `uv` existed only in `~/.local/bin`, which non-interactive shells were not using +- the default DeepSeek venv was created against `/usr/bin/python3.11`, which on + that node was `Python 3.11.0rc1` +- system cargo/rustc were too old to parse the repo `Cargo.lock` +- the DeepSeek venv still needed the cu12 runtime pair for `vllm._C` to import: + - `nvidia-cuda-runtime-cu12` + - `nvidia-cuda-nvrtc-cu12` + +After correcting those bootstrap defects, the same fresh node was able to: + +- import `vllm._C` +- initialize a direct one-GPU `LLM(...)` +- start a real `openarchives_ocr_run_node` workload with `runtime_backend=vllm` + +The same node was also used for a real `10`-PDF `extract -> clean -> ocr` +checkpoint: + +- the stable end-to-end shape on that node was: + - multi-GPU extraction + - `workers_per_device=1` + - multi-GPU DeepSeek OCR with `workers_per_gpu=1` +- an isolated extraction benchmark with `workers_per_device=2` was faster on the + same sample, but the first full-pipeline replay hit a Docling allocator crash: + - `malloc_consolidate(): unaligned fastbin chunk detected` +- treat `workers_per_device=2` as benchmark-only / experimental until it 
is + proven stable in the full Corpus pipeline, not just in extract-only tests + +The full-pipeline checkpoint harness also now retries the JSONL export when OCR +has already filled text into parquet rows but the first export pass still emits +zero records. This guards the observed end-of-run export race on the benchmark +node without changing the OCR output contract itself. + +## Current runner expectation + +`glossapi.ocr.deepseek.runner._build_env()` now auto-discovers +`site-packages/nvidia/*/lib` directories under the selected DeepSeek virtualenv +and prepends them to `LD_LIBRARY_PATH`. + +This is the right place to normalize wheel-managed CUDA library discovery. Do +not rely on manual shell-session exports as the primary contract. + +## Practical bring-up checklist + +1. confirm the node matches the OS / driver baseline +2. export user-local tool paths explicitly for non-interactive shells: + - `export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"` +3. install a stable CPython explicitly, for example: + - `~/.local/bin/uv python install 3.11.11` +4. run `deepseek_runtime_report` +5. compare report output to a known-good node +6. fix bootstrap mismatches first +7. rerun the report +8. only then run a small OCR validation workload +9. 
if OCR still fails, inspect worker logs and decide whether the remaining gap + belongs in GlossAPI runtime code or external bootstrap + +## Rust note + +If editable installs fail while building `glossapi_rs_cleaner` or +`glossapi_rs_noise`, prefer a user-local modern Rust toolchain: + +```bash +curl https://sh.rustup.rs -sSf | sh -s -- -y +export PATH="$HOME/.cargo/bin:$PATH" +rustup toolchain install stable +rustup default stable +``` diff --git a/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md b/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md new file mode 100644 index 0000000..734b5d8 --- /dev/null +++ b/docs/operations/ocr_changes_2026-04-01_to_2026-04-03.md @@ -0,0 +1,82 @@ +# OCR Changes Merged To `development` (2026-04-01 to 2026-04-03) + +This note summarizes the OCR-facing changes already merged into +`development`, centered on commit `489698e` (`deepseek reliability hardening`). + +Use it as a short operator/developer changelog for the April 1-3 rollout. + +## Runtime reliability + +- DeepSeek multi-GPU OCR now runs through a durable SQLite work queue instead of + fragile fixed subprocess assignment. +- Work items heartbeat while running and are requeued if a worker dies or goes + stale. +- Failed work items now default to one retry (`max_attempts=2` total attempts), + then become terminal failures for operator follow-up instead of bouncing + forever. +- Repair work is durable too: first-pass batches populate a second repair queue + that workers drain after the main queue is empty. +- Workers are launched in their own process groups so respawn can clean up + orphaned runtime processes and recover GPU memory. + +## Throughput and observability + +- vLLM OCR now renders pages into memory and feeds a bounded render queue + directly into inference, removing the temporary image-file round trip. +- Rendering and inference overlap during the first pass. +- Empty pages are detected before inference and skipped early. 
+- Per-worker runtime JSON, GPU preflight output, GPU telemetry, durable queue + state, and the final runtime summary now live under `sidecars/ocr_runtime/`. +- Runtime summaries now expose steady-state inference timestamps so long-run + throughput can be measured without startup noise. + +## Output contract and repair behavior + +- Canonical OCR outputs remain one `markdown/<stem>.md` and one + `json/metrics/<stem>.metrics.json` per source PDF. +- Page boundaries are annotated with `<!-- ... -->` comments alongside the + page split markers. +- Internal shard markdown and shard metrics move under `sidecars/ocr_shards/` + so downstream stages do not mistake them for canonical outputs. +- If a repair retry hits the garbage cutoff again, GlossAPI now blanks that page + slot instead of preserving the failed garbage text. +- Repair queue durability and repair execution packing are separate concerns: + queue accounting stays item-granular, while workers are allowed to combine + multiple repair items into one larger execution batch. + +## Fresh-node setup implications + +- The runner now auto-discovers wheel-managed CUDA libraries inside the selected + DeepSeek virtualenv and prepends them to `LD_LIBRARY_PATH`. +- Fresh A100 nodes should be validated first with: + +```bash +python -m glossapi.scripts.deepseek_runtime_report --repo-root <repo-root> +``` + +- The currently validated fleet baseline is: + - Ubuntu `22.04.5` + - NVIDIA driver `590.48.01` + - A100 `40GB` + - `torch 2.10.0+cu130` + - `vllm 0.18.0` + - `transformers 4.57.6` + - `workers_per_gpu=1` + +## Test coverage added with the merge + +- durable queue requeue / retry behavior +- repair queue enqueue and phase switching +- repair execution packing +- worker runtime summaries and runner contracts + +## What this doc does not cover + +This note only summarizes OCR work already merged into `development`. 
+ +It does not describe the still-in-progress branch work for: + +- fresh-node bootstrap hardening beyond `development` +- stronger OCR metadata continuity +- canonical text-bearing OCR parquet outputs +- additional extract-clean-ocr integration validation diff --git a/docs/operations/openarchives_ocr_rollout_plan.md b/docs/operations/openarchives_ocr_rollout_plan.md new file mode 100644 index 0000000..56136fa --- /dev/null +++ b/docs/operations/openarchives_ocr_rollout_plan.md @@ -0,0 +1,498 @@ +# OpenArchives OCR Rollout Plan + +This document records the concrete execution plan for running DeepSeek OCR over the OpenArchives subset with `needs_ocr=True`, including how to recover or regenerate the routing state, how to shard work across AWS nodes, and how to merge results back into the canonical GlossAPI corpus. + +## Implemented tooling + +The rollout is backed by concrete scripts in `src/glossapi/scripts/`: + +- `openarchives_ocr_enrich.py` + - reads the canonical OpenArchives parquet + - scans raw HF JSONL shards for the target docs + - extracts `page_count_source`, `pages_total_source`, and `pdf_url` + - writes a shard-ready enriched parquet for OCR deployment +- `openarchives_ocr_shards.py` + - reads the canonical parquet + - filters `needs_ocr=True` + - balances documents across `N` nodes by page count + - writes one shard manifest parquet per node + - writes a JSON summary with page totals and ETA +- `openarchives_ocr_merge.py` + - merges shard-level OCR metadata back into the canonical parquet by `filename` + - can also embed merged OCR `text` plus artifact linkage fields back into the canonical rows when OCR markdown artifacts are available + - unifies page-range shard markdown back into one canonical document-level markdown artifact per OCR row before downstream handoff + +These scripts are intentionally document-level rather than page-fragment-level so merge stays simple and GlossAPI-compatible. 
+ +## Executed result on 2026-03-31 + +The CPU fallback path has now been executed successfully on AWS: + +- CPU cleaner node: + - instance: `c7i.8xlarge` + - instance id: `i-0ccf5ab1a510b31d8` +- Full OA reevaluation fill: + - input rows: `179,845` + - missing `greek_badness_score` rows materialized and cleaned: `89,892` + - unique raw JSONL shards needed for the fill subset: `108` +- Filled routing result: + - `greek_badness_score` coverage: `179,845 / 179,845` + - `needs_ocr == true`: `45,547` +- Enriched OCR target manifest: + - OCR-target docs: `45,547` + - OCR-target pages: `3,292,392` + - raw JSONL shards needed for the full OCR target set: `218` +- Balanced 4-node shard result: + - `4` shard manifests + - `823,098` pages per node + - `11,386` or `11,387` docs per node +- ETA from validated `g7e.48xlarge` throughput: + - one node: `64.94h` + - four nodes: `16.23h` + +Published artifacts on Hugging Face dataset `glossAPI/openarchives.gr`: + +- `data/openarchives_ocr_completion/20260331/summary.json` +- `data/openarchives_ocr_completion/20260331/filled_document_level.parquet` +- `data/openarchives_ocr_completion/20260331/filled_document_quality.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/needs_ocr_enriched.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_00.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_01.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_02.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_node_03.parquet` +- `data/openarchives_ocr_completion/20260331/ocr_shards/openarchives_ocr_shard_summary.json` + +## Node runner contract + +Each OCR node should materialize one shard into its own GlossAPI corpus root and +run DeepSeek OCR through the standard `Corpus.ocr(...)` API, not through a +standalone benchmark wrapper. 
+ +Stored runner: + +- `python -m glossapi.scripts.openarchives_ocr_run_node` +- `python -m glossapi.scripts.openarchives_download_freeze` + +The runner does four things in order: + +1. reads one shard parquet +2. downloads the shard PDFs into `downloads/` using their OA filenames +3. writes the shard metadata as canonical `download_results/download_results.parquet` +4. runs `Corpus.ocr(...)` with the validated DeepSeek settings + +The download-freeze runner is the matching download-only entrypoint: + +1. reads one OA manifest parquet +2. downloads the PDFs into `downloads/` using their OA filenames +3. writes canonical `download_results/download_results.parquet` +4. stops there, without starting OCR + +Download policy note: + +- OpenArchives download should be host-first, not collection-first. +- GlossAPI now supports host-specific download policy overrides in the normal downloader path for: + - `downloader` + - `request_timeout` + - `ssl_verify` + - `ssl_cafile` + - `request_method` + - `sleep` + - `per_domain_concurrency` + - `domain_concurrency_floor` + - `domain_concurrency_ceiling` + - `skip_failed_after` + - `domain_cookies` +- That means the OA freeze-download phase can stay inside `Corpus.download(...)`; we do not need a separate downloader implementation. +- Stored OA policy sample: + - `samples/openarchives_download_policy.yml` +- Stored OA probe runner: + - `python -m glossapi.scripts.openarchives_download_probe` +- OA download runs should use `scheduler_mode=per_domain` together with `parallelize_by=base_domain`, + otherwise the host-level concurrency policy is mostly inert. 
+- Probe result on the CPU box: + - `dspace.lib.ntua.gr` succeeds cleanly once OA downloads use `scheduler_mode=per_domain` + and the host is throttled to a single in-flight request + - `ktisis.cut.ac.cy` succeeds with `ssl_verify=false` + - `repository.academyofathens.gr`, `repository.ihu.gr`, `pergamos.lib.uoa.gr`, + and `dione.lib.unipi.gr` behaved like standard hosts in the probe + - `ikee.lib.auth.gr` is not just a pre-ping false negative; direct PDF requests hit + real connection timeouts + - `olympias.lib.uoi.gr` is not just a pre-ping false negative either; direct PDF + requests reach the host but stall on response reads +- Operational recommendation: + - bulk-freeze the good hosts first + - keep `ikee.lib.auth.gr` and `olympias.lib.uoi.gr` in a dedicated slow-path download phase + so they do not dominate the main corpus freeze run + +Standard node command: + +```bash +PYTHONPATH=src /home/ubuntu/venvs/deepseek/bin/python -m glossapi.scripts.openarchives_ocr_run_node \ + --shard-parquet /data/openarchives/shards/openarchives_ocr_shard_node_00.parquet \ + --work-root /data/openarchives/node_00 \ + --heartbeat-path /data/openarchives/heartbeats/node_00.json \ + --instance-id "$INSTANCE_ID" \ + --node-id node-00 \ + --scheduler whole_doc \ + --runtime-backend vllm \ + --ocr-profile markdown_grounded \ + --render-dpi 144 \ + --max-new-tokens 2048 \ + --repair-mode auto \ + --gpu-memory-utilization 0.9 +``` + +Current rollout note: + +- use `scheduler=whole_doc` for the first production OA pass because that is the + last large-run configuration validated cleanly on the standardized stack +- keep `exact_fill` as the next benchmarking target, but do not silently switch + the production rollout to it until the same stack shows a non-regression or + improvement + +## Current validated baseline + +- Validated OCR node type: `g7e.48xlarge` +- Validated AMI: `ami-052266c3e21dff7db` +- AMI name: `Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 24.04) 20260320` +- 
Validated runtime stack on the OCR node: + - `torch 2.10.0+cu130` + - `vllm 0.18.0` + - `transformers 4.57.6` +- Standard DeepSeek settings: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` +- Restored clean benchmark on the stopped OCR box: + - `7,624` pages in about `541s` + - about `0.0710 sec/page` overall on one `8`-GPU node + - about `0.3927` to `0.5000 sec/page/GPU` +- Derived per-node throughput: + - about `14.08 pages/sec` + - about `50,700 pages/hour` + +## Current AWS capacity + +`us-east-1` service quotas currently allow: + +- `Running On-Demand G and VT instances = 768` +- `Running On-Demand Standard instances = 640` + +For the validated OCR node: + +- `g7e.48xlarge = 192 vCPU, 8 GPUs` + +So the current maximum concurrent validated OCR fleet is: + +- `floor(768 / 192) = 4` nodes +- total rollout capacity: `32 GPUs` + +## Phase 1: Recover or regenerate the canonical OCR routing state + +Goal: + +- produce one canonical `download_results/download_results.parquet` for the OpenArchives corpus root +- ensure it contains, at minimum: + - `filename` + - `needs_ocr` + - `greek_badness_score` + - `mojibake_badness_score` + - `ocr_success` + - `page_count` or `pages_total` + +Decision order: + +1. Check the stopped GPU OCR instance first. +2. If the full corpus parquet is not there, run a dedicated CPU cleaning pass. + +### 1A. Check the stopped OCR instance first + +Reason: + +- the NVMe volume persists across stop/start +- if the full OpenArchives cleaning pass was already run there, this is the fastest path + +Concrete steps: + +1. Start instance `i-0504a326a1fee541f`. +2. SSH in and search for the full OpenArchives corpus root and canonical parquet: + - `find /opt /data /home -name download_results.parquet` + - verify row count is the full OpenArchives set, not the `43`-document benchmark subset +3. 
Validate that the parquet has the required OCR routing columns listed above. +4. If found: + - copy the canonical parquet and any supporting cleaner outputs back to stable storage + - stage a copy on `home` + - upload the parquet artifact to the Hugging Face dataset repo as routing metadata + +Acceptance check: + +- row count matches the full OpenArchives working set +- `needs_ocr=True` count is available directly from the parquet +- page totals are available + +Current state on 2026-03-31: + +- checked OCR instance `i-0504a326a1fee541f` +- no `download_results.parquet` was found under `/opt`, `/data`, or `/home` +- therefore this path did not recover the canonical OpenArchives routing parquet +- the rollout should proceed with the CPU cleaning-pass fallback below + +### 1B. Fallback: regenerate the routing state on a CPU instance + +If the OCR box does not contain the full canonical parquet: + +- launch a dedicated CPU node for the cleaner pass +- recommended instance family: `c7i` or `r7i` +- recommended first choice: `c7i.8xlarge` with sufficient gp3 storage for the OpenArchives markdown/output root + +Reason: + +- `Corpus.clean()` is CPU-bound and does not need GPUs +- we only need one clean, reproducible routing pass + +Concrete steps: + +1. Launch one Ubuntu 24.04 CPU instance. +2. Clone `glossapi-development` at `development`. +3. Bootstrap the standard GlossAPI environment. +4. Mount or sync the full OpenArchives corpus root. +5. Run `Corpus.clean()` over the full markdown corpus. +6. Verify that `download_results/download_results.parquet` now exists and includes the required OCR routing columns. +7. Store the resulting parquet: + - on the corpus root + - on `home` + - in the Hugging Face dataset repo as routing metadata + +## Phase 2: Quantify the actual OCR workload + +Once the canonical parquet exists: + +1. Filter `needs_ocr == True` +2. Count: + - total documents + - total pages from `pages_total` or `page_count` +3. 
Also record: + - `greek_badness_score > 60` + - `mojibake_badness_score > 0.1` + - overlap between those conditions and `needs_ocr` + +This step defines the real production workload and the true ETA. + +## Phase 3: Shard across nodes + +Shard across nodes by document, not by page range. + +Reason: + +- cross-node merge stays trivial +- node-local GPU scheduling already exists in GlossAPI +- splitting one document across nodes adds complexity without clear benefit + +### Coordinator manifest + +Build one coordinator manifest from the canonical parquet with: + +- `filename` +- stable OpenArchives document id or canonical filename +- `pages_total` +- `needs_ocr` + +Then: + +1. keep only `needs_ocr=True` +2. greedily bin-pack documents across `N=4` nodes by page count +3. write one shard manifest parquet per node + +Each shard manifest should contain: + +- `filename` +- `pages_total` +- `node_id` +- `shard_id` +- original metadata keys needed for rejoin + +### Node-local execution + +Each node: + +1. loads only its shard manifest +2. runs GlossAPI OCR over that subset +3. keeps standard GlossAPI outputs only: + - `markdown/<stem>.md` + - `json/metrics/*.json` + - shard-local `download_results.parquet` + +Inside each node: + +- use the existing GlossAPI DeepSeek path +- let node-local scheduling handle GPU balance +- do not invent a separate OCR metadata format + +## Phase 4: Merge back into the canonical corpus + +Merge rules: + +1. Markdown: + - copy updated `markdown/<stem>.md` into the canonical corpus root +2. Metrics: + - copy `json/metrics/*.json` into the canonical corpus root +3. 
Metadata parquet: + - concatenate shard metadata + - upsert by canonical document id / filename into the master parquet + - preserve the standard GlossAPI contract: + - `needs_ocr` + - `ocr_success` + - `processing_stage` + - page and quality fields + +Recommended additional execution metadata: + +- `ocr_node_id` +- `ocr_shard_id` +- `ocr_started_at` +- `ocr_finished_at` +- `ocr_attempt_count` + +These fields are operational and should not replace the existing GlossAPI routing fields. + +## Phase 5: Standardize all OCR nodes + +All OCR nodes should use the exact same: + +- AMI +- bootstrap script +- DeepSeek venv setup +- model path +- runtime defaults + +Standard production recipe: + +- AMI: `ami-052266c3e21dff7db` +- instance type: `g7e.48xlarge` +- DeepSeek venv created by `dependency_setup/setup_deepseek_uv.sh` +- defaults: + - `runtime_backend='vllm'` + - `ocr_profile='markdown_grounded'` + - `max_new_tokens=2048` + - `repair_mode='auto'` + - `render_dpi=144` + - `gpu_memory_utilization=0.9` + +Do not allow per-node env drift during the rollout. 
+ +Cleaner/fallback venv decision: + +- CPU cleaning pass should use the standard GlossAPI environment from `development` +- OCR nodes should use the dedicated DeepSeek venv only +- do not mix the cleaner runtime and the OCR runtime on the same benchmark measurement path + +## Instance options + +Primary OCR choice: + +- `g7e.48xlarge` + - validated benchmarked path + - `192 vCPU` + - `8` RTX PRO Server 6000 GPUs + - current recommended production OCR node + +Secondary OCR options, only if we intentionally rebenchmark: + +- `g6e.48xlarge` + - `192 vCPU` + - `8` L40S GPUs +- `g5.48xlarge` + - `192 vCPU` + - `8` A10G GPUs +- `p5.48xlarge` + - technically available, but not the cost/default target for this rollout + +Cleaner node options: + +- first choice: `c7i.8xlarge` + - `32 vCPU` + - good CPU-bound cleaner candidate +- alternative: `r7i.8xlarge` + - `32 vCPU` + - use if the cleaner pass needs more memory headroom + +## Phase 6: ETA + +Validated throughput on one node: + +- about `50,700 pages/hour` + +With `4` nodes: + +- about `202,800 pages/hour` + +Exact ETA formula: + +- `ETA_hours = total_needs_ocr_pages / 202800` + +Reference scenarios: + +- `400,000` pages: about `1.97h` +- `600,000` pages: about `2.96h` +- `800,000` pages: about `3.95h` +- `1,000,000` pages: about `4.93h` + +Equivalent document scenarios for `40,000` documents: + +- average `10` pages/doc: about `1.97h` +- average `15` pages/doc: about `2.96h` +- average `20` pages/doc: about `3.95h` +- average `25` pages/doc: about `4.93h` + +The exact ETA should be recalculated once the canonical parquet gives the real total page count for `needs_ocr=True`. + +## Phase 7: Deployment and monitoring + +### Deployment + +1. Produce canonical parquet +2. Compute shard manifests +3. Stage manifests and source data +4. Launch `4` OCR nodes +5. Bootstrap the same OCR environment on all nodes +6. Run one shard per node +7. Collect outputs +8. 
Merge back into the canonical corpus + +### Monitoring + +Each node should write a heartbeat JSON at a fixed interval with: + +- `node_id` +- `docs_done` +- `pages_done` +- current file +- GPU utilization snapshot +- VRAM usage snapshot +- last successful write time +- error count + +The coordinator should watch: + +- stale heartbeat +- zero progress +- failed OCR process +- low GPU utilization for a sustained period + +### Recovery + +- rerun only failed shard manifests +- keep shard manifests immutable +- merge is idempotent by canonical document id / filename + +## Immediate next actions + +1. Start the stopped OCR instance and search for the full OpenArchives canonical parquet. +2. If found, validate and upload the routing parquet to stable storage and Hugging Face. +3. If not found, launch one CPU instance and run the full `Corpus.clean()` pass. +4. Compute exact `needs_ocr` doc/page totals from the canonical parquet. +5. Generate the `4` node shard manifests. +6. Launch the `4` OCR nodes and execute the distributed run. diff --git a/docs/pipeline.md b/docs/pipeline.md index cb11662..2c00354 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -6,44 +6,150 @@ GlossAPI is a staged pipeline. You can enter at any stage and use the same folde The `Corpus` class is the stable surface of the project. New functionality should plug into the existing phase mixins so callers can stick to the small set of entrypoints (`download()`, `extract()`, `clean()`, `ocr()`, `section()`, `annotate()`, `export/jsonl*()`). The expected usage pattern is a short script that chains these calls; avoid ad-hoc monkeypatches or bypassing the orchestrator when adding features so downstream users retain resumability and consistent artifacts. 
-## Stages - -- Download (optional): fetch source files from URLs → `downloads/` -- Extract (Phase‑1): parse PDFs to Markdown; optional GPU OCR → `markdown/.md` -- Clean: compute quality metrics and filter low‑quality items; decide which to OCR -- OCR (compat shim): re‑run extract on filtered items with `force_ocr=True` -- JSON + index (optional): emit `json/.docling.json(.zst)` and `json/.formula_index.jsonl` for Phase‑2 -- Enrich (Phase‑2): decode FORMULA/CODE from JSON on GPU → overwrite `markdown/.md`, write `json/.latex_map.jsonl` -- Section: produce `sections/sections_for_annotation.parquet` -- Annotate: classify sections; produce `classified_sections.parquet` and `fully_annotated_sections.parquet` +## Stage Map + +| Stage | Main code | Typical inputs | Important parameters | Main outputs | +| --- | --- | --- | --- | --- | +| Download | `Corpus.download()`, `GlossDownloader.download_files()` | metadata parquet with a URL column | `input_parquet`, `links_column`, `parallelize_by`, downloader kwargs | `downloads/`, `download_results/*.parquet` | +| Extract (Phase‑1) | `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.extract_path()` | files in `downloads/` or explicit paths | `input_format`, `phase1_backend`, `use_gpus`, `devices`, `workers_per_device`, `export_doc_json`, `emit_formula_index` | `markdown/.md`, `json/.docling.json(.zst)`, `json/metrics/*.json` | +| Clean | `Corpus.clean()` | `markdown/*.md` | `threshold`, `drop_bad`, `empty_char_threshold`, `empty_min_pages` | `clean_markdown/.md`, cleaner report parquet, parquet flags such as `filter` and `needs_ocr` | +| OCR retry | `Corpus.ocr(mode='ocr_bad'...)` | parquet rows flagged by cleaner | `mode`, `fix_bad`, `use_gpus`, `devices` | refreshed `markdown/.md`, refreshed cleaner/parquet metadata | +| Phase‑2 enrich | `Corpus.ocr(mode='math_only'...)`, `Corpus.formula_enrich_from_json()` | `json/.docling.json(.zst)` and optional formula index | `math_enhance`, `math_batch_size`, 
`math_dpi_base`, `targets_by_stem` | updated `markdown/.md`, `json/.latex_map.jsonl` | +| Section | `Corpus.section()`, `GlossSection.to_parquet()` | markdown selected by cleaner/parquet | no major public knobs | `sections/sections_for_annotation.parquet` | +| Annotate | `Corpus.annotate()`, `GlossSectionClassifier.classify_sections()`, `GlossSectionClassifier.fully_annotate()` | section parquet and classifier model | `annotation_type`, `fully_annotate` | `classified_sections.parquet`, `fully_annotated_sections.parquet` | +| Triage / export | `Corpus.triage_math()`, `Corpus.jsonl()` | metrics, parquet metadata, cleaned markdown | output path for JSONL | parquet routing hints, JSONL export | + +## Stage Contracts + +### 1. Download + +- Main code: `Corpus.download()` -> `GlossDownloader.download_files()` +- Purpose: read a metadata parquet, expand list/JSON URL cells, deduplicate URLs, download supported file types, and checkpoint progress. +- Typical inputs: + - a parquet file in `input_dir` or an explicit `input_parquet` + - a URL column such as `url` or `links_column` +- Main outputs: + - downloaded files in `downloads/` + - partial/final results in `download_results/` +- Read this next if you want the scheduler details: `gloss_downloader.py` + +### 2. Extract (Phase‑1) + +- Main code: `Corpus.prime_extractor()`, `Corpus.extract()`, `GlossExtract.ensure_extractor()`, `GlossExtract.extract_path()` +- Purpose: convert source files to markdown and optional intermediate JSON artifacts. 
+- Typical inputs: + - files already present in `downloads/` + - or explicit `file_paths` +- Important parameters: + - `phase1_backend='safe'|'docling'|'auto'` + - `use_gpus='single'|'multi'` + - `workers_per_device` to fan out more than one extraction worker onto each GPU + - `export_doc_json` and `emit_formula_index` for later Phase‑2 work +- Operational note: + - `force_ocr` is deprecated and ignored in Phase‑1; use `Corpus.ocr(backend='deepseek')` after `clean()` for OCR remediation +- Main outputs: + - canonical markdown in `markdown/<stem>.md` + - optional Docling JSON and index artifacts in `json/` + - per-document and per-page metrics in `json/metrics/` + +### 3. Clean + +- Main code: `Corpus.clean()` +- Purpose: run the Rust cleaner, remove low-quality or noisy markdown, + and mark documents that may need OCR retry before moving on. +- Typical inputs: + - `markdown/*.md` + - metadata parquet, if available +- Important parameters: + - `threshold` and `drop_bad` + - `empty_char_threshold` and `empty_min_pages` for OCR fallback decisions +- Main outputs: + - cleaned markdown in `clean_markdown/` + - updated parquet metadata with quality and OCR-related flags +- Runtime/debug artifacts: + - `.processing_state.pkl` keeps track of progress so interrupted runs can resume + - `problematic_files/` keeps files that could not be cleaned successfully + - `timeout_files/` keeps files that exceeded the cleaning time limit + +### 4. OCR Retry and Phase‑2 Enrichment + +- Main code: `Corpus.ocr()` and `Corpus.formula_enrich_from_json()` +- Purpose: + - rerun OCR only for documents marked bad by the cleaner + - optionally decode formula/code regions from Docling JSON into markdown +- Modes: + - `ocr_bad` + - `math_only` + - `ocr_bad_then_math` +- Main outputs: + - refreshed `markdown/<stem>.md` + - `json/<stem>.latex_map.jsonl` when math/code enrichment runs + +### 5.
Section and Annotate + +- Main code: `Corpus.section()`, `GlossSection.to_parquet()`, `Corpus.annotate()`, `GlossSectionClassifier.*` +- Purpose: + - split markdown into sections suitable for classification + - classify sections and optionally expand coarse labels into full document structure +- Main outputs: + - `sections/sections_for_annotation.parquet` + - `classified_sections.parquet` + - `fully_annotated_sections.parquet` ## Artifact Layout -``` +The tree below shows the main folders and files GlossAPI can create under +the output directory. + +To make the layout easier to follow, artifacts are grouped by the role they +play in the pipeline: + +- canonical — the main outputs a stage is expected to produce, and the + files later stages usually depend on +- runtime — state files used to resume work safely if a run is interrupted +- debug — extra files kept around when something fails or needs a closer look + OUT/ -├── downloads/ -│ └── problematic_math/ -├── download_results/ -├── markdown/ +├── downloads/ (canonical) +│ └── problematic_math/ (debug) +├── download_results/ (canonical) +├── markdown/ (canonical) +│ └── .md +├── clean_markdown/ (canonical) │ └── .md -├── json/ +├── json/ (canonical) │ ├── .docling.json(.zst) │ ├── .formula_index.jsonl │ ├── .latex_map.jsonl │ ├── metrics/ -│ ├── .metrics.json -│ └── .per_page.metrics.json -│ └── problematic_math/ -├── sections/ +│ │ ├── .metrics.json +│ │ └── .per_page.metrics.json +│ └── problematic_math/ (debug) +├── sections/ (canonical) │ └── sections_for_annotation.parquet -├── classified_sections.parquet -└── fully_annotated_sections.parquet -``` +├── classified_sections.parquet (canonical) +├── fully_annotated_sections.parquet (canonical) +├── .processing_state.pkl (runtime) +├── problematic_files/ (debug) +└── timeout_files/ (debug) Notes: - Enriched Markdown replaces the plain Markdown (single canonical location). - Metrics lived under `markdown/` in earlier versions; they now live under `json/metrics/`. 
- When math enrichment cannot recover after the configured number of respawns, the corresponding PDFs and Docling artifacts are copied into the `problematic_math/` folders above and the stems are added to the fatal skip-list for later review. +- The same folder can act as both `input_dir` and `output_dir`; the pipeline creates its own subdirectories under that root. + +## Readability Shortcut + +If you only need the shortest path through the system: + +1. `Corpus.download()` if you start from URLs. +2. `Corpus.extract()` for Phase‑1 markdown. +3. `Corpus.clean()` to decide what needs OCR. +4. `Corpus.ocr()` for selective OCR and optional math/code enrichment. +5. `Corpus.section()` and `Corpus.annotate()` for structured outputs. + +If you need to jump from these ideas to the source files, see `code_map.md`. ## Exporting corpora diff --git a/docs/quickstart.md b/docs/quickstart.md index 4b10685..a498725 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -38,14 +38,13 @@ Workers report per-batch summaries and extraction progress is persisted into `download_results/download_results.parquet`, so you can restart multi-GPU runs without losing progress (no extra checkpoint files required). -## GPU OCR (opt-in) +## OCR remediation (opt-in) ```python from glossapi import Corpus c = Corpus('IN', 'OUT') -c.extract(input_format='pdf', accel_type='CUDA', force_ocr=True) -# or reuse multi-GPU batching -c.extract(input_format='pdf', use_gpus='multi', force_ocr=True) +c.clean() +c.ocr(backend='deepseek', fix_bad=True, math_enhance=False) ``` ## Phase‑2 Math Enrichment (from JSON) @@ -76,7 +75,7 @@ c.section() # to parquet c.annotate() # classify/annotate sections ``` -See ocr_and_math_enhancement.md for GPU details, batch sizes, and artifact locations. +See `ocr_and_math_enhancement.md` for OCR runtime details, batch sizes, and artifact locations. 
### DeepSeek OCR @@ -89,12 +88,11 @@ c.ocr(backend='deepseek', fix_bad=True, math_enhance=True, mode='ocr_bad_then_ma # → OCR only for bad files; math is included inline in the Markdown ``` -To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the CLI bits are reachable: +To avoid stub output, set `GLOSSAPI_DEEPSEEK_ALLOW_CLI=1` and `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0`, and ensure the runtime is reachable: ```bash -export GLOSSAPI_DEEPSEEK_VLLM_SCRIPT=/path/to/deepseek-ocr/run_pdf_ocr_vllm.py -export GLOSSAPI_DEEPSEEK_TEST_PYTHON=/path/to/deepseek-venv/bin/python -export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr/DeepSeek-OCR -export GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH=/path/to/libjpeg-turbo/lib +export GLOSSAPI_DEEPSEEK_PYTHON=/path/to/deepseek-venv/bin/python +export GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT=/path/to/glossAPI/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py +export GLOSSAPI_DEEPSEEK_MODEL_DIR=/path/to/deepseek-ocr-2-model/DeepSeek-OCR-2 python -m glossapi.ocr.deepseek.preflight # optional: validates env without running OCR ``` diff --git a/docs/stages/clean.md b/docs/stages/clean.md index ae3c735..0528f05 100644 --- a/docs/stages/clean.md +++ b/docs/stages/clean.md @@ -6,7 +6,7 @@ The clean stage normalizes extracted Markdown and evaluates its quality. ## Main responsibilities -- run Rust-backed cleaning +- run the shared OCR analyzer in either clean or debug rendering mode - compute text quality and badness metrics - detect documents that require OCR reruns - update metadata for downstream stage selection @@ -20,6 +20,12 @@ The clean stage normalizes extracted Markdown and evaluates its quality. 
## Main outputs - cleaned Markdown in `clean_markdown/` +- debug-marked Markdown under `debug/` when debug output is requested +- debug manifests under `debug/`: + - `manifest.jsonl` + - `page_metrics.jsonl` + - `match_index.jsonl` + - `summary.json` - quality metrics and reports - metadata updates including OCR-related decisions @@ -32,6 +38,25 @@ It is especially important for Greek corpora because it distinguishes: - technically extracted text - actually usable Greek text +It also separates two different responsibilities that are easy to conflate: + +- structural cleanup + - tables, numeric runs, LaTeX collapse, hybrid numbered loops, word repetition +- quality scoring + - bad-character metrics + - suspicious-line metrics + - OCR rerun recommendations + +The stage now uses one shared analyzer for both: + +- `debug` mode + - shows exact match placement with `` tags + - records merged-span match metadata in `match_index.jsonl` +- `clean` mode + - removes or rewrites those exact same matched regions +- `clean + debug` + - writes pipeline-ready cleaned Markdown and the parallel debug artifacts from the same span plan in one run + ## Important operational outputs This stage may contribute or update: @@ -42,6 +67,32 @@ This stage may contribute or update: - character-count-based diagnostics - processing-stage status +## Current cleaning policy + +The cleaner does not use one generic fuzzy matcher over the whole page. +Instead it applies ownership in a fixed order: + +1. tables +2. numeric +3. LaTeX +4. hybrid numbered repetition +5. 
shared word repetition + +Why this matters: + +- tables can distort the visible text surface for every later pass +- numeric progressions are often valid cleaner targets but should not be + consumed by generic text repetition +- LaTeX and hybrid passes rely on more specific local structure +- shared text repetition is therefore safest on the remaining surface only + +Table handling is intentionally broader than repetition: + +- `sentence_shell_table` is dropped +- `empty_table_collapse` is dropped +- `repeated_rows` is dropped +- unmatched tables are converted from HTML to GitHub-style Markdown + ## Failure concerns Typical issues include: @@ -53,3 +104,7 @@ Typical issues include: ## Contributor note Changes here affect OCR routing and post-run quality analysis. Treat score and flag semantics as contract-level behavior. + +For content-cleaning changes, the exact-output benchmark in +`tests/test_ocr_golden_pages.py` is the main regression lock. Speed work is only +acceptable if those outputs remain stable. 
diff --git a/docs/stages/download.md b/docs/stages/download.md index 99bc4f8..c70c551 100644 --- a/docs/stages/download.md +++ b/docs/stages/download.md @@ -8,6 +8,7 @@ The download stage acquires source documents from parquet-based URL metadata and - read URL-bearing parquet input - download files concurrently +- route known browser-gated sources through browser-assisted acquisition when configured - retain source metadata context - avoid refetching previously successful downloads - assign stable-enough local filenames for downstream processing @@ -42,10 +43,34 @@ Typical issues include: - transient network failures - rate limiting +- browser-gated file endpoints that return HTML challenge/interstitial pages +- viewer-only sources that should fail cleanly instead of being recorded as successful downloads - duplicate URLs - filename collisions - partially completed corpus fetches +## Browser-gated sources + +The downloader now distinguishes between: + +- direct file endpoints +- browser-gated file endpoints +- viewer-only/document-reader sources + +For browser-gated file endpoints: + +- `download_mode="auto"` probes with direct HTTP and escalates to a browser session when it detects a recoverable interstitial +- `download_mode="browser"` goes directly to the browser-assisted path +- `download_policy_file=...` can route known domains or URL patterns to the correct path without probing every file + +Browser-assisted mode is designed for retrievable file endpoints, not for sources that only expose page images, tiles, HTML/SVG re-rendering, or DRM-wrapped readers. + +## Session reuse + +Browser-assisted mode reuses cached browser session state per domain so multiple files from the same protected source do not need a fresh browser bootstrap every time. + +This keeps the browser as a session-bootstrap resource rather than the main downloader. 
+ ## Contributor note Any change to filename assignment or result parquet structure can have downstream impact on: diff --git a/docs/stages/ocr.md b/docs/stages/ocr.md index 3bf8815..65454eb 100644 --- a/docs/stages/ocr.md +++ b/docs/stages/ocr.md @@ -22,15 +22,13 @@ The OCR stage repairs documents whose extracted text is considered unreliable, a - corrected Markdown or OCR-enriched outputs - backend-specific JSON or related artifacts - metadata updates such as OCR success markers +- when metadata parquet is available, a canonical OCR parquet should preserve the same row identity and carry corrected `text` together with the updated metadata ## Backend choices -The pipeline supports at least two OCR-oriented modes: - -- RapidOCR through the Docling path -- DeepSeek OCR for environments configured for that backend - -These are operationally different and should not be treated as interchangeable implementation details. +The supported OCR remediation backend is DeepSeek OCR. Docling remains part of +the surrounding extraction and layout flow, but OCR reruns themselves are now +expected to use the DeepSeek runtime. ## Selection model @@ -44,6 +42,25 @@ OCR reruns should preserve: - explicit indication that remediation was attempted - visibility into files that remain problematic +## DeepSeek runtime contract + +For the operator-facing summary of the OCR changes already merged into +`development` during the April 1-3 rollout, see +`../operations/ocr_changes_2026-04-01_to_2026-04-03.md`. + +- `ocr()` may execute page-range shards internally when `use_gpus="multi"` and `scheduler="exact_fill"`, but the stage contract remains one canonical Markdown file and one canonical metrics file per source PDF. +- When shard execution is used, the runner reassembles `markdown/.md` and `json/metrics/.metrics.json` after the CLI workers finish. +- Execution-time shard artifacts are moved under `sidecars/ocr_shards/` so downstream stages do not mistake them for canonical stage outputs. 
+- When OCR starts from canonical corpus rows, the authoritative stage handoff should preserve that metadata continuity instead of reducing the result to detached markdown files. Corrected `text` belongs in the canonical parquet row; markdown and metrics stay as sidecars. +- The vLLM runtime now streams rendered pages through an in-memory queue, overlaps rendering with inference, skips empty pages before inference, and reuses the same in-memory image for repair retries. +- Canonical OCR markdown now annotates page boundaries with HTML comments alongside each page-split marker so downstream inspection can line up page images and markdown more easily. +- In `repair_mode="auto"`, a page that trips the garbage cutoff again during the plain-OCR repair pass is now blanked instead of keeping the original garbage text. +- Multi-GPU vLLM runs now execute through a durable shared batch queue rather than one fragile subprocess per preassigned lane. Workers claim first-pass batches dynamically, heartbeat while a batch is active, and can be respawned without losing finished batch outputs. +- Repair retries are now durable too. Flagged pages are published back into the same runtime database as a second global repair queue, and any GPU worker can drain those repair shards after the first-pass queue is complete. +- Repair queue durability and repair execution packing are intentionally separate. The queue tracks individual retry items for precise resume/failure accounting, while workers can combine multiple repair items into one larger execution batch to keep the repair tail GPU-efficient. +- By default each durable batch gets at most two total attempts, so one retry is allowed after the first failure and then the batch is marked failed for operator follow-up.
+- Operational sidecars for these runs live under `sidecars/ocr_runtime/`, including the durable work queue state, per-worker runtime JSON, GPU telemetry samples, GPU preflight output, and a final runtime summary with steady-state inference timestamps. + ## Contributor note Any change to candidate selection, skiplist semantics, or OCR-success metadata affects both rerun behavior and corpus analysis quality. diff --git a/docs/testing/compatibility_matrix.md b/docs/testing/compatibility_matrix.md new file mode 100644 index 0000000..29a5e15 --- /dev/null +++ b/docs/testing/compatibility_matrix.md @@ -0,0 +1,276 @@ +# Compatibility And Regression Matrix + +This document defines the release-validation matrix for the DeepSeek-only migration and subsequent Docling upgrades. + +It is not a generic unit-test list. It is a contract-based validation plan tied to the documented pipeline behavior. + +## Scope + +This matrix applies to changes in: + +- DeepSeek-only OCR migration +- no-stub enforcement +- installation simplification +- Docling dependency upgrades +- page-level reevaluation experiments + +## Validation policy + +Release validation for this migration must use: + +- real PDFs +- real Docling +- real DeepSeek +- real GPUs where the code path requires them +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` + +Developer-only tests may still use mocks or lightweight stubs for fast iteration, but those do not satisfy release gates for this migration. 
+ +## Test levels + +### L0: Install and import sanity + +Purpose: + +- prove the supported environments install cleanly and that removed components are truly gone + +Typical inputs: + +- fresh venv +- supported Python version + +### L1: Lightweight smoke corpus + +Purpose: + +- prove the baseline end-to-end flow still works on the small repo corpus + +Typical inputs: + +- `samples/lightweight_pdf_corpus/` + +### L2: Real-PDF contract validation + +Purpose: + +- prove the documented artifacts and metadata contracts still hold on real documents + +Typical inputs: + +- real PDFs from a representative sample + +### L3: Multi-GPU and operational recovery + +Purpose: + +- prove the runtime behavior remains correct under parallel execution and rerun conditions + +Typical inputs: + +- multiple real PDFs +- at least two visible GPUs + +### L4: Comparative corpus evaluation + +Purpose: + +- compare baseline and changed behavior on a real evaluation slice + +Typical inputs: + +- real corpus slice such as the Pergamos sample + +## Mandatory invariants + +The following must remain true unless a change explicitly revises the contract and updates the docs: + +- canonical Markdown is written to `markdown/<stem>.md` +- Docling JSON artifacts are emitted when requested +- cleaner output still drives `needs_ocr` +- OCR remains selective rather than defaulting to all documents +- metadata parquet remains the durable operational record +- reruns skip completed work unless forced +- skiplist semantics remain explicit and stable +- no production path silently falls back to stub OCR + +## Release-gate matrix + +| ID | Level | Contract | Input | Run | Pass criteria | Negative assertions | +| --- | --- | --- | --- | --- | --- | --- | +| `ENV-001` | L0 | Python and packaging | Fresh environment | install supported profile(s) | install completes on supported Python floor | no reference to removed legacy OCR install modes | +| `ENV-002` | L0 | Dependency simplification | Fresh environment | import
`glossapi`, `glossapi.ocr.deepseek`, extract-path modules | imports succeed | no dead imports from removed OCR integrations | +| `EXT-001` | L1 | Safe Phase-1 extraction | lightweight corpus | `Corpus.extract(input_format="pdf")` | canonical Markdown produced | extraction must not depend on OCR extras | +| `EXT-002` | L2 | Docling Phase-1 extraction | real PDFs | `Corpus.extract(..., phase1_backend="docling", export_doc_json=True)` | Markdown, Docling JSON, metrics written to documented locations | artifact layout must not drift | +| `CLN-001` | L1/L2 | Cleaner metadata contract | extracted docs | `clean(drop_bad=False)` | metadata parquet updated with routing-relevant fields | no collapse of `needs_ocr` behavior | +| `OCR-001` | L2 | DeepSeek-only remediation | docs with `needs_ocr=True` | `ocr(backend="deepseek", fix_bad=True)` | recovered docs updated, metadata marks `ocr_success=True` | no stub output, no silent success | +| `OCR-002` | L2 | No-stub enforcement | broken/missing DeepSeek runtime | run OCR with `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` | run fails explicitly | failure must not produce placeholder success artifacts | +| `MTH-001` | L2 | Formula/code enrichment compatibility | math-heavy real PDF | Docling extract plus Phase-2 enrichment | enriched outputs and metadata remain coherent | no schema drift breaking enrichment | +| `SEC-001` | L2 | Sectioning contract | usable real docs | `section()` | `sections/sections_for_annotation.parquet` produced | no empty-output regression caused by upstream changes | +| `ANN-001` | L2 | Annotation contract | section parquet | `annotate()` | classified outputs produced | model integration must not break on changed upstream text/layout | +| `EXP-001` | L2 | Export contract | processed docs | `jsonl()` / `jsonl_sharded()` | JSONL and metadata outputs match documented layout | no dropped metadata fields without explicit design change | +| `RES-001` | L3 | Resumability | interrupted or partial run | rerun with defaults | 
completed items skipped correctly | no duplicate reprocessing by default | +| `RES-002` | L3 | Force/reprocess semantics | prior successful run | rerun with force/reprocess flag | selected items are reprocessed | no stale completion flags blocking intended rerun | +| `SKP-001` | L3 | Skiplist semantics | run with known problematic items | extract/OCR rerun | skiplist excludes intended stems only | no hidden filtering of healthy items | +| `GPU-001` | L3 | Multi-GPU OCR | real PDF slice on 2 GPUs | DeepSeek OCR in parallel | work is distributed and completes per GPU | no worker success masking failures | +| `CMP-001` | L4 | Baseline quality comparison | Pergamos sample slice | compare pre/post change outputs | no material regression in artifact completeness and downstream usability | runtime improvement alone does not justify quality loss | +| `CMP-002` | L4 | Whole-text vs page-level experiment | long PDFs | compare baseline branch vs page-level branch | quality/runtime tradeoff explicitly measured | experimental branch does not replace baseline without evidence | + +## Detailed test groups + +### Install and runtime compatibility + +What to prove: + +- supported environment installs cleanly +- unsupported/removed OCR components are not required +- Python floor matches actual upstream dependencies + +Critical checks: + +- packaging metadata uses a supported Python minimum +- setup docs expose only supported install paths +- removal of the old OCR integration does not leave dead GlossAPI imports or entrypoints + +## Extraction contract + +What to prove: + +- Phase-1 still produces canonical Markdown +- Docling extraction still produces JSON artifacts when requested +- metrics continue to be written where downstream stages expect them + +Artifacts to check: + +- `markdown/.md` +- `json/.docling.json(.zst)` +- `json/.formula_index.jsonl` when requested +- `json/metrics/.metrics.json` +- `json/metrics/.per_page.metrics.json` + +## Cleaning and Greek-quality routing + 
+What to prove: + +- cleaner still computes routing decisions required for selective OCR +- Greek-text validation remains first-class rather than incidental cleanup + +Fields to check in metadata parquet: + +- `needs_ocr` +- `filter` +- Greek-quality and badness-related fields currently emitted by the cleaner + +## DeepSeek OCR contract + +What to prove: + +- DeepSeek is the only OCR remediation backend +- no-stub enforcement is real +- recovered documents update metadata correctly + +Required environment behavior: + +- `GLOSSAPI_DEEPSEEK_ALLOW_STUB=0` +- real model weights present +- real CLI/runtime path present + +Negative checks: + +- no markdown contains placeholder stub markers +- no OCR pass succeeds after a DeepSeek CLI failure unless real output exists +- no removed OCR backend is referenced during final validation + +## Formula and code enrichment + +What to prove: + +- if retained, enrichment still works with the upgraded Docling stack +- if later removed, the removal is justified by evaluation rather than convenience + +Checks: + +- enriched Markdown is generated where expected +- `json/<stem>.latex_map.jsonl` remains coherent when enrichment is enabled +- metadata updates for math enrichment still work + +## Section, annotate, and export contracts + +What to prove: + +- downstream stages still consume the extraction outputs +- output layout and metadata structure remain compatible with the documented pipeline + +Artifacts to check: + +- `sections/sections_for_annotation.parquet` +- `classified_sections.parquet` +- `fully_annotated_sections.parquet` +- exported JSONL shards and related metadata + +## Resumability and operational recovery + +What to prove: + +- reruns still honor completion state +- skiplist semantics remain intact +- multi-worker failures remain visible and recoverable + +Checks: + +- default rerun skips completed items +- explicit force/reprocess reruns the intended items +- problematic stems are persisted and not silently lost + +##
Comparative evaluation set + +Suggested real-world slice: + +- lightweight corpus for smoke validation +- representative real PDFs spanning: + - short documents + - medium documents + - long documents + - structure-rich documents + - math-heavy documents where applicable + +For current local evaluation work, a Pergamos sample manifest has been prepared outside the repo and can be used as the L3/L4 real-PDF slice. + +## Suggested release sequence + +For the planned migration, run gates in this order: + +1. `ENV-*` +2. `EXT-*` +3. `CLN-*` +4. `OCR-*` +5. `MTH-*` +6. `SEC-*`, `ANN-*`, `EXP-*` +7. `RES-*`, `SKP-*`, `GPU-*` +8. `CMP-*` + +This keeps low-level compatibility failures from being confused with downstream quality regressions. + +## Exit criteria per stage + +### Stage 1 exit criteria + +- DeepSeek-only OCR path works on real PDFs +- no-stub enforcement verified +- no supported GlossAPI OCR backend remains besides DeepSeek + +### Stage 2 exit criteria + +- install paths reduced to supported environments +- packaging/docs no longer reference removed OCR components + +### Stage 3 exit criteria + +- upgraded Docling passes `EXT-*`, `MTH-*`, `SEC-*`, `ANN-*`, and `EXP-*` + +### Stage 4 exit criteria + +- retained or removed Docling capabilities are justified by evaluation evidence + +### Stage 5 exit criteria + +- page-level branch is compared against the stabilized baseline before any adoption decision diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6691407..24cc470 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -2,19 +2,15 @@ ## OCR runs on CPU -- Verify ONNXRuntime GPU: `python -c "import onnxruntime as ort; print(ort.get_available_providers())"` — must include `CUDAExecutionProvider`. -- Ensure CPU ORT wheel is not installed: `pip uninstall -y onnxruntime`. -- Make sure you pass `accel_type='CUDA'` (or `use_gpus='multi'`). 
+- Verify Torch CUDA: `python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"`. +- Make sure the DeepSeek runtime is the one configured in `GLOSSAPI_DEEPSEEK_PYTHON`. +- Run `python -m glossapi.ocr.deepseek.preflight` in the DeepSeek env before large OCR jobs. ## Torch doesn’t see the GPU - Check `nvidia-smi` and driver installation. - Match Torch CUDA build to your driver; see getting_started.md for the recommended wheel. -## RapidOCR font download failure - -- The first OCR call might download a visualization font. Ensure egress is allowed; the file is cached afterwards. - ## Out of memory - Lower Phase‑2 `batch_size` (e.g., 8) and reduce inline `GLOSSAPI_FORMULA_BATCH`. diff --git a/install_glossapi.py b/install_glossapi.py new file mode 100644 index 0000000..ef7a7c9 --- /dev/null +++ b/install_glossapi.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +def _bootstrap_repo_src() -> None: + repo_root = Path(__file__).resolve().parent + src_dir = repo_root / "src" + src_str = str(src_dir) + if src_str not in sys.path: + sys.path.insert(0, src_str) + + +def main() -> int: + _bootstrap_repo_src() + from glossapi.scripts.install_glossapi import main as _main + + return int(_main()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/mkdocs.yml b/mkdocs.yml index ba13512..c61882c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: GlossAPI -site_description: Academic document processing pipeline (Docling + RapidOCR + Rust) +site_description: Academic document processing pipeline (Docling + DeepSeek + Rust) repo_url: https://github.com/eellak/glossAPI theme: name: material @@ -38,16 +38,16 @@ nav: - Configuration & Ops: - Configuration: configuration.md - AWS Job Distribution: aws_job_distribution.md + - DeepSeek GCP A100 Setup: operations/deepseek_gcp_a100_setup.md + - OCR Changes 2026-04-01 to 2026-04-03: 
operations/ocr_changes_2026-04-01_to_2026-04-03.md + - OpenArchives OCR Rollout Plan: operations/openarchives_ocr_rollout_plan.md - Troubleshooting: troubleshooting.md + - Compatibility And Regression Matrix: testing/compatibility_matrix.md - Reference: + - Code Map: code_map.md - Corpus API: api/corpus.md + - Legacy Corpus API Notes: api_corpus_tmp.md - Math Enrichment Runtime: math_enrichment_runtime.md - - Divio Skeleton: - - Overview: divio/overview.md - - Tutorials: divio/tutorials.md - - How-to Guides: divio/how_to_guides.md - - Reference: divio/reference.md - - Explanation: divio/explanation.md docs_dir: docs markdown_extensions: - admonition diff --git a/pyproject.toml b/pyproject.toml index 3d0d5fa..9dd211a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta" [project] name = "glossapi" -version = "0.1.3" +version = "0.1.4" description = "Academic document processing pipeline with Rust-powered markdown cleaning" authors = [ {name = "GlossAPI Team", email = "glossapi.team@eellak.gr"} ] readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ # Core pipeline deps "pandas>=1.3.0", - "numpy<2", # ORT+RapidOCR best compatibility + "numpy>=1.26,<3", "scikit-learn==1.6.1", "joblib>=1.0.0", "dask>=2022.1.0", @@ -37,28 +37,31 @@ classifiers = [ ] [project.optional-dependencies] -# Docling + RapidOCR ONNX stack (kept optional to preserve import-light installs) -rapidocr = [ - "docling==2.48.0", - # Use RapidOCR core package; avoid rapidocr_onnxruntime to prevent pip - # from auto-installing the CPU-only 'onnxruntime' wheel. 
- "rapidocr>=3.3.0", - "onnxruntime-gpu==1.18.1", +# Browser automation fallback for browser-gated file endpoints +browser = [ + "playwright>=1.52,<2", +] +# Docling extraction/layout stack +docling = [ + "docling==2.81.0", ] # Optional CUDA layout acceleration (Docling) cuda = [ "torch==2.5.1", "torchvision==0.20.1", ] -# DeepSeek OCR backend extras (CUDA 12.1 build of vLLM). Torch is not pinned here -# because users should install the CUDA wheel from the PyTorch index -# (see docs: installing torch==2.5.1+cu121 via extra index URL). +# DeepSeek OCR backend extras (Torch should be installed from the PyTorch index). deepseek = [ - "vllm>=0.11.0", - "transformers>=4.45,<5", + "vllm==0.18.0", + "transformers==4.57.6", + "tokenizers==0.22.2", "accelerate>=1.2.1,<2", "pymupdf==1.24.10", - "Pillow==10.4.0", + "Pillow==12.1.1", + "img2pdf>=0.5.1", + "einops", + "easydict", + "addict", ] docs = [ "mkdocs>=1.5", @@ -78,6 +81,5 @@ glossapi = ["models/**/*"] [tool.pytest.ini_options] markers = [ - "rapidocr: requires the RapidOCR/Docling execution stack", "deepseek: exercises the DeepSeek OCR pipeline", ] diff --git a/requirements.txt b/requirements.txt index 95f4678..32b555c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -### GlossAPI runtime requirements (aligned with repro_rapidocr_onnx) +### GlossAPI runtime requirements # Core pipeline deps pandas>=1.3.0 -numpy<2 +numpy>=1.26,<3 python-dateutil>=2.8.2 pytz>=2021.1 scikit-learn==1.6.1 @@ -15,17 +15,12 @@ ftfy>=6.0.0 tenacity>=8.0.0 tqdm>=4.67.0 -# Docling + RapidOCR ONNX stack -docling==2.48.0 -# Prefer RapidOCR core package; it works with the GPU ORT wheel without pulling -# the CPU-only 'onnxruntime' dependency. 
-rapidocr>=3.3.0 -onnxruntime-gpu==1.18.1 +# Docling extraction/layout stack +docling==2.81.0 pyyaml>=6.0 # Enrichment & JSON compression (required for Phase-2 math/code and JSON zstd) pypdfium2>=4.0.0 zstandard>=0.22.0 -# Optional: install Torch CUDA for GPU layout (not required for OCR) -# pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.5.1 torchvision==0.20.1 +# Optional: install Torch CUDA for GPU-backed Docling layout / enrichment diff --git a/rust/glossapi_rs_cleaner/Cargo.lock b/rust/glossapi_rs_cleaner/Cargo.lock index a3aabd3..07298d7 100644 --- a/rust/glossapi_rs_cleaner/Cargo.lock +++ b/rust/glossapi_rs_cleaner/Cargo.lock @@ -614,6 +614,7 @@ dependencies = [ "chrono", "csv", "futures", + "glossapi_rs_common", "htmlentity", "lazy_static", "memchr", @@ -626,6 +627,10 @@ dependencies = [ "walkdir", ] +[[package]] +name = "glossapi_rs_common" +version = "0.1.0" + [[package]] name = "half" version = "2.6.0" diff --git a/rust/glossapi_rs_cleaner/Cargo.toml b/rust/glossapi_rs_cleaner/Cargo.toml index 7213bc7..e3974ce 100644 --- a/rust/glossapi_rs_cleaner/Cargo.toml +++ b/rust/glossapi_rs_cleaner/Cargo.toml @@ -26,6 +26,7 @@ memchr = "2" aho-corasick = "1" htmlentity = "~1.3.0" chrono = { version = "=0.4.33", features = ["serde"] } +glossapi_rs_common = { path = "../glossapi_rs_common" } [tool.maturin] bindings = "pyo3-abi3-py38" diff --git a/rust/glossapi_rs_cleaner/src/cleaning_module.rs b/rust/glossapi_rs_cleaner/src/cleaning_module.rs index 9b52551..823ab1c 100644 --- a/rust/glossapi_rs_cleaner/src/cleaning_module.rs +++ b/rust/glossapi_rs_cleaner/src/cleaning_module.rs @@ -1,4 +1,5 @@ use aho_corasick::AhoCorasick; +use glossapi_rs_common::scan_script_metrics; use htmlentity::entity::{decode, ICodedDataTrait}; use lazy_static::lazy_static; use memchr::memchr; // For Step 5.1 @@ -548,31 +549,21 @@ pub fn perform_text_analysis( // This block already calculates cleaned_non_whitespace_chars_val correctly after cleaning if 
calculate_specific_counts { - let mut current_greek_count = 0; - let mut current_latin_count = 0; - let mut current_cleaned_non_ws_count = 0; - - let greek_set = SCRIPT_SETS.get("greek").cloned().unwrap_or_default(); - let latin_set = SCRIPT_SETS.get("latin").cloned().unwrap_or_default(); - - for ch in cleaned_text.chars() { - if !ch.is_whitespace() { - current_cleaned_non_ws_count += 1; - } - if scripts_for_percentage_and_specific_counts.contains(&"greek".to_string()) - && greek_set.contains(&ch) - { - current_greek_count += 1; - } - if scripts_for_percentage_and_specific_counts.contains(&"latin".to_string()) - && latin_set.contains(&ch) - { - current_latin_count += 1; - } + let metrics = scan_script_metrics(&cleaned_text); + let include_greek = scripts_for_percentage_and_specific_counts + .iter() + .any(|script| script == "greek"); + let include_latin = scripts_for_percentage_and_specific_counts + .iter() + .any(|script| script == "latin"); + + if include_greek { + greek_char_count_cleaned = Some(metrics.greek_char_count as usize); + } + if include_latin { + latin_char_count_cleaned = Some(metrics.latin_char_count as usize); } - greek_char_count_cleaned = Some(current_greek_count); - latin_char_count_cleaned = Some(current_latin_count); - cleaned_non_whitespace_chars_val = Some(current_cleaned_non_ws_count); + cleaned_non_whitespace_chars_val = Some(metrics.non_whitespace_chars as usize); } else { cleaned_non_whitespace_chars_val = Some(cleaned_text.chars().filter(|c| !c.is_whitespace()).count()); diff --git a/rust/glossapi_rs_common/Cargo.lock b/rust/glossapi_rs_common/Cargo.lock new file mode 100644 index 0000000..4fc9d61 --- /dev/null +++ b/rust/glossapi_rs_common/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4
+
+[[package]]
+name = "glossapi_rs_common"
+version = "0.1.0"
diff --git a/rust/glossapi_rs_common/Cargo.toml b/rust/glossapi_rs_common/Cargo.toml
new file mode 100644
index 0000000..594fc96
--- /dev/null
+++ b/rust/glossapi_rs_common/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "glossapi_rs_common"
+version = "0.1.0"
+edition = "2021"
+authors = ["GlossAPI Team <glossapi.team@eellak.gr>"]
+description = "Shared Rust script-analysis helpers for GlossAPI"
+license = "EUPL-1.2"
+
+[lib]
+name = "glossapi_rs_common"
+path = "src/lib.rs"
diff --git a/rust/glossapi_rs_common/src/lib.rs b/rust/glossapi_rs_common/src/lib.rs
new file mode 100644
index 0000000..a34f2a9
--- /dev/null
+++ b/rust/glossapi_rs_common/src/lib.rs
@@ -0,0 +1,202 @@
+/// Character- and word-level script counts for a piece of text.
+///
+/// The fields are plain counters; the methods derive ratios from them and
+/// return `0.0` when the relevant denominator is zero.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct ScriptMetrics {
+    /// Characters for which `char::is_whitespace` is false.
+    pub non_whitespace_chars: u64,
+    /// Characters accepted by `is_greek`.
+    pub greek_char_count: u64,
+    /// Characters accepted by `is_ascii_latin` (ASCII `A-Z` / `a-z` only).
+    pub latin_char_count: u64,
+    /// Whitespace-delimited tokens containing at least one Greek character.
+    pub greek_word_count: u64,
+    /// Greek tokens that also contain a polytonic code point or combining mark.
+    pub polytonic_word_count: u64,
+}
+
+impl ScriptMetrics {
+    /// Greek characters as a percentage (0-100) of non-whitespace characters.
+    #[inline]
+    pub fn percentage_greek(&self) -> f64 {
+        if self.non_whitespace_chars > 0 {
+            100.0 * self.greek_char_count as f64 / self.non_whitespace_chars as f64
+        } else {
+            0.0
+        }
+    }
+
+    /// ASCII Latin letters as a percentage (0-100) of non-whitespace characters.
+    #[inline]
+    pub fn latin_percentage(&self) -> f64 {
+        if self.non_whitespace_chars > 0 {
+            100.0 * self.latin_char_count as f64 / self.non_whitespace_chars as f64
+        } else {
+            0.0
+        }
+    }
+
+    /// Polytonic Greek words as a fraction (0-1) of all Greek words.
+    #[inline]
+    pub fn polytonic_ratio(&self) -> f64 {
+        if self.greek_word_count > 0 {
+            self.polytonic_word_count as f64 / self.greek_word_count as f64
+        } else {
+            0.0
+        }
+    }
+}
+
+/// Incremental scanner that accumulates `ScriptMetrics` one character at a time.
+///
+/// A token is a maximal run of non-whitespace characters. Token state persists
+/// between calls, so text may be fed in pieces; `finish` flushes the last token
+/// and returns the totals.
+#[derive(Debug, Clone, Default)]
+pub struct ScriptScanner {
+    metrics: ScriptMetrics,
+    // Per-token flags, reset by `finish_token`.
+    token_has_greek: bool,
+    token_has_polytonic: bool,
+    // True while a token is open; cleared by `finish_token`.
+    in_token: bool,
+}
+
+impl ScriptScanner {
+    /// Create a scanner with every counter at zero.
+    #[inline]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Record one character.
+    ///
+    /// Whitespace closes the current token. Any other character is counted and
+    /// then classified as Greek, ASCII Latin, a combining mark, or none of these.
+    #[inline]
+    pub fn observe_char(&mut self, ch: char) {
+        if ch.is_whitespace() {
+            self.finish_token();
+            return;
+        }
+
+        self.in_token = true;
+        self.metrics.non_whitespace_chars += 1;
+
+        let cp = ch as u32;
+        if is_greek(cp) {
+            self.metrics.greek_char_count += 1;
+            self.token_has_greek = true;
+            if is_polytonic_codepoint(cp) {
+                self.token_has_polytonic = true;
+            }
+        } else if is_ascii_latin(cp) {
+            self.metrics.latin_char_count += 1;
+        } else if is_combining_mark(cp) {
+            // A combining mark flags the token as polytonic; `finish_token` only
+            // counts it when the token also contains a Greek character.
+            // NOTE(review): U+0301 (the decomposed tonos) is in this range, so
+            // NFD monotonic text would be flagged too; confirm inputs are NFC.
+            self.token_has_polytonic = true;
+        }
+    }
+
+    /// Feed every character of `text` to `observe_char`, in order.
+    ///
+    /// The trailing token is left open, so it can continue in the next call.
+    #[inline]
+    pub fn observe_str(&mut self, text: &str) {
+        for ch in text.chars() {
+            self.observe_char(ch);
+        }
+    }
+
+    /// Close the current token, if any, and update the word counters.
+    ///
+    /// Does nothing when no non-whitespace character has been seen since the
+    /// previous flush.
+    #[inline]
+    pub fn finish_token(&mut self) {
+        if !self.in_token {
+            return;
+        }
+        if self.token_has_greek {
+            self.metrics.greek_word_count += 1;
+            if self.token_has_polytonic {
+                self.metrics.polytonic_word_count += 1;
+            }
+        }
+        self.in_token = false;
+        self.token_has_greek = false;
+        self.token_has_polytonic = false;
+    }
+
+    /// Flush the pending token and return the accumulated metrics.
+    #[inline]
+    pub fn finish(mut self) -> ScriptMetrics {
+        self.finish_token();
+        self.metrics
+    }
+}
+
+/// True for code points in U+0370..=U+03FF or U+1F00..=U+1FFF
+/// (the Greek and Coptic block and the Greek Extended block).
+#[inline(always)]
+pub fn is_greek(cp: u32) -> bool {
+    (0x0370..=0x03FF).contains(&cp) || (0x1F00..=0x1FFF).contains(&cp)
+}
+
+/// True for code points in U+0300..=U+036F, U+1DC0..=U+1DFF or U+20D0..=U+20FF
+/// (combining diacritical marks, their supplement, and marks for symbols).
+#[inline(always)]
+pub fn is_combining_mark(cp: u32) -> bool {
+    (0x0300..=0x036F).contains(&cp)
+        || (0x1DC0..=0x1DFF).contains(&cp)
+        || (0x20D0..=0x20FF).contains(&cp)
+}
+
+/// True for ASCII letters `A-Z` and `a-z`; accented Latin letters do not count.
+#[inline(always)]
+pub fn is_ascii_latin(cp: u32) -> bool {
+    (0x41..=0x5A).contains(&cp) || (0x61..=0x7A).contains(&cp)
+}
+
+/// True for code points in U+1F00..=U+1FFF, which the scanner treats as polytonic.
+#[inline(always)]
+pub fn is_polytonic_codepoint(cp: u32) -> bool {
+    (0x1F00..=0x1FFF).contains(&cp)
+}
+
+/// Scan `text` in one pass and return its `ScriptMetrics`.
+#[inline]
+pub fn scan_script_metrics(text: &str) -> ScriptMetrics {
+    let mut scanner = ScriptScanner::new();
+    scanner.observe_str(text);
+    scanner.finish()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{scan_script_metrics, ScriptScanner};
+
+    #[test]
+    fn scanner_counts_greek_latin_and_polytonic_words() {
+        let metrics = scan_script_metrics("Αυτή abc Καὶ");
+        assert!(metrics.greek_char_count > 0);
+        assert_eq!(metrics.latin_char_count, 3);
+        assert_eq!(metrics.greek_word_count, 2);
+        assert_eq!(metrics.polytonic_word_count, 1);
+
assert!(metrics.percentage_greek() > metrics.latin_percentage()); + } + + #[test] + fn scanner_flushes_on_line_boundaries() { + let mut scanner = ScriptScanner::new(); + scanner.observe_str("Καὶ\n"); + scanner.observe_str("αὕτη"); + let metrics = scanner.finish(); + assert_eq!(metrics.greek_word_count, 2); + assert_eq!(metrics.polytonic_word_count, 2); + } +} diff --git a/rust/glossapi_rs_noise/Cargo.lock b/rust/glossapi_rs_noise/Cargo.lock index 3c09979..f68e0a8 100644 --- a/rust/glossapi_rs_noise/Cargo.lock +++ b/rust/glossapi_rs_noise/Cargo.lock @@ -87,18 +87,36 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glossapi_rs_common" +version = "0.1.0" + [[package]] name = "glossapi_rs_noise" version = "0.1.0" dependencies = [ "anyhow", "csv", + "glossapi_rs_common", "lazy_static", "memmap2", "once_cell", "pyo3", + "rand", "rayon", "regex", + "unicode-normalization", "walkdir", ] @@ -189,6 +207,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -267,6 +294,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.10.0" @@ -400,12 +457,36 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unindent" version = "0.1.11" @@ -422,6 +503,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "winapi-util" version = "0.1.9" @@ -503,3 +590,23 @@ name = "windows_x86_64_msvc" version = "0.52.6" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] diff --git a/rust/glossapi_rs_noise/Cargo.toml b/rust/glossapi_rs_noise/Cargo.toml index 8dfa7bc..463e884 100644 --- a/rust/glossapi_rs_noise/Cargo.toml +++ b/rust/glossapi_rs_noise/Cargo.toml @@ -20,3 +20,6 @@ csv = "1.3.0" pyo3 = { version = "0.19.0", features = ["extension-module", "abi3-py38", "macros"] } anyhow = "1" regex = "1.10" +glossapi_rs_common = { path = "../glossapi_rs_common" } +rand = { version = "0.8", features = ["std_rng"] } +unicode-normalization = "0.1" diff --git a/rust/glossapi_rs_noise/src/lib.rs b/rust/glossapi_rs_noise/src/lib.rs index 33ae607..e3dc334 100644 --- a/rust/glossapi_rs_noise/src/lib.rs +++ b/rust/glossapi_rs_noise/src/lib.rs @@ -2,47 +2,95 @@ mod noise_metrics; -use pyo3::prelude::*; -use pyo3::types::PyTuple; use noise_metrics::{ - score_markdown_file_internal, - score_markdown_directory_internal, - score_markdown_file_detailed_internal, + annotate_numeric_debug_page_internal, evaluate_page_character_noise_internal, + export_numeric_match_debug_pages_internal, export_ocr_match_debug_pages_internal, + find_hybrid_repeat_spans_internal, find_labeled_shared_repeat_spans_internal, + find_numeric_debug_page_spans_internal, + find_word_repeat_spans_internal, score_markdown_directory_detailed_internal, + score_markdown_directory_internal, score_markdown_directory_ocr_profile_internal, + 
score_markdown_file_detailed_internal, score_markdown_file_internal, }; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use pyo3::types::PyTuple; /// Compute the badness score for a single markdown file. /// Returns the numeric score as `float`. #[pyfunction] fn score_markdown_file(path: &str) -> PyResult { - score_markdown_file_internal(std::path::Path::new(path)).map_err(|e| PyErr::new::(e.to_string())) + score_markdown_file_internal(std::path::Path::new(path)) + .map_err(|e| PyErr::new::(e.to_string())) } /// Compute badness scores for all `.md` files under `input_dir` in parallel. /// The result is a list of `(file_path, score, latin_percentage)` tuples. #[pyfunction] -fn score_markdown_directory(input_dir: &str, n_threads: Option) -> PyResult> { - score_markdown_directory_internal(std::path::Path::new(input_dir), n_threads).map_err(|e| PyErr::new::(e.to_string())) +fn score_markdown_directory( + input_dir: &str, + n_threads: Option, +) -> PyResult> { + score_markdown_directory_internal(std::path::Path::new(input_dir), n_threads) + .map_err(|e| PyErr::new::(e.to_string())) } /// Detailed score for a single file: returns a Python tuple of all raw and derived metrics #[pyfunction] fn score_markdown_file_detailed(py: Python<'_>, path: &str) -> PyResult> { let ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, ) = score_markdown_file_detailed_internal(std::path::Path::new(path)) .map_err(|e| 
PyErr::new::(e.to_string()))?; let tup = PyTuple::new( py, vec![ - score.into_py(py), latin_pct.into_py(py), table_ratio.into_py(py), poly_ratio.into_py(py), - (len_greek as u128).into_py(py), (total_words as u128).into_py(py), - (v_pen as u128).into_py(py), (c_pen as u128).into_py(py), (bad_dbl as u128).into_py(py), (misplaced_sigma as u128).into_py(py), (invalid_bigram as u128).into_py(py), (long_word_count as u128).into_py(py), (longest_word as u128).into_py(py), (short_word_count as u128).into_py(py), (max_run as u128).into_py(py), - v_rate.into_py(py), c_rate.into_py(py), d_rate.into_py(py), sigma_end_rate.into_py(py), bigram_rate.into_py(py), long_word_rate.into_py(py), short_ratio.into_py(py), short_pen.into_py(py), + score.into_py(py), + latin_pct.into_py(py), + table_ratio.into_py(py), + poly_ratio.into_py(py), + (len_greek as u128).into_py(py), + (total_words as u128).into_py(py), + (v_pen as u128).into_py(py), + (c_pen as u128).into_py(py), + (bad_dbl as u128).into_py(py), + (misplaced_sigma as u128).into_py(py), + (invalid_bigram as u128).into_py(py), + (long_word_count as u128).into_py(py), + (longest_word as u128).into_py(py), + (short_word_count as u128).into_py(py), + (max_run as u128).into_py(py), + v_rate.into_py(py), + c_rate.into_py(py), + d_rate.into_py(py), + sigma_end_rate.into_py(py), + bigram_rate.into_py(py), + long_word_rate.into_py(py), + short_ratio.into_py(py), + short_pen.into_py(py), flags.into_py(py), ], ); @@ -51,25 +99,70 @@ fn score_markdown_file_detailed(py: Python<'_>, path: &str) -> PyResult, input_dir: &str, n_threads: Option) -> PyResult>> { - let rows = score_markdown_directory_detailed_internal(std::path::Path::new(input_dir), n_threads) - .map_err(|e| PyErr::new::(e.to_string()))?; +fn score_markdown_directory_detailed( + py: Python<'_>, + input_dir: &str, + n_threads: Option, +) -> PyResult>> { + let rows = + score_markdown_directory_detailed_internal(std::path::Path::new(input_dir), n_threads) + .map_err(|e| 
PyErr::new::(e.to_string()))?; let mut out: Vec> = Vec::with_capacity(rows.len()); for ( - path, score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) in rows.into_iter() { + path, + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) in rows.into_iter() + { let tup = PyTuple::new( py, vec![ path.into_py(py), - score.into_py(py), latin_pct.into_py(py), table_ratio.into_py(py), poly_ratio.into_py(py), - (len_greek as u128).into_py(py), (total_words as u128).into_py(py), - (v_pen as u128).into_py(py), (c_pen as u128).into_py(py), (bad_dbl as u128).into_py(py), (misplaced_sigma as u128).into_py(py), (invalid_bigram as u128).into_py(py), (long_word_count as u128).into_py(py), (longest_word as u128).into_py(py), (short_word_count as u128).into_py(py), (max_run as u128).into_py(py), - v_rate.into_py(py), c_rate.into_py(py), d_rate.into_py(py), sigma_end_rate.into_py(py), bigram_rate.into_py(py), long_word_rate.into_py(py), short_ratio.into_py(py), short_pen.into_py(py), + score.into_py(py), + latin_pct.into_py(py), + table_ratio.into_py(py), + poly_ratio.into_py(py), + (len_greek as u128).into_py(py), + (total_words as u128).into_py(py), + (v_pen as u128).into_py(py), + (c_pen as u128).into_py(py), + (bad_dbl as u128).into_py(py), + (misplaced_sigma as u128).into_py(py), + (invalid_bigram as u128).into_py(py), + (long_word_count as u128).into_py(py), + (longest_word as u128).into_py(py), + (short_word_count as u128).into_py(py), + (max_run as u128).into_py(py), + 
v_rate.into_py(py), + c_rate.into_py(py), + d_rate.into_py(py), + sigma_end_rate.into_py(py), + bigram_rate.into_py(py), + long_word_rate.into_py(py), + short_ratio.into_py(py), + short_pen.into_py(py), flags.into_py(py), ], ); @@ -78,11 +171,277 @@ fn score_markdown_directory_detailed(py: Python<'_>, input_dir: &str, n_threads: Ok(out) } +#[pyfunction] +#[pyo3(signature = (input_dir, n_threads=None, min_repeat_run=6))] +fn score_markdown_directory_ocr_profile( + py: Python<'_>, + input_dir: &str, + n_threads: Option, + min_repeat_run: u64, +) -> PyResult>> { + let rows = score_markdown_directory_ocr_profile_internal( + std::path::Path::new(input_dir), + n_threads, + min_repeat_run, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("path", row.path)?; + item.set_item("percentage_greek", row.percentage_greek)?; + item.set_item("latin_percentage", row.latin_percentage)?; + item.set_item("polytonic_ratio", row.polytonic_ratio)?; + item.set_item("non_whitespace_chars", row.non_whitespace_chars)?; + item.set_item("greek_char_count", row.greek_char_count)?; + item.set_item("latin_char_count", row.latin_char_count)?; + item.set_item("ocr_repeat_phrase_run_max", row.ocr_repeat_phrase_run_max)?; + item.set_item("ocr_repeat_line_run_max", row.ocr_repeat_line_run_max)?; + item.set_item( + "ocr_repeat_suspicious_line_count", + row.ocr_repeat_suspicious_line_count, + )?; + item.set_item( + "ocr_repeat_suspicious_line_ratio", + row.ocr_repeat_suspicious_line_ratio, + )?; + item.set_item("ocr_noise_suspect", row.ocr_noise_suspect)?; + item.set_item("ocr_noise_flags", row.ocr_noise_flags)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (input_dir, output_dir, n_threads=None, min_repeat_run=6, max_pages=None, sample_seed=0))] +fn export_ocr_match_debug_pages( + py: Python<'_>, + input_dir: &str, + output_dir: &str, + n_threads: 
Option, + min_repeat_run: u64, + max_pages: Option, + sample_seed: u64, +) -> PyResult>> { + let rows = export_ocr_match_debug_pages_internal( + std::path::Path::new(input_dir), + std::path::Path::new(output_dir), + n_threads, + min_repeat_run, + max_pages, + sample_seed, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("source_path", row.source_path)?; + item.set_item("output_path", row.output_path)?; + item.set_item("source_stem", row.source_stem)?; + item.set_item("base_stem", row.base_stem)?; + item.set_item("page_number", row.page_number)?; + item.set_item("page_index_in_file", row.page_index_in_file)?; + item.set_item("match_types", row.match_types)?; + item.set_item("match_count", row.match_count)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (input_dir, output_dir, n_threads=None, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10, max_pages=None, sample_seed=0))] +fn export_numeric_match_debug_pages( + py: Python<'_>, + input_dir: &str, + output_dir: &str, + n_threads: Option, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, + max_pages: Option, + sample_seed: u64, +) -> PyResult>> { + let rows = export_numeric_match_debug_pages_internal( + std::path::Path::new(input_dir), + std::path::Path::new(output_dir), + n_threads, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + max_pages, + sample_seed, + ) + .map_err(|e| PyErr::new::(e.to_string()))?; + + let mut out: Vec> = Vec::with_capacity(rows.len()); + for row in rows { + let item = PyDict::new(py); + item.set_item("source_path", row.source_path)?; + item.set_item("output_path", row.output_path)?; + item.set_item("source_stem", row.source_stem)?; + item.set_item("base_stem", row.base_stem)?; + item.set_item("page_number", row.page_number)?; + item.set_item("page_index_in_file", 
row.page_index_in_file)?; + item.set_item("match_types", row.match_types)?; + item.set_item("match_count", row.match_count)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (page, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10))] +fn annotate_numeric_debug_page( + py: Python<'_>, + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> PyResult>> { + let Some((annotated_page, match_types, match_count)) = annotate_numeric_debug_page_internal( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) else { + return Ok(None); + }; + + let item = PyDict::new(py); + item.set_item("annotated_page", annotated_page)?; + item.set_item("match_types", match_types)?; + item.set_item("match_count", match_count)?; + Ok(Some(item.into())) +} + +#[pyfunction] +#[pyo3(signature = (page, min_progress_steps=10, min_repeat_steps=8, min_same_digit_steps=10))] +fn find_numeric_debug_page_spans( + py: Python<'_>, + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> PyResult>> { + let spans = py.allow_threads(|| { + find_numeric_debug_page_spans_internal( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("match_type", span.match_type)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (normalized_text, rep_threshold=4, min_period=3, window=96))] +fn find_word_repeat_spans( + py: Python<'_>, + normalized_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> PyResult>> { + let spans = + py.allow_threads(|| find_word_repeat_spans_internal(normalized_text, rep_threshold, min_period, window)); + let mut out: Vec> = Vec::with_capacity(spans.len()); + 
for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("period", span.period)?; + item.set_item("repetitions", span.repetitions)?; + item.set_item("tail_chars", span.tail_chars)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +fn find_hybrid_repeat_spans(py: Python<'_>, analysis_text: &str) -> PyResult>> { + let spans = py.allow_threads(|| find_hybrid_repeat_spans_internal(analysis_text)); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("match_types", vec!["hybrid_repeat"])?; + item.set_item("category", "hybrid")?; + item.set_item("kind", span.kind)?; + item.set_item("item_count", span.item_count)?; + if let Some(cycle_len) = span.cycle_len { + item.set_item("cycle_len", cycle_len)?; + } + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +#[pyo3(signature = (analysis_text, rep_threshold=4, min_period=3, window=96))] +fn find_labeled_shared_repeat_spans( + py: Python<'_>, + analysis_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> PyResult>> { + let spans = py.allow_threads(|| { + find_labeled_shared_repeat_spans_internal(analysis_text, rep_threshold, min_period, window) + }); + let mut out: Vec> = Vec::with_capacity(spans.len()); + for span in spans { + let item = PyDict::new(py); + item.set_item("start", span.start)?; + item.set_item("end", span.end)?; + item.set_item("period", span.period)?; + item.set_item("repetitions", span.repetitions)?; + item.set_item("tail_chars", span.tail_chars)?; + item.set_item("match_type", span.match_type)?; + out.push(item.into()); + } + Ok(out) +} + +#[pyfunction] +fn evaluate_page_character_noise(py: Python<'_>, page: &str) -> PyResult> { + let metrics = py.allow_threads(|| evaluate_page_character_noise_internal(page)); + let item = PyDict::new(py); 
+ item.set_item("total_chars", metrics.total_chars)?; + item.set_item("bad_char_count", metrics.bad_char_count)?; + item.set_item("bad_char_ratio", metrics.bad_char_ratio)?; + item.set_item("control_count", metrics.control_count)?; + item.set_item("private_use_count", metrics.private_use_count)?; + item.set_item("cjk_count", metrics.cjk_count)?; + item.set_item("replacement_count", metrics.replacement_count)?; + Ok(item.into()) +} + #[pymodule] fn glossapi_rs_noise(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(score_markdown_file, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_directory, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_file_detailed, m)?)?; m.add_function(wrap_pyfunction!(score_markdown_directory_detailed, m)?)?; + m.add_function(wrap_pyfunction!(score_markdown_directory_ocr_profile, m)?)?; + m.add_function(wrap_pyfunction!(export_ocr_match_debug_pages, m)?)?; + m.add_function(wrap_pyfunction!(export_numeric_match_debug_pages, m)?)?; + m.add_function(wrap_pyfunction!(annotate_numeric_debug_page, m)?)?; + m.add_function(wrap_pyfunction!(find_numeric_debug_page_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_word_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_hybrid_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(find_labeled_shared_repeat_spans, m)?)?; + m.add_function(wrap_pyfunction!(evaluate_page_character_noise, m)?)?; Ok(()) } diff --git a/rust/glossapi_rs_noise/src/noise_metrics.rs b/rust/glossapi_rs_noise/src/noise_metrics.rs index 105b823..90245f7 100644 --- a/rust/glossapi_rs_noise/src/noise_metrics.rs +++ b/rust/glossapi_rs_noise/src/noise_metrics.rs @@ -69,134 +69,2646 @@ Positions in detailed tuple (suggested append): Note: after adding these fields, bump the Python bindings accordingly and propagate polytonic_ratio (already computed here) into downstream parquet (already wired in Corpus.clean()). 
*/ +use glossapi_rs_common::{is_combining_mark, is_greek, scan_script_metrics, ScriptScanner}; +use rand::rngs::StdRng; +use rand::seq::SliceRandom; +use rand::SeedableRng; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use std::fs::{self, File}; +use std::io::Read; +use std::path::{Path, PathBuf}; +use unicode_normalization::UnicodeNormalization; +use walkdir::WalkDir; +// Avoid heavy regex for table detection; use lightweight checks instead + +#[inline(always)] +fn is_vowel(cp: u32) -> bool { + matches!( + cp, + 0x0391 | 0x03B1 | 0x0386 | 0x03AC | // Αα Άά + 0x0395 | 0x03B5 | 0x0388 | 0x03AD | // Εε Έέ + 0x0397 | 0x03B7 | 0x0389 | 0x03AE | // Ηη Ήή + 0x0399 | 0x03B9 | 0x038A | 0x03AF | 0x03CA | 0x03CB | 0x039F | 0x03BF | + 0x038C | 0x03CC | 0x03C5 | 0x03B0 | 0x03CD | 0x03A5 | 0x038E | + 0x03A9 | 0x03C9 | 0x038F | 0x03CE + ) +} + +const LONG_WORD_LIMIT: u64 = 21; +const SHORT_WORD_LIMIT: u64 = 3; +const PAGE_SPLIT_MARKER: &str = "<--- Page Split --->"; +const NUMERIC_PAGE_COLLAPSE_MIN_TOKENS: u64 = 64; +const NUMERIC_PAGE_COLLAPSE_MIN_ATOMS: u64 = 64; +const NUMERIC_BLOCK_SEED_MIN_ATOMS: usize = 8; +// Baseline for short words per 1000 Greek characters (empirically ~26 on clean texts) +const SHORT_BASELINE_PER_1000: f64 = 26.0; + +#[inline] +fn to_lower_fast(cp: u32) -> u32 { + // Fast path for basic Greek capitals: add 0x20; otherwise return as-is + if (0x0391..=0x03A9).contains(&cp) { + cp + 0x20 + } else { + cp + } +} + +#[inline] +fn is_invalid_bigram_pair(prev_low: u32, curr_low: u32) -> bool { + match (prev_low, curr_low) { + // κ/γ/χ + ξ + (0x03BA, 0x03BE) | (0x03B3, 0x03BE) | (0x03C7, 0x03BE) + // π/β/φ + ψ + | (0x03C0, 0x03C8) | (0x03B2, 0x03C8) | (0x03C6, 0x03C8) + // ρλ, μρ, γβ, δτ, τδ, βπ, πβ + | (0x03C1, 0x03BB) | (0x03BC, 0x03C1) | (0x03B3, 0x03B2) + | (0x03B4, 0x03C4) | (0x03C4, 0x03B4) | (0x03B2, 0x03C0) | (0x03C0, 0x03B2) => true, + _ => false, + } +} + +static ALLOWED_DOUBLE: [u32; 9] = [ + 0x03BB, 0x03BC, 0x03BD, 0x03C1, 0x03C3, 0x03C4, 
0x03BA, 0x03C0, 0x03B3,
+];
+
+fn allowed_double(cp: u32) -> bool {
+    ALLOWED_DOUBLE.contains(&cp)
+}
+
+/// Reports whether an already-trimmed line both starts and ends with `|`.
+///
+/// This is the lightweight stand-in for a table-row regex. Note that a line made of a
+/// single `|` also qualifies, because that one byte is both the first and the last.
+#[inline]
+fn is_table_line_trimmed(trimmed: &str) -> bool {
+    // `starts_with`/`ends_with` are false on the empty string, so no length guard or
+    // byte indexing is needed.
+    trimmed.starts_with('|') && trimmed.ends_with('|')
+}
+
+fn table_line_ratio_and_filtered(text: &str) -> (f64, Option, usize, usize) {
+    let mut non_empty = 0usize;
+    let mut table_like = 0usize;
+    // First pass: count table-like rows without allocating filtered buffer unless needed
+    for line in text.lines() {
+        let trimmed = line.trim();
+        if !trimmed.is_empty() {
+            non_empty += 1;
+            if is_table_line_trimmed(trimmed) {
+                table_like += 1;
+            }
+        }
+    }
+    let ratio = if non_empty > 0 {
+        table_like as f64 / non_empty as f64
+    } else {
+        0.0
+    };
+    if table_like == 0 {
+        return (ratio, None, non_empty, table_like);
+    }
+    // Second pass only if we actually need a filtered buffer (preserve original newlines)
+    let mut filtered = String::with_capacity(text.len());
+    for seg in text.split_inclusive('\n') {
+        let trimmed = seg.trim();
+        if trimmed.is_empty() || !is_table_line_trimmed(trimmed) {
+            filtered.push_str(seg);
+        }
+    }
+    (ratio, Some(filtered), non_empty, table_like)
+}
+
+/// Returns the share of bytes in `buf` that are ASCII letters (`A-Z`, `a-z`).
+///
+/// Despite the `pct` in the name the result is a fraction in `0.0..=1.0`, not a value
+/// out of 100. The denominator is the byte length, so every byte of a multi-byte UTF-8
+/// character counts as one non-Latin unit.
+///
+/// An empty buffer yields `0.0`; dividing 0 by 0 would otherwise produce NaN.
+fn compute_latin_pct(buf: &[u8]) -> f64 {
+    if buf.is_empty() {
+        return 0.0;
+    }
+    let latin_chars = buf.iter().filter(|b| b.is_ascii_alphabetic()).count();
+    latin_chars as f64 / buf.len() as f64
+}
+
+#[derive(Debug, Clone)]
+pub struct OcrProfileRow {
+    pub path: String,
+    pub percentage_greek: f64,
+    pub latin_percentage: f64,
+    pub polytonic_ratio: f64,
+    pub non_whitespace_chars: u64,
+    pub greek_char_count: u64,
+    pub latin_char_count: u64,
+    pub ocr_repeat_phrase_run_max: u64,
+    pub ocr_repeat_line_run_max: u64,
+    pub ocr_repeat_suspicious_line_count: u64,
+    pub ocr_repeat_suspicious_line_ratio: f64,
+    pub ocr_noise_suspect: bool,
+
pub ocr_noise_flags: String, +} + +#[derive(Debug, Clone)] +pub struct OcrDebugPageRow { + pub source_path: String, + pub output_path: String, + pub source_stem: String, + pub base_stem: String, + pub page_number: u64, + pub page_index_in_file: u64, + pub match_types: String, + pub match_count: u64, +} + +#[derive(Debug, Clone)] +struct OcrDebugPageCandidate { + source_path: String, + source_stem: String, + base_stem: String, + page_number: u64, + page_index_in_file: u64, +} + +#[derive(Debug, Clone)] +struct DebugMatchSpan { + start: usize, + end: usize, + match_type: &'static str, +} + +#[derive(Debug, Clone)] +pub struct NumericDebugSpan { + pub start: usize, + pub end: usize, + pub match_type: String, +} + +#[derive(Debug, Clone)] +pub struct WordRepeatSpan { + pub start: usize, + pub end: usize, + pub period: usize, + pub repetitions: usize, + pub tail_chars: usize, +} + +#[derive(Debug, Clone)] +pub struct HybridRepeatSpan { + pub start: usize, + pub end: usize, + pub kind: &'static str, + pub item_count: usize, + pub cycle_len: Option, +} + +#[derive(Debug, Clone)] +pub struct LabeledSharedRepeatSpan { + pub start: usize, + pub end: usize, + pub period: usize, + pub repetitions: usize, + pub tail_chars: usize, + pub match_type: &'static str, +} + +#[derive(Debug, Clone, Default)] +pub struct PageCharacterNoise { + pub total_chars: u64, + pub bad_char_count: u64, + pub bad_char_ratio: f64, + pub control_count: u64, + pub private_use_count: u64, + pub cjk_count: u64, + pub replacement_count: u64, +} + +const MERGE_SAME_CATEGORY_MAX_NONWHITESPACE_GAP: usize = 10; +const HYBRID_REPEAT_MIN_ITEMS: usize = 4; +const HYBRID_REPEAT_MIN_BODY_ALNUM: usize = 6; +const HYBRID_REPEAT_MAX_CYCLE: usize = 6; +const HYBRID_REPEAT_MIN_CYCLE_ITEMS: usize = 8; +const HYBRID_INLINE_CONTEXT_WORDS: usize = 2; +const HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS: usize = 2; +const HYBRID_INLINE_CONTEXT_MIN_CHARS: usize = 8; +const HYBRID_INLINE_REPEAT_MIN_ITEMS: usize = 6; + 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HybridFieldKind { + HeaderCounter, + NumericValue, +} + +#[derive(Debug, Clone)] +struct HybridNumberedItem { + start: usize, + end: usize, + field_kind: HybridFieldKind, + numbers: Vec, + shape: String, + body_key: String, + body_is_full: bool, +} + +#[derive(Debug, Clone)] +struct HybridInlineItem { + start: usize, + end: usize, + clause_index: usize, + inline_context_key: String, + numeric_value: f64, +} + +#[derive(Debug, Clone)] +struct HybridCandidate { + prefix_start_byte: usize, + prefix_end_byte: usize, + field_kind: HybridFieldKind, + numbers: Vec, + shape: String, +} + +#[derive(Debug, Clone)] +struct HybridToken { + kind: HybridTokenKind, + start: usize, + end: usize, + token_key: Option, + numeric_value: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HybridTokenKind { + Numeric, + Alpha, +} + +#[derive(Debug, Clone, Copy)] +struct TokenSpan { + start: usize, + end: usize, +} + +#[derive(Debug, Clone, Copy, Default)] +struct NumericLineSummary { + has_alpha: bool, + rejected_non_numeric: bool, + numeric_token_count: usize, + numeric_atom_count: usize, + is_blank: bool, +} + +#[inline] +fn is_trim_numeric_edge_char(ch: char) -> bool { + ch.is_ascii_punctuation() + || matches!( + ch, + '«' | '»' | '“' | '”' | '„' | '‟' | '‘' | '’' | '‚' | '‛' + ) +} + +#[inline] +fn is_numeric_page_ignored_token(token: &str) -> bool { + !token.is_empty() + && token + .chars() + .all(|ch| !ch.is_whitespace() && !ch.is_alphanumeric()) +} + +fn trim_numeric_token_bounds(token: &str) -> Option<(usize, usize)> { + if token.is_empty() { + return None; + } + + let mut start = 0usize; + let mut end = token.len(); + + while start < end { + let ch = token[start..].chars().next()?; + if ch.is_ascii_digit() { + break; + } + if is_trim_numeric_edge_char(ch) { + start += ch.len_utf8(); + } else { + return None; + } + } + + while start < end { + let ch = token[..end].chars().next_back()?; + if ch.is_ascii_digit() { + 
break; + } + if is_trim_numeric_edge_char(ch) { + end -= ch.len_utf8(); + } else { + return None; + } + } + + if start >= end { + None + } else { + Some((start, end)) + } +} + +#[inline] +fn is_numeric_page_token_body(text: &str) -> bool { + if text.is_empty() { + return false; + } + + if text.chars().all(|ch| ch.is_ascii_digit()) { + return (1..=4).contains(&text.len()); + } + + let mut saw_digit = false; + for ch in text.chars() { + if ch.is_ascii_digit() { + saw_digit = true; + continue; + } + if matches!(ch, '.' | ',' | ':' | ';' | '/' | '-') { + continue; + } + return false; + } + + saw_digit +} + +fn summarize_numeric_line(line: &str) -> NumericLineSummary { + let trimmed = line.trim(); + if trimmed.is_empty() { + return NumericLineSummary { + is_blank: true, + ..NumericLineSummary::default() + }; + } + + let tokens = extract_non_whitespace_tokens_with_spans(line); + let mut summary = NumericLineSummary::default(); + for token in tokens { + let raw = &line[token.start..token.end]; + if raw.chars().any(char::is_alphabetic) { + summary.has_alpha = true; + } + if is_numeric_page_ignored_token(raw) { + continue; + } + let Some((trim_start, trim_end)) = trim_numeric_token_bounds(raw) else { + summary.rejected_non_numeric = true; + continue; + }; + let trimmed = &raw[trim_start..trim_end]; + if !is_numeric_page_token_body(trimmed) { + summary.rejected_non_numeric = true; + continue; + } + summary.numeric_token_count += 1; + summary.numeric_atom_count += extract_digit_group_spans(trimmed).len(); + } + summary +} + +fn parse_simple_number(text: &str) -> Option { + if text.is_empty() { + return None; + } + + let mut normalized = String::with_capacity(text.len()); + let mut saw_digit = false; + let mut saw_separator = false; + + for ch in text.chars() { + if ch.is_ascii_digit() { + normalized.push(ch); + saw_digit = true; + } else if ch == '.' 
|| ch == ',' { + if saw_separator { + return None; + } + saw_separator = true; + normalized.push('.'); + } else { + return None; + } + } + + if !saw_digit || normalized.starts_with('.') || normalized.ends_with('.') { + return None; + } + + normalized.parse::().ok() +} + +fn repeated_digit_token(text: &str) -> Option { + let mut digit: Option = None; + for ch in text.chars() { + if !ch.is_ascii_digit() { + return None; + } + match digit { + Some(existing) if existing != ch => return None, + Some(_) => {} + None => digit = Some(ch), + } + } + digit +} + +#[inline] +fn is_private_use_codepoint(cp: u32) -> bool { + matches!( + cp, + 0xE000..=0xF8FF | 0xF0000..=0xFFFFD | 0x100000..=0x10FFFD + ) +} + +#[inline] +fn is_cjk_codepoint(cp: u32) -> bool { + matches!( + cp, + 0x3400..=0x4DBF + | 0x4E00..=0x9FFF + | 0xF900..=0xFAFF + | 0x20000..=0x2A6DF + | 0x2A700..=0x2B73F + | 0x2B740..=0x2B81F + | 0x2B820..=0x2CEAF + | 0x2F800..=0x2FA1F + ) +} + +pub fn evaluate_page_character_noise_internal(page: &str) -> PageCharacterNoise { + let mut metrics = PageCharacterNoise::default(); + for ch in page.chars() { + metrics.total_chars += 1; + let cp = ch as u32; + let mut is_bad = false; + if ch == '\u{FFFD}' { + metrics.replacement_count += 1; + is_bad = true; + } else if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') { + metrics.control_count += 1; + is_bad = true; + } else if is_private_use_codepoint(cp) { + metrics.private_use_count += 1; + is_bad = true; + } else if is_cjk_codepoint(cp) { + metrics.cjk_count += 1; + is_bad = true; + } + if is_bad { + metrics.bad_char_count += 1; + } + } + + metrics.bad_char_ratio = if metrics.total_chars > 0 { + metrics.bad_char_count as f64 / metrics.total_chars as f64 + } else { + 0.0 + }; + metrics +} + +fn extract_digit_group_spans(text: &str) -> Vec { + let mut spans = Vec::new(); + let mut current_start: Option = None; + + for (idx, ch) in text.char_indices() { + if ch.is_ascii_digit() { + if current_start.is_none() { + current_start 
= Some(idx); + } + } else if let Some(start) = current_start.take() { + spans.push(TokenSpan { start, end: idx }); + } + } + + if let Some(start) = current_start { + spans.push(TokenSpan { + start, + end: text.len(), + }); + } + + spans +} + +#[inline] +fn numeric_step_approx_eq(lhs: f64, rhs: f64) -> bool { + let scale = lhs.abs().max(rhs.abs()).max(1.0); + (lhs - rhs).abs() <= 1e-9 * scale +} + +#[derive(Debug, Clone, Copy, Default)] +struct OcrRepeatNoiseMetrics { + phrase_run_max: u64, + line_run_max: u64, + suspicious_line_count: u64, + suspicious_line_ratio: f64, + suspect: bool, +} + +fn extract_non_whitespace_tokens_with_spans(line: &str) -> Vec { + let mut tokens = Vec::new(); + let mut current_start: Option = None; + + for (idx, ch) in line.char_indices() { + if !ch.is_whitespace() { + if current_start.is_none() { + current_start = Some(idx); + } + } else if let Some(start) = current_start.take() { + tokens.push(TokenSpan { start, end: idx }); + } + } + + if let Some(start) = current_start { + tokens.push(TokenSpan { + start, + end: line.len(), + }); + } + + tokens +} + +fn normalize_line_for_repetition(line: &str) -> Option { + let trimmed = line.trim(); + if trimmed.is_empty() { + return None; + } + + let mut normalized = String::with_capacity(trimmed.len()); + let mut iter = trimmed.split_whitespace(); + if let Some(first) = iter.next() { + normalized.push_str(first); + for token in iter { + normalized.push(' '); + normalized.push_str(token); + } + } + Some(normalized) +} + +fn phrase_tokens_equal( + line: &str, + tokens: &[TokenSpan], + lhs: usize, + rhs: usize, + len: usize, +) -> bool { + (0..len).all(|offset| { + let lhs_token = &line[tokens[lhs + offset].start..tokens[lhs + offset].end]; + let rhs_token = &line[tokens[rhs + offset].start..tokens[rhs + offset].end]; + lhs_token == rhs_token + }) +} + +fn collect_repeat_phrase_debug_matches( + line: &str, + tokens: &[TokenSpan], + min_repeat_run: u64, +) -> Vec { + let mut spans = Vec::new(); + let 
min_run = min_repeat_run as usize; + if min_run < 2 || tokens.len() < min_run { + return spans; + } + + let max_phrase_len = 4usize.min(tokens.len() / min_run); + for phrase_len in 1..=max_phrase_len { + let mut i = 0usize; + while i + phrase_len * min_run <= tokens.len() { + let mut repeats = 1usize; + while i + phrase_len * (repeats + 1) <= tokens.len() + && phrase_tokens_equal(line, tokens, i, i + repeats * phrase_len, phrase_len) + { + repeats += 1; + } + if repeats >= min_run { + spans.push(DebugMatchSpan { + start: tokens[i].start, + end: tokens[i + phrase_len * repeats - 1].end, + match_type: "repeat_phrase_run", + }); + i += phrase_len * repeats; + } else { + i += 1; + } + } + } + + spans +} + +fn debug_match_merge_category(match_type: &'static str) -> Option<&'static str> { + match match_type { + "ascending_numeric_sequence" + | "repeat_numeric_run" + | "same_digit_numeric_run" + | "numeric_page_collapse" + | "numeric_block_collapse" => Some("numeric"), + "word_repeat" => Some("word"), + _ => None, + } +} + +fn gap_has_fewer_than_n_nonwhitespace_chars( + text: &str, + start: usize, + end: usize, + max_nonwhitespace: usize, +) -> bool { + if start >= end { + return true; + } + + let mut count = 0usize; + for ch in text[start..end].chars() { + if !ch.is_whitespace() { + count += 1; + if count >= max_nonwhitespace { + return false; + } + } + } + true +} + +fn merge_debug_spans( + text: &str, + spans: Vec, +) -> Vec<(usize, usize, Vec<&'static str>)> { + if spans.is_empty() { + return Vec::new(); + } + + let mut spans = spans; + spans.sort_by_key(|span| (span.start, span.end)); + + let mut merged: Vec<(usize, usize, Vec<&'static str>)> = Vec::new(); + for span in spans { + if let Some((start, end, types)) = merged.last_mut() { + let overlaps = span.start <= *end; + let same_category_gap_merge = !overlaps + && debug_match_merge_category(span.match_type).is_some() + && types.iter().any(|kind| { + debug_match_merge_category(*kind) == 
debug_match_merge_category(span.match_type) + }) + && gap_has_fewer_than_n_nonwhitespace_chars( + text, + *end, + span.start, + MERGE_SAME_CATEGORY_MAX_NONWHITESPACE_GAP, + ); + if overlaps || same_category_gap_merge { + *end = (*end).max(span.end); + if !types.contains(&span.match_type) { + types.push(span.match_type); + } + *start = (*start).min(span.start); + continue; + } + } + merged.push((span.start, span.end, vec![span.match_type])); + } + + for (_, _, types) in &mut merged { + types.sort_unstable(); + types.dedup(); + } + + merged +} + +fn annotate_text_with_debug_spans( + text: &str, + spans: Vec, +) -> Option<(String, Vec<&'static str>, u64)> { + let merged = merge_debug_spans(text, spans); + if merged.is_empty() { + return None; + } + + let mut annotated = String::with_capacity(text.len() + merged.len() * 48); + let mut pos = 0usize; + let mut match_types: Vec<&'static str> = Vec::new(); + for (start, end, types) in &merged { + if *start > pos { + annotated.push_str(&text[pos..*start]); + } + let type_attr = types.join(","); + annotated.push_str("'); + annotated.push_str(&text[*start..*end]); + annotated.push_str(""); + pos = *end; + for kind in types { + if !match_types.contains(kind) { + match_types.push(*kind); + } + } + } + if pos < text.len() { + annotated.push_str(&text[pos..]); + } + + Some((annotated, match_types, merged.len() as u64)) +} + +fn collect_numeric_page_collapse_span(page: &str, min_page_tokens: u64) -> Option { + let tokens = extract_non_whitespace_tokens_with_spans(page); + let mut page_start: Option = None; + let mut page_end: Option = None; + let mut first_start: Option = None; + let mut last_end: Option = None; + let mut numeric_token_count = 0usize; + let mut numeric_atom_count = 0usize; + for token in tokens { + let raw = &page[token.start..token.end]; + if page_start.is_none() { + page_start = Some(token.start); + } + page_end = Some(token.end); + if is_numeric_page_ignored_token(raw) { + continue; + } + let (trim_start, 
trim_end) = trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + if !is_numeric_page_token_body(trimmed) { + return None; + } + let abs_start = token.start + trim_start; + let abs_end = token.start + trim_end; + if first_start.is_none() { + first_start = Some(abs_start); + } + last_end = Some(abs_end); + numeric_token_count += 1; + numeric_atom_count += extract_digit_group_spans(trimmed).len(); + } + + if numeric_token_count < min_page_tokens as usize + && numeric_atom_count < NUMERIC_PAGE_COLLAPSE_MIN_ATOMS as usize + { + return None; + } + + Some(DebugMatchSpan { + start: page_start.or(first_start)?, + end: page_end.or(last_end)?, + match_type: "numeric_page_collapse", + }) +} + +fn collect_numeric_block_collapse_spans(page: &str) -> Vec { + let mut lines: Vec<(usize, usize, NumericLineSummary)> = Vec::new(); + let mut offset = 0usize; + for segment in page.split_inclusive('\n') { + let line = segment.strip_suffix('\n').unwrap_or(segment); + let summary = summarize_numeric_line(line); + lines.push((offset, offset + segment.len(), summary)); + offset += segment.len(); + } + if offset < page.len() { + let line = &page[offset..]; + lines.push((offset, page.len(), summarize_numeric_line(line))); + } + + let mut spans = Vec::new(); + let mut idx = 0usize; + while idx < lines.len() { + let (_, _, summary) = lines[idx]; + let is_seed = !summary.has_alpha + && !summary.rejected_non_numeric + && summary.numeric_atom_count >= NUMERIC_BLOCK_SEED_MIN_ATOMS; + if !is_seed { + idx += 1; + continue; + } + + let mut start_idx = idx; + let mut end_idx = idx; + let mut total_atoms = summary.numeric_atom_count; + + while start_idx > 0 { + let prev = lines[start_idx - 1].2; + let prev_ok = prev.is_blank + || (!prev.has_alpha && !prev.rejected_non_numeric && prev.numeric_token_count > 0); + if !prev_ok { + break; + } + start_idx -= 1; + total_atoms += prev.numeric_atom_count; + } + + while end_idx + 1 < lines.len() { + let next = lines[end_idx + 1].2; + let 
next_ok = next.is_blank + || (!next.has_alpha && !next.rejected_non_numeric && next.numeric_token_count > 0); + if !next_ok { + break; + } + end_idx += 1; + total_atoms += next.numeric_atom_count; + } + + if total_atoms >= NUMERIC_PAGE_COLLAPSE_MIN_ATOMS as usize { + let first_nonblank = (start_idx..=end_idx).find(|i| !lines[*i].2.is_blank); + let last_nonblank = (start_idx..=end_idx).rfind(|i| !lines[*i].2.is_blank); + if let (Some(first), Some(last)) = (first_nonblank, last_nonblank) { + spans.push(DebugMatchSpan { + start: lines[first].0, + end: lines[last].1, + match_type: "numeric_block_collapse", + }); + } + } + + idx = end_idx + 1; + } + + spans +} + +fn collect_numeric_progression_matches( + line: &str, + tokens: &[TokenSpan], + min_progress_steps: u64, +) -> Vec { + let min_steps = min_progress_steps as usize; + if min_steps < 2 || tokens.len() < min_steps { + return Vec::new(); + } + + let numeric_tokens: Vec> = tokens + .iter() + .map(|token| { + let raw = &line[token.start..token.end]; + let (trim_start, trim_end) = trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + let value = parse_simple_number(trimmed)?; + Some((token.start + trim_start, token.start + trim_end, value)) + }) + .collect(); + + let mut spans = Vec::new(); + let mut i = 0usize; + while i + min_steps <= numeric_tokens.len() { + let Some((start, _, first)) = numeric_tokens[i] else { + i += 1; + continue; + }; + let Some((_, _, second)) = numeric_tokens[i + 1] else { + i += 1; + continue; + }; + + let step = second - first; + if !step.is_finite() || step <= 0.0 { + i += 1; + continue; + } + + let mut j = i + 1; + while j + 1 < numeric_tokens.len() { + let Some((_, _, current)) = numeric_tokens[j] else { + break; + }; + let Some((_, _, next)) = numeric_tokens[j + 1] else { + break; + }; + if numeric_step_approx_eq(next - current, step) { + j += 1; + } else { + break; + } + } + + let run_len = j - i + 1; + if run_len >= min_steps { + let (_, end, _) = 
numeric_tokens[j].expect("numeric run end"); + spans.push(DebugMatchSpan { + start, + end, + match_type: "ascending_numeric_sequence", + }); + i = j + 1; + } else { + i += 1; + } + } + + spans +} + +fn collect_compact_repeat_numeric_matches( + line: &str, + tokens: &[TokenSpan], + min_repeat_steps: u64, +) -> Vec { + let min_steps = min_repeat_steps as usize; + if min_steps < 2 { + return Vec::new(); + } + + let mut spans = Vec::new(); + for token in tokens { + let raw = &line[token.start..token.end]; + let Some((trim_start, trim_end)) = trim_numeric_token_bounds(raw) else { + continue; + }; + let trimmed = &raw[trim_start..trim_end]; + let digit_groups = extract_digit_group_spans(trimmed); + if digit_groups.len() < min_steps { + continue; + } + + let first_group = &trimmed[digit_groups[0].start..digit_groups[0].end]; + if digit_groups + .iter() + .any(|group| &trimmed[group.start..group.end] != first_group) + { + continue; + } + + let mut separators_ok = true; + for pair in digit_groups.windows(2) { + let separator = &trimmed[pair[0].end..pair[1].start]; + if separator.is_empty() + || separator + .chars() + .any(|ch| ch.is_ascii_alphanumeric() || ch.is_whitespace()) + { + separators_ok = false; + break; + } + } + if !separators_ok { + continue; + } + + let trailing = &trimmed[digit_groups.last().expect("digit group").end..]; + if trailing + .chars() + .any(|ch| ch.is_ascii_alphanumeric() || ch.is_whitespace()) + { + continue; + } + + spans.push(DebugMatchSpan { + start: token.start + trim_start, + end: token.start + trim_end, + match_type: "repeat_numeric_run", + }); + } + + spans +} + +fn collect_same_digit_numeric_matches( + line: &str, + tokens: &[TokenSpan], + min_same_digit_steps: u64, +) -> Vec { + let min_steps = min_same_digit_steps as usize; + if min_steps < 2 || tokens.len() < min_steps { + return Vec::new(); + } + + let signatures: Vec> = tokens + .iter() + .map(|token| { + let raw = &line[token.start..token.end]; + let (trim_start, trim_end) = 
trim_numeric_token_bounds(raw)?; + let trimmed = &raw[trim_start..trim_end]; + let digit = repeated_digit_token(trimmed)?; + Some((token.start + trim_start, token.start + trim_end, digit)) + }) + .collect(); + + let mut spans = Vec::new(); + let mut i = 0usize; + while i + min_steps <= signatures.len() { + let Some((start, _, digit)) = signatures[i] else { + i += 1; + continue; + }; + + let mut j = i + 1; + while j < signatures.len() && signatures[j].map(|(_, _, current)| current) == Some(digit) { + j += 1; + } + + let run_len = j - i; + if run_len >= min_steps { + let (_, end, _) = signatures[j - 1].expect("same-digit run end"); + spans.push(DebugMatchSpan { + start, + end, + match_type: "same_digit_numeric_run", + }); + i = j; + } else { + i += 1; + } + } + + spans +} + +fn annotate_line_with_numeric_debug_matches( + line: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let tokens = extract_non_whitespace_tokens_with_spans(line); + if tokens.is_empty() { + return None; + } + + let mut spans = Vec::new(); + spans.extend(collect_numeric_progression_matches( + line, + &tokens, + min_progress_steps, + )); + spans.extend(collect_compact_repeat_numeric_matches( + line, + &tokens, + min_repeat_steps, + )); + spans.extend(collect_same_digit_numeric_matches( + line, + &tokens, + min_same_digit_steps, + )); + annotate_text_with_debug_spans(line, spans) +} + +fn annotate_line_with_debug_matches( + line: &str, + min_repeat_run: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let tokens = extract_non_whitespace_tokens_with_spans(line); + if tokens.is_empty() { + return None; + } + + let spans = collect_repeat_phrase_debug_matches(line, &tokens, min_repeat_run); + let merged = merge_debug_spans(line, spans); + if merged.is_empty() { + return None; + } + + let mut annotated = String::with_capacity(line.len() + merged.len() * 48); + let mut pos = 0usize; + let mut line_types: 
Vec<&'static str> = Vec::new(); + for (start, end, types) in &merged { + if *start > pos { + annotated.push_str(&line[pos..*start]); + } + let type_attr = types.join(","); + annotated.push_str("'); + annotated.push_str(&line[*start..*end]); + annotated.push_str(""); + pos = *end; + for kind in types { + if !line_types.contains(kind) { + line_types.push(*kind); + } + } + } + if pos < line.len() { + annotated.push_str(&line[pos..]); + } + + Some((annotated, line_types, merged.len() as u64)) +} + +fn compute_repeat_phrase_run_max(trimmed: &str, min_repeat_run: u64) -> u64 { + let tokens = extract_non_whitespace_tokens_with_spans(trimmed); + let min_run = min_repeat_run as usize; + if min_run < 2 || tokens.len() < min_run { + return 0; + } + + let max_phrase_len = 4usize.min(tokens.len() / min_run); + let mut phrase_run_max = 0u64; + for phrase_len in 1..=max_phrase_len { + let mut i = 0usize; + while i + phrase_len * min_run <= tokens.len() { + let mut repeats = 1usize; + while i + phrase_len * (repeats + 1) <= tokens.len() + && phrase_tokens_equal(trimmed, &tokens, i, i + repeats * phrase_len, phrase_len) + { + repeats += 1; + } + if repeats >= min_run { + phrase_run_max = phrase_run_max.max(repeats as u64); + i += phrase_len * repeats; + } else { + i += 1; + } + } + } + + phrase_run_max +} + +fn collect_repeat_line_flags(lines: &[Option], min_repeat_run: u64) -> (Vec, u64) { + let min_run = min_repeat_run as usize; + let mut flags = vec![false; lines.len()]; + if min_run < 2 || lines.len() < min_run { + return (flags, 0); + } + + let mut run_max = 0u64; + let mut i = 0usize; + while i < lines.len() { + let Some(current) = lines[i].as_ref() else { + i += 1; + continue; + }; + + let mut j = i + 1; + while j < lines.len() && lines[j].as_ref() == Some(current) { + j += 1; + } + let run_len = j - i; + if run_len >= min_run { + run_max = run_max.max(run_len as u64); + for flag in &mut flags[i..j] { + *flag = true; + } + } + i = j; + } + + (flags, run_max) +} + +fn 
finalize_ocr_repeat_noise(
+    phrase_run_max: u64,
+    line_run_max: u64,
+    suspicious_line_count: u64,
+    non_empty_lines: usize,
+) -> OcrRepeatNoiseMetrics {
+    let suspicious_line_ratio = if non_empty_lines > 0 {
+        suspicious_line_count as f64 / non_empty_lines as f64
+    } else {
+        0.0
+    };
+    let suspect = suspicious_line_count > 0;
+
+    OcrRepeatNoiseMetrics {
+        phrase_run_max,
+        line_run_max,
+        suspicious_line_count,
+        suspicious_line_ratio,
+        suspect,
+    }
+}
+
+/// Scans `text` line by line and returns the script metrics gathered by `ScriptScanner`
+/// together with the repeat-noise metrics for the same lines.
+///
+/// Blank lines, `PAGE_SPLIT_MARKER` lines and table rows are skipped entirely: they are
+/// neither fed to the scanner nor counted as non-empty lines. A line counts as suspicious
+/// when it contains a repeated-phrase run, or belongs to a run of identical consecutive
+/// lines, of at least `min_repeat_run` repetitions.
+///
+/// Both detectors report no run when `min_repeat_run` is below 2, so no line is flagged
+/// in that case.
+fn compute_ocr_profile(
+    text: &str,
+    min_repeat_run: u64,
+) -> (glossapi_rs_common::ScriptMetrics, OcrRepeatNoiseMetrics) {
+    let mut scanner = ScriptScanner::new();
+    let mut non_empty_lines = 0usize;
+    let mut phrase_run_max = 0u64;
+    // One entry per counted line, kept in step so the two flag lists can be zipped below.
+    let mut line_repeat_inputs = Vec::new();
+    let mut phrase_suspicious_lines = Vec::new();
+
+    for segment in text.split_inclusive('\n') {
+        let trimmed = segment.trim();
+        if trimmed.is_empty()
+            || trimmed == PAGE_SPLIT_MARKER
+            || is_table_line_trimmed(trimmed)
+        {
+            continue;
+        }
+
+        non_empty_lines += 1;
+        scanner.observe_str(segment);
+        let line_phrase_run_max = compute_repeat_phrase_run_max(trimmed, min_repeat_run);
+        phrase_run_max = phrase_run_max.max(line_phrase_run_max);
+        // A run length of 0 means "no run found". Without the `> 0` check a threshold of
+        // 0 would satisfy `0 >= 0` and mark every single line as suspicious.
+        phrase_suspicious_lines
+            .push(line_phrase_run_max > 0 && line_phrase_run_max >= min_repeat_run);
+        line_repeat_inputs.push(normalize_line_for_repetition(trimmed));
+    }
+
+    let (repeat_line_flags, line_run_max) =
+        collect_repeat_line_flags(&line_repeat_inputs, min_repeat_run);
+    let suspicious_line_count = phrase_suspicious_lines
+        .iter()
+        .zip(repeat_line_flags.iter())
+        .filter(|(phrase_flag, line_flag)| **phrase_flag || **line_flag)
+        .count() as u64;
+
+    (
+        scanner.finish(),
+        finalize_ocr_repeat_noise(
+            phrase_run_max,
+            line_run_max,
+            suspicious_line_count,
+            non_empty_lines,
+        ),
+    )
+}
+
+fn split_pages(text: &str) -> Vec {
+    let mut pages = Vec::new();
+    let mut current = String::new();
+
+    for segment in text.split_inclusive('\n') {
+        if segment.trim() == PAGE_SPLIT_MARKER {
+
pages.push(current); + current = String::new(); + continue; + } + current.push_str(segment); + } + pages.push(current); + pages +} + +fn parse_source_stem(stem: &str) -> (String, u64) { + if let Some((base, suffix)) = stem.rsplit_once("__p") { + if let Some((start, _end)) = suffix.split_once('-') { + if let Ok(start_page) = start.parse::() { + return (base.to_string(), start_page); + } + } + } + (stem.to_string(), 1) +} + +fn annotate_page_for_debug( + page: &str, + min_repeat_run: u64, +) -> Option<(String, Vec<&'static str>, u64)> { + let mut segments: Vec<(&str, &str)> = Vec::new(); + let mut normalized_lines: Vec> = Vec::new(); + for segment in page.split_inclusive('\n') { + let (line, newline) = if let Some(body) = segment.strip_suffix('\n') { + (body, "\n") + } else { + (segment, "") + }; + segments.push((line, newline)); + let trimmed = line.trim(); + if trimmed.is_empty() || is_table_line_trimmed(trimmed) { + normalized_lines.push(None); + } else { + normalized_lines.push(normalize_line_for_repetition(trimmed)); + } + } + + let (repeat_line_flags, _line_run_max) = + collect_repeat_line_flags(&normalized_lines, min_repeat_run); + + let mut annotated = String::with_capacity(page.len()); + let mut page_types: Vec<&'static str> = Vec::new(); + let mut match_count = 0u64; + + for (idx, (line, newline)) in segments.iter().enumerate() { + let line_debug = annotate_line_with_debug_matches(line, min_repeat_run); + let line_repeat_flag = repeat_line_flags.get(idx).copied().unwrap_or(false); + + let mut line_content = + if let Some((annotated_line, line_types, line_match_count)) = line_debug { + match_count += line_match_count; + for kind in line_types { + if !page_types.contains(&kind) { + page_types.push(kind); + } + } + annotated_line + } else { + (*line).to_string() + }; + + if line_repeat_flag { + if !page_types.contains(&"repeat_line_run") { + page_types.push("repeat_line_run"); + } + match_count += 1; + line_content = format!("{}", line_content); + } + + 
annotated.push_str(&line_content); + annotated.push_str(newline); + } + + if match_count == 0 { + return None; + } + + page_types.sort_unstable(); + page_types.dedup(); + Some((annotated, page_types, match_count)) +} + +pub fn annotate_numeric_debug_page_internal( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Option<(String, Vec, u64)> { + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + let (annotated_page, match_types, match_count) = annotate_text_with_debug_spans(page, spans)?; + Some(( + annotated_page, + match_types.into_iter().map(str::to_string).collect(), + match_count, + )) +} + +pub fn find_numeric_debug_page_spans_internal( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + .into_iter() + .map(|span| NumericDebugSpan { + start: span.start, + end: span.end, + match_type: span.match_type.to_string(), + }) + .collect() +} + +const WORD_REPEAT_HASH_MASK: u64 = (1u64 << 63).wrapping_mul(2).wrapping_sub(1); +const WORD_REPEAT_HASH_BASE: u64 = 1469598103934665603u64; + +#[inline] +fn hybrid_text_char_boundaries(text: &str) -> Vec { + let mut boundaries = Vec::with_capacity(text.chars().count() + 1); + for (byte_idx, _) in text.char_indices() { + boundaries.push(byte_idx); + } + boundaries.push(text.len()); + boundaries +} + +fn hybrid_byte_to_char_idx(boundaries: &[usize], byte_idx: usize) -> usize { + match boundaries.binary_search(&byte_idx) { + Ok(idx) => idx, + Err(idx) => idx, + } +} + +fn hybrid_normalize_body(text: &str) -> String { + let mut out = String::with_capacity(text.len()); + for ch in text.chars() { + for lower in ch.to_lowercase() { + let lower = if lower == 'ς' { 'σ' } else { lower }; + for sub in lower.to_string().nfd() { + if 
sub.is_alphanumeric() {
+                    let mapped = match sub {
+                        'ο' => 'o',
+                        'κ' => 'k',
+                        _ => sub,
+                    };
+                    out.push(mapped);
+                }
+            }
+        }
+    }
+    out
+}
+
+/// Heuristically reports whether `text` looks like markup rather than prose.
+///
+/// Returns `true` when the text contains, case-insensitively, one of the attribute or
+/// image hints below, or when some `<` is followed later in the text by a `>`.
+/// Empty text is never markup.
+fn hybrid_has_markup_body(text: &str) -> bool {
+    const MARKUP_HINTS: [&str; 7] = [
+        "src=", "alt=", "image_", ".png", ".jpg", ".jpeg", ".gif",
+    ];
+
+    if text.is_empty() {
+        return false;
+    }
+    let lower = text.to_lowercase();
+    if MARKUP_HINTS.iter().any(|hint| lower.contains(*hint)) {
+        return true;
+    }
+
+    // "Some `<` has a `>` after it" holds exactly when the first `<` precedes the last
+    // `>`. Two linear scans replace re-scanning the tail of the text for every `<`.
+    // Both characters are ASCII, so the byte offsets from `find`/`rfind` compare safely.
+    match (text.find('<'), text.rfind('>')) {
+        (Some(first_open), Some(last_close)) => first_open < last_close,
+        _ => false,
+    }
+}
+
+fn hybrid_classify_numeric_field(token: &str) -> Option<(HybridFieldKind, Vec, String)> {
+    let token = token.trim();
+    if token.is_empty() {
+        return None;
+    }
+
+    let trailing_paren = token.ends_with(')');
+    let trailing_dot = token.ends_with('.');
+    let stripped = if trailing_paren || trailing_dot {
+        &token[..token.len() - 1]
+    } else {
+        token
+    };
+    if stripped.is_empty() {
+        return None;
+    }
+
+    if stripped.contains('/') {
+        return Some((HybridFieldKind::NumericValue, Vec::new(), String::new()));
+    }
+
+    let parts: Vec<&str> = stripped.split('.').collect();
+    if parts.is_empty() || parts.iter().any(|part| part.is_empty() || !part.chars().all(|ch| ch.is_ascii_digit())) {
+        return None;
+    }
+
+    let mut numbers = Vec::with_capacity(parts.len());
+    for part in &parts {
+        numbers.push(part.parse::().ok()?);
+    }
+
+    let mut shape = std::iter::repeat("#")
+        .take(numbers.len())
+        .collect::>()
+        .join(".");
+    if trailing_paren {
+        shape.push(')');
+    } else if trailing_dot {
+        shape.push('.');
+    }
+
+    let field_kind = if trailing_paren || trailing_dot {
+        HybridFieldKind::HeaderCounter
+    } else if numbers.len() >= 3 {
+        HybridFieldKind::HeaderCounter
+    } else if numbers.len() == 2 && parts.last().map(|part| part.len()).unwrap_or(0) <= 2 {
+        HybridFieldKind::HeaderCounter
+    } else {
+
HybridFieldKind::NumericValue + }; + + Some((field_kind, numbers, shape)) +} + +fn hybrid_classify_inline_numeric_field(token: &str) -> bool { + let stripped = token.trim(); + if stripped.is_empty() { + return false; + } + + if stripped.chars().all(|ch| ch.is_ascii_digit()) { + return true; + } + + if stripped.matches('/').count() == 1 { + let mut parts = stripped.split('/'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + return !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs != "0"; + } + + let decimal_candidate = stripped.replacen(',', ".", 1); + if decimal_candidate.matches('.').count() == 1 { + let mut parts = decimal_candidate.split('.'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + return !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()); + } + + false +} + +fn hybrid_parse_numeric_value(token: &str) -> Option { + let stripped = token.trim(); + if stripped.is_empty() { + return None; + } + + if stripped.chars().all(|ch| ch.is_ascii_digit()) { + return stripped.parse::().ok().map(|value| value as f64); + } + + if stripped.matches('/').count() == 1 { + let mut parts = stripped.split('/'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + if !lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + { + let lhs_value = lhs.parse::().ok()?; + let rhs_value = rhs.parse::().ok()?; + if rhs_value != 0.0 { + return Some(lhs_value / rhs_value); + } + } + return None; + } + + let decimal_candidate = stripped.replacen(',', ".", 1); + if decimal_candidate.matches('.').count() == 1 { + let mut parts = decimal_candidate.split('.'); + let lhs = parts.next().unwrap_or(""); + let rhs = parts.next().unwrap_or(""); + if 
!lhs.is_empty() + && !rhs.is_empty() + && lhs.chars().all(|ch| ch.is_ascii_digit()) + && rhs.chars().all(|ch| ch.is_ascii_digit()) + { + return decimal_candidate.parse::().ok(); + } + } + + None +} + +fn hybrid_next_char(text: &str, byte_idx: usize) -> Option<(char, usize)> { + let ch = text[byte_idx..].chars().next()?; + Some((ch, byte_idx + ch.len_utf8())) +} + +fn hybrid_previous_char(text: &str, byte_idx: usize) -> Option { + text[..byte_idx].chars().next_back() +} + +fn hybrid_parse_prefix_at(text: &str, start: usize) -> Option { + if start >= text.len() { + return None; + } + if let Some(prev) = hybrid_previous_char(text, start) { + if prev.is_ascii_digit() { + return None; + } + } + + let (first, mut idx) = hybrid_next_char(text, start)?; + if !first.is_ascii_digit() { + return None; + } + while idx < text.len() { + let (ch, next_idx) = hybrid_next_char(text, idx)?; + if !ch.is_ascii_digit() { + break; + } + idx = next_idx; + } + + if idx >= text.len() { + return None; + } + let (delimiter, mut end_idx) = hybrid_next_char(text, idx)?; + match delimiter { + ')' => {} + '.' => { + loop { + let mut cursor = end_idx; + let mut saw_digit = false; + while cursor < text.len() { + let (ch, next_cursor) = hybrid_next_char(text, cursor)?; + if !ch.is_ascii_digit() { + break; + } + saw_digit = true; + cursor = next_cursor; + } + if saw_digit { + if cursor < text.len() { + let (ch, next_cursor) = hybrid_next_char(text, cursor)?; + if ch == '.' 
{ + end_idx = next_cursor; + continue; + } + } + end_idx = cursor; + } + break; + } + } + _ => return None, + } + + let mut lookahead = end_idx; + while lookahead < text.len() { + let (ch, next_idx) = hybrid_next_char(text, lookahead)?; + if !ch.is_whitespace() { + return ch.is_alphabetic().then_some(end_idx); + } + lookahead = next_idx; + } + None +} + +fn hybrid_extract_numbered_items(analysis_text: &str) -> Vec { + let boundaries = hybrid_text_char_boundaries(analysis_text); + let mut candidates: Vec = Vec::new(); + let mut byte_idx = 0usize; + while byte_idx < analysis_text.len() { + let (ch, next_idx) = match hybrid_next_char(analysis_text, byte_idx) { + Some(value) => value, + None => break, + }; + if ch.is_ascii_digit() { + if let Some(prefix_end_byte) = hybrid_parse_prefix_at(analysis_text, byte_idx) { + let prefix = &analysis_text[byte_idx..prefix_end_byte]; + if let Some((field_kind, numbers, shape)) = hybrid_classify_numeric_field(prefix) { + candidates.push(HybridCandidate { + prefix_start_byte: byte_idx, + prefix_end_byte, + field_kind, + numbers, + shape, + }); + } + byte_idx = prefix_end_byte; + continue; + } + } + byte_idx = next_idx; + } + + let mut items: Vec = Vec::new(); + for (idx, candidate) in candidates.iter().enumerate() { + let next_start = candidates + .get(idx + 1) + .map(|item| item.prefix_start_byte) + .unwrap_or_else(|| analysis_text.len()); + let body_raw = analysis_text[candidate.prefix_end_byte..next_start].trim(); + if hybrid_has_markup_body(body_raw) { + continue; + } + let body_key = hybrid_normalize_body(body_raw); + let has_alpha = body_key.chars().any(|ch| ch.is_alphabetic()); + if !has_alpha { + continue; + } + let body_is_full = body_key.chars().count() >= HYBRID_REPEAT_MIN_BODY_ALNUM; + items.push(HybridNumberedItem { + start: hybrid_byte_to_char_idx(&boundaries, candidate.prefix_start_byte), + end: hybrid_byte_to_char_idx(&boundaries, next_start), + field_kind: candidate.field_kind, + numbers: candidate.numbers.clone(), + 
shape: candidate.shape.clone(), + body_key, + body_is_full, + }); + } + + items +} + +fn hybrid_clause_ranges(text: &str) -> Vec<(usize, usize)> { + let mut ranges: Vec<(usize, usize)> = Vec::new(); + let mut clause_start = 0usize; + let mut iter = text.char_indices().peekable(); + while let Some((idx, ch)) = iter.next() { + let is_delimiter = match ch { + ';' | '\n' => true, + ',' => match iter.peek() { + Some((_, next_ch)) => !next_ch.is_ascii_digit(), + None => true, + }, + _ => false, + }; + if is_delimiter { + ranges.push((clause_start, idx)); + clause_start = idx + ch.len_utf8(); + } + } + ranges.push((clause_start, text.len())); + ranges +} + +fn hybrid_extract_inline_items(analysis_text: &str) -> Vec { + let boundaries = hybrid_text_char_boundaries(analysis_text); + let clause_ranges = hybrid_clause_ranges(analysis_text); + let mut items: Vec = Vec::new(); + + for (clause_index, (raw_start, raw_end)) in clause_ranges.iter().enumerate() { + let clause = &analysis_text[*raw_start..*raw_end]; + if clause.trim().is_empty() { + continue; + } + + let leading_ws = clause.len() - clause.trim_start().len(); + let trailing_ws = clause.len() - clause.trim_end().len(); + let clause_start_abs = raw_start + leading_ws; + let clause_end_abs = raw_end - trailing_ws; + if clause_start_abs >= clause_end_abs { + continue; + } + + let clause_text = &analysis_text[clause_start_abs..clause_end_abs]; + if clause_text.is_empty() || hybrid_has_markup_body(clause_text) { + continue; + } + + let mut working_offset = clause_start_abs; + let mut working_text = clause_text; + if let Some(prefix_end) = hybrid_parse_prefix_at(working_text, 0) { + let trimmed = working_text[prefix_end..].trim_start(); + let trimmed_leading = working_text[prefix_end..].len() - trimmed.len(); + working_offset += prefix_end + trimmed_leading; + working_text = trimmed; + } + if working_text.is_empty() { + continue; + } + + let mut tokens: Vec = Vec::new(); + let mut numeric_positions: Vec = Vec::new(); + let 
mut token_byte = 0usize; + while token_byte < working_text.len() { + let (ch, next_idx) = match hybrid_next_char(working_text, token_byte) { + Some(value) => value, + None => break, + }; + if ch.is_ascii_digit() { + let mut end = next_idx; + loop { + let mut cursor = end; + while cursor < working_text.len() { + let (digit_ch, digit_next) = match hybrid_next_char(working_text, cursor) { + Some(value) => value, + None => break, + }; + if !digit_ch.is_ascii_digit() { + break; + } + cursor = digit_next; + } + end = cursor; + if end >= working_text.len() { + break; + } + let (sep, sep_next) = match hybrid_next_char(working_text, end) { + Some(value) => value, + None => break, + }; + if !matches!(sep, '.' | ',' | '/') { + break; + } + if sep_next >= working_text.len() { + break; + } + let (after_sep, _) = match hybrid_next_char(working_text, sep_next) { + Some(value) => value, + None => break, + }; + if !after_sep.is_ascii_digit() { + break; + } + end = sep_next; + } + let token = &working_text[token_byte..end]; + if hybrid_classify_inline_numeric_field(token) { + if let Some(parsed_value) = hybrid_parse_numeric_value(token) { + numeric_positions.push(tokens.len()); + tokens.push(HybridToken { + kind: HybridTokenKind::Numeric, + start: hybrid_byte_to_char_idx(&boundaries, working_offset + token_byte), + end: hybrid_byte_to_char_idx(&boundaries, working_offset + end), + token_key: None, + numeric_value: Some(parsed_value), + }); + } + } + token_byte = end; + continue; + } + if ch.is_alphabetic() { + let mut end = next_idx; + while end < working_text.len() { + let (next_ch, next_end) = match hybrid_next_char(working_text, end) { + Some(value) => value, + None => break, + }; + if !next_ch.is_alphabetic() { + break; + } + end = next_end; + } + let token = &working_text[token_byte..end]; + let token_key = hybrid_normalize_body(token); + if !token_key.is_empty() { + tokens.push(HybridToken { + kind: HybridTokenKind::Alpha, + start: hybrid_byte_to_char_idx(&boundaries, 
working_offset + token_byte), + end: hybrid_byte_to_char_idx(&boundaries, working_offset + end), + token_key: Some(token_key), + numeric_value: None, + }); + } + token_byte = end; + continue; + } + token_byte = next_idx; + } + + if numeric_positions.len() != 1 { + continue; + } + let numeric_pos = numeric_positions[0]; + let numeric_token = &tokens[numeric_pos]; + let left_alpha: Vec<&HybridToken> = tokens[..numeric_pos] + .iter() + .filter(|token| token.kind == HybridTokenKind::Alpha) + .collect(); + let right_alpha: Vec<&HybridToken> = tokens[numeric_pos + 1..] + .iter() + .filter(|token| token.kind == HybridTokenKind::Alpha) + .collect(); + + let left_start = left_alpha.len().saturating_sub(HYBRID_INLINE_CONTEXT_WORDS); + let left_context = &left_alpha[left_start..]; + let right_limit = std::cmp::min(HYBRID_INLINE_CONTEXT_WORDS, right_alpha.len()); + let right_context = &right_alpha[..right_limit]; + let alpha_word_count = left_context.len() + right_context.len(); + if alpha_word_count < HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS { + continue; + } + + let mut context_parts: Vec = + Vec::with_capacity(left_context.len() + 1 + right_context.len()); + for token in left_context { + if let Some(token_key) = &token.token_key { + context_parts.push(token_key.clone()); + } + } + context_parts.push("num".to_string()); + for token in right_context { + if let Some(token_key) = &token.token_key { + context_parts.push(token_key.clone()); + } + } + let context_key = hybrid_normalize_body(&context_parts.join(" ")); + if context_key.chars().count() < HYBRID_INLINE_CONTEXT_MIN_CHARS { + continue; + } + + let item_start = left_context + .first() + .map(|token| token.start) + .unwrap_or(numeric_token.start); + let item_end = right_context + .last() + .map(|token| token.end) + .unwrap_or(numeric_token.end); + items.push(HybridInlineItem { + start: item_start, + end: item_end, + clause_index, + inline_context_key: context_key, + numeric_value: numeric_token.numeric_value.unwrap_or(0.0), 
+ }); + } + + items +} -use rayon::prelude::*; -use std::fs::{self, File}; -use std::io::Read; -use std::path::{Path, PathBuf}; -use walkdir::WalkDir; -// Avoid heavy regex for table detection; use lightweight checks instead +fn hybrid_partial_body_matches(candidate_body_key: &str, target_body_key: &str) -> bool { + if candidate_body_key.is_empty() || target_body_key.is_empty() || candidate_body_key == target_body_key { + return false; + } + if !target_body_key.starts_with(candidate_body_key) { + return false; + } + let target_len = target_body_key.chars().count(); + let candidate_len = candidate_body_key.chars().count(); + let min_chars = std::cmp::min(4usize, target_len); + let min_ratio_chars = std::cmp::max(1usize, (target_len + 1) / 2); + candidate_len >= std::cmp::min(min_chars, min_ratio_chars) +} -const GREEK_BLOCK_1: std::ops::RangeInclusive = 0x0370..=0x03FF; // Greek & Coptic -const GREEK_BLOCK_2: std::ops::RangeInclusive = 0x1F00..=0x1FFF; // Greek Extended +fn hybrid_header_progresses(previous: &HybridNumberedItem, current: &HybridNumberedItem) -> bool { + previous.field_kind == HybridFieldKind::HeaderCounter + && current.field_kind == HybridFieldKind::HeaderCounter + && !previous.numbers.is_empty() + && previous.numbers.len() == current.numbers.len() + && previous.numbers[..previous.numbers.len() - 1] == current.numbers[..current.numbers.len() - 1] + && current.numbers.last().copied() == previous.numbers.last().copied().map(|value| value + 1) +} -#[inline(always)] -fn is_greek(cp: u32) -> bool { - GREEK_BLOCK_1.contains(&cp) || GREEK_BLOCK_2.contains(&cp) +fn hybrid_header_is_parent(previous: &HybridNumberedItem, current: &HybridNumberedItem) -> bool { + previous.field_kind == HybridFieldKind::HeaderCounter + && current.field_kind == HybridFieldKind::HeaderCounter + && !previous.numbers.is_empty() + && previous.numbers.len() + 1 == current.numbers.len() + && current.numbers[..current.numbers.len() - 1] == previous.numbers[..] 
} -#[inline(always)] -fn is_combining_mark(cp: u32) -> bool { - (0x0300..=0x036F).contains(&cp) || (0x1DC0..=0x1DFF).contains(&cp) || (0x20D0..=0x20FF).contains(&cp) +fn hybrid_extend_tail_span_end( + items: &[HybridNumberedItem], + run_start: usize, + run_end: usize, + expected_body_key: &str, +) -> usize { + let span_end = items[run_end - 1].end; + if run_end >= items.len() { + return span_end; + } + let tail = &items[run_end]; + if tail.field_kind != HybridFieldKind::HeaderCounter + || tail.shape != items[run_start].shape + || !hybrid_header_progresses(&items[run_end - 1], tail) + || !hybrid_partial_body_matches(&tail.body_key, expected_body_key) + { + return span_end; + } + tail.end } -#[inline(always)] -fn is_vowel(cp: u32) -> bool { - matches!( - cp, - 0x0391 | 0x03B1 | 0x0386 | 0x03AC | // Αα Άά - 0x0395 | 0x03B5 | 0x0388 | 0x03AD | // Εε Έέ - 0x0397 | 0x03B7 | 0x0389 | 0x03AE | // Ηη Ήή - 0x0399 | 0x03B9 | 0x038A | 0x03AF | 0x03CA | 0x03CB | 0x039F | 0x03BF | - 0x038C | 0x03CC | 0x03C5 | 0x03B0 | 0x03CD | 0x03A5 | 0x038E | - 0x03A9 | 0x03C9 | 0x038F | 0x03CE - ) +fn hybrid_inline_step(previous: &HybridInlineItem, current: &HybridInlineItem) -> Option { + if current.clause_index != previous.clause_index + 1 + || current.inline_context_key != previous.inline_context_key + { + return None; + } + let step = current.numeric_value - previous.numeric_value; + (step > 0.0).then_some(step) } -const LONG_WORD_LIMIT: u64 = 21; -const SHORT_WORD_LIMIT: u64 = 3; -// Baseline for short words per 1000 Greek characters (empirically ~26 on clean texts) -const SHORT_BASELINE_PER_1000: f64 = 26.0; +fn hybrid_inline_step_matches(expected_step: f64, actual_step: f64) -> bool { + let tolerance = f64::max(1e-9, expected_step.abs() * 1e-6); + (expected_step - actual_step).abs() <= tolerance +} -#[inline] -fn to_lower_fast(cp: u32) -> u32 { - // Fast path for basic Greek capitals: add 0x20; otherwise return as-is - if (0x0391..=0x03A9).contains(&cp) { cp + 0x20 } else { cp } +fn 
hybrid_find_same_body_progression_spans(items: &[HybridNumberedItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let mut idx = 0usize; + while idx < items.len() { + let item = &items[idx]; + if item.field_kind != HybridFieldKind::HeaderCounter || !item.body_is_full { + idx += 1; + continue; + } + + let mut end_idx = idx + 1; + while end_idx < items.len() + && items[end_idx].field_kind == HybridFieldKind::HeaderCounter + && items[end_idx].body_is_full + && items[end_idx].body_key == item.body_key + && items[end_idx].shape == item.shape + && hybrid_header_progresses(&items[end_idx - 1], &items[end_idx]) + { + end_idx += 1; + } + + let run_length = end_idx - idx; + if run_length >= HYBRID_REPEAT_MIN_ITEMS { + let mut start_idx = idx; + if idx > 0 { + let previous = &items[idx - 1]; + if previous.body_is_full + && previous.body_key == item.body_key + && hybrid_header_is_parent(previous, item) + { + start_idx = idx - 1; + } + } + let span_end = hybrid_extend_tail_span_end(items, idx, end_idx, &item.body_key); + spans.push(HybridRepeatSpan { + start: items[start_idx].start, + end: span_end, + kind: "same_body_progression", + item_count: end_idx - start_idx, + cycle_len: None, + }); + idx = end_idx; + continue; + } + + idx += 1; + } + spans } -#[inline] -fn is_invalid_bigram_pair(prev_low: u32, curr_low: u32) -> bool { - match (prev_low, curr_low) { - // κ/γ/χ + ξ - (0x03BA, 0x03BE) | (0x03B3, 0x03BE) | (0x03C7, 0x03BE) - // π/β/φ + ψ - | (0x03C0, 0x03C8) | (0x03B2, 0x03C8) | (0x03C6, 0x03C8) - // ρλ, μρ, γβ, δτ, τδ, βπ, πβ - | (0x03C1, 0x03BB) | (0x03BC, 0x03C1) | (0x03B3, 0x03B2) - | (0x03B4, 0x03C4) | (0x03C4, 0x03B4) | (0x03B2, 0x03C0) | (0x03C0, 0x03B2) => true, - _ => false, +fn hybrid_find_cycle_progression_spans(items: &[HybridNumberedItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let n_items = items.len(); + for cycle_len in 2..=HYBRID_REPEAT_MAX_CYCLE { + let mut idx = 0usize; + while idx + 2 * cycle_len <= n_items { + let run = &items[idx..idx + 2 * 
cycle_len]; + if run + .iter() + .any(|item| item.field_kind != HybridFieldKind::HeaderCounter || !item.body_is_full) + { + idx += 1; + continue; + } + let first_shape = &run[0].shape; + if run.iter().any(|item| item.shape != *first_shape) { + idx += 1; + continue; + } + if !(1..run.len()).all(|pos| hybrid_header_progresses(&run[pos - 1], &run[pos])) { + idx += 1; + continue; + } + + let template: Vec<&str> = run[..cycle_len] + .iter() + .map(|item| item.body_key.as_str()) + .collect(); + let unique_template_count = template + .iter() + .copied() + .collect::>() + .len(); + if unique_template_count < 2 { + idx += 1; + continue; + } + + if (cycle_len..run.len()).any(|pos| run[pos].body_key != template[pos % cycle_len]) { + idx += 1; + continue; + } + + let mut end_idx = idx + 2 * cycle_len; + while end_idx < n_items + && items[end_idx].field_kind == HybridFieldKind::HeaderCounter + && items[end_idx].body_is_full + && items[end_idx].shape == items[idx].shape + && hybrid_header_progresses(&items[end_idx - 1], &items[end_idx]) + && items[end_idx].body_key == template[(end_idx - idx) % cycle_len] + { + end_idx += 1; + } + + let item_count = end_idx - idx; + if item_count >= HYBRID_REPEAT_MIN_CYCLE_ITEMS { + let span_end = hybrid_extend_tail_span_end( + items, + idx, + end_idx, + template[(end_idx - idx) % cycle_len], + ); + spans.push(HybridRepeatSpan { + start: items[idx].start, + end: span_end, + kind: "body_cycle_progression", + item_count, + cycle_len: Some(cycle_len), + }); + idx = end_idx; + continue; + } + idx += 1; + } } + spans } -static ALLOWED_DOUBLE: [u32; 9] = [ - 0x03BB, 0x03BC, 0x03BD, 0x03C1, 0x03C3, 0x03C4, 0x03BA, 0x03C0, 0x03B3, -]; +fn hybrid_find_inline_progression_spans(items: &[HybridInlineItem]) -> Vec { + let mut spans: Vec = Vec::new(); + let mut idx = 0usize; + while idx + HYBRID_INLINE_REPEAT_MIN_ITEMS <= items.len() { + let first = &items[idx]; + let second = &items[idx + 1]; + let expected_step = match hybrid_inline_step(first, second) { + 
Some(step) => step, + None => { + idx += 1; + continue; + } + }; -fn allowed_double(cp: u32) -> bool { - ALLOWED_DOUBLE.contains(&cp) + let mut end_idx = idx + 2; + while end_idx < items.len() { + let actual_step = match hybrid_inline_step(&items[end_idx - 1], &items[end_idx]) { + Some(step) => step, + None => break, + }; + if !hybrid_inline_step_matches(expected_step, actual_step) { + break; + } + end_idx += 1; + } + + let item_count = end_idx - idx; + if item_count >= HYBRID_INLINE_REPEAT_MIN_ITEMS { + spans.push(HybridRepeatSpan { + start: items[idx].start, + end: items[end_idx - 1].end, + kind: "inline_numeric_progression", + item_count, + cycle_len: None, + }); + idx = end_idx; + continue; + } + idx += 1; + } + spans } -#[inline] -fn is_table_line_trimmed(trimmed: &str) -> bool { - // A simple check equivalent to /^\s*\|.*\|\s*$/ after trimming - // i.e., line begins and ends with a '|' ignoring outer whitespace - !trimmed.is_empty() && trimmed.as_bytes()[0] == b'|' && trimmed.as_bytes()[trimmed.len()-1] == b'|' +pub fn find_hybrid_repeat_spans_internal(analysis_text: &str) -> Vec { + let items = hybrid_extract_numbered_items(analysis_text); + let mut spans = hybrid_find_same_body_progression_spans(&items); + spans.extend(hybrid_find_cycle_progression_spans(&items)); + let inline_items = hybrid_extract_inline_items(analysis_text); + spans.extend(hybrid_find_inline_progression_spans(&inline_items)); + spans.sort_by(|lhs, rhs| { + lhs.start + .cmp(&rhs.start) + .then_with(|| (rhs.end - rhs.start).cmp(&(lhs.end - lhs.start))) + }); + + let mut deduped: Vec = Vec::new(); + for span in spans { + if let Some(previous) = deduped.last() { + if span.start >= previous.start && span.end <= previous.end { + continue; + } + } + deduped.push(span); + } + deduped } -fn table_line_ratio_and_filtered(text: &str) -> (f64, Option, usize, usize) { - let mut non_empty = 0usize; - let mut table_like = 0usize; - // First pass: count table-like rows without allocating filtered buffer 
unless needed - for line in text.lines() { - let trimmed = line.trim(); - if !trimmed.is_empty() { - non_empty += 1; - if is_table_line_trimmed(trimmed) { - table_like += 1; +fn normalize_alnum_with_map_skip_tags_internal(text: &str) -> (String, Vec) { + let mut normalized = String::with_capacity(text.len()); + let mut raw_char_indices: Vec = Vec::with_capacity(text.len()); + let mut in_tag = false; + + for (raw_idx, ch) in text.chars().enumerate() { + if in_tag { + if ch == '>' { + in_tag = false; + } + continue; + } + if ch == '<' { + in_tag = true; + continue; + } + let mut casefolded = String::new(); + for lower in ch.to_lowercase() { + match lower { + 'ς' => casefolded.push('σ'), + 'ß' => { + casefolded.push('s'); + casefolded.push('s'); + } + 'ſ' => casefolded.push('s'), + _ => casefolded.push(lower), + } + } + for sub in casefolded.nfd() { + if sub.is_alphanumeric() { + let mapped = match sub { + 'ο' => 'o', + 'κ' => 'k', + _ => sub, + }; + normalized.push(mapped); + raw_char_indices.push(raw_idx); } } } - let ratio = if non_empty > 0 { table_like as f64 / non_empty as f64 } else { 0.0 }; - if table_like == 0 { - return (ratio, None, non_empty, table_like); + + (normalized, raw_char_indices) +} + +pub fn find_labeled_shared_repeat_spans_internal( + text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> Vec { + let (normalized_text, raw_map) = normalize_alnum_with_map_skip_tags_internal(text); + let normalized_chars: Vec = normalized_text.chars().collect(); + let spans = find_word_repeat_spans_internal(&normalized_text, rep_threshold, min_period, window); + let mut labeled: Vec = Vec::new(); + + for span in spans { + if span.end <= span.start || span.start >= raw_map.len() { + continue; + } + let mut has_letter = false; + let mut has_digit = false; + for ch in &normalized_chars[span.start..span.end] { + if ch.is_alphabetic() { + has_letter = true; + } + if ch.is_ascii_digit() { + has_digit = true; + } + } + let match_type = if 
has_letter { + "word_repeat" + } else if has_digit { + "numeric_repeat" + } else { + continue; + }; + labeled.push(LabeledSharedRepeatSpan { + start: raw_map[span.start], + end: raw_map[span.end - 1] + 1, + period: span.period, + repetitions: span.repetitions, + tail_chars: span.tail_chars, + match_type, + }); } - // Second pass only if we actually need a filtered buffer (preserve original newlines) - let mut filtered = String::with_capacity(text.len()); - for seg in text.split_inclusive('\n') { - let trimmed = seg.trim(); - if trimmed.is_empty() || !is_table_line_trimmed(trimmed) { - filtered.push_str(seg); + + labeled +} + +fn word_repeat_hash_slice(pref: &[u64], pw: &[u64], start: usize, end: usize) -> u64 { + pref[end].wrapping_sub(pref[start].wrapping_mul(pw[end - start])) & WORD_REPEAT_HASH_MASK +} + +#[inline] +fn word_repeat_blocks_equal( + codes: &[u32], + pref: &[u64], + pw: &[u64], + lhs: usize, + rhs: usize, + period: usize, +) -> bool { + word_repeat_hash_slice(pref, pw, lhs, lhs + period) + == word_repeat_hash_slice(pref, pw, rhs, rhs + period) + && codes[lhs..lhs + period] == codes[rhs..rhs + period] +} + +pub fn find_word_repeat_spans_internal( + normalized_text: &str, + rep_threshold: usize, + min_period: usize, + window: usize, +) -> Vec { + let codes: Vec = normalized_text.chars().map(|ch| ch as u32).collect(); + let n_chars = codes.len(); + if rep_threshold == 0 || min_period == 0 || n_chars < rep_threshold.saturating_mul(min_period) { + return Vec::new(); + } + + let mut pref = vec![0u64; n_chars + 1]; + let mut pw = vec![1u64; n_chars + 1]; + for (idx, code) in codes.iter().enumerate() { + pref[idx + 1] = + (pref[idx].wrapping_mul(WORD_REPEAT_HASH_BASE).wrapping_add(*code as u64)) + & WORD_REPEAT_HASH_MASK; + pw[idx + 1] = pw[idx].wrapping_mul(WORD_REPEAT_HASH_BASE) & WORD_REPEAT_HASH_MASK; + } + + let max_period = std::cmp::min( + std::cmp::max(min_period, window / rep_threshold), + n_chars / rep_threshold, + ); + let mut spans: Vec = 
Vec::new(); + + for period in min_period..=max_period { + let mut idx = 0usize; + while idx + rep_threshold * period <= n_chars { + let mut is_repeat = true; + for multiple in 1..rep_threshold { + if !word_repeat_blocks_equal(&codes, &pref, &pw, idx, idx + multiple * period, period) + { + is_repeat = false; + break; + } + } + if !is_repeat { + idx += 1; + continue; + } + + let mut left = idx; + while left >= period + && word_repeat_blocks_equal(&codes, &pref, &pw, left - period, left, period) + { + left -= period; + } + + let mut right = idx + rep_threshold * period; + while right + period <= n_chars + && word_repeat_blocks_equal(&codes, &pref, &pw, right - period, right, period) + { + right += period; + } + + let pattern = &codes[left..left + period]; + let mut tail_chars = 0usize; + while right + tail_chars < n_chars + && tail_chars < period + && codes[right + tail_chars] == pattern[tail_chars] + { + tail_chars += 1; + } + + spans.push(WordRepeatSpan { + start: left, + end: right + tail_chars, + period, + repetitions: (right - left) / period, + tail_chars, + }); + idx = right; } } - (ratio, Some(filtered), non_empty, table_like) + + spans.sort_by(|lhs, rhs| { + lhs.start + .cmp(&rhs.start) + .then((rhs.end - rhs.start).cmp(&(lhs.end - lhs.start))) + .then(lhs.period.cmp(&rhs.period)) + }); + + let mut deduped: Vec = Vec::new(); + for span in spans { + if let Some(previous) = deduped.last() { + if span.start >= previous.start && span.end <= previous.end { + continue; + } + } + deduped.push(span); + } + deduped } -fn compute_latin_pct(buf: &[u8]) -> f64 { - let latin_chars = buf - .iter() - .filter(|&&b| (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)) - .count(); - latin_chars as f64 / (buf.len() as f64) +fn collect_numeric_debug_spans_for_page( + page: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + if let Some(page_span) = + collect_numeric_page_collapse_span(page, NUMERIC_PAGE_COLLAPSE_MIN_TOKENS) + { + 
return vec![page_span]; + } + + let block_spans = collect_numeric_block_collapse_spans(page); + if !block_spans.is_empty() { + return block_spans; + } + + let page_tokens = extract_non_whitespace_tokens_with_spans(page); + let mut spans = collect_numeric_progression_matches(page, &page_tokens, min_progress_steps); + let mut line_offset = 0usize; + + for segment in page.split_inclusive('\n') { + let (line, newline) = if let Some(body) = segment.strip_suffix('\n') { + (body, "\n") + } else { + (segment, "") + }; + + let line_tokens = extract_non_whitespace_tokens_with_spans(line); + spans.extend( + collect_compact_repeat_numeric_matches(line, &line_tokens, min_repeat_steps) + .into_iter() + .map(|span| DebugMatchSpan { + start: span.start + line_offset, + end: span.end + line_offset, + match_type: span.match_type, + }), + ); + spans.extend( + collect_same_digit_numeric_matches(line, &line_tokens, min_same_digit_steps) + .into_iter() + .map(|span| DebugMatchSpan { + start: span.start + line_offset, + end: span.end + line_offset, + match_type: span.match_type, + }), + ); + line_offset += line.len() + newline.len(); + } + + spans } -fn compute_polytonic_word_ratio(text: &str) -> (u64, u64, f64) { - let mut greek_words = 0u64; - let mut polytonic_words = 0u64; - for w in text.split_whitespace() { - let mut has_greek = false; - let mut has_poly = false; - for ch in w.chars() { - let cp = ch as u32; - if is_greek(cp) { has_greek = true; } - if (0x1F00..=0x1FFF).contains(&cp) || is_combining_mark(cp) { has_poly = true; } +fn collect_ocr_debug_candidates_for_text( + source_path: &Path, + source_stem: &str, + base_stem: &str, + start_page: u64, + text: &str, + min_repeat_run: u64, +) -> Vec { + let mut candidates = Vec::new(); + let pages = split_pages(text); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if let Some((_annotated_page, _match_types, _match_count)) = + 
annotate_page_for_debug(page, min_repeat_run) + { + candidates.push(OcrDebugPageCandidate { + source_path: source_path.to_string_lossy().into_owned(), + source_stem: source_stem.to_string(), + base_stem: base_stem.to_string(), + page_number, + page_index_in_file, + }); } - if has_greek { - greek_words += 1; - if has_poly { polytonic_words += 1; } + } + + candidates +} + +fn collect_numeric_debug_candidates_for_text( + source_path: &Path, + source_stem: &str, + base_stem: &str, + start_page: u64, + text: &str, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> Vec { + let mut candidates = Vec::new(); + let pages = split_pages(text); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if !collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + .is_empty() + { + candidates.push(OcrDebugPageCandidate { + source_path: source_path.to_string_lossy().into_owned(), + source_stem: source_stem.to_string(), + base_stem: base_stem.to_string(), + page_number, + page_index_in_file, + }); } } - let ratio = if greek_words > 0 { polytonic_words as f64 / greek_words as f64 } else { 0.0 }; - (polytonic_words, greek_words, ratio) + + candidates +} + +fn render_ocr_debug_candidate( + candidate: &OcrDebugPageCandidate, + output_dir: &Path, + min_repeat_run: u64, +) -> anyhow::Result { + let source_path = PathBuf::from(&candidate.source_path); + let buf = fs::read(&source_path)?; + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let page_idx = candidate + .page_index_in_file + .checked_sub(1) + .ok_or_else(|| anyhow::anyhow!("invalid page index"))? 
as usize; + let page = pages + .get(page_idx) + .ok_or_else(|| anyhow::anyhow!("page index out of range for {}", candidate.source_path))?; + let (annotated_page, match_types, match_count) = annotate_page_for_debug(page, min_repeat_run) + .ok_or_else(|| { + anyhow::anyhow!( + "candidate page no longer matches: {}", + candidate.source_path + ) + })?; + let match_types_joined = match_types.join(","); + let output_name = format!( + "{}__debug_page_{:05}.md", + candidate.source_stem, candidate.page_number + ); + let output_path = output_dir.join(output_name); + + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content)?; + + Ok(OcrDebugPageRow { + source_path: candidate.source_path.clone(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: candidate.source_stem.clone(), + base_stem: candidate.base_stem.clone(), + page_number: candidate.page_number, + page_index_in_file: candidate.page_index_in_file, + match_types: match_types_joined, + match_count, + }) +} + +fn render_numeric_debug_candidate( + candidate: &OcrDebugPageCandidate, + output_dir: &Path, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, +) -> anyhow::Result { + let source_path = PathBuf::from(&candidate.source_path); + let buf = fs::read(&source_path)?; + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let page_idx = candidate + .page_index_in_file + .checked_sub(1) + .ok_or_else(|| anyhow::anyhow!("invalid page index"))? 
as usize; + let page = pages + .get(page_idx) + .ok_or_else(|| anyhow::anyhow!("page index out of range for {}", candidate.source_path))?; + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + let (annotated_page, match_types, match_count) = annotate_text_with_debug_spans(page, spans) + .ok_or_else(|| { + anyhow::anyhow!( + "candidate page no longer matches numeric detector: {}", + candidate.source_path + ) + })?; + let match_types_joined = match_types.join(","); + let output_name = format!( + "{}__debug_page_{:05}.md", + candidate.source_stem, candidate.page_number + ); + let output_path = output_dir.join(output_name); + + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content)?; + + Ok(OcrDebugPageRow { + source_path: candidate.source_path.clone(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: candidate.source_stem.clone(), + base_stem: candidate.base_stem.clone(), + page_number: candidate.page_number, + page_index_in_file: candidate.page_index_in_file, + match_types: match_types_joined, + match_count, + }) } /// Compute metrics for UTF-8 bytes; ported from original CLI. 
@@ -243,7 +2755,9 @@ fn analyse_bytes(buf: &[u8]) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u6 long_word_count += 1; let extra = (word_len - LONG_WORD_LIMIT) as u64; // >= 0 let mut weight = 1 + extra; // equals (len - 20) - if weight > 380 { weight = 380; } + if weight > 380 { + weight = 380; + } long_word_weight_sum += weight; } if word_len > longest_word { @@ -268,44 +2782,83 @@ fn analyse_bytes(buf: &[u8]) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u6 } else { if run_len >= 4 { let pen = run_len - 3; - if run_is_vowel { v_pen += pen; } else { c_pen += pen; } + if run_is_vowel { + v_pen += pen; + } else { + c_pen += pen; + } + } + if run_len > max_run { + max_run = run_len; } - if run_len > max_run { max_run = run_len; } run_is_vowel = vowel; run_len = 1; } if prev_cp != 0 { let pc_low = to_lower_fast(prev_cp); let cc_low = to_lower_fast(cp); - if is_invalid_bigram_pair(pc_low, cc_low) { invalid_bigram += 1; } + if is_invalid_bigram_pair(pc_low, cc_low) { + invalid_bigram += 1; + } + } + if prev_cp == cp && !allowed_double(cp) { + bad_double += 1; } - if prev_cp == cp && !allowed_double(cp) { bad_double += 1; } prev_cp = cp; } if run_len >= 4 { let pen = run_len - 3; - if run_is_vowel { v_pen += pen; } else { c_pen += pen; } + if run_is_vowel { + v_pen += pen; + } else { + c_pen += pen; + } + } + if run_len > max_run { + max_run = run_len; } - if run_len > max_run { max_run = run_len; } if word_len > 0 { total_word_count += 1; - if word_len < SHORT_WORD_LIMIT { short_word_count += 1; } + if word_len < SHORT_WORD_LIMIT { + short_word_count += 1; + } if word_len >= LONG_WORD_LIMIT { long_word_count += 1; let extra = (word_len - LONG_WORD_LIMIT) as u64; let mut weight = 1 + extra; // equals (len - 20) - if weight > 380 { weight = 380; } + if weight > 380 { + weight = 380; + } long_word_weight_sum += weight; } - if word_len > longest_word { longest_word = word_len; } - if prev_cp == 0x03C3 { misplaced_sigma += 1; } + if word_len > longest_word { + 
longest_word = word_len; + } + if prev_cp == 0x03C3 { + misplaced_sigma += 1; + } } - (len_greek, v_pen, c_pen, bad_double, max_run, long_word_count, long_word_weight_sum, longest_word, misplaced_sigma, invalid_bigram, short_word_count, total_word_count) + ( + len_greek, + v_pen, + c_pen, + bad_double, + max_run, + long_word_count, + long_word_weight_sum, + longest_word, + misplaced_sigma, + invalid_bigram, + short_word_count, + total_word_count, + ) } fn decode_utf8(slice: &[u8]) -> (u32, usize) { - if slice.is_empty() { return (0, 0); } + if slice.is_empty() { + return (0, 0); + } let c0 = slice[0]; if c0 < 0x80 { return (c0 as u32, 1); @@ -313,10 +2866,14 @@ fn decode_utf8(slice: &[u8]) -> (u32, usize) { let cp = ((c0 & 0x1F) as u32) << 6 | (slice[1] & 0x3F) as u32; return (cp, 2); } else if c0 & 0xF0 == 0xE0 && slice.len() >= 3 { - let cp = ((c0 & 0x0F) as u32) << 12 | ((slice[1] & 0x3F) as u32) << 6 | (slice[2] & 0x3F) as u32; + let cp = + ((c0 & 0x0F) as u32) << 12 | ((slice[1] & 0x3F) as u32) << 6 | (slice[2] & 0x3F) as u32; return (cp, 3); } else if c0 & 0xF8 == 0xF0 && slice.len() >= 4 { - let cp = ((c0 & 0x07) as u32) << 18 | ((slice[1] & 0x3F) as u32) << 12 | ((slice[2] & 0x3F) as u32) << 6 | (slice[3] & 0x3F) as u32; + let cp = ((c0 & 0x07) as u32) << 18 + | ((slice[1] & 0x3F) as u32) << 12 + | ((slice[2] & 0x3F) as u32) << 6 + | (slice[3] & 0x3F) as u32; return (cp, 4); } (0, 1) @@ -329,13 +2886,32 @@ fn decode_utf8(slice: &[u8]) -> (u32, usize) { /// v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, /// flags) fn compute_score_and_details( - buf: &[u8] + buf: &[u8], ) -> ( - f64, f64, f64, f64, - u64, u64, - u64, u64, u64, u64, u64, u64, u64, u64, u64, - f64, f64, f64, f64, f64, f64, f64, f64, - String + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, ) { let latin_pct = 
compute_latin_pct(buf); @@ -343,9 +2919,26 @@ fn compute_score_and_details( let text = String::from_utf8_lossy(buf); let (table_ratio, filtered_opt, _non_empty, table_like) = table_line_ratio_and_filtered(&text); let had_tables = table_like > 0; - let target: &[u8] = if let Some(ref s) = filtered_opt { s.as_bytes() } else { buf }; + let target: &[u8] = if let Some(ref s) = filtered_opt { + s.as_bytes() + } else { + buf + }; - let (len_greek, v_pen, c_pen, bad_dbl, max_run, long_word_count, long_word_weight_sum, longest_word, misplaced_sigma, invalid_bigram, short_word_count, total_word_count) = analyse_bytes(target); + let ( + len_greek, + v_pen, + c_pen, + bad_dbl, + max_run, + long_word_count, + long_word_weight_sum, + longest_word, + misplaced_sigma, + invalid_bigram, + short_word_count, + total_word_count, + ) = analyse_bytes(target); let mut flags: Vec<&str> = Vec::with_capacity(2); @@ -369,34 +2962,113 @@ fn compute_score_and_details( 0.0 }; // Normalized short words: per 1000 Greek chars, then excess over baseline - let short_per_1000 = if len > 0.0 { 1000.0 * (short_word_count as f64) / len } else { 0.0 }; - let short_excess_per_1000 = if short_per_1000 > SHORT_BASELINE_PER_1000 { short_per_1000 - SHORT_BASELINE_PER_1000 } else { 0.0 }; + let short_per_1000 = if len > 0.0 { + 1000.0 * (short_word_count as f64) / len + } else { + 0.0 + }; + let short_excess_per_1000 = if short_per_1000 > SHORT_BASELINE_PER_1000 { + short_per_1000 - SHORT_BASELINE_PER_1000 + } else { + 0.0 + }; // Halved sigma coefficient from 5.0 to 2.5; removed longest_word term - let score = v_rate + 1.5*c_rate + 2.0*d_rate + 2.5*sigma_end_rate + 2.0*bigram_rate + short_excess_per_1000 + long_word_rate; + let score = v_rate + + 1.5 * c_rate + + 2.0 * d_rate + + 2.5 * sigma_end_rate + + 2.0 * bigram_rate + + short_excess_per_1000 + + long_word_rate; - let (_poly_words, _greek_words, poly_ratio) = if len_greek == 0 { - (0, 0, 0.0) + let poly_ratio = if len_greek == 0 { + 0.0 } else { - 
compute_polytonic_word_ratio(if let Some(ref s) = filtered_opt { s } else { &text }) + let target_text: &str = if let Some(ref s) = filtered_opt { + s.as_str() + } else { + text.as_ref() + }; + scan_script_metrics(target_text).polytonic_ratio() }; - if poly_ratio > 0.0 { flags.push("polytonic"); } - if had_tables { flags.push("had_tables"); } + if poly_ratio > 0.0 { + flags.push("polytonic"); + } + if had_tables { + flags.push("had_tables"); + } ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_word_count, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_excess_per_1000, - flags.join(",") + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_word_count, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_excess_per_1000, + flags.join(","), ) } /// Compute noise score and latin percentage for a UTF-8 buffer. Backward-compatible API. 
fn compute_score(buf: &[u8]) -> (f64, f64) { - let (score, latin_pct, _t, _p, _lg, _tw, _v,_c,_bd,_ms,_ib,_lwc,_lw,_swc,_mr,_vr,_cr,_dr,_sr,_br,_lwr,_sr2,_sp,_f) = compute_score_and_details(buf); + let ( + score, + latin_pct, + _t, + _p, + _lg, + _tw, + _v, + _c, + _bd, + _ms, + _ib, + _lwc, + _lw, + _swc, + _mr, + _vr, + _cr, + _dr, + _sr, + _br, + _lwr, + _sr2, + _sp, + _f, + ) = compute_score_and_details(buf); (score, latin_pct) } +fn run_in_thread_pool(n_threads: Option, work: F) -> anyhow::Result +where + T: Send, + F: FnOnce() -> T + Send, +{ + let threads = n_threads + .filter(|count| *count > 0) + .unwrap_or_else(rayon::current_num_threads); + let pool = ThreadPoolBuilder::new().num_threads(threads).build()?; + Ok(pool.install(work)) +} + pub fn score_markdown_file_internal(path: &Path) -> anyhow::Result { let mut file = File::open(path)?; let mut buf = Vec::new(); @@ -405,57 +3077,508 @@ pub fn score_markdown_file_internal(path: &Path) -> anyhow::Result { Ok(score) } -pub fn score_markdown_directory_internal(root: &Path, n_threads: Option) -> anyhow::Result> { - if let Some(t) = n_threads { rayon::ThreadPoolBuilder::new().num_threads(t).build_global().ok(); } - let results: Vec<(String, f64, f64)> = WalkDir::new(root) - .into_iter() - .par_bridge() - .filter_map(Result::ok) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) - .map(|e| { - let path = e.path(); - let buf = fs::read(path).expect("read"); - let (score, latin_pct) = compute_score(&buf); - (path.to_string_lossy().into_owned(), score, latin_pct) - }) - .collect(); - Ok(results) +pub fn score_markdown_directory_internal( + root: &Path, + n_threads: Option, +) -> anyhow::Result> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let (score, latin_pct) = 
compute_score(&buf); + (path.to_string_lossy().into_owned(), score, latin_pct) + }) + .collect() + }) } // Detailed variants for analysis layer -pub fn score_markdown_file_detailed_internal(path: &Path) -> anyhow::Result<(f64, f64, f64, f64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, f64, f64, f64, f64, f64, f64, f64, f64, String)> { +pub fn score_markdown_file_detailed_internal( + path: &Path, +) -> anyhow::Result<( + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, +)> { let mut file = File::open(path)?; let mut buf = Vec::new(); file.read_to_end(&mut buf)?; Ok(compute_score_and_details(&buf)) } -pub fn score_markdown_directory_detailed_internal(root: &Path, n_threads: Option) -> anyhow::Result> { - if let Some(t) = n_threads { rayon::ThreadPoolBuilder::new().num_threads(t).build_global().ok(); } - let results: Vec<(String, f64, f64, f64, f64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, f64, f64, f64, f64, f64, f64, f64, f64, String)> = WalkDir::new(root) - .into_iter() - .par_bridge() - .filter_map(Result::ok) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) - .map(|e| { - let path = e.path(); - let buf = fs::read(path).expect("read"); - let ( - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) = compute_score_and_details(&buf); - ( - path.to_string_lossy().into_owned(), - score, latin_pct, table_ratio, poly_ratio, - len_greek, total_words, - v_pen, c_pen, bad_dbl, misplaced_sigma, invalid_bigram, long_word_count, longest_word, short_word_count, max_run, - v_rate, c_rate, d_rate, sigma_end_rate, bigram_rate, long_word_rate, short_ratio, short_pen, - flags - ) - }) 
- .collect(); - Ok(results) +pub fn score_markdown_directory_detailed_internal( + root: &Path, + n_threads: Option, +) -> anyhow::Result< + Vec<( + String, + f64, + f64, + f64, + f64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + u64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + f64, + String, + )>, +> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let ( + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) = compute_score_and_details(&buf); + ( + path.to_string_lossy().into_owned(), + score, + latin_pct, + table_ratio, + poly_ratio, + len_greek, + total_words, + v_pen, + c_pen, + bad_dbl, + misplaced_sigma, + invalid_bigram, + long_word_count, + longest_word, + short_word_count, + max_run, + v_rate, + c_rate, + d_rate, + sigma_end_rate, + bigram_rate, + long_word_rate, + short_ratio, + short_pen, + flags, + ) + }) + .collect() + }) +} + +pub fn score_markdown_directory_ocr_profile_internal( + root: &Path, + n_threads: Option, + min_repeat_run: u64, +) -> anyhow::Result> { + run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let (script, noise) = compute_ocr_profile(&text, min_repeat_run); + let mut flags = Vec::new(); + if noise.phrase_run_max >= min_repeat_run { + 
flags.push("repeat_phrase_run"); + } + if noise.line_run_max >= min_repeat_run { + flags.push("repeat_line_run"); + } + + OcrProfileRow { + path: path.to_string_lossy().into_owned(), + percentage_greek: script.percentage_greek(), + latin_percentage: script.latin_percentage(), + polytonic_ratio: script.polytonic_ratio(), + non_whitespace_chars: script.non_whitespace_chars, + greek_char_count: script.greek_char_count, + latin_char_count: script.latin_char_count, + ocr_repeat_phrase_run_max: noise.phrase_run_max, + ocr_repeat_line_run_max: noise.line_run_max, + ocr_repeat_suspicious_line_count: noise.suspicious_line_count, + ocr_repeat_suspicious_line_ratio: noise.suspicious_line_ratio, + ocr_noise_suspect: noise.suspect, + ocr_noise_flags: flags.join(","), + } + }) + .collect() + }) +} + +pub fn export_ocr_match_debug_pages_internal( + root: &Path, + output_dir: &Path, + n_threads: Option, + min_repeat_run: u64, + max_pages: Option, + sample_seed: u64, +) -> anyhow::Result> { + fs::create_dir_all(output_dir)?; + if let Some(limit) = max_pages { + let mut candidates: Vec = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + collect_ocr_debug_candidates_for_text( + path, + &source_stem, + &base_stem, + start_page, + &text, + min_repeat_run, + ) + }) + .reduce(Vec::new, |mut acc, mut item| { + acc.append(&mut item); + acc + }) + })?; + + if candidates.len() > limit { + let mut rng = StdRng::seed_from_u64(sample_seed); + candidates.shuffle(&mut rng); + candidates.truncate(limit); + } + candidates.sort_by(|a, b| { + 
a.source_path + .cmp(&b.source_path) + .then(a.page_number.cmp(&b.page_number)) + }); + + let output_dir = output_dir.to_path_buf(); + let mut rows: Vec = run_in_thread_pool(n_threads, move || { + candidates + .into_par_iter() + .map(|candidate| { + render_ocr_debug_candidate(&candidate, &output_dir, min_repeat_run) + }) + .collect::>>() + })??; + rows.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + return Ok(rows); + } + + let rows: Vec> = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let mut page_rows = Vec::new(); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + if let Some((annotated_page, match_types, match_count)) = + annotate_page_for_debug(page, min_repeat_run) + { + let match_types_joined = match_types.join(","); + let output_name = + format!("{}__debug_page_{:05}.md", source_stem, page_number); + let output_path = output_dir.join(output_name); + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content).expect("write debug page"); + + page_rows.push(OcrDebugPageRow { + source_path: path.to_string_lossy().into_owned(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: source_stem.clone(), + base_stem: base_stem.clone(), + page_number, + page_index_in_file, + match_types: match_types_joined, + 
match_count, + }); + } + } + + page_rows + }) + .collect() + })?; + + let mut flat = Vec::new(); + for mut group in rows { + flat.append(&mut group); + } + flat.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + Ok(flat) +} + +pub fn export_numeric_match_debug_pages_internal( + root: &Path, + output_dir: &Path, + n_threads: Option, + min_progress_steps: u64, + min_repeat_steps: u64, + min_same_digit_steps: u64, + max_pages: Option, + sample_seed: u64, +) -> anyhow::Result> { + fs::create_dir_all(output_dir)?; + if let Some(limit) = max_pages { + let mut candidates: Vec = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + collect_numeric_debug_candidates_for_text( + path, + &source_stem, + &base_stem, + start_page, + &text, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }) + .reduce(Vec::new, |mut acc, mut item| { + acc.append(&mut item); + acc + }) + })?; + + if candidates.len() > limit { + let mut rng = StdRng::seed_from_u64(sample_seed); + candidates.shuffle(&mut rng); + candidates.truncate(limit); + } + candidates.sort_by(|a, b| { + a.source_path + .cmp(&b.source_path) + .then(a.page_number.cmp(&b.page_number)) + }); + + let output_dir = output_dir.to_path_buf(); + let mut rows: Vec = run_in_thread_pool(n_threads, move || { + candidates + .into_par_iter() + .map(|candidate| { + render_numeric_debug_candidate( + &candidate, + &output_dir, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ) + }) + .collect::>>() + 
})??; + rows.sort_by(|a, b| { + a.output_path + .cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + return Ok(rows); + } + + let rows: Vec> = run_in_thread_pool(n_threads, || { + WalkDir::new(root) + .into_iter() + .par_bridge() + .filter_map(Result::ok) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "md")) + .map(|e| { + let path = e.path(); + let source_stem = path + .file_stem() + .map(|stem| stem.to_string_lossy().into_owned()) + .unwrap_or_else(|| "unknown".to_string()); + let (base_stem, start_page) = parse_source_stem(&source_stem); + let buf = fs::read(path).expect("read"); + let text = String::from_utf8_lossy(&buf); + let pages = split_pages(&text); + let mut page_rows = Vec::new(); + + for (idx, page) in pages.iter().enumerate() { + let page_index_in_file = idx as u64 + 1; + let page_number = start_page + idx as u64; + let spans = collect_numeric_debug_spans_for_page( + page, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + ); + if let Some((annotated_page, match_types, match_count)) = + annotate_text_with_debug_spans(page, spans) + { + let match_types_joined = match_types.join(","); + let output_name = + format!("{}__debug_page_{:05}.md", source_stem, page_number); + let output_path = output_dir.join(output_name); + let mut content = String::new(); + content.push_str("\n"); + content.push_str("\n\n"); + content.push_str(&annotated_page); + fs::write(&output_path, content).expect("write numeric debug page"); + + page_rows.push(OcrDebugPageRow { + source_path: path.to_string_lossy().into_owned(), + output_path: output_path.to_string_lossy().into_owned(), + source_stem: source_stem.clone(), + base_stem: base_stem.clone(), + page_number, + page_index_in_file, + match_types: match_types_joined, + match_count, + }); + } + } + + page_rows + }) + .collect() + })?; + + let mut flat = Vec::new(); + for mut group in rows { + flat.append(&mut group); + } + flat.sort_by(|a, b| { + a.output_path + 
.cmp(&b.output_path) + .then(a.page_number.cmp(&b.page_number)) + }); + Ok(flat) } diff --git a/samples/openarchives_download_policy.yml b/samples/openarchives_download_policy.yml new file mode 100644 index 0000000..180e5fe --- /dev/null +++ b/samples/openarchives_download_policy.yml @@ -0,0 +1,104 @@ +default: + downloader: standard + request_timeout: 60 + ssl_verify: true + per_domain_concurrency: 8 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.25 + +rules: + - match: + domains: [ikee.lib.auth.gr] + downloader: standard + request_timeout: 180 + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 5 + sleep: 1.5 + + - match: + domains: [dspace.lib.ntua.gr] + downloader: standard + request_timeout: 120 + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 + sleep: 1.0 + + - match: + domains: [olympias.lib.uoi.gr] + downloader: standard + request_timeout: 180 + ssl_verify: false + per_domain_concurrency: 1 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 1 + skip_failed_after: 4 + sleep: 1.0 + + - match: + domains: [ktisis.cut.ac.cy] + downloader: standard + request_timeout: 90 + ssl_verify: false + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 4 + sleep: 0.5 + + - match: + domains: [repository.academyofathens.gr] + downloader: auto + request_timeout: 45 + per_domain_concurrency: 6 + domain_concurrency_floor: 2 + domain_concurrency_ceiling: 8 + skip_failed_after: 3 + sleep: 0.1 + + - match: + domains: + - dione.lib.unipi.gr + - pergamos.lib.uoa.gr + - hellanicus.lib.aegean.gr + downloader: standard + request_timeout: 60 + per_domain_concurrency: 12 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 12 + skip_failed_after: 3 + sleep: 0.2 + + - match: + domains: + - dias.library.tuc.gr + downloader: auto + request_timeout: 
90 + per_domain_concurrency: 2 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 2 + skip_failed_after: 4 + sleep: 0.5 + browser_timeout_ms: 90000 + browser_post_load_wait_ms: 4000 + + - match: + domains: + - repository.ihu.gr + - dlib.statistics.gr + - apothesis.eap.gr + - repository.edulll.gr + - dspace.lib.uom.gr + - dspace.aua.gr + downloader: standard + request_timeout: 75 + per_domain_concurrency: 6 + domain_concurrency_floor: 1 + domain_concurrency_ceiling: 8 + skip_failed_after: 4 + sleep: 0.25 diff --git a/src/glossapi/__init__.py b/src/glossapi/__init__.py index 4539ead..14f0c31 100644 --- a/src/glossapi/__init__.py +++ b/src/glossapi/__init__.py @@ -1,54 +1,7 @@ -""" -GlossAPI Library - -A library for processing academic texts in Greek and other languages: -- Extracting content from PDFs and other formats with Docling -- Robust batch processing with error isolation and automatic resumption -- Clustering documents based on extraction quality -- Extracting and cleaning academic sections -- Classifying sections using machine learning - -This is an open source project that provides tools for linguistic annotations -and text processing, with a special focus on the Greek language. -""" +"""GlossAPI library.""" from __future__ import annotations -import os - -# Keep Docling/RapidOCR bootstrap optional and import‑light by default. -# If the environment requests skipping (common in tests or minimal envs), -# or if Docling is not installed, we avoid importing heavy dependencies here. -_SKIP_DOCLING_BOOT = os.environ.get("GLOSSAPI_SKIP_DOCLING_BOOT") == "1" - -def _attempt_patch_docling() -> bool: - if _SKIP_DOCLING_BOOT: - return False - try: - # Import inside the function to avoid pulling Docling when unused or missing. - from .ocr.rapidocr.safe import patch_docling_rapidocr # type: ignore - - try: - return bool(patch_docling_rapidocr()) - except Exception: - # Swallow any runtime error to keep top‑level import light/safe. 
- return False - except Exception: - # Docling (or its transitive deps) not available – keep going. - return False - - -def patch_docling_rapidocr() -> bool: - """Best‑effort registration of the SafeRapidOcrModel. - - Returns True when the patch was applied; False when unavailable or skipped. - Safe to call multiple times. - """ - return _attempt_patch_docling() - -# Attempt the patch once at import time, but never fail import if it does not apply. -_ = _attempt_patch_docling() - __all__ = [ 'GlossSection', 'GlossSectionClassifier', @@ -56,7 +9,7 @@ def patch_docling_rapidocr() -> bool: 'Sampler', 'Section', 'GlossDownloader', - 'patch_docling_rapidocr', + 'BrowserGlossDownloader', ] def __getattr__(name: str): @@ -79,9 +32,11 @@ def __getattr__(name: str): if name == 'GlossDownloader': from .gloss_downloader import GlossDownloader # type: ignore return GlossDownloader + if name == 'BrowserGlossDownloader': + from .gloss_browser_downloader import BrowserGlossDownloader # type: ignore + return BrowserGlossDownloader raise AttributeError(name) -# Derive version dynamically from installed package metadata if possible try: from importlib.metadata import version as _pkg_version __version__: str = _pkg_version(__name__) diff --git a/src/glossapi/_naming.py b/src/glossapi/_naming.py index 068b195..5f28434 100644 --- a/src/glossapi/_naming.py +++ b/src/glossapi/_naming.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path +import re from typing import Union _KNOWN_SUFFIXES = ( @@ -19,6 +20,8 @@ ".htm", ) +_PAGE_CHUNK_SUFFIX_RE = re.compile(r"__p\d{4,5}-\d{4,5}$") + def canonical_stem(value: Union[str, Path]) -> str: """Return a normalised stem for any pipeline artefact.""" @@ -33,6 +36,7 @@ def canonical_stem(value: Union[str, Path]) -> str: working = working[: -len(suffix)] stripped = True break + working = _PAGE_CHUNK_SUFFIX_RE.sub("", working) if working: return working fallback = Path(name).stem diff --git a/src/glossapi/_pipeline.py 
b/src/glossapi/_pipeline.py index 73e5ecc..1909b60 100644 --- a/src/glossapi/_pipeline.py +++ b/src/glossapi/_pipeline.py @@ -1,7 +1,7 @@ """Backward-compatible adapter. -Docling pipeline builders moved to `glossapi.ocr.rapidocr.pipeline`. +Docling pipeline builders moved to `glossapi.ocr.docling.pipeline`. This module re-exports the public API to preserve legacy imports. """ -from .ocr.rapidocr.pipeline import * # noqa: F401,F403 +from .ocr.docling.pipeline import * # noqa: F401,F403 diff --git a/src/glossapi/corpus/corpus_orchestrator.py b/src/glossapi/corpus/corpus_orchestrator.py index dd2fad6..3feb7ec 100644 --- a/src/glossapi/corpus/corpus_orchestrator.py +++ b/src/glossapi/corpus/corpus_orchestrator.py @@ -350,9 +350,11 @@ def _load_metadata(self) -> None: # Top-level worker function for multi-GPU extraction (picklable by multiprocessing) def gpu_extract_worker_queue( device_id: int, + worker_slot: int, + worker_key: str, in_dir: str, out_dir: str, - work_q, # multiprocessing Queue of filename strings + work_q, # multiprocessing Queue of filename strings or bundled path lists force: bool, fe: bool, ce: bool, @@ -392,12 +394,13 @@ def _ensure_thread_caps(): _ensure_thread_caps() _status_proxy = status_map - _marker_path = _Path(marker_dir).expanduser() / f"gpu{device_id}.current" if marker_dir else None + _worker_label = worker_key or f"gpu{device_id}-w{worker_slot}" + _marker_path = _Path(marker_dir).expanduser() / f"{_worker_label}.current" if marker_dir else None def _update_current(batch_items: List[str]) -> None: if _status_proxy is not None: try: - _status_proxy[device_id] = list(batch_items) + _status_proxy[_worker_label] = list(batch_items) except Exception: pass if _marker_path is not None: @@ -409,7 +412,7 @@ def _update_current(batch_items: List[str]) -> None: def _clear_current() -> None: if _status_proxy is not None: try: - _status_proxy.pop(device_id, None) + _status_proxy.pop(_worker_label, None) except Exception: pass if _marker_path is not 
None: @@ -417,13 +420,28 @@ def _clear_current() -> None: _marker_path.unlink(missing_ok=True) except Exception: pass + + def _normalize_work_item(item: Any) -> List[str]: + if isinstance(item, str): + return [item] if item.strip() else [] + if isinstance(item, (list, tuple, set)): + normalized: List[str] = [] + for value in item: + try: + text = str(value).strip() + except Exception: + continue + if text: + normalized.append(text) + return normalized + return [] _worker_log_handle = None try: _log_dir = _os.environ.get("GLOSSAPI_WORKER_LOG_DIR") if _log_dir: _log_path = _Path(_log_dir).expanduser() _log_path.mkdir(parents=True, exist_ok=True) - _worker_log_file = _log_path / f"gpu{device_id}_{_os.getpid()}.log" + _worker_log_file = _log_path / f"{_worker_label}_{_os.getpid()}.log" _worker_log_handle = open(_worker_log_file, "a", encoding="utf-8", buffering=1) _sys.stdout = _worker_log_handle _sys.stderr = _worker_log_handle @@ -458,9 +476,13 @@ def _clear_current() -> None: except Exception: _phys = "" try: - print(f"[GPU{device_id}] bound: CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}") + print( + f"[GPU{device_id}/W{worker_slot}] bound: " + f"CUDA_VISIBLE_DEVICES={_os.environ.get('CUDA_VISIBLE_DEVICES','')} " + f"pid={_os.getpid()} torch={_torch_name} ORT={_ort_prov}" + ) if _phys: - print(f"[GPU{device_id}] physical: {_phys}") + print(f"[GPU{device_id}/W{worker_slot}] physical: {_phys}") except Exception: pass except Exception: @@ -475,13 +497,15 @@ def _clear_current() -> None: _ensure_thread_caps() from glossapi import Corpus as _Corpus # type: ignore except Exception as _e: - print(f"[GPU{device_id}] Cannot import glossapi in worker: {_e}") + print(f"[{_worker_label}] Cannot import glossapi in worker: {_e}") if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, 
"exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -507,14 +531,16 @@ def _clear_current() -> None: phase1_backend=backend, ) except Exception as _e: - msg = f"[GPU{device_id}] Prime failed: {_e}" + msg = f"[{_worker_label}] Prime failed: {_e}" print(msg) if result_q is not None: try: result_q.put( { "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": 1, "pid": _os.getpid(), "error": str(_e), @@ -534,7 +560,9 @@ def _report_batch(ok_list, bad_list): result_q.put( { "event": "batch", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "processed": [str(x) for x in ok_list], "problematic": [str(x) for x in bad_list], "pid": _os.getpid(), @@ -546,128 +574,78 @@ def _report_batch(ok_list, bad_list): c.extractor.batch_result_callback = _report_batch except Exception as _e: print(f"[GPU{device_id}] Unable to set batch callback: {_e}") - # Prepare persistent extractor in this worker on first call - # Process queue items in small batches to reduce function-call overhead - batch: list[str] = [] - try: - _batch_env = int(str(_os.environ.get("GLOSSAPI_GPU_BATCH_SIZE", "")).strip() or 0) - except Exception: - _batch_env = 0 - default_batch = 5 if not force else 1 - try: - extractor = getattr(c, "extractor", None) - if extractor is not None: - configured = int(getattr(extractor, "max_batch_files", default_batch)) - if force: - default_batch = 1 - else: - default_batch = max(1, configured) - except Exception: - pass - BATCH_SIZE = max(1, _batch_env) if _batch_env else max(1, default_batch) + # The controller already shapes queue items for multi-GPU extraction. Workers + # should execute those queue items as-is rather than re-batching them locally, + # otherwise long PDFs can be accidentally merged back into tail-heavy bundles. 
import queue as _queue last_progress = _time.time() processed = 0 exit_code = 0 + + def _run_batch(batch_items: List[str]) -> None: + nonlocal processed, exit_code + if not batch_items: + return + try: + _update_current(list(batch_items)) + c.extract( + input_format=input_fmt, + num_threads=threads, + accel_type="cuda:0", + force_ocr=force, + formula_enrichment=fe, + code_enrichment=ce, + file_paths=list(batch_items), + skip_existing=skip, + use_gpus="single", + use_cls=use_cls_w, + benchmark_mode=benchmark, + export_doc_json=bool(export_json), + emit_formula_index=bool(emit_index), + phase1_backend=backend, + _prepared=True, + ) + processed += len(batch_items) + _clear_current() + except Exception as _e: + exit_code = 1 + print(f"[GPU{device_id}] Batch failed ({len(batch_items)}): {_e}") + if result_q is not None: + try: + result_q.put( + { + "event": "batch", + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, + "processed": [], + "problematic": list(batch_items), + "pid": _os.getpid(), + "error": str(_e), + } + ) + except Exception: + pass + _clear_current() + try: while True: try: - nm = work_q.get_nowait() + work_item = work_q.get_nowait() except _queue.Empty: - # queue.Empty or other -> flush any pending batch then exit - if batch: - try: - _update_current(list(batch)) - c.extract( - input_format=input_fmt, - num_threads=threads, - accel_type="cuda:0", - force_ocr=force, - formula_enrichment=fe, - code_enrichment=ce, - file_paths=list(batch), - skip_existing=skip, - use_gpus="single", - use_cls=use_cls_w, - benchmark_mode=benchmark, - export_doc_json=bool(export_json), - emit_formula_index=bool(emit_index), - phase1_backend=backend, - _prepared=True, - ) - processed += len(batch) - _clear_current() - except Exception as _e: - exit_code = 1 - print(f"[GPU{device_id}] Batch failed ({len(batch)}): {_e}") - if result_q is not None: - try: - result_q.put( - { - "event": "batch", - "worker": device_id, - "processed": [], - 
"problematic": list(batch), - "pid": _os.getpid(), - "error": str(_e), - } - ) - except Exception: - pass - _clear_current() - batch.clear() break except Exception as exc: exit_code = 1 print(f"[GPU{device_id}] Queue receive error: {exc}") break - if isinstance(nm, str) and nm.strip(): - batch.append(nm) - if len(batch) >= BATCH_SIZE: - try: - _update_current(list(batch)) - c.extract( - input_format=input_fmt, - num_threads=threads, - accel_type="cuda:0", - force_ocr=force, - formula_enrichment=fe, - code_enrichment=ce, - file_paths=list(batch), - skip_existing=skip, - use_gpus="single", - use_cls=use_cls_w, - benchmark_mode=benchmark, - export_doc_json=bool(export_json), - emit_formula_index=bool(emit_index), - phase1_backend=backend, - _prepared=True, - ) - processed += len(batch) - _clear_current() - except Exception as _e: - exit_code = 1 - print(f"[GPU{device_id}] Batch failed ({len(batch)}): {_e}") - if result_q is not None: - try: - result_q.put( - { - "event": "batch", - "worker": device_id, - "processed": [], - "problematic": list(batch), - "pid": _os.getpid(), - "error": str(_e), - } - ) - except Exception: - pass - _clear_current() - batch.clear() + normalized = _normalize_work_item(work_item) + if not normalized: + continue + _run_batch(normalized) # Occasional heartbeat if _time.time() - last_progress > 30: try: - print(f"[GPU{device_id}] processed ~{processed} files…") + print(f"[{_worker_label}] processed ~{processed} files...") except Exception: pass last_progress = _time.time() @@ -692,7 +670,9 @@ def _report_batch(ok_list, bad_list): try: result_q.put({ "event": "exit", - "worker": device_id, + "worker": _worker_label, + "device_id": device_id, + "worker_slot": worker_slot, "exitcode": exit_code, "pid": _os.getpid(), }) diff --git a/src/glossapi/corpus/ocr/__init__.py b/src/glossapi/corpus/ocr/__init__.py new file mode 100644 index 0000000..e8d5b32 --- /dev/null +++ b/src/glossapi/corpus/ocr/__init__.py @@ -0,0 +1,6 @@ +"""Readable OCR 
orchestration helpers for the corpus pipeline.""" + +from .config import OcrRequest, normalize_ocr_request +from .pipeline import run_ocr_phase + +__all__ = ["OcrRequest", "normalize_ocr_request", "run_ocr_phase"] diff --git a/src/glossapi/corpus/ocr/artifacts.py b/src/glossapi/corpus/ocr/artifacts.py new file mode 100644 index 0000000..3e91906 --- /dev/null +++ b/src/glossapi/corpus/ocr/artifacts.py @@ -0,0 +1,143 @@ +"""OCR result persistence helpers.""" + +from __future__ import annotations + +import hashlib +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd + +from ..._naming import canonical_stem +from .context import CorpusOcrContext + + +def build_ocr_stage_artifact_update( + *, + markdown_dir: Path, + metrics_dir: Path, + stem: str, +) -> Optional[Dict[str, object]]: + """Return direct OCR-owned artifact fields for one canonical OCR document.""" + + markdown_path = Path(markdown_dir) / f"{stem}.md" + if not markdown_path.exists(): + return None + text_payload = markdown_path.read_text(encoding="utf-8") + metrics_path = Path(metrics_dir) / f"{stem}.metrics.json" + return { + "text": text_payload, + "ocr_markdown_relpath": str(Path("markdown") / markdown_path.name), + "ocr_metrics_relpath": ( + str(Path("json") / "metrics" / metrics_path.name) if metrics_path.exists() else None + ), + "ocr_text_sha256": hashlib.sha256(text_payload.encode("utf-8")).hexdigest(), + } + + +def apply_ocr_success_updates( + df_meta: pd.DataFrame, + *, + filenames: List[str], + markdown_dir: Path, + metrics_dir: Path, + backend_norm: str, +) -> pd.DataFrame: + """Apply direct OCR-owned metadata updates to parquet rows.""" + + if "filename" not in df_meta.columns: + return df_meta + + if "filter" not in df_meta.columns: + df_meta["filter"] = "ok" + if "needs_ocr" not in df_meta.columns: + df_meta["needs_ocr"] = False + if "ocr_success" not in df_meta.columns: + df_meta["ocr_success"] = False + if "extraction_mode" not in df_meta.columns: + 
df_meta["extraction_mode"] = None + + direct_columns = ("text", "ocr_markdown_relpath", "ocr_metrics_relpath", "ocr_text_sha256") + for column in direct_columns: + if column not in df_meta.columns: + df_meta[column] = None + + filename_series = df_meta["filename"].astype(str) + stem_series = filename_series.map(canonical_stem) + + for fname in filenames: + stem = canonical_stem(fname) + mask = stem_series == stem + if not bool(mask.any()): + continue + artifact_update = build_ocr_stage_artifact_update( + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + stem=stem, + ) + df_meta.loc[mask, "filter"] = "ok" + df_meta.loc[mask, "needs_ocr"] = False + df_meta.loc[mask, "ocr_success"] = True + if backend_norm == "deepseek": + df_meta.loc[mask, "extraction_mode"] = "deepseek" + if artifact_update is None: + continue + for column, value in artifact_update.items(): + df_meta.loc[mask, column] = value + + return df_meta + + +def persist_ocr_success( + context: CorpusOcrContext, + *, + filenames: List[str], + backend_norm: str, +) -> List[str]: + from ...parquet_schema import ParquetSchema + + success_files: List[str] = [] + for fname in filenames: + stem = canonical_stem(fname) + if (context.markdown_dir / f"{stem}.md").exists(): + success_files.append(fname) + + if not success_files: + return success_files + + parquet_schema = ParquetSchema({"url_column": context.url_column}) + parquet_path = context._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) + if parquet_path and parquet_path.exists(): + df_meta = pd.read_parquet(parquet_path) + df_meta = apply_ocr_success_updates( + df_meta, + filenames=success_files, + markdown_dir=context.markdown_dir, + metrics_dir=context.output_dir / "json" / "metrics", + backend_norm=backend_norm, + ) + context._cache_metadata_parquet(parquet_path) + parquet_schema.write_metadata_parquet(df_meta, parquet_path) + + stems = [canonical_stem(name) for name in success_files] + if hasattr(context, "good_files"): + for 
stem in stems: + if stem not in getattr(context, "good_files", []): + context.good_files.append(stem) + + return success_files + + +def refresh_cleaner_after_ocr(context: CorpusOcrContext) -> None: + """Refresh cleaner metrics after OCR reruns rewrite markdown outputs.""" + + refresh = getattr(context, "_refresh_metrics_after_ocr_rerun", None) + if callable(refresh): + refresh() + return + + context.logger.info("Re-running Rust cleaner after OCR rerun to refresh metrics") + context.clean( + input_dir=context.markdown_dir, + drop_bad=False, + ) diff --git a/src/glossapi/corpus/ocr/config.py b/src/glossapi/corpus/ocr/config.py new file mode 100644 index 0000000..f9da5fc --- /dev/null +++ b/src/glossapi/corpus/ocr/config.py @@ -0,0 +1,229 @@ +"""Request normalization for corpus OCR orchestration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from ...ocr.deepseek.defaults import ( + DEFAULT_ATTN_BACKEND, + DEFAULT_GPU_MEMORY_UTILIZATION, + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_OCR_PROFILE, + DEFAULT_RENDER_DPI, + DEFAULT_REPAIR_MODE, + DEFAULT_RUNTIME_BACKEND, + DEFAULT_TARGET_BATCH_PAGES, + DEFAULT_WORKERS_PER_GPU, + resolve_gpu_memory_utilization, + resolve_render_dpi, +) + + +@dataclass(slots=True) +class OcrRequest: + mode: str + backend: str + device: Optional[str] + model_dir: Optional[Path] + max_pages: Optional[int] + persist_engine: bool + precision: Optional[str] + workers_per_gpu: int + runtime_backend: str + ocr_profile: str + prompt_override: Optional[str] + attn_backend: str + base_size: Optional[int] + image_size: Optional[int] + crop_mode: Optional[bool] + render_dpi: int + max_new_tokens: int + repetition_penalty: Optional[float] + no_repeat_ngram_size: Optional[int] + vllm_batch_size: Optional[int] + gpu_memory_utilization: float + disable_fp8_kv: bool + repair_mode: str + repair_exec_batch_target_pages: Optional[int] + 
repair_exec_batch_target_items: Optional[int] + scheduler: str + target_batch_pages: int + shard_pages: int + shard_threshold_pages: int + math_enhance: bool + math_targets: Optional[Dict[str, List[Tuple[int, int]]]] + math_batch_size: int + math_dpi_base: int + use_gpus: str + devices: Optional[List[int]] + reprocess_completed: bool + content_debug: bool + + +def _resolve_mode( + *, + logger, + mode: Optional[str], + fix_bad: bool, + math_enhance: bool, +) -> Optional[str]: + mode_norm: Optional[str] = None + if mode: + candidate = str(mode).strip().lower() + if candidate in {"ocr_bad", "math_only", "ocr_bad_then_math"}: + mode_norm = candidate + else: + logger.warning("Unknown mode '%s'; falling back to legacy flags", mode) + if mode_norm is None: + if fix_bad and math_enhance: + mode_norm = "ocr_bad_then_math" + elif fix_bad: + mode_norm = "ocr_bad" + elif math_enhance: + mode_norm = "math_only" + return mode_norm + + +def normalize_ocr_request( + *, + logger, + fix_bad: bool, + mode: Optional[str], + backend: str, + device: Optional[str], + model_dir: Optional[str | Path], + max_pages: Optional[int], + persist_engine: bool, + precision: Optional[str], + workers_per_gpu: int = DEFAULT_WORKERS_PER_GPU, + runtime_backend: str = DEFAULT_RUNTIME_BACKEND, + ocr_profile: str = DEFAULT_OCR_PROFILE, + prompt_override: Optional[str] = None, + attn_backend: str = DEFAULT_ATTN_BACKEND, + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = DEFAULT_RENDER_DPI, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = DEFAULT_GPU_MEMORY_UTILIZATION, + disable_fp8_kv: bool = False, + repair_mode: str = DEFAULT_REPAIR_MODE, + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: 
Optional[int] = None, + scheduler: str = "auto", + target_batch_pages: int = DEFAULT_TARGET_BATCH_PAGES, + shard_pages: int = 0, + shard_threshold_pages: int = 0, + math_enhance: bool = True, + math_targets: Optional[Dict[str, List[Tuple[int, int]]]] = None, + math_batch_size: int = 8, + math_dpi_base: int = 220, + use_gpus: str = "single", + devices: Optional[List[int]] = None, + force: Optional[bool] = None, + reprocess_completed: Optional[bool] = None, + skip_existing: Optional[bool] = None, + content_debug: bool = False, + CONTENT_DEBUG: Optional[bool] = None, + internal_debug: bool = False, + INTERNAL_DEBUG: Optional[bool] = None, +) -> Optional[OcrRequest]: + backend_norm = str(backend or "deepseek").strip().lower() + if backend_norm != "deepseek": + raise ValueError("backend must be 'deepseek'") + + if CONTENT_DEBUG is not None: + content_debug = bool(CONTENT_DEBUG) + elif INTERNAL_DEBUG is not None: + content_debug = bool(INTERNAL_DEBUG) + elif internal_debug: + content_debug = True + + fix_bad_effective = bool(fix_bad) + if force is not None: + logger.warning("Corpus.ocr(force=...) is deprecated; use fix_bad=... instead") + fix_bad_effective = bool(force) + + mode_norm = _resolve_mode( + logger=logger, + mode=mode, + fix_bad=fix_bad_effective, + math_enhance=bool(math_enhance), + ) + if mode_norm is None: + logger.info( + "OCR: no operation requested (enable fix_bad and/or math_enhance or set mode='ocr_bad'|'math_only'|'ocr_bad_then_math')" + ) + return None + + if backend_norm == "deepseek" and mode_norm in {"ocr_bad", "ocr_bad_then_math"}: + logger.info( + "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." + ) + if mode_norm == "ocr_bad_then_math": + logger.info( + "DeepSeek OCR does not run Phase-2 math; treating mode='ocr_bad_then_math' as 'ocr_bad'." 
+ ) + mode_norm = "ocr_bad" + + reprocess_explicit = reprocess_completed is not None + reprocess_flag = bool(reprocess_completed) if reprocess_explicit else False + if skip_existing is not None: + skip_flag = bool(skip_existing) + logger.warning( + "Corpus.ocr(skip_existing=...) is deprecated; use reprocess_completed=... instead." + ) + desired = not skip_flag + if reprocess_explicit and desired != reprocess_flag: + logger.info( + "Corpus.ocr(): skip_existing=%s overrides reprocess_completed=%s (effective reprocess_completed=%s).", + skip_flag, + reprocess_flag, + desired, + ) + reprocess_flag = desired + + return OcrRequest( + mode=mode_norm, + backend=backend_norm, + device=device, + model_dir=Path(model_dir) if model_dir else None, + max_pages=max_pages, + persist_engine=bool(persist_engine), + precision=precision, + workers_per_gpu=int(max(1, workers_per_gpu)), + runtime_backend=str(runtime_backend or DEFAULT_RUNTIME_BACKEND), + ocr_profile=str(ocr_profile or DEFAULT_OCR_PROFILE), + prompt_override=prompt_override, + attn_backend=str(attn_backend or DEFAULT_ATTN_BACKEND), + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=resolve_render_dpi(render_dpi), + max_new_tokens=int(DEFAULT_MAX_NEW_TOKENS if max_new_tokens is None else max_new_tokens), + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=resolve_gpu_memory_utilization(gpu_memory_utilization), + disable_fp8_kv=bool(disable_fp8_kv), + repair_mode=str(repair_mode or DEFAULT_REPAIR_MODE), + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + scheduler=str(scheduler or "auto"), + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), + math_enhance=bool(math_enhance), + math_targets=math_targets, + 
math_batch_size=int(math_batch_size), + math_dpi_base=int(math_dpi_base), + use_gpus=str(use_gpus or "single"), + devices=devices, + reprocess_completed=bool(reprocess_flag), + content_debug=bool(content_debug), + ) diff --git a/src/glossapi/corpus/ocr/context.py b/src/glossapi/corpus/ocr/context.py new file mode 100644 index 0000000..7c98795 --- /dev/null +++ b/src/glossapi/corpus/ocr/context.py @@ -0,0 +1,26 @@ +"""Shared typing contracts for corpus OCR helpers.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Protocol + + +class CorpusOcrContext(Protocol): + logger: Any + input_dir: Path + output_dir: Path + markdown_dir: Path + logs_dir: Path + url_column: str + good_files: list[str] + + def _resolve_metadata_parquet(self, *args: Any, **kwargs: Any) -> Path | None: ... + + def _cache_metadata_parquet(self, path: Path | None) -> None: ... + + def _get_cached_metadata_parquet(self) -> Path | None: ... + + def clean(self, *args: Any, **kwargs: Any) -> None: ... + + def formula_enrich_from_json(self, *args: Any, **kwargs: Any) -> None: ... 
diff --git a/src/glossapi/corpus/ocr/dispatch.py b/src/glossapi/corpus/ocr/dispatch.py new file mode 100644 index 0000000..8e8efce --- /dev/null +++ b/src/glossapi/corpus/ocr/dispatch.py @@ -0,0 +1,49 @@ +"""Backend dispatch helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from ...ocr.deepseek import runner as _deepseek_runner +from .config import OcrRequest +from .context import CorpusOcrContext + + +def run_deepseek_ocr( + context: CorpusOcrContext, + *, + request: OcrRequest, + filenames: list[str], +) -> None: + _deepseek_runner.run_for_files( + context, + filenames, + model_dir=request.model_dir, + max_pages=request.max_pages, + persist_engine=request.persist_engine, + precision=request.precision, + device=request.device, + use_gpus=request.use_gpus, + devices=request.devices, + workers_per_gpu=request.workers_per_gpu, + runtime_backend=request.runtime_backend, + ocr_profile=request.ocr_profile, + prompt_override=request.prompt_override, + attn_backend=request.attn_backend, + base_size=request.base_size, + image_size=request.image_size, + crop_mode=request.crop_mode, + render_dpi=request.render_dpi, + max_new_tokens=request.max_new_tokens, + repetition_penalty=request.repetition_penalty, + no_repeat_ngram_size=request.no_repeat_ngram_size, + vllm_batch_size=request.vllm_batch_size, + gpu_memory_utilization=request.gpu_memory_utilization, + disable_fp8_kv=request.disable_fp8_kv, + repair_mode=request.repair_mode, + repair_exec_batch_target_pages=request.repair_exec_batch_target_pages, + repair_exec_batch_target_items=request.repair_exec_batch_target_items, + scheduler=request.scheduler, + target_batch_pages=request.target_batch_pages, + shard_pages=request.shard_pages, + shard_threshold_pages=request.shard_threshold_pages, + content_debug=request.content_debug, + ) diff --git a/src/glossapi/corpus/ocr/math_targets.py b/src/glossapi/corpus/ocr/math_targets.py new file mode 100644 index 0000000..0737d6f --- /dev/null +++ 
b/src/glossapi/corpus/ocr/math_targets.py @@ -0,0 +1,43 @@ +"""Math-target selection helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Sequence, Set + +from ..._naming import canonical_stem + + +def discover_docling_json_stems(output_dir: Path) -> List[str]: + json_dir = Path(output_dir) / "json" + if not json_dir.exists(): + return [] + return sorted({canonical_stem(path) for path in json_dir.glob("*.docling.json*")}) + + +def filter_math_only_stems( + *, + stems: Sequence[str], + bad_files: Sequence[str], + math_done_stems: Set[str], + reprocess_completed: bool, + logger, +) -> List[str]: + kept = list(stems) + if bad_files: + before = len(kept) + bad_set = {canonical_stem(name) for name in bad_files} + kept = [stem for stem in kept if stem not in bad_set] + removed = before - len(kept) + if removed: + logger.info("Math-only: skipping %d document(s) flagged for OCR", removed) + if not reprocess_completed and kept and math_done_stems: + before = len(kept) + kept = [stem for stem in kept if stem not in math_done_stems] + removed = before - len(kept) + if removed: + logger.info( + "Math enrichment: skipping %d already enriched document(s) (reprocess_completed=False).", + removed, + ) + return kept diff --git a/src/glossapi/corpus/ocr/pipeline.py b/src/glossapi/corpus/ocr/pipeline.py new file mode 100644 index 0000000..bee65e0 --- /dev/null +++ b/src/glossapi/corpus/ocr/pipeline.py @@ -0,0 +1,46 @@ +"""High-level OCR orchestration for corpus remediation.""" + +from __future__ import annotations + +from .artifacts import persist_ocr_success, refresh_cleaner_after_ocr +from .config import OcrRequest +from .context import CorpusOcrContext +from .dispatch import run_deepseek_ocr +from .targets import build_ocr_selection + + +def run_ocr_phase(context: CorpusOcrContext, request: OcrRequest) -> None: + """Run the OCR-remediation path while preserving the current runtime engine.""" + + if 
request.mode == "math_only": + raise ValueError("run_ocr_phase handles OCR remediation only") + + selection = build_ocr_selection( + context, + mode=request.mode, + reprocess_completed=request.reprocess_completed, + ) + + if not selection.bad_files: + context.logger.info("OCR: no bad documents flagged by cleaner; skipping OCR fix") + return + + run_deepseek_ocr( + context, + request=request, + filenames=selection.bad_files, + ) + + try: + persist_ocr_success( + context, + filenames=selection.bad_files, + backend_norm=request.backend, + ) + except Exception as exc: + context.logger.warning("Failed to update OCR success metadata: %s", exc) + + try: + refresh_cleaner_after_ocr(context) + except Exception as exc: + context.logger.warning("Cleaner refresh after OCR failed: %s", exc) diff --git a/src/glossapi/corpus/ocr/targets.py b/src/glossapi/corpus/ocr/targets.py new file mode 100644 index 0000000..2a393aa --- /dev/null +++ b/src/glossapi/corpus/ocr/targets.py @@ -0,0 +1,144 @@ +"""Target selection helpers for corpus OCR orchestration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Set + +import pandas as pd + +from ..._naming import canonical_stem +from ...parquet_schema import ParquetSchema +from .context import CorpusOcrContext +from ..corpus_skiplist import _SkiplistManager, _resolve_skiplist_path + + +@dataclass(slots=True) +class OcrSelection: + bad_files: List[str] + ocr_candidates_initial: int + skipped_completed: int + skipped_skiplist: int + parquet_meta: Optional[pd.DataFrame] + ocr_done_files: List[str] + ocr_done_stems: Set[str] + math_done_stems: Set[str] + skip_mgr: _SkiplistManager + skiplist_path: Path + + +def normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]: + """Collapse chunk-like metadata rows back to real OCR source files when possible.""" + + source_by_stem = {} + try: + for path in 
sorted(Path(input_dir).glob("*.pdf")): + source_by_stem.setdefault(canonical_stem(path.name), path.name) + except Exception: + source_by_stem = {} + + normalized: List[str] = [] + seen: Set[str] = set() + for fname in filenames: + resolved = source_by_stem.get(canonical_stem(fname), str(fname)) + if resolved in seen: + continue + normalized.append(resolved) + seen.add(resolved) + return normalized + + +def build_ocr_selection( + context: CorpusOcrContext, + *, + mode: str, + reprocess_completed: bool, +) -> OcrSelection: + bad_files: List[str] = [] + skipped_completed = 0 + skipped_skiplist = 0 + parquet_meta: Optional[pd.DataFrame] = None + ocr_done_files: List[str] = [] + ocr_done_stems: Set[str] = set() + math_done_stems: Set[str] = set() + + parquet_schema = ParquetSchema({"url_column": context.url_column}) + parquet_path = context._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) + if parquet_path and parquet_path.exists(): + df = pd.read_parquet(parquet_path) + if "filename" in df.columns and "needs_ocr" in df.columns: + bad_files = df.loc[df["needs_ocr"] == True, "filename"].dropna().astype(str).tolist() + if "ocr_success" in df.columns: + ocr_done_files = df.loc[df["ocr_success"].fillna(False), "filename"].dropna().astype(str).tolist() + ocr_done_stems = {canonical_stem(name) for name in ocr_done_files} + math_done_files: List[str] = [] + if "math_enriched" in df.columns: + math_done_files = df.loc[df["math_enriched"].fillna(False), "filename"].dropna().astype(str).tolist() + elif "enriched_math" in df.columns: + math_done_files = df.loc[df["enriched_math"].fillna(False), "filename"].dropna().astype(str).tolist() + if math_done_files: + math_done_stems = {canonical_stem(name) for name in math_done_files} + if not reprocess_completed and ocr_done_stems: + before = len(bad_files) + bad_files = [name for name in bad_files if canonical_stem(name) not in ocr_done_stems] + skipped_completed = before - len(bad_files) + if skipped_completed: 
+ context.logger.info( + "OCR: skipping %d already completed document(s) (reprocess_completed=False).", + skipped_completed, + ) + if reprocess_completed and mode in {"ocr_bad", "ocr_bad_then_math"} and ocr_done_files: + pending = {str(name) for name in bad_files} + for fname in ocr_done_files: + if fname not in pending: + bad_files.append(fname) + pending.add(fname) + parquet_meta = df + + ocr_candidates_initial = len(bad_files) + skiplist_path = _resolve_skiplist_path(context.output_dir, context.logger) + skip_mgr = _SkiplistManager(skiplist_path, context.logger) + skip_stems = skip_mgr.load() + if skip_stems: + before = len(bad_files) + bad_files = [name for name in bad_files if canonical_stem(name) not in skip_stems] + skipped_skiplist = before - len(bad_files) + if skipped_skiplist: + context.logger.warning( + "Skip-list %s filtered %d document(s) from Phase-3 OCR.", + skiplist_path, + skipped_skiplist, + ) + + normalized_bad_files = normalize_ocr_target_filenames( + filenames=bad_files, + input_dir=Path(context.input_dir), + ) + if len(normalized_bad_files) != len(bad_files): + context.logger.info( + "OCR: collapsed %d metadata-selected row(s) onto %d real source PDF(s) by canonical stem.", + len(bad_files), + len(normalized_bad_files), + ) + bad_files = normalized_bad_files + context.logger.info( + "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d", + ocr_candidates_initial, + len(bad_files), + skipped_completed, + skipped_skiplist, + ) + + return OcrSelection( + bad_files=bad_files, + ocr_candidates_initial=ocr_candidates_initial, + skipped_completed=skipped_completed, + skipped_skiplist=skipped_skiplist, + parquet_meta=parquet_meta, + ocr_done_files=ocr_done_files, + ocr_done_stems=ocr_done_stems, + math_done_stems=math_done_stems, + skip_mgr=skip_mgr, + skiplist_path=skiplist_path, + ) diff --git a/src/glossapi/corpus/ocr_table.py b/src/glossapi/corpus/ocr_table.py new file mode 100644 index 0000000..63756ed --- /dev/null +++ 
b/src/glossapi/corpus/ocr_table.py @@ -0,0 +1,240 @@ +"""Table-specific OCR cleaning helpers. + +This module isolates HTML-table handling from the broader OCR repetition logic. + +That separation is intentional: +- some table decisions are repetition-based, like repeated rows +- others are structural cleanups, like sentence-shell tables or near-empty shells + +Keeping table logic together makes the policy easier to understand and keeps the +main OCR page pipeline focused on ordering and span ownership. +""" +from __future__ import annotations + +import html +import re +from collections import Counter +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from ..scripts.table_markdown_audit import ( + _expand_rows as _audit_expand_table_rows, + _parse_table_rows as _audit_parse_table_rows, + audit_table as _audit_table_html, +) + +HTML_TABLE_BLOCK_RE = re.compile(r"(?is)") +HTML_TABLE_LINE_RE = re.compile(r"(?i).*?") +HTML_TABLE_CELL_RE = re.compile(r"(?is)(.*?)") +HTML_TAG_RE = re.compile(r"(?is)<[^>]+>") + +TABLE_EMPTY_MIN_ROWS = 6 +TABLE_EMPTY_MIN_CELLS = 18 +TABLE_EMPTY_MAX_NONEMPTY_RATIO = 0.15 +TABLE_REPEAT_MIN_ROWS = 4 +TABLE_REPEAT_MIN_NONEMPTY_CELLS = 2 +TABLE_REPEAT_MIN_ROW_TEXT_CHARS = 6 +TABLE_REPEAT_MIN_DUPLICATE_ROWS = 2 +TABLE_SENTENCE_SHELL_MIN_WORDS = 6 +TABLE_SENTENCE_SHELL_MIN_CHARS = 40 + + +def _normalize_table_cell_text(cell_html: str) -> str: + text = HTML_TAG_RE.sub(" ", cell_html) + text = html.unescape(text) + return " ".join(text.split()) + + +def _table_cell_has_content(cell_text: str) -> bool: + return any(ch.isalnum() for ch in cell_text) + + +def _extract_html_table_rows(table_text: str) -> List[List[str]]: + rows: List[List[str]] = [] + for row_match in HTML_TABLE_ROW_RE.finditer(table_text): + cells = [ + _normalize_table_cell_text(cell_match.group(1)) + for cell_match in HTML_TABLE_CELL_RE.finditer(row_match.group(0)) + ] + if cells: + rows.append(cells) + return rows + + 
+@lru_cache(maxsize=2048) +def _extract_html_table_rows_cached(table_text: str) -> Tuple[Tuple[str, ...], ...]: + """Cache repeated table shells by exact HTML string. + + The OCR corpus contains many duplicated HTML fragments, so exact-string + memoization pays off without changing behavior. + """ + return tuple(tuple(row) for row in _extract_html_table_rows(table_text)) + + +def _flatten_html_table_nonempty_cells(table_text: str) -> List[str]: + parsed_rows, _ = _audit_parse_table_rows(table_text) + grid, _ = _audit_expand_table_rows(parsed_rows) + if not grid: + return [] + nonempty: List[str] = [] + for row in grid: + for cell in row: + normalized = " ".join(cell.split()) + if any(ch.isalnum() for ch in normalized): + nonempty.append(normalized) + return nonempty + + +@lru_cache(maxsize=2048) +def _flatten_html_table_nonempty_cells_cached(table_text: str) -> Tuple[str, ...]: + return tuple(_flatten_html_table_nonempty_cells(table_text)) + + +def _extract_sentence_shell_table_text(table_text: str) -> Optional[str]: + """Return prose text when a table is only a layout shell around one cell. + + This is intentionally not a repetition rule. OCR and VLM extraction often + emit a normal sentence inside a tiny one-cell table shell; when that + happens, the table structure is noise and the prose cell is the content. 
+ """ + nonempty_cells = _flatten_html_table_nonempty_cells_cached(table_text) + if len(nonempty_cells) != 1: + return None + candidate = nonempty_cells[0].strip() + if len(candidate) < TABLE_SENTENCE_SHELL_MIN_CHARS: + return None + if len(re.findall(r"[^\W\d_]+", candidate, re.UNICODE)) < TABLE_SENTENCE_SHELL_MIN_WORDS: + return None + return candidate + + +@lru_cache(maxsize=2048) +def _render_table_html_for_output_cached(table_text: str, match_kind: Optional[str]) -> str: + sentence_shell = _extract_sentence_shell_table_text(table_text) + if sentence_shell and match_kind == "sentence_shell_table": + return sentence_shell + + audit = _audit_table_html(Path("/tmp/table_fragment.md"), 0, 0, table_text) + if audit.markdown: + return audit.markdown + return table_text + + +def render_table_html_for_output(table_text: str, *, match_kind: Optional[str] = None) -> str: + """Render one HTML table for human review/debug output.""" + return _render_table_html_for_output_cached(table_text, match_kind) + + +def replace_html_tables_with_markdown(text: str) -> str: + """Normalize kept HTML tables into GitHub-style Markdown in page text.""" + if " str: + """Render a table in clean mode. + + Clean mode drops tables whose structure is the problem: + - sentence-shell tables + - empty shell tables + - repeated-row tables + """ + if match_kind in {"sentence_shell_table", "empty_table_collapse", "repeated_rows"}: + return "" + return render_table_html_for_output(table_text, match_kind=match_kind) + + +def find_table_repeat_spans(page_text: str, *, match_category: str) -> List[Dict[str, Any]]: + """Classify OCR table problems on a page. 
+ + Table handling is intentionally broader than repetition: + - sentence-shell tables are removed because they are layout shells around prose + - empty table collapse removes sparse structural noise + - repeated rows is the actual repetition-oriented table rule + """ + if "= TABLE_EMPTY_MIN_ROWS + and cell_count >= TABLE_EMPTY_MIN_CELLS + and nonempty_ratio <= TABLE_EMPTY_MAX_NONEMPTY_RATIO + ): + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": match_category, + "kind": "empty_table_collapse", + "row_count": row_count, + "cell_count": cell_count, + "nonempty_ratio": round(nonempty_ratio, 3), + } + ) + continue + + row_keys: List[Tuple[str, ...]] = [] + for row in rows: + nonempty_cells_in_row = [cell for cell in row if _table_cell_has_content(cell)] + if len(nonempty_cells_in_row) < TABLE_REPEAT_MIN_NONEMPTY_CELLS: + continue + row_text = " ".join(nonempty_cells_in_row) + if len(row_text) < TABLE_REPEAT_MIN_ROW_TEXT_CHARS: + continue + row_keys.append(tuple(cell.casefold() for cell in row)) + + if row_count < TABLE_REPEAT_MIN_ROWS or not row_keys: + continue + + row_counts = Counter(row_keys) + duplicate_rows = sum(freq - 1 for freq in row_counts.values() if freq >= 2) + if duplicate_rows >= TABLE_REPEAT_MIN_DUPLICATE_ROWS: + spans.append( + { + "start": table_match.start(), + "end": table_match.end(), + "match_types": ["table_repeat"], + "category": match_category, + "kind": "repeated_rows", + "row_count": row_count, + "duplicate_rows": duplicate_rows, + } + ) + + return spans diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index abdaa5e..f3b5d43 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -1,9 +1,20 @@ -"""Cleaning and filtering helpers split from Corpus.""" +"""Cleaning and filtering helpers split from Corpus. 
+ +This module now primarily owns OCR orchestration: +- page-level analyzer ordering +- shared clean/debug rendering +- worker/process orchestration + +Specialized policy modules, like HTML-table handling, live alongside it so the +main pipeline can stay focused on span ownership and mode selection. +""" from __future__ import annotations +import importlib import json import logging import math +import multiprocessing as mp import os import queue import random @@ -12,6 +23,12 @@ import subprocess import sys import time +import unicodedata +import warnings +from collections import Counter +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from contextlib import contextmanager +from functools import lru_cache from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -21,10 +38,2853 @@ from .._naming import canonical_stem from ..gloss_downloader import GlossDownloader # Avoid importing section/classifier here; cleaning phase does not use them. +from .ocr_table import ( + HTML_TABLE_BLOCK_RE, + HTML_TABLE_LINE_RE, + find_table_repeat_spans as _find_table_repeat_spans_impl, + render_table_html_for_clean as _render_table_html_for_clean, + render_table_html_for_output as _render_table_html_for_output, + replace_html_tables_with_markdown as _replace_html_tables_with_markdown, +) from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path from .corpus_state import _ProcessingStateManager from .corpus_utils import _maybe_import_torch +PAGE_SPLIT_MARKER = "<--- Page Split --->" +WORD_REPEAT_HASH_MASK = (1 << 64) - 1 +WORD_REPEAT_HASH_BASE = 1469598103934665603 +# Neighboring same-category spans may be merged when the visible separator is +# still small enough to read as one corrupted region rather than two separate +# failures. This is intentionally more permissive than the older 10-char rule. 
+WORD_REPEAT_MERGE_MAX_NONWHITESPACE_GAP = 40 +EXISTING_MATCH_BLOCK_RE = re.compile(r"(?is)]*>.*?") +LATEX_BLOCK_RE = re.compile(r"(?is)\$\$.*?\$\$") +LATEX_BRACKET_RE = re.compile(r"(?is)\\\[.*?\\\]") +LATEX_BEGIN_END_RE = re.compile(r"(?is)\\begin\{([^\n{}]+)\}.*?\\end\{\1\}") +LATEX_INLINE_PAREN_RE = re.compile(r"(?is)\\\(.*?\\\)") +LATEX_INLINE_DOLLAR_RE = re.compile(r"(?s)(?[^<]{1,16}|[^<]{1,16})){8,}" +) +WORD_CONFUSABLE_FOLD_MAP = { + "ο": "o", + "κ": "k", +} +LATEX_SEGMENT_PATTERNS = [ + ("begin_end", LATEX_BEGIN_END_RE), + ("display_dollar", LATEX_BLOCK_RE), + ("display_bracket", LATEX_BRACKET_RE), + ("inline_paren", LATEX_INLINE_PAREN_RE), + ("inline_dollar", LATEX_INLINE_DOLLAR_RE), +] +LATEX_TEXT_WRAPPER_MACROS = ( + r"\mathrm{", + r"\text{", + r"\operatorname{", + r"\mathit{", + r"\mathbf{", +) +LATEX_INTERNAL_REPEAT_COMMANDS = { + r"\frac", + r"\left", + r"\right", + r"\sqrt", + r"\begin", + r"\end", + r"\quad", + r"\qquad", + r"\cdots", + r"\ldots", + r"\mathrm", + r"\text", + r"\operatorname", + r"\mathit", + r"\mathbf", + r"\hat", + r"\tilde", + r"\bar", +} +LATEX_INTERNAL_SMALL_VOCAB_COMMANDS = { + r"\cdots", + r"\ldots", + r"\vdots", + r"\ddots", +} +LATEX_SHORT_REPEAT_ATOM_COMMANDS = { + r"\Delta", + r"\hat", + r"\tilde", + r"\bar", +} +LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS = { + r"\alpha", + r"\beta", + r"\gamma", + r"\delta", + r"\epsilon", + r"\varepsilon", + r"\lambda", + r"\mu", + r"\nu", + r"\omega", + r"\Delta", +} +LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS = { + r"\hat", + r"\tilde", + r"\bar", +} +LATEX_SEGMENT_LOCAL_NONWHITESPACE_GAP = 12 +LATEX_SEGMENT_EXACT_RUN_MIN = 4 +LATEX_SEGMENT_SKELETON_RUN_MIN = 4 +LATEX_SEGMENT_ALTERNATING_RUN_MIN = 6 +LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN = 4 +LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS = 12 +LATEX_SHORT_ATOM_EXACT_SEGMENT_MIN_TOKENS = 2 +LATEX_SHORT_ATOM_CHAIN_MIN_TOKENS = 6 +LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS = 24 +LATEX_SHORT_SEGMENT_MAX_NORM = 32 +LATEX_LONG_SEGMENT_MIN_NORM = 24 
+LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP = 3 +LATEX_SMALL_DEFINITION_FAMILY_MAX_RUN = 6 +HYBRID_PREFIX_RE = re.compile( + r"(?\d+\)|\d+\.(?:\d+\.)*\d*\.?)(?=\s*[^\W\d_])", + re.UNICODE, +) +HYBRID_MARKUP_BODY_RE = re.compile(r"(?i)(<[^>]+>|src=|alt=|image_|\.png\b|\.jpg\b|\.jpeg\b|\.gif\b)") +HYBRID_REPEAT_MIN_ITEMS = 4 +HYBRID_REPEAT_MIN_BODY_ALNUM = 6 +HYBRID_REPEAT_MAX_CYCLE = 6 +HYBRID_REPEAT_MIN_CYCLE_ITEMS = 8 +HYBRID_INLINE_CLAUSE_DELIMITER_RE = re.compile(r"[;\n]|,(?!\d)") +HYBRID_INLINE_TOKEN_RE = re.compile(r"[0-9]+(?:[.,/][0-9]+)*|[^\W\d_]+", re.UNICODE) +HYBRID_INLINE_CONTEXT_WORDS = 2 +HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS = 2 +HYBRID_INLINE_CONTEXT_MIN_CHARS = 8 +HYBRID_INLINE_REPEAT_MIN_ITEMS = 6 +LATEX_SYMBOL_SLOT_COMMANDS = ( + r"\mu", + r"\nu", + r"\alpha", + r"\beta", + r"\gamma", + r"\lambda", + r"\tau", + r"\omega", +) +MATCH_CATEGORY_BY_TYPE = { + "ascending_numeric_sequence": "numeric", + "repeat_numeric_run": "numeric", + "same_digit_numeric_run": "numeric", + "numeric_page_collapse": "numeric", + "numeric_block_collapse": "numeric", + "numeric_repeat": "numeric", + "word_repeat": "word", + "latex_repeat": "latex", + "hybrid_repeat": "hybrid", + "table_repeat": "table", +} + +_WORD_REPEAT_RUST_MOD: Optional[Any] = None +_WORD_REPEAT_RUST_IMPORT_ATTEMPTED = False +_RUST_EXTENSION_PREBUILD_ATTEMPTED: Set[str] = set() +_COMBINED_OCR_WORKER_NOISE_MOD: Optional[Any] = None +_COMBINED_OCR_WORKER_REQUIRED_ATTRS = ( + "find_numeric_debug_page_spans", + "evaluate_page_character_noise", +) + + +def _blank_non_newlines(text: str) -> str: + return "".join("\n" if ch == "\n" else " " for ch in text) + + +def _init_combined_ocr_worker() -> None: + global _COMBINED_OCR_WORKER_NOISE_MOD, _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED + noise_mod = importlib.import_module("glossapi_rs_noise") + missing = [ + attr for attr in _COMBINED_OCR_WORKER_REQUIRED_ATTRS if not hasattr(noise_mod, attr) + ] + if missing: + raise ImportError( + 
"glossapi_rs_noise missing required attrs for OCR worker: " + + ", ".join(missing) + ) + _COMBINED_OCR_WORKER_NOISE_MOD = noise_mod + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + _WORD_REPEAT_RUST_MOD = noise_mod if hasattr(noise_mod, "find_word_repeat_spans") else None + + +def _get_combined_ocr_worker_noise_mod() -> Any: + global _COMBINED_OCR_WORKER_NOISE_MOD + if _COMBINED_OCR_WORKER_NOISE_MOD is None: + _init_combined_ocr_worker() + return _COMBINED_OCR_WORKER_NOISE_MOD + + +def _prime_word_repeat_rust_module(module_name: str, module: Any) -> Any: + global _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED + if module_name == "glossapi_rs_noise": + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + _WORD_REPEAT_RUST_MOD = module if hasattr(module, "find_word_repeat_spans") else None + return module + + +def _can_use_combined_ocr_process_pool(noise_mod: Any, render_workers: int) -> bool: + return ( + render_workers > 1 + and os.name != "nt" + and getattr(noise_mod, "__name__", "") == "glossapi_rs_noise" + ) + + +def _default_combined_ocr_render_workers( + *, + noise_mod: Any, + requested_workers: Optional[int], + max_workers: int, +) -> int: + if requested_workers is not None: + return max(1, int(requested_workers)) + host_workers = max(1, int(max_workers)) + if _can_use_combined_ocr_process_pool(noise_mod, host_workers): + return host_workers + return min(4, host_workers) + + +@contextmanager +def _combined_ocr_process_pool_warning_ctx() -> Iterable[None]: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r"This process .* is multi-threaded, use of fork\(\) may lead to deadlocks in the child\.", + category=DeprecationWarning, + module=r"multiprocessing\.popen_fork", + ) + yield + + +def _blank_regex_matches_preserve_layout(text: str, pattern: re.Pattern[str]) -> str: + return pattern.sub(lambda match: _blank_non_newlines(match.group(0)), text) + + +def _filter_tables_preserve_layout(text: str) -> str: + lowered = text.lower() + 
if " str: + if "$" not in text and "\\" not in text: + return text + for pattern in ( + LATEX_BEGIN_END_RE, + LATEX_BLOCK_RE, + LATEX_BRACKET_RE, + LATEX_INLINE_PAREN_RE, + LATEX_INLINE_DOLLAR_RE, + ): + text = _blank_regex_matches_preserve_layout(text, pattern) + return text + + +def _blank_existing_match_regions_preserve_layout(text: str) -> str: + if " str: + if not spans: + return text + + chars = list(text) + for span in spans: + start = max(0, int(span["start"])) + end = min(len(chars), int(span["end"])) + for idx in range(start, end): + if chars[idx] != "\n": + chars[idx] = " " + return "".join(chars) + + +def _extract_latex_segments(text: str) -> List[Dict[str, Any]]: + raw: List[Tuple[int, int, str, str]] = [] + for name, pattern in LATEX_SEGMENT_PATTERNS: + for match in pattern.finditer(text): + raw.append((match.start(), match.end(), name, match.group(0))) + + raw.sort(key=lambda item: (item[0], -(item[1] - item[0]), item[2])) + segments: List[Dict[str, Any]] = [] + last_end = -1 + for start, end, kind, body in raw: + if segments and start >= segments[-1]["start"] and end <= segments[-1]["end"]: + continue + if start < last_end: + continue + segments.append({"start": start, "end": end, "kind": kind, "text": body}) + last_end = end + return segments + + +def _clean_fill_for_removed_span(page_text: str, start: int, end: int) -> str: + removed = page_text[start:end] + prev_char = page_text[start - 1] if start > 0 else "" + next_char = page_text[end] if end < len(page_text) else "" + if "\n" in removed: + if prev_char == "\n" or next_char == "\n": + return "" + return "\n" + if prev_char and next_char and not prev_char.isspace() and not next_char.isspace(): + return " " + return "" + + +def _find_table_repeat_spans(page_text: str) -> List[Dict[str, Any]]: + """Keep phase_clean's old call shape while table policy lives in ocr_table.""" + return _find_table_repeat_spans_impl( + page_text, + match_category=MATCH_CATEGORY_BY_TYPE["table_repeat"], + ) + + +def 
_normalize_latex_repeat_with_map(text: str) -> Tuple[str, List[int]]: + normalized: List[str] = [] + raw_map: List[int] = [] + for raw_idx, ch in enumerate(text): + if ch.isspace(): + continue + normalized.append(ch.casefold()) + raw_map.append(raw_idx) + return "".join(normalized), raw_map + + +def _normalize_latex_segment_exact(text: str) -> str: + return "".join(ch.casefold() for ch in text if not ch.isspace()) + + +def _normalize_latex_segment_skeleton(text: str) -> str: + normalized = _normalize_latex_segment_exact(text) + normalized = re.sub(r"\d+", "#", normalized) + for command in LATEX_SYMBOL_SLOT_COMMANDS: + normalized = normalized.replace(command.casefold(), r"\sym") + normalized = re.sub(r"dr(?:_?\*|_?\\ast)?", "dr@", normalized) + return normalized + + +def _is_short_latex_repeat_atom(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if len(normalized) > LATEX_SHORT_SEGMENT_MAX_NORM: + return False + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + if not command_tokens: + return False + return set(command_tokens).issubset(LATEX_SHORT_REPEAT_ATOM_COMMANDS) + + +def _strip_latex_outer_delimiters(raw_segment: str) -> str: + stripped = raw_segment.strip() + wrappers = ( + (r"\(", r"\)"), + (r"\[", r"\]"), + ("$$", "$$"), + ("$", "$"), + ) + for left, right in wrappers: + if stripped.startswith(left) and stripped.endswith(right) and len(stripped) >= len(left) + len(right): + return stripped[len(left) : len(stripped) - len(right)].strip() + return stripped + + +def _latex_short_atom_block_key(raw_segment: str) -> Optional[str]: + body = "".join(ch for ch in _strip_latex_outer_delimiters(raw_segment) if not ch.isspace()) + if not body or len(body) > LATEX_SHORT_SEGMENT_MAX_NORM: + return None + + plain_pattern = ( + r"^(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS)) + + r")(?P\'+)?$" + ) + match = re.fullmatch(plain_pattern, body) + if match: + base = 
match.group("base") or "" + primes = match.group("primes") or "" + return f"{base}{primes}" + + decorated_pattern = ( + r"^(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS)) + + r")\{(?P" + + "|".join(re.escape(token) for token in sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS)) + + r")\}(?P\'+)?$" + ) + match = re.fullmatch(decorated_pattern, body) + if match: + decorator = match.group("decorator") or "" + base = match.group("base") or "" + primes = match.group("primes") or "" + return f"{decorator}{{{base}}}{primes}" + + return None + + +def _consume_latex_short_atom_script(body: str, pos: int) -> Optional[int]: + while pos < len(body) and body[pos] in "_^": + pos += 1 + if pos >= len(body): + return None + if body[pos] == "{": + end = body.find("}", pos + 1) + if end == -1 or end == pos + 1: + return None + content = body[pos + 1 : end] + if any(ch.isspace() for ch in content) or "{" in content or "}" in content: + return None + pos = end + 1 + continue + if body[pos] == "\\": + match = re.match(r"\\[A-Za-z]+", body[pos:]) + if match is None: + return None + pos += len(match.group(0)) + continue + if body[pos].isalnum(): + pos += 1 + continue + return None + return pos + + +def _latex_short_atom_sequence_tokens( + raw_segment: str, + *, + allow_truncated_tail: bool = False, +) -> Optional[List[str]]: + body = "".join(ch for ch in _strip_latex_outer_delimiters(raw_segment) if not ch.isspace()) + if not body: + return None + + base_commands = sorted(LATEX_SHORT_ATOM_BLOCK_BASE_COMMANDS, key=len, reverse=True) + decorator_commands = sorted(LATEX_SHORT_ATOM_BLOCK_DECORATOR_COMMANDS, key=len, reverse=True) + tokens: List[str] = [] + pos = 0 + while pos < len(body): + token: Optional[str] = None + for decorator in decorator_commands: + prefix = decorator + "{" + if not body.startswith(prefix, pos): + continue + inner_pos = pos + len(prefix) + base = next((candidate for candidate in base_commands if 
body.startswith(candidate, inner_pos)), None) + if base is None: + continue + end_pos = inner_pos + len(base) + if end_pos >= len(body) or body[end_pos] != "}": + continue + token = f"{decorator}{{{base}}}" + pos = end_pos + 1 + break + + if token is None: + base = next((candidate for candidate in base_commands if body.startswith(candidate, pos)), None) + if base is not None: + token = base + pos += len(base) + + if token is None: + remaining = body[pos:] + if allow_truncated_tail and tokens and len(remaining) >= 4 and any(command.startswith(remaining) for command in base_commands): + break + return None + + while pos < len(body) and body[pos] == "'": + token += "'" + pos += 1 + + script_end = _consume_latex_short_atom_script(body, pos) + if script_end is None: + return None + token += body[pos:script_end] + pos = script_end + + while pos < len(body) and body[pos] == "'": + token += "'" + pos += 1 + + tokens.append(token) + + return tokens or None + + +def _is_short_latex_whitelist_segment(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if len(normalized) > LATEX_SHORT_SEGMENT_MAX_NORM: + return False + tokens = _latex_short_atom_sequence_tokens(raw_segment) + return tokens is not None and len(tokens) >= LATEX_SHORT_ATOM_EXACT_SEGMENT_MIN_TOKENS + + +def _is_latex_short_atom_chain_segment(raw_segment: str) -> bool: + tokens = _latex_short_atom_sequence_tokens(raw_segment, allow_truncated_tail=True) + if tokens is None or len(tokens) < LATEX_SHORT_ATOM_CHAIN_MIN_TOKENS: + return False + counts = Counter(tokens) + return max(counts.values(), default=0) >= LATEX_SEGMENT_EXACT_RUN_MIN and len(counts) <= 3 + + +def _is_suspicious_internal_latex_repeat(raw_segment: str) -> bool: + if not raw_segment: + return False + if "" in raw_segment or "" in raw_segment: + return True + if _is_latex_short_atom_chain_segment(raw_segment): + return True + + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + if any(wrapper in raw_segment 
for wrapper in LATEX_TEXT_WRAPPER_MACROS): + return len(command_tokens) >= 8 or len(raw_segment) >= 60 + + counts = Counter(command_tokens) + if set(command_tokens).issubset(LATEX_INTERNAL_SMALL_VOCAB_COMMANDS): + if len(command_tokens) >= LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS and len(counts) <= 3: + return True + if any(command in LATEX_INTERNAL_REPEAT_COMMANDS for command in counts): + return max(counts.values(), default=0) >= LATEX_INTERNAL_REPEAT_MIN_COMMAND_DUP + + return False + + +def _extract_latex_lhs_key(raw_segment: str) -> Optional[str]: + normalized = _normalize_latex_segment_exact(raw_segment) + if "=" not in normalized: + return None + lhs = normalized.split("=", 1)[0] + return lhs or None + + +def _is_latex_symbol_inventory_segment(raw_segment: str) -> bool: + normalized = _normalize_latex_segment_exact(raw_segment) + if not normalized or len(normalized) > 96: + return False + if any(token in normalized for token in ("=", "+", "-", r"\sum", r"\prod", r"\int", r"\frac")): + return False + if _is_short_latex_repeat_atom(raw_segment): + return False + command_tokens = LATEX_COMMAND_RE.findall(raw_segment) + return bool(command_tokens) + + +def _is_small_parameterized_definition_family(run: List[Dict[str, Any]]) -> bool: + if len(run) > LATEX_SMALL_DEFINITION_FAMILY_MAX_RUN: + return False + lhs_keys = [_extract_latex_lhs_key(str(item["text"])) for item in run] + if any(key is None for key in lhs_keys): + return False + if any( + key is not None and any(token in key for token in (r"\frac", r"\sum", r"\prod", r"\int", "+", "-", "=")) + for key in lhs_keys + ): + return False + return len(set(lhs_keys)) == len(lhs_keys) + + +def _is_symbol_inventory_run(run: List[Dict[str, Any]]) -> bool: + return all(_is_latex_symbol_inventory_segment(str(item["text"])) for item in run) + + +def _short_atom_run_has_clean_gaps(page_text: str, run: List[Dict[str, Any]]) -> bool: + if len(run) < 2: + return True + for left, right in zip(run, run[1:]): + gap = 
page_text[int(left["end"]) : int(right["start"])] + if any(ch.isalnum() for ch in gap): + return False + return True + + +def _extract_latex_numeric_slots(raw_segment: str) -> Optional[List[float]]: + slots: List[float] = [] + for token in re.findall(r"[0-9]+(?:[.,/][0-9]+)*", raw_segment): + if "/" in token: + if token.count("/") != 1: + return None + lhs, rhs = token.split("/", 1) + if not lhs.isdigit() or not rhs.isdigit() or int(rhs) == 0: + return None + slots.append(float(int(lhs) / int(rhs))) + continue + if token.count(".") + token.count(",") > 1: + return None + normalized = token.replace(",", ".", 1) + if "." in normalized: + lhs, rhs = normalized.split(".", 1) + if not lhs.isdigit() or not rhs.isdigit(): + return None + slots.append(float(normalized)) + continue + if token.isdigit(): + slots.append(float(int(token))) + continue + return None + return slots or None + + +def _latex_slot_progress_position(values: List[float]) -> bool: + if len(values) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN: + return False + + diffs: List[float] = [] + tolerance = 1e-9 + for left, right in zip(values, values[1:]): + diff = right - left + if diff < -tolerance: + return False + if diff > tolerance: + diffs.append(diff) + + if not diffs: + return False + + baseline = diffs[0] + return all(abs(diff - baseline) <= max(tolerance, abs(baseline) * 1e-6) for diff in diffs[1:]) + + +def _is_latex_slot_progression_run(run: List[Dict[str, Any]]) -> bool: + if len(run) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN: + return False + if _is_small_parameterized_definition_family(run): + return False + if _is_symbol_inventory_run(run): + return False + if _is_short_latex_repeat_atom(str(run[0]["text"])): + return False + + slot_lists = [item.get("numeric_slots") for item in run] + if any(not slots for slots in slot_lists): + return False + slot_count = len(slot_lists[0] or []) + if slot_count == 0 or any(len(slots or []) != slot_count for slots in slot_lists): + return False + + varying_positions = 0 
+ for slot_idx in range(slot_count): + values = [float(slots[slot_idx]) for slots in slot_lists if slots is not None] + if len({round(value, 9) for value in values}) > 1: + varying_positions += 1 + if varying_positions == 0 or varying_positions > 2: + return False + + for slot_idx in range(slot_count): + values = [float(slots[slot_idx]) for slots in slot_lists if slots is not None] + if _latex_slot_progress_position(values): + return True + return False + + +def _normalize_alnum_with_map_skip_tags(text: str) -> Tuple[str, List[int]]: + norm_chars: List[str] = [] + raw_char_indices: List[int] = [] + in_tag = False + for raw_idx, ch in enumerate(text): + if in_tag: + if ch == ">": + in_tag = False + continue + if ch == "<": + in_tag = True + continue + folded = unicodedata.normalize("NFD", ch.casefold()) + for sub in folded: + category = unicodedata.category(sub) + if category.startswith("L") or category.startswith("N"): + sub = WORD_CONFUSABLE_FOLD_MAP.get(sub, sub) + norm_chars.append(sub) + raw_char_indices.append(raw_idx) + return "".join(norm_chars), raw_char_indices + + +def _normalize_hybrid_body(text: str) -> str: + norm_chars: List[str] = [] + for ch in text: + folded = unicodedata.normalize("NFD", ch.casefold()) + for sub in folded: + category = unicodedata.category(sub) + if category.startswith("L") or category.startswith("N"): + norm_chars.append(WORD_CONFUSABLE_FOLD_MAP.get(sub, sub)) + return "".join(norm_chars) + + +def _classify_hybrid_numeric_field(prefix: str) -> Optional[Dict[str, Any]]: + token = prefix.strip() + if not token: + return None + + trailing_paren = token.endswith(")") + trailing_dot = token.endswith(".") + stripped = token[:-1] if trailing_paren or trailing_dot else token + if not stripped: + return None + + if "/" in stripped: + return {"field_kind": "numeric_value", "raw": token} + + parts = stripped.split(".") + if not all(part.isdigit() for part in parts): + return None + + numbers = [int(part) for part in parts] + shape = 
".".join("#" for _ in numbers) + if trailing_paren: + shape += ")" + elif trailing_dot: + shape += "." + + if trailing_paren or trailing_dot: + field_kind = "header_counter" + elif len(numbers) >= 3: + field_kind = "header_counter" + elif len(numbers) == 2 and len(parts[-1]) <= 2: + field_kind = "header_counter" + else: + field_kind = "numeric_value" + + return { + "field_kind": field_kind, + "numbers": numbers, + "shape": shape, + "raw": token, + } + + +def _classify_hybrid_inline_numeric_field(token: str) -> Optional[Dict[str, Any]]: + stripped = token.strip() + if not stripped: + return None + + if re.fullmatch(r"[0-9]+", stripped): + return {"field_kind": "numeric_value", "raw": stripped} + + if stripped.count("/") == 1: + lhs, rhs = stripped.split("/", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs) and int(rhs) != 0: + return {"field_kind": "numeric_value", "raw": stripped} + return None + + decimal_candidate = stripped.replace(",", ".", 1) + if decimal_candidate.count(".") == 1: + lhs, rhs = decimal_candidate.split(".", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs): + return {"field_kind": "numeric_value", "raw": stripped} + + return None + + +def _parse_hybrid_numeric_value(token: str) -> Optional[float]: + stripped = token.strip() + if not stripped: + return None + + if re.fullmatch(r"[0-9]+", stripped): + return float(int(stripped)) + + if stripped.count("/") == 1: + lhs, rhs = stripped.split("/", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs) and int(rhs) != 0: + return float(int(lhs) / int(rhs)) + return None + + decimal_candidate = stripped.replace(",", ".", 1) + if decimal_candidate.count(".") == 1: + lhs, rhs = decimal_candidate.split(".", 1) + if re.fullmatch(r"[0-9]+", lhs) and re.fullmatch(r"[0-9]+", rhs): + return float(decimal_candidate) + + return None + + +def _prepare_hybrid_analysis_text( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], +) -> str: + 
analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _filter_latex_preserve_layout(analysis_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + return analysis_text + + +def _extract_hybrid_numbered_items_from_analysis_text(analysis_text: str) -> List[Dict[str, Any]]: + candidates: List[Dict[str, Any]] = [] + for match in HYBRID_PREFIX_RE.finditer(analysis_text): + field = _classify_hybrid_numeric_field(match.group("prefix")) + if field is None: + continue + candidates.append( + { + "prefix_start": match.start("prefix"), + "prefix_end": match.end("prefix"), + **field, + } + ) + + items: List[Dict[str, Any]] = [] + for idx, candidate in enumerate(candidates): + next_start = ( + int(candidates[idx + 1]["prefix_start"]) if idx + 1 < len(candidates) else len(analysis_text) + ) + body_raw = analysis_text[int(candidate["prefix_end"]) : next_start].strip() + if HYBRID_MARKUP_BODY_RE.search(body_raw): + continue + body_key = _normalize_hybrid_body(body_raw) + has_alpha = any(ch.isalpha() for ch in body_key) + if not has_alpha: + continue + body_is_full = len(body_key) >= HYBRID_REPEAT_MIN_BODY_ALNUM + items.append( + { + "start": int(candidate["prefix_start"]), + "end": next_start, + "prefix_end": int(candidate["prefix_end"]), + "field_kind": str(candidate["field_kind"]), + "numbers": list(candidate.get("numbers", [])), + "shape": str(candidate.get("shape", "")), + "body_raw": body_raw, + "body_key": body_key, + "body_is_full": body_is_full, + } + ) + + return items + + +def _extract_hybrid_inline_numeric_items_from_analysis_text(analysis_text: str) -> List[Dict[str, Any]]: + clause_ranges: List[Tuple[int, int]] = [] + clause_start = 0 + for match in HYBRID_INLINE_CLAUSE_DELIMITER_RE.finditer(analysis_text): + clause_ranges.append((clause_start, match.start())) + clause_start = match.end() + clause_ranges.append((clause_start, 
len(analysis_text))) + + items: List[Dict[str, Any]] = [] + for clause_index, (raw_start, raw_end) in enumerate(clause_ranges): + clause = analysis_text[raw_start:raw_end] + if not clause.strip(): + continue + + leading_ws = len(clause) - len(clause.lstrip()) + trailing_ws = len(clause) - len(clause.rstrip()) + clause_start_abs = raw_start + leading_ws + clause_end_abs = raw_end - trailing_ws + clause_text = analysis_text[clause_start_abs:clause_end_abs] + if not clause_text or HYBRID_MARKUP_BODY_RE.search(clause_text): + continue + + working_offset = clause_start_abs + working_text = clause_text + prefix_match = HYBRID_PREFIX_RE.match(working_text) + if prefix_match: + working_offset += prefix_match.end() + working_text = working_text[prefix_match.end() :].lstrip() + working_offset = clause_end_abs - len(working_text) + if not working_text: + continue + + tokens: List[Dict[str, Any]] = [] + numeric_token_positions: List[int] = [] + for match in HYBRID_INLINE_TOKEN_RE.finditer(working_text): + token = match.group(0) + abs_start = working_offset + match.start() + abs_end = working_offset + match.end() + if token and token[0].isdigit(): + numeric_info = _classify_hybrid_inline_numeric_field(token) + if numeric_info is None: + continue + parsed_value = _parse_hybrid_numeric_value(token) + if parsed_value is None: + continue + numeric_token_positions.append(len(tokens)) + tokens.append( + { + "kind": "numeric", + "start": abs_start, + "end": abs_end, + "raw": token, + "numeric_value": parsed_value, + } + ) + continue + token_key = _normalize_hybrid_body(token) + if not token_key: + continue + tokens.append( + { + "kind": "alpha", + "start": abs_start, + "end": abs_end, + "raw": token, + "token_key": token_key, + } + ) + + if len(numeric_token_positions) != 1: + continue + + numeric_pos = numeric_token_positions[0] + numeric_token = tokens[numeric_pos] + left_alpha = [token for token in tokens[:numeric_pos] if token.get("kind") == "alpha"] + right_alpha = [token for 
token in tokens[numeric_pos + 1 :] if token.get("kind") == "alpha"] + left_context = left_alpha[-HYBRID_INLINE_CONTEXT_WORDS:] + right_context = right_alpha[:HYBRID_INLINE_CONTEXT_WORDS] + alpha_word_count = len(left_context) + len(right_context) + if alpha_word_count < HYBRID_INLINE_CONTEXT_MIN_ALPHA_WORDS: + continue + + context_parts = [str(token.get("token_key", "")) for token in left_context] + context_parts.append("num") + context_parts.extend(str(token.get("token_key", "")) for token in right_context) + context_key = _normalize_hybrid_body(" ".join(context_parts)) + if len(context_key) < HYBRID_INLINE_CONTEXT_MIN_CHARS: + continue + + item_start = int(left_context[0]["start"]) if left_context else int(numeric_token["start"]) + item_end = int(right_context[-1]["end"]) if right_context else int(numeric_token["end"]) + items.append( + { + "start": item_start, + "end": item_end, + "clause_index": clause_index, + "field_kind": "numeric_value", + "inline_context_key": context_key, + "numeric_value": float(numeric_token["numeric_value"]), + } + ) + + return items + + +def _hybrid_partial_body_matches(candidate_body_key: str, target_body_key: str) -> bool: + if not candidate_body_key or not target_body_key: + return False + if candidate_body_key == target_body_key: + return False + if not target_body_key.startswith(candidate_body_key): + return False + min_chars = min(4, len(target_body_key)) + min_ratio_chars = max(1, math.ceil(len(target_body_key) * 0.5)) + return len(candidate_body_key) >= min(min_chars, min_ratio_chars) + + +def _extend_hybrid_tail_span_end( + items: List[Dict[str, Any]], + *, + run_start: int, + run_end: int, + expected_body_key: str, +) -> int: + span_end = int(items[run_end - 1]["end"]) + if run_end >= len(items): + return span_end + + tail = items[run_end] + if tail.get("field_kind") != "header_counter": + return span_end + if str(tail.get("shape", "")) != str(items[run_start].get("shape", "")): + return span_end + if not 
_hybrid_header_progresses(items[run_end - 1], tail): + return span_end + if not _hybrid_partial_body_matches(str(tail.get("body_key", "")), expected_body_key): + return span_end + return int(tail["end"]) + + +def _hybrid_header_progresses(previous: Dict[str, Any], current: Dict[str, Any]) -> bool: + if previous.get("field_kind") != "header_counter" or current.get("field_kind") != "header_counter": + return False + prev_numbers = list(previous.get("numbers", [])) + curr_numbers = list(current.get("numbers", [])) + if len(prev_numbers) != len(curr_numbers) or not prev_numbers: + return False + return prev_numbers[:-1] == curr_numbers[:-1] and curr_numbers[-1] == prev_numbers[-1] + 1 + + +def _hybrid_header_is_parent(previous: Dict[str, Any], current: Dict[str, Any]) -> bool: + if previous.get("field_kind") != "header_counter" or current.get("field_kind") != "header_counter": + return False + prev_numbers = list(previous.get("numbers", [])) + curr_numbers = list(current.get("numbers", [])) + if not prev_numbers or len(prev_numbers) + 1 != len(curr_numbers): + return False + return curr_numbers[:-1] == prev_numbers + + +def _hybrid_inline_step(previous: Dict[str, Any], current: Dict[str, Any]) -> Optional[float]: + if previous.get("field_kind") != "numeric_value" or current.get("field_kind") != "numeric_value": + return None + if int(current.get("clause_index", -1)) != int(previous.get("clause_index", -1)) + 1: + return None + if str(previous.get("inline_context_key", "")) != str(current.get("inline_context_key", "")): + return None + + previous_value = float(previous.get("numeric_value", 0.0)) + current_value = float(current.get("numeric_value", 0.0)) + step = current_value - previous_value + if step <= 0: + return None + return step + + +def _hybrid_inline_step_matches(expected_step: float, actual_step: float) -> bool: + tolerance = max(1e-9, abs(expected_step) * 1e-6) + return abs(expected_step - actual_step) <= tolerance + + +def 
def _find_hybrid_same_body_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Find runs of progressing header counters that all share one body.

    A run is a maximal sequence of ``header_counter`` items with
    ``body_is_full`` set, identical ``body_key`` and ``shape``, where each item
    progresses from its predecessor (see ``_hybrid_header_progresses``). Runs
    of at least ``HYBRID_REPEAT_MIN_ITEMS`` items become ``hybrid_repeat``
    spans of kind ``"same_body_progression"``.
    """
    spans: List[Dict[str, Any]] = []
    idx = 0
    while idx < len(items):
        item = items[idx]
        # Only a full-bodied header counter can seed a run.
        if item.get("field_kind") != "header_counter" or not bool(item.get("body_is_full")):
            idx += 1
            continue

        # Grow the run while the next item matches the seed's body/shape and
        # its counter follows directly on from the previous item's counter.
        end_idx = idx + 1
        while (
            end_idx < len(items)
            and items[end_idx].get("field_kind") == "header_counter"
            and bool(items[end_idx].get("body_is_full"))
            and str(items[end_idx].get("body_key", "")) == str(item.get("body_key", ""))
            and str(items[end_idx].get("shape", "")) == str(item.get("shape", ""))
            and _hybrid_header_progresses(items[end_idx - 1], items[end_idx])
        ):
            end_idx += 1

        run_length = end_idx - idx
        if run_length >= HYBRID_REPEAT_MIN_ITEMS:
            start_idx = idx
            if idx > 0:
                previous = items[idx - 1]
                # Pull the immediately preceding item into the span when it is
                # the parent counter of the seed and carries the same body.
                if (
                    bool(previous.get("body_is_full"))
                    and str(previous.get("body_key", "")) == str(item.get("body_key", ""))
                    and _hybrid_header_is_parent(previous, item)
                ):
                    start_idx = idx - 1

            # The span end is delegated to the tail-extension helper, which is
            # handed the run bounds and the body key the run repeats.
            span_end = _extend_hybrid_tail_span_end(
                items,
                run_start=idx,
                run_end=end_idx,
                expected_body_key=str(item.get("body_key", "")),
            )
            spans.append(
                {
                    "start": int(items[start_idx]["start"]),
                    "end": span_end,
                    "match_types": ["hybrid_repeat"],
                    "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"],
                    "kind": "same_body_progression",
                    # Counts the optional parent item as well as the run.
                    "item_count": end_idx - start_idx,
                }
            )
            # Resume after the run so its items are not reused as seeds.
            idx = end_idx
            continue

        idx += 1

    return spans


def _find_hybrid_cycle_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Find progressing header counters whose bodies cycle through a template.

    For each cycle length from 2 to ``HYBRID_REPEAT_MAX_CYCLE`` the items are
    scanned for two back-to-back cycles of full-bodied header counters with a
    single shared shape, strictly progressing counters and at least two
    distinct body keys per cycle. A match is extended for as long as the
    pattern continues and is emitted as kind ``"body_cycle_progression"`` when
    it covers at least ``HYBRID_REPEAT_MIN_CYCLE_ITEMS`` items.

    Each cycle length is scanned independently, so spans found for different
    cycle lengths may overlap.
    """
    spans: List[Dict[str, Any]] = []
    n_items = len(items)
    for cycle_len in range(2, HYBRID_REPEAT_MAX_CYCLE + 1):
        idx = 0
        while idx + 2 * cycle_len <= n_items:
            # Candidate window: exactly two consecutive cycles.
            run = items[idx : idx + 2 * cycle_len]
            if any(item.get("field_kind") != "header_counter" or not bool(item.get("body_is_full")) for item in run):
                idx += 1
                continue
            shapes = {str(item.get("shape", "")) for item in run}
            if len(shapes) != 1:
                idx += 1
                continue
            if not all(_hybrid_header_progresses(run[pos - 1], run[pos]) for pos in range(1, len(run))):
                idx += 1
                continue

            # The first cycle defines the expected body-key template.
            template = [str(item.get("body_key", "")) for item in run[:cycle_len]]
            # A template with fewer than two distinct keys is rejected here.
            if len(set(template)) < 2:
                idx += 1
                continue

            # The second cycle must replay the template position by position.
            if any(str(run[pos].get("body_key", "")) != template[pos % cycle_len] for pos in range(cycle_len, len(run))):
                idx += 1
                continue

            # Extend past the two verified cycles while the next item still
            # progresses and carries the body key the template predicts.
            end_idx = idx + 2 * cycle_len
            while (
                end_idx < n_items
                and items[end_idx].get("field_kind") == "header_counter"
                and bool(items[end_idx].get("body_is_full"))
                and str(items[end_idx].get("shape", "")) == str(items[idx].get("shape", ""))
                and _hybrid_header_progresses(items[end_idx - 1], items[end_idx])
                and str(items[end_idx].get("body_key", "")) == template[(end_idx - idx) % cycle_len]
            ):
                end_idx += 1

            item_count = end_idx - idx
            if item_count >= HYBRID_REPEAT_MIN_CYCLE_ITEMS:
                # The expected key passed on is the template entry for the
                # position right after the matched items.
                span_end = _extend_hybrid_tail_span_end(
                    items,
                    run_start=idx,
                    run_end=end_idx,
                    expected_body_key=template[(end_idx - idx) % cycle_len],
                )
                spans.append(
                    {
                        "start": int(items[idx]["start"]),
                        "end": span_end,
                        "match_types": ["hybrid_repeat"],
                        "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"],
                        "kind": "body_cycle_progression",
                        "item_count": item_count,
                        "cycle_len": cycle_len,
                    }
                )
                idx = end_idx
                continue

            idx += 1

    return spans


def _find_hybrid_inline_progression_spans(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Find inline numeric values that advance by a constant positive step.

    The step between the first two items fixes the expected step; the run is
    extended while each further pair yields a matching step (see
    ``_hybrid_inline_step`` / ``_hybrid_inline_step_matches``). Runs of at
    least ``HYBRID_INLINE_REPEAT_MIN_ITEMS`` items are emitted as kind
    ``"inline_numeric_progression"``.
    """
    spans: List[Dict[str, Any]] = []
    idx = 0
    # NOTE(review): ``items[idx + 1]`` below is only in range if
    # HYBRID_INLINE_REPEAT_MIN_ITEMS is at least 2 — confirm at the constant.
    while idx + HYBRID_INLINE_REPEAT_MIN_ITEMS <= len(items):
        first = items[idx]
        second = items[idx + 1]
        expected_step = _hybrid_inline_step(first, second)
        if expected_step is None:
            idx += 1
            continue

        end_idx = idx + 2
        while end_idx < len(items):
            actual_step = _hybrid_inline_step(items[end_idx - 1], items[end_idx])
            if actual_step is None or not _hybrid_inline_step_matches(expected_step, actual_step):
                break
            end_idx += 1

        item_count = end_idx - idx
        if item_count >= HYBRID_INLINE_REPEAT_MIN_ITEMS:
            spans.append(
                {
                    "start": int(items[idx]["start"]),
                    # Unlike the header finders, no tail extension is applied:
                    # the span stops at the last matched item.
                    "end": int(items[end_idx - 1]["end"]),
                    "match_types": ["hybrid_repeat"],
                    "category": MATCH_CATEGORY_BY_TYPE["hybrid_repeat"],
                    "kind": "inline_numeric_progression",
                    "item_count": item_count,
                }
            )
            idx = end_idx
            continue

        idx += 1

    return spans


def _find_hybrid_numbered_repeat_spans(
    page_text: str,
    *,
    blocked_spans: List[Dict[str, Any]],
    analysis_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Collect all ``hybrid_repeat`` spans for one page.

    Args:
        page_text: Raw page text; pages without any digit return ``[]``.
        blocked_spans: Spans to blank out of the analysis surface first.
        analysis_text: Optional pre-filtered view of the page. When omitted it
            is built from ``page_text`` via ``_prepare_hybrid_analysis_text``;
            when given, only ``blocked_spans`` are blanked from it.

    Returns:
        Span dicts with ``start``, ``end``, ``match_types``, ``category``,
        ``kind``, ``item_count`` and, for cycle matches, ``cycle_len``.
    """
    if not any(ch.isdigit() for ch in page_text):
        return []
    if analysis_text is None:
        analysis_text = _prepare_hybrid_analysis_text(page_text, blocked_spans=blocked_spans)
    else:
        analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans)
    rust_mod = _get_word_repeat_rust_module()
    if rust_mod is not None and hasattr(rust_mod, "find_hybrid_repeat_spans"):
        # Native path: results are only coerced to plain Python types. No
        # sorting or containment filtering is applied on this branch —
        # presumably the extension already does that; verify against it.
        return [
            {
                "start": int(item["start"]),
                "end": int(item["end"]),
                "match_types": list(item["match_types"]),
                "category": str(item["category"]),
                "kind": str(item["kind"]),
                "item_count": int(item["item_count"]),
                **({"cycle_len": int(item["cycle_len"])} if "cycle_len" in item else {}),
            }
            for item in rust_mod.find_hybrid_repeat_spans(analysis_text)
        ]
    items = _extract_hybrid_numbered_items_from_analysis_text(analysis_text)
    spans = _find_hybrid_same_body_progression_spans(items)
    spans.extend(_find_hybrid_cycle_progression_spans(items))
    inline_items = _extract_hybrid_inline_numeric_items_from_analysis_text(analysis_text)
    spans.extend(_find_hybrid_inline_progression_spans(inline_items))
    # Sort by start, longest first, so any span contained in an earlier one
    # is met right after the span that contains it.
    spans.sort(key=lambda item: (int(item["start"]), -(int(item["end"]) - int(item["start"]))))

    deduped: List[Dict[str, Any]] = []
    for span in spans:
        # Drop spans fully contained in the most recently kept span.
        if deduped and int(span["start"]) >= int(deduped[-1]["start"]) and int(span["end"]) <= int(deduped[-1]["end"]):
            continue
        deduped.append(span)
    return deduped
_build_word_repeat_hash(text: str) -> Tuple[List[int], List[int]]: + pref = [0] * (len(text) + 1) + pw = [1] * (len(text) + 1) + for idx, ch in enumerate(text): + code = ord(ch) + 1 + pref[idx + 1] = (pref[idx] * WORD_REPEAT_HASH_BASE + code) & WORD_REPEAT_HASH_MASK + pw[idx + 1] = (pw[idx] * WORD_REPEAT_HASH_BASE) & WORD_REPEAT_HASH_MASK + return pref, pw + + +def _word_repeat_hash_slice(pref: List[int], pw: List[int], start: int, end: int) -> int: + return (pref[end] - ((pref[start] * pw[end - start]) & WORD_REPEAT_HASH_MASK)) & WORD_REPEAT_HASH_MASK + + +def _word_repeat_blocks_equal( + text: str, + pref: List[int], + pw: List[int], + lhs: int, + rhs: int, + period: int, +) -> bool: + return ( + _word_repeat_hash_slice(pref, pw, lhs, lhs + period) + == _word_repeat_hash_slice(pref, pw, rhs, rhs + period) + and text[lhs : lhs + period] == text[rhs : rhs + period] + ) + + +def _get_word_repeat_rust_module() -> Optional[Any]: + global _WORD_REPEAT_RUST_MOD, _WORD_REPEAT_RUST_IMPORT_ATTEMPTED + if _WORD_REPEAT_RUST_IMPORT_ATTEMPTED: + return _WORD_REPEAT_RUST_MOD + _WORD_REPEAT_RUST_IMPORT_ATTEMPTED = True + try: + module = importlib.import_module("glossapi_rs_noise") + except Exception: + _WORD_REPEAT_RUST_MOD = None + return None + if hasattr(module, "find_word_repeat_spans"): + _WORD_REPEAT_RUST_MOD = module + else: + _WORD_REPEAT_RUST_MOD = None + return _WORD_REPEAT_RUST_MOD + + +def _find_word_repeat_spans_python( + normalized_text: str, + *, + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, int]]: + n_chars = len(normalized_text) + if n_chars < rep_threshold * min_period: + return [] + + pref, pw = _build_word_repeat_hash(normalized_text) + max_period = min(max(min_period, window // rep_threshold), n_chars // rep_threshold) + spans: List[Dict[str, int]] = [] + + for period in range(min_period, max_period + 1): + idx = 0 + while idx + rep_threshold * period <= n_chars: + is_repeat = True + for multiple in range(1, rep_threshold): + 
if not _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + idx, + idx + multiple * period, + period, + ): + is_repeat = False + break + if not is_repeat: + idx += 1 + continue + + left = idx + while left - period >= 0 and _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + left - period, + left, + period, + ): + left -= period + + right = idx + rep_threshold * period + while right + period <= n_chars and _word_repeat_blocks_equal( + normalized_text, + pref, + pw, + right - period, + right, + period, + ): + right += period + + pattern = normalized_text[left : left + period] + tail_chars = 0 + while ( + right + tail_chars < n_chars + and tail_chars < period + and normalized_text[right + tail_chars] == pattern[tail_chars] + ): + tail_chars += 1 + + spans.append( + { + "start": left, + "end": right + tail_chars, + "period": period, + "repetitions": (right - left) // period, + "tail_chars": tail_chars, + } + ) + idx = right + + spans.sort(key=lambda item: (item["start"], -(item["end"] - item["start"]), item["period"])) + deduped: List[Dict[str, int]] = [] + for span in spans: + if deduped and span["start"] >= deduped[-1]["start"] and span["end"] <= deduped[-1]["end"]: + continue + deduped.append(span) + return deduped + + +def _find_word_repeat_spans( + normalized_text: str, + *, + rep_threshold: int, + min_period: int, + window: int, +) -> List[Dict[str, int]]: + rust_mod = _get_word_repeat_rust_module() + if rust_mod is None: + return _find_word_repeat_spans_python( + normalized_text, + rep_threshold=rep_threshold, + min_period=min_period, + window=window, + ) + return [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "period": int(item["period"]), + "repetitions": int(item["repetitions"]), + "tail_chars": int(item["tail_chars"]), + } + for item in rust_mod.find_word_repeat_spans( + normalized_text, + int(rep_threshold), + int(min_period), + int(window), + ) + ] + + +def _gap_has_fewer_than_n_nonwhitespace_chars(text: str, start: 
int, end: int, limit: int) -> bool: + if start >= end: + return True + count = 0 + for ch in text[start:end]: + if not ch.isspace(): + count += 1 + if count >= limit: + return False + return True + + +def _gap_has_at_most_n_nonwhitespace_chars(text: str, start: int, end: int, limit: int) -> bool: + if start >= end: + return True + count = 0 + for ch in text[start:end]: + if not ch.isspace(): + count += 1 + if count > limit: + return False + return True + + +def _latex_segments_are_local(page_text: str, left: Dict[str, Any], right: Dict[str, Any]) -> bool: + return _gap_has_fewer_than_n_nonwhitespace_chars( + page_text, + int(left["end"]), + int(right["start"]), + LATEX_SEGMENT_LOCAL_NONWHITESPACE_GAP, + ) + + +def _latex_local_groups(page_text: str, segments: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]: + if not segments: + return [] + + groups: List[List[Dict[str, Any]]] = [[segments[0]]] + for segment in segments[1:]: + if _latex_segments_are_local(page_text, groups[-1][-1], segment): + groups[-1].append(segment) + else: + groups.append([segment]) + return groups + + +def _find_local_latex_segment_block_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for group in _latex_local_groups(page_text, segments): + if len(group) < LATEX_SEGMENT_EXACT_RUN_MIN: + continue + + idx = 0 + while idx < len(group): + exact_key = str(group[idx]["exact_key"]) + end_idx = idx + 1 + while end_idx < len(group) and str(group[end_idx]["exact_key"]) == exact_key: + end_idx += 1 + + run_length = end_idx - idx + exact_run = group[idx:end_idx] + is_short_repeat_atom = _is_short_latex_repeat_atom(str(group[idx]["text"])) + is_short_whitelist_segment = _is_short_latex_whitelist_segment(str(group[idx]["text"])) + if run_length >= LATEX_SEGMENT_EXACT_RUN_MIN and ( + len(exact_key) >= LATEX_LONG_SEGMENT_MIN_NORM + or (is_short_repeat_atom and _short_atom_run_has_clean_gaps(page_text, exact_run)) + or 
(is_short_whitelist_segment and _short_atom_run_has_clean_gaps(page_text, exact_run)) + ): + span = { + "start": int(exact_run[0]["start"]), + "end": int(exact_run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + if is_short_whitelist_segment and not is_short_repeat_atom: + span["kind"] = "short_atom_segment_repeat" + span["item_count"] = len(exact_run) + labeled_spans.append(span) + idx = end_idx + + idx = 0 + while idx < len(group): + skeleton_key = str(group[idx]["skeleton_key"]) + end_idx = idx + 1 + while end_idx < len(group) and str(group[end_idx]["skeleton_key"]) == skeleton_key: + end_idx += 1 + + run = group[idx:end_idx] + exact_vocab = {str(item["exact_key"]) for item in run} + if ( + len(run) >= LATEX_SEGMENT_SKELETON_RUN_MIN + and len(skeleton_key) >= LATEX_LONG_SEGMENT_MIN_NORM + and not _is_short_latex_repeat_atom(str(run[0]["text"])) + and len(exact_vocab) >= 2 + and not _is_small_parameterized_definition_family(run) + and not _is_symbol_inventory_run(run) + ): + labeled_spans.append( + { + "start": int(run[0]["start"]), + "end": int(run[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + idx = end_idx + + exact_sequence = [str(item["exact_key"]) for item in group] + exact_counts = Counter(exact_sequence) + if ( + len(group) >= LATEX_SEGMENT_ALTERNATING_RUN_MIN + and len(exact_counts) <= 2 + and min(exact_counts.values()) >= 2 + ): + avg_length = sum(len(item) for item in exact_sequence) / len(exact_sequence) + if avg_length >= LATEX_LONG_SEGMENT_MIN_NORM and not all( + _is_short_latex_repeat_atom(str(item["text"])) for item in group + ): + labeled_spans.append( + { + "start": int(group[0]["start"]), + "end": int(group[-1]["end"]), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + return labeled_spans + + +def _find_short_atom_block_repeat_bounds( + atom_keys: List[str], +) -> 
Optional[Tuple[int, int, int, int]]: + n_items = len(atom_keys) + if n_items < LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS: + return None + + best: Optional[Tuple[int, int, int, int]] = None + for period in range(n_items // 2, 1, -1): + for start in range(0, n_items - (2 * period) + 1): + pattern = atom_keys[start : start + period] + if atom_keys[start + period : start + (2 * period)] != pattern: + continue + if len(set(pattern)) < 2: + continue + + left = start + while left - period >= 0 and atom_keys[left - period : left] == pattern: + left -= period + + right = start + (2 * period) + while right + period <= n_items and atom_keys[right : right + period] == pattern: + right += period + + repeated_items = right - left + repetitions = repeated_items // period + if repeated_items < LATEX_SHORT_ATOM_BLOCK_REPEAT_MIN_ITEMS or repetitions < 2: + continue + + candidate = (left, right, period, repetitions) + if best is None: + best = candidate + continue + + best_span_len = best[1] - best[0] + candidate_span_len = candidate[1] - candidate[0] + if candidate_span_len > best_span_len: + best = candidate + continue + if candidate_span_len == best_span_len and candidate[2] > best[2]: + best = candidate + return best + + +def _extend_latex_short_atom_block_partial_tail( + page_text: str, + run: List[Dict[str, Any]], + repeated_bounds: Tuple[int, int, int, int], +) -> int: + if not run: + return 0 + + left, _, period, _ = repeated_bounds + if period <= 0 or left >= len(run): + return int(run[-1]["end"]) + + expected_idx = left + ((len(run) - left) % period) + if expected_idx >= len(run): + return int(run[-1]["end"]) + + expected_text = str(run[expected_idx]["text"]) + segment_end = int(run[-1]["end"]) + cursor = segment_end + while cursor < len(page_text) and page_text[cursor].isspace(): + cursor += 1 + if cursor >= len(page_text): + return segment_end + + prefix_len = 0 + while ( + cursor + prefix_len < len(page_text) + and prefix_len < len(expected_text) + and page_text[cursor + 
prefix_len] == expected_text[prefix_len] + ): + prefix_len += 1 + + if prefix_len == 0 or prefix_len >= len(expected_text): + return segment_end + return cursor + prefix_len + + +def _find_local_latex_short_atom_block_spans( + page_text: str, + segments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + for group in _latex_local_groups(page_text, segments): + idx = 0 + while idx < len(group): + if not group[idx].get("short_atom_block_key"): + idx += 1 + continue + + end_idx = idx + 1 + while end_idx < len(group) and group[end_idx].get("short_atom_block_key"): + end_idx += 1 + + run = group[idx:end_idx] + atom_keys = [str(item["short_atom_block_key"]) for item in run] + repeated_bounds = _find_short_atom_block_repeat_bounds(atom_keys) + if repeated_bounds is not None: + _, _, period_items, repetitions = repeated_bounds + span_end = _extend_latex_short_atom_block_partial_tail(page_text, run, repeated_bounds) + labeled_spans.append( + { + "start": int(run[0]["start"]), + "end": int(span_end), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + "kind": "short_atom_block_repeat", + "item_count": len(run), + "period_items": int(period_items), + "repetitions": int(repetitions), + } + ) + + idx = end_idx + return labeled_spans + + +def _find_raw_latex_small_vocab_command_spans(page_text: str) -> List[Dict[str, Any]]: + labeled_spans: List[Dict[str, Any]] = [] + command_matches = list(LATEX_COMMAND_RE.finditer(page_text)) + run_start: Optional[int] = None + run_end: Optional[int] = None + run_commands: List[str] = [] + previous_end = 0 + + def flush_run() -> None: + if run_start is None or run_end is None or not run_commands: + return + counts = Counter(run_commands) + if ( + len(run_commands) >= LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS + and len(counts) <= 3 + and max(counts.values(), default=0) >= LATEX_SEGMENT_EXACT_RUN_MIN + ): + labeled_spans.append( + { + "start": run_start, + 
def _find_internal_latex_small_vocab_command_spans(
    page_text: str,
    segments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Label long runs of a few repeated LaTeX commands inside each segment.

    A run starts at a command from ``LATEX_INTERNAL_SMALL_VOCAB_COMMANDS`` and
    continues while further such commands follow with no alphanumeric
    character in between; any other command ends it. Runs are reported when
    they hold at least ``LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS``
    commands, use at most three distinct commands, and one command occurs at
    least ``LATEX_SEGMENT_EXACT_RUN_MIN`` times. Offsets are shifted by the
    segment's ``start``. ``page_text`` is not consulted.
    """
    found: List[Dict[str, Any]] = []
    for segment in segments:
        body = str(segment["text"])

        # Pass 1: split the command stream into runs of [start, end, commands].
        runs: List[List[Any]] = []
        open_run: Optional[List[Any]] = None
        cursor = 0
        for hit in LATEX_COMMAND_RE.finditer(body):
            token = hit.group(0)
            in_vocab = token in LATEX_INTERNAL_SMALL_VOCAB_COMMANDS
            gap_is_clean = not any(ch.isalnum() for ch in body[cursor : hit.start()])
            if in_vocab and open_run is not None and gap_is_clean:
                open_run[1] = hit.end()
                open_run[2].append(token)
            else:
                if open_run is not None:
                    runs.append(open_run)
                open_run = [hit.start(), hit.end(), [token]] if in_vocab else None
            cursor = hit.end()
        if open_run is not None:
            runs.append(open_run)

        # Pass 2: keep only the runs that satisfy the vocabulary thresholds.
        for run_start, run_end, commands in runs:
            counts = Counter(commands)
            if len(commands) < LATEX_INTERNAL_SMALL_VOCAB_RUN_MIN_COMMANDS:
                continue
            if len(counts) > 3:
                continue
            if max(counts.values(), default=0) < LATEX_SEGMENT_EXACT_RUN_MIN:
                continue
            found.append(
                {
                    "start": int(segment["start"]) + run_start,
                    "end": int(segment["start"]) + run_end,
                    "match_types": ["latex_repeat"],
                    "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"],
                    "kind": "internal_small_vocab_command_run",
                    "item_count": len(commands),
                }
            )

    return found


def _find_local_latex_slot_progression_spans(
    page_text: str,
    segments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Label runs of same-skeleton segments that form a slot progression.

    Local groups shorter than ``LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN`` are
    skipped. Inside a group, consecutive segments with equal ``skeleton_key``
    form a run; a run is reported as kind ``"slot_progression"`` when it is
    long enough, its skeleton has at least ``LATEX_LONG_SEGMENT_MIN_NORM``
    characters, it holds two or more distinct ``exact_key`` values and
    ``_is_latex_slot_progression_run`` accepts it.
    """
    found: List[Dict[str, Any]] = []
    for group in _latex_local_groups(page_text, segments):
        if len(group) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN:
            continue

        # Bucket consecutive segments by skeleton key.
        buckets: List[Tuple[str, List[Dict[str, Any]]]] = []
        for segment in group:
            skeleton = str(segment["skeleton_key"])
            if buckets and buckets[-1][0] == skeleton:
                buckets[-1][1].append(segment)
            else:
                buckets.append((skeleton, [segment]))

        for skeleton, run in buckets:
            distinct_exact = {str(member["exact_key"]) for member in run}
            if len(run) < LATEX_SEGMENT_SLOT_PROGRESS_RUN_MIN:
                continue
            if len(skeleton) < LATEX_LONG_SEGMENT_MIN_NORM:
                continue
            if len(distinct_exact) < 2:
                continue
            if not _is_latex_slot_progression_run(run):
                continue
            found.append(
                {
                    "start": int(run[0]["start"]),
                    "end": int(run[-1]["end"]),
                    "match_types": ["latex_repeat"],
                    "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"],
                    "kind": "slot_progression",
                    "item_count": len(run),
                }
            )

    return found
analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + if ( + "$" not in analysis_text + and "\\" not in analysis_text + and "" not in analysis_text + and "" not in analysis_text + ): + return [] + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + labeled_spans: List[Dict[str, Any]] = [] + + for wrapper_pattern in (LATEX_TEXT_WRAPPER_BODY_RE, LATEX_TEXT_WRAPPER_OPEN_BODY_RE): + for match in wrapper_pattern.finditer(analysis_text): + body = match.group(1) + command_tokens = LATEX_COMMAND_RE.findall(body) + if len(command_tokens) < 16: + continue + if len(set(command_tokens)) > 4: + continue + labeled_spans.append( + { + "start": match.start(1), + "end": match.end(1), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + for match in HTML_MATH_MARKUP_CLUSTER_RE.finditer(analysis_text): + labeled_spans.append( + { + "start": match.start(), + "end": match.end(), + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + ) + + labeled_spans.extend(_find_raw_latex_small_vocab_command_spans(analysis_text)) + + segments = _extract_latex_segments(analysis_text) + for segment in segments: + raw_text = str(segment["text"]) + segment["exact_key"] = _normalize_latex_segment_exact(raw_text) + segment["skeleton_key"] = _normalize_latex_segment_skeleton(raw_text) + segment["short_atom_block_key"] = _latex_short_atom_block_key(raw_text) + + labeled_spans.extend(_find_local_latex_segment_block_spans(page_text, segments)) + labeled_spans.extend(_find_local_latex_short_atom_block_spans(page_text, segments)) + labeled_spans.extend(_find_internal_latex_small_vocab_command_spans(page_text, segments)) + + for segment in segments: + normalized_text, raw_map = _normalize_latex_repeat_with_map(segment["text"]) + normalized_spans = _find_word_repeat_spans( + normalized_text, + rep_threshold=rep_threshold, + min_period=min_period, + window=window, + ) + 
for span in normalized_spans: + if span["end"] <= span["start"] or span["start"] >= len(raw_map): + continue + start = segment["start"] + raw_map[span["start"]] + end = segment["start"] + raw_map[span["end"] - 1] + 1 + raw_span = page_text[start:end] + if not _is_suspicious_internal_latex_repeat(raw_span): + continue + labeled_span = { + "start": start, + "end": end, + "period": span["period"], + "repetitions": span["repetitions"], + "tail_chars": span["tail_chars"], + "match_types": ["latex_repeat"], + "category": MATCH_CATEGORY_BY_TYPE["latex_repeat"], + } + if _is_latex_short_atom_chain_segment(raw_span): + labeled_span["kind"] = "short_atom_chain_segment" + labeled_spans.append(labeled_span) + return labeled_spans + + +def _find_latex_slot_progression_spans( + page_text: str, + *, + blocked_spans: List[Dict[str, Any]], + analysis_text: Optional[str] = None, +) -> List[Dict[str, Any]]: + if analysis_text is None: + analysis_text = _filter_tables_preserve_layout(page_text) + analysis_text = _blank_existing_match_regions_preserve_layout(analysis_text) + analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans) + + segments = _extract_latex_segments(analysis_text) + for segment in segments: + raw_text = str(segment["text"]) + segment["exact_key"] = _normalize_latex_segment_exact(raw_text) + segment["skeleton_key"] = _normalize_latex_segment_skeleton(raw_text) + segment["numeric_slots"] = _extract_latex_numeric_slots(raw_text) + + return _find_local_latex_slot_progression_spans(page_text, segments) + + +def _shared_repeat_match_type(segment: str) -> Optional[str]: + if not segment: + return None + has_letter = any(ch.isalpha() for ch in segment) + has_digit = any(ch.isdigit() for ch in segment) + if has_letter: + return "word_repeat" + if has_digit: + return "numeric_repeat" + return None + + +def _merge_labeled_raw_spans(text: str, spans: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + if not spans: + return [] + + text_len = len(text) + 
sanitized_spans: List[Dict[str, Any]] = [] + for span in spans: + start = max(0, int(span["start"])) + end = min(text_len, int(span["end"])) + if start >= text_len or end <= start: + continue + sanitized = dict(span) + sanitized["start"] = start + sanitized["end"] = end + sanitized_spans.append(sanitized) + if not sanitized_spans: + return [] + + spans = sorted(sanitized_spans, key=lambda item: (item["start"], item["end"])) + merged: List[Dict[str, Any]] = [] + for span in spans: + if not merged: + merged.append(dict(span)) + continue + + previous = merged[-1] + overlaps = span["start"] <= previous["end"] + close_gap = ( + not overlaps + and previous["category"] == span["category"] + and previous["category"] != "table" + and _gap_has_at_most_n_nonwhitespace_chars( + text, + previous["end"], + span["start"], + WORD_REPEAT_MERGE_MAX_NONWHITESPACE_GAP, + ) + ) + if overlaps or close_gap: + same_single_type = previous.get("match_types", []) == span.get("match_types", []) + same_kind = previous.get("kind") == span.get("kind") + previous["start"] = min(previous["start"], span["start"]) + previous["end"] = max(previous["end"], span["end"]) + previous["match_types"] = sorted( + set(previous.get("match_types", [])) | set(span.get("match_types", [])) + ) + if ( + previous.get("kind") is None + and span.get("kind") is not None + and previous.get("match_types", []) == span.get("match_types", []) + ): + previous["kind"] = span.get("kind") + if "period" in span: + previous["period"] = min(previous.get("period", span["period"]), span["period"]) + if "repetitions" in span: + previous["repetitions"] = max( + previous.get("repetitions", span["repetitions"]), + span["repetitions"], + ) + if "tail_chars" in span: + previous["tail_chars"] = max( + previous.get("tail_chars", 0), + span.get("tail_chars", 0), + ) + if ( + same_single_type + and same_kind + and previous.get("item_count") is not None + and span.get("item_count") is not None + ): + previous["item_count"] = 
int(previous["item_count"]) + int(span["item_count"]) + continue + merged.append(dict(span)) + return merged + + +def _summarize_merged_labeled_spans( + merged_spans: List[Dict[str, Any]], +) -> Tuple[List[str], int, int, int, int, int]: + seen_types: Set[str] = set() + numeric_count = 0 + word_count = 0 + latex_count = 0 + table_count = 0 + hybrid_count = 0 + for span in merged_spans: + seen_types.update(span.get("match_types", [])) + if span["category"] == "numeric": + numeric_count += 1 + elif span["category"] == "word": + word_count += 1 + elif span["category"] == "latex": + latex_count += 1 + elif span["category"] == "table": + table_count += 1 + elif span["category"] == "hybrid": + hybrid_count += 1 + return ( + sorted(seen_types), + numeric_count, + word_count, + latex_count, + table_count, + hybrid_count, + ) + + +def _render_page_from_merged_labeled_spans( + page_text: str, + merged_spans: List[Dict[str, Any]], + *, + mode: str, +) -> str: + if not merged_spans: + return _replace_html_tables_with_markdown(page_text) + + parts: List[str] = [] + pos = 0 + for span in merged_spans: + start = span["start"] + end = span["end"] + if start > pos: + parts.append(_replace_html_tables_with_markdown(page_text[pos:start])) + match_types = list(span.get("match_types", [])) + if mode == "debug": + open_tag = f'") + else: + if match_types == ["table_repeat"]: + parts.append( + _render_table_html_for_clean( + page_text[start:end], + match_kind=span.get("kind"), + ) + ) + else: + parts.append(_clean_fill_for_removed_span(page_text, start, end)) + pos = end + if pos < len(page_text): + parts.append(_replace_html_tables_with_markdown(page_text[pos:])) + return "".join(parts) + + +def _render_page_with_labeled_spans_result( + page_text: str, + spans: List[Dict[str, Any]], + *, + mode: str = "debug", +) -> Dict[str, Any]: + if mode not in {"debug", "clean"}: + raise ValueError(f"Unsupported OCR render mode: {mode}") + merged_spans = _merge_labeled_raw_spans(page_text, spans) + 
( + page_types, + numeric_count, + word_count, + latex_count, + table_count, + hybrid_count, + ) = _summarize_merged_labeled_spans(merged_spans) + rendered_page = _render_page_from_merged_labeled_spans( + page_text, + merged_spans, + mode=mode, + ) + return { + "rendered_page": rendered_page, + "merged_spans": merged_spans, + "page_types": page_types, + "page_numeric_count": numeric_count, + "page_word_count": word_count, + "page_latex_count": latex_count, + "page_table_count": table_count, + "page_hybrid_count": hybrid_count, + } + + +def _render_page_with_labeled_spans( + page_text: str, + spans: List[Dict[str, Any]], + *, + mode: str = "debug", +) -> Tuple[str, List[str], int, int, int, int, int]: + """Render one page from a shared span plan. + + `debug` and `clean` intentionally share the exact same merged span plan. + The only difference is how that plan is rendered: + - debug wraps the matched source surface in `` tags + - clean removes or rewrites the matched surface according to policy + + Keeping both modes on one renderer prevents the real cleaner from drifting + away from the reviewed debug output. 
+ """ + result = _render_page_with_labeled_spans_result(page_text, spans, mode=mode) + return ( + str(result["rendered_page"]), + list(result["page_types"]), + int(result["page_numeric_count"]), + int(result["page_word_count"]), + int(result["page_latex_count"]), + int(result["page_table_count"]), + int(result["page_hybrid_count"]), + ) + + +def _annotate_page_with_labeled_spans( + page_text: str, + spans: List[Dict[str, Any]], +) -> Tuple[str, List[str], int, int, int, int, int]: + return _render_page_with_labeled_spans(page_text, spans, mode="debug") + + +def _count_hybrid_matches_in_page(page_text: str, spans: List[Dict[str, Any]]) -> int: + merged_spans = _merge_labeled_raw_spans(page_text, spans) + return sum(1 for span in merged_spans if span.get("category") == "hybrid") + + +def _utf8_prefix_byte_offsets(text: str) -> List[int]: + offsets = [0] + total = 0 + for char in text: + total += len(char.encode("utf-8")) + offsets.append(total) + return offsets + + +def _span_repeat_count(span: Dict[str, Any]) -> Optional[int]: + if span.get("repetitions") is not None: + return int(span["repetitions"]) + if span.get("item_count") is not None: + return int(span["item_count"]) + if span.get("duplicate_rows") is not None: + return int(span["duplicate_rows"]) + return None + + +def _build_match_index_rows( + page_text: str, + merged_spans: List[Dict[str, Any]], + *, + source_path: Path, + page_number: int, + debug_output_path: Optional[Path] = None, +) -> List[Dict[str, Any]]: + if not merged_spans: + return [] + byte_offsets = _utf8_prefix_byte_offsets(page_text) + rows: List[Dict[str, Any]] = [] + for match_index, span in enumerate(merged_spans, start=1): + start = int(span["start"]) + end = int(span["end"]) + match_text = page_text[start:end] + rows.append( + { + "match_id": f"{source_path.stem}:page:{page_number}:match:{match_index}", + "source_path": str(source_path), + "source_stem": source_path.stem, + "debug_output_path": None if debug_output_path is None else 
def _find_labeled_shared_repeat_spans(
    page_text: str,
    *,
    blocked_spans: List[Dict[str, Any]],
    rep_threshold: int,
    min_period: int,
    window: int,
    analysis_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Find word/numeric repeats on a page and label each with its match type.

    When ``analysis_text`` is omitted it is built from ``page_text`` by
    filtering tables and LaTeX and blanking existing match regions;
    ``blocked_spans`` are always blanked. If the Rust extension exposes
    ``find_labeled_shared_repeat_spans`` it does the whole job. Otherwise the
    text is normalized, scanned with ``_find_word_repeat_spans`` and each hit
    is mapped back to raw offsets through the normalization map.
    """
    if analysis_text is None:
        analysis_text = _blank_existing_match_regions_preserve_layout(
            _filter_latex_preserve_layout(_filter_tables_preserve_layout(page_text))
        )
    analysis_text = _blank_raw_spans_preserve_layout(analysis_text, blocked_spans)

    native_finder = getattr(
        _get_word_repeat_rust_module(),
        "find_labeled_shared_repeat_spans",
        None,
    )
    if native_finder is not None:
        native_hits = native_finder(
            analysis_text,
            int(rep_threshold),
            int(min_period),
            int(window),
        )
        return [
            {
                "start": int(hit["start"]),
                "end": int(hit["end"]),
                "period": int(hit["period"]),
                "repetitions": int(hit["repetitions"]),
                "tail_chars": int(hit["tail_chars"]),
                "match_types": [str(hit["match_type"])],
                "category": MATCH_CATEGORY_BY_TYPE[str(hit["match_type"])],
            }
            for hit in native_hits
        ]

    normalized_text, raw_map = _normalize_alnum_with_map_skip_tags(analysis_text)
    candidates = _find_word_repeat_spans(
        normalized_text,
        rep_threshold=rep_threshold,
        min_period=min_period,
        window=window,
    )

    labeled: List[Dict[str, Any]] = []
    for candidate in candidates:
        norm_start = candidate["start"]
        norm_end = candidate["end"]
        # Skip empty spans and spans that start beyond the offset map.
        if norm_end <= norm_start or norm_start >= len(raw_map):
            continue
        label = _shared_repeat_match_type(normalized_text[norm_start:norm_end])
        if label is None:
            continue
        labeled.append(
            {
                "start": raw_map[norm_start],
                # Map the last covered character, then step one past it.
                "end": raw_map[norm_end - 1] + 1,
                "period": candidate["period"],
                "repetitions": candidate["repetitions"],
                "tail_chars": candidate["tail_chars"],
                "match_types": [label],
                "category": MATCH_CATEGORY_BY_TYPE[label],
            }
        )
    return labeled
+ page_without_tables = _filter_tables_preserve_layout(page_text) + page_without_tables_existing = _blank_existing_match_regions_preserve_layout(page_without_tables) + page_without_tables_latex = _filter_latex_preserve_layout(page_without_tables) + page_without_tables_latex_existing = _blank_existing_match_regions_preserve_layout( + page_without_tables_latex + ) + + numeric_start = time.perf_counter() + numeric_spans = [ + { + "start": int(item["start"]), + "end": int(item["end"]), + "match_types": [str(item["match_type"])], + "category": MATCH_CATEGORY_BY_TYPE[str(item["match_type"])], + } + for item in noise_mod.find_numeric_debug_page_spans( + page_without_tables_latex, + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + ) + ] + numeric_elapsed = time.perf_counter() - numeric_start + + latex_start = time.perf_counter() + latex_spans = _find_latex_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans, + rep_threshold=int(word_rep_threshold), + min_period=int(word_min_period), + window=int(word_window), + analysis_text=page_without_tables_existing, + ) + latex_elapsed = time.perf_counter() - latex_start + + hybrid_start = time.perf_counter() + hybrid_spans = _find_hybrid_numbered_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans + latex_spans, + analysis_text=page_without_tables_latex_existing, + ) + hybrid_elapsed = time.perf_counter() - hybrid_start + + shared_start = time.perf_counter() + shared_spans = _find_labeled_shared_repeat_spans( + page_text, + blocked_spans=table_spans + numeric_spans + latex_spans + hybrid_spans, + rep_threshold=int(word_rep_threshold), + min_period=int(word_min_period), + window=int(word_window), + analysis_text=page_without_tables_latex_existing, + ) + shared_elapsed = time.perf_counter() - shared_start + + page_total_time = time.perf_counter() - page_start + return { + "spans": table_spans + numeric_spans + latex_spans + hybrid_spans + shared_spans, + 
"page_noise_metrics": page_noise_metrics, + "char_eval_seconds": char_eval_elapsed, + "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + + +def _render_combined_ocr_page( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + mode: str = "debug", +) -> Dict[str, Any]: + """Analyze one OCR page in the shared ownership order. + + The ordering is a policy decision, not an implementation accident: + 1. tables first, because table shells distort every later text pass + 2. numeric second, because numeric progressions should not be stolen by + generic word repetition + 3. LaTeX and hybrid structural passes next, because they operate on more + specialized local structure + 4. shared text repetition last, on the remaining visible surface only + + That ownership model keeps the matcher family specific and reduces the + false positives that appear when a single fuzzy text matcher sees + everything at once. 
+ """ + analysis = _analyze_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + ) + render_result = _render_page_with_labeled_spans_result( + page_text, + list(analysis["spans"]), + mode=mode, + ) + return { + "annotated_page": render_result["rendered_page"], + "merged_spans": render_result["merged_spans"], + "page_types": render_result["page_types"], + "page_numeric_count": render_result["page_numeric_count"], + "page_word_count": render_result["page_word_count"], + "page_latex_count": render_result["page_latex_count"], + "page_table_count": render_result["page_table_count"], + "page_hybrid_count": render_result["page_hybrid_count"], + **analysis, + } + + +def _render_combined_ocr_page_modes( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + modes: Iterable[str], +) -> Dict[str, Any]: + analysis = _analyze_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + ) + merged_spans = _merge_labeled_raw_spans(page_text, list(analysis["spans"])) + ( + page_types, + page_numeric_count, + page_word_count, + page_latex_count, + page_table_count, + page_hybrid_count, + ) = _summarize_merged_labeled_spans(merged_spans) + rendered_pages = { + str(mode): _render_page_from_merged_labeled_spans(page_text, merged_spans, mode=str(mode)) + for mode in modes + } + return { + "rendered_pages": rendered_pages, + "merged_spans": merged_spans, + "page_types": page_types, + "page_numeric_count": 
page_numeric_count, + "page_word_count": page_word_count, + "page_latex_count": page_latex_count, + "page_table_count": page_table_count, + "page_hybrid_count": page_hybrid_count, + **analysis, + } + + +def _render_combined_ocr_debug_page( + page_text: str, + *, + noise_mod: Any, + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> Dict[str, Any]: + return _render_combined_ocr_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + mode="debug", + ) + + +def _process_combined_ocr_document( + source_path: Path, + *, + clean_output_path: Optional[Path], + debug_output_path: Optional[Path], + noise_mod: Optional[Any], + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, + include_page_metrics: bool, + include_match_index: bool, +) -> Dict[str, Any]: + if noise_mod is None: + noise_mod = _get_combined_ocr_worker_noise_mod() + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + cleaned_pages: List[str] = [] + debug_pages: List[str] = [] + matched_page_count = 0 + table_match_count = 0 + numeric_match_count = 0 + latex_match_count = 0 + hybrid_match_count = 0 + word_match_count = 0 + doc_match_types: Set[str] = set() + page_metric_rows: List[Dict[str, Any]] = [] + match_index_rows: List[Dict[str, Any]] = [] + + for page_index, page in enumerate(pages, start=1): + if clean_output_path is not None and debug_output_path is not None: + page_result = _render_combined_ocr_page_modes( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + 
min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + modes=("clean", "debug"), + ) + cleaned_page = str(page_result["rendered_pages"]["clean"]) + debug_page = str(page_result["rendered_pages"]["debug"]) + elif debug_output_path is not None: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="debug", + ) + cleaned_page = "" + debug_page = str(page_result["annotated_page"]) + else: + page_result = _render_combined_ocr_page( + page, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + mode="clean", + ) + cleaned_page = str(page_result["annotated_page"]) + debug_page = "" + + merged_spans = list(page_result.get("merged_spans", [])) + page_types = list(page_result["page_types"]) + page_numeric_count = int(page_result["page_numeric_count"]) + page_word_count = int(page_result["page_word_count"]) + page_latex_count = int(page_result["page_latex_count"]) + page_table_count = int(page_result["page_table_count"]) + page_hybrid_count = int(page_result["page_hybrid_count"]) + page_noise_metrics = dict(page_result["page_noise_metrics"]) + char_eval_elapsed = float(page_result["char_eval_seconds"]) + table_elapsed = float(page_result["table_seconds"]) + numeric_elapsed = float(page_result["numeric_seconds"]) + latex_elapsed = float(page_result["latex_seconds"]) + hybrid_elapsed = float(page_result["hybrid_seconds"]) + shared_elapsed = 
float(page_result["shared_repeat_seconds"]) + page_total_time = float(page_result["total_page_seconds"]) + + if clean_output_path is not None: + cleaned_pages.append(cleaned_page) + if debug_output_path is not None: + debug_pages.append(debug_page) + + page_match_total = ( + page_table_count + page_numeric_count + page_word_count + page_latex_count + page_hybrid_count + ) + if page_match_total: + matched_page_count += 1 + table_match_count += page_table_count + numeric_match_count += page_numeric_count + latex_match_count += page_latex_count + hybrid_match_count += page_hybrid_count + word_match_count += page_word_count + doc_match_types.update(page_types) + + if include_page_metrics: + page_metric_rows.append( + { + "source_path": str(source_path), + "source_stem": source_path.stem, + "page_number": page_index, + "page_index_in_file": page_index, + "total_chars": int(page_noise_metrics.get("total_chars", 0)), + "bad_char_count": int(page_noise_metrics.get("bad_char_count", 0)), + "bad_char_ratio": float(page_noise_metrics.get("bad_char_ratio", 0.0)), + "control_count": int(page_noise_metrics.get("control_count", 0)), + "private_use_count": int(page_noise_metrics.get("private_use_count", 0)), + "cjk_count": int(page_noise_metrics.get("cjk_count", 0)), + "replacement_count": int(page_noise_metrics.get("replacement_count", 0)), + "table_match_count": page_table_count, + "numeric_match_count": page_numeric_count, + "latex_match_count": page_latex_count, + "hybrid_match_count": page_hybrid_count, + "word_match_count": page_word_count, + "match_types": ",".join(page_types), + "char_eval_seconds": char_eval_elapsed, + "table_seconds": table_elapsed, + "numeric_seconds": numeric_elapsed, + "latex_seconds": latex_elapsed, + "hybrid_seconds": hybrid_elapsed, + "shared_repeat_seconds": shared_elapsed, + "total_page_seconds": page_total_time, + } + ) + + if include_match_index: + match_index_rows.extend( + _build_match_index_rows( + page, + merged_spans, + 
source_path=source_path, + page_number=page_index, + debug_output_path=debug_output_path, + ) + ) + + if clean_output_path is not None: + clean_output_path.write_text(PAGE_SPLIT_MARKER.join(cleaned_pages), encoding="utf-8") + if debug_output_path is not None: + debug_output_path.write_text(PAGE_SPLIT_MARKER.join(debug_pages), encoding="utf-8") + + output_path = debug_output_path or clean_output_path + row = { + "source_path": str(source_path), + "output_path": None if output_path is None else str(output_path), + "clean_output_path": None if clean_output_path is None else str(clean_output_path), + "debug_output_path": None if debug_output_path is None else str(debug_output_path), + "source_stem": source_path.stem, + "base_stem": canonical_stem(source_path.stem), + "page_count": len(pages), + "matched_page_count": matched_page_count, + "table_match_count": table_match_count, + "numeric_match_count": numeric_match_count, + "latex_match_count": latex_match_count, + "hybrid_match_count": hybrid_match_count, + "word_match_count": word_match_count, + "match_count": int(len(match_index_rows)), + "match_types": ",".join(sorted(doc_match_types)), + } + return { + "row": row, + "page_metric_rows": page_metric_rows, + "match_index_rows": match_index_rows, + } + + +def _process_combined_ocr_debug_document( + source_path: Path, + output_path: Path, + *, + noise_mod: Optional[Any], + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> Dict[str, Any]: + return _process_combined_ocr_document( + source_path, + clean_output_path=None, + debug_output_path=output_path, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + include_page_metrics=True, + include_match_index=True, + ) + + +def 
_process_combined_ocr_clean_document( + source_path: Path, + output_path: Path, + *, + noise_mod: Optional[Any], + min_progress_steps: int, + min_repeat_steps: int, + min_same_digit_steps: int, + word_rep_threshold: int, + word_min_period: int, + word_window: int, +) -> None: + _process_combined_ocr_document( + source_path, + clean_output_path=output_path, + debug_output_path=None, + noise_mod=noise_mod, + min_progress_steps=min_progress_steps, + min_repeat_steps=min_repeat_steps, + min_same_digit_steps=min_same_digit_steps, + word_rep_threshold=word_rep_threshold, + word_min_period=word_min_period, + word_window=word_window, + include_page_metrics=False, + include_match_index=False, + ) + + +def _process_combined_ocr_debug_document_job( + job: Tuple[str, str, int, int, int, int, int, int] +) -> Dict[str, Any]: + ( + source_path_str, + output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + return _process_combined_ocr_debug_document( + Path(source_path_str), + Path(output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + +def _process_combined_ocr_clean_document_job( + job: Tuple[str, str, int, int, int, int, int, int] +) -> None: + ( + source_path_str, + output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + _process_combined_ocr_clean_document( + Path(source_path_str), + Path(output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + 
word_window=int(word_window), + ) + + +def _process_combined_ocr_dual_document_job( + job: Tuple[str, str, str, int, int, int, int, int, int] +) -> Dict[str, Any]: + ( + source_path_str, + clean_output_path_str, + debug_output_path_str, + min_progress_steps, + min_repeat_steps, + min_same_digit_steps, + word_rep_threshold, + word_min_period, + word_window, + ) = job + return _process_combined_ocr_document( + Path(source_path_str), + clean_output_path=Path(clean_output_path_str), + debug_output_path=Path(debug_output_path_str), + noise_mod=None, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + include_page_metrics=True, + include_match_index=True, + ) + + +def _summarize_metric(values: List[float]) -> Dict[str, float]: + if not values: + return {"count": 0, "p50": 0.0, "p95": 0.0, "max": 0.0} + array = np.array(values, dtype=float) + return { + "count": int(array.size), + "p50": float(np.percentile(array, 50)), + "p95": float(np.percentile(array, 95)), + "max": float(array.max()), + } + class CleanPhaseMixin: @staticmethod @@ -37,27 +2897,54 @@ def _project_root() -> Path: return candidate return here.parents[2] - def _load_rust_extension(self, module_name: str, manifest_relative: str): - """Import a Rust extension, building it with maturin if necessary.""" + def _load_rust_extension( + self, + module_name: str, + manifest_relative: str, + *, + required_attrs: Optional[Iterable[str]] = None, + ): + """Import a Rust extension, building it with maturin if necessary. 
+ + The load path is intentionally import-first: + - fast path: import an already-built extension and return immediately + - fallback: build in place only if the module is missing or incomplete + + That keeps ordinary OCR runs from paying a `maturin develop` startup tax + in every fresh process while still letting a developer bootstrap a local + checkout without separate setup steps. + """ import importlib - try: - return importlib.import_module(module_name) - except ModuleNotFoundError: - self.logger.warning( - "Rust extension %s missing; attempting in-place build via maturin …", - module_name, - ) + required = tuple(required_attrs or ()) + + def _missing_attrs(module: Any) -> List[str]: + return [attr for attr in required if not hasattr(module, attr)] + + def _build_extension_once() -> None: + if module_name in _RUST_EXTENSION_PREBUILD_ATTEMPTED: + return + _RUST_EXTENSION_PREBUILD_ATTEMPTED.add(module_name) root_dir = self._project_root() manifest = root_dir / manifest_relative if not manifest.exists(): - raise RuntimeError( - f"Cannot locate Cargo manifest for {module_name} at {manifest}" + return + build_env = os.environ.copy() + if sys.prefix != getattr(sys, "base_prefix", sys.prefix): + build_env.setdefault("VIRTUAL_ENV", sys.prefix) + venv_bin = str(Path(sys.prefix) / "bin") + build_env["PATH"] = ( + f"{venv_bin}:{build_env['PATH']}" + if build_env.get("PATH") + else venv_bin ) try: subprocess.run( [sys.executable, "-m", "pip", "install", "maturin>=1.5,<2.0"], check=True, + env=build_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) subprocess.run( [ @@ -70,12 +2957,114 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str): str(manifest), ], check=True, + env=build_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) - return importlib.import_module(module_name) + importlib.invalidate_caches() except Exception as build_err: + self.logger.debug( + "Rust prebuild for %s skipped or failed: %s", + module_name, + 
build_err, + ) + + def _import_module_with_fallback(): + candidates = [module_name] + if "." not in module_name: + candidates.append(f"{module_name}.{module_name}") + + last_error: Optional[Exception] = None + for candidate in candidates: + try: + return importlib.import_module(candidate) + except Exception as err: # pragma: no cover - import surface varies by wheel layout + last_error = err + if last_error is not None: + raise last_error + raise ModuleNotFoundError(module_name) + + needs_build = False + try: + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if not missing: + return _prime_word_repeat_rust_module(module_name, module) + self.logger.warning( + "Rust extension %s is missing required attributes %s; attempting in-place build via maturin …", + module_name, + ", ".join(missing), + ) + needs_build = True + except ModuleNotFoundError: + self.logger.warning( + "Rust extension %s missing; attempting in-place build via maturin …", + module_name, + ) + needs_build = True + + if needs_build: + _build_extension_once() + try: + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if not missing: + return _prime_word_repeat_rust_module(module_name, module) + except ModuleNotFoundError: + pass + + if not needs_build: + raise RuntimeError(f"Unexpected load state for Rust extension {module_name}") + + root_dir = self._project_root() + manifest = root_dir / manifest_relative + if not manifest.exists(): + raise RuntimeError( + f"Cannot locate Cargo manifest for {module_name} at {manifest}" + ) + try: + build_env = os.environ.copy() + if sys.prefix != getattr(sys, "base_prefix", sys.prefix): + build_env.setdefault("VIRTUAL_ENV", sys.prefix) + venv_bin = str(Path(sys.prefix) / "bin") + build_env["PATH"] = ( + f"{venv_bin}:{build_env['PATH']}" + if build_env.get("PATH") + else venv_bin + ) + subprocess.run( + [sys.executable, "-m", "pip", "install", "maturin>=1.5,<2.0"], + check=True, + env=build_env, + ) + 
subprocess.run( + [ + sys.executable, + "-m", + "maturin", + "develop", + "--release", + "--manifest-path", + str(manifest), + ], + check=True, + env=build_env, + ) + importlib.invalidate_caches() + sys.modules.pop(module_name, None) + if "." not in module_name: + sys.modules.pop(f"{module_name}.{module_name}", None) + module = _import_module_with_fallback() + missing = _missing_attrs(module) + if missing: raise RuntimeError( - f"Automatic build of {module_name} failed: {build_err}" + f"Built {module_name} but it is still missing required attributes: {missing}" ) + return _prime_word_repeat_rust_module(module_name, module) + except Exception as build_err: + raise RuntimeError( + f"Automatic build of {module_name} failed: {build_err}" + ) def _load_metrics_dataframe( self, parquet_path: Path, filenames: Optional[Iterable[str]] = None @@ -115,6 +3104,30 @@ def _merge_metric_dataframe( base_idx.update(update_idx) return base_idx.reset_index(drop=True) + def _resolve_clean_metrics_parquet(self, parquet_schema) -> Path: + parquet_path: Optional[Path] = self._get_cached_metadata_parquet() + if parquet_path is None: + existing_metadata = parquet_schema.find_metadata_parquet(self.input_dir) + if existing_metadata is not None: + parquet_path = self._cache_metadata_parquet(existing_metadata) + if parquet_path is None: + ensured = parquet_schema.ensure_metadata_parquet(self.output_dir) + if ensured is not None: + parquet_path = self._cache_metadata_parquet(ensured) + if parquet_path is None: + ensured = parquet_schema.ensure_metadata_parquet(self.input_dir) + if ensured is not None: + parquet_path = self._cache_metadata_parquet(ensured) + if parquet_path is None: + metadata_target = self.output_dir / "download_results" / "download_results.parquet" + self.logger.info( + "Cleaner: no metadata parquet found; will bootstrap %s when metrics become available.", + metadata_target, + ) + else: + metadata_target = parquet_path + return self._cache_metadata_parquet(metadata_target) + 
def clean( self, input_dir: Union[str, Path] = None, @@ -156,7 +3169,9 @@ def clean( self.ocr_model_dir = Path(ocr_model_dir) self._load_rust_extension( - "glossapi_rs_cleaner", "rust/glossapi_rs_cleaner/Cargo.toml" + "glossapi_rs_cleaner", + "rust/glossapi_rs_cleaner/Cargo.toml", + required_attrs=("run_complete_pipeline",), ) self.logger.info("Using compiled glossapi_rs_cleaner extension for fast cleaning") @@ -168,28 +3183,7 @@ def clean( # Prepare parquet helper parquet_schema = ParquetSchema({"url_column": self.url_column}) - parquet_path: Optional[Path] = self._get_cached_metadata_parquet() - if parquet_path is None: - existing_metadata = parquet_schema.find_metadata_parquet(self.input_dir) - if existing_metadata is not None: - parquet_path = self._cache_metadata_parquet(existing_metadata) - if parquet_path is None: - ensured = parquet_schema.ensure_metadata_parquet(self.output_dir) - if ensured is not None: - parquet_path = self._cache_metadata_parquet(ensured) - if parquet_path is None: - ensured = parquet_schema.ensure_metadata_parquet(self.input_dir) - if ensured is not None: - parquet_path = self._cache_metadata_parquet(ensured) - if parquet_path is None: - metadata_target = self.output_dir / "download_results" / "download_results.parquet" - self.logger.info( - "Cleaner: no metadata parquet found; will bootstrap %s when metrics become available.", - metadata_target, - ) - else: - metadata_target = parquet_path - parquet_path = self._cache_metadata_parquet(metadata_target) + parquet_path = self._resolve_clean_metrics_parquet(parquet_schema) import os records: list = [] # will hold metrics for parquet merge @@ -346,6 +3340,8 @@ def finalize(self) -> None: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + encoding="utf-8", + errors="replace", bufsize=1, ) try: @@ -445,7 +3441,9 @@ def finalize(self) -> None: try: self.logger.info("Scoring cleaned markdown files with glossapi_rs_noise …") noise_mod = self._load_rust_extension( - 
"glossapi_rs_noise", "rust/glossapi_rs_noise/Cargo.toml" + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("score_markdown_directory_detailed",), ) results = noise_mod.score_markdown_directory_detailed( str(self.cleaned_markdown_dir), os.cpu_count() @@ -700,6 +3698,991 @@ def _merge_reason(value: str) -> str: if write_cleaned_files: self.markdown_dir = self.cleaned_markdown_dir + def clean_ocr( + self, + input_dir: Union[str, Path] = None, + num_threads: int = None, + drop_bad: bool = False, + *, + min_repeat_run: int = 6, + write_cleaned_files: bool = True, + write_debug_files: bool = False, + debug_output_dir: Union[str, Path, None] = None, + min_progress_steps: int = 10, + min_repeat_steps: int = 8, + min_same_digit_steps: int = 10, + word_rep_threshold: int = 4, + word_min_period: int = 3, + word_window: int = 96, + ) -> None: + """Clean OCR markdown with the shared page loop and update OCR-noise metrics. + + The OCR profile keeps the existing canonical script metrics columns + (`percentage_greek`, `latin_percentage`, `polytonic_ratio`) and adds + OCR-specific noise diagnostics. 
The same combined page analyzer drives + both clean and debug outputs: + - clean mode writes pipeline-ready markdown to ``self.cleaned_markdown_dir`` + - debug mode writes annotated markdown and a structured match index under + ``debug_output_dir`` (default: ``self.output_dir / "debug"``) + """ + from glossapi.parquet_schema import ParquetSchema + + if input_dir is None: + input_dir = self.markdown_dir + else: + input_dir = Path(input_dir) + + parquet_schema = ParquetSchema({"url_column": self.url_column}) + parquet_path = self._resolve_clean_metrics_parquet(parquet_schema) + parquet_path.parent.mkdir(parents=True, exist_ok=True) + + noise_mod = self._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=( + "score_markdown_directory_ocr_profile", + "find_numeric_debug_page_spans", + "evaluate_page_character_noise", + ), + ) + n_threads = int(num_threads or os.cpu_count() or 4) + render_workers = _default_combined_ocr_render_workers( + noise_mod=noise_mod, + requested_workers=None, + max_workers=n_threads, + ) + md_files = sorted(input_dir.glob("*.md")) + debug_dir: Optional[Path] = None + debug_manifest_path: Optional[Path] = None + debug_page_metrics_path: Optional[Path] = None + debug_match_index_path: Optional[Path] = None + debug_summary_path: Optional[Path] = None + if write_debug_files: + debug_dir = Path(debug_output_dir) if debug_output_dir is not None else (self.output_dir / "debug") + if debug_dir.exists(): + shutil.rmtree(debug_dir) + debug_dir.mkdir(parents=True, exist_ok=True) + debug_manifest_path = debug_dir / "manifest.jsonl" + debug_page_metrics_path = debug_dir / "page_metrics.jsonl" + debug_match_index_path = debug_dir / "match_index.jsonl" + debug_summary_path = debug_dir / "summary.json" + + if write_cleaned_files: + if self.cleaned_markdown_dir.exists(): + shutil.rmtree(self.cleaned_markdown_dir) + self.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) + + if write_cleaned_files or 
write_debug_files: + mode_label = "clean+debug" if write_cleaned_files and write_debug_files else ("debug" if write_debug_files else "clean") + self.logger.info( + "Running shared OCR %s loop over %d markdown files (workers=%d)…", + mode_label, + len(md_files), + render_workers, + ) + + if write_debug_files: + rows: List[Dict[str, Any]] = [] + total_page_times: List[float] = [] + table_page_times: List[float] = [] + numeric_page_times: List[float] = [] + latex_page_times: List[float] = [] + shared_page_times: List[float] = [] + hybrid_page_times: List[float] = [] + char_eval_times: List[float] = [] + bad_char_ratios: List[float] = [] + + def _consume_debug_doc_result( + doc_result: Dict[str, Any], + *, + page_metrics_handle: Any, + match_index_handle: Any, + ) -> None: + rows.append(dict(doc_result["row"])) + for page_row in doc_result["page_metric_rows"]: + page_metrics_handle.write(json.dumps(page_row, ensure_ascii=False)) + page_metrics_handle.write("\n") + total_page_times.append(float(page_row["total_page_seconds"])) + table_page_times.append(float(page_row["table_seconds"])) + numeric_page_times.append(float(page_row["numeric_seconds"])) + latex_page_times.append(float(page_row["latex_seconds"])) + hybrid_page_times.append(float(page_row["hybrid_seconds"])) + shared_page_times.append(float(page_row["shared_repeat_seconds"])) + char_eval_times.append(float(page_row["char_eval_seconds"])) + bad_char_ratios.append(float(page_row["bad_char_ratio"])) + for match_row in doc_result["match_index_rows"]: + match_index_handle.write(json.dumps(match_row, ensure_ascii=False)) + match_index_handle.write("\n") + + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + if write_cleaned_files: + jobs = [ + ( + str(source_path), + str(self.cleaned_markdown_dir / source_path.name), + str(debug_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + 
int(word_window), + ) + for source_path in md_files + ] + else: + jobs = [ + ( + str(source_path), + str(debug_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + with debug_page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, debug_match_index_path.open("w", encoding="utf-8") as match_index_handle: + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + if write_cleaned_files: + iterator = executor.map(_process_combined_ocr_dual_document_job, jobs) + else: + iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) + for doc_result in iterator: + _consume_debug_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + else: + if write_cleaned_files: + def _run_dual_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_document( + source_path, + clean_output_path=self.cleaned_markdown_dir / source_path.name, + debug_output_path=debug_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + include_page_metrics=True, + include_match_index=True, + ) + run_doc = _run_dual_doc + else: + def _run_debug_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_debug_document( + source_path, + debug_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + 
word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + run_doc = _run_debug_doc + + with debug_page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, debug_match_index_path.open("w", encoding="utf-8") as match_index_handle: + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(run_doc, md_files): + _consume_debug_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + + with debug_manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + debug_summary = { + "doc_count": len(rows), + "matched_doc_count": sum(1 for row in rows if int(row["matched_page_count"]) > 0), + "matched_page_count": int(sum(int(row["matched_page_count"]) for row in rows)), + "match_count": int(sum(int(row.get("match_count", 0)) for row in rows)), + "table_match_count": int(sum(int(row["table_match_count"]) for row in rows)), + "numeric_match_count": int(sum(int(row["numeric_match_count"]) for row in rows)), + "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)), + "hybrid_match_count": int(sum(int(row["hybrid_match_count"]) for row in rows)), + "word_match_count": int(sum(int(row["word_match_count"]) for row in rows)), + "word_rep_threshold": int(word_rep_threshold), + "word_min_period": int(word_min_period), + "word_window": int(word_window), + "total_page_seconds": _summarize_metric(total_page_times), + "table_seconds": _summarize_metric(table_page_times), + "numeric_seconds": _summarize_metric(numeric_page_times), + "latex_seconds": _summarize_metric(latex_page_times), + "hybrid_seconds": _summarize_metric(hybrid_page_times), + "shared_repeat_seconds": _summarize_metric(shared_page_times), + "char_eval_seconds": _summarize_metric(char_eval_times), + "bad_char_ratio": 
_summarize_metric(bad_char_ratios), + } + debug_summary_path.write_text(json.dumps(debug_summary, ensure_ascii=False, indent=2), encoding="utf-8") + else: + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + jobs = [ + ( + str(source_path), + str(self.cleaned_markdown_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in md_files + ] + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + list(executor.map(_process_combined_ocr_clean_document_job, jobs)) + else: + def _run_clean_doc(source_path: Path) -> None: + _process_combined_ocr_clean_document( + source_path, + self.cleaned_markdown_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + with ThreadPoolExecutor(max_workers=render_workers) as executor: + list(executor.map(_run_clean_doc, md_files)) + + + self.logger.info( + "Scoring OCR markdown files with glossapi_rs_noise OCR profile on %d markdown files…", + len(md_files), + ) + + results = noise_mod.score_markdown_directory_ocr_profile( + str(input_dir), + n_threads, + int(min_repeat_run), + ) + df_updates = pd.DataFrame(list(results)) + if df_updates.empty: + self.good_files = [] + self.logger.info("OCR cleaning found no markdown files under %s", input_dir) + return + + df_updates["filename"] = df_updates["path"].apply( + lambda value: f"{Path(str(value)).stem}.pdf" + ) + df_updates["polytonic_ratio"] = pd.to_numeric( + df_updates["polytonic_ratio"], errors="coerce" + ).round(2) + 
df_updates["percentage_greek"] = pd.to_numeric( + df_updates["percentage_greek"], errors="coerce" + ).round(3) + df_updates["latin_percentage"] = pd.to_numeric( + df_updates["latin_percentage"], errors="coerce" + ).round(3) + df_updates["ocr_repeat_suspicious_line_ratio"] = pd.to_numeric( + df_updates["ocr_repeat_suspicious_line_ratio"], errors="coerce" + ).round(4) + df_updates["ocr_noise_flags"] = ( + df_updates["ocr_noise_flags"].fillna("").astype(str) + ) + + update_columns = [ + "filename", + "percentage_greek", + "latin_percentage", + "polytonic_ratio", + "ocr_noise_suspect", + "ocr_noise_flags", + "ocr_repeat_phrase_run_max", + "ocr_repeat_line_run_max", + "ocr_repeat_suspicious_line_count", + "ocr_repeat_suspicious_line_ratio", + ] + + df = self._load_metrics_dataframe(parquet_path, df_updates.get("filename")) + self._ensure_metric_columns( + df, + { + "filter": "ok", + "percentage_greek": pd.NA, + "latin_percentage": pd.NA, + "polytonic_ratio": pd.NA, + "ocr_noise_suspect": False, + "ocr_noise_flags": "", + "ocr_repeat_phrase_run_max": pd.NA, + "ocr_repeat_line_run_max": pd.NA, + "ocr_repeat_suspicious_line_count": pd.NA, + "ocr_repeat_suspicious_line_ratio": pd.NA, + }, + ) + df = self._merge_metric_dataframe(df, df_updates[update_columns]) + + if "filter" not in df.columns: + df["filter"] = "ok" + else: + df["filter"] = df["filter"].fillna("ok").astype(str) + + suspect_mask = df["ocr_noise_suspect"].fillna(False).astype(bool) + if bool(suspect_mask.any()): + current = df.loc[suspect_mask, "filter"].astype(str) + + def _append_ocr_noise(value: str) -> str: + if value == "ok" or not value: + return "ocr_noise" + tokens = [token for token in value.split(";") if token] + if "ocr_noise" not in tokens: + tokens.append("ocr_noise") + return ";".join(tokens) + + df.loc[suspect_mask, "filter"] = current.apply(_append_ocr_noise) + + parquet_schema.write_metadata_parquet(df, parquet_path) + self.logger.info("OCR metrics updated in %s", parquet_path) + + filenames = 
def clean_ocr_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Optional[Union[str, Path]] = None,
    num_threads: Optional[int] = None,
    *,
    min_repeat_run: int = 6,
    max_pages: Optional[int] = 1000,
    sample_seed: int = 0,
) -> List[Dict[str, Any]]:
    """Export page-level OCR debug files for repeated-pattern matches.

    Only pages that contain OCR repetition matches are exported. Each output
    page contains inline tags around the matched spans.

    Args:
        output_dir: Folder that receives the debug pages and ``manifest.jsonl``.
            Existing ``*.md`` files and any previous manifest in it are removed
            first.
        input_dir: Folder holding the markdown to scan; defaults to
            ``self.markdown_dir``.
        num_threads: Thread count handed to the Rust exporter; defaults to
            ``os.cpu_count()`` (or 4 when that is unavailable).
        min_repeat_run: Forwarded to ``export_ocr_match_debug_pages``.
        max_pages: Forwarded as-is; ``None`` is passed through unchanged.
        sample_seed: Forwarded to ``export_ocr_match_debug_pages``.

    Returns:
        One dict per exported page, identical to the rows written to the
        manifest.

    Raises:
        ValueError: If ``output_dir`` and ``input_dir`` are the same folder.
    """
    input_dir = self.markdown_dir if input_dir is None else Path(input_dir)
    output_dir = Path(output_dir)

    # The cleanup below unlinks every *.md in output_dir; refuse to run when
    # that folder is also the input, otherwise the sources would be deleted.
    if output_dir.resolve() == Path(str(input_dir)).resolve():
        raise ValueError(
            f"output_dir must differ from input_dir ({output_dir}): "
            "stale-file cleanup would delete the source markdown"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    for stale in output_dir.glob("*.md"):
        stale.unlink()
    manifest_path = output_dir / "manifest.jsonl"
    # Drop the old manifest up front so a failed export does not leave a
    # manifest that describes pages which were just removed.
    if manifest_path.exists():
        manifest_path.unlink()

    noise_mod = self._load_rust_extension(
        "glossapi_rs_noise",
        "rust/glossapi_rs_noise/Cargo.toml",
        required_attrs=("export_ocr_match_debug_pages",),
    )
    n_threads = int(num_threads or os.cpu_count() or 4)
    self.logger.info(
        "Exporting OCR debug matches from %s into %s with glossapi_rs_noise…",
        input_dir,
        output_dir,
    )

    # Convert the extension rows to plain dicts once; the same objects are
    # serialised to the manifest and returned to the caller.
    rows: List[Dict[str, Any]] = [
        dict(row)
        for row in noise_mod.export_ocr_match_debug_pages(
            str(input_dir),
            str(output_dir),
            n_threads,
            int(min_repeat_run),
            None if max_pages is None else int(max_pages),
            int(sample_seed),
        )
    ]

    with manifest_path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    self.logger.info(
        "Exported %d OCR debug pages with matches to %s",
        len(rows),
        output_dir,
    )
    return rows


def clean_ocr_numeric_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Optional[Union[str, Path]] = None,
    num_threads: Optional[int] = None,
    *,
    min_progress_steps: int = 10,
    min_repeat_steps: int = 8,
    min_same_digit_steps: int = 10,
    max_pages: Optional[int] = 1000,
    sample_seed: int = 0,
) -> List[Dict[str, Any]]:
    """Export page-level OCR debug files for numeric-only collapse patterns.

    Args:
        output_dir: Folder that receives the debug pages and ``manifest.jsonl``.
            Existing ``*.md`` files and any previous manifest in it are removed
            first.
        input_dir: Folder holding the markdown to scan; defaults to
            ``self.markdown_dir``.
        num_threads: Thread count handed to the Rust exporter; defaults to
            ``os.cpu_count()`` (or 4 when that is unavailable).
        min_progress_steps: Forwarded to ``export_numeric_match_debug_pages``.
        min_repeat_steps: Forwarded to ``export_numeric_match_debug_pages``.
        min_same_digit_steps: Forwarded to ``export_numeric_match_debug_pages``.
        max_pages: Forwarded as-is; ``None`` is passed through unchanged.
        sample_seed: Forwarded to ``export_numeric_match_debug_pages``.

    Returns:
        One dict per exported page, identical to the rows written to the
        manifest.

    Raises:
        ValueError: If ``output_dir`` and ``input_dir`` are the same folder.
    """
    input_dir = self.markdown_dir if input_dir is None else Path(input_dir)
    output_dir = Path(output_dir)

    # The cleanup below unlinks every *.md in output_dir; refuse to run when
    # that folder is also the input, otherwise the sources would be deleted.
    if output_dir.resolve() == Path(str(input_dir)).resolve():
        raise ValueError(
            f"output_dir must differ from input_dir ({output_dir}): "
            "stale-file cleanup would delete the source markdown"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    for stale in output_dir.glob("*.md"):
        stale.unlink()
    manifest_path = output_dir / "manifest.jsonl"
    if manifest_path.exists():
        manifest_path.unlink()

    noise_mod = self._load_rust_extension(
        "glossapi_rs_noise",
        "rust/glossapi_rs_noise/Cargo.toml",
        required_attrs=("export_numeric_match_debug_pages",),
    )
    n_threads = int(num_threads or os.cpu_count() or 4)
    self.logger.info(
        "Exporting OCR numeric debug matches from %s into %s with glossapi_rs_noise…",
        input_dir,
        output_dir,
    )

    rows: List[Dict[str, Any]] = [
        dict(row)
        for row in noise_mod.export_numeric_match_debug_pages(
            str(input_dir),
            str(output_dir),
            n_threads,
            int(min_progress_steps),
            int(min_repeat_steps),
            int(min_same_digit_steps),
            None if max_pages is None else int(max_pages),
            int(sample_seed),
        )
    ]

    with manifest_path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    self.logger.info(
        "Exported %d OCR numeric debug pages with matches to %s",
        len(rows),
        output_dir,
    )
    return rows
min_progress_steps: int = 10, + min_repeat_steps: int = 8, + min_same_digit_steps: int = 10, + word_rep_threshold: int = 4, + word_min_period: int = 3, + word_window: int = 96, + ) -> List[Dict[str, Any]]: + """Annotate complete markdown documents with table, numeric, LaTeX, hybrid, then shared-repeat matches. + + Default repetition threshold for both word and LaTeX repeat detection is 4. + """ + if input_dir is None: + input_dir = self.markdown_dir + else: + input_dir = Path(input_dir) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for stale in output_dir.glob("*.md"): + stale.unlink() + manifest_path = output_dir / "manifest.jsonl" + if manifest_path.exists(): + manifest_path.unlink() + page_metrics_path = output_dir / "page_metrics.jsonl" + if page_metrics_path.exists(): + page_metrics_path.unlink() + match_index_path = output_dir / "match_index.jsonl" + if match_index_path.exists(): + match_index_path.unlink() + summary_path = output_dir / "summary.json" + if summary_path.exists(): + summary_path.unlink() + + noise_mod = self._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("find_numeric_debug_page_spans", "evaluate_page_character_noise"), + ) + + all_source_paths = sorted(input_dir.glob("*.md")) + doc_offset = max(0, int(doc_offset)) + if max_docs is not None: + source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] + else: + source_paths = all_source_paths[doc_offset:] + render_workers = _default_combined_ocr_render_workers( + noise_mod=noise_mod, + requested_workers=doc_workers, + max_workers=int(os.cpu_count() or 1), + ) + + self.logger.info( + "Exporting combined OCR table+numeric+latex+hybrid+word debug docs from %s into %s for %d documents (offset=%d, workers=%d)", + input_dir, + output_dir, + len(source_paths), + doc_offset, + render_workers, + ) + + rows: List[Dict[str, Any]] = [] + total_page_times: List[float] = [] + table_page_times: 
List[float] = [] + numeric_page_times: List[float] = [] + latex_page_times: List[float] = [] + shared_page_times: List[float] = [] + hybrid_page_times: List[float] = [] + char_eval_times: List[float] = [] + bad_char_ratios: List[float] = [] + def _consume_doc_result( + doc_result: Dict[str, Any], + *, + page_metrics_handle: Any, + match_index_handle: Any, + ) -> None: + rows.append(dict(doc_result["row"])) + for page_row in doc_result["page_metric_rows"]: + page_metrics_handle.write(json.dumps(page_row, ensure_ascii=False)) + page_metrics_handle.write("\n") + total_page_times.append(float(page_row["total_page_seconds"])) + table_page_times.append(float(page_row["table_seconds"])) + numeric_page_times.append(float(page_row["numeric_seconds"])) + latex_page_times.append(float(page_row["latex_seconds"])) + hybrid_page_times.append(float(page_row["hybrid_seconds"])) + shared_page_times.append(float(page_row["shared_repeat_seconds"])) + char_eval_times.append(float(page_row["char_eval_seconds"])) + bad_char_ratios.append(float(page_row["bad_char_ratio"])) + for match_row in doc_result["match_index_rows"]: + match_index_handle.write(json.dumps(match_row, ensure_ascii=False)) + match_index_handle.write("\n") + if _can_use_combined_ocr_process_pool(noise_mod, render_workers): + jobs = [ + ( + str(source_path), + str(output_dir / source_path.name), + int(min_progress_steps), + int(min_repeat_steps), + int(min_same_digit_steps), + int(word_rep_threshold), + int(word_min_period), + int(word_window), + ) + for source_path in source_paths + ] + iterator: Iterable[Dict[str, Any]] + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, match_index_path.open("w", encoding="utf-8") as match_index_handle: + with _combined_ocr_process_pool_warning_ctx(): + with ProcessPoolExecutor( + max_workers=render_workers, + # Match the clean-mode executor policy so debug and + # clean keep the same performance shape and worker init. 
+ mp_context=mp.get_context("fork"), + initializer=_init_combined_ocr_worker, + ) as executor: + iterator = executor.map(_process_combined_ocr_debug_document_job, jobs) + for doc_result in iterator: + _consume_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + else: + def _run_debug_doc(source_path: Path) -> Dict[str, Any]: + return _process_combined_ocr_debug_document( + source_path, + output_dir / source_path.name, + noise_mod=noise_mod, + min_progress_steps=int(min_progress_steps), + min_repeat_steps=int(min_repeat_steps), + min_same_digit_steps=int(min_same_digit_steps), + word_rep_threshold=int(word_rep_threshold), + word_min_period=int(word_min_period), + word_window=int(word_window), + ) + + with page_metrics_path.open("w", encoding="utf-8") as page_metrics_handle, match_index_path.open("w", encoding="utf-8") as match_index_handle: + with ThreadPoolExecutor(max_workers=render_workers) as executor: + for doc_result in executor.map(_run_debug_doc, source_paths): + _consume_doc_result( + doc_result, + page_metrics_handle=page_metrics_handle, + match_index_handle=match_index_handle, + ) + + with manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + summary = { + "doc_count": len(rows), + "matched_doc_count": sum(1 for row in rows if int(row["matched_page_count"]) > 0), + "matched_page_count": int(sum(int(row["matched_page_count"]) for row in rows)), + "match_count": int(sum(int(row.get("match_count", 0)) for row in rows)), + "table_match_count": int(sum(int(row["table_match_count"]) for row in rows)), + "numeric_match_count": int(sum(int(row["numeric_match_count"]) for row in rows)), + "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)), + "hybrid_match_count": int(sum(int(row["hybrid_match_count"]) for row in rows)), + "word_match_count": int(sum(int(row["word_match_count"]) for 
def clean_ocr_hybrid_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Optional[Union[str, Path]] = None,
    *,
    max_docs: Optional[int] = 100,
    doc_offset: int = 0,
) -> List[Dict[str, Any]]:
    """Export only matched pages for local hybrid numbered repetitions.

    Every ``*.md`` / ``*.txt`` document in ``input_dir`` is split on
    ``PAGE_SPLIT_MARKER``; each page with at least one hybrid span is written,
    annotated, to ``<stem>__debug_page_<NNNNN>.md`` in ``output_dir``. A
    ``manifest.jsonl`` (one row per exported page) and a ``summary.json`` are
    written next to the pages.

    Args:
        output_dir: Destination folder. Existing ``*.md`` files, the manifest
            and the summary in it are removed first.
        input_dir: Folder to scan; defaults to ``self.markdown_dir``.
        max_docs: Maximum number of documents to process after ``doc_offset``;
            ``None`` processes all remaining documents.
        doc_offset: Number of documents (in sorted order) to skip; negative
            values are treated as 0.

    Returns:
        The manifest rows, one dict per exported page.

    Raises:
        ValueError: If ``output_dir`` and ``input_dir`` are the same folder.
    """
    input_dir = self.markdown_dir if input_dir is None else Path(input_dir)
    source_dir = Path(input_dir)
    output_dir = Path(output_dir)

    # The cleanup below unlinks every *.md in output_dir; refuse to run when
    # that folder is also the input, otherwise the sources would be deleted
    # before they are even listed.
    if output_dir.resolve() == source_dir.resolve():
        raise ValueError(
            f"output_dir must differ from input_dir ({output_dir}): "
            "stale-file cleanup would delete the source markdown"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    for stale in output_dir.glob("*.md"):
        stale.unlink()
    manifest_path = output_dir / "manifest.jsonl"
    if manifest_path.exists():
        manifest_path.unlink()
    summary_path = output_dir / "summary.json"
    if summary_path.exists():
        summary_path.unlink()

    all_source_paths = sorted([*source_dir.glob("*.md"), *source_dir.glob("*.txt")])
    doc_offset = max(0, int(doc_offset))
    if max_docs is not None:
        source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)]
    else:
        source_paths = all_source_paths[doc_offset:]

    self.logger.info(
        "Exporting hybrid OCR debug pages from %s into %s for %d documents (offset=%d)",
        input_dir,
        output_dir,
        len(source_paths),
        doc_offset,
    )

    rows: List[Dict[str, Any]] = []
    # Detection time of every scanned page, matched or not.
    page_times: List[float] = []

    for source_path in source_paths:
        text = source_path.read_text(encoding="utf-8")
        # Pages are numbered from 1.
        for page_index, page in enumerate(text.split(PAGE_SPLIT_MARKER), start=1):
            page_start = time.perf_counter()
            hybrid_spans = _find_hybrid_numbered_repeat_spans(page, blocked_spans=[])
            page_elapsed = time.perf_counter() - page_start
            page_times.append(page_elapsed)
            if not hybrid_spans:
                continue

            # Only the annotated text and the match-type labels are used here.
            annotated_page, page_types, *_ = _annotate_page_with_labeled_spans(
                page,
                hybrid_spans,
            )
            hybrid_count = _count_hybrid_matches_in_page(page, hybrid_spans)
            output_path = output_dir / f"{source_path.stem}__debug_page_{page_index:05d}.md"
            output_path.write_text(annotated_page, encoding="utf-8")
            rows.append(
                {
                    "source_path": str(source_path),
                    "output_path": str(output_path),
                    "source_stem": source_path.stem,
                    "base_stem": canonical_stem(source_path.stem),
                    "page_number": page_index,
                    "page_index_in_file": page_index,
                    "hybrid_match_count": hybrid_count,
                    "match_types": ",".join(page_types),
                    "page_seconds": page_elapsed,
                }
            )

    with manifest_path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    summary = {
        "doc_count": len(source_paths),
        "matched_page_count": len(rows),
        "hybrid_match_count": int(sum(int(row["hybrid_match_count"]) for row in rows)),
        "page_seconds": _summarize_metric(page_times),
    }
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    self.logger.info(
        "Exported %d hybrid OCR debug pages to %s",
        len(rows),
        output_dir,
    )
    return rows
matched pages for local LaTeX slot-progression runs.""" + if input_dir is None: + input_dir = self.markdown_dir + else: + input_dir = Path(input_dir) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + for stale in output_dir.glob("*.md"): + stale.unlink() + manifest_path = output_dir / "manifest.jsonl" + if manifest_path.exists(): + manifest_path.unlink() + summary_path = output_dir / "summary.json" + if summary_path.exists(): + summary_path.unlink() + + all_source_paths = sorted(list(Path(input_dir).glob("*.md")) + list(Path(input_dir).glob("*.txt"))) + doc_offset = max(0, int(doc_offset)) + if max_docs is not None: + source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)] + else: + source_paths = all_source_paths[doc_offset:] + + self.logger.info( + "Exporting LaTeX slot-progression debug pages from %s into %s for %d documents (offset=%d)", + input_dir, + output_dir, + len(source_paths), + doc_offset, + ) + + rows: List[Dict[str, Any]] = [] + page_times: List[float] = [] + + for source_path in source_paths: + text = source_path.read_text(encoding="utf-8") + pages = text.split(PAGE_SPLIT_MARKER) + for page_index, page in enumerate(pages, start=1): + page_start = time.perf_counter() + latex_spans = _find_latex_slot_progression_spans(page, blocked_spans=[]) + page_elapsed = time.perf_counter() - page_start + page_times.append(page_elapsed) + if not latex_spans: + continue + + annotated_page, page_types, _, _, latex_count, _, _ = _annotate_page_with_labeled_spans( + page, + latex_spans, + ) + output_name = f"{source_path.stem}__debug_page_{page_index:05d}.md" + output_path = output_dir / output_name + output_path.write_text(annotated_page, encoding="utf-8") + rows.append( + { + "source_path": str(source_path), + "output_path": str(output_path), + "source_stem": source_path.stem, + "base_stem": canonical_stem(source_path.stem), + "page_number": page_index, + "page_index_in_file": page_index, + "latex_match_count": 
def clean_ocr_latex_debug(
    self,
    output_dir: Union[str, Path],
    input_dir: Optional[Union[str, Path]] = None,
    *,
    max_docs: Optional[int] = 1000,
    doc_offset: int = 0,
    word_rep_threshold: int = 4,
    word_min_period: int = 3,
    word_window: int = 96,
) -> List[Dict[str, Any]]:
    """Export only matched pages for all LaTeX repeat classes.

    Every ``*.md`` / ``*.txt`` document in ``input_dir`` is split on
    ``PAGE_SPLIT_MARKER``; each page with at least one LaTeX repeat span is
    written, annotated, to ``<stem>__debug_page_<NNNNN>.md`` in ``output_dir``.
    A ``manifest.jsonl`` (one row per exported page) and a ``summary.json`` are
    written next to the pages.

    Args:
        output_dir: Destination folder. Existing ``*.md`` files, the manifest
            and the summary in it are removed first.
        input_dir: Folder to scan; defaults to ``self.markdown_dir``.
        max_docs: Maximum number of documents to process after ``doc_offset``;
            ``None`` processes all remaining documents.
        doc_offset: Number of documents (in sorted order) to skip; negative
            values are treated as 0.
        word_rep_threshold: Passed to the span finder as ``rep_threshold``.
        word_min_period: Passed to the span finder as ``min_period``.
        word_window: Passed to the span finder as ``window``.

    Returns:
        The manifest rows, one dict per exported page.

    Raises:
        ValueError: If ``output_dir`` and ``input_dir`` are the same folder.
    """
    input_dir = self.markdown_dir if input_dir is None else Path(input_dir)
    source_dir = Path(input_dir)
    output_dir = Path(output_dir)

    # The cleanup below unlinks every *.md in output_dir; refuse to run when
    # that folder is also the input, otherwise the sources would be deleted
    # before they are even listed.
    if output_dir.resolve() == source_dir.resolve():
        raise ValueError(
            f"output_dir must differ from input_dir ({output_dir}): "
            "stale-file cleanup would delete the source markdown"
        )

    output_dir.mkdir(parents=True, exist_ok=True)
    for stale in output_dir.glob("*.md"):
        stale.unlink()
    manifest_path = output_dir / "manifest.jsonl"
    if manifest_path.exists():
        manifest_path.unlink()
    summary_path = output_dir / "summary.json"
    if summary_path.exists():
        summary_path.unlink()

    all_source_paths = sorted([*source_dir.glob("*.md"), *source_dir.glob("*.txt")])
    doc_offset = max(0, int(doc_offset))
    if max_docs is not None:
        source_paths = all_source_paths[doc_offset : doc_offset + int(max_docs)]
    else:
        source_paths = all_source_paths[doc_offset:]

    self.logger.info(
        "Exporting LaTeX debug pages from %s into %s for %d documents (offset=%d)",
        input_dir,
        output_dir,
        len(source_paths),
        doc_offset,
    )

    # Normalise the tuning knobs once instead of on every page.
    rep_threshold = int(word_rep_threshold)
    min_period = int(word_min_period)
    window = int(word_window)

    rows: List[Dict[str, Any]] = []
    # Detection time of every scanned page, matched or not.
    page_times: List[float] = []

    for source_path in source_paths:
        text = source_path.read_text(encoding="utf-8")
        # Pages are numbered from 1.
        for page_index, page in enumerate(text.split(PAGE_SPLIT_MARKER), start=1):
            page_start = time.perf_counter()
            latex_spans = _find_latex_repeat_spans(
                page,
                blocked_spans=[],
                rep_threshold=rep_threshold,
                min_period=min_period,
                window=window,
            )
            page_elapsed = time.perf_counter() - page_start
            page_times.append(page_elapsed)
            if not latex_spans:
                continue

            # The fifth element of the annotation result is the LaTeX count.
            annotated_page, page_types, _, _, latex_count, _, _ = _annotate_page_with_labeled_spans(
                page,
                latex_spans,
            )
            output_path = output_dir / f"{source_path.stem}__debug_page_{page_index:05d}.md"
            output_path.write_text(annotated_page, encoding="utf-8")
            rows.append(
                {
                    "source_path": str(source_path),
                    "output_path": str(output_path),
                    "source_stem": source_path.stem,
                    "base_stem": canonical_stem(source_path.stem),
                    "page_number": page_index,
                    "page_index_in_file": page_index,
                    "latex_match_count": latex_count,
                    "match_types": ",".join(page_types),
                    "page_seconds": page_elapsed,
                }
            )

    with manifest_path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False))
            handle.write("\n")

    summary = {
        "doc_count": len(source_paths),
        "matched_page_count": len(rows),
        "latex_match_count": int(sum(int(row["latex_match_count"]) for row in rows)),
        "word_rep_threshold": rep_threshold,
        "word_min_period": min_period,
        "word_window": window,
        "page_seconds": _summarize_metric(page_times),
    }
    summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    self.logger.info(
        "Exported %d LaTeX debug pages to %s",
        len(rows),
        output_dir,
    )
    return rows
Retained for one release.""" self.logger.warning("Corpus.filter() is deprecated – calling clean() instead") diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py index 38179fd..c543076 100644 --- a/src/glossapi/corpus/phase_download.py +++ b/src/glossapi/corpus/phase_download.py @@ -19,6 +19,7 @@ import pandas as pd from .._naming import canonical_stem +from ..gloss_browser_downloader import BrowserGlossDownloader from ..gloss_downloader import GlossDownloader # Avoid importing section/classifier here; download phase does not use them. from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path @@ -212,6 +213,22 @@ def _looks_like_list(s: str) -> bool: # Initialize downloader configuration (kwargs take precedence) dl_cfg = dict(self.downloader_config) dl_cfg.update(kwargs) + browser_mode = dl_cfg.pop('browser_mode', None) + if browser_mode is not None and 'download_mode' not in dl_cfg: + dl_cfg['download_mode'] = 'browser' if browser_mode else 'standard' + download_mode = str(dl_cfg.pop('download_mode', 'standard')).strip().lower() + policy_requested = bool(dl_cfg.get('download_policy_file') or dl_cfg.get('download_policy')) + if download_mode in {'standard', 'default', 'http'} and not policy_requested: + downloader_cls = GlossDownloader + default_download_route = 'standard' + elif download_mode in {'browser', 'browser_protected'} or policy_requested: + downloader_cls = BrowserGlossDownloader + default_download_route = 'browser' if download_mode in {'browser', 'browser_protected'} else 'standard' + elif download_mode in {'auto', 'browser_fallback'}: + downloader_cls = BrowserGlossDownloader + default_download_route = 'auto' + else: + raise ValueError(f"Unsupported download_mode: {download_mode}") # Allow caller to override which column holds links if links_column: url_column = links_column @@ -232,14 +249,18 @@ def _looks_like_list(s: str) -> bool: except Exception: pass - downloader = GlossDownloader( - 
url_column=url_column, - output_dir=str(self.output_dir), - log_level=self.logger.level, - verbose=verbose if verbose is not None else self.verbose, + downloader_kwargs = { + "url_column": url_column, + "output_dir": str(self.output_dir), + "log_level": self.logger.level, + "verbose": verbose if verbose is not None else self.verbose, **{k: v for k, v in dl_cfg.items() if k not in {'input_parquet'}}, - _used_filename_bases=used_bases - ) + "_used_filename_bases": used_bases, + } + if downloader_cls is BrowserGlossDownloader: + downloader_kwargs["default_download_route"] = default_download_route + + downloader = downloader_cls(**downloader_kwargs) # Download files self.logger.info(f"Downloading files from URLs in {input_parquet}...") diff --git a/src/glossapi/corpus/phase_export.py b/src/glossapi/corpus/phase_export.py index 26a6a82..4bcc6a8 100644 --- a/src/glossapi/corpus/phase_export.py +++ b/src/glossapi/corpus/phase_export.py @@ -471,8 +471,6 @@ def _normalize_value(value: Any) -> Any: chunk_paths: List[Path] = entry.get("chunk_paths", []) or [] base_path: Optional[Path] = entry.get("base_path") representative_path: Optional[Path] = base_path - if representative_path is None and chunk_paths: - representative_path = sorted(chunk_paths, key=_chunk_sort_key)[0] base_metadata = metadata_by_stem.get(stem) chunk_metadata = metadata_chunks_by_stem.get(stem, []) if base_metadata is None and not chunk_metadata: @@ -480,17 +478,11 @@ def _normalize_value(value: Any) -> Any: metadata = _aggregate_metadata(stem, base_metadata, chunk_metadata) metadata = {k: _normalize_value(v) for k, v in metadata.items()} original_filename_value = metadata.get("filename") - if chunk_paths: - ordered_chunks = sorted(chunk_paths, key=_chunk_sort_key) - parts: List[str] = [] - for path in ordered_chunks: - parts.append(path.read_text(encoding="utf-8")) - document_text = "\n".join(parts) - elif representative_path is not None: - document_text = representative_path.read_text(encoding="utf-8") - 
# Page count above which a PDF is queued as its own work item.
_DEFAULT_LONG_PDF_PAGE_THRESHOLD = 600


def _resolve_positive_int_env(name: str, default: int) -> int:
    """Read environment variable *name* as an integer clamped to ``>= 1``.

    An unset, empty or non-integer value yields ``max(1, int(default))``.
    """
    fallback = max(1, int(default))
    raw = os.getenv(name)
    if not raw:
        return fallback
    try:
        return max(1, int(raw))
    except ValueError:
        return fallback


def _resolve_docling_max_batch_files(default: int = 1) -> int:
    """Resolve the per-worker Docling document batch size for Phase-1 extraction.

    GlossAPI keeps the default conservative because fresh GPU nodes have been
    more sensitive to bootstrap/runtime drift than to raw scheduler limits.
    Strong GPUs can still be benchmarked explicitly by raising this knob via
    ``GLOSSAPI_DOCLING_MAX_BATCH_FILES``.
    """
    return _resolve_positive_int_env("GLOSSAPI_DOCLING_MAX_BATCH_FILES", default)


def _resolve_docling_batch_target_pages(default: int = 256) -> int:
    """Resolve the target page budget per queued Docling extraction work item.

    Overridable through ``GLOSSAPI_DOCLING_BATCH_TARGET_PAGES``.
    """
    return _resolve_positive_int_env("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", default)


def _estimate_extract_work_pages(path: Path) -> int:
    """Best-effort PDF page estimate used for Phase-1 queue packing.

    Non-PDF files count as one page. For PDFs, ``pypdfium2`` is tried first,
    then ``pypdf`` and ``PyPDF2``; when none of them yields a count the
    estimate is 1. The result is never below 1.
    """
    if path.suffix.lower() != ".pdf":
        return 1

    # Deliberately broad handlers: a missing backend or an unreadable file
    # must degrade to the next strategy, never abort queue planning.
    try:
        import pypdfium2 as pdfium  # type: ignore

        pdf = pdfium.PdfDocument(str(path))
        try:
            return max(1, int(len(pdf)))
        finally:
            # Not every pypdfium2 release exposes close(); call it if present.
            close = getattr(pdf, "close", None)
            if callable(close):
                close()
    except Exception:
        pass

    for module_name in ("pypdf", "PyPDF2"):
        try:
            module = __import__(module_name, fromlist=["PdfReader"])
            reader = module.PdfReader(str(path))
            return max(1, int(len(reader.pages)))
        except Exception:
            continue

    return 1


def _build_extract_work_items(
    paths: Iterable[Path],
    *,
    max_batch_files: int,
    target_batch_pages: int,
    long_pdf_page_threshold: int = _DEFAULT_LONG_PDF_PAGE_THRESHOLD,
    page_counter: Optional[Callable[[Path], int]] = None,
) -> List[List[Path]]:
    """Pack extraction work into steadier page-budget batches for multi-GPU runs.

    Files are visited in input order. A PDF whose estimated page count exceeds
    ``long_pdf_page_threshold`` becomes a work item of its own. Every other
    file joins the existing bundle that still has a free file slot and would
    be left with the least unused page budget (first such bundle on ties); if
    no bundle can take it within ``target_batch_pages``, a new bundle is
    opened. Work items are returned heaviest first.

    Args:
        paths: Files to schedule.
        max_batch_files: Maximum number of files per bundle (clamped to >= 1).
        target_batch_pages: Page budget per bundle (clamped to >= 1).
        long_pdf_page_threshold: Page count above which a PDF is isolated.
        page_counter: Optional replacement for the page estimator; a failing
            counter makes the file count as one page.

    Returns:
        A list of bundles, each a list of paths; empty when ``paths`` is empty.
    """
    files = [Path(path) for path in paths]
    if not files:
        return []

    max_files = max(1, int(max_batch_files))
    target_pages = max(1, int(target_batch_pages))
    long_threshold = max(1, int(long_pdf_page_threshold))
    counter = page_counter or _estimate_extract_work_pages

    packed: List[Tuple[List[Path], int]] = []
    standalone: List[Tuple[List[Path], int]] = []

    for path in files:
        try:
            est_pages = max(1, int(counter(path)))
        except Exception:
            # Estimation is advisory; an unknown size is scheduled as 1 page.
            est_pages = 1

        if path.suffix.lower() == ".pdf" and est_pages > long_threshold:
            standalone.append(([path], est_pages))
            continue

        best_idx: Optional[int] = None
        best_leftover: Optional[int] = None
        for idx, (bundle_paths, bundle_pages) in enumerate(packed):
            if len(bundle_paths) >= max_files:
                continue
            new_pages = bundle_pages + est_pages
            if new_pages > target_pages:
                continue
            leftover = target_pages - new_pages
            if best_leftover is None or leftover < best_leftover:
                best_idx = idx
                best_leftover = leftover

        if best_idx is None:
            packed.append(([path], est_pages))
        else:
            bundle_paths, bundle_pages = packed[best_idx]
            bundle_paths.append(path)
            packed[best_idx] = (bundle_paths, bundle_pages + est_pages)

    # Stable sort: among equal page totals, isolated long PDFs stay ahead of
    # packed bundles and bundles keep their creation order.
    work_items = sorted(standalone + packed, key=lambda item: item[1], reverse=True)
    return [bundle_paths for bundle_paths, _ in work_items]


def _resolve_docling_queue_policy(extractor: Optional[Any] = None) -> Tuple[int, int]:
    """Return the Docling queue packing knobs the multi-GPU planner should use.

    Args:
        extractor: Optional object whose ``max_batch_files`` and
            ``long_pdf_page_threshold`` attributes override the defaults.
            Missing or non-integer attributes fall back to the defaults.

    Returns:
        ``(max_batch_files, long_pdf_page_threshold)``, both at least 1.
    """
    max_batch_files = _resolve_docling_max_batch_files()
    long_pdf_page_threshold = _DEFAULT_LONG_PDF_PAGE_THRESHOLD
    if extractor is None:
        return max_batch_files, long_pdf_page_threshold

    def _positive_attr(name: str, fallback: int) -> int:
        # Extractor attributes are untyped; anything that cannot be coerced
        # to an int keeps the fallback instead of failing the planner.
        try:
            return max(1, int(getattr(extractor, name, fallback)))
        except Exception:
            return fallback

    return (
        _positive_attr("max_batch_files", max_batch_files),
        _positive_attr("long_pdf_page_threshold", long_pdf_page_threshold),
    )
" + "Use Corpus.ocr(backend='deepseek') for OCR remediation." + ) + # Hard GPU preflight before we attempt to build OCR/enrichment pipelines self._gpu_preflight( accel_type=accel_type, - require_ocr=bool(force_ocr), + require_ocr=False, require_math=bool(formula_enrichment or code_enrichment), require_backend_gpu=(backend_choice == "docling"), ) # Configure batch/backend policy based on resolved choice if backend_choice == "docling": - # Keep docling runs conservative: process one document per batch for stability - self.extractor.configure_batch_policy("docling", max_batch_files=1, prefer_safe_backend=False) + # Keep docling runs conservative by default, but expose an explicit + # Phase-1 tuning hook for benchmark nodes and strong GPUs. + self.extractor.configure_batch_policy( + "docling", + max_batch_files=_resolve_docling_max_batch_files(), + prefer_safe_backend=False, + ) else: self.extractor.configure_batch_policy("safe", max_batch_files=1, prefer_safe_backend=True) # Ensure converter exists (reuse when unchanged) self.extractor.ensure_extractor( - enable_ocr=bool(force_ocr), - force_full_page_ocr=bool(force_ocr), + enable_ocr=False, + force_full_page_ocr=False, formula_enrichment=bool(formula_enrichment), code_enrichment=bool(code_enrichment), images_scale=float(images_scale_env), @@ -136,12 +288,12 @@ def _resolve_phase1_backend( raise ValueError( f"Invalid phase1_backend='{requested}'. Expected one of: 'auto', 'safe', 'docling'." ) - needs_gpu = bool(force_ocr or formula_enrichment or code_enrichment) + needs_gpu = bool(formula_enrichment or code_enrichment) if choice == "auto": choice = "docling" if needs_gpu else "safe" if choice == "safe" and needs_gpu: self.logger.info( - "Phase-1 backend 'safe' overridden to 'docling' because OCR/math enrichment was requested." + "Phase-1 backend 'safe' overridden to 'docling' because math/code enrichment was requested." 
) choice = "docling" return choice @@ -154,12 +306,12 @@ def _gpu_preflight( require_math: bool, require_backend_gpu: bool = False, ) -> None: - """Abort early when GPU OCR/math is requested but CUDA is unavailable.""" + """Abort early when GPU-backed Docling work is requested but CUDA is unavailable.""" if not (require_ocr or require_math or require_backend_gpu): return instructions = ( - "GPU OCR and math enrichment require CUDA-enabled torch and onnxruntime-gpu. " + "GPU-backed Docling extraction and math enrichment require CUDA-enabled torch. " "Install the CUDA wheels and ensure NVIDIA drivers expose the desired devices." ) @@ -167,30 +319,15 @@ def _gpu_preflight( accel_lower = str(accel_type or "").strip().lower() if accel_lower.startswith("cpu"): raise RuntimeError( - "GPU OCR was requested (force_ocr/math) but accel_type='CPU'. " + "GPU-backed Docling extraction was requested but accel_type='CPU'. " f"{instructions}" ) - try: - import onnxruntime as _ort # type: ignore - providers = _ort.get_available_providers() - except Exception as exc: - raise RuntimeError( - "onnxruntime not available while attempting GPU OCR. " - "Install onnxruntime-gpu and rerun." - ) from exc - - if "CUDAExecutionProvider" not in providers: - raise RuntimeError( - "CUDAExecutionProvider missing from onnxruntime providers. " - f"Detected providers={providers}. {instructions}" - ) - torch_mod = _maybe_import_torch(force=True) if torch_mod is None or not getattr(torch_mod, "cuda", None) or not torch_mod.cuda.is_available(): raise RuntimeError( - "Torch CUDA is not available but GPU OCR/math was requested. " - "Install the CUDA wheel (e.g. torch==2.5.1+cu121) and ensure CUDA drivers/devices are visible." + "Torch CUDA is not available but GPU-backed Docling extraction/math was requested. " + "Install the CUDA wheel and ensure CUDA drivers/devices are visible." 
) device_count = torch_mod.cuda.device_count() @@ -208,13 +345,12 @@ def _gpu_preflight( if not self._gpu_banner_logged: self.logger.info( - "GPU preflight: using torch + onnxruntime GPU backends; ensure CUDA drivers are available." + "GPU preflight: using torch-backed Docling extraction; ensure CUDA drivers are available." ) self._gpu_banner_logged = True self.logger.info( - "GPU preflight OK: providers=%s torch_devices=%s", - ",".join(providers), + "GPU preflight OK: torch_devices=%s", ", ".join(device_names) or "", ) @@ -237,6 +373,7 @@ def extract( export_doc_json: bool = True, emit_formula_index: bool = False, phase1_backend: str = "auto", + workers_per_device: int = 1, _prepared: bool = False, ) -> None: """ @@ -250,8 +387,9 @@ def extract( export_doc_json: When True (default), writes Docling layout JSON to `json/.docling.json(.zst)` emit_formula_index: Also emit `json/.formula_index.jsonl` (default: False) phase1_backend: Selects the Phase-1 backend. ``"auto"`` (default) keeps the safe backend unless - OCR/math is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` forces the - Docling backend. + math/code enrichment is requested, ``"safe"`` forces the PyPDFium backend, and ``"docling"`` + forces the Docling backend. + workers_per_device: Number of extraction workers to bind to each visible GPU when ``use_gpus='multi'``. 
""" if not file_paths: @@ -425,12 +563,17 @@ def extract( except Exception: threads_effective = int(num_threads) if isinstance(num_threads, int) else max(2, 2 * max(1, len(devs))) - batch_hint = 5 if backend_choice == "docling" and not force_ocr else 1 + workers_per_device = max(1, int(workers_per_device or 1)) + configured_batch_hint = 1 + if backend_choice == "docling": + extractor = getattr(self, "extractor", None) + configured_batch_hint, _ = _resolve_docling_queue_policy(extractor) self.logger.info( - "Phase-1 config: backend=%s batch_size=%s threads=%s skip_existing=%s benchmark=%s", + "Phase-1 config: backend=%s max_batch_files=%s threads=%s workers_per_device=%s skip_existing=%s benchmark=%s", backend_choice, - batch_hint, + configured_batch_hint, threads_effective, + workers_per_device, bool(skip_existing), bool(benchmark_mode), ) @@ -464,15 +607,44 @@ def extract( return # Dynamic work queue across GPUs + from .corpus_orchestrator import gpu_extract_worker_queue from multiprocessing import get_context ctx = get_context("spawn") manager = ctx.Manager() task_q = ctx.Queue() result_q = ctx.Queue() status_map = manager.dict() - path_list = [str(p.resolve()) for p in pending_files] - for full_path in path_list: - task_q.put(full_path) + batch_target_pages = 1 + configured_max_batch_files = 1 + long_pdf_page_threshold = 600 + work_items: List[List[Path]] = [[Path(p)] for p in pending_files] + extractor = getattr(self, "extractor", None) + configured_max_batch_files, long_pdf_page_threshold = _resolve_docling_queue_policy(extractor) + if backend_choice == "docling": + batch_target_pages = _resolve_docling_batch_target_pages() + work_items = _build_extract_work_items( + pending_files, + max_batch_files=configured_max_batch_files, + target_batch_pages=batch_target_pages, + long_pdf_page_threshold=long_pdf_page_threshold, + ) + queue_items = [[str(path.resolve()) for path in item] for item in work_items] + for queue_item in queue_items: + task_q.put(queue_item) + 
total_estimated_pages = 0 + try: + total_estimated_pages = sum(_estimate_extract_work_pages(path) for path in pending_files) + except Exception: + total_estimated_pages = 0 + self.logger.info( + "Phase-1 dispatch: %d file(s) -> %d work item(s) (backend=%s max_batch_files=%d target_pages=%d est_pages=%d)", + len(pending_files), + len(queue_items), + backend_choice, + configured_max_batch_files, + batch_target_pages, + total_estimated_pages, + ) worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") worker_log_dir_to_use = worker_log_dir_env if not worker_log_dir_to_use: @@ -494,14 +666,29 @@ def extract( marker_base.mkdir(parents=True, exist_ok=True) except Exception as exc: self.logger.debug("Unable to prepare marker directory %s: %s", marker_base, exc) - procs: List[Any] = [] - proc_gpu: Dict[int, int] = {} - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} + worker_specs: List[Dict[str, Any]] = [] for dev_id in devs: + for worker_slot in range(workers_per_device): + worker_specs.append( + { + "device_id": int(dev_id), + "worker_slot": int(worker_slot), + "worker_key": f"gpu{dev_id}-w{worker_slot}", + } + ) + procs: List[Any] = [] + proc_specs: Dict[int, Dict[str, Any]] = {} + marker_files: Dict[str, Path] = { + spec["worker_key"]: marker_base / f"{spec['worker_key']}.current" + for spec in worker_specs + } + for spec in worker_specs: p = ctx.Process( target=gpu_extract_worker_queue, args=( - dev_id, + spec["device_id"], + spec["worker_slot"], + spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -524,7 +711,7 @@ def extract( p.start() procs.append(p) if p.pid is not None: - proc_gpu[p.pid] = dev_id + proc_specs[p.pid] = dict(spec) active = list(procs) any_fail = False last_summary = time.time() @@ -541,20 +728,21 @@ def extract( procs.remove(p) pid = p.pid or -1 heartbeat[pid] = time.time() - gpu_id = proc_gpu.pop(pid, None) + worker_spec = proc_specs.pop(pid, None) + worker_key = 
worker_spec["worker_key"] if worker_spec else None if p.exitcode not in (0, None): any_fail = True self.logger.warning("GPU worker pid=%s exited with code %s", p.pid, p.exitcode) current_paths: List[str] = [] stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) + if worker_key is not None: + current_entry = status_map.pop(worker_key, None) if current_entry: if not isinstance(current_entry, (list, tuple, set)): current_entry = [current_entry] current_paths = [str(x) for x in current_entry] stems_for_skip = [canonical_stem(path) for path in current_paths] - marker_path = marker_files.get(gpu_id) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -565,12 +753,17 @@ def extract( state_mgr.save(processed_files, problematic_files) if stems_for_skip: skip_mgr.add(stems_for_skip) - if gpu_id is not None: - self.logger.info("Respawning GPU%s worker after crash.", gpu_id) + if worker_spec is not None: + self.logger.info( + "Respawning %s after crash.", + worker_spec["worker_key"], + ) replacement = ctx.Process( target=gpu_extract_worker_queue, args=( - gpu_id, + worker_spec["device_id"], + worker_spec["worker_slot"], + worker_spec["worker_key"], str(self.input_dir), str(self.output_dir), task_q, @@ -594,13 +787,13 @@ def extract( procs.append(replacement) active.append(replacement) if replacement.pid is not None: - proc_gpu[replacement.pid] = gpu_id + proc_specs[replacement.pid] = dict(worker_spec) heartbeat[replacement.pid] = time.time() continue else: - if gpu_id is not None: - status_map.pop(gpu_id, None) - marker_path = marker_files.get(gpu_id) + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(worker_key) if marker_path: try: marker_path.unlink(missing_ok=True) @@ -628,7 +821,7 @@ def extract( skip_mgr.add(bad_stems) state_mgr.save(processed_files, problematic_files) self.logger.info( - "GPU%s batch complete: +%d 
processed, +%d problematic (totals: %d processed, %d problematic)", + "%s batch complete: +%d processed, +%d problematic (totals: %d processed, %d problematic)", result.get("worker"), len(ok_stems), len(bad_stems), @@ -642,30 +835,25 @@ def extract( if result.get("exitcode", 0) not in (0, None): any_fail = True self.logger.warning( - "GPU%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") + "%s reported non-zero exit: %s", result.get("worker"), result.get("exitcode") ) worker_pid = result.get("pid") if worker_pid is not None: heartbeat[worker_pid] = time.time() - worker_gpu = result.get("worker") - if worker_gpu is not None: - try: - worker_gpu_int = int(worker_gpu) - except Exception: - worker_gpu_int = None - else: - status_map.pop(worker_gpu_int, None) - marker_path = marker_files.get(worker_gpu_int) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass + worker_key = result.get("worker") + if worker_key is not None: + status_map.pop(worker_key, None) + marker_path = marker_files.get(str(worker_key)) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass now = time.time() if now - last_summary > 30: try: - pending = result_q.qsize() + pending = task_q.qsize() except NotImplementedError: pending = -1 self.logger.info( @@ -706,6 +894,13 @@ def extract( pending_item = task_q.get_nowait() if isinstance(pending_item, str) and pending_item.strip(): remaining_after_failure.append(pending_item) + continue + if isinstance(pending_item, (list, tuple, set)): + remaining_after_failure.extend( + str(item).strip() + for item in pending_item + if str(item).strip() + ) except queue.Empty: pass if remaining_after_failure: diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 4dec423..0d86861 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -1,6 +1,7 @@ """OCR and math enrichment helpers split 
from Corpus.""" from __future__ import annotations +import hashlib import json import logging import math @@ -21,27 +22,195 @@ from .._naming import canonical_stem from ..gloss_downloader import GlossDownloader from ..gloss_section import GlossSection +from ..ocr.deepseek.defaults import ( + DEFAULT_ATTN_BACKEND, + DEFAULT_GPU_MEMORY_UTILIZATION, + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_OCR_PROFILE, + DEFAULT_RENDER_DPI, + DEFAULT_REPAIR_MODE, + DEFAULT_RUNTIME_BACKEND, + DEFAULT_TARGET_BATCH_PAGES, + DEFAULT_WORKERS_PER_GPU, +) # Avoid importing classifier here; OCR/math phase does not require it at import time. from .corpus_skiplist import _SkiplistManager, _resolve_skiplist_path from .corpus_state import _ProcessingStateManager from .corpus_utils import _maybe_import_torch +from .ocr.config import OcrRequest, normalize_ocr_request +from .ocr.math_targets import discover_docling_json_stems, filter_math_only_stems +from .ocr.pipeline import run_ocr_phase +from .ocr.targets import build_ocr_selection + + +def _build_ocr_stage_artifact_update( + *, + markdown_dir: Path, + metrics_dir: Path, + stem: str, +) -> Optional[Dict[str, object]]: + """Return direct OCR-owned artifact fields for one canonical OCR document. + + The OCR stage should hand off the same row identity that upstream stages + produced, with corrected text embedded back into parquet. Markdown and + metrics remain sidecars, but detached markdown alone is not the full stage + contract. 
+ """ + + markdown_path = Path(markdown_dir) / f"{stem}.md" + if not markdown_path.exists(): + return None + text_payload = markdown_path.read_text(encoding="utf-8") + metrics_path = Path(metrics_dir) / f"{stem}.metrics.json" + return { + "text": text_payload, + "ocr_markdown_relpath": str(Path("markdown") / markdown_path.name), + "ocr_metrics_relpath": ( + str(Path("json") / "metrics" / metrics_path.name) if metrics_path.exists() else None + ), + "ocr_text_sha256": hashlib.sha256(text_payload.encode("utf-8")).hexdigest(), + } + + +def _apply_ocr_success_updates( + df_meta: pd.DataFrame, + *, + filenames: List[str], + markdown_dir: Path, + metrics_dir: Path, + backend_norm: str, +) -> pd.DataFrame: + """Apply only direct, obvious OCR-owned metadata updates to the parquet rows.""" + + if "filename" not in df_meta.columns: + return df_meta + + if "filter" not in df_meta.columns: + df_meta["filter"] = "ok" + if "needs_ocr" not in df_meta.columns: + df_meta["needs_ocr"] = False + if "ocr_success" not in df_meta.columns: + df_meta["ocr_success"] = False + if "extraction_mode" not in df_meta.columns: + df_meta["extraction_mode"] = None + + direct_columns = ("text", "ocr_markdown_relpath", "ocr_metrics_relpath", "ocr_text_sha256") + for column in direct_columns: + if column not in df_meta.columns: + df_meta[column] = None + + filename_series = df_meta["filename"].astype(str) + stem_series = filename_series.map(canonical_stem) + + for fname in filenames: + stem = canonical_stem(fname) + mask = stem_series == stem + if not bool(mask.any()): + continue + artifact_update = _build_ocr_stage_artifact_update( + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + stem=stem, + ) + df_meta.loc[mask, "filter"] = "ok" + df_meta.loc[mask, "needs_ocr"] = False + df_meta.loc[mask, "ocr_success"] = True + if backend_norm == "deepseek": + df_meta.loc[mask, "extraction_mode"] = "deepseek" + if artifact_update is None: + continue + for column, value in artifact_update.items(): + 
df_meta.loc[mask, column] = value + + return df_meta + + +def _normalize_ocr_target_filenames(*, filenames: List[str], input_dir: Path) -> List[str]: + """Collapse chunk-like metadata rows back to real OCR source files when possible.""" + + source_by_stem: Dict[str, str] = {} + try: + for path in sorted(Path(input_dir).glob("*.pdf")): + source_by_stem.setdefault(canonical_stem(path.name), path.name) + except Exception: + source_by_stem = {} + + normalized: List[str] = [] + seen: Set[str] = set() + for fname in filenames: + resolved = source_by_stem.get(canonical_stem(fname), str(fname)) + if resolved in seen: + continue + normalized.append(resolved) + seen.add(resolved) + return normalized class OcrMathPhaseMixin: + def _refresh_metrics_after_ocr_rerun(self) -> None: + """Refresh OCR-owned and export-owned metrics after OCR remediation. + + `clean_ocr()` and `clean()` remain separate stages on purpose: + + - `clean_ocr()` owns OCR artifact removal and OCR-specific metrics. + - `clean()` owns the broader export-facing clean metrics. + + After OCR reruns we intentionally execute both stages in sequence on the + OCR-cleaned text surface instead of treating one stage as a synonym for + the other. 
+ """ + + self.logger.info( + "Re-running OCR cleaner after OCR rerun to refresh cleaned text and OCR metrics" + ) + self.clean_ocr( + input_dir=self.markdown_dir, + drop_bad=False, + ) + self.logger.info( + "Re-running Rust cleaner in score-only mode on OCR-cleaned markdown to refresh export metrics" + ) + self.clean( + input_dir=self.cleaned_markdown_dir, + drop_bad=False, + write_cleaned_files=False, + ) + def ocr( self, *, fix_bad: bool = True, mode: Optional[str] = None, - backend: str = "rapidocr", + backend: str = "deepseek", device: Optional[str] = None, model_dir: Optional[Union[str, Path]] = None, max_pages: Optional[int] = None, persist_engine: bool = True, limit: Optional[int] = None, - dpi: Optional[int] = None, # reserved for future use - precision: Optional[str] = None, # reserved for future use ("fp16","bf16") - # Integrated math enrichment controls + dpi: Optional[int] = None, + precision: Optional[str] = None, + workers_per_gpu: int = DEFAULT_WORKERS_PER_GPU, + runtime_backend: str = DEFAULT_RUNTIME_BACKEND, + ocr_profile: str = DEFAULT_OCR_PROFILE, + prompt_override: Optional[str] = None, + attn_backend: str = DEFAULT_ATTN_BACKEND, + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = DEFAULT_RENDER_DPI, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + vllm_batch_size: Optional[int] = None, + gpu_memory_utilization: Optional[float] = DEFAULT_GPU_MEMORY_UTILIZATION, + disable_fp8_kv: bool = False, + repair_mode: str = DEFAULT_REPAIR_MODE, + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, + scheduler: str = "auto", + target_batch_pages: int = DEFAULT_TARGET_BATCH_PAGES, + shard_pages: int = 0, + shard_threshold_pages: int = 0, math_enhance: bool = True, math_targets: Optional[Dict[str, List[Tuple[int, 
int]]]] = None, math_batch_size: int = 8, @@ -51,674 +220,373 @@ def ocr( force: Optional[bool] = None, reprocess_completed: Optional[bool] = None, skip_existing: Optional[bool] = None, - # Content debug: keep page separators and truncation markers when True content_debug: bool = False, CONTENT_DEBUG: Optional[bool] = None, - # Back-compat aliases (deprecated): internal_debug: bool = False, INTERNAL_DEBUG: Optional[bool] = None, ) -> None: - """OCR and/or math enrichment with explicit mode control. - - Parameters - - mode: one of - - 'ocr_bad': re‑OCR only documents flagged as bad by Rust cleaner (parquet 'filter' != 'ok'). - - 'math_only': run math enrichment from Docling JSON (generate JSON without OCR when missing). - - 'ocr_bad_then_math': re‑OCR bad documents, then run math enrichment on those. - If not provided, falls back to legacy flags (fix_bad, math_enhance): - fix_bad and math_enhance -> 'ocr_bad_then_math'; - fix_bad only -> 'ocr_bad'; - math_enhance only -> 'math_only'; - neither -> no‑op. - - backend: 'rapidocr' (default) uses the Docling + RapidOCR path via Phase‑1 extract(). - 'deepseek' uses the DeepSeek‑OCR path (no Docling JSON, math unsupported). - - fix_bad: re-run OCR on documents marked bad by the cleaner (default True). - - math_enhance: run math/code enrichment after OCR (default True). - - force: [DEPRECATED] alias for fix_bad retained for backward compatibility. - - reprocess_completed: when False, skip documents already flagged as successfully - OCRed or math-enriched in metadata. Set True to force reprocessing. Defaults to False - unless ``skip_existing`` overrides it. - - skip_existing: legacy alias for ``reprocess_completed`` (``skip_existing=True`` equals - ``reprocess_completed=False``). Prefer the explicit ``reprocess_completed`` toggle. 
- """ - # Normalize backend - backend_norm = str(backend or "rapidocr").strip().lower() - if backend_norm not in {"rapidocr", "deepseek"}: - raise ValueError("backend must be 'rapidocr' or 'deepseek'") - - # CONTENT_DEBUG override (preferred uppercase alias) - # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags - if CONTENT_DEBUG is not None: - content_debug = bool(CONTENT_DEBUG) - elif INTERNAL_DEBUG is not None: - content_debug = bool(INTERNAL_DEBUG) - elif internal_debug: - content_debug = True - - # Normalize mode from explicit value or legacy flags - mode_norm = None - fix_bad_effective = bool(fix_bad) - if force is not None: - try: - self.logger.warning("Corpus.ocr(force=...) is deprecated; use fix_bad=... instead") - except Exception: - pass - fix_bad_effective = bool(force) - if mode: - m = str(mode).strip().lower() - if m in {"ocr_bad", "math_only", "ocr_bad_then_math"}: - mode_norm = m - else: - self.logger.warning("Unknown mode '%s'; falling back to legacy flags", mode) - if mode_norm is None: - if fix_bad_effective and math_enhance: - mode_norm = "ocr_bad_then_math" - elif fix_bad_effective: - mode_norm = "ocr_bad" - elif math_enhance: - mode_norm = "math_only" - else: - self.logger.info( - "OCR: no operation requested (enable fix_bad and/or math_enhance or set mode='ocr_bad'|'math_only'|'ocr_bad_then_math')" + """OCR and/or math enrichment with explicit mode control.""" + + del limit, dpi + request = normalize_ocr_request( + logger=self.logger, + fix_bad=fix_bad, + mode=mode, + backend=backend, + device=device, + model_dir=model_dir, + max_pages=max_pages, + persist_engine=persist_engine, + precision=precision, + workers_per_gpu=workers_per_gpu, + runtime_backend=runtime_backend, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + 
repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + scheduler=scheduler, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + math_enhance=math_enhance, + math_targets=math_targets, + math_batch_size=math_batch_size, + math_dpi_base=math_dpi_base, + use_gpus=use_gpus, + devices=devices, + force=force, + reprocess_completed=reprocess_completed, + skip_existing=skip_existing, + content_debug=content_debug, + CONTENT_DEBUG=CONTENT_DEBUG, + internal_debug=internal_debug, + INTERNAL_DEBUG=INTERNAL_DEBUG, + ) + if request is None: + return + if request.mode == "math_only": + self._run_math_only_request(request) + return + run_ocr_phase(self, request) + + def _run_math_only_request(self, request: OcrRequest) -> None: + selection = build_ocr_selection( + self, + mode=request.mode, + reprocess_completed=request.reprocess_completed, + ) + stems = discover_docling_json_stems(self.output_dir) + stems = filter_math_only_stems( + stems=stems, + bad_files=selection.bad_files, + math_done_stems=selection.math_done_stems, + reprocess_completed=request.reprocess_completed, + logger=self.logger, + ) + self._run_math_targets( + stems=stems, + request=request, + skip_mgr=selection.skip_mgr, + skiplist_path=selection.skiplist_path, + ) + + def _run_math_targets( + self, + *, + stems: List[str], + request: OcrRequest, + skip_mgr: Optional[_SkiplistManager], + skiplist_path: Path, + ) -> None: + if not stems: + self.logger.info("No Docling JSON found for math enrichment.") + return + + initial_math_targets = len(stems) + current_skips = skip_mgr.reload() if skip_mgr else set() + if current_skips: + before = len(stems) + 
stems = [stem for stem in stems if stem not in current_skips] + removed = before - len(stems) + if removed: + self.logger.warning( + "Skip-list %s filtered %d document(s) from Phase-2 math.", + skiplist_path, + removed, ) + if not stems: + self.logger.info("All math targets filtered by skip-list; nothing to do.") return - reprocess_explicit = reprocess_completed is not None - reprocess_flag = bool(reprocess_completed) if reprocess_explicit else False - if skip_existing is not None: - skip_flag = bool(skip_existing) + + self.logger.info( + "Math targets: total=%d kept=%d filtered_skiplist=%d", + initial_math_targets, + len(stems), + initial_math_targets - len(stems), + ) + + local_targets = None + if request.math_targets: + local_targets = {stem: request.math_targets.get(stem) for stem in stems if stem in request.math_targets} + + if str(request.use_gpus).lower() != "multi": + self.formula_enrich_from_json( + files=stems, + device=(request.device or "cuda"), + batch_size=int(request.math_batch_size), + dpi_base=int(request.math_dpi_base), + targets_by_stem=local_targets, + ) + return + + devs = list(request.devices or []) + if not devs: try: - self.logger.warning( - "Corpus.ocr(skip_existing=...) is deprecated; use reprocess_completed=... instead." 
+ proc = subprocess.run( + ["nvidia-smi", "-L"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=5, ) + if proc.returncode == 0 and proc.stdout: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + try: + devs.append(int(line.split(":", 1)[0].split()[1])) + except Exception: + pass except Exception: pass - desired = not skip_flag - if reprocess_explicit and desired != reprocess_flag: + if not devs: + torch_mod = _maybe_import_torch() try: - self.logger.info( - "Corpus.ocr(): skip_existing=%s overrides reprocess_completed=%s (effective reprocess_completed=%s).", - skip_flag, - reprocess_flag, - desired, - ) + if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): + devs = list(range(torch_mod.cuda.device_count())) except Exception: pass - reprocess_flag = desired - reprocess_completed = reprocess_flag - # DeepSeek semantics note - if backend_norm == "deepseek": + if not devs: + msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement" + self.logger.error(msg) + raise RuntimeError(msg) + + from multiprocessing import get_context + + ctx = get_context("spawn") + work_q = ctx.Queue() + result_q = ctx.Queue() + manager = ctx.Manager() + status_map = manager.dict() + for stem in stems: + work_q.put(stem) + + worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") + worker_log_dir_to_use = worker_log_dir_env + if not worker_log_dir_to_use: + default_worker_log_dir = self.logs_dir / "math_workers" try: - self.logger.info( - "DeepSeek backend: Phase-2 math is not required; equations are included inline via OCR." 
+ default_worker_log_dir.mkdir(parents=True, exist_ok=True) + worker_log_dir_to_use = str(default_worker_log_dir) + except Exception as exc: + self.logger.warning( + "Unable to prepare worker log directory %s: %s", + default_worker_log_dir, + exc, ) - except Exception: - pass - # Identify bad documents from parquet (Rust cleaner output) - bad_files: List[str] = [] - skipped_completed = 0 - skipped_skiplist = 0 - parquet_meta: Optional["pd.DataFrame"] = None - ocr_done_files: List[str] = [] - ocr_done_stems: Set[str] = set() - math_done_files: List[str] = [] - math_done_stems: Set[str] = set() + worker_log_dir_to_use = None + if worker_log_dir_to_use: + os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_to_use + marker_base = Path(worker_log_dir_to_use) if worker_log_dir_to_use else (self.logs_dir / "math_workers") try: - from glossapi.parquet_schema import ParquetSchema - parquet_schema = ParquetSchema({"url_column": self.url_column}) - parquet_path = self._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) - if parquet_path and parquet_path.exists(): - import pandas as _pd - df = _pd.read_parquet(parquet_path) - if "filename" in df.columns and "needs_ocr" in df.columns: - bad_files = df.loc[df["needs_ocr"] == True, "filename"].dropna().astype(str).tolist() - else: - # No fallback: selection relies strictly on the 'needs_ocr' flag - # populated by the cleaner. If missing, we skip OCR selection. 
- bad_files = [] - ocr_done: Set[str] = set() - if "ocr_success" in df.columns: - ocr_done_files = df.loc[df["ocr_success"].fillna(False), "filename"].dropna().astype(str).tolist() - ocr_done = {canonical_stem(str(name)) for name in ocr_done_files} - ocr_done_stems = set(ocr_done) - if "math_enriched" in df.columns: - math_done_files = df.loc[df["math_enriched"].fillna(False), "filename"].dropna().astype(str).tolist() - elif "enriched_math" in df.columns: - math_done_files = df.loc[df["enriched_math"].fillna(False), "filename"].dropna().astype(str).tolist() - if math_done_files: - math_done_stems = {canonical_stem(str(name)) for name in math_done_files} - if not reprocess_completed and ocr_done: - before = len(bad_files) - bad_files = [name for name in bad_files if canonical_stem(name) not in ocr_done] - removed = before - len(bad_files) - if removed: - skipped_completed = removed - self.logger.info( - "OCR: skipping %d already completed document(s) (reprocess_completed=False).", - removed, - ) - if reprocess_completed and mode_norm in {"ocr_bad", "ocr_bad_then_math"} and ocr_done_files: - pending = {str(f) for f in bad_files} - for fname in ocr_done_files: - if fname not in pending: - bad_files.append(fname) - pending.add(fname) - parquet_meta = df - else: - parquet_meta = None + marker_base.mkdir(parents=True, exist_ok=True) except Exception: pass + marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} - ocr_candidates_initial = len(bad_files) - skiplist_path = _resolve_skiplist_path(self.output_dir, self.logger) - skip_mgr = _SkiplistManager(skiplist_path, self.logger) - skip_stems = skip_mgr.load() - if skip_stems: - before = len(bad_files) - bad_files = [name for name in bad_files if canonical_stem(name) not in skip_stems] - removed = before - len(bad_files) - if removed: - skipped_skiplist = removed - self.logger.warning( - "Skip-list %s filtered %d document(s) from Phase-3 OCR.", - skiplist_path, - removed, - ) + 
procs: List[Any] = [] + active: List[Any] = [] + proc_gpu: Dict[int, int] = {} try: - self.logger.info( - "OCR targets: total=%d kept=%d skipped_completed=%d skipped_skiplist=%d", - ocr_candidates_initial, - len(bad_files), - skipped_completed, - skipped_skiplist, - ) + respawn_cap = int(os.environ.get("GLOSSAPI_MATH_RESPAWN_CAP", "5")) except Exception: - pass + respawn_cap = 5 + respawn_cap = max(0, respawn_cap) + respawn_counts: Dict[int, int] = {dev_id: 0 for dev_id in devs} - # Helper to run Phase‑2 enrichment over stems - def _run_math(stems: List[str]) -> None: - if not stems: - self.logger.info("No Docling JSON found for math enrichment.") - return - initial_math_targets = len(stems) - current_skips = skip_mgr.reload() if skip_mgr else set() - if current_skips: - before = len(stems) - stems = [s for s in stems if s not in current_skips] - removed = before - len(stems) - if removed: - self.logger.warning( - "Skip-list %s filtered %d document(s) from Phase-2 math.", - skiplist_path, - removed, - ) - if not stems: - self.logger.info("All math targets filtered by skip-list; nothing to do.") - return - try: - self.logger.info( - "Math targets: total=%d kept=%d filtered_skiplist=%d", - initial_math_targets, - len(stems), - initial_math_targets - len(stems), - ) - except Exception: - pass - local_targets = None - if math_targets: - local_targets = {s: math_targets.get(s) for s in stems if s in math_targets} - if str(use_gpus).lower() == "multi": - # Detect GPU devices - devs = devices or [] - if not devs: - try: - import subprocess - p = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5) - if p.returncode == 0 and p.stdout: - for line in p.stdout.splitlines(): - if line.startswith("GPU "): - try: - idx = int(line.split(":", 1)[0].split()[1]) - devs.append(idx) - except Exception: - pass - except Exception: - pass - if not devs: - torch_mod = _maybe_import_torch() - try: - if torch_mod is not None and 
getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - devs = list(range(torch_mod.cuda.device_count())) - except Exception: - pass - if not devs: - msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement" - self.logger.error(msg) - raise RuntimeError(msg) - else: - from multiprocessing import get_context - - ctx = get_context("spawn") - work_q = ctx.Queue() - result_q = ctx.Queue() - manager = ctx.Manager() - status_map = manager.dict() - for s in stems: - work_q.put(s) - - worker_log_dir_env = os.environ.get("GLOSSAPI_WORKER_LOG_DIR") - worker_log_dir_to_use = worker_log_dir_env - if not worker_log_dir_to_use: - default_worker_log_dir = self.logs_dir / "math_workers" - try: - default_worker_log_dir.mkdir(parents=True, exist_ok=True) - worker_log_dir_to_use = str(default_worker_log_dir) - except Exception as exc: - self.logger.warning( - "Unable to prepare worker log directory %s: %s", - default_worker_log_dir, - exc, - ) - worker_log_dir_to_use = None - if worker_log_dir_to_use: - os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_to_use - marker_base = Path(worker_log_dir_to_use) if worker_log_dir_to_use else (self.logs_dir / "math_workers") - try: - marker_base.mkdir(parents=True, exist_ok=True) - except Exception: - pass - marker_files: Dict[int, Path] = {dev_id: marker_base / f"gpu{dev_id}.current" for dev_id in devs} - - procs: List[Any] = [] - active: List[Any] = [] - proc_gpu: Dict[int, int] = {} - try: - respawn_cap = int(os.environ.get("GLOSSAPI_MATH_RESPAWN_CAP", "5")) - except Exception: - respawn_cap = 5 - respawn_cap = max(0, respawn_cap) - respawn_counts: Dict[int, int] = {dev_id: 0 for dev_id in devs} + for dev_id in devs: + proc = ctx.Process( + target=_gpu_math_worker, + args=( + dev_id, + str(self.input_dir), + str(self.output_dir), + work_q, + int(request.math_batch_size), + int(request.math_dpi_base), + request.device or "cuda", + local_targets or {}, + result_q, + status_map, + str(marker_base), + ), 
+ ) + proc.start() + procs.append(proc) + active.append(proc) + if proc.pid is not None: + proc_gpu[proc.pid] = dev_id - for dev_id in devs: - p = ctx.Process( + try: + last_summary = time.time() + while active: + for proc in list(active): + proc.join(timeout=0.05) + if proc.is_alive(): + continue + active.remove(proc) + if proc in procs: + procs.remove(proc) + pid = proc.pid or -1 + gpu_id = proc_gpu.pop(pid, None) + exitcode = proc.exitcode + stems_for_skip: List[str] = [] + if gpu_id is not None: + current_entry = status_map.pop(gpu_id, None) + if current_entry: + if isinstance(current_entry, (list, tuple, set)): + entries = list(current_entry) + else: + entries = [current_entry] + stems_for_skip = [str(item) for item in entries if item] + marker_path = marker_files.get(gpu_id) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass + if exitcode not in (0, None) and gpu_id is not None: + if stems_for_skip and skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in stems_for_skip) + self.logger.warning("Math worker GPU%s exited with %s", gpu_id, exitcode) + respawn_counts[gpu_id] = respawn_counts.get(gpu_id, 0) + 1 + attempts = respawn_counts[gpu_id] + if respawn_cap and attempts > respawn_cap: + self.logger.error( + "Math worker GPU%s exceeded respawn cap (%s); not respawning", + gpu_id, + respawn_cap, + ) + continue + replacement = ctx.Process( target=_gpu_math_worker, args=( - dev_id, + gpu_id, str(self.input_dir), str(self.output_dir), work_q, - int(math_batch_size), - int(math_dpi_base), - device or "cuda", + int(request.math_batch_size), + int(request.math_dpi_base), + request.device or "cuda", local_targets or {}, result_q, status_map, str(marker_base), ), ) - p.start() - procs.append(p) - active.append(p) - if p.pid is not None: - proc_gpu[p.pid] = dev_id + replacement.start() + procs.append(replacement) + active.append(replacement) + if replacement.pid is not None: + proc_gpu[replacement.pid] = gpu_id + 
continue + while True: try: - last_summary = time.time() - while active: - for p in list(active): - p.join(timeout=0.05) - if p.is_alive(): - continue - active.remove(p) - if p in procs: - procs.remove(p) - pid = p.pid or -1 - gpu_id = proc_gpu.pop(pid, None) - exitcode = p.exitcode - stems_for_skip: List[str] = [] - if gpu_id is not None: - current_entry = status_map.pop(gpu_id, None) - if current_entry: - if isinstance(current_entry, (list, tuple, set)): - entries = list(current_entry) - else: - entries = [current_entry] - stems_for_skip = [str(item) for item in entries if item] - marker_path = marker_files.get(gpu_id) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass - if exitcode not in (0, None) and gpu_id is not None: - if stems_for_skip: - skip_mgr.add(canonical_stem(s) for s in stems_for_skip) - self.logger.warning( - "Math worker GPU%s exited with %s", - gpu_id, - exitcode, - ) - respawn_counts[gpu_id] = respawn_counts.get(gpu_id, 0) + 1 - attempts = respawn_counts[gpu_id] - if respawn_cap and attempts > respawn_cap: - self.logger.error( - "Math worker GPU%s exceeded respawn cap (%s); not respawning", - gpu_id, - respawn_cap, - ) - continue - replacement = ctx.Process( - target=_gpu_math_worker, - args=( - gpu_id, - str(self.input_dir), - str(self.output_dir), - work_q, - int(math_batch_size), - int(math_dpi_base), - device or "cuda", - local_targets or {}, - result_q, - status_map, - str(marker_base), - ), - ) - replacement.start() - procs.append(replacement) - active.append(replacement) - if replacement.pid is not None: - proc_gpu[replacement.pid] = gpu_id - continue - - while True: - try: - event = result_q.get_nowait() - except queue.Empty: - break - if not event: - continue - if event.get("event") == "math_batch": - stems_bad = event.get("problematic", []) - if stems_bad: - skip_mgr.add(canonical_stem(s) for s in stems_bad) - worker = event.get("worker") - try: - worker_gpu = int(worker) - except Exception: - 
worker_gpu = None - if worker_gpu is not None: - status_map.pop(worker_gpu, None) - marker_path = marker_files.get(worker_gpu) - if marker_path: - try: - marker_path.unlink(missing_ok=True) - except Exception: - pass - elif event.get("event") == "exit" and event.get("exitcode", 0) not in (0, None): - self.logger.warning( - "Math worker GPU%s reported exit code %s", - event.get("worker"), - event.get("exitcode"), - ) - - now = time.time() - if now - last_summary > 30: - try: - qsize = work_q.qsize() - except NotImplementedError: - qsize = -1 - self.logger.info( - "Math progress: queue=%d active_workers=%d", - qsize, - len(active), - ) - last_summary = now - - if not active: - break - remaining_after_cap: List[str] = [] - try: - while True: - item = work_q.get_nowait() - if isinstance(item, str) and item.strip(): - remaining_after_cap.append(item) - except queue.Empty: - pass - if remaining_after_cap: - skip_mgr.add(canonical_stem(s) for s in remaining_after_cap) - self.logger.error( - "No active math workers remain; skipped %d pending item(s)", - len(remaining_after_cap), - ) - finally: - for p in procs: - if p.is_alive(): - p.join() + event = result_q.get_nowait() + except queue.Empty: + break + if not event: + continue + if event.get("event") == "math_batch": + stems_bad = event.get("problematic", []) + if stems_bad and skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in stems_bad) + worker = event.get("worker") try: - manager.shutdown() + worker_gpu = int(worker) except Exception: - pass - if worker_log_dir_env is not None: - os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_env - else: - os.environ.pop("GLOSSAPI_WORKER_LOG_DIR", None) - return - # Single-GPU path - self.formula_enrich_from_json( - files=stems, - device=(device or "cuda"), - batch_size=int(math_batch_size), - dpi_base=int(math_dpi_base), - targets_by_stem=local_targets, - ) - - # Branches - if mode_norm == "math_only": - if not math_enhance: - self.logger.info("OCR: 
fix_bad=False and math_enhance=False → nothing to do") - return - # Math-only: ensure JSON exists; if not, generate without OCR - json_dir = self.output_dir / "json" - stems: List[str] = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - # Do not generate layout JSON here; Phase‑1 is responsible for JSON artifacts. - # Never run math on files that need OCR - if bad_files: - before = len(stems) - bad_set = {canonical_stem(s) for s in bad_files} - stems = [s for s in stems if s not in bad_set] - removed = before - len(stems) - if removed: - try: - self.logger.info( - "Math-only: skipping %d document(s) flagged for OCR", - removed, - ) - except Exception: - pass - if not reprocess_completed and stems and parquet_meta is not None: - if math_done_stems: - before = len(stems) - stems = [s for s in stems if s not in math_done_stems] - removed = before - len(stems) - if removed: - self.logger.info( - "Math enrichment: skipping %d already enriched document(s) (reprocess_completed=False).", - removed, + worker_gpu = None + if worker_gpu is not None: + status_map.pop(worker_gpu, None) + marker_path = marker_files.get(worker_gpu) + if marker_path: + try: + marker_path.unlink(missing_ok=True) + except Exception: + pass + elif event.get("event") == "exit" and event.get("exitcode", 0) not in (0, None): + self.logger.warning( + "Math worker GPU%s reported exit code %s", + event.get("worker"), + event.get("exitcode"), ) - _run_math(stems) - return - # 'ocr_bad' and 'ocr_bad_then_math' paths: OCR bad files first - if mode_norm in {"ocr_bad", "ocr_bad_then_math"} and not bad_files: - self.logger.info("OCR: no bad documents flagged by cleaner; skipping OCR fix") - if mode_norm == "ocr_bad_then_math": - json_dir = self.output_dir / "json" - stems = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - _run_math(stems) - return - - reran_ocr = False - - if mode_norm in 
{"ocr_bad", "ocr_bad_then_math"}: - if backend_norm == "deepseek": - # DeepSeek path: run OCR via dedicated runner (no Docling JSON) - from glossapi.ocr.deepseek import runner as _deepseek_runner # type: ignore - - try: - _deepseek_runner.run_for_files( - self, - bad_files, - model_dir=Path(model_dir) if model_dir else None, - content_debug=bool(content_debug), - ) - except Exception as _e: - self.logger.error("DeepSeek OCR runner failed: %s", _e) - raise - else: - # RapidOCR/Docling path via Phase-1 extract - self.extract( - input_format="pdf", - num_threads=os.cpu_count() or 4, - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=bad_files, - skip_existing=False, - use_gpus=use_gpus, - devices=devices, - # Do not generate Docling JSON for OCR targets; math will skip them - export_doc_json=False, - emit_formula_index=False, - phase1_backend="docling", - ) - reran_ocr = True - # Update metadata to reflect successful OCR reruns - try: - from glossapi.parquet_schema import ParquetSchema as _ParquetSchema - - success_files: List[str] = [] - for _fname in bad_files: - stem = canonical_stem(_fname) - if (self.markdown_dir / f"{stem}.md").exists(): - success_files.append(_fname) - - if success_files: - parquet_schema = _ParquetSchema({"url_column": self.url_column}) - parquet_path = self._resolve_metadata_parquet(parquet_schema, ensure=True, search_input=True) - if parquet_path and parquet_path.exists(): - import pandas as _pd - - df_meta = _pd.read_parquet(parquet_path) - if "filename" in df_meta.columns: - if "filter" not in df_meta.columns: - df_meta["filter"] = "ok" - if "needs_ocr" not in df_meta.columns: - df_meta["needs_ocr"] = False - if "ocr_success" not in df_meta.columns: - df_meta["ocr_success"] = False - if "extraction_mode" not in df_meta.columns: - df_meta["extraction_mode"] = None - for _fname in success_files: - mask = df_meta["filename"].astype(str) == str(_fname) - if mask.any(): - df_meta.loc[mask, 
"filter"] = "ok" - df_meta.loc[mask, "needs_ocr"] = False - df_meta.loc[mask, "ocr_success"] = True - if backend_norm == "deepseek": - df_meta.loc[mask, "extraction_mode"] = "deepseek" - self._cache_metadata_parquet(parquet_path) - parquet_schema.write_metadata_parquet(df_meta, parquet_path) - # Keep sectioner in sync with newly recovered files + now = time.time() + if now - last_summary > 30: try: - stems = [canonical_stem(_f) for _f in success_files] - if hasattr(self, "good_files"): - for _stem in stems: - if _stem not in getattr(self, "good_files", []): - self.good_files.append(_stem) - except Exception: - pass - except Exception as _e: - self.logger.warning("Failed to update OCR success metadata: %s", _e) + qsize = work_q.qsize() + except NotImplementedError: + qsize = -1 + self.logger.info( + "Math progress: queue=%d active_workers=%d", + qsize, + len(active), + ) + last_summary = now - if reran_ocr: + if not active: + break + + remaining_after_cap: List[str] = [] try: - self.logger.info("Re-running Rust cleaner after OCR rerun to refresh metrics") - self.clean( - input_dir=self.markdown_dir, - drop_bad=False, + while True: + item = work_q.get_nowait() + if isinstance(item, str) and item.strip(): + remaining_after_cap.append(item) + except queue.Empty: + pass + if remaining_after_cap: + if skip_mgr is not None: + skip_mgr.add(canonical_stem(stem) for stem in remaining_after_cap) + self.logger.error( + "No active math workers remain; skipped %d pending item(s)", + len(remaining_after_cap), ) - except Exception as _e: - self.logger.warning("Cleaner refresh after OCR failed: %s", _e) - - if mode_norm == "ocr_bad_then_math": + finally: + for proc in procs: + if proc.is_alive(): + proc.join() try: - # Run math only on documents that do NOT require OCR - json_dir = self.output_dir / "json" - stems: List[str] = [] - if json_dir.exists(): - stems = sorted({canonical_stem(p) for p in json_dir.glob("*.docling.json*")}) - bad_set = {canonical_stem(f) for f in bad_files} 
- if stems: - # When OCR was rerun we now want math on all stems (bad_set included). - # Only skip bad_set when no rerun happened. - if not reran_ocr: - stems = [s for s in stems if s not in bad_set] - if not reprocess_completed: - if math_done_stems: - before = len(stems) - stems = [s for s in stems if s not in math_done_stems] - removed = before - len(stems) - if removed: - self.logger.info( - "Math enrichment: skipping %d already enriched document(s) (reprocess_completed=False).", - removed, - ) - if not stems: - self.logger.info("Math enrichment: no pending documents after filtering.") - return - # Best-effort: ensure placeholder sidecars for metadata-selected math targets - try: - from glossapi.parquet_schema import ParquetSchema as _ParquetSchema - _ps = _ParquetSchema({"url_column": self.url_column}) - _pq = self._resolve_metadata_parquet(_ps, ensure=True, search_input=True) - except Exception: - _pq = None - if _pq and _pq.exists(): - try: - import pandas as _pd, json as _json - _df = _pd.read_parquet(_pq) - if "filename" in _df.columns: - _df['stem'] = _df['filename'].astype(str).str.replace(r"\.pdf$", "", regex=True) - _phase = _df['phase_recommended'].astype(str) == '2A' if 'phase_recommended' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _ft = (_df['formula_total'].fillna(0).astype('float') > 0) if 'formula_total' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _med = (_df['math_equations_detected'].fillna(0).astype('float') > 0) if 'math_equations_detected' in _df.columns else ((_df['filename'] == _df['filename']) & False) - _mask = _phase | _ft | _med - _parq_stems = set(_df.loc[_mask, 'stem'].dropna().astype(str).tolist()) - if _parq_stems: - sc_dir = self.output_dir / 'sidecars' / 'math' - sc_dir.mkdir(parents=True, exist_ok=True) - for _s in (set(stems) | _parq_stems): - _p = sc_dir / f"{_s}.json" - if not _p.exists(): - _p.write_text(_json.dumps({"items": 0, "accepted": 0, "time_sec": 0.0}, 
ensure_ascii=False), encoding='utf-8') - except Exception: - pass - try: - self.logger.info("OCR: invoking Phase-2 math for stems: %s", ",".join(stems)) - except Exception: - pass - _run_math(stems) - try: - self.logger.info("OCR: Phase-2 math completed for stems: %s", ",".join(stems)) - except Exception: - pass - except Exception as _e: - self.logger.warning("Phase‑2 enrichment after OCR failed: %s", _e) + manager.shutdown() + except Exception: + pass + if worker_log_dir_env is not None: + os.environ["GLOSSAPI_WORKER_LOG_DIR"] = worker_log_dir_env + else: + os.environ.pop("GLOSSAPI_WORKER_LOG_DIR", None) def formula_enrich_from_json( self, diff --git a/src/glossapi/download_policy.py b/src/glossapi/download_policy.py new file mode 100644 index 0000000..36d3ce6 --- /dev/null +++ b/src/glossapi/download_policy.py @@ -0,0 +1,135 @@ +"""Policy routing for downloader selection.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, Optional +from urllib.parse import urlparse + +import yaml + +VALID_DOWNLOADERS = {"standard", "browser", "auto"} +ROUTE_OPTION_KEYS = { + "request_timeout", + "ssl_verify", + "ssl_cafile", + "request_method", + "sleep", + "per_domain_concurrency", + "domain_concurrency_floor", + "domain_concurrency_ceiling", + "skip_failed_after", + "domain_cookies", + "browser_timeout_ms", + "browser_post_load_wait_ms", + "browser_engine", + "browser_headless", + "browser_session_ttl_seconds", +} + + +def _normalize_downloader(value: Any, default: str = "standard") -> str: + normalized = str(value or default).strip().lower() + if normalized in {"default", "http"}: + normalized = "standard" + if normalized in {"browser_fallback"}: + normalized = "auto" + if normalized in {"browser_protected"}: + normalized = "browser" + if normalized not in VALID_DOWNLOADERS: + raise ValueError(f"Unsupported downloader route: {value}") + return normalized + + 
+@dataclass(frozen=True) +class DownloadPolicyMatch: + domains: tuple[str, ...] = () + url_regex: Optional[re.Pattern[str]] = None + + def matches(self, url: str) -> bool: + parsed = urlparse(url) + hostname = (parsed.hostname or "").lower() + if self.domains: + matched_domain = any( + hostname == domain or hostname.endswith(f".{domain}") + for domain in self.domains + ) + if not matched_domain: + return False + if self.url_regex and not self.url_regex.search(url): + return False + return True + + +@dataclass(frozen=True) +class DownloadPolicyRule: + matcher: DownloadPolicyMatch + downloader: str + options: Dict[str, Any] + + def matches(self, url: str) -> bool: + return self.matcher.matches(url) + + +@dataclass(frozen=True) +class DownloadPolicy: + default_downloader: str = "standard" + default_options: Dict[str, Any] | None = None + rules: tuple[DownloadPolicyRule, ...] = () + + def resolve(self, url: str) -> tuple[str, Dict[str, Any]]: + for rule in self.rules: + if rule.matches(url): + return rule.downloader, dict(rule.options) + return self.default_downloader, dict(self.default_options or {}) + + +def _extract_route_options(data: Dict[str, Any]) -> Dict[str, Any]: + return {key: value for key, value in data.items() if key in ROUTE_OPTION_KEYS} + + +def _build_matcher(raw: Dict[str, Any]) -> DownloadPolicyMatch: + domains = tuple(str(item).strip().lower() for item in (raw.get("domains") or []) if str(item).strip()) + url_regex = raw.get("url_regex") + compiled = re.compile(str(url_regex)) if url_regex else None + return DownloadPolicyMatch(domains=domains, url_regex=compiled) + + +def build_download_policy(data: Dict[str, Any]) -> DownloadPolicy: + default_block = dict(data.get("default") or {}) + default_downloader = _normalize_downloader(default_block.get("downloader"), default="standard") + default_options = _extract_route_options(default_block) + + rules = [] + for raw_rule in data.get("rules") or []: + raw_rule = dict(raw_rule or {}) + matcher = 
_build_matcher(dict(raw_rule.get("match") or {})) + downloader = _normalize_downloader(raw_rule.get("downloader"), default=default_downloader) + options = _extract_route_options(raw_rule) + rules.append(DownloadPolicyRule(matcher=matcher, downloader=downloader, options=options)) + + return DownloadPolicy( + default_downloader=default_downloader, + default_options=default_options, + rules=tuple(rules), + ) + + +def load_download_policy(path: str | Path) -> DownloadPolicy: + policy_path = Path(path).expanduser().resolve() + payload = yaml.safe_load(policy_path.read_text(encoding="utf-8")) or {} + if not isinstance(payload, dict): + raise ValueError("Download policy file must define a mapping at the top level") + return build_download_policy(payload) + + +__all__ = [ + "DownloadPolicy", + "DownloadPolicyMatch", + "DownloadPolicyRule", + "VALID_DOWNLOADERS", + "build_download_policy", + "load_download_policy", +] diff --git a/src/glossapi/gloss_browser_downloader.py b/src/glossapi/gloss_browser_downloader.py new file mode 100644 index 0000000..66a7c6e --- /dev/null +++ b/src/glossapi/gloss_browser_downloader.py @@ -0,0 +1,527 @@ +"""Browser-capable downloader mode for browser-gated file endpoints.""" + +from __future__ import annotations + +import asyncio +import io +import json +import os +import re +import time +from dataclasses import dataclass +from urllib.parse import urlparse +from typing import Any, Dict, Optional, Tuple + +import aiofiles +import aiohttp +from PIL import Image + +from .download_policy import DownloadPolicy, load_download_policy +from .gloss_downloader import GlossDownloader + + +@dataclass +class BrowserSessionState: + user_agent: str + cookie_header: str + cached_at: float + + +class BrowserGlossDownloader(GlossDownloader): + """ + Downloader variant that retries browser-gated file endpoints via Playwright. + + This mode only targets file endpoints that are protected by browser/session + checks. 
It intentionally does not attempt viewer-style extraction. + """ + + def __init__( + self, + *args, + browser_timeout_ms: int = 60000, + browser_post_load_wait_ms: int = 3000, + browser_engine: str = "chromium", + browser_headless: bool = True, + browser_session_ttl_seconds: int = 900, + browser_max_parallel_bootstraps: int = 2, + default_download_route: str = "auto", + **kwargs, + ): + super().__init__(*args, **kwargs) + self.browser_timeout_ms = int(browser_timeout_ms) + self.browser_post_load_wait_ms = int(browser_post_load_wait_ms) + self.browser_engine = str(browser_engine or "chromium") + self.browser_headless = bool(browser_headless) + self.browser_session_ttl_seconds = int(browser_session_ttl_seconds) + self.browser_max_parallel_bootstraps = max(1, int(browser_max_parallel_bootstraps)) + self.browser_bootstrap_semaphore = asyncio.Semaphore(self.browser_max_parallel_bootstraps) + self._browser_session_cache: Dict[str, BrowserSessionState] = {} + self._browser_session_locks: Dict[str, asyncio.Lock] = {} + self.default_download_route = str(default_download_route or "auto").strip().lower() + self.policy = self._load_policy() + + def _load_policy(self) -> Optional[DownloadPolicy]: + if self.download_policy is not None: + return self.download_policy + if self.download_policy_file: + return load_download_policy(self.download_policy_file) + return None + + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + if self.policy is not None: + return self.policy.resolve(url) + return self.default_download_route, {} + + def _route_setting(self, route_options: Dict[str, Any], name: str, fallback: Any) -> Any: + return route_options.get(name, fallback) + + def _domain_key(self, url: str) -> str: + return self._extract_base_domain(url) or (urlparse(url).hostname or "").lower() + + def _choose_browser_bootstrap_url(self, url: str) -> str: + if self._url_looks_like_file_endpoint(url): + return self.get_base_url(url) + return url + + def 
_should_ignore_navigation_exception(self, url: str, exc: Exception) -> bool: + message = str(exc) + if self._url_looks_like_file_endpoint(url) and "net::ERR_ABORTED" in message: + return True + return False + + def _session_lock_for_domain(self, domain_key: str) -> asyncio.Lock: + lock = self._browser_session_locks.get(domain_key) + if lock is None: + lock = asyncio.Lock() + self._browser_session_locks[domain_key] = lock + return lock + + def _is_browser_session_fresh(self, state: BrowserSessionState, route_options: Dict[str, Any]) -> bool: + ttl = int(self._route_setting(route_options, "browser_session_ttl_seconds", self.browser_session_ttl_seconds)) + if ttl <= 0: + return False + return (time.time() - state.cached_at) < ttl + + def _should_attempt_browser_recovery(self, url: str, html_issue: str) -> bool: + issue = str(html_issue or "").lower() + if "document viewer returned" in issue: + return False + if "challenge page returned" in issue: + return True + if "cookie bootstrap is required" in issue: + return True + if "expected a file-like response but received html instead" in issue: + return self._url_looks_like_file_endpoint(url) + return False + + def _extract_academy_document_id(self, url: str) -> Optional[str]: + parsed = urlparse(str(url or "")) + host = (parsed.hostname or "").lower() + if host != "repository.academyofathens.gr": + return None + match = re.match(r"^/document/(\d+)(?:\.pdf)?/?$", parsed.path or "") + if not match: + return None + return match.group(1) + + async def _fetch_bytes(self, session: aiohttp.ClientSession, url: str) -> bytes: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=min(max(self.request_timeout, 60), 180))) as response: + response.raise_for_status() + return await response.read() + + def _academy_images_to_pdf_bytes(self, image_blobs: list[bytes]) -> bytes: + if not image_blobs: + raise RuntimeError("No Academy image pages available to synthesize PDF") + images = [] + try: + for blob in image_blobs: + img 
= Image.open(io.BytesIO(blob)).convert("RGB") + images.append(img) + out = io.BytesIO() + images[0].save(out, format="PDF", save_all=True, append_images=images[1:]) + return out.getvalue() + finally: + for img in images: + try: + img.close() + except Exception: + pass + + async def _download_academy_bookreader_pdf(self, url: str) -> Optional[bytes]: + item_id = self._extract_academy_document_id(url) + if not item_id: + return None + + candidate_bases = [ + "https://repo.academyofathens.gr", + "https://digitallibrary.academyofathens.gr", + ] + connector = self._build_ssl_connector() + headers = {"User-Agent": "Mozilla/5.0", "Accept": "application/json,*/*"} + async with aiohttp.ClientSession(connector=connector, headers=headers) as session: + for base_url in candidate_bases: + try: + payload_bytes = await self._fetch_bytes(session, f"{base_url}/archive/bookreader_options/{item_id}") + payload = json.loads(payload_bytes.decode("utf-8", errors="ignore")) + except Exception: + continue + + page_data = payload.get("data") + if not isinstance(page_data, list) or not page_data: + continue + + image_urls: list[str] = [] + for page in page_data: + if not page or not isinstance(page, list): + continue + first = page[0] if page else None + uri = first.get("uri") if isinstance(first, dict) else None + if not uri: + continue + image_urls.append(uri if uri.startswith("http") else f"{base_url}{uri}") + + if not image_urls: + continue + + image_blobs: list[bytes] = [] + try: + for image_url in image_urls: + image_blobs.append(await self._fetch_bytes(session, image_url)) + except Exception: + continue + + try: + return await asyncio.to_thread(self._academy_images_to_pdf_bytes, image_blobs) + except Exception: + continue + return None + + async def _recover_source_specific_html_interstitial( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + pdf_body = await 
self._download_academy_bookreader_pdf(url) + if not pdf_body: + return None + + filename = f"{filename_base}.pdf" if filename_base and str(filename_base).strip() else self.generate_filename(row_index, "pdf") + await self._write_recovered_file(row_index, filename, pdf_body) + self.logger.info("Recovered Academy document via bookreader image->PDF fallback: %s -> %s", url, filename) + return True, filename, "pdf", "", retry_count + + def _build_ssl_connector(self) -> Optional[aiohttp.TCPConnector]: + connector = None + if not self.ssl_verify: + connector = aiohttp.TCPConnector(ssl=False) + elif self.ssl_cafile: + import ssl as _ssl + + ctx = _ssl.create_default_context(cafile=self.ssl_cafile) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + def _domain_cookies_for_url(self, url: str) -> Dict[str, str]: + cookies: Dict[str, str] = {} + for domain_pattern, domain_cookies in self.domain_cookies.items(): + if domain_pattern in url: + cookies.update(domain_cookies) + return cookies + + async def _write_recovered_file(self, row_index: int, filename: str, body: bytes) -> None: + tmp_path = self.downloads_dir / f".part_browser_{row_index}" + async with aiofiles.open(tmp_path, "wb") as handle: + await handle.write(body) + final_path = self.downloads_dir / filename + os.replace(tmp_path, final_path) + + async def _fetch_with_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + state: BrowserSessionState, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + request_headers = { + "User-Agent": state.user_agent, + "Accept": "application/pdf,application/octet-stream,*/*;q=0.8", + } + if state.cookie_header: + request_headers["Cookie"] = state.cookie_header + if referer: + request_headers["Referer"] = referer + + connector = self._build_ssl_connector() + timeout = aiohttp.ClientTimeout(total=min(max(self.request_timeout, 30), 180)) + async with aiohttp.ClientSession(connector=connector) as session: + async with session.get(url, 
headers=request_headers, timeout=timeout) as response: + response.raise_for_status() + body = await response.read() + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + return body, response_headers, {"candidate_url": url, "session_reused": True} + + async def _bootstrap_browser_session_state( + self, + *, + url: str, + referer: Optional[str], + route_options: Dict[str, Any], + ) -> tuple[BrowserSessionState, list[tuple[str, Dict[str, str], str]]]: + timeout_ms = int(self._route_setting(route_options, "browser_timeout_ms", self.browser_timeout_ms)) + post_load_wait_ms = int( + self._route_setting(route_options, "browser_post_load_wait_ms", self.browser_post_load_wait_ms) + ) + browser_engine = str(self._route_setting(route_options, "browser_engine", self.browser_engine)) + browser_headless = bool(self._route_setting(route_options, "browser_headless", self.browser_headless)) + + try: + from playwright.async_api import async_playwright + except ImportError as exc: # pragma: no cover - exercised via monkeypatch + raise RuntimeError( + "Browser download mode requires the optional 'browser' dependencies " + "(install Playwright and browser binaries)" + ) from exc + + accepted_responses: list[tuple[str, Dict[str, str], str]] = [] + bootstrap_url = self._choose_browser_bootstrap_url(url) + + async with self.browser_bootstrap_semaphore: + async with async_playwright() as playwright: + browser_type = getattr(playwright, browser_engine, None) + if browser_type is None: + raise RuntimeError(f"Unsupported browser engine: {browser_engine}") + + browser = await browser_type.launch(headless=browser_headless) + context = await browser.new_context(ignore_https_errors=not self.ssl_verify) + parsed = urlparse(url) + browser_cookies = [ + { + "name": key, + "value": str(value), + "domain": parsed.hostname or "", + "path": "/", + } + for key, value in self._domain_cookies_for_url(url).items() + ] + if browser_cookies: + await 
context.add_cookies(browser_cookies) + page = await context.new_page() + if referer: + await page.set_extra_http_headers({"Referer": referer}) + + async def _route_filter(route: Any) -> None: + req = route.request + if req.resource_type in {"image", "media", "font"}: + await route.abort() + return + req_url = str(req.url or "") + if "googletagmanager" in req_url or "google-analytics.com" in req_url: + await route.abort() + return + await route.continue_() + + await page.route("**/*", _route_filter) + + def _record_response(response: Any) -> None: + try: + response_headers = {str(k): str(v) for k, v in (response.headers or {}).items()} + file_ext = self.infer_file_extension(response.url, response_headers, b"") + if file_ext and file_ext != "html" and self.is_supported_format(file_ext): + accepted_responses.append((response.url, response_headers, file_ext)) + except Exception: + return + + page.on("response", _record_response) + + try: + main_response = None + try: + main_response = await page.goto(bootstrap_url, wait_until="networkidle", timeout=timeout_ms) + except Exception as exc: + if not self._should_ignore_navigation_exception(bootstrap_url, exc): + raise + if main_response is not None: + main_headers = {str(k): str(v) for k, v in (main_response.headers or {}).items()} + main_ext = self.infer_file_extension(main_response.url, main_headers, b"") + if main_ext and main_ext != "html" and self.is_supported_format(main_ext): + accepted_responses.insert(0, (main_response.url, main_headers, main_ext)) + if not accepted_responses and post_load_wait_ms > 0: + await page.wait_for_timeout(post_load_wait_ms) + + browser_user_agent = await page.evaluate("() => navigator.userAgent") + browser_cookies = await context.cookies() + finally: + await browser.close() + + cookie_header = "; ".join( + f"{cookie['name']}={cookie['value']}" for cookie in browser_cookies if cookie.get("name") + ) + return BrowserSessionState( + user_agent=browser_user_agent, + 
cookie_header=cookie_header, + cached_at=time.time(), + ), accepted_responses + + async def _download_via_browser_session( + self, + *, + url: str, + referer: Optional[str], + route_options: Optional[Dict[str, Any]] = None, + force_refresh: bool = False, + ) -> Tuple[bytes, Dict[str, str], Dict[str, Any]]: + options = dict(route_options or {}) + domain_key = self._domain_key(url) + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + lock = self._session_lock_for_domain(domain_key) + async with lock: + state = self._browser_session_cache.get(domain_key) + if state and self._is_browser_session_fresh(state, options) and not force_refresh: + try: + return await self._fetch_with_browser_session_state(url=url, referer=referer, state=state) + except Exception: + pass + + state, accepted_responses = await self._bootstrap_browser_session_state( + url=url, + referer=referer, + route_options=options, + ) + self._browser_session_cache[domain_key] = state + candidate_url = accepted_responses[0][0] if accepted_responses else url + body, response_headers, meta = await self._fetch_with_browser_session_state( + url=candidate_url, + referer=referer, + state=state, + ) + meta.update({ + "candidate_url": candidate_url, + "session_reused": False, + "domain_key": domain_key, + }) + return body, response_headers, meta + + async def _download_browser_route( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + route_options: Dict[str, Any], + ) -> Tuple[bool, str, str, str, int]: + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + referer=referer, + route_options=route_options, + ) + except Exception as exc: + error_msg = f"Browser-routed download failed: {exc}" + 
self.logger.warning(error_msg) + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + return await self._finalize_download_result( + row_index=row_index, + url=meta.get("candidate_url") or url, + resp_headers=response_headers, + content=body, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + route, route_options = self._resolve_route(url) + if route != "browser": + return None + return await self._download_browser_route( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + route_options=route_options, + ) + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + source_specific = await self._recover_source_specific_html_interstitial( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + ) + if source_specific is not None: + return source_specific + + route, route_options = self._resolve_route(url) + if route == "standard": + return None + if route == "auto" and not self._should_attempt_browser_recovery(url, html_issue): + return None + + try: + body, response_headers, meta = await self._download_via_browser_session( + url=url, + referer=referer, + route_options=route_options, + ) + except Exception as exc: + message = f"{html_issue}; browser recovery failed: {exc}" + self.logger.warning(message) + return False, "", "html", message, retry_count + 1 + + file_ext = self.infer_file_extension(meta["candidate_url"], response_headers, body) + if file_ext == "html": + message = ( + 
f"{html_issue}; browser recovery still returned HTML from {meta['candidate_url']}" + ) + self.logger.warning(message) + return False, "", file_ext, message, retry_count + 1 + if not self.is_supported_format(file_ext): + message = ( + f"{html_issue}; browser recovery returned unsupported format: {file_ext}" + ) + self.logger.warning(message) + return False, "", file_ext or "", message, retry_count + 1 + + if filename_base and str(filename_base).strip(): + filename = f"{filename_base}.{file_ext}" + else: + filename = self.generate_filename(row_index, file_ext) + + await self._write_recovered_file(row_index, filename, body) + self.logger.info( + "Recovered browser-gated download via browser mode: %s -> %s", + url, + filename, + ) + return True, filename, file_ext, "", retry_count diff --git a/src/glossapi/gloss_downloader.py b/src/glossapi/gloss_downloader.py index f9a7bf2..45f0d39 100644 --- a/src/glossapi/gloss_downloader.py +++ b/src/glossapi/gloss_downloader.py @@ -141,6 +141,8 @@ def __init__( error_burst_window: int = 20, error_burst_threshold: float = 0.5, park_403_seconds: float = 600.0, + download_policy_file: Optional[Union[str, Path]] = None, + download_policy: Optional[Any] = None, _used_filename_bases: Optional[Set[str]] = None, ): """ @@ -241,6 +243,8 @@ def verbose_log(self, message, level=logging.DEBUG): self.checkpoint_seconds = float(checkpoint_seconds) if checkpoint_seconds else None # Warnings JSON path self.domain_warnings_path = self.output_dir / 'domain_scheduler_warnings.json' + self.download_policy_file = Path(download_policy_file).expanduser().resolve() if download_policy_file else None + self.download_policy = download_policy # Progress logger (separate file; default to output logs dir) self.progress_logger = self.logger @@ -530,12 +534,47 @@ def _extract_base_domain(self, url: str) -> str: except Exception: return '' + def _resolve_route(self, url: str) -> tuple[str, Dict[str, Any]]: + return "standard", {} + + def _route_setting(self, 
route_options: Optional[Dict[str, Any]], name: str, fallback: Any) -> Any: + if route_options and name in route_options: + return route_options[name] + return fallback + + def _resolve_domain_scheduler_settings( + self, + route_options: Optional[Dict[str, Any]], + ) -> tuple[int, int, int, int]: + floor = max( + 1, + int(self._route_setting(route_options, "domain_concurrency_floor", self.domain_concurrency_floor)), + ) + raw_ceiling = self._route_setting(route_options, "domain_concurrency_ceiling", self.domain_concurrency_ceiling) + if raw_ceiling is None: + ceiling = max(floor, int(self.domain_concurrency_ceiling)) + else: + ceiling = max(floor, int(raw_ceiling)) + start = max( + floor, + min( + int(self._route_setting(route_options, "per_domain_concurrency", self.per_domain_concurrency)), + max(1, self.concurrency), + ceiling, + ), + ) + skip_after = max(1, int(self._route_setting(route_options, "skip_failed_after", self.skip_failed_after))) + return floor, ceiling, start, skip_after + @dataclass class _DomainState: base: str queue: deque = field(default_factory=deque) active: int = 0 concurrency: int = 1 + concurrency_floor: int = 1 + concurrency_ceiling: int = 1 + skip_failed_after: int = 3 successes: int = 0 failures: int = 0 http_429: int = 0 @@ -713,15 +752,17 @@ def _ext_from_magic_bytes(self, content: bytes) -> Optional[str]: if not content: return None head = content[:4096] - # PDF - if head.startswith(b'%PDF-'): + lower_head = head.lower() + lstripped = lower_head.lstrip() + # PDF: allow a small junk prefix before the real header. 
+ pdf_idx = head.find(b'%PDF-') + if 0 <= pdf_idx <= 1024: return 'pdf' # HTML (very simple heuristic) - lower_head = head.lower() - if b' Optional[str]: pass return None + def _looks_like_pdf_bytes(self, content: bytes) -> bool: + """Lightweight PDF sanity check for content we are about to persist as a PDF.""" + if not content: + return False + head = content[:4096] + pdf_idx = head.find(b'%PDF-') + return 0 <= pdf_idx <= 1024 + def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes) -> str: """Infer the most likely file extension using URL, headers and content bytes""" + # Strong content sniffing first for the two cases that matter most here: + # real PDFs and HTML bodies masquerading as direct-file endpoints. + sniff_ext = self._ext_from_magic_bytes(content) + if sniff_ext == 'pdf': + return 'pdf' + if sniff_ext == 'html': + return 'html' + # 1) URL path extension url_ext = self.get_file_extension_from_url(url) if self.is_supported_format(url_ext): @@ -758,48 +815,125 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes if ct_ext and self.is_supported_format(ct_ext): return ct_ext - # 4) Magic byte sniffing - sniff_ext = self._ext_from_magic_bytes(content) + # 4) Magic byte sniffing for the remaining supported formats if sniff_ext and self.is_supported_format(sniff_ext): return sniff_ext # 5) Fall back to URL ext if any, otherwise 'bin' return url_ext if url_ext else 'bin' - - async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], - rate_limiter: RateLimiter, retry_count: int = 0, - filename_base: Optional[str] = None, - referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + + def _url_looks_like_file_endpoint(self, url: str) -> bool: + """Return True when the URL shape suggests a direct file download endpoint.""" + try: + lowered = str(url or "").lower() + except Exception: + return False + hints = ( + ".pdf", + ".docx", + ".pptx", + ".xml", + ".csv", 
+ "/pdf", + "format=pdf", + "type=pdf", + "download", + "attachment", + "/file", + "getfile.php", + ) + return any(token in lowered for token in hints) + + def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: bytes) -> Optional[str]: """ - Download a file from a URL - - Args: - row_index: Index in the dataframe - url: URL to download - semaphore: Semaphore for concurrency control - rate_limiter: Rate limiter for API limits - retry_count: Current retry count - Returns: - Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + Detect HTML challenge/viewer pages that should not count as successful downloads. + + We still allow regular HTML documents, but fail fast on common interstitials + such as WAF challenge pages and JavaScript-only document viewers. """ - if not url or pd.isna(url): - return False, "", "", "Empty URL", retry_count - - # Get a new user-agent for each request - user_agent = next(self.user_agents) - domain = urlparse(url).netloc - - # Ensure URL has scheme + try: + lower_headers = {str(k).lower(): str(v).lower() for k, v in (headers or {}).items()} + lower_body = (content or b"")[: 1 << 17].decode("utf-8", errors="ignore").lower() + except Exception: + lower_headers = {} + lower_body = "" + + if not lower_body: + return None + + if ( + "x-amzn-waf-action" in lower_headers + or "awswafintegration" in lower_body + or "challenge.js" in lower_body + or "verify that you're not a robot" in lower_body + or "making sure you're not a bot" in lower_body + or "making sure you're not a bot" in lower_body + or "/.within.website/" in lower_body + or "anubis" in lower_body + ): + return ( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ) + + viewer_markers = ( + "fliphtml5_pages", + "monitor:player:html5", + "javascript/loadingjs.js", + "javascript/main.js", + "bookconfig.totalpagecount", + "getfile.php?lib=", + ) + viewer_hits = sum(1 
for marker in viewer_markers if marker in lower_body) + if viewer_hits >= 2: + return ( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ) + + content_type = lower_headers.get("content-type", "") + if self._url_looks_like_file_endpoint(url) and "text/html" in content_type: + return "Expected a file-like response but received HTML instead" + + return None + + async def _recover_html_interstitial( + self, + *, + row_index: int, + url: str, + headers: Dict[str, str], + content: bytes, + html_issue: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to recover from HTML interstitials via alternate fetch modes.""" + return None + + async def _preflight_download( + self, + *, + row_index: int, + url: str, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Optional[Tuple[bool, str, str, str, int]]: + """Allow subclasses to short-circuit the direct HTTP path for known routes.""" + return None + + def _normalize_request_url(self, url: str) -> str: if not url.startswith(("http://", "https://")): - url = f"https://{url}" - - # Get base URL for referer header + return f"https://{url}" + return url + + def _build_request_headers(self, url: str, user_agent: str, referer: Optional[str]) -> Dict[str, str]: + domain = urlparse(url).netloc base_url = self.get_base_url(url) - - # Enhanced headers with common browser-like attributes to bypass 403 errors - # Prefer caller-provided referer (e.g., the external_link page) - _referer = (referer or '').strip() - headers = { + referer_value = (referer or '').strip() + return { 'User-Agent': user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', @@ -813,75 +947,328 @@ async def download_file(self, row_index: int, 
url: str, semaphore: Optional[asyn 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'TE': 'trailers', - 'Referer': _referer if _referer else f"https://www.google.com/search?q={domain}", + 'Referer': referer_value if referer_value else f"https://www.google.com/search?q={domain}", 'Origin': base_url, 'DNT': '1' } - - # Check for domain-specific cookies - cookies = {} + + def _resolve_request_cookies(self, url: str, route_options: Optional[Dict[str, Any]] = None) -> Dict[str, str]: + cookies: Dict[str, str] = {} for domain_pattern, domain_cookies in self.domain_cookies.items(): if domain_pattern in url: cookies.update(domain_cookies) # If the domain needs dynamic values like random IDs - for key, value in cookies.items(): + for key, value in list(cookies.items()): if 'random.randint' in str(value): # Replace with an actual random value (only supporting this pattern for now) - if 'session-id' in value: + if 'session-id' in str(value): cookies[key] = f"session-id-{random.randint(100000000, 999999999)}" + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict): + cookies.update({str(k): str(v) for k, v in extra_cookies.items()}) + return cookies + + def _build_request_timeout( + self, + retry_count: int, + route_options: Optional[Dict[str, Any]] = None, + ) -> aiohttp.ClientTimeout: + base_request_timeout = float(self._route_setting(route_options, "request_timeout", self.request_timeout)) + return aiohttp.ClientTimeout( + total=min(base_request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes + connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute + sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute + sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + ) + + def _build_session_connector( + self, + url: str, + route_options: Optional[Dict[str, Any]] = None, + ) -> Optional[aiohttp.TCPConnector]: + connector = None + url_base = 
self._extract_base_domain(url) + force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) + ssl_verify = bool(self._route_setting(route_options, "ssl_verify", self.ssl_verify)) + ssl_cafile = self._route_setting(route_options, "ssl_cafile", self.ssl_cafile) + if (not ssl_verify) or force_insecure: + connector = aiohttp.TCPConnector(ssl=False) + elif ssl_cafile: + import ssl as _ssl + ctx = _ssl.create_default_context(cafile=str(ssl_cafile)) + connector = aiohttp.TCPConnector(ssl=ctx) + return connector + + async def _bootstrap_download_session( + self, + session: aiohttp.ClientSession, + url: str, + headers: Dict[str, str], + route_options: Optional[Dict[str, Any]] = None, + ) -> Dict[str, str]: + headers = await self.setup_session(session, url, headers) + + # Set a shorter timeout for the initial connection attempt + base_timeout = aiohttp.ClientTimeout(total=10) + try: + # Visit the base domain to establish cookies if needed + base_domain = urlparse(url).netloc + all_cookie_domains = set(self.domain_cookies.keys()) + extra_cookies = self._route_setting(route_options, "domain_cookies", None) + if isinstance(extra_cookies, dict) and extra_cookies: + all_cookie_domains.add(base_domain) + if any(domain in base_domain for domain in all_cookie_domains): + base_url = f"https://{base_domain}" + async with session.get(base_url, headers=headers, timeout=base_timeout): + pass + except Exception as e: + # Non-fatal error, just log and continue + self.logger.debug(f"Initial base URL visit failed: {str(e)}") + return headers + + def _best_effort_url_extension(self, url: str) -> str: + try: + return self.get_file_extension_from_url(url) + except Exception: + return "" + + def _build_output_filename(self, row_index: int, file_ext: str, filename_base: Optional[str]) -> str: + if filename_base and str(filename_base).strip(): + return f"{filename_base}.{file_ext}" + return self.generate_filename(row_index, file_ext) + + def _cleanup_temp_file(self, tmp_path: 
Optional[Path]) -> None: + if not tmp_path: + return + try: + os.remove(tmp_path) + except Exception: + pass + + def _move_temp_file_to_final(self, tmp_path: Path, filename: str) -> None: + final_path = Path(self.downloads_dir) / filename + try: + os.replace(tmp_path, final_path) + except Exception: + try: + os.rename(tmp_path, final_path) + except Exception: + pass + + async def _finalize_download_result( + self, + *, + row_index: int, + url: str, + resp_headers: Dict[str, str], + content: bytes, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + tmp_path: Optional[Path] = None, + ) -> Tuple[bool, str, str, str, int]: + file_ext = self.infer_file_extension(url, resp_headers, content) + if file_ext == 'html': + html_issue = self._detect_html_interstitial(url, resp_headers, content) + if html_issue: + self._cleanup_temp_file(tmp_path) + recovered = await self._recover_html_interstitial( + row_index=row_index, + url=url, + headers=resp_headers, + content=content, + html_issue=html_issue, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + if recovered is not None: + return recovered + self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}") + return False, "", file_ext, html_issue, retry_count + if not self.is_supported_format(file_ext): + self._cleanup_temp_file(tmp_path) + self.logger.warning( + f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}" + ) + return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count + if file_ext == 'pdf' and not self._looks_like_pdf_bytes(content): + self._cleanup_temp_file(tmp_path) + message = "Invalid PDF signature in downloaded content" + self.logger.warning("%s for %s", message, url) + return False, "", file_ext, message, retry_count + + filename = self._build_output_filename(row_index, file_ext, filename_base) + if tmp_path is not None: + self._move_temp_file_to_final(tmp_path, filename) + else: + await self.write_file(filename, content, self.downloads_dir) + self.logger.info(f"Successfully downloaded {filename} from {url}") + return True, filename, file_ext, "", retry_count + + async def _download_via_streaming_get( + self, + *, + session: aiohttp.ClientSession, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + from tenacity import AsyncRetrying + + head = bytearray() + async for attempt in AsyncRetrying( + stop=stop_after_attempt(max(1, int(self.max_retries))), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=(retry_if_exception_type(aiohttp.ClientError) | + retry_if_exception_type(asyncio.TimeoutError)), + before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), + reraise=True, + ): + with attempt: + async with session.get(url, headers=headers, timeout=timeout) as response: + response.raise_for_status() + resp_headers = dict(response.headers or {}) + tmp_path = Path(self.downloads_dir) / f".part_{row_index}" + async with aiofiles.open(tmp_path, 'wb') as f: + async for chunk in response.content.iter_chunked(1 << 16): + if chunk: + if len(head) < (1 << 16): + need = (1 << 16) - len(head) + head.extend(chunk[:need]) + await f.write(chunk) + return await self._finalize_download_result( + 
row_index=row_index, + url=url, + resp_headers=resp_headers, + content=bytes(head), + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + tmp_path=tmp_path, + ) + return False, "", "", "Retry exhaustion", retry_count + 1 + + async def _download_via_buffered_request( + self, + *, + session: aiohttp.ClientSession, + requester: str, + row_index: int, + url: str, + headers: Dict[str, str], + timeout: aiohttp.ClientTimeout, + retry_count: int, + filename_base: Optional[str], + referer: Optional[str], + ) -> Tuple[bool, str, str, str, int]: + content, status, resp_headers = await self.make_request( + session, requester, url, headers, timeout + ) + return await self._finalize_download_result( + row_index=row_index, + url=url, + resp_headers=resp_headers, + content=content, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) + + def _build_http_error_result( + self, + url: str, + error: aiohttp.ClientResponseError, + retry_count: int, + ) -> Tuple[bool, str, str, str, int]: + status = error.status + self.logger.warning(f"Received {status} for {url}") + + if self.verbose: + self.logger.debug(f"HTTP Error Details - Status: {error.status}, Message: {error.message}") + self.logger.debug(f"Headers: {error.headers if hasattr(error, 'headers') else 'No headers available'}") + self.logger.debug(f"Request info: {error.request_info if hasattr(error, 'request_info') else 'No request info available'}") + + retry_after = None + try: + hdrs = dict(getattr(error, 'headers', {}) or {}) + for k, v in hdrs.items(): + if k.lower() == 'retry-after': + val = str(v).strip() + if val.isdigit(): + retry_after = int(val) + else: + try: + dt = parsedate_to_datetime(val) + retry_after = max(0, int((dt.timestamp() - time.time()))) + except Exception: + retry_after = None + break + except Exception: + retry_after = None + error_msg = f"HTTP {status}: {str(error)}" + if status in (429, 503) and retry_after is not None: + error_msg += f" 
retry_after={retry_after}" + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 + + async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore], + rate_limiter: RateLimiter, retry_count: int = 0, + filename_base: Optional[str] = None, + referer: Optional[str] = None) -> Tuple[bool, str, str, str, int]: + """ + Download a file from a URL + + Args: + row_index: Index in the dataframe + url: URL to download + semaphore: Semaphore for concurrency control + rate_limiter: Rate limiter for API limits + retry_count: Current retry count + Returns: + Tuple[bool, str, str, str, int]: (success, filename, file_ext, error_message, retry_count) + """ + if not url or pd.isna(url): + return False, "", "", "Empty URL", retry_count + + url = self._normalize_request_url(url) + _, route_options = self._resolve_route(url) + user_agent = next(self.user_agents) + headers = self._build_request_headers(url, user_agent, referer) + cookies = self._resolve_request_cookies(url, route_options=route_options) if semaphore: await semaphore.acquire() try: - # Apply rate limiting await rate_limiter.acquire() - - # Implement exponential backoff - sleep_time = self.sleep * (2 ** retry_count) + base_sleep = float(self._route_setting(route_options, "sleep", self.sleep)) + sleep_time = base_sleep * (2 ** retry_count) await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) - - # Set up timeout with exponential backoff - timeout = aiohttp.ClientTimeout( - total=min(self.request_timeout * (1.5 ** retry_count), 180), # Cap at 3 minutes - connect=min(30 * (1.2 ** retry_count), 60), # Cap connect timeout at 1 minute - sock_connect=min(30 * (1.2 ** retry_count), 60), # Cap socket connect at 1 minute - sock_read=min(60 * (1.2 ** retry_count), 120) # Cap socket read at 2 minutes + preflight = await self._preflight_download( + row_index=row_index, + url=url, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - + 
if preflight is not None: + return preflight + timeout = self._build_request_timeout(retry_count, route_options=route_options) + try: - # Prepare optional SSL connector - connector = None - # Domain-specific insecure override (discovered via ping) - url_base = self._extract_base_domain(url) - _force_insecure = url_base in getattr(self, '_domains_ssl_insecure', set()) - if (not self.ssl_verify) or _force_insecure: - connector = aiohttp.TCPConnector(ssl=False) - elif self.ssl_cafile: - import ssl as _ssl - ctx = _ssl.create_default_context(cafile=self.ssl_cafile) - connector = aiohttp.TCPConnector(ssl=ctx) - # Create a new session for each download to avoid cookie contamination + connector = self._build_session_connector(url, route_options=route_options) async with aiohttp.ClientSession(cookies=cookies, connector=connector) as session: try: - # Try to access the base domain first to establish cookies - headers = await self.setup_session(session, url, headers) - - # Set a shorter timeout for the initial connection attempt - base_timeout = aiohttp.ClientTimeout(total=10) - try: - # Visit the base domain to establish cookies if needed - base_domain = urlparse(url).netloc - if any(domain in base_domain for domain in self.domain_cookies.keys()): - base_url = f"https://{base_domain}" - async with session.get(base_url, headers=headers, timeout=base_timeout): - pass - except Exception as e: - # Non-fatal error, just log and continue - self.logger.debug(f"Initial base URL visit failed: {str(e)}") - pass - - # Choose request method and perform streaming for GET - requester = self.request_method.lower() + headers = await self._bootstrap_download_session( + session, + url, + headers, + route_options=route_options, + ) + requester = str(self._route_setting(route_options, "request_method", self.request_method)).lower() try: self.verbose_log(f"Attempting download request to URL: {url}") @@ -889,112 +1276,30 @@ async def download_file(self, row_index: int, url: str, semaphore: 
Optional[asyn self.verbose_log(f"Headers: {headers}") if requester == 'get': - # Streaming GET with retries - from tenacity import AsyncRetrying - head = bytearray() - resp_headers = {} - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max(1, int(self.max_retries))), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=(retry_if_exception_type(aiohttp.ClientError) | - retry_if_exception_type(asyncio.TimeoutError)), - before_sleep=before_sleep_log(logging.getLogger(__name__), logging.INFO), - reraise=True, - ): - with attempt: - async with session.get(url, headers=headers, timeout=timeout) as response: - response.raise_for_status() - resp_headers = dict(response.headers or {}) - # Write to a temp file first - tmp_path = Path(self.downloads_dir) / f".part_{row_index}" - async with aiofiles.open(tmp_path, 'wb') as f: - async for chunk in response.content.iter_chunked(1 << 16): - if chunk: - if len(head) < (1 << 16): - need = (1 << 16) - len(head) - head.extend(chunk[:need]) - await f.write(chunk) - # Infer extension using URL, headers and first bytes - file_ext = self.infer_file_extension(url, resp_headers, bytes(head)) - if not self.is_supported_format(file_ext): - # Clean up temp and report - try: - os.remove(tmp_path) - except Exception: - pass - self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - # Decide final filename - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - final_path = Path(self.downloads_dir) / filename - try: - os.replace(tmp_path, final_path) - except Exception: - # Fallback to copy/rename - try: - os.rename(tmp_path, final_path) - except Exception: - pass - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count - else: - # Fallback to non-streaming POST - content, status, resp_headers = await self.make_request( - session, requester, url, headers, timeout + return await self._download_via_streaming_get( + session=session, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, ) - file_ext = self.infer_file_extension(url, resp_headers, content) - if not self.is_supported_format(file_ext): - self.logger.warning(f"Unsupported file format after inference: {file_ext}. 
Supported formats: {', '.join(self.supported_formats)}") - return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count - if filename_base and str(filename_base).strip(): - filename = f"{filename_base}.{file_ext}" - else: - filename = self.generate_filename(row_index, file_ext) - await self.write_file(filename, content, self.downloads_dir) - self.logger.info(f"Successfully downloaded {filename} from {url}") - return True, filename, file_ext, "", retry_count + return await self._download_via_buffered_request( + session=session, + requester=requester, + row_index=row_index, + url=url, + headers=headers, + timeout=timeout, + retry_count=retry_count, + filename_base=filename_base, + referer=referer, + ) except aiohttp.ClientResponseError as e: - # Handle HTTP errors - status = e.status - self.logger.warning(f"Received {status} for {url}") - - # Detailed verbose logging for HTTP errors - if self.verbose: - self.logger.debug(f"HTTP Error Details - Status: {e.status}, Message: {e.message}") - self.logger.debug(f"Headers: {e.headers if hasattr(e, 'headers') else 'No headers available'}") - self.logger.debug(f"Request info: {e.request_info if hasattr(e, 'request_info') else 'No request info available'}") - - # Build error with optional Retry-After info - retry_after = None - try: - hdrs = dict(getattr(e, 'headers', {}) or {}) - for k, v in hdrs.items(): - if k.lower() == 'retry-after': - val = str(v).strip() - if val.isdigit(): - retry_after = int(val) - else: - try: - dt = parsedate_to_datetime(val) - retry_after = max(0, int((dt.timestamp() - time.time()))) - except Exception: - retry_after = None - break - except Exception: - retry_after = None - error_msg = f"HTTP {status}: {str(e)}" - if status in (429, 503) and retry_after is not None: - error_msg += f" retry_after={retry_after}" - # Best-effort ext from URL if possible - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, 
error_msg, retry_count + 1 + return self._build_http_error_result(url, e, retry_count) except Exception as e: error_msg = str(e) @@ -1007,11 +1312,7 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn import traceback self.logger.debug(f"Traceback: {traceback.format_exc()}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Overall timeout exceeded for {url}") @@ -1023,22 +1324,14 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn except aiohttp.ClientError as e: error_msg = str(e) self.logger.error(f"ClientError while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 except asyncio.TimeoutError: self.logger.error(f"Timeout while downloading {url}") return False, "", "", "Timeout", retry_count + 1 except Exception as e: error_msg = str(e) self.logger.error(f"Error while downloading {url}: {error_msg}") - try: - url_ext = self.get_file_extension_from_url(url) - except Exception: - url_ext = "" - return False, "", url_ext, error_msg, retry_count + 1 + return False, "", self._best_effort_url_extension(url), error_msg, retry_count + 1 finally: if semaphore: try: @@ -1137,6 +1430,8 @@ def _write_checkpoint() -> None: for i, row_idx in enumerate(batch_indices): url = df.loc[row_idx, self.url_column] retry_count = df.loc[row_idx, 'download_retry_count'] + _, route_options = self._resolve_route(url) + _, _, _, skip_after = self._resolve_domain_scheduler_settings(route_options) # Optional per-row referer (e.g., external_link page) ref_val = None if self.referer_column 
and self.referer_column in df.columns: @@ -1156,7 +1451,7 @@ def _write_checkpoint() -> None: pass # Skip URLs that have failed too many times - if retry_count >= self.skip_failed_after: + if retry_count >= skip_after: self.logger.info(f"Skipping URL at row {row_idx} - too many failures: {retry_count}") continue @@ -1367,6 +1662,7 @@ def _write_checkpoint() -> None: domains: Dict[str, GlossDownloader._DomainState] = {} for idx in row_indices: url = df.at[idx, self.url_column] + _, route_options = self._resolve_route(url) # Determine grouping key if self.scheduler_group_by and self.scheduler_group_by != 'base_domain': key = str(df.at[idx, self.scheduler_group_by]) if self.scheduler_group_by in df.columns else '' @@ -1377,9 +1673,14 @@ def _write_checkpoint() -> None: if not key: key = '' if key not in domains: - # Each group starts with up to per_domain_concurrency, but not exceeding global - start_c = min(self.per_domain_concurrency, max(1, self.concurrency)) - domains[key] = GlossDownloader._DomainState(base=key, concurrency=start_c) + floor_c, ceiling_c, start_c, skip_after = self._resolve_domain_scheduler_settings(route_options) + domains[key] = GlossDownloader._DomainState( + base=key, + concurrency=start_c, + concurrency_floor=floor_c, + concurrency_ceiling=ceiling_c, + skip_failed_after=skip_after, + ) domains[key].queue.append(idx) if not domains: @@ -1638,7 +1939,7 @@ def estimate_eta_s(state: GlossDownloader._DomainState) -> float: if remaining <= 0: return 0.0 avg = state.avg_duration() or 5.0 # default initial guess - eff_c = max(self.domain_concurrency_floor, min(state.concurrency, self.domain_concurrency_ceiling)) + eff_c = max(state.concurrency_floor, min(state.concurrency, state.concurrency_ceiling)) # ETA ≈ remaining * avg / eff_c (assuming steady parallelism) return float(remaining) * avg / max(1, eff_c) @@ -1722,7 +2023,7 @@ async def dispatch_ready(): if pending_domains: active_order.append(pending_domains.popleft()) continue - state.concurrency 
= max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Unparked domain: {dom}; resuming at concurrency={state.concurrency}") # Attempt to launch up to (state.concurrency - state.active) while ( @@ -1734,7 +2035,7 @@ async def dispatch_ready(): url = df.at[row_idx, self.url_column] retry_count = int(df.at[row_idx, 'download_retry_count']) if 'download_retry_count' in df.columns else 0 # Skip rows with too many failures - if retry_count >= self.skip_failed_after: + if retry_count >= state.skip_failed_after: continue # Launch task t0 = time.time() @@ -1916,7 +2217,7 @@ async def dispatch_ready(): # Dynamic tuning: ease if overloaded if self.dynamic_tuning and should_ease(state): - if state.concurrency > self.domain_concurrency_floor: + if state.concurrency > state.concurrency_floor: state.concurrency -= 1 self.logger.info(f"Easing concurrency for {dom} -> {state.concurrency}") @@ -1936,14 +2237,14 @@ async def dispatch_ready(): if retry_after is None: retry_after = max(1, int(self.ping_recheck_seconds)) state.parked_until = now2 + retry_after - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) self.progress_logger.info(f"[park] Rate limited: {dom}; parked for {retry_after}s") # Timeout streak -> exponential backoff elif state.timeout_streak >= int(getattr(self, 'timeout_streak_threshold', 5)): backoff = min(float(getattr(self, 'backoff_min_s', 60.0)) * (2 ** max(0, state.ping_failures)), float(getattr(self, 'backoff_max_s', 900.0))) state.ping_failures += 1 state.parked_until = now2 + backoff - state.concurrency = max(self.domain_concurrency_floor, 1) + state.concurrency = max(state.concurrency_floor, 1) state.timeout_streak = 0 self.progress_logger.info(f"[park] Timeout streak: {dom}; parked for {int(backoff)}s (level={state.ping_failures})") else: @@ -1965,7 +2266,7 @@ async def dispatch_ready(): state.eta_exceeded_count += 1 if 
state.eta_exceeded_count == 1: # Try to increase concurrency gently to improve ETA, up to ceiling - if state.concurrency < self.domain_concurrency_ceiling: + if state.concurrency < state.concurrency_ceiling: state.concurrency += 1 self.logger.info( f"ETA high for {dom} ({int(eta_s)}s). Bumping concurrency -> {state.concurrency}" diff --git a/src/glossapi/gloss_extract.py b/src/glossapi/gloss_extract.py index 4a2477c..1c21cf1 100644 --- a/src/glossapi/gloss_extract.py +++ b/src/glossapi/gloss_extract.py @@ -10,7 +10,6 @@ AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, - RapidOcrOptions, LayoutOptions, TableStructureOptions, TableFormerMode, @@ -47,9 +46,9 @@ def _maybe_import_torch(*, force: bool = False): MarkdownFormatOption = None CsvFormatOption = None StandardPdfPipeline = None -DoclingParseV2DocumentBackend = None DoclingParseDocumentBackend = None PyPdfiumDocumentBackend = None +_DOCLING_PARSE_BACKEND_NAME = "docling_parse" class _NoOpOption: # minimal stand-ins for optional helpers @@ -84,19 +83,23 @@ def _ensure_docling_converter_loaded() -> None: def _ensure_docling_pipeline_loaded() -> None: global _DOC_PIPELINE_LOADED, StandardPdfPipeline - global DoclingParseV2DocumentBackend, DoclingParseDocumentBackend, PyPdfiumDocumentBackend + global DoclingParseDocumentBackend, PyPdfiumDocumentBackend, _DOCLING_PARSE_BACKEND_NAME if _DOC_PIPELINE_LOADED: return try: StandardPdfPipeline = importlib.import_module( "docling.pipeline.standard_pdf_pipeline" ).StandardPdfPipeline - DoclingParseV2DocumentBackend = importlib.import_module( - "docling.backend.docling_parse_v2_backend" - ).DoclingParseV2DocumentBackend - DoclingParseDocumentBackend = importlib.import_module( - "docling.backend.docling_parse_backend" - ).DoclingParseDocumentBackend + try: + DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_backend" + ).DoclingParseDocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse" + except Exception: + 
DoclingParseDocumentBackend = importlib.import_module( + "docling.backend.docling_parse_v2_backend" + ).DoclingParseV2DocumentBackend + _DOCLING_PARSE_BACKEND_NAME = "docling_parse_v2" PyPdfiumDocumentBackend = importlib.import_module( "docling.backend.pypdfium2_backend" ).PyPdfiumDocumentBackend @@ -106,11 +109,8 @@ def _ensure_docling_pipeline_loaded() -> None: from docling.pipeline.simple_pipeline import SimplePipeline -# Ensure RapidOCR plugin is registered for factory-based OCR construction -import docling.models.rapid_ocr_model # noqa: F401 -from .ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from .ocr.rapidocr.pool import GLOBAL_RAPID_OCR_POOL import inspect +from .ocr.docling_pipeline import build_layout_pipeline import ftfy import logging @@ -328,7 +328,7 @@ def _apply_thread_caps(self) -> None: self._thread_caps_applied = True def release_resources(self) -> None: - """Release Docling converters, pooled RapidOCR engines, and GPU caches.""" + """Release Docling converters and GPU caches.""" try: self.converter = None except Exception: @@ -343,10 +343,6 @@ def release_resources(self) -> None: setattr(self, attr, None) except Exception: pass - try: - GLOBAL_RAPID_OCR_POOL.clear() - except Exception: - pass torch_mod = _maybe_import_torch() if torch_mod is not None and getattr(torch_mod, "cuda", None): try: @@ -390,7 +386,7 @@ def _convert_all_with_timeout(self, files: Iterable[Path], timeout_s: int, **kwa timeout_kw = None backend_cls = getattr(self, "_active_pdf_backend", None) - is_native_backend = backend_cls is DoclingParseV2DocumentBackend if backend_cls else False + is_native_backend = backend_cls is DoclingParseDocumentBackend if backend_cls else False if timeout_kw and not is_native_backend and len(set(budgets)) == 1: kw = dict(raises_on_error=False) @@ -553,12 +549,7 @@ def create_extractor( ocr_langs: list[str] | None = None, profile_timings: bool = True, ): - """Create a document converter with configured options using the canonical 
builder. - - Delegates PDF pipeline construction to `glossapi.ocr.rapidocr.pipeline.build_rapidocr_pipeline` - to avoid duplicated provider checks and option wiring. Falls back to the legacy - inline path if the canonical builder is unavailable. - """ + """Create a Docling document converter for Phase-1 extraction.""" _ensure_docling_converter_loaded() _ensure_docling_pipeline_loaded() # Enable/disable Docling pipeline timings collection (for benchmarks) @@ -569,176 +560,88 @@ def create_extractor( pass # Record the PDF backend name for provenance (default to native backend) - self.pdf_backend_name = "docling_parse_v2" - self._active_pdf_backend = DoclingParseV2DocumentBackend + self.pdf_backend_name = _DOCLING_PARSE_BACKEND_NAME + self._active_pdf_backend = DoclingParseDocumentBackend # Best-effort Torch preflight only if Phase‑1 is asked to do enrichment try: - if formula_enrichment: + if formula_enrichment or code_enrichment: torch_mod = _maybe_import_torch(force=True) if torch_mod is None: - raise RuntimeError("Torch not available but formula enrichment requested.") + raise RuntimeError("Torch not available but Docling GPU enrichment was requested.") if hasattr(torch_mod, "cuda") and isinstance(getattr(self, "pipeline_options", None), PdfPipelineOptions): dev = getattr(self.pipeline_options, "accelerator_options", None) dv = getattr(dev, "device", None) if (isinstance(dv, str) and dv.lower().startswith("cuda")) and not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") + raise RuntimeError("Torch CUDA not available but Docling GPU enrichment was requested.") except Exception as e: raise RuntimeError(f"Torch CUDA preflight failed: {e}") - # Build PDF pipeline via the canonical builder (preferred) - opts = None - active_backend = DoclingParseV2DocumentBackend - try: - from .ocr.rapidocr.pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - except Exception: # pragma: no cover 
- adapter fallback - from ._pipeline import build_layout_pipeline, build_rapidocr_pipeline # type: ignore - - device_str = self._current_device_str() or "cuda:0" - builder = build_rapidocr_pipeline if enable_ocr else build_layout_pipeline - - try: - _, opts = builder( - device=device_str, - images_scale=float(images_scale), - formula_enrichment=bool(formula_enrichment), - code_enrichment=bool(code_enrichment), - **({"text_score": float(text_score)} if enable_ocr else {}), - ) - - if enable_ocr and hasattr(opts, "ocr_options") and getattr(opts, "ocr_options", None) is not None: - if use_cls is not None: - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - if ocr_langs: - setattr(opts.ocr_options, "lang", list(ocr_langs)) # type: ignore[attr-defined] - if force_full_page_ocr is not None: - setattr(opts.ocr_options, "force_full_page_ocr", bool(force_full_page_ocr)) # type: ignore[attr-defined] - + if enable_ocr: try: - setattr(opts, "images_scale", float(images_scale)) + self._log.warning( + "Docling Phase-1 OCR is no longer supported. " + "Ignoring enable_ocr/force_full_page_ocr; use Corpus.ocr(backend='deepseek') instead." 
+ ) except Exception: pass - self._active_pdf_options = opts - self._current_ocr_enabled = bool(enable_ocr) - - # Create a multi-format DocumentConverter using the built PDF options - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - if opts is None: - opts = self.pipeline_options - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=opts, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), - InputFormat.XML_JATS: XMLJatsFormatOption(), - InputFormat.HTML: HTMLFormatOption(), - InputFormat.PPTX: PowerpointFormatOption(), - InputFormat.CSV: CsvFormatOption(), - InputFormat.MD: MarkdownFormatOption(), - }, - ) - self._active_pdf_backend = active_backend + active_backend = DoclingParseDocumentBackend + device_str = self._current_device_str() or "cuda:0" + _, opts = build_layout_pipeline( + device=device_str, + images_scale=float(images_scale), + formula_enrichment=bool(formula_enrichment), + code_enrichment=bool(code_enrichment), + ) + try: + opts.do_ocr = False + setattr(opts, "images_scale", float(images_scale)) except Exception: - # Fallback to legacy inline configuration path - if enable_ocr: - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError( - "RapidOCR ONNX models/keys not found. Ensure models exist under glossapi.models/rapidocr or set GLOSSAPI_RAPIDOCR_ONNX_DIR." 
- ) - langs = ocr_langs or ["el", "en"] - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=langs, - force_full_page_ocr=bool(force_full_page_ocr), - use_det=True, - use_cls=bool(use_cls), - use_rec=True, - text_score=float(text_score), - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - self.pipeline_options.ocr_options = ocr_opts - # Attach core toggles to existing pipeline_options - try: - self.pipeline_options.do_ocr = bool(enable_ocr) - self.pipeline_options.do_formula_enrichment = bool(formula_enrichment) - self.pipeline_options.do_code_enrichment = bool(code_enrichment) - try: - setattr(self.pipeline_options, "images_scale", float(images_scale)) - except Exception: - pass - except Exception: - pass - if not enable_ocr: - try: - setattr(self.pipeline_options, "ocr_options", None) - except Exception: - pass + pass - pdf_backend = DoclingParseV2DocumentBackend - if not enable_ocr: - try: - if getattr(self, "use_pypdfium_backend", False): - pdf_backend = PyPdfiumDocumentBackend - self.pdf_backend_name = "pypdfium" - except Exception: - pdf_backend = DoclingParseV2DocumentBackend - - active_backend = pdf_backend - - self.converter = DocumentConverter( - allowed_formats=[ - InputFormat.PDF, - InputFormat.DOCX, - InputFormat.XML_JATS, - InputFormat.HTML, - InputFormat.PPTX, - InputFormat.CSV, - InputFormat.MD, - ], - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=self.pipeline_options, - pipeline_cls=StandardPdfPipeline, - backend=active_backend, - ), - }, - ) + self._active_pdf_options = opts + self._current_ocr_enabled = False - self._active_pdf_options = self.pipeline_options - self._current_ocr_enabled = bool(enable_ocr) - self._active_pdf_backend = active_backend + pdf_backend = DoclingParseDocumentBackend + try: + if getattr(self, "use_pypdfium_backend", False): + pdf_backend = PyPdfiumDocumentBackend + self.pdf_backend_name = "pypdfium" + 
except Exception: + pdf_backend = DoclingParseDocumentBackend + active_backend = pdf_backend + + self.converter = DocumentConverter( + allowed_formats=[ + InputFormat.PDF, + InputFormat.DOCX, + InputFormat.XML_JATS, + InputFormat.HTML, + InputFormat.PPTX, + InputFormat.CSV, + InputFormat.MD, + ], + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=opts, + pipeline_cls=StandardPdfPipeline, + backend=active_backend, + ), + InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), + InputFormat.XML_JATS: XMLJatsFormatOption(), + InputFormat.HTML: HTMLFormatOption(), + InputFormat.PPTX: PowerpointFormatOption(), + InputFormat.CSV: CsvFormatOption(), + InputFormat.MD: MarkdownFormatOption(), + }, + ) + self._active_pdf_backend = active_backend # Record last configuration for reuse try: self._last_extractor_cfg = self._cfg_signature( - enable_ocr=enable_ocr, + enable_ocr=False, force_full_page_ocr=force_full_page_ocr, text_score=text_score, images_scale=images_scale, @@ -914,6 +817,17 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: except Exception as e: self._log.error(f"Failed to write chunk manifest for {file_path.name}: {e}") + # Always attempt to assemble whatever chunks succeeded (best-effort) + out_md_path = output_dir / f"{stem}.md" + final_md_written = False + if all_segments: + try: + final_md = "\n\n".join(all_segments) + out_md_path.write_text(final_md, encoding="utf-8") + final_md_written = True + except Exception as e: + self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not completed: # Record failure/timeout provenance in parquet try: @@ -928,6 +842,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: chunk_size=self.chunk_size, chunk_count=len(manifest.get("entries", [])), chunk_manifest_path=manifest_path, + no_partial_output=not final_md_written, ) except Exception as e: self._log.warning(f"Failed to record chunked 
extraction metadata for {file_path.name}: {e}") @@ -939,14 +854,7 @@ def _process_file_chunked(self, file_path: Path, output_dir: Path, timeout_dir: self._log.error(f"Failed to copy timeout/failed file {file_path.name}: {e}") return False - # Assemble final markdown - try: - final_md = "\n\n".join(all_segments) - out_md_path = output_dir / f"{stem}.md" - with out_md_path.open("w", encoding="utf-8") as fp: - fp.write(final_md) - except Exception as e: - self._log.error(f"Failed to assemble final markdown for {file_path.name}: {e}") + if not final_md_written: return False # Record success provenance in parquet try: @@ -1294,7 +1202,7 @@ def _update_extraction_metadata( if chunk_manifest_path is not None: data["chunk_manifest_path"] = str(chunk_manifest_path) # Backend and failure - backend_name = getattr(self, "pdf_backend_name", None) or ("docling_parse_v2" if getattr(self, "USE_V2", True) else "docling_parse") + backend_name = getattr(self, "pdf_backend_name", None) or _DOCLING_PARSE_BACKEND_NAME data["extraction_backend"] = backend_name if status in ("timeout", "error", "failure"): data["failure_mode"] = status diff --git a/src/glossapi/ocr/__init__.py b/src/glossapi/ocr/__init__.py index bb167c4..df79456 100644 --- a/src/glossapi/ocr/__init__.py +++ b/src/glossapi/ocr/__init__.py @@ -1,7 +1,7 @@ """Lightweight OCR backend package. Exports minimal, import-safe helpers for OCR backends. Heavy -dependencies (vLLM, transformers, PyMuPDF) are imported lazily +dependencies (transformers, PyMuPDF) are imported lazily inside the specific backend functions so importing this package does not require GPU stacks or model weights. 
""" @@ -12,17 +12,14 @@ __all__ = [ "deepseek", - "rapidocr", "math", "utils", "deepseek_runner", - "rapidocr_dispatch", ] -_SUBPACKAGES = {"deepseek", "rapidocr", "math", "utils"} +_SUBPACKAGES = {"deepseek", "math", "utils"} _ALIASES = { "deepseek_runner": "glossapi.ocr.deepseek.runner", - "rapidocr_dispatch": "glossapi.ocr.rapidocr.dispatch", } diff --git a/src/glossapi/ocr/deepseek/__init__.py b/src/glossapi/ocr/deepseek/__init__.py index 5326c42..a5fb1ca 100644 --- a/src/glossapi/ocr/deepseek/__init__.py +++ b/src/glossapi/ocr/deepseek/__init__.py @@ -1,4 +1,4 @@ -"""DeepSeek OCR backend with a lightweight stub fallback.""" +"""DeepSeek OCR backend.""" from .runner import run_for_files from . import preflight diff --git a/src/glossapi/ocr/deepseek/defaults.py b/src/glossapi/ocr/deepseek/defaults.py new file mode 100644 index 0000000..c36309c --- /dev/null +++ b/src/glossapi/ocr/deepseek/defaults.py @@ -0,0 +1,27 @@ +"""Canonical DeepSeek OCR defaults shared across orchestration and CLIs.""" + +from __future__ import annotations + +from typing import Optional + +DEFAULT_RUNTIME_BACKEND = "transformers" +DEFAULT_OCR_PROFILE = "markdown_grounded" +DEFAULT_ATTN_BACKEND = "auto" +DEFAULT_RENDER_DPI = 144 +DEFAULT_MAX_NEW_TOKENS = 2048 +DEFAULT_GPU_MEMORY_UTILIZATION = 0.9 +DEFAULT_REPAIR_MODE = "auto" +DEFAULT_WORKERS_PER_GPU = 1 +DEFAULT_TARGET_BATCH_PAGES = 160 + + +def resolve_render_dpi(value: Optional[int]) -> int: + """Return the canonical render DPI, even when callers pass ``None``.""" + + return DEFAULT_RENDER_DPI if value is None else int(value) + + +def resolve_gpu_memory_utilization(value: Optional[float]) -> float: + """Return the canonical vLLM memory target, even when callers pass ``None``.""" + + return DEFAULT_GPU_MEMORY_UTILIZATION if value is None else float(value) diff --git a/src/glossapi/ocr/deepseek/preflight.py b/src/glossapi/ocr/deepseek/preflight.py index 76810e6..b8638b1 100644 --- a/src/glossapi/ocr/deepseek/preflight.py +++ 
b/src/glossapi/ocr/deepseek/preflight.py @@ -1,17 +1,17 @@ -"""Preflight checks for the DeepSeek OCR CLI environment.""" +"""Preflight checks for the DeepSeek OCR environment.""" from __future__ import annotations import dataclasses import os -import shutil -import sys from pathlib import Path from typing import Dict, Iterable, List, Optional -DEFAULT_SCRIPT = Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" -DEFAULT_MODEL_DIR = Path.cwd() / "deepseek-ocr" / "DeepSeek-OCR" -DEFAULT_LIB_DIR = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" +from .runtime_paths import resolve_deepseek_python + +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_MODEL_DIR = REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2" @dataclasses.dataclass(frozen=True) @@ -46,9 +46,6 @@ def summarize(self) -> str: def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[Path]: - if not path: - errors.append(CheckResult(label, False, "Not provided")) - return None if not path.exists(): errors.append(CheckResult(label, False, f"Missing at {path}")) return None @@ -58,38 +55,41 @@ def _ensure_path(path: Path, label: str, errors: List[CheckResult]) -> Optional[ def check_deepseek_env( env: Optional[Dict[str, str]] = None, *, - check_flashinfer: bool = True, + check_torch: bool = True, ) -> PreflightReport: - """Validate DeepSeek CLI prerequisites without running the model.""" + """Validate DeepSeek OCR prerequisites without running the model.""" env = dict(env or os.environ) errors: List[CheckResult] = [] warnings: List[CheckResult] = [] infos: List[CheckResult] = [] - allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" - allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" + allow_cli = env.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") == "1" + allow_stub = env.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1" if not allow_cli: - 
warnings.append( + errors.append( CheckResult( "allow_cli", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1 to force the real CLI.", + "DeepSeek OCR requires the real CLI/runtime. Set GLOSSAPI_DEEPSEEK_ALLOW_CLI=1.", ) ) if allow_stub: - warnings.append( + errors.append( CheckResult( "allow_stub", False, - "Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0 to fail instead of falling back to stub output.", + "Stub execution is no longer supported. Set GLOSSAPI_DEEPSEEK_ALLOW_STUB=0.", ) ) - script = Path(env.get("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT") or DEFAULT_SCRIPT) - _ensure_path(script, "vllm_script", errors) + script = Path( + env.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT") + or DEFAULT_SCRIPT + ) + _ensure_path(script, "runner_script", errors) - python_bin = Path(env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON") or sys.executable) + python_bin = resolve_deepseek_python(env=env) _ensure_path(python_bin, "deepseek_python", errors) model_dir = Path( @@ -99,7 +99,7 @@ def check_deepseek_env( ) model_dir = _ensure_path(model_dir, "model_dir", errors) if model_dir: - has_weights = any(model_dir.glob("*.safetensors")) or (model_dir / "model-00001-of-000001.safetensors").exists() + has_weights = any(model_dir.glob("*.safetensors")) has_config = (model_dir / "config.json").exists() if not has_weights or not has_config: errors.append( @@ -110,34 +110,21 @@ def check_deepseek_env( ) ) - ld_path_env = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - lib_dir = Path(ld_path_env) if ld_path_env else DEFAULT_LIB_DIR - _ensure_path(lib_dir, "ld_library_path", errors) - - cc1plus_path = shutil.which("cc1plus", path=env.get("PATH", "")) - if not cc1plus_path: - errors.append( - CheckResult( - "cc1plus", - False, - "C++ toolchain missing (cc1plus not on PATH); install g++ and ensure PATH includes gcc's cc1plus.", - ) - ) - else: - infos.append(CheckResult("cc1plus", True, f"Found at {cc1plus_path}")) - - if check_flashinfer: + if check_torch: try: - import flashinfer # type: ignore + import torch # type: ignore - 
infos.append(CheckResult("flashinfer", True, f"flashinfer {flashinfer.__version__} import ok")) + infos.append(CheckResult("torch", True, f"torch {torch.__version__} import ok")) + if not torch.cuda.is_available(): + warnings.append(CheckResult("cuda", False, "Torch CUDA is not available.")) except Exception as exc: # pragma: no cover - depends on env - errors.append(CheckResult("flashinfer", False, f"flashinfer import failed: {exc}")) + errors.append(CheckResult("torch", False, f"torch import failed: {exc}")) return PreflightReport(errors=errors, warnings=warnings, infos=infos) def main(argv: Optional[Iterable[str]] = None) -> int: + del argv report = check_deepseek_env() summary = report.summarize() if summary: diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py new file mode 100644 index 0000000..9b318e1 --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_transformers.py @@ -0,0 +1,591 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files.""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +import tempfile +import time +from pathlib import Path +from typing import Iterable, Iterator, List + +from PIL import Image + +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.utils.cleaning import ( # noqa: E402 + apply_early_stop, + canonicalize_markdown, + clean_output, + strip_prompt_echo, +) + +LOGGER = logging.getLogger(__name__) +PROMPT_GROUNDED_MARKDOWN = "\n<|grounding|>Convert the document to markdown. " +PROMPT_PLAIN_OCR = "\nExtract the text from the document page in reading order." 
+PAGE_SPLIT = "\n<--- Page Split --->\n" +PAGE_SPLIT_RE = re.compile(r"(?:^|\n)(?:\n)?<--- Page Split --->\n?") +DEFAULT_MAX_NEW_TOKENS = 2048 + + +def _profile_defaults(profile: str) -> dict: + profile_norm = str(profile or "markdown_grounded").strip().lower() + if profile_norm == "plain_ocr": + return { + "prompt": PROMPT_PLAIN_OCR, + "base_size": 768, + "image_size": 512, + "crop_mode": True, + } + return { + "prompt": PROMPT_GROUNDED_MARKDOWN, + "base_size": 1024, + "image_size": 768, + "crop_mode": True, + } + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--model-dir", required=True) + parser.add_argument("--files", nargs="*", default=[]) + parser.add_argument("--page-ranges", nargs="*", default=[]) + parser.add_argument("--max-pages", type=int, default=None) + parser.add_argument("--device", default="cuda") + parser.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + parser.add_argument("--prompt-override", default=None) + parser.add_argument("--attn-backend", default="auto", choices=["auto", "flash_attention_2", "sdpa", "eager"]) + parser.add_argument("--base-size", type=int, default=None) + parser.add_argument("--image-size", type=int, default=None) + parser.add_argument("--render-dpi", type=int, default=144) + parser.add_argument("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS) + parser.add_argument("--repetition-penalty", type=float, default=None) + parser.add_argument("--no-repeat-ngram-size", type=int, default=None) + parser.add_argument("--crop-mode", dest="crop_mode", action="store_true") + parser.add_argument("--no-crop-mode", dest="crop_mode", action="store_false") + parser.set_defaults(crop_mode=None) + parser.add_argument("--content-debug", action="store_true") + return parser.parse_args() + + +def 
_parse_page_range_spec(input_dir: Path, spec: str) -> dict: + try: + name, start_raw, end_raw = str(spec).rsplit(":", 2) + except ValueError as exc: + raise ValueError(f"Invalid page range spec: {spec}") from exc + start_page = int(start_raw) + end_page = int(end_raw) + if start_page <= 0 or end_page < start_page: + raise ValueError(f"Invalid page range bounds: {spec}") + pdf_path = (input_dir / name).resolve() + return { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": start_page, + "end_page": end_page, + "stem": f"{pdf_path.stem}__p{start_page:05d}-{end_page:05d}", + } + + +def _iter_pdf_jobs(input_dir: Path, files: List[str], page_ranges: List[str]) -> List[dict]: + jobs: List[dict] = [] + if files: + for name in files: + pdf_path = (input_dir / name).resolve() + jobs.append( + { + "pdf_path": pdf_path, + "source_name": str(name), + "source_stem": pdf_path.stem, + "start_page": 1, + "end_page": None, + "stem": pdf_path.stem, + } + ) + if page_ranges: + jobs.extend(_parse_page_range_spec(input_dir, spec) for spec in page_ranges) + if jobs: + return jobs + return [ + { + "pdf_path": path.resolve(), + "source_name": path.name, + "source_stem": path.stem, + "start_page": 1, + "end_page": None, + "stem": path.stem, + } + for path in sorted(input_dir.glob("*.pdf")) + ] + + +def _resolve_render_window( + *, + doc_page_count: int, + max_pages: int | None, + start_page: int = 1, + end_page: int | None = None, +) -> tuple[int, int] | None: + first_idx = max(0, int(start_page) - 1) + last_idx = int(doc_page_count) - 1 if end_page is None else min(int(doc_page_count) - 1, int(end_page) - 1) + if max_pages is not None: + last_idx = min(last_idx, first_idx + int(max_pages) - 1) + if last_idx < first_idx: + return None + return first_idx, last_idx + + +def _count_rendered_pages( + pdf_path: Path, + max_pages: int | None, + *, + start_page: int = 1, + end_page: int | None = None, +) -> int: + import fitz + + doc = 
fitz.open(pdf_path) + try: + window = _resolve_render_window( + doc_page_count=int(doc.page_count), + max_pages=max_pages, + start_page=start_page, + end_page=end_page, + ) + if window is None: + return 0 + first_idx, last_idx = window + return max(0, int(last_idx) - int(first_idx) + 1) + finally: + doc.close() + + +def _iter_rendered_pages( + pdf_path: Path, + max_pages: int | None, + render_dpi: int, + *, + start_page: int = 1, + end_page: int | None = None, +) -> Iterator[Image.Image]: + import fitz + + doc = fitz.open(pdf_path) + try: + window = _resolve_render_window( + doc_page_count=int(doc.page_count), + max_pages=max_pages, + start_page=start_page, + end_page=end_page, + ) + if window is None: + return + first_idx, last_idx = window + zoom = float(render_dpi) / 72.0 + matrix = fitz.Matrix(zoom, zoom) + for idx in range(first_idx, last_idx + 1): + page = doc[idx] + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + yield img + finally: + doc.close() + + +def _render_pages( + pdf_path: Path, + max_pages: int | None, + render_dpi: int, + *, + start_page: int = 1, + end_page: int | None = None, +) -> List[Image.Image]: + return list( + _iter_rendered_pages( + pdf_path, + max_pages, + render_dpi, + start_page=start_page, + end_page=end_page, + ) + ) + + +def _clean_markdown(text: str) -> str: + text = (text or "").replace("<|end▁of▁sentence|>", "").strip() + pattern = re.compile(r"(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)", re.DOTALL) + matches = pattern.findall(text) + for full_match, label, _coords in matches: + if label == "image": + text = text.replace(full_match, "") + else: + text = text.replace(full_match, "") + return text.replace("\\coloneqq", ":=").replace("\\eqqcolon", "=:").strip() + + +def _page_split_comment(page_number: int) -> str: + return f"\n\n<--- Page Split --->\n" + + +def _join_page_outputs(page_outputs: List[str]) -> str: + if not page_outputs: + 
return "" + first_page = str(page_outputs[0]) + parts = [first_page] + emitted = bool(first_page) + for page_number, page_text in enumerate(page_outputs[1:], start=2): + separator = _page_split_comment(page_number) + if not emitted: + separator = separator.lstrip("\n") + parts.append(separator) + emitted = True + parts.append(str(page_text)) + return "".join(parts) + + +def _split_page_outputs(markdown_text: str) -> List[str]: + content = str(markdown_text or "").rstrip("\n") + if not content: + return [] + return PAGE_SPLIT_RE.split(content) + + +def _serialize_markdown(markdown: str) -> str: + return str(markdown or "").rstrip("\n") + "\n" + + +def _postprocess_page_text( + text: str, + *, + prompt: str, + content_debug: bool, +) -> tuple[str, dict]: + metrics: dict = {} + cleaned = _clean_markdown(text) + cleaned = strip_prompt_echo(cleaned, prompt) + cleaned = clean_output(cleaned, keep_refdet=False, metrics=metrics) + cleaned = canonicalize_markdown(cleaned) + cleaned = apply_early_stop(cleaned, content_debug=content_debug, metrics=metrics) + return cleaned.strip(), metrics + + +def _resolve_attn_backend(attn_backend: str) -> str: + requested = str(attn_backend or "auto").strip().lower() + if requested != "auto": + return requested + try: + import flash_attn # noqa: F401 + return "flash_attention_2" + except Exception: + # DeepSeek-OCR-2's custom decoder path has not behaved reliably with SDPA + # on the stacks we have exercised; if FA2 is unavailable, prefer the known + # fallback instead of silently selecting a backend that then downgrades. 
+ return "eager" + + +def _supports_retry_with_eager(exc: Exception, attn_impl: str) -> bool: + if str(attn_impl) == "eager": + return False + message = str(exc) + markers = ( + "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention", + 'load your model with the argument `attn_implementation="eager"` meanwhile', + ) + return any(marker in message for marker in markers) + + +def _configure_generate( + model, + *, + max_new_tokens: int | None, + repetition_penalty: float | None, + no_repeat_ngram_size: int | None, +): + if ( + max_new_tokens is None + and repetition_penalty is None + and no_repeat_ngram_size is None + ): + return + capped = None + if max_new_tokens is not None: + capped = int(max_new_tokens) + if capped <= 0: + raise ValueError("max_new_tokens must be > 0") + repetition_penalty_value = None + if repetition_penalty is not None: + repetition_penalty_value = float(repetition_penalty) + if repetition_penalty_value <= 0: + raise ValueError("repetition_penalty must be > 0") + no_repeat_ngram_value = None + if no_repeat_ngram_size is not None: + no_repeat_ngram_value = int(no_repeat_ngram_size) + if no_repeat_ngram_value <= 0: + raise ValueError("no_repeat_ngram_size must be > 0") + original_generate = model.generate + + def _wrapped_generate(*args, **kwargs): + if capped is not None: + current = kwargs.get("max_new_tokens") + if current is None: + kwargs["max_new_tokens"] = capped + else: + kwargs["max_new_tokens"] = min(int(current), capped) + if repetition_penalty_value is not None and kwargs.get("repetition_penalty") is None: + kwargs["repetition_penalty"] = repetition_penalty_value + if no_repeat_ngram_value is not None and kwargs.get("no_repeat_ngram_size") is None: + kwargs["no_repeat_ngram_size"] = no_repeat_ngram_value + return original_generate(*args, **kwargs) + + model.generate = _wrapped_generate + + +def _load_model( + model_dir: Path, + device: str, + attn_backend: str, + max_new_tokens: int | 
def _infer_page(
    model,
    tokenizer,
    image_path: Path,
    output_dir: Path,
    *,
    prompt: str,
    base_size: int,
    image_size: int,
    crop_mode: bool,
) -> str:
    """Run ``model.infer`` on one page image and return the cleaned text.

    The image and output locations are handed to the model as strings, and
    inference is requested with ``save_results=False`` and
    ``eval_mode=True``. Whatever the model returns is converted to ``str``
    and passed through ``_clean_markdown``.
    """
    infer_kwargs = {
        "prompt": prompt,
        "image_file": str(image_path),
        "output_path": str(output_dir),
        "base_size": base_size,
        "image_size": image_size,
        "crop_mode": crop_mode,
        "save_results": False,
        "eval_mode": True,
    }
    raw_output = model.infer(tokenizer, **infer_kwargs)
    return _clean_markdown(str(raw_output))
def _write_progress(
    output_dir: Path,
    stem: str,
    page_outputs: List[str],
    total_pages: int,
    completed_pages: int,
) -> None:
    """Emit lightweight progress artifacts during long OCR runs.

    Two files are written under *output_dir*:

    * ``sidecars/ocr_progress/<stem>.partial.md`` - the pages produced so
      far, joined with ``_join_page_outputs``; only written when that joined
      text is non-empty.
    * ``json/metrics/<stem>.progress.json`` - completed/total page counts,
      a ``running``/``complete`` status and the model identifier.

    Args:
        output_dir: Root directory of the OCR run outputs.
        stem: Output stem of the document being processed.
        page_outputs: Per-page text collected so far.
        total_pages: Number of pages the job will process.
        completed_pages: Number of pages finished so far.
    """
    metrics_dir = output_dir / "json" / "metrics"
    progress_dir = output_dir / "sidecars" / "ocr_progress"
    metrics_dir.mkdir(parents=True, exist_ok=True)
    progress_dir.mkdir(parents=True, exist_ok=True)
    partial_markdown = _join_page_outputs(page_outputs)
    if partial_markdown:
        (progress_dir / f"{stem}.partial.md").write_text(_serialize_markdown(partial_markdown), encoding="utf-8")
    progress = {
        "completed_pages": completed_pages,
        "total_pages": total_pages,
        # A job counts as complete once every page has been accounted for.
        "status": "running" if completed_pages < total_pages else "complete",
        "model": "deepseek-ai/DeepSeek-OCR-2",
    }
    (metrics_dir / f"{stem}.progress.json").write_text(
        json.dumps(progress, indent=2),
        encoding="utf-8",
    )
int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + tokenizer, model, attn_impl = _load_model( + model_dir, + args.device, + args.attn_backend, + args.max_new_tokens, + args.repetition_penalty, + args.no_repeat_ngram_size, + ) + + for job in jobs: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) + doc_start = time.perf_counter() + render_start = time.perf_counter() + images = _render_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ) + render_sec = time.perf_counter() - render_start + page_outputs: List[str] = [] + page_metrics: List[dict] = [] + total_pages = len(images) + _write_progress(output_dir, stem, page_outputs, total_pages, 0) + with tempfile.TemporaryDirectory(prefix=f"{stem}_deepseek_") as tmp_dir_str: + tmp_dir = Path(tmp_dir_str) + for idx, image in enumerate(images): + page_png = tmp_dir / f"page_{idx + 1:04d}.png" + image.save(page_png, format="PNG") + infer_start = time.perf_counter() + raw_page_text = _infer_page( + model, + tokenizer, + page_png, + tmp_dir / f"page_{idx + 1:04d}", + prompt=prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + ) + infer_sec = time.perf_counter() - infer_start + page_text, postprocess_metrics = _postprocess_page_text( + raw_page_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + page_outputs.append(page_text) + page_metrics.append( + { + "page_number": int(idx + 1), + "infer_sec": float(infer_sec), + "raw_chars": int(len(str(raw_page_text or "").strip())), + "final_chars": int(len(page_text.strip())), + **postprocess_metrics, + } + ) + _write_progress( + output_dir, + stem, + page_outputs, + total_pages, + idx + 1, + ) + markdown = _join_page_outputs(page_outputs) if page_outputs else "[[Blank page]]" + _write_outputs( + 
output_dir, + stem, + markdown, + len(images), + extra_metrics={ + "source_file": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), + "source_end_page": int(job["start_page"]) + max(0, len(images) - 1), + "ocr_profile": args.ocr_profile, + "attn_backend": attn_impl, + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "repetition_penalty": args.repetition_penalty, + "no_repeat_ngram_size": args.no_repeat_ngram_size, + "render_sec": float(render_sec), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - doc_start), + "page_metrics": page_metrics, + }, + ) + + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py new file mode 100644 index 0000000..edc838b --- /dev/null +++ b/src/glossapi/ocr/deepseek/run_pdf_ocr_vllm.py @@ -0,0 +1,1382 @@ +"""CLI wrapper for DeepSeek-OCR-2 inference over PDF files using vLLM.""" + +from __future__ import annotations + +import argparse +import json +import logging +import queue +import sys +import threading +import time +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from PIL import Image + +SRC_ROOT = Path(__file__).resolve().parents[3] +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import ( + DEFAULT_MAX_NEW_TOKENS, + _join_page_outputs, + _count_rendered_pages, + _iter_pdf_jobs, + _iter_rendered_pages, + _postprocess_page_text, + _profile_defaults, + _split_page_outputs, + _write_outputs, + _write_progress, +) +from glossapi.ocr.deepseek.work_queue import ( + QUEUE_MAIN, + QUEUE_REPAIR, + STATUS_PENDING, + STATUS_RUNNING, + claim_next_batch, + 
def _parse_args() -> argparse.Namespace:
    """Define the vLLM OCR command-line interface and parse the arguments.

    ``--crop-mode`` / ``--no-crop-mode`` share the ``crop_mode`` destination,
    whose default is ``None`` so callers can tell "not specified" apart from
    an explicit choice.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    add = cli.add_argument

    # Required locations.
    for flag in ("--input-dir", "--output-dir", "--model-dir"):
        add(flag, required=True)

    # Document / page selection.
    for flag in ("--files", "--page-ranges"):
        add(flag, nargs="*", default=[])
    add("--max-pages", type=int, default=None)

    # Model, prompt and rendering options.
    add("--device", default="cuda")
    add("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"])
    add("--prompt-override", default=None)
    add("--attn-backend", default="vllm")
    for flag in ("--base-size", "--image-size"):
        add(flag, type=int, default=None)
    add("--render-dpi", type=int, default=144)

    # Decoding limits.
    add("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS)
    add("--repetition-penalty", type=float, default=None)
    add("--no-repeat-ngram-size", type=int, default=None)

    # Tri-state crop switch: True, False, or None when neither flag is given.
    add("--crop-mode", dest="crop_mode", action="store_true")
    add("--no-crop-mode", dest="crop_mode", action="store_false")
    cli.set_defaults(crop_mode=None)

    # vLLM engine and repair behaviour.
    add("--batch-size", type=int, default=8)
    add("--gpu-memory-utilization", type=float, default=0.9)
    add("--disable-fp8-kv", action="store_true")
    add("--repair-mode", default="auto", choices=["auto", "off"])
    add("--content-debug", action="store_true")

    # Shared work-queue options.
    for flag in ("--work-db", "--worker-id", "--worker-runtime-file"):
        add(flag, default=None)
    add("--work-stale-after-sec", type=float, default=900.0)
    add("--work-heartbeat-sec", type=float, default=10.0)
    add("--work-max-attempts", type=int, default=2)
    add("--repair-exec-batch-target-pages", type=int, default=DEFAULT_REPAIR_EXEC_BATCH_TARGET_PAGES)
    add("--repair-exec-batch-target-items", type=int, default=DEFAULT_REPAIR_EXEC_BATCH_TARGET_ITEMS)
    return cli.parse_args()
current_len <= self.seen_output_tokens: + return logits + new_ids = output_ids[self.seen_output_tokens :] + self.seen_output_tokens = current_len + if not new_ids: + return logits + new_text = self.tokenizer.decode(new_ids, skip_special_tokens=False) + if new_text: + self.detector.feed(new_text) + if current_len < self.min_output_tokens or self.detector.triggered_reason is None: + return logits + eos_token_id = int(self.eos_token_id) + eos_value = logits[eos_token_id].clone() + logits[:] = float("-inf") + logits[eos_token_id] = eos_value + return logits + + class GarbageEarlyStopLogitsProcessor(AdapterLogitsProcessor): + @classmethod + def validate_params(cls, params: SamplingParams): + extra = params.extra_args or {} + enabled = extra.get("garbage_early_stop") + if enabled is None: + return + if not isinstance(enabled, bool): + raise ValueError("garbage_early_stop must be a bool when provided") + min_output_tokens = extra.get("garbage_min_output_tokens") + if min_output_tokens is not None and int(min_output_tokens) <= 0: + raise ValueError("garbage_min_output_tokens must be > 0") + window_tokens = extra.get("garbage_window_tokens") + if window_tokens is not None and int(window_tokens) <= 0: + raise ValueError("garbage_window_tokens must be > 0") + + def __init__(self, vllm_config, device, is_pin_memory): + super().__init__(vllm_config, device, is_pin_memory) + self._tokenizer = AutoTokenizer.from_pretrained(str(model_dir), trust_remote_code=True) + self._eos_token_id = self._tokenizer.eos_token_id + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor(self, params: SamplingParams): + extra = params.extra_args or {} + if not bool(extra.get("garbage_early_stop", False)): + return None + return _GarbageStopPerReqLogitsProcessor( + self._tokenizer, + self._eos_token_id, + min_output_tokens=int( + extra.get("garbage_min_output_tokens", GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS) + ), + window_tokens=int( + 
extra.get("garbage_window_tokens", GARBAGE_EARLY_STOP_WINDOW_TOKENS) + ), + ) + + logits_processors.append(GarbageEarlyStopLogitsProcessor) + except Exception as exc: # pragma: no cover - environment dependent + LOGGER.warning("Garbage-stop logits processor unavailable in vLLM; continuing without it: %s", exc) + + engine_kwargs = { + "model": str(model_dir), + "tokenizer": str(model_dir), + "trust_remote_code": True, + "dtype": "bfloat16", + "enable_prefix_caching": False, + "mm_processor_cache_gb": 0, + "gpu_memory_utilization": float(gpu_memory_utilization), + "tensor_parallel_size": 1, + } + if disable_fp8_kv: + engine_kwargs["kv_cache_dtype"] = "auto" + if logits_processors: + engine_kwargs["logits_processors"] = logits_processors + return LLM(**engine_kwargs) + + +def _sampling_params(max_new_tokens: int | None, *, enable_garbage_early_stop: bool): + from vllm import SamplingParams + + return SamplingParams( + temperature=0.0, + max_tokens=int(max_new_tokens or DEFAULT_MAX_NEW_TOKENS), + skip_special_tokens=False, + extra_args={ + "ngram_size": 30, + "window_size": 90, + "whitelist_token_ids": {128821, 128822}, + "garbage_early_stop": bool(enable_garbage_early_stop), + "garbage_min_output_tokens": int(GARBAGE_EARLY_STOP_MIN_OUTPUT_TOKENS), + "garbage_window_tokens": int(GARBAGE_EARLY_STOP_WINDOW_TOKENS), + }, + ) + + +def _batched(items: List[dict], batch_size: int) -> List[List[dict]]: + size = max(1, int(batch_size)) + return [items[idx : idx + size] for idx in range(0, len(items), size)] + + +def _image_content_stats(image: Image.Image) -> dict: + sample = image.convert("L") + sample.thumbnail((256, 256)) + width, height = sample.size + pixels = list(sample.getdata()) + + def _dark_ratio(y0: int, y1: int) -> float: + values = [] + for row in range(y0, y1): + start = row * width + values.extend(pixels[start : start + width]) + total = len(values) + if total <= 0: + return 0.0 + dark = sum(1 for value in values if value < REPAIR_DARK_THRESHOLD) + return 
float(dark) / float(total) + + half = max(1, height // 2) + third = max(1, height // 3) + top_third_end = min(height, third) + middle_third_end = min(height, third * 2) + dark_total = sum(1 for value in pixels if value < REPAIR_DARK_THRESHOLD) + return { + "top_dark_ratio": _dark_ratio(0, half), + "bottom_dark_ratio": _dark_ratio(half, height), + "top_third_dark_ratio": _dark_ratio(0, top_third_end), + "middle_third_dark_ratio": _dark_ratio(top_third_end, middle_third_end), + "bottom_third_dark_ratio": _dark_ratio(middle_third_end, height), + "overall_dark_ratio": float(dark_total) / float(max(1, len(pixels))), + } + + +def _text_quality_metrics(text: str) -> dict: + stripped = str(text or "").strip() + letters = sum(1 for ch in stripped if ch.isalpha()) + digits = sum(1 for ch in stripped if ch.isdigit()) + pua_chars = sum( + 1 + for ch in stripped + if 0xE000 <= ord(ch) <= 0xF8FF + or 0xF0000 <= ord(ch) <= 0xFFFFD + or 0x100000 <= ord(ch) <= 0x10FFFD + ) + lines = [line.strip() for line in stripped.splitlines() if line.strip()] + avg_line_length = (sum(len(line) for line in lines) / float(len(lines))) if lines else 0.0 + score = float(letters) + (0.10 * float(len(stripped))) + (0.05 * float(digits)) - (20.0 * float(pua_chars)) + return { + "chars": int(len(stripped)), + "letters": int(letters), + "digits": int(digits), + "pua_chars": int(pua_chars), + "line_count": int(len(lines)), + "avg_line_length": float(avg_line_length), + "quality_score": float(score), + } + + +def _is_effectively_empty_page(image_stats: dict, repair_mode: str) -> bool: + if str(repair_mode or "off").strip().lower() != "auto": + return False + overall_dark = float(image_stats.get("overall_dark_ratio", 0.0)) + if overall_dark > EMPTY_PAGE_OVERALL_DARK_MAX: + return False + return all( + float(image_stats.get(key, 0.0)) <= EMPTY_PAGE_BAND_DARK_MAX + for key in ( + "top_dark_ratio", + "bottom_dark_ratio", + "top_third_dark_ratio", + "middle_third_dark_ratio", + "bottom_third_dark_ratio", + ) + 
def _resolve_job_image(item: dict) -> Tuple[Image.Image, bool]:
    """Return the page image for a job item and whether the caller owns it.

    Args:
        item: Job dict carrying either an in-memory ``"image"`` or an
            ``"image_path"`` to load from disk.

    Returns:
        ``(image, should_close)``. ``should_close`` is ``False`` when the
        image already held by *item* is returned, and ``True`` when a fresh
        RGB image was loaded from ``item["image_path"]`` and must be closed
        by the caller.
    """
    image = item.get("image")
    if isinstance(image, Image.Image):
        return image, False
    # convert() yields a separate image object, so the file-backed source can
    # be closed right away instead of leaving its handle open.
    with Image.open(item["image_path"]) as source:
        return source.convert("RGB"), True
int(source_page_number) - 1 + if idx < 0 or idx >= int(doc.page_count): + raise ValueError(f"Requested page {source_page_number} outside document bounds for {pdf_path}") + page = doc[idx] + pixmap = page.get_pixmap(matrix=matrix, alpha=False) + yield int(source_page_number), Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples) + finally: + doc.close() + + +def _emit_progress(output_dir: Path, stem: str, state: dict) -> None: + _write_progress( + output_dir, + stem, + state["page_outputs"], + int(state["total_pages"]), + int(state["completed_pages"]), + ) + + +def _resolve_repair_disposition(*, repair_text: str, repair_postprocess: dict) -> dict: + if bool(repair_postprocess.get("early_stops", 0)): + return { + "final_text": "", + "repair_applied": False, + "page_dropped_after_repair": True, + "drop_reason": "repeat_garbage_cutoff", + } + if repair_text.strip(): + return { + "final_text": repair_text, + "repair_applied": True, + "page_dropped_after_repair": False, + "drop_reason": None, + } + return { + "final_text": None, + "repair_applied": False, + "page_dropped_after_repair": False, + "drop_reason": None, + } + + +def _repair_summary_from_page_metrics(page_metrics: List[dict], repair_mode: str) -> dict: + return { + "repair_mode": str(repair_mode), + "pages_flagged": int(sum(1 for item in page_metrics if str(item.get("repair_strategy")) != "none")), + "pages_repaired": int(sum(1 for item in page_metrics if bool(item.get("repair_applied")))), + "plain_repairs": int( + sum(1 for item in page_metrics if str(item.get("repair_profile")) == "plain_ocr" and bool(item.get("repair_applied"))) + ), + "tiled_repairs": 0, + "pages_dropped_after_repeat_cutoff": int(sum(1 for item in page_metrics if bool(item.get("page_dropped_after_repair")))), + "empty_pages_skipped": int(sum(1 for item in page_metrics if bool(item.get("empty_page_skipped")))), + "pages_with_early_stop": int(sum(1 for item in page_metrics if bool(item.get("garbage_early_stop_applied")))), + 
} + + +def _load_persisted_doc_state(output_dir: Path, stem: str) -> dict: + markdown_path = output_dir / "markdown" / f"{stem}.md" + metrics_path = output_dir / "json" / "metrics" / f"{stem}.metrics.json" + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + page_count = int(metrics.get("page_count", 0)) + page_outputs = _split_page_outputs(markdown_path.read_text(encoding="utf-8")) if markdown_path.exists() else [] + if len(page_outputs) < page_count: + page_outputs.extend([""] * (page_count - len(page_outputs))) + elif len(page_outputs) > page_count: + page_outputs = page_outputs[:page_count] + metrics_by_page = { + int(item["page_number"]): dict(item) + for item in list(metrics.get("page_metrics") or []) + if item is not None and "page_number" in item + } + page_metrics = [metrics_by_page.get(page_number) for page_number in range(1, page_count + 1)] + extra_metrics = dict(metrics) + extra_metrics.pop("page_count", None) + extra_metrics.pop("model", None) + return { + "stem": stem, + "page_outputs": page_outputs, + "page_metrics": page_metrics, + "total_pages": page_count, + "extra_metrics": extra_metrics, + } + + +def _build_repair_batches(*, doc_states: Dict[str, dict], retry_pages_by_stem: Dict[str, List[int]], origin_batch_id: int) -> List[dict]: + batches: List[dict] = [] + for stem, retry_pages in sorted(retry_pages_by_stem.items()): + unique_retry_pages = sorted({int(page_number) for page_number in retry_pages}) + if not unique_retry_pages: + continue + state = doc_states[stem] + batches.append( + { + "queue_key": f"repair:{int(origin_batch_id)}:{stem}", + "origin_batch_id": int(origin_batch_id), + "stem": stem, + "pdf_path": str(state["pdf_path"]), + "source_name": str(state["source_name"]), + "source_stem": str(state["source_stem"]), + "source_start_page": int(state["source_start_page"]), + "source_end_page": int(state["source_start_page"]) + max(0, int(state["total_pages"]) - 1), + "repair_page_numbers": unique_retry_pages, + "pages": 
def _claim_additional_repair_batches(
    work_db: Path,
    *,
    worker_id: str,
    stale_after_sec: float,
    first_batch: dict,
    target_pages: int,
    target_items: int,
) -> List[dict]:
    """Grow an already-claimed repair batch into a larger execution pack.

    Starting from *first_batch*, further batches are claimed from the repair
    queue until the pack holds *target_items* batches, covers at least
    *target_pages* pages, or the queue returns nothing. Both targets are
    clamped to a minimum of 1. When *first_batch* carries a ``batch_id``, a
    heartbeat is sent for it before any further claim is attempted.

    Returns:
        Copies of the claimed batch dicts, *first_batch* first.
    """
    fallback_pages = len(list(first_batch.get("repair_page_numbers") or []))
    pack = [dict(first_batch)]
    pages_in_pack = max(0, int(first_batch.get("pages", fallback_pages)))
    page_budget = max(1, int(target_pages))
    item_budget = max(1, int(target_items))

    if "batch_id" in first_batch:
        heartbeat_batch(work_db, batch_id=int(first_batch["batch_id"]), worker_id=worker_id)

    while True:
        if len(pack) >= item_budget or pages_in_pack >= page_budget:
            break
        extra = claim_next_batch(
            work_db,
            worker_id=worker_id,
            stale_after_sec=stale_after_sec,
            queue_name=QUEUE_REPAIR,
        )
        if extra is None:
            break
        pack.append(dict(extra))
        pages_in_pack += max(0, int(extra.get("pages", 0)))
    return pack
"execution_pack_items": int(len(execution_pack_batch_ids)), + "queue_name": QUEUE_REPAIR, + "batch_id": int(batch["batch_id"]) if "batch_id" in batch else None, + } + + +def _run_vllm_batch( + llm, + *, + batch: List[dict], + prompt: str, + sampling_params, +) -> List[dict]: + if not batch: + return [] + + prompt_batch = [] + opened_images: List[Image.Image] = [] + keys: List[tuple[str, int]] = [] + for item in batch: + image, should_close = _resolve_job_image(item) + if should_close: + opened_images.append(image) + keys.append((str(item["stem"]), int(item["page_number"]))) + prompt_batch.append( + { + "prompt": prompt, + "multi_modal_data": {"image": image}, + } + ) + + try: + infer_start = time.perf_counter() + batch_outputs = llm.generate(prompt_batch, sampling_params=sampling_params) + infer_sec = time.perf_counter() - infer_start + finally: + for image in opened_images: + image.close() + + per_item_sec = infer_sec / max(1, len(batch)) + results: List[dict] = [] + for item, key, output in zip(batch, keys, batch_outputs): + raw_text = "" + if getattr(output, "outputs", None): + raw_text = str(output.outputs[0].text) + results.append( + { + "key": key, + "item": item, + "raw_text": raw_text, + "infer_sec": float(per_item_sec), + } + ) + return results + + +def _generate_batch_outputs( + llm, + *, + jobs: List[dict], + prompt: str, + batch_size: int, + sampling_params, +) -> List[dict]: + outputs_by_key: Dict[tuple[str, int], dict] = {} + for batch in _batched(jobs, batch_size): + for result in _run_vllm_batch( + llm, + batch=batch, + prompt=prompt, + sampling_params=sampling_params, + ): + outputs_by_key[result["key"]] = { + "item": result["item"], + "raw_text": result["raw_text"], + "infer_sec": result["infer_sec"], + } + return [outputs_by_key[(str(item["stem"]), int(item["page_number"]))] for item in jobs] + + +def _run_jobs_to_outputs( + args: argparse.Namespace, + *, + jobs_to_run: List[dict], + output_dir: Path, + work_db: Optional[Path], + origin_batch_id: 
Optional[int], + llm, + prompt: str, + plain_prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, + sampling_params, +) -> dict: + batch_wall_start = time.perf_counter() + batch_size = max(1, int(args.batch_size)) + doc_states: Dict[str, dict] = {} + plain_retry_jobs: List[dict] = [] + retry_pages_by_stem: Dict[str, List[int]] = {} + state_lock = threading.Lock() + render_queue: "queue.Queue[dict | None]" = queue.Queue(maxsize=max(2, batch_size * 2)) + producer_errors: List[BaseException] = [] + first_infer_started_at: Optional[float] = None + last_infer_completed_at: Optional[float] = None + shared_repair_queue = ( + work_db is not None + and origin_batch_id is not None + and str(args.repair_mode or "off").strip().lower() == "auto" + ) + + def _render_producer() -> None: + try: + for job in jobs_to_run: + pdf_path = Path(job["pdf_path"]) + stem = str(job["stem"]) + doc_start = time.perf_counter() + total_pages = _count_rendered_pages( + pdf_path, + args.max_pages, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ) + state = { + "stem": stem, + "pdf_path": str(pdf_path), + "source_name": str(job["source_name"]), + "source_stem": str(job["source_stem"]), + "source_start_page": int(job["start_page"]), + "page_outputs": [""] * total_pages, + "page_metrics": [None] * total_pages, + "render_sec": 0.0, + "doc_start": float(doc_start), + "completed_pages": 0, + "total_pages": total_pages, + } + with state_lock: + doc_states[stem] = state + _emit_progress(output_dir, stem, state) + + render_start = time.perf_counter() + for page_number, image in enumerate( + _iter_rendered_pages( + pdf_path, + args.max_pages, + args.render_dpi, + start_page=int(job["start_page"]), + end_page=job["end_page"], + ), + start=1, + ): + image_stats = _image_content_stats(image) + if _is_effectively_empty_page(image_stats, args.repair_mode): + with state_lock: + state["page_metrics"][page_number - 1] = _empty_page_metric( + page_number=page_number, + 
image_stats=image_stats, + ) + state["completed_pages"] = int(state["completed_pages"]) + 1 + _emit_progress(output_dir, stem, state) + image.close() + continue + render_queue.put( + { + "stem": stem, + "page_number": int(page_number), + "image": image, + "image_stats": image_stats, + } + ) + + with state_lock: + state["render_sec"] = float(time.perf_counter() - render_start) + except BaseException as exc: # pragma: no cover - exercised in integration flows + producer_errors.append(exc) + finally: + render_queue.put(None) + + producer = threading.Thread(target=_render_producer, name="deepseek-vllm-render", daemon=True) + producer.start() + + in_flight_batch: List[dict] = [] + producer_done = False + queue_wait_timeout = 0.05 + queue_flush_marker = "__flush__" + try: + while not producer_done or in_flight_batch: + if not producer_done and len(in_flight_batch) < batch_size: + try: + item = render_queue.get(timeout=queue_wait_timeout) + except queue.Empty: + item = queue_flush_marker if in_flight_batch else None + if item is None: + if producer.is_alive(): + continue + producer_done = True + elif item == queue_flush_marker: + pass + else: + in_flight_batch.append(item) + if len(in_flight_batch) < batch_size: + continue + + if not in_flight_batch: + continue + + batch_infer_started_at = time.time() + if first_infer_started_at is None: + first_infer_started_at = batch_infer_started_at + batch_results = _run_vllm_batch( + llm, + batch=in_flight_batch, + prompt=prompt, + sampling_params=sampling_params, + ) + last_infer_completed_at = time.time() + for result in batch_results: + item = result["item"] + state = doc_states[item["stem"]] + raw_text = str(result["raw_text"]) + image_stats = dict(item.get("image_stats", {})) + page_text, postprocess_metrics = _postprocess_page_text( + raw_text, + prompt=prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + page_text = f"\n{page_text}".strip() + quality = _text_quality_metrics(page_text) + metric = { 
+ "page_number": int(item["page_number"]), + "infer_sec": float(result["infer_sec"]), + "raw_chars": int(len(raw_text.strip())), + "final_chars": int(len(page_text.strip())), + "first_pass_quality_score": float(quality["quality_score"]), + "first_pass_letters": int(quality["letters"]), + "first_pass_digits": int(quality["digits"]), + "first_pass_pua_chars": int(quality["pua_chars"]), + "repair_strategy": "plain" if bool(postprocess_metrics.get("early_stops", 0)) else "none", + "repair_reason": "early_stop_markdown_garbage" if bool(postprocess_metrics.get("early_stops", 0)) else None, + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": bool(postprocess_metrics.get("early_stops", 0)), + **image_stats, + **postprocess_metrics, + } + with state_lock: + state["page_outputs"][item["page_number"] - 1] = page_text + state["page_metrics"][item["page_number"] - 1] = metric + state["completed_pages"] = int(state["completed_pages"]) + 1 + _emit_progress(output_dir, item["stem"], state) + + if bool(postprocess_metrics.get("early_stops", 0)) and str(args.repair_mode or "off").strip().lower() == "auto": + if shared_repair_queue: + retry_pages_by_stem.setdefault(str(item["stem"]), []).append(int(item["page_number"])) + _close_job_image(item) + else: + plain_retry_jobs.append(item) + else: + _close_job_image(item) + + in_flight_batch = [] + + producer.join() + if producer_errors: + raise producer_errors[0] + + if plain_retry_jobs: + repair_started_at = time.time() + if first_infer_started_at is None: + first_infer_started_at = repair_started_at + plain_repair_outputs = _generate_batch_outputs( + llm, + jobs=plain_retry_jobs, + prompt=plain_prompt, + batch_size=batch_size, + sampling_params=sampling_params, + ) + last_infer_completed_at = time.time() + for result in plain_repair_outputs: + item = result["item"] + state = doc_states[item["stem"]] + metric = 
state["page_metrics"][item["page_number"] - 1] + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + repair_text = f"\n{repair_text}".strip() + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + metric["repair_profile"] = "plain_ocr" + disposition = _resolve_repair_disposition( + repair_text=repair_text, + repair_postprocess=repair_postprocess, + ) + repair_effective_text = disposition["final_text"] or "" + metric["repair_final_chars"] = int(len(repair_effective_text.strip())) + metric["repair_quality_score"] = float(_text_quality_metrics(repair_effective_text)["quality_score"]) + metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) + metric["repair_applied"] = bool(disposition["repair_applied"]) + metric["page_dropped_after_repair"] = bool(disposition["page_dropped_after_repair"]) + if disposition["drop_reason"] is not None: + metric["drop_reason"] = str(disposition["drop_reason"]) + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric["infer_sec"]) + float(result["infer_sec"]) + with state_lock: + if disposition["final_text"] is not None: + state["page_outputs"][item["page_number"] - 1] = repair_effective_text + metric["final_chars"] = int(len(repair_effective_text.strip())) + _emit_progress(output_dir, item["stem"], state) + _close_job_image(item) + finally: + for item in in_flight_batch: + _close_job_image(item) + for item in plain_retry_jobs: + _close_job_image(item) + + for stem, state in doc_states.items(): + markdown = _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]" + page_metrics = sorted( + [item for item in state["page_metrics"] if item], + key=lambda item: 
int(item["page_number"]), + ) + repair_summary = _repair_summary_from_page_metrics(page_metrics, str(args.repair_mode)) + _write_outputs( + output_dir, + stem, + markdown, + int(state["total_pages"]), + extra_metrics={ + "source_file": str(state["source_name"]), + "source_stem": str(state["source_stem"]), + "source_start_page": int(state["source_start_page"]), + "source_end_page": int(state["source_start_page"]) + max(0, len(page_metrics) - 1), + "ocr_profile": args.ocr_profile, + "attn_backend": "vllm", + "runtime_backend": "vllm", + "base_size": base_size, + "image_size": image_size, + "crop_mode": crop_mode, + "render_dpi": int(args.render_dpi), + "max_new_tokens": args.max_new_tokens, + "batch_size": int(args.batch_size), + "gpu_memory_utilization": float(args.gpu_memory_utilization), + "disable_fp8_kv": bool(args.disable_fp8_kv), + "repair_mode": str(args.repair_mode), + "render_sec": float(state["render_sec"]), + "infer_sec_total": float(sum(item["infer_sec"] for item in page_metrics)), + "wall_time_sec": float(time.perf_counter() - float(state["doc_start"])), + "repair_summary": repair_summary, + "page_metrics": page_metrics, + }, + ) + if shared_repair_queue and retry_pages_by_stem: + enqueue_batches( + work_db, + queue_name=QUEUE_REPAIR, + batches=_build_repair_batches( + doc_states=doc_states, + retry_pages_by_stem=retry_pages_by_stem, + origin_batch_id=int(origin_batch_id), + ), + ) + + return { + "docs": int(len(doc_states)), + "pages": int(sum(int(state["total_pages"]) for state in doc_states.values())), + "render_sec_total": float(sum(float(state["render_sec"]) for state in doc_states.values())), + "infer_sec_total": float( + sum( + sum(float(item["infer_sec"]) for item in state["page_metrics"] if item is not None) + for state in doc_states.values() + ) + ), + "first_infer_started_at": _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None, + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at) if 
last_infer_completed_at is not None else None, + "repair_batches_enqueued": int(sum(1 for pages in retry_pages_by_stem.values() if pages)), + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + } + + +def _run_repair_batches_to_outputs( + args: argparse.Namespace, + *, + batches: List[dict], + output_dir: Path, + llm, + plain_prompt: str, + sampling_params, +) -> dict: + batch_wall_start = time.perf_counter() + claimed_batches = [dict(batch) for batch in batches] + if not claimed_batches: + return { + "docs": 0, + "pages": 0, + "render_sec_total": 0.0, + "infer_sec_total": 0.0, + "first_infer_started_at": None, + "last_infer_completed_at": None, + "batch_wall_time_sec": float(time.perf_counter() - batch_wall_start), + "per_batch_results": {}, + } + + state_by_stem: Dict[str, dict] = {} + repair_jobs: List[dict] = [] + per_batch_results: Dict[int, dict] = {} + execution_pack_batch_ids = [int(batch["batch_id"]) for batch in claimed_batches if "batch_id" in batch] + execution_pack_pages = int(sum(max(0, int(batch.get("pages", 0))) for batch in claimed_batches)) + render_sec_total = 0.0 + + for batch in claimed_batches: + batch_id = int(batch["batch_id"]) if "batch_id" in batch else None + stem = str(batch["stem"]) + state = state_by_stem.get(stem) + if state is None: + state = _load_persisted_doc_state(output_dir, stem) + state_by_stem[stem] = state + source_start_page = int(batch["source_start_page"]) + repair_page_numbers = sorted({int(page_number) for page_number in list(batch.get("repair_page_numbers") or [])}) + render_start = time.perf_counter() + if repair_page_numbers: + source_page_numbers = [source_start_page + page_number - 1 for page_number in repair_page_numbers] + for source_page_number, image in _iter_selected_rendered_pages( + Path(str(batch["pdf_path"])), + render_dpi=int(args.render_dpi), + source_page_numbers=source_page_numbers, + ): + repair_jobs.append( + { + "batch_id": batch_id, + "stem": stem, + "page_number": 
int(source_page_number) - source_start_page + 1, + "image": image, + } + ) + render_sec = float(time.perf_counter() - render_start) + render_sec_total += render_sec + if batch_id is not None: + per_batch_results[batch_id] = _repair_batch_result( + batch=batch, + render_sec_total=render_sec, + infer_sec_total=0.0, + first_infer_started_at=None, + last_infer_completed_at=None, + batch_wall_time_sec=float(time.perf_counter() - batch_wall_start), + execution_pack_batch_ids=execution_pack_batch_ids, + execution_pack_pages=execution_pack_pages, + ) + + first_infer_started_at: Optional[float] = None + last_infer_completed_at: Optional[float] = None + if repair_jobs: + first_infer_started_at = time.time() + repair_outputs = _generate_batch_outputs( + llm, + jobs=repair_jobs, + prompt=plain_prompt, + batch_size=max(1, int(args.batch_size)), + sampling_params=sampling_params, + ) + last_infer_completed_at = time.time() + else: + repair_outputs = [] + + try: + for result in repair_outputs: + item = result["item"] + stem = str(item["stem"]) + page_number = int(item["page_number"]) + batch_id = int(item["batch_id"]) if item.get("batch_id") is not None else None + state = state_by_stem[stem] + metric = state["page_metrics"][page_number - 1] + if metric is None: + metric = { + "page_number": page_number, + "infer_sec": 0.0, + "raw_chars": 0, + "final_chars": 0, + "first_pass_quality_score": 0.0, + "first_pass_letters": 0, + "first_pass_digits": 0, + "first_pass_pua_chars": 0, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + } + state["page_metrics"][page_number - 1] = metric + repair_text, repair_postprocess = _postprocess_page_text( + str(result["raw_text"]), + prompt=plain_prompt, + content_debug=bool(args.content_debug), + ) + if args.content_debug: + repair_text = 
f"\n{repair_text}".strip() + metric["repair_attempted"] = True + metric["repair_infer_sec"] = float(result["infer_sec"]) + metric["repair_raw_chars"] = int(len(str(result["raw_text"]).strip())) + metric["repair_profile"] = "plain_ocr" + disposition = _resolve_repair_disposition( + repair_text=repair_text, + repair_postprocess=repair_postprocess, + ) + repair_effective_text = disposition["final_text"] or "" + metric["repair_final_chars"] = int(len(repair_effective_text.strip())) + metric["repair_quality_score"] = float(_text_quality_metrics(repair_effective_text)["quality_score"]) + metric["repair_garbage_early_stop_applied"] = bool(repair_postprocess.get("early_stops", 0)) + metric["repair_applied"] = bool(disposition["repair_applied"]) + metric["page_dropped_after_repair"] = bool(disposition["page_dropped_after_repair"]) + if disposition["drop_reason"] is not None: + metric["drop_reason"] = str(disposition["drop_reason"]) + metric.update({f"repair_{key}": value for key, value in repair_postprocess.items()}) + metric["infer_sec"] = float(metric.get("infer_sec", 0.0)) + float(result["infer_sec"]) + if disposition["final_text"] is not None: + state["page_outputs"][page_number - 1] = repair_effective_text + metric["final_chars"] = int(len(repair_effective_text.strip())) + if batch_id is not None and batch_id in per_batch_results: + per_batch_results[batch_id]["infer_sec_total"] = float( + per_batch_results[batch_id]["infer_sec_total"] + float(result["infer_sec"]) + ) + _close_job_image(item) + finally: + for item in repair_jobs: + _close_job_image(item) + + for stem, state in state_by_stem.items(): + page_metrics = sorted([item for item in state["page_metrics"] if item], key=lambda item: int(item["page_number"])) + extra_metrics = dict(state["extra_metrics"]) + extra_metrics["repair_summary"] = _repair_summary_from_page_metrics( + page_metrics, + extra_metrics.get("repair_mode", args.repair_mode), + ) + extra_metrics["page_metrics"] = page_metrics + 
extra_metrics["infer_sec_total"] = float(sum(float(item["infer_sec"]) for item in page_metrics)) + _write_outputs( + output_dir, + stem, + _join_page_outputs(state["page_outputs"]) if state["page_outputs"] else "[[Blank page]]", + int(state["total_pages"]), + extra_metrics=extra_metrics, + ) + + batch_wall_time_sec = float(time.perf_counter() - batch_wall_start) + for batch_id, result in per_batch_results.items(): + result["first_infer_started_at"] = ( + _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None + ) + result["last_infer_completed_at"] = ( + _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None + ) + result["batch_wall_time_sec"] = batch_wall_time_sec + + return { + "docs": int(len(state_by_stem)), + "pages": int( + sum(max(0, int(batch.get("pages", len(list(batch.get("repair_page_numbers") or []))))) for batch in claimed_batches) + ), + "render_sec_total": float(render_sec_total), + "infer_sec_total": float(sum(float(result["infer_sec"]) for result in repair_outputs)), + "first_infer_started_at": _utc_now_iso(first_infer_started_at) if first_infer_started_at is not None else None, + "last_infer_completed_at": _utc_now_iso(last_infer_completed_at) if last_infer_completed_at is not None else None, + "batch_wall_time_sec": batch_wall_time_sec, + "per_batch_results": per_batch_results, + } + + +def _run_repair_batch_to_outputs( + args: argparse.Namespace, + *, + batch: dict, + output_dir: Path, + llm, + plain_prompt: str, + sampling_params, +) -> dict: + result = _run_repair_batches_to_outputs( + args, + batches=[batch], + output_dir=output_dir, + llm=llm, + plain_prompt=plain_prompt, + sampling_params=sampling_params, + ) + batch_id = int(batch["batch_id"]) if "batch_id" in batch else None + if batch_id is not None and batch_id in result["per_batch_results"]: + return dict(result["per_batch_results"][batch_id]) + result.pop("per_batch_results", None) + return result + + +def 
_queue_has_pending_or_running(counts: Dict[str, object], queue_name: str) -> bool: + queue_counts = counts.get("by_queue", {}).get(queue_name, {}) + return int(queue_counts.get(STATUS_PENDING, 0)) > 0 or int(queue_counts.get(STATUS_RUNNING, 0)) > 0 + + +def _claim_next_phase_batch( + work_db: Path, + *, + worker_id: str, + stale_after_sec: float, +) -> Tuple[Optional[str], Optional[Dict[str, object]], bool]: + batch = claim_next_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + queue_name=QUEUE_MAIN, + ) + if batch is not None: + return QUEUE_MAIN, batch, False + + counts = work_queue_counts(work_db) + # Repairs are a distinct global phase: no worker should start repair work + # while any first-pass batch is still pending or running elsewhere. + if _queue_has_pending_or_running(counts, QUEUE_MAIN): + return None, None, True + + batch = claim_next_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + queue_name=QUEUE_REPAIR, + ) + if batch is not None: + return QUEUE_REPAIR, batch, False + + counts = work_queue_counts(work_db) + if _queue_has_pending_or_running(counts, QUEUE_REPAIR): + return None, None, True + return None, None, False + + +def _run_work_queue( + args: argparse.Namespace, + *, + input_dir: Path, + output_dir: Path, + llm, + prompt: str, + plain_prompt: str, + base_size: int, + image_size: int, + crop_mode: bool, + sampling_params, +) -> int: + work_db = Path(str(args.work_db)).expanduser().resolve() + worker_id = str(args.worker_id or f"worker-{int(time.time())}") + runtime_file = Path(str(args.worker_runtime_file)).expanduser().resolve() if args.worker_runtime_file else None + heartbeat_interval = float(max(1.0, args.work_heartbeat_sec)) + stale_after_sec = float(max(30.0, args.work_stale_after_sec)) + max_attempts = int(max(1, args.work_max_attempts)) + runtime_state = { + "worker_id": worker_id, + "status": "starting", + "started_at": _utc_now_iso(), + "engine_ready_at": _utc_now_iso(), + 
"current_batch_id": None, + "current_queue_name": None, + "current_batch_ids": [], + "completed_batches": [], + "first_batch_started_at": None, + "last_batch_finished_at": None, + } + _write_worker_runtime(runtime_file, runtime_state) + + while True: + queue_name, batch, should_wait = _claim_next_phase_batch( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + ) + if batch is None: + if should_wait: + time.sleep(min(heartbeat_interval, 1.0)) + continue + runtime_state["status"] = "complete" + runtime_state["current_batch_id"] = None + runtime_state["current_queue_name"] = None + _write_worker_runtime(runtime_file, runtime_state) + return 0 + + claimed_batches = [dict(batch)] + if queue_name == QUEUE_REPAIR: + claimed_batches = _claim_additional_repair_batches( + work_db, + worker_id=worker_id, + stale_after_sec=stale_after_sec, + first_batch=batch, + target_pages=int(args.repair_exec_batch_target_pages), + target_items=int(args.repair_exec_batch_target_items), + ) + batch_ids = [int(claimed_batch["batch_id"]) for claimed_batch in claimed_batches if "batch_id" in claimed_batch] + batch_id = batch_ids[0] + heartbeat_stop = threading.Event() + + def _heartbeat_loop() -> None: + while not heartbeat_stop.wait(heartbeat_interval): + for heartbeat_batch_id in batch_ids: + heartbeat_batch(work_db, batch_id=heartbeat_batch_id, worker_id=worker_id) + runtime_state["heartbeat_at"] = _utc_now_iso() + _write_worker_runtime(runtime_file, runtime_state) + + heartbeat_thread = threading.Thread(target=_heartbeat_loop, name=f"{worker_id}-heartbeat", daemon=True) + heartbeat_thread.start() + try: + runtime_state["status"] = f"running_{queue_name}" + runtime_state["current_batch_id"] = batch_id + runtime_state["current_queue_name"] = queue_name + runtime_state["current_batch_ids"] = batch_ids + runtime_state["current_batch_pages"] = int(sum(int(claimed_batch.get("pages", 0)) for claimed_batch in claimed_batches)) + runtime_state["heartbeat_at"] = _utc_now_iso() + 
_write_worker_runtime(runtime_file, runtime_state) + if queue_name == QUEUE_MAIN: + result = _run_jobs_to_outputs( + args, + jobs_to_run=_build_jobs_from_batch(input_dir, batch), + output_dir=output_dir, + work_db=work_db, + origin_batch_id=batch_id, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) + per_batch_results = {batch_id: dict(result)} + else: + result = _run_repair_batches_to_outputs( + args, + batches=claimed_batches, + output_dir=output_dir, + llm=llm, + plain_prompt=plain_prompt, + sampling_params=sampling_params, + ) + per_batch_results = dict(result.get("per_batch_results") or {}) + if runtime_state["first_batch_started_at"] is None: + runtime_state["first_batch_started_at"] = result.get("first_infer_started_at") + runtime_state["last_batch_finished_at"] = result.get("last_infer_completed_at") + runtime_state["completed_batches"].extend( + { + "batch_id": int(claimed_batch["batch_id"]), + "queue_name": queue_name, + } + for claimed_batch in claimed_batches + if "batch_id" in claimed_batch + ) + for claimed_batch in claimed_batches: + claimed_batch_id = int(claimed_batch["batch_id"]) + mark_batch_done( + work_db, + batch_id=claimed_batch_id, + worker_id=worker_id, + result=per_batch_results.get(claimed_batch_id, dict(result)), + ) + except Exception as exc: + runtime_state["status"] = "failed" + runtime_state["current_batch_id"] = batch_id + runtime_state["current_queue_name"] = queue_name + runtime_state["last_error"] = str(exc) + _write_worker_runtime(runtime_file, runtime_state) + for claimed_batch in claimed_batches: + mark_batch_failed( + work_db, + batch_id=int(claimed_batch["batch_id"]), + worker_id=worker_id, + error=str(exc), + max_attempts=max_attempts, + ) + raise + finally: + heartbeat_stop.set() + heartbeat_thread.join(timeout=max(1.0, heartbeat_interval)) + runtime_state["current_batch_id"] = None + 
runtime_state["current_queue_name"] = None + runtime_state["current_batch_ids"] = [] + _write_worker_runtime(runtime_file, runtime_state) + + +def main() -> int: + args = _parse_args() + input_dir = Path(args.input_dir).resolve() + output_dir = Path(args.output_dir).resolve() + model_dir = Path(args.model_dir).resolve() + + profile_defaults = _profile_defaults(args.ocr_profile) + prompt = str(args.prompt_override) if args.prompt_override else profile_defaults["prompt"] + plain_prompt = _profile_defaults("plain_ocr")["prompt"] + base_size = int(args.base_size) if args.base_size is not None else int(profile_defaults["base_size"]) + image_size = int(args.image_size) if args.image_size is not None else int(profile_defaults["image_size"]) + crop_mode = bool(args.crop_mode) if args.crop_mode is not None else bool(profile_defaults["crop_mode"]) + + llm = _load_vllm( + model_dir, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + ) + sampling_params = _sampling_params( + args.max_new_tokens, + enable_garbage_early_stop=str(args.repair_mode or "off").strip().lower() == "auto", + ) + + if args.work_db: + return _run_work_queue( + args, + input_dir=input_dir, + output_dir=output_dir, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) + + jobs_to_run = _iter_pdf_jobs(input_dir, args.files, args.page_ranges) + if not jobs_to_run: + return 0 + _run_jobs_to_outputs( + args, + jobs_to_run=jobs_to_run, + output_dir=output_dir, + work_db=None, + origin_batch_id=None, + llm=llm, + prompt=prompt, + plain_prompt=plain_prompt, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + sampling_params=sampling_params, + ) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/ocr/deepseek/runner.py b/src/glossapi/ocr/deepseek/runner.py index 
d68f05c..0731228 100644 --- a/src/glossapi/ocr/deepseek/runner.py +++ b/src/glossapi/ocr/deepseek/runner.py @@ -1,22 +1,72 @@ -"""DeepSeek OCR runner with stub and optional CLI dispatch.""" +"""DeepSeek OCR runner.""" from __future__ import annotations +from contextlib import ExitStack +import calendar import json import logging import os +import re +import signal import shutil import subprocess import sys +import threading +import time from pathlib import Path from typing import Any, Dict, Iterable, List, Optional +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) +from glossapi.ocr.deepseek.runtime_paths import resolve_deepseek_python +from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs +from glossapi.ocr.deepseek.work_queue import ( + STATUS_DONE, + STATUS_FAILED, + init_work_db, + iter_work_items, + requeue_worker_batches, + work_queue_counts, +) + try: import pypdfium2 as _pypdfium2 except Exception: # pragma: no cover - optional dependency _pypdfium2 = None LOGGER = logging.getLogger(__name__) +REPO_ROOT = Path(__file__).resolve().parents[4] +DEFAULT_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py" +DEFAULT_VLLM_SCRIPT = REPO_ROOT / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_vllm.py" +AUTO_VLLM_BATCH_PAGE_CAP = 160 +DEFAULT_MAX_NEW_TOKENS = 2048 +DEFAULT_WORKER_RESPAWN_CAP = 3 +DEFAULT_WORK_ITEM_MAX_ATTEMPTS = 2 +DEFAULT_WORK_STALE_AFTER_SEC = 900.0 +DEFAULT_WORK_HEARTBEAT_SEC = 10.0 +DEFAULT_TELEMETRY_INTERVAL_SEC = 15.0 +SHARD_STEM_RE = re.compile(r"^(?P.+)__p(?P\d{5})-(?P\d{5})$") +REASSEMBLED_CONFIG_KEYS = ( + "ocr_profile", + "attn_backend", + "runtime_backend", + "base_size", + "image_size", + "crop_mode", + "render_dpi", + "max_new_tokens", + "batch_size", + 
"gpu_memory_utilization", + "disable_fp8_kv", + "repair_mode", +) def _page_count(pdf_path: Path) -> int: @@ -28,17 +78,42 @@ def _page_count(pdf_path: Path) -> int: return 0 -def _run_cli( +def _build_cli_command( input_dir: Path, output_dir: Path, *, + files: List[str], + page_ranges: Optional[List[str]], + model_dir: Path, python_bin: Optional[Path], script: Path, max_pages: Optional[int], content_debug: bool, - gpu_memory_utilization: Optional[float] = None, - disable_fp8_kv: bool = False, -) -> None: + device: Optional[str], + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, + work_db: Optional[Path] = None, + worker_id: Optional[str] = None, + worker_runtime_file: Optional[Path] = None, + work_stale_after_sec: Optional[float] = None, + work_heartbeat_sec: Optional[float] = None, + work_max_attempts: Optional[int] = None, +) -> List[str]: python_exe = Path(python_bin) if python_bin else Path(sys.executable) cmd: List[str] = [ str(python_exe), @@ -47,151 +122,1425 @@ def _run_cli( str(input_dir), "--output-dir", str(output_dir), + "--model-dir", + str(model_dir), ] + if files: + cmd += ["--files", *files] + if page_ranges: + cmd += ["--page-ranges", *page_ranges] if max_pages is not None: cmd += ["--max-pages", str(max_pages)] if content_debug: cmd.append("--content-debug") - if gpu_memory_utilization is not None: - cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)] - if disable_fp8_kv: - cmd.append("--no-fp8-kv") + if device: + cmd += 
["--device", str(device)] + if ocr_profile: + cmd += ["--ocr-profile", str(ocr_profile)] + if prompt_override: + cmd += ["--prompt-override", str(prompt_override)] + if attn_backend: + cmd += ["--attn-backend", str(attn_backend)] + if base_size is not None: + cmd += ["--base-size", str(int(base_size))] + if image_size is not None: + cmd += ["--image-size", str(int(image_size))] + if crop_mode is True: + cmd.append("--crop-mode") + elif crop_mode is False: + cmd.append("--no-crop-mode") + if render_dpi is not None: + cmd += ["--render-dpi", str(int(render_dpi))] + if max_new_tokens is not None: + cmd += ["--max-new-tokens", str(int(max_new_tokens))] + if work_db is not None: + cmd += ["--work-db", str(work_db)] + if worker_id: + cmd += ["--worker-id", str(worker_id)] + if worker_runtime_file is not None: + cmd += ["--worker-runtime-file", str(worker_runtime_file)] + if work_stale_after_sec is not None: + cmd += ["--work-stale-after-sec", str(float(work_stale_after_sec))] + if work_heartbeat_sec is not None: + cmd += ["--work-heartbeat-sec", str(float(work_heartbeat_sec))] + if work_max_attempts is not None: + cmd += ["--work-max-attempts", str(int(work_max_attempts))] + if repetition_penalty is not None: + cmd += ["--repetition-penalty", str(float(repetition_penalty))] + if no_repeat_ngram_size is not None: + cmd += ["--no-repeat-ngram-size", str(int(no_repeat_ngram_size))] + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + if runtime_backend_norm == "vllm": + if vllm_batch_size is not None: + cmd += ["--batch-size", str(int(vllm_batch_size))] + if gpu_memory_utilization is not None: + cmd += ["--gpu-memory-utilization", str(float(gpu_memory_utilization))] + if disable_fp8_kv: + cmd.append("--disable-fp8-kv") + if repair_mode: + cmd += ["--repair-mode", str(repair_mode)] + if repair_exec_batch_target_pages is not None: + cmd += ["--repair-exec-batch-target-pages", str(int(repair_exec_batch_target_pages))] + if 
repair_exec_batch_target_items is not None: + cmd += ["--repair-exec-batch-target-items", str(int(repair_exec_batch_target_items))] + return cmd + +def _build_env( + *, + python_bin: Optional[Path], + visible_device: Optional[int] = None, + script: Optional[Path] = None, +) -> Dict[str, str]: env = os.environ.copy() + if python_bin: + python_path = Path(python_bin).expanduser() + venv_bin = str(python_path.parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + env["VIRTUAL_ENV"] = str(python_path.parent.parent) + if script is not None: + script_path = Path(script).expanduser().resolve() + src_root = next((parent for parent in script_path.parents if (parent / "glossapi").is_dir()), None) + if src_root is not None: + src_root_str = str(src_root) + existing_pythonpath = str(env.get("PYTHONPATH", "")).strip() + pythonpath_entries = [src_root_str] + if existing_pythonpath: + pythonpath_entries.extend( + entry + for entry in existing_pythonpath.split(os.pathsep) + if entry and entry != src_root_str + ) + env["PYTHONPATH"] = os.pathsep.join(pythonpath_entries) + env.pop("PYTHONHOME", None) + if visible_device is not None: + env["CUDA_VISIBLE_DEVICES"] = str(visible_device) if shutil.which("cc1plus", path=env.get("PATH", "")) is None: - # FlashInfer JIT (via vLLM) needs a C++ toolchain; add a known cc1plus location if missing. for candidate in sorted(Path("/usr/lib/gcc/x86_64-linux-gnu").glob("*/cc1plus")): - env["PATH"] = f"{candidate.parent}:{env.get('PATH','')}" + env["PATH"] = f"{candidate.parent}:{env.get('PATH', '')}" break + ld_entries: List[str] = [] + if python_bin: + # Keep the venv path semantics instead of resolving the interpreter symlink + # back to `/usr/bin/python...`; the wheel-managed CUDA libs live under the + # virtualenv tree, not under the system interpreter location. 
+ venv_root = Path(python_bin).expanduser().parent.parent + for site_packages in sorted((venv_root / "lib").glob("python*/site-packages")): + nvidia_root = site_packages / "nvidia" + if not nvidia_root.is_dir(): + continue + for lib_dir in sorted(nvidia_root.glob("*/lib")): + if lib_dir.is_dir(): + ld_entries.append(str(lib_dir)) ld_path = env.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") if ld_path: - env["LD_LIBRARY_PATH"] = f"{ld_path}:{env.get('LD_LIBRARY_PATH','')}" + ld_entries.extend(entry for entry in str(ld_path).split(os.pathsep) if entry) + existing_ld = str(env.get("LD_LIBRARY_PATH", "")).strip() + if existing_ld: + ld_entries.extend(entry for entry in existing_ld.split(os.pathsep) if entry) + if ld_entries: + deduped: List[str] = [] + seen: Set[str] = set() + for entry in ld_entries: + if entry and entry not in seen: + seen.add(entry) + deduped.append(entry) + env["LD_LIBRARY_PATH"] = os.pathsep.join(deduped) + return env + + +def _run_cli( + input_dir: Path, + output_dir: Path, + *, + files: List[str], + model_dir: Path, + python_bin: Optional[Path], + script: Path, + max_pages: Optional[int], + content_debug: bool, + device: Optional[str], + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], + repair_exec_batch_target_pages: Optional[int], + repair_exec_batch_target_items: Optional[int], + visible_device: Optional[int] = None, +) -> None: + cmd = _build_cli_command( + input_dir=input_dir, + output_dir=output_dir, + files=files, + page_ranges=None, + model_dir=model_dir, + python_bin=python_bin, + script=script, + max_pages=max_pages, + 
content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + ) + env = _build_env(python_bin=python_bin, visible_device=visible_device, script=script) - LOGGER.info("Running DeepSeek CLI: %s", " ".join(cmd)) + LOGGER.info("Running DeepSeek OCR CLI: %s", " ".join(cmd)) subprocess.run(cmd, check=True, env=env) # nosec: controlled arguments -def _run_one_pdf(pdf_path: Path, md_out: Path, metrics_out: Path, cfg: Dict[str, Any]) -> Dict[str, Any]: - """Stub processor for a single PDF.""" - page_count = _page_count(pdf_path) - max_pages = cfg.get("max_pages") - if max_pages is not None and page_count: - page_count = min(page_count, max_pages) +def _parse_device_index(device: Optional[str]) -> Optional[int]: + if not device: + return None + value = str(device).strip().lower() + if value.startswith("cuda:"): + suffix = value.split(":", 1)[1] + if suffix.isdigit(): + return int(suffix) + return None + + +def _detect_visible_gpus() -> List[int]: + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + parsed = [piece.strip() for piece in visible.split(",") if piece.strip()] + if parsed and all(piece.isdigit() for piece in parsed): + return [int(piece) for piece in parsed] + torch_mod = None + try: # pragma: no cover - best effort + import torch as torch_mod # type: ignore + except Exception: # pragma: no cover - optional import + torch_mod = None + if torch_mod 
is not None: + try: + if torch_mod.cuda.is_available(): + return list(range(torch_mod.cuda.device_count())) + except Exception: + pass + try: # pragma: no cover - shell fallback + proc = subprocess.run( + ["nvidia-smi", "-L"], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + devices: List[int] = [] + if proc.returncode == 0: + for line in proc.stdout.splitlines(): + if line.startswith("GPU "): + prefix = line.split(":", 1)[0] + idx = prefix.split()[1] + if idx.isdigit(): + devices.append(int(idx)) + return devices + except Exception: + return [] + + +def _resolve_lane_devices( + *, + use_gpus: Optional[str], + devices: Optional[List[int]], + workers_per_gpu: int, + device: Optional[str], +) -> List[int]: + if devices: + resolved = [int(dev) for dev in devices] + if resolved: + return resolved + if str(use_gpus or "single").strip().lower() == "multi": + resolved = _detect_visible_gpus() + if resolved: + return resolved + if workers_per_gpu > 1: + from_device = _parse_device_index(device) + if from_device is not None: + return [from_device] + visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() + if visible: + first = visible.split(",", 1)[0].strip() + if first.isdigit(): + return [int(first)] + return [0] + return [] + + +def _effective_page_count(pdf_path: Path, max_pages: Optional[int]) -> int: + count = _page_count(pdf_path) + if max_pages is not None and count > 0: + return min(count, int(max_pages)) + return max(1, count) + + +def _source_documents( + *, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], +) -> List[SourceDocument]: + documents: List[SourceDocument] = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + documents.append( + SourceDocument( + name=str(name), + pages=int(_effective_page_count(pdf_path, max_pages)), + ) + ) + return documents + + +def _plan_lanes( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: 
Optional[int], +) -> List[Dict[str, Any]]: + lanes: List[Dict[str, Any]] = [] + lane_id = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + lanes.append( + { + "lane_id": lane_id, + "visible_device": int(visible_device), + "files": [], + "weight": 0, + } + ) + lane_id += 1 + if not lanes: + return [] + + weighted_files = [] + for name in file_list: + pdf_path = (input_root / name).resolve() + weighted_files.append((name, _effective_page_count(pdf_path, max_pages))) + weighted_files.sort(key=lambda item: (-item[1], item[0])) + + for name, weight in weighted_files: + lane = min(lanes, key=lambda item: int(item["weight"])) + lane["files"].append(name) + lane["weight"] = int(lane["weight"]) + int(weight) + return lanes + + +def _resolve_scheduler( + *, + scheduler: Optional[str], + runtime_backend: str, + lane_devices: List[int], + workers_per_gpu: int, +) -> str: + scheduler_norm = str(scheduler or "auto").strip().lower() + if scheduler_norm not in {"auto", "whole_doc", "fixed_shard", "exact_fill"}: + raise ValueError("scheduler must be one of 'auto', 'whole_doc', 'fixed_shard', or 'exact_fill'") + if scheduler_norm != "auto": + return scheduler_norm + runtime_backend_norm = str(runtime_backend or "transformers").strip().lower() + lane_count = max(1, len(lane_devices)) * max(1, int(workers_per_gpu)) + if runtime_backend_norm == "vllm" and lane_count > 1: + return "exact_fill" + return "whole_doc" + + +def _plan_lane_batches( + *, + file_list: List[str], + input_root: Path, + lane_devices: List[int], + workers_per_gpu: int, + max_pages: Optional[int], + runtime_backend: str, + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + documents = _source_documents( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + ) + scheduler_norm = _resolve_scheduler( + scheduler=scheduler, + runtime_backend=runtime_backend, + 
lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches( + documents, + target_batch_pages=max(1, int(target_batch_pages)), + ) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches( + slices, + target_batch_pages=max(1, int(target_batch_pages)), + ) + lanes = assign_batches_to_lanes( + batches, + devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + return [lane.to_dict() for lane in lanes if lane.batches] + + +def _plan_work_batches( + *, + file_list: List[str], + input_root: Path, + max_pages: Optional[int], + runtime_backend: str, + scheduler: Optional[str], + lane_devices: List[int], + workers_per_gpu: int, + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + documents = _source_documents( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + ) + scheduler_norm = _resolve_scheduler( + scheduler=scheduler, + runtime_backend=runtime_backend, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + ) + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches( + documents, + target_batch_pages=max(1, int(target_batch_pages)), + ) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches( + slices, + target_batch_pages=max(1, int(target_batch_pages)), + ) + return [batch.to_dict() for batch in batches if int(batch.pages) > 0] + + +def _auto_vllm_batch_size( + *, + runtime_backend: str, + file_list: List[str], 
+ input_root: Path, + max_pages: Optional[int], +) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + total_pages = 0 + for name in file_list: + pdf_path = (input_root / name).resolve() + total_pages += int(_effective_page_count(pdf_path, max_pages)) + if total_pages <= 0: + return 1 + return min(int(total_pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + +def _auto_vllm_batch_size_for_pages(*, runtime_backend: str, pages: int) -> Optional[int]: + if str(runtime_backend or "").strip().lower() != "vllm": + return None + if int(pages) <= 0: + return 1 + return min(int(pages), int(AUTO_VLLM_BATCH_PAGE_CAP)) + + +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: List[str] = [] + page_ranges: List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + +def _utc_now_iso(now_ts: Optional[float] = None) -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(float(now_ts) if now_ts is not None else time.time())) + + +def _parse_utc_iso(value: Optional[str]) -> Optional[float]: + if not value: + return None + try: + return float(calendar.timegm(time.strptime(str(value), "%Y-%m-%dT%H:%M:%SZ"))) + except Exception: + return None + + +def _run_text_command(cmd: List[str]) -> str: + proc = subprocess.run(cmd, check=True, capture_output=True, text=True) # nosec: controlled args + return str(proc.stdout or "").strip() + + +def _process_group_members(pgid: int) -> List[int]: + proc = subprocess.run(["pgrep", "-g", str(int(pgid))], check=False, capture_output=True, 
text=True) # nosec: controlled args + if int(proc.returncode) not in {0, 1}: + return [] + members: List[int] = [] + for line in str(proc.stdout or "").splitlines(): + line = line.strip() + if line: + try: + members.append(int(line)) + except ValueError: + continue + return members + + +def _wait_for_process_group_exit(pgid: int, *, timeout_sec: float) -> bool: + deadline = time.time() + float(max(0.0, timeout_sec)) + while time.time() <= deadline: + if not _process_group_members(pgid): + return True + time.sleep(0.2) + return not _process_group_members(pgid) + + +def _terminate_worker_process_group(worker: Dict[str, Any]) -> bool: + pgid = int(worker["proc"].pid) + worker_id = str(worker["worker_id"]) + for sig, grace_sec in ((signal.SIGTERM, 5.0), (signal.SIGKILL, 5.0)): + try: + os.killpg(pgid, sig) + except ProcessLookupError: + return True + except Exception as exc: + LOGGER.warning("Failed to signal worker process group %s pgid=%s: %s", worker_id, pgid, exc) + return False + if _wait_for_process_group_exit(pgid, timeout_sec=grace_sec): + return True + LOGGER.warning("Worker process group %s pgid=%s did not exit cleanly", worker_id, pgid) + return False + + +def _launch_worker_process(cmd: List[str], *, fh, env: Dict[str, str]) -> subprocess.Popen: + return subprocess.Popen( + cmd, + stdout=fh, + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) # nosec: controlled args + + +def _parse_csv_table(text: str, columns: List[str]) -> List[Dict[str, str]]: + rows: List[Dict[str, str]] = [] + for raw_line in str(text or "").splitlines(): + line = raw_line.strip() + if not line: + continue + parts = [piece.strip() for piece in line.split(",")] + if len(parts) < len(columns): + parts.extend([""] * (len(columns) - len(parts))) + rows.append({name: str(parts[idx]) for idx, name in enumerate(columns)}) + return rows - md_lines = [ - f"# DeepSeek OCR (stub) — {pdf_path.name}", - "", - f"Pages: {page_count if page_count else 'unknown'}", + +def 
_collect_gpu_snapshot(*, visible_devices: List[int]) -> Dict[str, Any]: + gpu_text = _run_text_command( + [ + "nvidia-smi", + f"--id={','.join(str(device) for device in visible_devices)}", + "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,persistence_mode", + "--format=csv,noheader,nounits", + ] + ) + process_text = _run_text_command( + [ + "nvidia-smi", + "--query-compute-apps=gpu_uuid,pid,process_name,used_memory", + "--format=csv,noheader,nounits", + ] + ) + return { + "captured_at": _utc_now_iso(), + "gpus": _parse_csv_table( + gpu_text, + [ + "index", + "name", + "utilization_gpu", + "memory_used_mib", + "memory_total_mib", + "temperature_c", + "power_draw_w", + "persistence_mode", + ], + ), + "processes": _parse_csv_table( + process_text, + [ + "gpu_uuid", + "pid", + "process_name", + "used_memory_mib", + ], + ), + } + + +def _read_worker_runtime(runtime_path: Path) -> Dict[str, Any]: + try: + return json.loads(Path(runtime_path).read_text(encoding="utf-8")) + except Exception: + return {} + + +def _write_runtime_summary(*, runtime_dir: Path, db_path: Path) -> Path: + runtime_dir.mkdir(parents=True, exist_ok=True) + workers = [] + first_batch_started = [] + last_batch_finished = [] + engine_ready = [] + for path in sorted(runtime_dir.glob("worker_*.runtime.json")): + data = _read_worker_runtime(path) + workers.append(data) + first_batch_started_ts = _parse_utc_iso(data.get("first_batch_started_at")) + last_batch_finished_ts = _parse_utc_iso(data.get("last_batch_finished_at")) + engine_ready_ts = _parse_utc_iso(data.get("engine_ready_at")) + if first_batch_started_ts is not None: + first_batch_started.append(first_batch_started_ts) + if last_batch_finished_ts is not None: + last_batch_finished.append(last_batch_finished_ts) + if engine_ready_ts is not None: + engine_ready.append(engine_ready_ts) + steady_summary = { + "first_batch_started_at": _utc_now_iso(min(first_batch_started)) if first_batch_started else None, + 
"last_batch_finished_at": _utc_now_iso(max(last_batch_finished)) if last_batch_finished else None, + "all_workers_ready_at": _utc_now_iso(max(engine_ready)) if engine_ready else None, + "first_batch_to_last_batch_window_sec": ( + float(max(last_batch_finished) - min(first_batch_started)) + if first_batch_started and last_batch_finished + else None + ), + "all_workers_ready_to_last_batch_window_sec": ( + float(max(last_batch_finished) - max(engine_ready)) + if engine_ready and last_batch_finished + else None + ), + } + summary_path = runtime_dir / "runtime_summary.json" + summary_path.write_text( + json.dumps( + { + "generated_at": _utc_now_iso(), + "queue_counts": work_queue_counts(db_path), + "work_items": list(iter_work_items(db_path)), + "workers": workers, + "steady_state": steady_summary, + }, + indent=2, + ), + encoding="utf-8", + ) + return summary_path + + +def _query_persistence_mode(*, visible_devices: List[int]) -> List[Dict[str, str]]: + raw = _run_text_command( + [ + "nvidia-smi", + f"--id={','.join(str(device) for device in visible_devices)}", + "--query-gpu=index,persistence_mode", + "--format=csv,noheader,nounits", + ] + ) + return _parse_csv_table(raw, ["index", "persistence_mode"]) + + +def _ensure_gpu_preflight(*, visible_devices: List[int], mode: str) -> Dict[str, Any]: + mode_norm = str(mode or "warn").strip().lower() + status = { + "mode": mode_norm, + "checked_at": _utc_now_iso(), + "before": _query_persistence_mode(visible_devices=visible_devices), + "changed": False, + } + disabled = [item for item in status["before"] if str(item.get("persistence_mode", "")).lower() != "enabled"] + if not disabled or mode_norm == "off": + status["after"] = list(status["before"]) + return status + if mode_norm == "ensure": + try: + subprocess.run(["sudo", "-n", "nvidia-smi", "-pm", "1"], check=True, capture_output=True, text=True) # nosec: controlled args + status["changed"] = True + except Exception as exc: + status["ensure_error"] = str(exc) + 
status["after"] = _query_persistence_mode(visible_devices=visible_devices) + return status + + +def _collect_xid_faults(*, start_utc_iso: str) -> Dict[str, Any]: + cmd = [ + "journalctl", + "-k", + "--since", + str(start_utc_iso), + "--no-pager", ] - if cfg.get("content_debug"): - md_lines.append("") - md_lines.append("") - md_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("\n".join(md_lines) + "\n", encoding="utf-8") + try: + output = _run_text_command(cmd) + except Exception as exc: + return { + "supported": False, + "error": str(exc), + "faults": [], + } + faults = [line for line in output.splitlines() if "NVRM: Xid" in line] + return { + "supported": True, + "faults": faults, + } + + +def _start_gpu_telemetry( + *, + telemetry_path: Path, + visible_devices: List[int], + interval_sec: float, + stop_event: threading.Event, +) -> threading.Thread: + telemetry_path.parent.mkdir(parents=True, exist_ok=True) + + def _loop() -> None: + while not stop_event.wait(float(max(1.0, interval_sec))): + try: + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + except Exception as exc: # pragma: no cover - best effort logging + LOGGER.warning("GPU telemetry sample failed: %s", exc) + + thread = threading.Thread(target=_loop, name="deepseek-gpu-telemetry", daemon=True) + thread.start() + return thread + + +def _parse_shard_stem(stem: str) -> Optional[Dict[str, Any]]: + match = SHARD_STEM_RE.match(str(stem)) + if match is None: + return None + return { + "source_stem": str(match.group("source_stem")), + "start_page": int(match.group("start")), + "end_page": int(match.group("end")), + } + + +def _split_markdown_pages(markdown_text: str, *, expected_pages: int) -> List[str]: + pages = _split_page_outputs(markdown_text) + if len(pages) < int(expected_pages): + pages.extend([""] * (int(expected_pages) - len(pages))) + elif len(pages) > 
int(expected_pages): + pages = pages[: int(expected_pages)] + return pages + + +def _archive_shard_artifact(*, out_root: Path, source_path: Path, relative_path: Path) -> None: + archive_path = out_root / "sidecars" / "ocr_shards" / relative_path + archive_path.parent.mkdir(parents=True, exist_ok=True) + if archive_path.exists(): + archive_path.unlink() + source_path.replace(archive_path) + + +def _reassemble_canonical_output_for_source( + *, + out_root: Path, + pdf_path: Path, + source_name: str, +) -> bool: + md_dir = out_root / "markdown" + metrics_dir = out_root / "json" / "metrics" + source_stem = Path(source_name).stem + canonical_md = md_dir / f"{source_stem}.md" + canonical_metrics = metrics_dir / f"{source_stem}.metrics.json" + if canonical_md.exists() and canonical_metrics.exists(): + return True + + shard_records: List[Dict[str, Any]] = [] + for metrics_path in sorted(metrics_dir.glob(f"{source_stem}__p*.metrics.json")): + shard_stem = metrics_path.name.removesuffix(".metrics.json") + shard_md = md_dir / f"{shard_stem}.md" + if not shard_md.exists(): + continue + shard_meta = _parse_shard_stem(shard_stem) + if shard_meta is None: + continue + metrics = json.loads(metrics_path.read_text(encoding="utf-8")) + start_page = int(metrics.get("source_start_page", shard_meta["start_page"])) + end_page = int(metrics.get("source_end_page", shard_meta["end_page"])) + shard_records.append( + { + "stem": shard_stem, + "md_path": shard_md, + "metrics_path": metrics_path, + "metrics": metrics, + "start_page": start_page, + "end_page": end_page, + } + ) + + if not shard_records: + return False + + shard_records.sort(key=lambda item: (int(item["start_page"]), int(item["end_page"]), str(item["stem"]))) + page_count = max(int(_page_count(pdf_path)), max(int(item["end_page"]) for item in shard_records)) + merged_pages = [""] * int(page_count) + merged_page_metrics: List[Optional[Dict[str, Any]]] = [None] * int(page_count) + merged_extra_metrics: Dict[str, Any] = {} + 
repair_totals: Dict[str, int] = {} + render_sec_total = 0.0 + infer_sec_total = 0.0 + wall_time_sec_total = 0.0 + reassembled_ranges: List[Dict[str, int]] = [] + + for shard in shard_records: + metrics = dict(shard["metrics"]) + start_page = int(shard["start_page"]) + end_page = int(shard["end_page"]) + expected_pages = max(0, end_page - start_page + 1) + reassembled_ranges.append({"start_page": start_page, "end_page": end_page}) + + shard_pages = _split_markdown_pages( + shard["md_path"].read_text(encoding="utf-8"), + expected_pages=expected_pages, + ) + for offset, page_text in enumerate(shard_pages): + merged_pages[start_page - 1 + offset] = page_text + + for idx, page_metric in enumerate(list(metrics.get("page_metrics") or []), start=1): + absolute_page = start_page + int(page_metric.get("page_number", idx)) - 1 + if absolute_page <= 0 or absolute_page > int(page_count): + continue + merged_metric = dict(page_metric) + merged_metric["page_number"] = int(absolute_page) + merged_page_metrics[absolute_page - 1] = merged_metric + + render_sec_total += float(metrics.get("render_sec", 0.0)) + infer_sec_total += float(metrics.get("infer_sec_total", 0.0)) + wall_time_sec_total += float(metrics.get("wall_time_sec", 0.0)) + for key, value in dict(metrics.get("repair_summary") or {}).items(): + if key == "repair_mode": + continue + repair_totals[key] = int(repair_totals.get(key, 0)) + int(value) + for key in REASSEMBLED_CONFIG_KEYS: + if key in metrics and key not in merged_extra_metrics: + merged_extra_metrics[key] = metrics[key] + + merged_extra_metrics.update( + { + "source_file": str(source_name), + "source_stem": str(source_stem), + "source_start_page": 1, + "source_end_page": int(page_count), + "reassembled_from_shards": True, + "reassembled_shard_count": len(shard_records), + "reassembled_source_ranges": reassembled_ranges, + "render_sec": float(render_sec_total), + "infer_sec_total": float(infer_sec_total), + "wall_time_sec": float(wall_time_sec_total), + 
"wall_time_sec_semantics": "sum_of_shard_wall_times", + "page_metrics": [item for item in merged_page_metrics if item is not None], + } + ) + if repair_totals: + merged_extra_metrics["repair_summary"] = { + "repair_mode": str(merged_extra_metrics.get("repair_mode", "unknown")), + **{key: int(value) for key, value in repair_totals.items()}, + } + + merged_markdown = _join_page_outputs(merged_pages) if merged_pages else "[[Blank page]]" + _write_outputs( + output_dir=out_root, + stem=source_stem, + markdown=merged_markdown, + page_count=int(page_count), + extra_metrics=merged_extra_metrics, + ) + for shard in shard_records: + _archive_shard_artifact( + out_root=out_root, + source_path=Path(shard["md_path"]), + relative_path=Path("markdown") / Path(shard["md_path"]).name, + ) + _archive_shard_artifact( + out_root=out_root, + source_path=Path(shard["metrics_path"]), + relative_path=Path("json") / "metrics" / Path(shard["metrics_path"]).name, + ) + return True + + +def _ensure_canonical_outputs(*, out_root: Path, pdf_root: Path, file_list: List[str]) -> None: + for name in file_list: + pdf_path = (pdf_root / name).resolve() + if _reassemble_canonical_output_for_source( + out_root=out_root, + pdf_path=pdf_path, + source_name=name, + ): + continue + + + +def _run_multi_cli( + *, + input_root: Path, + out_root: Path, + file_list: List[str], + lane_devices: List[int], + workers_per_gpu: int, + model_root: Path, + python_exe: Path, + script_path: Path, + max_pages: Optional[int], + content_debug: bool, + log_dir: Path, + ocr_profile: str, + prompt_override: Optional[str], + attn_backend: str, + base_size: Optional[int], + image_size: Optional[int], + crop_mode: Optional[bool], + render_dpi: Optional[int], + max_new_tokens: Optional[int], + repetition_penalty: Optional[float], + no_repeat_ngram_size: Optional[int], + runtime_backend: str, + vllm_batch_size: Optional[int], + gpu_memory_utilization: Optional[float], + disable_fp8_kv: bool, + repair_mode: Optional[str], + 
repair_exec_batch_target_pages: Optional[int], + repair_exec_batch_target_items: Optional[int], + scheduler: Optional[str], + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> None: + if str(runtime_backend or "").strip().lower() == "vllm": + batches = _plan_work_batches( + file_list=file_list, + input_root=input_root, + max_pages=max_pages, + runtime_backend=runtime_backend, + scheduler=scheduler, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + ) + if not batches: + return + + log_dir.mkdir(parents=True, exist_ok=True) + runtime_dir = out_root / "sidecars" / "ocr_runtime" + runtime_dir.mkdir(parents=True, exist_ok=True) + work_db = runtime_dir / "work_queue.sqlite" + init_work_db(work_db, batches=batches, replace=True) + + visible_devices = sorted({int(device) for device in lane_devices}) + preflight_mode = str(os.environ.get("GLOSSAPI_DEEPSEEK_GPU_PREFLIGHT", "ensure")).strip().lower() + preflight = _ensure_gpu_preflight(visible_devices=visible_devices, mode=preflight_mode) + (runtime_dir / "gpu_preflight.json").write_text(json.dumps(preflight, indent=2), encoding="utf-8") + + telemetry_path = runtime_dir / "gpu_telemetry.jsonl" + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "preflight", **preflight}) + "\n") + fh.write(json.dumps({"kind": "initial_sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + + telemetry_stop = threading.Event() + telemetry_thread = _start_gpu_telemetry( + telemetry_path=telemetry_path, + visible_devices=visible_devices, + interval_sec=float(os.environ.get("GLOSSAPI_DEEPSEEK_TELEMETRY_INTERVAL_SEC", DEFAULT_TELEMETRY_INTERVAL_SEC)), + stop_event=telemetry_stop, + ) + stale_after_sec = float(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_STALE_AFTER_SEC", DEFAULT_WORK_STALE_AFTER_SEC)) + heartbeat_sec = 
float(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_HEARTBEAT_SEC", DEFAULT_WORK_HEARTBEAT_SEC)) + respawn_cap = int(os.environ.get("GLOSSAPI_DEEPSEEK_WORKER_RESPAWN_CAP", DEFAULT_WORKER_RESPAWN_CAP)) + work_max_attempts = int( + max(1, int(os.environ.get("GLOSSAPI_DEEPSEEK_WORK_ITEM_MAX_ATTEMPTS", DEFAULT_WORK_ITEM_MAX_ATTEMPTS))) + ) + xid_start = _utc_now_iso() + + def _start_worker(*, worker_id: str, visible_device: int, respawns: int) -> Dict[str, Any]: + log_path = log_dir / f"{worker_id}.r{int(respawns)}.log" + fh = log_path.open("w", encoding="utf-8") + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size_for_pages( + runtime_backend=runtime_backend, + pages=int(target_batch_pages), + ) + ) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=[], + page_ranges=None, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + work_db=work_db, + worker_id=worker_id, + worker_runtime_file=runtime_dir / f"{worker_id}.runtime.json", + work_stale_after_sec=stale_after_sec, + work_heartbeat_sec=heartbeat_sec, + work_max_attempts=work_max_attempts, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device, script=script_path) + LOGGER.info( + "Running DeepSeek OCR 
worker=%s visible_gpu=%s batches=%d: %s", + worker_id, + visible_device, + len(batches), + " ".join(cmd), + ) + proc = _launch_worker_process(cmd, fh=fh, env=env) + return { + "worker_id": worker_id, + "visible_device": int(visible_device), + "proc": proc, + "fh": fh, + "log_path": log_path, + "respawns": int(respawns), + } + + active_workers: List[Dict[str, Any]] = [] + worker_index = 0 + for visible_device in lane_devices: + for _ in range(max(1, int(workers_per_gpu))): + worker_id = f"worker_{worker_index:02d}_gpu{int(visible_device)}" + active_workers.append(_start_worker(worker_id=worker_id, visible_device=int(visible_device), respawns=0)) + worker_index += 1 + + failures: List[str] = [] + try: + while active_workers: + time.sleep(0.5) + for worker in list(active_workers): + rc = worker["proc"].poll() + if rc is None: + continue + worker["fh"].close() + active_workers.remove(worker) + if int(rc) == 0: + continue + error_message = f"{worker['worker_id']} rc={int(rc)} log={worker['log_path']}" + LOGGER.warning("DeepSeek OCR worker failed: %s", error_message) + _terminate_worker_process_group(worker) + requeue_worker_batches( + work_db, + worker_id=str(worker["worker_id"]), + error=error_message, + max_attempts=work_max_attempts, + ) + counts = work_queue_counts(work_db) + # Only respawn while there is retryable work left in the + # durable queue; terminally failed items should stop the run. 
+ remaining_work = int(counts.get("pending", 0)) + int(counts.get("running", 0)) + if remaining_work > 0 and int(worker["respawns"]) < respawn_cap: + active_workers.append( + _start_worker( + worker_id=str(worker["worker_id"]), + visible_device=int(worker["visible_device"]), + respawns=int(worker["respawns"]) + 1, + ) + ) + continue + failures.append(error_message) + counts = work_queue_counts(work_db) + if int(counts.get(STATUS_FAILED, 0)) > 0 or int(counts.get(STATUS_DONE, 0)) < int(counts.get("total", 0)): + failures.append(f"incomplete_work queue_counts={counts}") + finally: + for worker in list(active_workers): + _terminate_worker_process_group(worker) + try: + worker["proc"].wait(timeout=5) + except Exception: + pass + worker["fh"].close() + telemetry_stop.set() + telemetry_thread.join(timeout=max(1.0, DEFAULT_TELEMETRY_INTERVAL_SEC)) + with telemetry_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps({"kind": "final_sample", **_collect_gpu_snapshot(visible_devices=visible_devices)}) + "\n") + fh.write(json.dumps({"kind": "xid_faults", **_collect_xid_faults(start_utc_iso=xid_start)}) + "\n") + _write_runtime_summary(runtime_dir=runtime_dir, db_path=work_db) + + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) + return + + lanes = _plan_lane_batches( + file_list=file_list, + input_root=input_root, + lane_devices=lane_devices, + workers_per_gpu=workers_per_gpu, + max_pages=max_pages, + runtime_backend=runtime_backend, + scheduler=scheduler, + target_batch_pages=target_batch_pages, + shard_pages=shard_pages, + shard_threshold_pages=shard_threshold_pages, + ) + if not lanes: + return + + log_dir.mkdir(parents=True, exist_ok=True) + failures: List[str] = [] + with ExitStack() as stack: + procs = [] + + for lane in lanes: + lane_id = int(lane["lane_id"]) + visible_device = int(lane["visible_device"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = 
list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) + if pages <= 0: + continue + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size_for_pages( + runtime_backend=runtime_backend, + pages=min(int(target_batch_pages), int(pages)), + ) + ) + log_path = log_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" + fh = stack.enter_context(log_path.open("w", encoding="utf-8")) + cmd = _build_cli_command( + input_dir=input_root, + output_dir=out_root, + files=files, + page_ranges=page_ranges, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device="cuda", + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + ) + env = _build_env(python_bin=python_exe, visible_device=visible_device, script=script_path) + LOGGER.info( + "Running DeepSeek OCR lane=%s visible_gpu=%s pages=%s planned_batches=%s files=%d ranges=%d: %s", + lane_id, + visible_device, + pages, + lane_plan["planned_batch_count"], + len(files), + len(page_ranges), + " ".join(cmd), + ) + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + procs.append((lane, log_path, proc)) - metrics = {"page_count": page_count} - metrics_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.write_text(json.dumps(metrics, indent=2), encoding="utf-8") - return metrics + 
for lane, log_path, proc in procs: + rc = proc.wait() + if rc != 0: + failures.append( + f"lane={lane['lane_id']} gpu={lane['visible_device']} rc={rc} log={log_path}" + ) + if failures: + raise RuntimeError("DeepSeek OCR multi-worker failure(s): " + "; ".join(failures)) def run_for_files( self_ref: Any, files: Iterable[str], *, - model_dir: Optional[Path] = None, # kept for API compatibility + model_dir: Optional[Path] = None, output_dir: Optional[Path] = None, - log_dir: Optional[Path] = None, # unused placeholder to mirror rapidocr + log_dir: Optional[Path] = None, # kept for API compatibility max_pages: Optional[int] = None, - allow_stub: bool = True, - allow_cli: bool = False, + allow_stub: bool = False, # ignored after stub removal; kept for compatibility + allow_cli: bool = True, # ignored after stub removal; kept for compatibility python_bin: Optional[Path] = None, vllm_script: Optional[Path] = None, content_debug: bool = False, persist_engine: bool = True, # placeholder for future session reuse precision: Optional[str] = None, # reserved - device: Optional[str] = None, # reserved + device: Optional[str] = None, + runtime_backend: str = "transformers", + ocr_profile: str = "markdown_grounded", + prompt_override: Optional[str] = None, + attn_backend: str = "auto", + base_size: Optional[int] = None, + image_size: Optional[int] = None, + crop_mode: Optional[bool] = None, + render_dpi: Optional[int] = None, + max_new_tokens: Optional[int] = DEFAULT_MAX_NEW_TOKENS, + repetition_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + use_gpus: Optional[str] = None, + devices: Optional[List[int]] = None, + workers_per_gpu: int = 1, gpu_memory_utilization: Optional[float] = None, disable_fp8_kv: bool = False, + vllm_batch_size: Optional[int] = None, + repair_mode: str = "auto", + repair_exec_batch_target_pages: Optional[int] = None, + repair_exec_batch_target_items: Optional[int] = None, + scheduler: str = "auto", + target_batch_pages: int = 
AUTO_VLLM_BATCH_PAGE_CAP, + shard_pages: int = 0, + shard_threshold_pages: int = 0, **_: Any, ) -> Dict[str, Any]: - """Run DeepSeek OCR for the provided files. + """Run DeepSeek OCR for the provided files.""" + + requested_stub = bool(allow_stub) + del allow_stub, allow_cli, persist_engine, precision + if requested_stub or os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") == "1": + raise RuntimeError( + "DeepSeek stub execution has been removed. " + "Unset GLOSSAPI_DEEPSEEK_ALLOW_STUB and configure the real DeepSeek runtime." + ) - Returns a mapping of stem -> minimal metadata (page_count). - """ + runtime_backend_norm = str( + runtime_backend or os.environ.get("GLOSSAPI_DEEPSEEK_RUNTIME_BACKEND", "transformers") + ).strip().lower() + if runtime_backend_norm not in {"transformers", "vllm"}: + raise ValueError("runtime_backend must be 'transformers' or 'vllm'") file_list = [str(f) for f in files or []] if not file_list: return {} input_root = Path(getattr(self_ref, "input_dir", ".")).resolve() + pdf_root = (input_root / "downloads") if (input_root / "downloads").exists() else input_root out_root = Path(output_dir) if output_dir else Path(getattr(self_ref, "output_dir", input_root)) md_dir = out_root / "markdown" metrics_dir = out_root / "json" / "metrics" md_dir.mkdir(parents=True, exist_ok=True) metrics_dir.mkdir(parents=True, exist_ok=True) - env_allow_stub = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") == "1" - env_allow_cli = os.environ.get("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "0") == "1" + model_root = Path( + model_dir + or os.environ.get("GLOSSAPI_DEEPSEEK_MODEL_DIR", "") + or (REPO_ROOT / "deepseek-ocr-2-model" / "DeepSeek-OCR-2") + ) + if not model_root.exists(): + raise FileNotFoundError( + "DeepSeek model directory not found. Set model_dir or GLOSSAPI_DEEPSEEK_MODEL_DIR." 
+ ) - use_cli = allow_cli or env_allow_cli - use_stub = allow_stub and env_allow_stub + default_script = DEFAULT_VLLM_SCRIPT if runtime_backend_norm == "vllm" else DEFAULT_SCRIPT + script_path = Path( + vllm_script + or os.environ.get("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", "") + or default_script + ) + if not script_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR runner script not found: {script_path}") - script_path = Path(vllm_script) if vllm_script else Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py" - # Optional GPU memory utilization override (env wins over kwarg) - env_gpu_mem = os.environ.get("GLOSSAPI_DEEPSEEK_GPU_MEMORY_UTILIZATION") - gpu_mem_fraction = gpu_memory_utilization - if env_gpu_mem: - try: - gpu_mem_fraction = float(env_gpu_mem) - except Exception: - gpu_mem_fraction = gpu_memory_utilization - disable_fp8_kv = disable_fp8_kv or os.environ.get("GLOSSAPI_DEEPSEEK_NO_FP8_KV") == "1" + python_exe = resolve_deepseek_python(explicit_python=python_bin) + if not python_exe.exists(): + raise FileNotFoundError(f"DeepSeek Python interpreter not found: {python_exe}") - if use_cli and script_path.exists(): - try: - _run_cli( - input_root, - out_root, - python_bin=python_bin, - script=script_path, + lane_devices = _resolve_lane_devices( + use_gpus=use_gpus, + devices=devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + device=device, + ) + multi_requested = str(use_gpus or "single").strip().lower() == "multi" or int(max(1, workers_per_gpu)) > 1 + if multi_requested and lane_devices: + _run_multi_cli( + input_root=pdf_root, + out_root=out_root, + file_list=file_list, + lane_devices=lane_devices, + workers_per_gpu=int(max(1, workers_per_gpu)), + model_root=model_root, + python_exe=python_exe, + script_path=script_path, + max_pages=max_pages, + content_debug=content_debug, + log_dir=Path(log_dir) if log_dir else (out_root / "logs" / "deepseek_workers"), + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + 
base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + scheduler=scheduler, + target_batch_pages=int(max(1, target_batch_pages)), + shard_pages=int(max(0, shard_pages)), + shard_threshold_pages=int(max(0, shard_threshold_pages)), + ) + else: + resolved_vllm_batch_size = ( + int(vllm_batch_size) + if vllm_batch_size is not None + else _auto_vllm_batch_size( + runtime_backend=runtime_backend_norm, + file_list=file_list, + input_root=pdf_root, max_pages=max_pages, - content_debug=content_debug, - gpu_memory_utilization=gpu_mem_fraction, - disable_fp8_kv=disable_fp8_kv, ) - results: Dict[str, Any] = {} - for name in file_list: - pdf_path = (input_root / name).resolve() - stem = Path(name).stem - md_path = md_dir / f"{stem}.md" - metrics_path = metrics_dir / f"{stem}.metrics.json" - if not md_path.exists() or not md_path.read_text(encoding="utf-8").strip(): - placeholder = [ - f"# DeepSeek OCR — {pdf_path.name}", - "", - "[[Blank page]]", - ] - md_path.parent.mkdir(parents=True, exist_ok=True) - md_path.write_text("\n".join(placeholder) + "\n", encoding="utf-8") - page_count = _page_count(pdf_path) - if not metrics_path.exists(): - metrics_path.parent.mkdir(parents=True, exist_ok=True) - metrics_path.write_text(json.dumps({"page_count": page_count}, indent=2), encoding="utf-8") - results[stem] = {"page_count": page_count} - return results - except Exception as exc: - if not use_stub: - raise - LOGGER.warning("DeepSeek CLI failed (%s); falling back to stub output", exc) + ) + _run_cli( + 
input_dir=pdf_root, + output_dir=out_root, + files=file_list, + page_ranges=None, + model_dir=model_root, + python_bin=python_exe, + script=script_path, + max_pages=max_pages, + content_debug=content_debug, + device=device, + ocr_profile=ocr_profile, + prompt_override=prompt_override, + attn_backend=attn_backend, + base_size=base_size, + image_size=image_size, + crop_mode=crop_mode, + render_dpi=render_dpi, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + runtime_backend=runtime_backend_norm, + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=gpu_memory_utilization, + disable_fp8_kv=disable_fp8_kv, + repair_mode=repair_mode, + repair_exec_batch_target_pages=repair_exec_batch_target_pages, + repair_exec_batch_target_items=repair_exec_batch_target_items, + ) + + _ensure_canonical_outputs(out_root=out_root, pdf_root=pdf_root, file_list=file_list) - cfg = {"max_pages": max_pages, "content_debug": content_debug} results: Dict[str, Any] = {} for name in file_list: - pdf_path = (input_root / name).resolve() + pdf_path = (pdf_root / name).resolve() stem = Path(name).stem md_path = md_dir / f"{stem}.md" metrics_path = metrics_dir / f"{stem}.metrics.json" - results[stem] = _run_one_pdf(pdf_path, md_path, metrics_path, cfg) + if not md_path.exists(): + raise FileNotFoundError(f"DeepSeek OCR did not produce markdown for {name}: {md_path}") + if not md_path.read_text(encoding="utf-8").strip(): + raise RuntimeError(f"DeepSeek OCR produced empty markdown for {name}: {md_path}") + page_count = _page_count(pdf_path) + if metrics_path.exists(): + try: + results[stem] = json.loads(metrics_path.read_text(encoding="utf-8")) + continue + except Exception: + pass + results[stem] = {"page_count": page_count} + metrics_path.write_text(json.dumps(results[stem], indent=2), encoding="utf-8") return results diff --git a/src/glossapi/ocr/deepseek/runtime_paths.py 
b/src/glossapi/ocr/deepseek/runtime_paths.py new file mode 100644 index 0000000..a442010 --- /dev/null +++ b/src/glossapi/ocr/deepseek/runtime_paths.py @@ -0,0 +1,91 @@ +"""Resolve DeepSeek runtime paths for split-runtime GlossAPI installs.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Dict, Iterable, List, Optional + +REPO_ROOT = Path(__file__).resolve().parents[4] + + +def _runtime_sort_key(candidate: Path) -> tuple[int, int, str]: + name = candidate.parent.parent.name + if name == "deepseek": + return (1, 0, name) + if name.startswith("deepseek"): + suffix = name[len("deepseek") :] + if suffix.isdigit(): + return (0, -int(suffix), name) + return (2, 0, name) + + +def _candidate_deepseek_pythons( + *, + explicit_python: Optional[Path | str] = None, + env: Optional[Dict[str, str]] = None, + repo_root: Optional[Path] = None, +) -> List[Path]: + resolved_env = dict(env or os.environ) + root = Path(repo_root) if repo_root is not None else REPO_ROOT + + candidates: List[Path] = [] + + def _append(candidate: Optional[Path | str]) -> None: + if not candidate: + return + path = Path(candidate).expanduser() + if path not in candidates: + candidates.append(path) + + _append(explicit_python) + _append(resolved_env.get("GLOSSAPI_DEEPSEEK_PYTHON")) + _append(resolved_env.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON")) + + venv_root = root / "dependency_setup" / ".venvs" + if venv_root.exists(): + for candidate in sorted(venv_root.glob("deepseek*/bin/python"), key=_runtime_sort_key): + _append(candidate) + + _append(sys.executable) + return candidates + + +def resolve_deepseek_python( + *, + explicit_python: Optional[Path | str] = None, + env: Optional[Dict[str, str]] = None, + repo_root: Optional[Path] = None, +) -> Path: + """Return the best available DeepSeek Python interpreter path. + + Preference order: + 1. explicit function argument + 2. explicit environment override + 3. validated repo-local DeepSeek venv(s) + 4. 
current process interpreter + """ + + resolved_env = dict(env or os.environ) + explicit_candidate = Path(explicit_python).expanduser() if explicit_python else None + if explicit_candidate is not None: + return explicit_candidate + + for key in ("GLOSSAPI_DEEPSEEK_PYTHON", "GLOSSAPI_DEEPSEEK_TEST_PYTHON"): + raw = resolved_env.get(key) + if raw: + return Path(raw).expanduser() + + candidates = _candidate_deepseek_pythons( + explicit_python=None, + env={}, + repo_root=repo_root, + ) + for candidate in candidates: + if candidate.exists(): + return candidate + return candidates[0] + + +__all__ = ["resolve_deepseek_python"] diff --git a/src/glossapi/ocr/deepseek/scheduling.py b/src/glossapi/ocr/deepseek/scheduling.py new file mode 100644 index 0000000..339b3e6 --- /dev/null +++ b/src/glossapi/ocr/deepseek/scheduling.py @@ -0,0 +1,242 @@ +"""Scheduling helpers for DeepSeek OCR page-range planning. + +The core abstraction is a divisible PDF page stream. We can cut a document into +page ranges exactly where a batch boundary needs it, then reconstruct outputs +later by `(doc_id, page_number)`. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +import heapq +from typing import Iterable, List, Optional + + +@dataclass(frozen=True) +class SourceDocument: + name: str + pages: int + + +@dataclass(frozen=True) +class WorkSlice: + source_name: str + source_pages: int + start_page: int + end_page: int + + @property + def pages(self) -> int: + return int(self.end_page) - int(self.start_page) + 1 + + @property + def is_full_document(self) -> bool: + return int(self.start_page) == 1 and int(self.end_page) == int(self.source_pages) + + @property + def item_id(self) -> str: + if self.is_full_document: + return str(self.source_name) + return f"{self.source_name}:{int(self.start_page)}:{int(self.end_page)}" + + @property + def cli_file(self) -> Optional[str]: + return str(self.source_name) if self.is_full_document else None + + @property + def cli_page_range(self) -> Optional[str]: + if self.is_full_document: + return None + return self.item_id + + def to_dict(self) -> dict: + return { + "item_id": self.item_id, + "pages": int(self.pages), + "file": self.cli_file, + "page_range": self.cli_page_range, + "source_name": str(self.source_name), + "start_page": int(self.start_page), + "end_page": int(self.end_page), + "is_full_document": bool(self.is_full_document), + } + + +@dataclass +class DocumentCursor: + name: str + total_pages: int + next_page: int = 1 + + @property + def remaining_pages(self) -> int: + return max(0, int(self.total_pages) - int(self.next_page) + 1) + + def take(self, requested_pages: int) -> WorkSlice: + take_pages = min(max(1, int(requested_pages)), int(self.remaining_pages)) + start_page = int(self.next_page) + end_page = start_page + take_pages - 1 + self.next_page = end_page + 1 + return WorkSlice( + source_name=str(self.name), + source_pages=int(self.total_pages), + start_page=int(start_page), + end_page=int(end_page), + ) + + +@dataclass +class BatchPlan: + batch_id: int + items: List[WorkSlice] = 
field(default_factory=list) + + @property + def pages(self) -> int: + return sum(int(item.pages) for item in self.items) + + def to_dict(self) -> dict: + return { + "batch_id": int(self.batch_id), + "item_ids": [item.item_id for item in self.items], + "files": [item.cli_file for item in self.items if item.cli_file], + "page_ranges": [item.cli_page_range for item in self.items if item.cli_page_range], + "pages": int(self.pages), + "items": [item.to_dict() for item in self.items], + } + + +@dataclass +class LanePlan: + lane_id: int + visible_device: int + batches: List[BatchPlan] = field(default_factory=list) + + @property + def assigned_pages(self) -> int: + return sum(int(batch.pages) for batch in self.batches) + + def to_dict(self) -> dict: + return { + "lane_id": int(self.lane_id), + "visible_device": int(self.visible_device), + "assigned_pages": int(self.assigned_pages), + "batches": [batch.to_dict() for batch in self.batches], + } + + +def build_whole_document_slices(documents: Iterable[SourceDocument]) -> List[WorkSlice]: + return [ + WorkSlice( + source_name=str(doc.name), + source_pages=int(doc.pages), + start_page=1, + end_page=int(doc.pages), + ) + for doc in documents + ] + + +def build_fixed_shard_slices( + documents: Iterable[SourceDocument], + *, + shard_pages: int, + shard_threshold_pages: int, +) -> List[WorkSlice]: + shard_size = max(0, int(shard_pages)) + threshold = max(0, int(shard_threshold_pages)) + slices: List[WorkSlice] = [] + for doc in documents: + total_pages = int(doc.pages) + if shard_size <= 0 or total_pages <= max(threshold, shard_size): + slices.extend(build_whole_document_slices([doc])) + continue + start_page = 1 + while start_page <= total_pages: + end_page = min(total_pages, start_page + shard_size - 1) + slices.append( + WorkSlice( + source_name=str(doc.name), + source_pages=total_pages, + start_page=int(start_page), + end_page=int(end_page), + ) + ) + start_page = end_page + 1 + return slices + + +def build_exact_fill_batches( 
+ documents: Iterable[SourceDocument], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + heap: List[tuple[int, int, DocumentCursor]] = [] + for idx, doc in enumerate(documents): + cursor = DocumentCursor(name=str(doc.name), total_pages=int(doc.pages)) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + + batches: List[BatchPlan] = [] + while heap: + remaining_capacity = int(target) + items: List[WorkSlice] = [] + while remaining_capacity > 0 and heap: + _neg_remaining, idx, cursor = heapq.heappop(heap) + take_pages = min(int(cursor.remaining_pages), int(remaining_capacity)) + items.append(cursor.take(take_pages)) + remaining_capacity -= int(take_pages) + if cursor.remaining_pages > 0: + heapq.heappush(heap, (-int(cursor.remaining_pages), idx, cursor)) + batches.append(BatchPlan(batch_id=len(batches), items=items)) + return batches + + +def pack_slices_into_batches( + slices: Iterable[WorkSlice], + *, + target_batch_pages: int, +) -> List[BatchPlan]: + target = max(1, int(target_batch_pages)) + ordered = sorted(list(slices), key=lambda item: (-int(item.pages), item.item_id)) + batches: List[BatchPlan] = [] + current: List[WorkSlice] = [] + current_pages = 0 + + def flush() -> None: + nonlocal current, current_pages + if not current: + return + batches.append(BatchPlan(batch_id=len(batches), items=list(current))) + current = [] + current_pages = 0 + + for item in ordered: + item_pages = int(item.pages) + if current and current_pages + item_pages > target: + flush() + current.append(item) + current_pages += item_pages + if current_pages >= target: + flush() + flush() + return batches + + +def assign_batches_to_lanes( + batches: Iterable[BatchPlan], + *, + devices: List[int], + workers_per_gpu: int, +) -> List[LanePlan]: + lanes: List[LanePlan] = [] + lane_id = 0 + for visible_device in devices: + for _ in range(max(1, int(workers_per_gpu))): + 
lanes.append(LanePlan(lane_id=lane_id, visible_device=int(visible_device))) + lane_id += 1 + for batch in batches: + lane = min(lanes, key=lambda item: (int(item.assigned_pages), int(item.lane_id))) + lane.batches.append(batch) + return lanes + diff --git a/src/glossapi/ocr/deepseek/work_queue.py b/src/glossapi/ocr/deepseek/work_queue.py new file mode 100644 index 0000000..9cf8d0b --- /dev/null +++ b/src/glossapi/ocr/deepseek/work_queue.py @@ -0,0 +1,380 @@ +"""Durable batch queue helpers for multi-GPU DeepSeek OCR runs.""" + +from __future__ import annotations + +import json +import sqlite3 +import time +from pathlib import Path +from typing import Any, Dict, Iterable, Optional + +STATUS_DONE = "done" +STATUS_FAILED = "failed" +STATUS_PENDING = "pending" +STATUS_RUNNING = "running" +QUEUE_MAIN = "main" +QUEUE_REPAIR = "repair" + + +def _empty_counts() -> Dict[str, int]: + return { + STATUS_PENDING: 0, + STATUS_RUNNING: 0, + STATUS_DONE: 0, + STATUS_FAILED: 0, + "total": 0, + } + + +def _normalize_queue_name(queue_name: str) -> str: + queue_norm = str(queue_name or QUEUE_MAIN).strip().lower() + if queue_norm not in {QUEUE_MAIN, QUEUE_REPAIR}: + raise ValueError(f"Unsupported queue name: {queue_name}") + return queue_norm + + +def _connect(db_path: Path) -> sqlite3.Connection: + db_path = Path(db_path).expanduser().resolve() + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(db_path), timeout=30.0, isolation_level=None) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA synchronous=NORMAL") + return conn + + +def init_work_db(db_path: Path, *, batches: Iterable[Dict[str, Any]], replace: bool = True) -> None: + db_path = Path(db_path).expanduser().resolve() + if replace and db_path.exists(): + db_path.unlink() + with _connect(db_path) as conn: + conn.executescript( + """ + CREATE TABLE IF NOT EXISTS work_items ( + batch_id INTEGER PRIMARY KEY, + queue_name TEXT NOT NULL, + queue_key TEXT NOT 
NULL UNIQUE, + batch_json TEXT NOT NULL, + pages INTEGER NOT NULL, + status TEXT NOT NULL, + worker_id TEXT, + attempt_count INTEGER NOT NULL DEFAULT 0, + started_at REAL, + finished_at REAL, + last_heartbeat REAL, + last_error TEXT, + result_json TEXT + ); + CREATE INDEX IF NOT EXISTS idx_work_items_status ON work_items(status); + CREATE INDEX IF NOT EXISTS idx_work_items_queue_status ON work_items(queue_name, status); + CREATE INDEX IF NOT EXISTS idx_work_items_worker ON work_items(worker_id); + """ + ) + rows = [ + ( + int(batch["batch_id"]), + QUEUE_MAIN, + str(batch.get("queue_key") or f"{QUEUE_MAIN}:{int(batch['batch_id'])}"), + json.dumps(batch, sort_keys=True), + int(batch.get("pages", 0)), + STATUS_PENDING, + ) + for batch in batches + ] + conn.executemany( + """ + INSERT OR REPLACE INTO work_items(batch_id, queue_name, queue_key, batch_json, pages, status) + VALUES (?, ?, ?, ?, ?, ?) + """, + rows, + ) + + +def enqueue_batches( + db_path: Path, + *, + queue_name: str, + batches: Iterable[Dict[str, Any]], +) -> list[int]: + queue_norm = _normalize_queue_name(queue_name) + inserted_ids: list[int] = [] + with _connect(db_path) as conn: + _with_transaction(conn) + next_batch_id = int( + conn.execute("SELECT COALESCE(MAX(batch_id), -1) + 1 AS next_id FROM work_items").fetchone()["next_id"] + ) + for batch in batches: + payload = dict(batch) + queue_key = str(payload.get("queue_key") or f"{queue_norm}:{next_batch_id}") + row = conn.execute( + "SELECT batch_id FROM work_items WHERE queue_key = ?", + (queue_key,), + ).fetchone() + if row is None: + batch_id = int(payload.get("batch_id", next_batch_id)) + next_batch_id = max(next_batch_id, batch_id + 1) + else: + batch_id = int(row["batch_id"]) + payload["batch_id"] = batch_id + payload["queue_name"] = queue_norm + payload_json = json.dumps(payload, sort_keys=True) + pages = int(payload.get("pages", 0)) + if row is None: + conn.execute( + """ + INSERT INTO work_items(batch_id, queue_name, queue_key, batch_json, 
pages, status) + VALUES (?, ?, ?, ?, ?, ?) + """, + (batch_id, queue_norm, queue_key, payload_json, pages, STATUS_PENDING), + ) + else: + conn.execute( + """ + UPDATE work_items + SET queue_name = ?, batch_json = ?, pages = ?, status = ?, worker_id = NULL, attempt_count = 0, + started_at = NULL, finished_at = NULL, last_heartbeat = NULL, last_error = NULL, result_json = NULL + WHERE batch_id = ? + """, + (queue_norm, payload_json, pages, STATUS_PENDING, batch_id), + ) + inserted_ids.append(batch_id) + conn.commit() + return inserted_ids + + +def _with_transaction(conn: sqlite3.Connection) -> None: + conn.execute("BEGIN IMMEDIATE") + + +def requeue_stale_running_batches( + db_path: Path, + *, + stale_after_sec: float, + now_ts: Optional[float] = None, +) -> int: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + cutoff = now_value - float(max(1.0, stale_after_sec)) + with _connect(db_path) as conn: + _with_transaction(conn) + cursor = conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = NULL, started_at = NULL, finished_at = NULL + WHERE status = ? AND COALESCE(last_heartbeat, started_at, 0) < ? + """, + (STATUS_PENDING, STATUS_RUNNING, cutoff), + ) + conn.commit() + return int(cursor.rowcount or 0) + + +def requeue_worker_batches( + db_path: Path, + *, + worker_id: str, + error: Optional[str] = None, + max_attempts: int = 2, +) -> int: + max_attempts_value = max(1, int(max_attempts)) + with _connect(db_path) as conn: + _with_transaction(conn) + # `attempt_count` is incremented on claim. With the default max_attempts=2 + # each work item gets one retry after its first failed claim, then becomes + # terminally failed instead of bouncing forever between workers. + cursor = conn.execute( + """ + UPDATE work_items + SET status = CASE WHEN attempt_count < ? THEN ? ELSE ? END, + worker_id = CASE WHEN attempt_count < ? THEN NULL ELSE ? 
END, + started_at = NULL, + finished_at = NULL, + last_heartbeat = NULL, + last_error = ?, + result_json = NULL + WHERE status = ? AND worker_id = ? + """, + ( + max_attempts_value, + STATUS_PENDING, + STATUS_FAILED, + max_attempts_value, + str(worker_id), + str(error) if error else None, + STATUS_RUNNING, + str(worker_id), + ), + ) + conn.commit() + return int(cursor.rowcount or 0) + + +def claim_next_batch( + db_path: Path, + *, + worker_id: str, + stale_after_sec: float, + queue_name: str = QUEUE_MAIN, + now_ts: Optional[float] = None, +) -> Optional[Dict[str, Any]]: + queue_norm = _normalize_queue_name(queue_name) + now_value = float(now_ts) if now_ts is not None else float(time.time()) + cutoff = now_value - float(max(1.0, stale_after_sec)) + with _connect(db_path) as conn: + _with_transaction(conn) + conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = NULL, started_at = NULL, finished_at = NULL + WHERE status = ? AND COALESCE(last_heartbeat, started_at, 0) < ? + """, + (STATUS_PENDING, STATUS_RUNNING, cutoff), + ) + row = conn.execute( + """ + SELECT batch_id, batch_json + FROM work_items + WHERE status = ? AND queue_name = ? + ORDER BY batch_id ASC + LIMIT 1 + """, + (STATUS_PENDING, queue_norm), + ).fetchone() + if row is None: + conn.commit() + return None + conn.execute( + """ + UPDATE work_items + SET status = ?, worker_id = ?, attempt_count = attempt_count + 1, started_at = ?, last_heartbeat = ?, last_error = NULL + WHERE batch_id = ? + """, + (STATUS_RUNNING, str(worker_id), now_value, now_value, int(row["batch_id"])), + ) + conn.commit() + return json.loads(str(row["batch_json"])) + + +def heartbeat_batch(db_path: Path, *, batch_id: int, worker_id: str, now_ts: Optional[float] = None) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET last_heartbeat = ? + WHERE batch_id = ? AND status = ? AND worker_id = ? 
+ """, + (now_value, int(batch_id), STATUS_RUNNING, str(worker_id)), + ) + + +def mark_batch_done( + db_path: Path, + *, + batch_id: int, + worker_id: str, + result: Optional[Dict[str, Any]] = None, + now_ts: Optional[float] = None, +) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET status = ?, finished_at = ?, last_heartbeat = ?, result_json = ? + WHERE batch_id = ? AND worker_id = ? + """, + ( + STATUS_DONE, + now_value, + now_value, + json.dumps(result, sort_keys=True) if result is not None else None, + int(batch_id), + str(worker_id), + ), + ) + + +def mark_batch_failed( + db_path: Path, + *, + batch_id: int, + worker_id: str, + error: str, + max_attempts: int = 2, + now_ts: Optional[float] = None, +) -> None: + now_value = float(now_ts) if now_ts is not None else float(time.time()) + max_attempts_value = max(1, int(max_attempts)) + with _connect(db_path) as conn: + conn.execute( + """ + UPDATE work_items + SET status = CASE WHEN attempt_count < ? THEN ? ELSE ? END, + worker_id = CASE WHEN attempt_count < ? THEN NULL ELSE ? END, + started_at = NULL, + finished_at = ?, + last_heartbeat = ?, + last_error = ?, + result_json = NULL + WHERE batch_id = ? AND worker_id = ? 
+ """, + ( + max_attempts_value, + STATUS_PENDING, + STATUS_FAILED, + max_attempts_value, + str(worker_id), + now_value, + now_value, + str(error), + int(batch_id), + str(worker_id), + ), + ) + + +def work_queue_counts(db_path: Path) -> Dict[str, int]: + counts = _empty_counts() + counts["by_queue"] = { + QUEUE_MAIN: _empty_counts(), + QUEUE_REPAIR: _empty_counts(), + } + with _connect(db_path) as conn: + for row in conn.execute("SELECT queue_name, status, COUNT(*) AS count FROM work_items GROUP BY queue_name, status"): + queue_name = _normalize_queue_name(str(row["queue_name"])) + status = str(row["status"]) + count = int(row["count"]) + counts[status] = int(counts.get(status, 0)) + count + counts["total"] += count + counts["by_queue"][queue_name][status] = count + counts["by_queue"][queue_name]["total"] += count + return counts + + +def iter_work_items(db_path: Path) -> Iterable[Dict[str, Any]]: + with _connect(db_path) as conn: + for row in conn.execute( + """ + SELECT batch_id, queue_name, queue_key, batch_json, pages, status, worker_id, attempt_count, started_at, + finished_at, last_heartbeat, last_error, result_json + FROM work_items + ORDER BY batch_id ASC + """ + ): + item = json.loads(str(row["batch_json"])) + item.update( + { + "queue_name": str(row["queue_name"]), + "queue_key": str(row["queue_key"]), + "status": str(row["status"]), + "worker_id": row["worker_id"], + "attempt_count": int(row["attempt_count"]), + "started_at": row["started_at"], + "finished_at": row["finished_at"], + "last_heartbeat": row["last_heartbeat"], + "last_error": row["last_error"], + "result": json.loads(str(row["result_json"])) if row["result_json"] else None, + "pages": int(row["pages"]), + } + ) + yield item diff --git a/src/glossapi/ocr/docling/__init__.py b/src/glossapi/ocr/docling/__init__.py new file mode 100644 index 0000000..28d4b0a --- /dev/null +++ b/src/glossapi/ocr/docling/__init__.py @@ -0,0 +1,5 @@ +"""Docling PDF pipeline helpers used by GlossAPI.""" + +from 
.pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"] diff --git a/src/glossapi/ocr/docling/pipeline.py b/src/glossapi/ocr/docling/pipeline.py new file mode 100644 index 0000000..df23030 --- /dev/null +++ b/src/glossapi/ocr/docling/pipeline.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import os +from typing import Tuple + +from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + PdfPipelineOptions, + TableFormerMode, + TableStructureOptions, +) + +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.pipeline_options import LayoutOptions +except ImportError: # pragma: no cover - older Docling versions + LayoutOptions = None + +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.pipeline_options import PictureDescriptionApiOptions +except ImportError: # pragma: no cover - older Docling versions + PictureDescriptionApiOptions = None + +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions +except ImportError: # pragma: no cover - older Docling versions + ThreadedPdfPipelineOptions = None + +try: # pragma: no cover - depends on installed Docling version + from docling.datamodel.settings import settings as docling_settings +except ImportError: # pragma: no cover - older Docling versions + docling_settings = None + + +def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: + """Return accelerator options and whether CUDA was requested.""" + dev = device or "cuda:0" + if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): + try: + acc = AcceleratorOptions(device=dev) + except Exception: + acc = AcceleratorOptions(device=dev.split(":", 1)[0]) + want_cuda = dev.lower().startswith("cuda") + else: + want_cuda = str(dev).lower().startswith("cuda") + acc = AcceleratorOptions( + 
device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU + ) + return acc, want_cuda + + +def _apply_common_pdf_options( + *, + acc: AcceleratorOptions, + images_scale: float, + formula_enrichment: bool, + code_enrichment: bool, +) -> PdfPipelineOptions: + def _supports_kwarg(model_cls, field_name: str) -> bool: + fields = getattr(model_cls, "model_fields", None) or getattr(model_cls, "__fields__", None) + if fields is None: + return True + return field_name in fields + + options_cls = ThreadedPdfPipelineOptions or PdfPipelineOptions + table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) + try: + if hasattr(table_opts, "do_cell_matching"): + table_opts.do_cell_matching = True + except Exception: + pass + + option_kwargs = { + "accelerator_options": acc, + "do_ocr": False, + "do_table_structure": True, + "do_formula_enrichment": bool(formula_enrichment), + "do_code_enrichment": bool(code_enrichment), + "force_backend_text": False, + "generate_parsed_pages": False, + "allow_external_plugins": True, + } + if LayoutOptions is not None and _supports_kwarg(options_cls, "layout_options"): + option_kwargs["layout_options"] = LayoutOptions() + if _supports_kwarg(options_cls, "table_structure_options"): + option_kwargs["table_structure_options"] = table_opts + opts = options_cls(**{key: value for key, value in option_kwargs.items() if _supports_kwarg(options_cls, key)}) + try: + if hasattr(opts, "do_picture_description"): + opts.do_picture_description = False + if PictureDescriptionApiOptions is not None and getattr(opts, "picture_description_options", None) is None: + opts.picture_description_options = PictureDescriptionApiOptions() + if hasattr(opts, "enable_remote_services"): + opts.enable_remote_services = False + except Exception: + pass + try: + setattr(opts, "images_scale", images_scale) + except Exception: + pass + _apply_runtime_overrides(opts) + return opts + + +def _apply_runtime_overrides(opts: PdfPipelineOptions) -> None: + """Apply 
optional runtime tuning knobs exposed by newer Docling releases.""" + + int_env_map = { + "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE": "layout_batch_size", + "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE": "table_batch_size", + "GLOSSAPI_DOCLING_OCR_BATCH_SIZE": "ocr_batch_size", + "GLOSSAPI_DOCLING_QUEUE_MAX_SIZE": "queue_max_size", + "GLOSSAPI_DOCLING_DOCUMENT_TIMEOUT": "document_timeout", + } + float_env_map = { + "GLOSSAPI_DOCLING_BATCH_POLL_INTERVAL": "batch_polling_interval_seconds", + } + + for env_name, attr_name in int_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = int(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + for env_name, attr_name in float_env_map.items(): + raw = os.getenv(env_name) + if not raw: + continue + try: + value = float(raw) + except ValueError: + continue + if value <= 0 or not hasattr(opts, attr_name): + continue + try: + setattr(opts, attr_name, value) + except Exception: + pass + + raw_page_batch_size = os.getenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE") + if raw_page_batch_size and docling_settings is not None: + try: + page_batch_size = int(raw_page_batch_size) + except ValueError: + page_batch_size = 0 + if page_batch_size > 0: + try: + perf_settings = getattr(docling_settings, "perf", None) + if perf_settings is not None and hasattr(perf_settings, "page_batch_size"): + setattr(perf_settings, "page_batch_size", page_batch_size) + except Exception: + pass + + +def build_layout_pipeline( + *, + device: str = "cuda:0", + images_scale: float = 1.25, + formula_enrichment: bool = False, + code_enrichment: bool = False, +) -> Tuple[object, PdfPipelineOptions]: + """Create a Docling layout-only PDF pipeline.""" + + acc, _ = _resolve_accelerator(device) + opts = _apply_common_pdf_options( + acc=acc, + images_scale=float(images_scale), + formula_enrichment=formula_enrichment, + 
code_enrichment=code_enrichment, + ) + + try: + from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + except Exception: # pragma: no cover + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore + + pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] + return pipeline, opts diff --git a/src/glossapi/ocr/docling_pipeline.py b/src/glossapi/ocr/docling_pipeline.py new file mode 100644 index 0000000..4a96e09 --- /dev/null +++ b/src/glossapi/ocr/docling_pipeline.py @@ -0,0 +1,5 @@ +"""Compatibility wrapper for the canonical Docling pipeline builder.""" + +from .docling.pipeline import build_layout_pipeline + +__all__ = ["build_layout_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/__init__.py b/src/glossapi/ocr/rapidocr/__init__.py deleted file mode 100644 index c0d1232..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""RapidOCR subpackage with lazy re-exports.""" - -from __future__ import annotations - -from importlib import import_module -from typing import Any - -__all__ = [ - "dispatch", - "docling_pipeline", - "pool", - "safe", - "onnx", - "_paths", - "pipeline", -] - - -def __getattr__(name: str) -> Any: - if name in __all__: - return import_module(f"glossapi.ocr.rapidocr.{name}") - raise AttributeError(name) - - -def __dir__() -> list[str]: - return sorted(set(globals().keys()) | set(__all__)) diff --git a/src/glossapi/ocr/rapidocr/__init__.py.backup b/src/glossapi/ocr/rapidocr/__init__.py.backup deleted file mode 100644 index 865f119..0000000 --- a/src/glossapi/ocr/rapidocr/__init__.py.backup +++ /dev/null @@ -1,6 +0,0 @@ -"""RapidOCR subpackage (shim).""" - -from __future__ import annotations - -__all__ = ["dispatch"] - diff --git a/src/glossapi/ocr/rapidocr/_paths.py b/src/glossapi/ocr/rapidocr/_paths.py deleted file mode 100644 index 4c1cc2a..0000000 --- a/src/glossapi/ocr/rapidocr/_paths.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ 
import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Tuple -import importlib -import os - - -@dataclass -class ResolvedOnnx: - det: Optional[str] - rec: Optional[str] - cls: Optional[str] - keys: Optional[str] - - -def _find_first(base: Path, patterns: list[str]) -> Optional[str]: - for pat in patterns: - for p in base.rglob(pat): - if p.is_file(): - return str(p) - return None - - -def _resolve_packaged_cls_fallback() -> Optional[str]: - try: - rapidocr = importlib.import_module("rapidocr") - base = Path(rapidocr.__file__).resolve().parent / "models" - pref = base / "ch_ppocr_mobile_v2.0_cls_infer.onnx" - if pref.exists(): - return str(pref) - return _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - except Exception: - return None - - -def resolve_packaged_onnx_and_keys() -> ResolvedOnnx: - """Locate ONNX det/rec/cls and Greek keys packaged with the glossapi package. - - Search order: - 1) GLOSSAPI_RAPIDOCR_ONNX_DIR (env var) with heuristic file names - 2) Under the installed glossapi package folder `models/` and common subfolders - 3) CLS only: fallback to RapidOCR’s bundled cls model if missing - """ - # 1) Explicit override directory - override = os.getenv("GLOSSAPI_RAPIDOCR_ONNX_DIR") - if override: - base = Path(override) - det = _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - if det or rec or cls or keys: - return ResolvedOnnx(det, rec, cls, keys) - - # 2) Search inside installed glossapi package - try: - glossapi = importlib.import_module("glossapi") - pkg_root = Path(glossapi.__file__).resolve().parent - # Candidate asset directories inside the package 
- candidates = [ - pkg_root / "models", - pkg_root / "models" / "rapidocr", - pkg_root / "models" / "rapidocr" / "onnx", - pkg_root / "models" / "rapidocr" / "keys", - pkg_root / "resources", - pkg_root / "assets", - pkg_root / "data", - ] - det = rec = cls = keys = None - for base in candidates: - if not base.exists(): - continue - det = det or _find_first(base, [ - "**/det/**/inference.onnx", - "*det*server*onnx", - "*PP*det*.onnx", - "det*.onnx", - ]) - rec = rec or _find_first(base, [ - "**/rec/**/inference.onnx", - "*el*rec*onnx", - "*greek*rec*onnx", - "*PP*rec*.onnx", - "rec*.onnx", - ]) - cls = cls or _find_first(base, ["*cls*infer*.onnx", "*cls*.onnx"]) - keys = keys or _find_first(base, ["*greek*keys*.txt", "*ppocr*keys*.txt", "*keys*.txt"]) - - if cls is None: - cls = _resolve_packaged_cls_fallback() - return ResolvedOnnx(det, rec, cls, keys) - except Exception: - return ResolvedOnnx(None, None, _resolve_packaged_cls_fallback(), None) - - -def summarize_resolution() -> Tuple[bool, str]: - r = resolve_packaged_onnx_and_keys() - ok = bool(r.det and r.rec and r.cls and r.keys) - msg = f"det={bool(r.det)} rec={bool(r.rec)} cls={bool(r.cls)} keys={bool(r.keys)}" - return ok, msg - diff --git a/src/glossapi/ocr/rapidocr/dispatch.py b/src/glossapi/ocr/rapidocr/dispatch.py deleted file mode 100644 index 7deeba2..0000000 --- a/src/glossapi/ocr/rapidocr/dispatch.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -from typing import Iterable, Optional - - -def run_via_extract( - corpus, - files: Iterable[str], - *, - export_doc_json: bool = False, - internal_debug: bool = False, - content_debug: Optional[bool] = None, -) -> None: - """Thin adapter that forwards to Corpus.extract for RapidOCR/Docling. - - This exists for symmetry with deepseek_runner and to keep the OCR package - as the single entry point for OCR backends. - """ - # Note: internal_debug/content_debug are no-ops for the Docling/RapidOCR path. 
- # Docling's output already produces a single concatenated Markdown document. - corpus.extract( - input_format="pdf", - num_threads=1, # let extract decide; override in tests if needed - accel_type="CUDA", - force_ocr=True, - formula_enrichment=False, - code_enrichment=False, - filenames=list(files), - skip_existing=False, - export_doc_json=bool(export_doc_json), - emit_formula_index=bool(export_doc_json), - phase1_backend="docling", - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py b/src/glossapi/ocr/rapidocr/docling_pipeline.py deleted file mode 100644 index bb8988f..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. -""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi.ocr.rapidocr._paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if 
torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - -def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, 
PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # 
Device-aware preflight: only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = _maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - 
_os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. - def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except 
Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document - p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: 
float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = 
_re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) 
else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if 
isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup b/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup deleted file mode 100644 index f80344d..0000000 --- a/src/glossapi/ocr/rapidocr/docling_pipeline.py.backup +++ /dev/null @@ -1,501 +0,0 @@ -"""Docling + RapidOCR (ONNX) pipeline for batch PDF OCR. - -Provides build_pipeline() and convert_dir() mirroring the behavior of the -repro script greek_pdf_ocr.py, but self-contained inside glossapi and with -packaged ONNX models/keys. Includes robust logging and native Docling timeout. 
-""" -from __future__ import annotations - -import argparse -import logging -import os -import sys -import time -import inspect -import importlib -from pathlib import Path -from typing import Iterable, Optional, Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, -) -from docling.document_converter import ( - ConversionResult, - DocumentConverter, - PdfFormatOption, -) -from docling.datamodel.settings import settings - -from glossapi._rapidocr_paths import resolve_packaged_onnx_and_keys -from glossapi.metrics import compute_per_page_metrics -# Ensure RapidOCR factory is registered (avoids masked errors in older paths) -import docling.models.rapid_ocr_model # noqa: F401 - - -log = logging.getLogger(__name__) - - -def _maybe_import_torch(*, force: bool = False): - torch_mod = sys.modules.get("torch") - if torch_mod is not None: - return torch_mod - try: - return importlib.import_module("torch") # type: ignore - except Exception: - return None - return None - - -def _available_ort_providers() -> str: - try: - import onnxruntime as ort # type: ignore - return ",".join(ort.get_available_providers()) - except Exception as e: - return f"unavailable: {e}" - - -def _supports_native_timeout(converter: DocumentConverter) -> Optional[str]: - try: - sig = inspect.signature(converter.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - pass - return None - - -def _convert_with_timeout(converter: DocumentConverter, *, source: str, raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return converter.convert(source=source, **kw) - - 
-def _convert_all_with_timeout(converter: DocumentConverter, *, sources: Iterable[str], raises_on_error: bool, timeout_s: Optional[int] = None, **kwargs): - kw = dict(raises_on_error=raises_on_error) - kw.update(kwargs) - if timeout_s is not None: - tkw = _supports_native_timeout(converter) - if tkw: - kw[tkw] = int(timeout_s) - return list(converter.convert_all(sources, **kw)) - - -def build_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - # Delegate to canonical pipeline builder to avoid duplication - try: - from glossapi._pipeline import build_rapidocr_pipeline # type: ignore - except Exception as _e: # pragma: no cover - # Backward-compat fallback: inline builder (kept minimal to satisfy tests) - from docling.datamodel.pipeline_options import AcceleratorOptions, TableStructureOptions, TableFormerMode, LayoutOptions, PdfPipelineOptions, RapidOcrOptions # type: ignore - dev = device or "cuda:0" - acc = AcceleratorOptions(device=dev) - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - raise FileNotFoundError("Packaged RapidOCR ONNX models/keys not found under glossapi.models.") - ocr_opts = RapidOcrOptions( - backend="onnxruntime", lang=["el", "en"], force_full_page_ocr=False, - use_det=True, use_cls=False, use_rec=True, text_score=text_score, - det_model_path=r.det, rec_model_path=r.rec, cls_model_path=r.cls, print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - opts = PdfPipelineOptions( - accelerator_options=acc, - ocr_options=ocr_opts, - layout_options=LayoutOptions(), - do_ocr=True, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - 
table_structure_options=table_opts, - allow_external_plugins=True, - ) - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - from docling.document_converter import DocumentConverter, PdfFormatOption # type: ignore - from docling.datamodel.base_models import InputFormat # type: ignore - return DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}), opts - return build_rapidocr_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - -def convert_dir( - input_dir: Path, - output_dir: Path, - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, - normalize_output: bool = True, - timeout_s: Optional[int] = 600, -) -> None: - input_dir = Path(input_dir) - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - # Device-aware preflight: only enforce CUDA provider when device requests CUDA - want_cuda = isinstance(device, str) and device.lower().startswith("cuda") - if want_cuda: - try: - import onnxruntime as _ort # type: ignore - _providers = _ort.get_available_providers() - if "CUDAExecutionProvider" not in _providers: - raise RuntimeError(f"CUDAExecutionProvider not available in onnxruntime providers={_providers}") - except Exception as e: - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - if formula_enrichment and want_cuda: - try: - torch_mod = _maybe_import_torch(force=True) - if torch_mod is None or not torch_mod.cuda.is_available(): - raise RuntimeError("Torch CUDA not available but formula enrichment requested.") - except Exception as e: - raise RuntimeError(f"Torch CUDA preflight failed: {e}") - - # Optional: tune CodeFormula batch size and math precision when enrichment is requested - if formula_enrichment: - try: - torch_mod = 
_maybe_import_torch() - if torch_mod is not None and getattr(torch_mod, "cuda", None) and torch_mod.cuda.is_available(): - try: - torch_mod.set_float32_matmul_precision("high") - except Exception: - pass - except Exception: - pass - - engine, opts = build_pipeline( - device=device, - text_score=text_score, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - # Logging block - log.info("Docling+RapidOCR pipeline ready") - log.info("device=%s text_score=%.2f images_scale=%.2f formula=%s code=%s", device, text_score, images_scale, formula_enrichment, code_enrichment) - log.info("ORT providers: %s", _available_ort_providers()) - log.info("Caches: HF_HOME=%s XDG_CACHE_HOME=%s DOCLING_CACHE_DIR=%s", os.getenv("HF_HOME"), os.getenv("XDG_CACHE_HOME"), os.getenv("DOCLING_CACHE_DIR")) - try: - r = resolve_packaged_onnx_and_keys() - import os as _os - log.info( - "Models: det=%s rec=%s cls=%s keys=%s", - _os.path.basename(r.det) if r.det else None, - _os.path.basename(r.rec) if r.rec else None, - _os.path.basename(r.cls) if r.cls else None, - _os.path.basename(r.keys) if r.keys else None, - ) - except Exception: - pass - - # Collect PDFs - pdfs = sorted(str(p) for p in input_dir.rglob("*.pdf") if p.is_file()) - if not pdfs: - log.warning("No PDFs under %s", input_dir) - return - - # Enable timing profile - try: - settings.debug.profile_pipeline_timings = True - except Exception: - pass - - total_start = time.time() - # If we got a StandardPdfPipeline, it has a .convert method similar in spirit - # to DocumentConverter.convert; detect native timeout support by signature. 
- def _native_timeout_kw(obj) -> Optional[str]: - try: - sig = inspect.signature(obj.convert) - for name in ("timeout", "timeout_s"): - if name in sig.parameters: - return name - except Exception: - return None - return None - - tkw = _native_timeout_kw(engine) - for src in pdfs: - try: - kwargs = {} - if tkw and timeout_s is not None: - kwargs[tkw] = int(timeout_s) - conv = engine.convert(source=src, **kwargs) # type: ignore - _export(conv, output_dir, normalize_output=normalize_output) - # Per-page metrics and per-page console logs - try: - per_page = compute_per_page_metrics(conv) - # Harmonize with GlossExtract: write to sibling json/metrics/ - metrics_dir = output_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - pp = metrics_dir / f"{Path(src).stem}.per_page.metrics.json" - import json as _json - pp.write_text(_json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8") - for row in per_page.get("pages", []): - log.info("[PAGE] %s p%d: parse=%.3fs ocr=%.3fs formulas=%d code=%d", - Path(src).name, - int(row.get("page_no", 0)), - float(row.get("parse_sec", 0.0)), - float(row.get("ocr_sec", 0.0)), - int(row.get("formula_count", 0)), - int(row.get("code_count", 0))) - except Exception as _e: - log.warning("Failed to compute per-page metrics for %s: %s", src, _e) - log.info("[OK] %s", src) - except Exception as e: - log.error("[FAIL] %s: %s", src, e) - log.info("Done in %.2fs", time.time() - total_start) - - -def _normalize_text(s: str) -> str: - import unicodedata, re - zw = re.compile(r"[\u200B\u200C\u200D\uFEFF]") - s = unicodedata.normalize("NFC", s) - return zw.sub("", s) - - -def _normalize_obj(o): - if isinstance(o, str): - return _normalize_text(o) - if isinstance(o, list): - return [_normalize_obj(x) for x in o] - if isinstance(o, dict): - return {k: _normalize_obj(v) for k, v in o.items()} - return o - - -def _export(conv: ConversionResult, out_dir: Path, *, normalize_output: bool) -> None: - doc = conv.document 
- p = Path(conv.input.file) - md_path = out_dir / f"{p.stem}.md" - # Write Docling JSON under sibling json/ directory (no JSON in markdown dir) - json_dir = out_dir.parent / "json" - json_dir.mkdir(parents=True, exist_ok=True) - json_path = json_dir / f"{p.stem}.docling.json" - # Harmonize metrics location with GlossExtract: sibling json/metrics/ - metrics_dir = out_dir.parent / "json" / "metrics" - metrics_dir.mkdir(parents=True, exist_ok=True) - metrics_path = metrics_dir / f"{p.stem}.metrics.json" - - md = doc.export_to_markdown() - if normalize_output: - md = _normalize_text(md) - md_path.write_text(md, encoding="utf-8") - # Export DoclingDocument JSON via helper (compressed by default) - try: - from glossapi.ocr.utils.json_io import export_docling_json # type: ignore - # Attach minimal meta for provenance - meta = {"source_pdf_relpath": str(p)} - export_docling_json(doc, json_path, compress="zstd", meta=meta) # type: ignore[arg-type] - except Exception: - # Fallback: write plain JSON under json/ without compression - try: - import json as _json - dd = doc.export_to_dict() - if normalize_output: - dd = _normalize_obj(dd) - json_path.write_text(_json.dumps(dd, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - # Timings if present - try: - from typing import Any, Dict, List - def _q(vals: list[float], q: float) -> float: - if not vals: - return 0.0 - s = sorted(vals) - i = int(round((len(s) - 1) * q)) - return float(s[i]) - metrics: Dict[str, Any] = {"file": str(p), "timings": {}} - for key, item in conv.timings.items(): - times = list(item.times) - cnt = int(item.count) - tot = float(sum(times)) if times else 0.0 - avg = float(tot / cnt) if cnt else 0.0 - metrics["timings"][key] = { - "scope": str(item.scope.value) if hasattr(item, "scope") else "unknown", - "count": cnt, - "total_sec": tot, - "avg_sec": avg, - "p50_sec": _q(times, 0.50), - "p90_sec": _q(times, 0.90), - } - import json as _json - 
metrics_path.write_text(_json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8") - except Exception: - pass - - -def _compute_per_page_metrics(conv: ConversionResult): - try: - doc = conv.document - except Exception: - return {"pages": []} - try: - page_count = len(doc.pages) # type: ignore[attr-defined] - except Exception: - page_count = 0 - timings = {} - try: - for key, item in conv.timings.items(): - times = list(item.times) - timings[key] = { - "scope": str(getattr(getattr(item, 'scope', None), 'value', 'unknown')), - "times": times, - "total": float(sum(times)) if times else float(getattr(item, 'total', 0.0)), - } - except Exception: - pass - def _pt(k): - arr = timings.get(k, {}).get("times", []) or [] - if page_count and len(arr) == page_count: - return [float(x) for x in arr] - return [float(x) for x in (arr + [0.0] * page_count)[:page_count]] - ocr = _pt("ocr") - parse = _pt("page_parse") - layout = _pt("layout") - table = _pt("table_structure") - # counts with sanitization and capping - fcnt = [0] * max(1, page_count) - fch = [0] * max(1, page_count) - ftr = [0] * max(1, page_count) - ftrc = [0] * max(1, page_count) - ccnt = [0] * max(1, page_count) - try: - as_dict = doc.export_to_dict() - import re as _re - _run_pat = _re.compile(r"\\\\\s*&(?P(?:\\quad|\\;|\\:|\\,|\\\\s|\s){200,})") - _ws_collapse = _re.compile(r"(?:(?:\\quad|\\;|\\:|\\,|\\\\s)|\s){2,}") - _CAP = 3000 - def _sanitize(s: str): - dropped=0 - m=_run_pat.search(s) - if m: - s_new=s[:m.start('ws')]; dropped+=len(s)-len(s_new); s=s_new - if len(s)>_CAP: - cut=s.rfind('\\\\',0,_CAP); cut = cut if cut>=0 else _CAP; dropped+=len(s)-cut; s=s[:cut] - s2=_ws_collapse.sub(' ', s) - return s2, dropped - def _walk(label, cnt, chars=False): - for node in as_dict.get("texts", []): - if str(node.get("label")) != label: - continue - raw = str(node.get("text") or node.get("orig") or "") - txt, dropped = _sanitize(raw) if label=='formula' else (raw,0) - ch = len(txt) - for prov in 
node.get("prov", []) or []: - pno = int(prov.get("page_no") or 0) - if 1 <= pno <= len(cnt): - cnt[pno - 1] += 1 - if chars: - fch[pno - 1] += ch - if label=='formula' and dropped: - ftr[pno - 1] += 1 - ftrc[pno - 1] += int(dropped) - _walk("formula", fcnt, True) - _walk("code", ccnt, False) - except Exception: - pass - try: - den_total = float(timings.get("doc_enrich", {}).get("total", 0.0)) - except Exception: - den_total = 0.0 - shares = [0.0] * max(1, page_count) - if den_total and page_count: - s = float(sum(fch)) or float(sum(fcnt)) or 0.0 - if s > 0: - base = fch if sum(fch) > 0 else fcnt - shares = [den_total * (float(x) / s) for x in base] - rows = [] - n = max(page_count, len(ocr), len(parse)) - for i in range(n): - rows.append({ - "page_no": i + 1, - "ocr_sec": float(ocr[i]) if i < len(ocr) else 0.0, - "parse_sec": float(parse[i]) if i < len(parse) else 0.0, - "layout_sec": float(layout[i]) if i < len(layout) else 0.0, - "table_sec": float(table[i]) if i < len(table) else 0.0, - "formula_count": int(fcnt[i]) if i < len(fcnt) else 0, - "formula_chars": int(fch[i]) if i < len(fch) else 0, - "formula_truncated": int(ftr[i]) if i < len(ftr) else 0, - "formula_truncated_chars": int(ftrc[i]) if i < len(ftrc) else 0, - "code_count": int(ccnt[i]) if i < len(ccnt) else 0, - "doc_enrich_share_sec": float(shares[i]) if i < len(shares) else 0.0, - }) - return {"file": str(getattr(conv.input.file, 'name', 'unknown')), "page_count": int(page_count), "totals": {"doc_enrich_total_sec": den_total}, "pages": rows} - - -def _setup_logging(level: int = logging.INFO) -> None: - logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - - -if __name__ == "__main__": - _setup_logging() - ap = argparse.ArgumentParser(description="Batch OCR with Docling + RapidOCR (ONNX)") - ap.add_argument("input_dir", type=Path) - ap.add_argument("output_dir", type=Path) - ap.add_argument("--device", default=os.getenv("GLOSSAPI_DOCLING_DEVICE", "cuda:0")) - 
ap.add_argument("--text-score", type=float, default=float(os.getenv("GLOSSAPI_TEXT_SCORE", "0.45"))) - ap.add_argument("--images-scale", type=float, default=float(os.getenv("GLOSSAPI_IMAGES_SCALE", "1.25"))) - ap.add_argument("--docling-formula", dest="docling_formula", action="store_true", help="Enable formula enrichment (CodeFormula)") - ap.add_argument("--no-docling-formula", dest="docling_formula", action="store_false") - ap.set_defaults(docling_formula=False) - ap.add_argument("--formula-batch", type=int, default=int(os.getenv("GLOSSAPI_FORMULA_BATCH", "8")), help="CodeFormula batch size (default 8)") - ap.add_argument("--docling-code", dest="docling_code", action="store_true", help="Enable code enrichment") - ap.add_argument("--no-docling-code", dest="docling_code", action="store_false") - ap.set_defaults(docling_code=False) - ap.add_argument("--normalize-output", action="store_true") - ap.add_argument("--no-normalize-output", dest="normalize_output", action="store_false") - ap.set_defaults(normalize_output=True) - ap.add_argument("--timeout-s", type=int, default=int(os.getenv("GLOSSAPI_DOCLING_TIMEOUT", "600"))) - args = ap.parse_args() - # Apply formula batch size if requested - try: - if getattr(args, "docling_formula", False): - from docling.models.code_formula_model import CodeFormulaModel # type: ignore - if isinstance(args.formula_batch, int) and args.formula_batch > 0: - CodeFormulaModel.elements_batch_size = int(args.formula_batch) # type: ignore[attr-defined] - except Exception: - pass - convert_dir( - args.input_dir, - args.output_dir, - device=args.device, - text_score=args["text_score"] if isinstance(args, dict) else args.text_score, - images_scale=args.images_scale, - formula_enrichment=args.docling_formula, - code_enrichment=args.docling_code, - normalize_output=args.normalize_output, - timeout_s=args.timeout_s, - ) diff --git a/src/glossapi/ocr/rapidocr/onnx.py b/src/glossapi/ocr/rapidocr/onnx.py deleted file mode 100644 index 57430d1..0000000 
--- a/src/glossapi/ocr/rapidocr/onnx.py +++ /dev/null @@ -1,105 +0,0 @@ -"""OCR helpers for GlossAPI using Docling + RapidOCR (ONNXRuntime). - -GPU-first OCR that auto-discovers packaged ONNX models and Greek keys within -the installed `glossapi` package. Designed as a drop-in for Corpus.ocr(). -""" -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Dict, Any, Tuple - -_PIPELINE_CACHE: dict[str, Tuple[object, object]] = {} - - -def _build_pipeline( - device: Optional[str] = None, - *, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, -): - # Delegate to canonical builder to avoid duplication - from glossapi.ocr.rapidocr.pipeline import build_rapidocr_pipeline - - engine, opts = build_rapidocr_pipeline( - device=(device or "cuda:0"), - text_score=(0.45 if text_score is None else float(text_score)), - images_scale=(1.25 if images_scale is None else float(images_scale)), - formula_enrichment=False, - code_enrichment=False, - ) - # Apply use_cls override if requested - try: - if use_cls is not None and hasattr(opts, "ocr_options"): - setattr(opts.ocr_options, "use_cls", bool(use_cls)) # type: ignore[attr-defined] - except Exception: - pass - return engine, opts - - -def run_rapidocr_onnx( - pdf_path: Path | str, - *, - device: Optional[str] = None, - use_cls: Optional[bool] = None, - text_score: Optional[float] = None, - images_scale: Optional[float] = None, - max_pages: Optional[int] = None, -) -> Dict[str, Any]: - """Run Docling + RapidOCR (ONNX) OCR on a PDF and return markdown text. 
- - Returns - ------- - dict with keys: - - markdown_text: str - - duration_s: float - - pages: int - - models: dict with file names of det/rec/cls/keys - """ - from time import perf_counter - pdf_p = Path(pdf_path) - if not pdf_p.exists(): - raise FileNotFoundError(pdf_p) - - key = str(device or "cuda:0").lower() - cached = _PIPELINE_CACHE.get(key) - if cached is None: - pipe, r = _build_pipeline(device=device, use_cls=use_cls, text_score=text_score, images_scale=images_scale) - _PIPELINE_CACHE[key] = (pipe, r) - else: - pipe, r = cached # type: ignore[misc] - - t0 = perf_counter() - conv = pipe.convert(source=str(pdf_p)) # type: ignore[attr-defined] - doc = conv.document - md_text = doc.export_to_markdown() - duration = perf_counter() - t0 - - # Attempt to get page count from conv/document - pages = 0 - try: - if hasattr(doc, "pages"): - pages = len(doc.pages) # type: ignore[attr-defined] - except Exception: - pages = 0 - - # Return model identifiers as file names only (no full paths) - import os as _os - models = { - "det": _os.path.basename(r.det) if r.det else None, - "rec": _os.path.basename(r.rec) if r.rec else None, - "cls": _os.path.basename(r.cls) if r.cls else None, - "keys": _os.path.basename(r.keys) if r.keys else None, - } - - return { - "markdown_text": md_text or "", - "duration_s": duration, - "pages": int(pages), - "models": models, - } - - -__all__ = [ - "run_rapidocr_onnx", -] diff --git a/src/glossapi/ocr/rapidocr/pipeline.py b/src/glossapi/ocr/rapidocr/pipeline.py deleted file mode 100644 index a623c3d..0000000 --- a/src/glossapi/ocr/rapidocr/pipeline.py +++ /dev/null @@ -1,229 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Tuple - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - AcceleratorDevice, - AcceleratorOptions, - LayoutOptions, - PictureDescriptionApiOptions, - PdfPipelineOptions, - RapidOcrOptions, - TableFormerMode, - TableStructureOptions, 
-) -from docling.document_converter import DocumentConverter, PdfFormatOption - -from ._paths import resolve_packaged_onnx_and_keys -from .pool import GLOBAL_RAPID_OCR_POOL -from .safe import SafeRapidOcrModel, patch_docling_rapidocr - -_logger = logging.getLogger(__name__) - -patch_docling_rapidocr() - - -def _resolve_accelerator(device: str | None) -> Tuple[AcceleratorOptions, bool]: - """Return accelerator options and whether CUDA was requested.""" - dev = device or "cuda:0" - if isinstance(dev, str) and dev.lower().startswith(("cuda", "mps", "cpu")): - acc = AcceleratorOptions(device=dev) - want_cuda = dev.lower().startswith("cuda") - else: - want_cuda = str(dev).lower().startswith("cuda") - acc = AcceleratorOptions( - device=AcceleratorDevice.CUDA if want_cuda else AcceleratorDevice.CPU - ) - return acc, want_cuda - - -def _apply_common_pdf_options( - *, - acc: AcceleratorOptions, - images_scale: float, - formula_enrichment: bool, - code_enrichment: bool, -) -> PdfPipelineOptions: - table_opts = TableStructureOptions(mode=TableFormerMode.ACCURATE) - try: - if hasattr(table_opts, "do_cell_matching"): - table_opts.do_cell_matching = True - except Exception: - pass - - opts = PdfPipelineOptions( - accelerator_options=acc, - layout_options=LayoutOptions(), - do_ocr=False, - do_table_structure=True, - do_formula_enrichment=bool(formula_enrichment), - do_code_enrichment=bool(code_enrichment), - force_backend_text=False, - generate_parsed_pages=False, - table_structure_options=table_opts, - allow_external_plugins=True, - ) - # Prefer lightweight placeholder picture descriptions to avoid heavy VLM backends. 
- try: - if hasattr(opts, "do_picture_description"): - opts.do_picture_description = False - if getattr(opts, "picture_description_options", None) is None: - opts.picture_description_options = PictureDescriptionApiOptions() - if hasattr(opts, "enable_remote_services"): - opts.enable_remote_services = False - except Exception: - pass - try: - setattr(opts, "images_scale", images_scale) - except Exception: - pass - return opts - - -def build_layout_pipeline( - *, - device: str = "cuda:0", - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Builder for a Docling PDF pipeline without RapidOCR. - - Returns ``(converter, PdfPipelineOptions)`` where ``converter`` is a - ``StandardPdfPipeline`` configured for layout extraction only. - """ - - acc, _ = _resolve_accelerator(device) - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - pipeline = StandardPdfPipeline(opts) # type: ignore[arg-type] - return pipeline, opts - - -def build_rapidocr_pipeline( - *, - device: str = "cuda:0", - text_score: float = 0.45, - images_scale: float = 1.25, - formula_enrichment: bool = False, - code_enrichment: bool = False, -) -> Tuple[object, PdfPipelineOptions]: - """Canonical builder for Docling + RapidOCR pipeline. - - Returns a tuple (engine, PdfPipelineOptions). Prefers explicit RapidOCR injection - when supported; otherwise returns a DocumentConverter using the factory path. - """ - - def _fallback_layout(reason: str) -> Tuple[object, PdfPipelineOptions]: - _logger.warning( - "RapidOCR pipeline fallback: %s. 
Using Docling layout-only configuration.", - reason, - ) - pipeline, opts = build_layout_pipeline( - device=device, - images_scale=images_scale, - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - return pipeline, opts - - acc, want_cuda = _resolve_accelerator(device) - - # Optional provider preflight only when CUDA requested - if want_cuda: - try: - import onnxruntime as ort # type: ignore - - prov = ort.get_available_providers() - if "CUDAExecutionProvider" not in prov: - raise RuntimeError(f"CUDAExecutionProvider not available: {prov}") - except Exception as e: # pragma: no cover - raise RuntimeError(f"onnxruntime-gpu not available or misconfigured: {e}") - - r = resolve_packaged_onnx_and_keys() - if not (r.det and r.rec and r.cls and r.keys): - return _fallback_layout("packaged RapidOCR ONNX assets missing") - - ocr_opts = RapidOcrOptions( - backend="onnxruntime", - lang=["el", "en"], - force_full_page_ocr=False, - use_det=True, - use_cls=False, - use_rec=True, - text_score=text_score, - det_model_path=r.det, - rec_model_path=r.rec, - cls_model_path=r.cls, - print_verbose=False, - ) - ocr_opts.rec_keys_path = r.keys - - opts = _apply_common_pdf_options( - acc=acc, - images_scale=float(images_scale), - formula_enrichment=formula_enrichment, - code_enrichment=code_enrichment, - ) - opts.do_ocr = True - opts.ocr_options = ocr_opts - - # Prefer explicit injection of RapidOCR model when available - try: - from docling.models.rapid_ocr_model import RapidOcrModel # type: ignore - - try: - from docling.pipelines.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - except Exception: # pragma: no cover - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline # type: ignore - - import inspect - - sig = inspect.signature(StandardPdfPipeline.__init__) - if "ocr_model" not in sig.parameters: - raise RuntimeError("Docling build does not support RapidOCR injection") - - def _factory(): - try: - return 
SafeRapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - except Exception: # pragma: no cover - # Fall back to the stock implementation if our wrapper misbehaves. - return RapidOcrModel(True, None, ocr_opts, acc) # type: ignore[arg-type] - - pooled_model = GLOBAL_RAPID_OCR_POOL.get( - str(acc.device), - ocr_opts, - _factory, - expected_type=SafeRapidOcrModel, - ) - pipeline = StandardPdfPipeline(opts, ocr_model=pooled_model) # type: ignore - return pipeline, opts - except Exception as exc: - _logger.warning( - "RapidOCR injection unavailable (%s); using DocumentConverter factory path.", - exc, - ) - - # Fallback: use DocumentConverter factory - try: - converter = DocumentConverter( - format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} - ) - return converter, opts - except Exception as exc: - return _fallback_layout(f"DocumentConverter failed: {exc}") - - -__all__ = ["build_layout_pipeline", "build_rapidocr_pipeline"] diff --git a/src/glossapi/ocr/rapidocr/pool.py b/src/glossapi/ocr/rapidocr/pool.py deleted file mode 100644 index db1e8f2..0000000 --- a/src/glossapi/ocr/rapidocr/pool.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Shared RapidOCR engine pooling utilities.""" -from __future__ import annotations - -from dataclasses import dataclass -from threading import Lock -from typing import Callable, Dict, Optional, Union, Type - -from docling.datamodel.pipeline_options import RapidOcrOptions - - -@dataclass(frozen=True) -class _PoolKey: - device: str - det_model_path: str - rec_model_path: str - cls_model_path: str - lang: Tuple[str, ...] 
- text_score: float - use_det: bool - use_cls: bool - use_rec: bool - - -class RapidOcrEnginePool: - """Process-local cache of RapidOCR models keyed by configuration.""" - - def __init__(self) -> None: - self._lock = Lock() - self._cache: Dict[_PoolKey, object] = {} - - def _make_key(self, device: str, opts: RapidOcrOptions) -> _PoolKey: - lang = tuple(opts.lang or []) - return _PoolKey( - device=str(device), - det_model_path=str(getattr(opts, "det_model_path", "")), - rec_model_path=str(getattr(opts, "rec_model_path", "")), - cls_model_path=str(getattr(opts, "cls_model_path", "")), - lang=lang, - text_score=float(getattr(opts, "text_score", 0.0)), - use_det=bool(getattr(opts, "use_det", True)), - use_cls=bool(getattr(opts, "use_cls", False)), - use_rec=bool(getattr(opts, "use_rec", True)), - ) - - def get( - self, - device: str, - opts: RapidOcrOptions, - factory: Callable[[], object], - *, - expected_type: Optional[Union[Type[object], tuple[Type[object], ...]]] = None, - ) -> object: - key = self._make_key(device, opts) - with self._lock: - model = self._cache.get(key) - if expected_type is not None and model is not None and not isinstance(model, expected_type): - self._cache.pop(key, None) - model = None - if model is None: - model = factory() - if expected_type is None or isinstance(model, expected_type): - self._cache[key] = model - return model - - def clear(self) -> None: - with self._lock: - self._cache.clear() - - -GLOBAL_RAPID_OCR_POOL = RapidOcrEnginePool() - -__all__ = ["RapidOcrEnginePool", "GLOBAL_RAPID_OCR_POOL"] diff --git a/src/glossapi/ocr/rapidocr/safe.py b/src/glossapi/ocr/rapidocr/safe.py deleted file mode 100644 index 5534563..0000000 --- a/src/glossapi/ocr/rapidocr/safe.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Temporary wrappers around Docling's RapidOCR integration. - -The upstream Docling release (2.48.x) does not tolerate RapidOCR returning -``None`` for a given crop. 
That bubbles up as an AttributeError inside the -conversion loop and the entire document fails. Until Docling includes a fix, we -wrap the loader so that ``None`` simply means "no detections" and processing -continues. Once Docling ships a release with the guard we can drop this shim and -revert to the vanilla ``RapidOcrModel``. -""" - -from __future__ import annotations - -import importlib.util -import sys -from collections.abc import Iterable -from pathlib import Path -from typing import Optional, Type - -import numpy - -from docling.datamodel.base_models import Page -from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import OcrOptions, RapidOcrOptions -from docling.models.rapid_ocr_model import RapidOcrModel as _RapidOcrModel -from docling.models.rapid_ocr_model import TextCell, _log -from docling.utils.profiling import TimeRecorder -from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle - -from ._paths import resolve_packaged_onnx_and_keys - - -class SafeRapidOcrModel(_RapidOcrModel): - """Drop-in RapidOCR wrapper that copes with ``None`` OCR results. - - Docling 2.48.0 assumes ``self.reader`` always returns an object with - ``boxes/txts/scores``. RapidOCR occasionally yields ``None`` for problematic - crops, which crashes the extractor. We normalise the return value before the - original list(zip(...)) call and treat anything unexpected as "no boxes". - Remove this once Docling hardens the upstream implementation. - """ - - # NOTE: keep signature identical so StandardPdfPipeline can instantiate it. 
- _rapidocr_available: Optional[bool] = None - - def __init__( - self, - enabled: bool, - artifacts_path: Optional[Path], - options: RapidOcrOptions, - accelerator_options, - ): - rapidocr_available = self._rapidocr_available - if rapidocr_available is None: - rapidocr_available = bool( - importlib.util.find_spec("rapidocr") is not None or "rapidocr" in sys.modules - ) - SafeRapidOcrModel._rapidocr_available = rapidocr_available - - effective_enabled = bool(enabled and rapidocr_available) - if enabled and not rapidocr_available: - _log.warning( - "RapidOCR python package not found; continuing with Docling pipeline OCR disabled." - ) - - if effective_enabled: - try: - resolved = resolve_packaged_onnx_and_keys() - - _log.warning( - 'SafeRapidOcrModel initial options: det=%s rec=%s cls=%s keys=%s', - getattr(options, 'det_model_path', None), - getattr(options, 'rec_model_path', None), - getattr(options, 'cls_model_path', None), - getattr(options, 'rec_keys_path', None), - ) - - if resolved.det: - options.det_model_path = resolved.det - if resolved.rec: - options.rec_model_path = resolved.rec - if resolved.cls: - options.cls_model_path = resolved.cls - if resolved.keys: - options.rec_keys_path = resolved.keys - - try: - from rapidocr.ch_ppocr_rec import main as _rapidocr_rec_main - - if not getattr(_rapidocr_rec_main.TextRecognizer, '_glossapi_patch', False): - original_get_character_dict = _rapidocr_rec_main.TextRecognizer.get_character_dict - - def _patched_get_character_dict(self, cfg): - try: - current_keys = cfg.get('keys_path', None) - current_rec_keys = cfg.get('rec_keys_path', None) - if current_rec_keys is None and current_keys is not None: - cfg['rec_keys_path'] = current_keys - _log.warning('Patched RapidOCR cfg: set rec_keys_path from keys_path=%s', current_keys) - else: - _log.warning('Patched RapidOCR cfg: existing rec_keys_path=%s keys_path=%s', current_rec_keys, current_keys) - except Exception: - _log.warning('RapidOCR cfg inspection failed', 
exc_info=True) - return original_get_character_dict(self, cfg) - - _rapidocr_rec_main.TextRecognizer.get_character_dict = _patched_get_character_dict - _rapidocr_rec_main.TextRecognizer._glossapi_patch = True - except Exception: - _log.warning('Failed to patch RapidOCR TextRecognizer for keys fallback', exc_info=True) - - _log.warning( - 'SafeRapidOcrModel using packaged assets: det=%s rec=%s cls=%s keys=%s', - options.det_model_path, - options.rec_model_path, - options.cls_model_path, - options.rec_keys_path, - ) - except Exception: - _log.warning( - 'SafeRapidOcrModel bootstrap failed to resolve packaged assets', - exc_info=True, - ) - - super().__init__( - enabled=effective_enabled, - artifacts_path=artifacts_path, - options=options, - accelerator_options=accelerator_options, - ) - - @classmethod - def get_options_type(cls) -> Type[OcrOptions]: - return RapidOcrOptions - - def _normalise_result(self, result): - """Return an iterable of (bbox, text, score) triples. - - RapidOCR returns ``None`` or semi-populated structures in some corner - cases. We swallow those and log a one-line warning so the page still - progresses through the pipeline. 
- """ - - if result is None: - _log.warning("RapidOCR returned None; skipping crop") - return [] - boxes = getattr(result, "boxes", None) - txts = getattr(result, "txts", None) - scores = getattr(result, "scores", None) - if boxes is None or txts is None or scores is None: - _log.warning("RapidOCR returned incomplete data; treating crop as empty") - return [] - try: - return list(zip(boxes.tolist(), txts, scores)) - except Exception as exc: # pragma: no cover - defensive only - _log.warning("RapidOCR result normalisation failed: %s", exc) - return [] - - def __call__( - self, conv_res: ConversionResult, page_batch: Iterable[Page] - ) -> Iterable[Page]: - if not self.enabled: - yield from page_batch - return - - for page in page_batch: - assert page._backend is not None - if not page._backend.is_valid(): - yield page - continue - - with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - raw_result = self.reader( - im, - use_det=self.options.use_det, - use_cls=self.options.use_cls, - use_rec=self.options.use_rec, - ) - result = self._normalise_result(raw_result) - del high_res_image - del im - - if not result: - continue - - cells = [ - TextCell( - index=ix, - text=line[1], - orig=line[1], - confidence=line[2], - from_ocr=True, - rect=BoundingRectangle.from_bounding_box( - BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ) - ), - ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) - - self.post_process_cells(all_ocr_cells, page) - - from docling.datamodel.settings import settings - - if settings.debug.visualize_ocr: - 
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) - - yield page - - -def patch_docling_rapidocr() -> bool: - """Replace Docling's RapidOcrModel with the safe shim if available.""" - - try: - import docling.models.rapid_ocr_model as rapid_module - except Exception: # pragma: no cover - Docling missing - return False - - current = getattr(rapid_module, "RapidOcrModel", None) - if current is SafeRapidOcrModel: - return False - - rapid_module.RapidOcrModel = SafeRapidOcrModel - try: - from docling.models.factories import get_ocr_factory # type: ignore - import logging - except Exception: - return True - - try: - factory = get_ocr_factory() - options_type = SafeRapidOcrModel.get_options_type() - - if hasattr(factory, "classes"): - factory.classes[options_type] = SafeRapidOcrModel - elif hasattr(factory, "_classes"): - factory._classes[options_type] = SafeRapidOcrModel - logging.getLogger(__name__).info( - "Registered SafeRapidOcrModel for %s", options_type - ) - try: - from docling.pipeline import standard_pdf_pipeline as _std_pdf # type: ignore - from docling.datamodel.pipeline_options import RapidOcrOptions # type: ignore - from functools import lru_cache - except Exception as _exc: # pragma: no cover - best effort - logging.getLogger(__name__).warning( - "Docling factory patch limited to local mutation: %s", _exc - ) - else: - original_get_factory = getattr( - _std_pdf.get_ocr_factory, "__wrapped__", _std_pdf.get_ocr_factory - ) - - def _ensure_safe(factory_obj): - try: - current = factory_obj.classes.get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - factory_obj.classes[RapidOcrOptions] = SafeRapidOcrModel - except AttributeError: - current = getattr(factory_obj, "_classes", {}).get(RapidOcrOptions) - if current is not SafeRapidOcrModel: - getattr(factory_obj, "_classes", {})[RapidOcrOptions] = SafeRapidOcrModel - return factory_obj - - @lru_cache(maxsize=None) - def _patched_get_ocr_factory(allow_external_plugins: bool = False): - return 
_ensure_safe(original_get_factory(allow_external_plugins)) - - _patched_get_ocr_factory.__wrapped__ = original_get_factory # type: ignore[attr-defined] - _std_pdf.get_ocr_factory = _patched_get_ocr_factory # type: ignore[attr-defined] - try: - _ensure_safe(_std_pdf.get_ocr_factory(False)) - except Exception: - pass - except Exception as exc: # pragma: no cover - best effort - import logging - - logging.getLogger(__name__).warning( - "Failed to re-register SafeRapidOcrModel: %s", exc - ) - return True - - -__all__ = ["SafeRapidOcrModel", "patch_docling_rapidocr"] diff --git a/src/glossapi/ocr/utils/cleaning.py b/src/glossapi/ocr/utils/cleaning.py index 9b4e287..c194c72 100644 --- a/src/glossapi/ocr/utils/cleaning.py +++ b/src/glossapi/ocr/utils/cleaning.py @@ -260,11 +260,207 @@ def _detect_repeated_lines_cut(text: str, *, threshold: int = 10) -> Optional[in return None +def _is_private_use_char(ch: str) -> bool: + codepoint = ord(ch) + return ( + 0xE000 <= codepoint <= 0xF8FF + or 0xF0000 <= codepoint <= 0xFFFFD + or 0x100000 <= codepoint <= 0x10FFFD + ) + + +def _is_symbol_garbage_char(ch: str) -> bool: + if _is_private_use_char(ch): + return True + return ch in { + "•", + "", + "·", + "◦", + "▪", + "▫", + "‣", + "∙", + "⋅", + "●", + "○", + "◉", + "◌", + "◆", + "◇", + "■", + "□", + "▲", + "△", + "▼", + "▽", + "►", + "◄", + "◊", + "", + "", + "", + "", + "", + "", + } + + +def _detect_symbol_garbage_cut(text: str, *, threshold: int = 16) -> Optional[int]: + """Cut on long runs of isolated bullet/dingbat/private-use symbols. + + This targets the common DeepSeek garbage mode where the model emits long + whitespace-separated runs of bullets or private-use glyphs instead of text. 
class StreamingGarbageDetector:
    """Incremental detector for common OCR garbage generation modes.

    This is designed for hot decode loops: feed only newly decoded text chunks
    and keep bounded mutable state instead of rescanning the whole suffix.

    Two modes are tracked side by side, character by character:

    * ``symbol_garbage`` - ``symbol_threshold`` garbage symbols (as judged by
      ``_is_symbol_garbage_char``) with only whitespace between them.
    * ``numeric_list_garbage`` - ``numeric_list_threshold`` consecutive list
      markers counting up from 1 (``1. 2. 3. ...`` or ``1) 2) 3) ...``).

    Once a mode fires, ``triggered_reason`` names it and ``feed`` keeps
    returning True until ``reset`` is called.
    """

    # Longest digit run still considered a list counter. Anything longer can
    # never be the next expected counter in practice; capping the buffer keeps
    # the state bounded and avoids ``int()`` raising ValueError on huge digit
    # strings (CPython limits int<->str conversion to 4300 digits by default).
    _MAX_LIST_NUMBER_DIGITS = 18

    def __init__(
        self,
        *,
        symbol_threshold: int = 16,
        numeric_list_threshold: int = 12,
    ) -> None:
        self.symbol_threshold = int(symbol_threshold)
        self.numeric_list_threshold = int(numeric_list_threshold)
        self._symbol_run = 0
        self._numeric_run = 0
        self._expected_next_number: Optional[int] = None
        self._digits_buffer: str = ""
        self.triggered_reason: Optional[str] = None

    def reset(self) -> None:
        """Clear all run counters and the trigger so the detector can be reused."""
        self._symbol_run = 0
        self._numeric_run = 0
        self._expected_next_number = None
        self._digits_buffer = ""
        self.triggered_reason = None

    def _reset_numeric(self) -> None:
        """Forget the numeric-list run in progress (the symbol run is untouched)."""
        self._numeric_run = 0
        self._expected_next_number = None
        self._digits_buffer = ""

    def _feed_symbol_char(self, ch: str) -> bool:
        """Advance the symbol-run counter by one character; True when it fires."""
        if ch.isspace():
            # Whitespace neither extends nor breaks a symbol run.
            return False
        if _is_symbol_garbage_char(ch):
            self._symbol_run += 1
            if self._symbol_run >= self.symbol_threshold:
                self.triggered_reason = "symbol_garbage"
                return True
            return False
        self._symbol_run = 0
        return False

    def _feed_numeric_char(self, ch: str) -> bool:
        """Advance the numeric-list state machine by one character; True when it fires."""
        if ch.isspace():
            # Whitespace after a completed marker ("1. ") is fine; whitespace
            # in the middle of a pending number ("12 ") breaks the run.
            if self._digits_buffer:
                self._reset_numeric()
            return False
        if "0" <= ch <= "9":
            # Keep at most one digit beyond the cap: enough to know the number
            # overflowed without letting the buffer grow with the input.
            if len(self._digits_buffer) <= self._MAX_LIST_NUMBER_DIGITS:
                self._digits_buffer += ch
            return False
        if ch in {".", ")"} and self._digits_buffer:
            digits = self._digits_buffer
            self._digits_buffer = ""
            if len(digits) > self._MAX_LIST_NUMBER_DIGITS:
                # Over-long number: cannot be a list counter.
                self._reset_numeric()
            else:
                value = int(digits)
                if self._expected_next_number is not None and value == self._expected_next_number:
                    self._numeric_run += 1
                    self._expected_next_number += 1
                elif value == 1:
                    # A fresh "1." (re)starts the run.
                    self._numeric_run = 1
                    self._expected_next_number = 2
                else:
                    self._reset_numeric()
            if self._numeric_run >= self.numeric_list_threshold:
                self.triggered_reason = "numeric_list_garbage"
                return True
            return False
        # Any other character (including a terminator with no pending digits)
        # ends the numeric run.
        self._reset_numeric()
        return False

    def feed(self, text: str) -> bool:
        """Consume newly decoded ``text``; return True once garbage is detected.

        ``None``/empty input is treated as an empty chunk. After a trigger the
        method short-circuits to True without inspecting further input.
        """
        if self.triggered_reason is not None:
            return True
        for ch in str(text or ""):
            if self._feed_symbol_char(ch):
                return True
            if self._feed_numeric_char(ch):
                return True
        return False
@@ -299,6 +498,8 @@ def apply_early_stop( text, line_repeat_threshold=line_repeat_threshold, char_repeat_threshold=char_repeat_threshold, + symbol_garbage_threshold=symbol_garbage_threshold, + numeric_list_threshold=numeric_list_threshold, ) if cut is None: return text diff --git a/src/glossapi/scripts/build_ocr_golden_pages.py b/src/glossapi/scripts/build_ocr_golden_pages.py new file mode 100644 index 0000000..f6bb5b9 --- /dev/null +++ b/src/glossapi/scripts/build_ocr_golden_pages.py @@ -0,0 +1,223 @@ +from __future__ import annotations + +import argparse +import hashlib +import json +from pathlib import Path +from typing import Dict, Iterable, List, Sequence, Tuple + +PAGE_SPLIT_MARKER = "<--- Page Split --->" + + +def _read_jsonl(path: Path) -> List[Dict[str, object]]: + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + + +def _stable_sort_rows(rows: Sequence[Dict[str, object]], seed: str) -> List[Dict[str, object]]: + def _key(row: Dict[str, object]) -> str: + basis = f"{seed}|{row['source_stem']}|{row['page_number']}" + return hashlib.sha1(basis.encode("utf-8")).hexdigest() + + return sorted(rows, key=_key) + + +def _take_rows( + rows: Sequence[Dict[str, object]], + selected_keys: set[Tuple[str, int]], + *, + limit: int, + seed: str, +) -> List[Dict[str, object]]: + out: List[Dict[str, object]] = [] + for row in _stable_sort_rows(rows, seed): + key = (str(row["source_stem"]), int(row["page_number"])) + if key in selected_keys: + continue + out.append(row) + selected_keys.add(key) + if len(out) >= limit: + break + return out + + +def _split_pages(path: Path) -> List[str]: + return path.read_text(encoding="utf-8", errors="ignore").split(PAGE_SPLIT_MARKER) + + +def build_ocr_goldens( + *, + run_dir: Path, + source_dir: Path, + output_dir: Path, + seed: str = "ocr-golden-v1", +) -> Dict[str, object]: + page_metrics = _read_jsonl(run_dir / "page_metrics.jsonl") + manifest_rows = _read_jsonl(run_dir / 
"manifest.jsonl") + source_by_stem = {Path(str(row["source_path"])).stem: Path(str(row["source_path"])) for row in manifest_rows} + output_by_stem = {Path(str(row["output_path"])).stem: Path(str(row["output_path"])) for row in manifest_rows} + + for target in (output_dir / "inputs", output_dir / "expected"): + target.mkdir(parents=True, exist_ok=True) + for stale in target.iterdir(): + if stale.is_file(): + stale.unlink() + for stale_name in ("manifest.jsonl", "summary.json"): + stale = output_dir / stale_name + if stale.exists(): + stale.unlink() + + source_pages_cache: Dict[str, List[str]] = {} + output_pages_cache: Dict[str, List[str]] = {} + + rows_with_features: List[Dict[str, object]] = [] + for row in page_metrics: + stem = str(row["source_stem"]) + source_path = source_by_stem.get(stem) + output_path = output_by_stem.get(stem) + if source_path is None or output_path is None: + continue + if stem not in source_pages_cache: + source_pages_cache[stem] = _split_pages(source_path) + output_pages_cache[stem] = _split_pages(output_path) + page_idx = int(row["page_number"]) - 1 + source_page = source_pages_cache[stem][page_idx] + output_page = output_pages_cache[stem][page_idx] + feature_row = dict(row) + feature_row["has_table_html"] = " 0 + ] + feature_row["positive_categories"] = positive_categories + rows_with_features.append(feature_row) + + selected_keys: set[Tuple[str, int]] = set() + selected_rows: List[Tuple[str, Dict[str, object]]] = [] + + def add_bucket(label: str, candidates: Iterable[Dict[str, object]], limit: int) -> None: + bucket = _take_rows(list(candidates), selected_keys, limit=limit, seed=f"{seed}:{label}") + for item in bucket: + selected_rows.append((label, item)) + + add_bucket( + "hybrid_positive", + [row for row in rows_with_features if int(row.get("hybrid_match_count", 0)) > 0], + 9999, + ) + add_bucket( + "latex_positive", + [row for row in rows_with_features if int(row.get("latex_match_count", 0)) > 0], + 9999, + ) + add_bucket( + 
"mixed_positive", + [row for row in rows_with_features if len(list(row.get("positive_categories", []))) >= 2], + 120, + ) + add_bucket( + "numeric_positive", + [row for row in rows_with_features if int(row.get("numeric_match_count", 0)) > 0], + 140, + ) + add_bucket( + "word_positive", + [row for row in rows_with_features if int(row.get("word_match_count", 0)) > 0], + 140, + ) + add_bucket( + "table_positive", + [row for row in rows_with_features if int(row.get("table_match_count", 0)) > 0], + 180, + ) + add_bucket( + "table_kept_conversion", + [ + row + for row in rows_with_features + if row.get("has_table_html") + and all(int(row.get(f"{category}_match_count", 0)) == 0 for category in ("table", "numeric", "latex", "hybrid", "word")) + ], + 60, + ) + add_bucket( + "negative_plain", + [ + row + for row in rows_with_features + if not row.get("has_table_html") + and all(int(row.get(f"{category}_match_count", 0)) == 0 for category in ("table", "numeric", "latex", "hybrid", "word")) + ], + 60, + ) + + manifest_out = output_dir / "manifest.jsonl" + summary_out = output_dir / "summary.json" + written_rows: List[Dict[str, object]] = [] + category_counts: Dict[str, int] = {} + + for idx, (label, row) in enumerate(selected_rows, start=1): + stem = str(row["source_stem"]) + page_number = int(row["page_number"]) + base_name = f"{idx:04d}__{stem}__page_{page_number:05d}" + input_path = output_dir / "inputs" / f"{base_name}.md" + expected_path = output_dir / "expected" / f"{base_name}.md" + input_path.write_text(str(row["source_page"]), encoding="utf-8") + expected_path.write_text(str(row["expected_page"]), encoding="utf-8") + + category_counts[label] = category_counts.get(label, 0) + 1 + written_rows.append( + { + "case_id": base_name, + "label": label, + "source_stem": stem, + "page_number": page_number, + "input_path": str(input_path), + "expected_path": str(expected_path), + "source_path": str(source_by_stem[stem]), + "output_path": str(output_by_stem[stem]), + 
"match_counts": { + category: int(row.get(f"{category}_match_count", 0)) + for category in ("table", "numeric", "latex", "hybrid", "word") + }, + "has_table_html": bool(row.get("has_table_html")), + } + ) + + with manifest_out.open("w", encoding="utf-8") as handle: + for row in written_rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + summary = { + "run_dir": str(run_dir), + "source_dir": str(source_dir), + "output_dir": str(output_dir), + "case_count": len(written_rows), + "category_counts": category_counts, + } + summary_out.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build OCR golden page fixtures from a combined debug run.") + parser.add_argument("--run-dir", required=True, type=Path) + parser.add_argument("--source-dir", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--seed", default="ocr-golden-v1") + args = parser.parse_args() + + summary = build_ocr_goldens( + run_dir=args.run_dir, + source_dir=args.source_dir, + output_dir=args.output_dir, + seed=args.seed, + ) + print(json.dumps(summary, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/src/glossapi/scripts/deepseek_pipeline_benchmark.py b/src/glossapi/scripts/deepseek_pipeline_benchmark.py new file mode 100644 index 0000000..4ffb064 --- /dev/null +++ b/src/glossapi/scripts/deepseek_pipeline_benchmark.py @@ -0,0 +1,401 @@ +from __future__ import annotations + +import argparse +import json +import random +import shutil +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +from glossapi.ocr.deepseek.scheduling import ( + SourceDocument, + assign_batches_to_lanes, + build_exact_fill_batches, + build_fixed_shard_slices, + build_whole_document_slices, + pack_slices_into_batches, +) + + +def 
_parse_devices(spec: str) -> List[int]: + tokens = [piece.strip() for piece in str(spec or "").split(",") if piece.strip()] + if not tokens: + raise argparse.ArgumentTypeError("--devices must contain at least one GPU id") + try: + return [int(token) for token in tokens] + except ValueError as exc: + raise argparse.ArgumentTypeError(f"Invalid GPU list: {spec}") from exc + + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_pipeline_benchmark", + description="Benchmark DeepSeek OCR pipeline throughput for different scheduling strategies.", + ) + p.add_argument("--repo", required=True) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--python-bin", required=True) + p.add_argument("--model-dir", required=True) + p.add_argument("--label", required=True) + p.add_argument("--mode", default="static", choices=["static", "streaming"]) + p.add_argument( + "--scheduler", + default="whole_doc", + choices=["whole_doc", "fixed_shard", "exact_fill"], + ) + p.add_argument("--devices", default="0,1,2,3,4,5,6,7") + p.add_argument("--workers-per-gpu", type=int, default=1) + p.add_argument("--max-docs", type=int, default=None) + p.add_argument("--doc-order", default="name", choices=["name", "random", "largest_first"]) + p.add_argument("--seed", type=int, default=20260330) + p.add_argument("--target-batch-pages", type=int, default=160) + p.add_argument("--stream-batch-pages", type=int, default=160) + p.add_argument("--shard-pages", type=int, default=0) + p.add_argument("--shard-threshold-pages", type=int, default=0) + p.add_argument("--runtime-backend", default="vllm", choices=["transformers", "vllm"]) + p.add_argument("--ocr-profile", default="markdown_grounded", choices=["markdown_grounded", "plain_ocr"]) + p.add_argument("--prompt-override", default=None) + p.add_argument("--repair-mode", default="auto", choices=["auto", "off"]) + 
p.add_argument("--attn-backend", default="auto") + p.add_argument("--base-size", type=int, default=None) + p.add_argument("--image-size", type=int, default=None) + p.add_argument("--render-dpi", type=int, default=144) + p.add_argument("--max-new-tokens", type=int, default=2048) + p.add_argument("--vllm-batch-size", type=int, default=None) + p.add_argument("--gpu-memory-utilization", type=float, default=0.9) + p.add_argument("--disable-fp8-kv", action="store_true") + p.add_argument("--clean", action="store_true") + return p.parse_args() + + +def _weighted_documents( + *, + input_dir: Path, + max_docs: Optional[int], + doc_order: str, + seed: int, +) -> List[SourceDocument]: + from glossapi.ocr.deepseek import runner as deepseek_runner + + documents = [ + SourceDocument(name=path.name, pages=int(deepseek_runner._effective_page_count(path, None))) + for path in sorted(input_dir.glob("*.pdf")) + ] + if doc_order == "largest_first": + documents.sort(key=lambda item: (-int(item.pages), str(item.name))) + elif doc_order == "random": + rng = random.Random(int(seed)) + rng.shuffle(documents) + if max_docs is not None: + documents = documents[: max(0, int(max_docs))] + return documents + + +def _plan_lanes( + *, + documents: List[SourceDocument], + devices: List[int], + workers_per_gpu: int, + scheduler: str, + target_batch_pages: int, + shard_pages: int, + shard_threshold_pages: int, +) -> List[Dict[str, Any]]: + scheduler_norm = str(scheduler or "whole_doc").strip().lower() + if scheduler_norm == "exact_fill": + batches = build_exact_fill_batches(documents, target_batch_pages=max(1, int(target_batch_pages))) + else: + if scheduler_norm == "fixed_shard": + slices = build_fixed_shard_slices( + documents, + shard_pages=max(1, int(shard_pages)), + shard_threshold_pages=max(0, int(shard_threshold_pages)), + ) + else: + slices = build_whole_document_slices(documents) + batches = pack_slices_into_batches(slices, target_batch_pages=max(1, int(target_batch_pages))) + lanes = 
assign_batches_to_lanes( + batches, + devices=devices, + workers_per_gpu=max(1, int(workers_per_gpu)), + ) + return [lane.to_dict() for lane in lanes if lane.batches] + + +def _collect_repair_metrics(run_dir: Path) -> Dict[str, int]: + metrics_dir = run_dir / "json" / "metrics" + totals = { + "docs_with_metrics": 0, + "pages_flagged": 0, + "pages_repaired": 0, + "plain_repairs": 0, + "tiled_repairs": 0, + } + if not metrics_dir.exists(): + return totals + for path in metrics_dir.glob("*.metrics.json"): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + totals["docs_with_metrics"] += 1 + summary = data.get("repair_summary") or {} + totals["pages_flagged"] += int(summary.get("pages_flagged", 0)) + totals["pages_repaired"] += int(summary.get("pages_repaired", 0)) + totals["plain_repairs"] += int(summary.get("plain_repairs", 0)) + totals["tiled_repairs"] += int(summary.get("tiled_repairs", 0)) + return totals + + +def _collect_runtime_summary(run_dir: Path) -> Dict[str, Any]: + summary_path = run_dir / "sidecars" / "ocr_runtime" / "runtime_summary.json" + if not summary_path.exists(): + return {} + try: + return json.loads(summary_path.read_text(encoding="utf-8")) + except Exception: + return {} + + +def _flatten_lane_batches(lane: Dict[str, Any]) -> Dict[str, Any]: + files: List[str] = [] + page_ranges: List[str] = [] + pages = 0 + planned_batch_pages: List[int] = [] + for batch in list(lane.get("batches") or []): + batch_pages = int(batch.get("pages", 0)) + pages += batch_pages + planned_batch_pages.append(batch_pages) + files.extend(list(batch.get("files") or [])) + page_ranges.extend(list(batch.get("page_ranges") or [])) + return { + "files": files, + "page_ranges": page_ranges, + "pages": int(pages), + "planned_batch_count": len(planned_batch_pages), + "planned_batch_pages": planned_batch_pages, + } + + +def main() -> int: + args = _parse_args() + repo = Path(args.repo).resolve() + input_dir = 
Path(args.input_dir).resolve() + output_root = Path(args.output_dir).resolve() + python_bin = Path(args.python_bin).expanduser() + model_dir = Path(args.model_dir).resolve() + devices = _parse_devices(args.devices) + + from glossapi.ocr.deepseek import runner as deepseek_runner + + documents = _weighted_documents( + input_dir=input_dir, + max_docs=args.max_docs, + doc_order=args.doc_order, + seed=int(args.seed), + ) + if not documents: + raise SystemExit("No PDFs found for benchmark input set.") + lanes = _plan_lanes( + documents=documents, + devices=devices, + workers_per_gpu=max(1, int(args.workers_per_gpu)), + scheduler=str(args.scheduler), + target_batch_pages=int(args.target_batch_pages), + shard_pages=int(args.shard_pages), + shard_threshold_pages=int(args.shard_threshold_pages), + ) + + run_dir = output_root / args.label + if args.clean and run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + logs_dir = run_dir / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "lane_plan.json").write_text(json.dumps(lanes, indent=2), encoding="utf-8") + + script_path = ( + deepseek_runner.DEFAULT_VLLM_SCRIPT + if str(args.runtime_backend) == "vllm" + else deepseek_runner.DEFAULT_SCRIPT + ) + py_env = {"PYTHONPATH": str(repo / "src")} + + def start_lane(lane: Dict[str, Any]) -> Dict[str, Any]: + lane_id = int(lane["lane_id"]) + visible_device = int(lane["visible_device"]) + lane_plan = _flatten_lane_batches(lane) + files = list(lane_plan["files"]) + page_ranges = list(lane_plan["page_ranges"]) + pages = int(lane_plan["pages"]) + resolved_vllm_batch_size = ( + int(args.vllm_batch_size) + if args.vllm_batch_size is not None + else min(max(1, int(args.target_batch_pages)), max(1, pages)) + ) + log_path = logs_dir / f"lane_{lane_id:02d}_gpu{visible_device}.log" + fh = log_path.open("w", encoding="utf-8") + cmd = deepseek_runner._build_cli_command( + input_dir=input_dir, + output_dir=run_dir, + files=files, + 
page_ranges=page_ranges, + model_dir=model_dir, + python_bin=python_bin, + script=script_path, + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile=str(args.ocr_profile), + prompt_override=args.prompt_override, + attn_backend=str(args.attn_backend), + base_size=args.base_size, + image_size=args.image_size, + crop_mode=None, + render_dpi=int(args.render_dpi), + max_new_tokens=args.max_new_tokens, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend=str(args.runtime_backend), + vllm_batch_size=resolved_vllm_batch_size, + gpu_memory_utilization=float(args.gpu_memory_utilization), + disable_fp8_kv=bool(args.disable_fp8_kv), + repair_mode=str(args.repair_mode), + ) + env = deepseek_runner._build_env(python_bin=python_bin, visible_device=visible_device) + env["PYTHONPATH"] = f"{py_env['PYTHONPATH']}:{env['PYTHONPATH']}" if env.get("PYTHONPATH") else py_env["PYTHONPATH"] + proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.STDOUT, env=env) # nosec: controlled args + return { + "lane_id": lane_id, + "visible_device": visible_device, + "batch_id": 0, + "pages": pages, + "files": files, + "page_ranges": page_ranges, + "planned_batch_count": int(lane_plan["planned_batch_count"]), + "planned_batch_pages": list(lane_plan["planned_batch_pages"]), + "resolved_vllm_batch_size": resolved_vllm_batch_size, + "log_path": str(log_path), + "fh": fh, + "proc": proc, + "start_ts": time.perf_counter(), + "cmd": cmd, + } + + global_start = time.perf_counter() + active: List[Dict[str, Any]] = [start_lane(lane) for lane in lanes] + + batch_results: List[Dict[str, Any]] = [] + while active: + time.sleep(0.2) + for item in list(active): + rc = item["proc"].poll() + if rc is None: + continue + end_ts = time.perf_counter() + item["fh"].close() + elapsed = max(0.000001, float(end_ts - item["start_ts"])) + batch_results.append( + { + "lane_id": int(item["lane_id"]), + "visible_device": int(item["visible_device"]), + "batch_id": int(item["batch_id"]), + 
"pages": int(item["pages"]), + "files": list(item["files"]), + "page_ranges": list(item.get("page_ranges") or []), + "planned_batch_count": int(item.get("planned_batch_count", 1)), + "planned_batch_pages": list(item.get("planned_batch_pages") or []), + "return_code": int(rc), + "resolved_vllm_batch_size": int(item["resolved_vllm_batch_size"]), + "start_offset_sec": float(item["start_ts"] - global_start), + "end_offset_sec": float(end_ts - global_start), + "elapsed_sec": float(elapsed), + "sec_per_page": float(elapsed / max(1, int(item["pages"]))), + "log_path": str(item["log_path"]), + "cmd": item["cmd"], + } + ) + active.remove(item) + + total_elapsed = max(0.000001, time.perf_counter() - global_start) + total_pages = sum(int(doc.pages) for doc in documents) + failures = [item for item in batch_results if int(item["return_code"]) != 0] + + lane_results: List[Dict[str, Any]] = [] + for lane in lanes: + lane_batches = [item for item in batch_results if int(item["lane_id"]) == int(lane["lane_id"])] + if not lane_batches: + continue + lane_start = min(float(item["start_offset_sec"]) for item in lane_batches) + lane_end = max(float(item["end_offset_sec"]) for item in lane_batches) + lane_elapsed = max(0.000001, lane_end - lane_start) + lane_pages = sum(int(item["pages"]) for item in lane_batches) + lane_results.append( + { + "lane_id": int(lane["lane_id"]), + "visible_device": int(lane["visible_device"]), + "batch_count": len(lane_batches), + "pages": int(lane_pages), + "active_elapsed_sec": float(lane_elapsed), + "sec_per_page": float(lane_elapsed / max(1, lane_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in lane_batches), + } + ) + + gpu_results: List[Dict[str, Any]] = [] + for visible_device in sorted({int(item["visible_device"]) for item in batch_results}): + gpu_batches = [item for item in batch_results if int(item["visible_device"]) == visible_device] + gpu_start = min(float(item["start_offset_sec"]) for item in gpu_batches) + 
gpu_end = max(float(item["end_offset_sec"]) for item in gpu_batches) + gpu_elapsed = max(0.000001, gpu_end - gpu_start) + gpu_pages = sum(int(item["pages"]) for item in gpu_batches) + gpu_results.append( + { + "visible_device": visible_device, + "batch_count": len(gpu_batches), + "pages": int(gpu_pages), + "active_elapsed_sec": float(gpu_elapsed), + "sec_per_page": float(gpu_elapsed / max(1, gpu_pages)), + "all_return_codes_zero": all(int(item["return_code"]) == 0 for item in gpu_batches), + } + ) + + repair_metrics = _collect_repair_metrics(run_dir) + runtime_summary = _collect_runtime_summary(run_dir) + summary = { + "label": str(args.label), + "status": "pass" if not failures else "fail", + "mode": str(args.mode), + "scheduler": str(args.scheduler), + "runtime_backend": str(args.runtime_backend), + "ocr_profile": str(args.ocr_profile), + "repair_mode": str(args.repair_mode), + "devices": devices, + "workers_per_gpu": int(args.workers_per_gpu), + "doc_order": str(args.doc_order), + "target_batch_pages": int(args.target_batch_pages), + "stream_batch_pages": int(args.stream_batch_pages), + "docs": len(documents), + "pages": int(total_pages), + "shard_pages": int(args.shard_pages), + "shard_threshold_pages": int(args.shard_threshold_pages), + "wall_time_sec": float(total_elapsed), + "sec_per_page": float(total_elapsed / max(1, total_pages)), + "batch_results": batch_results, + "lane_results": lane_results, + "gpu_results": gpu_results, + "repair_metrics": repair_metrics, + "runtime_summary": runtime_summary, + "steady_state": dict(runtime_summary.get("steady_state") or {}), + "failures": failures, + } + (run_dir / "pipeline_benchmark_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + print(json.dumps(summary, indent=2)) + return 1 if failures else 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/deepseek_runtime_report.py b/src/glossapi/scripts/deepseek_runtime_report.py new 
file mode 100644 index 0000000..cb93729 --- /dev/null +++ b/src/glossapi/scripts/deepseek_runtime_report.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import argparse +import json +import os +import platform +import subprocess +import sys +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + + +PACKAGE_NAMES = ( + "torch", + "vllm", + "transformers", + "nvidia.cuda_runtime", + "nvidia.cuda_nvrtc", +) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.deepseek_runtime_report", + description="Print a reproducible DeepSeek OCR runtime report for a GlossAPI checkout.", + ) + p.add_argument("--repo-root", default=".") + p.add_argument("--python-bin", default="") + p.add_argument("--json", action="store_true") + return p.parse_args(argv) + + +def _detect_python_bin(repo_root: Path, explicit: str) -> Path: + if str(explicit).strip(): + path = Path(explicit).expanduser() + if not path.is_absolute(): + path = repo_root / path + return path.absolute() + candidates = ( + repo_root / "dependency_setup" / ".venvs" / "deepseek" / "bin" / "python", + repo_root / "dependency_setup" / "deepseek_uv" / ".venv" / "bin" / "python", + ) + for candidate in candidates: + if candidate.exists(): + return candidate.absolute() + return Path(sys.executable).absolute() + + +def _read_os_release() -> Dict[str, str]: + path = Path("/etc/os-release") + if not path.exists(): + return {} + out: Dict[str, str] = {} + for line in path.read_text(encoding="utf-8").splitlines(): + if "=" not in line: + continue + key, value = line.split("=", 1) + out[key] = value.strip().strip('"') + return out + + +def _run_text(*cmd: str) -> str: + try: + completed = subprocess.run( + list(cmd), + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + except FileNotFoundError: + return "" + return completed.stdout.strip() + + +def _gpu_rows() -> 
List[Dict[str, str]]: + text = _run_text( + "nvidia-smi", + "--query-gpu=index,name,driver_version,memory.total", + "--format=csv,noheader,nounits", + ) + rows: List[Dict[str, str]] = [] + for line in text.splitlines(): + parts = [part.strip() for part in line.split(",")] + if len(parts) != 4: + continue + rows.append( + { + "index": parts[0], + "name": parts[1], + "driver_version": parts[2], + "memory_total_mib": parts[3], + } + ) + return rows + + +def _python_json(python_bin: Path, code: str) -> Dict[str, Any]: + completed = subprocess.run( + [str(python_bin), "-c", code], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if completed.returncode != 0: + return { + "ok": False, + "stdout": completed.stdout.strip(), + "stderr": completed.stderr.strip(), + } + try: + return {"ok": True, "data": json.loads(completed.stdout)} + except json.JSONDecodeError: + return { + "ok": False, + "stdout": completed.stdout.strip(), + "stderr": completed.stderr.strip(), + } + + +def _package_report(python_bin: Path) -> Dict[str, Any]: + code = """ +import importlib +import json +import os +import sys + +mods = {} +for name in %s: + try: + mod = importlib.import_module(name) + mods[name] = { + "version": getattr(mod, "__version__", None), + "file": getattr(mod, "__file__", None), + } + except Exception as exc: + mods[name] = {"error": repr(exc)} + +payload = { + "python_version": sys.version, + "executable": sys.executable, + "virtual_env": os.environ.get("VIRTUAL_ENV"), + "ld_library_path": os.environ.get("LD_LIBRARY_PATH"), + "packages": mods, +} +print(json.dumps(payload)) +""" % (repr(PACKAGE_NAMES),) + return _python_json(python_bin, code) + + +def _site_package_nvidia_libs(venv_root: Path) -> List[Path]: + libs: List[Path] = [] + for site_packages in sorted((venv_root / "lib").glob("python*/site-packages")): + for lib_dir in sorted((site_packages / "nvidia").glob("*/lib")): + if lib_dir.is_dir(): + libs.append(lib_dir) + return libs + + 
+def _interesting_libs(lib_dir: Path) -> List[str]: + names = [] + for child in sorted(lib_dir.iterdir()): + if not child.is_file(): + continue + name = child.name + if any(token in name for token in ("libcudart", "libnvrtc", "libcudnn", "libcuda")): + names.append(name) + return names + + +def _venv_root(python_bin: Path) -> Path: + return python_bin.parent.parent + + +def _pip_freeze_subset(python_bin: Path) -> List[str]: + text = _run_text(str(python_bin), "-m", "pip", "freeze") + prefixes = ( + "torch", + "vllm", + "transformers", + "nvidia-cuda", + "nvidia-cudnn", + "xformers", + "flash-attn", + ) + lines = [] + for line in text.splitlines(): + normalized = line.strip().lower() + if any(normalized.startswith(prefix) for prefix in prefixes): + lines.append(line.strip()) + return lines + + +def _report(repo_root: Path, python_bin: Path) -> Dict[str, Any]: + os_release = _read_os_release() + venv_root = _venv_root(python_bin) + lib_dirs = _site_package_nvidia_libs(venv_root) + return { + "repo_root": str(repo_root), + "repo_head": _run_text("git", "-C", str(repo_root), "rev-parse", "HEAD"), + "hostname": platform.node(), + "os_release": { + "PRETTY_NAME": os_release.get("PRETTY_NAME"), + "VERSION_ID": os_release.get("VERSION_ID"), + }, + "python_bin": str(python_bin), + "venv_root": str(venv_root), + "gpus": _gpu_rows(), + "python_env": _package_report(python_bin), + "nvidia_lib_dirs": [ + { + "path": str(lib_dir), + "interesting_libs": _interesting_libs(lib_dir), + } + for lib_dir in lib_dirs + ], + "pip_freeze_subset": _pip_freeze_subset(python_bin), + "selected_env": { + "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES"), + "LD_LIBRARY_PATH": os.environ.get("LD_LIBRARY_PATH"), + "VIRTUAL_ENV": os.environ.get("VIRTUAL_ENV"), + }, + } + + +def _print_text(report: Dict[str, Any]) -> None: + print(f"repo_root: {report['repo_root']}") + print(f"repo_head: {report['repo_head']}") + print(f"hostname: {report['hostname']}") + os_release = 
report["os_release"] + print(f"os: {os_release.get('PRETTY_NAME')} (VERSION_ID={os_release.get('VERSION_ID')})") + print(f"python_bin: {report['python_bin']}") + print(f"venv_root: {report['venv_root']}") + print() + print("gpus:") + for row in report["gpus"]: + print( + f" - index={row['index']} name={row['name']} " + f"driver={row['driver_version']} memory_mib={row['memory_total_mib']}" + ) + print() + print("python_env:") + py_env = report["python_env"] + print(f" ok: {py_env.get('ok')}") + if py_env.get("ok"): + data = py_env["data"] + print(f" executable: {data.get('executable')}") + print(f" python_version: {data.get('python_version')}") + print(f" virtual_env: {data.get('virtual_env')}") + print(f" ld_library_path: {data.get('ld_library_path')}") + for name, package in data.get("packages", {}).items(): + print(f" {name}: {package}") + else: + print(f" stdout: {py_env.get('stdout')}") + print(f" stderr: {py_env.get('stderr')}") + print() + print("nvidia_lib_dirs:") + for item in report["nvidia_lib_dirs"]: + print(f" - path: {item['path']}") + for lib in item["interesting_libs"]: + print(f" {lib}") + print() + print("pip_freeze_subset:") + for line in report["pip_freeze_subset"]: + print(f" - {line}") + print() + print("selected_env:") + for key, value in report["selected_env"].items(): + print(f" {key}={value}") + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + repo_root = Path(args.repo_root).expanduser().resolve() + python_bin = _detect_python_bin(repo_root, str(args.python_bin or "")) + report = _report(repo_root, python_bin) + if args.json: + print(json.dumps(report, indent=2, ensure_ascii=False)) + else: + _print_text(report) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/extract_checkpoint_benchmark.py b/src/glossapi/scripts/extract_checkpoint_benchmark.py new file mode 100644 index 0000000..ec5800d --- /dev/null +++ 
b/src/glossapi/scripts/extract_checkpoint_benchmark.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import shutil +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +from glossapi import Corpus + + +HEADER_RE = re.compile(r"(?m)^[ \t]{0,3}#{1,6}\s+\S") + +TUNING_ENV_VARS = ( + "GLOSSAPI_DOCLING_MAX_BATCH_FILES", + "GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", + "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE", + "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE", + "GLOSSAPI_DOCLING_OCR_BATCH_SIZE", + "GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", +) + +TUNING_ARG_TO_ENV = { + "docling_max_batch_files": "GLOSSAPI_DOCLING_MAX_BATCH_FILES", + "docling_batch_target_pages": "GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", + "docling_layout_batch_size": "GLOSSAPI_DOCLING_LAYOUT_BATCH_SIZE", + "docling_table_batch_size": "GLOSSAPI_DOCLING_TABLE_BATCH_SIZE", + "docling_ocr_batch_size": "GLOSSAPI_DOCLING_OCR_BATCH_SIZE", + "docling_page_batch_size": "GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", +} + + +def _runtime_env_snapshot() -> Dict[str, str]: + return {name: os.getenv(name, "") for name in TUNING_ENV_VARS} + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.extract_checkpoint_benchmark", + description=( + "Run a strict Phase-1 extraction benchmark on a fixed PDF set and audit " + "canonical markdown outputs for presence, byte size, header counts, and drift." 
+ ), + ) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--report-path", required=True) + p.add_argument("--baseline-report", default="") + p.add_argument("--phase1-backend", default="docling", choices=["auto", "safe", "docling"]) + p.add_argument("--accel-type", default="CUDA") + p.add_argument("--num-threads", type=int, default=1) + p.add_argument("--use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--devices", nargs="*", type=int, default=None) + p.add_argument("--workers-per-device", type=int, default=1) + p.add_argument("--benchmark-mode", action="store_true") + p.add_argument("--docling-max-batch-files", type=int, default=None) + p.add_argument("--docling-batch-target-pages", type=int, default=None) + p.add_argument("--docling-layout-batch-size", type=int, default=None) + p.add_argument("--docling-table-batch-size", type=int, default=None) + p.add_argument("--docling-ocr-batch-size", type=int, default=None) + p.add_argument("--docling-page-batch-size", type=int, default=None) + p.add_argument("--filenames", nargs="*", default=[]) + p.add_argument("--clean-output-dir", action="store_true") + p.add_argument("--log-level", default="INFO") + return p.parse_args(argv) + + +def _apply_cli_tuning_overrides(args: argparse.Namespace) -> None: + for arg_name, env_name in TUNING_ARG_TO_ENV.items(): + value = getattr(args, arg_name, None) + if value is None: + continue + os.environ[env_name] = str(int(value)) + + +def _count_pdf_pages(pdf_path: Path) -> int: + try: + import fitz + + doc = fitz.open(pdf_path) + try: + return int(doc.page_count) + finally: + doc.close() + except Exception: + pass + + try: + import pypdfium2 as pdfium + + pdf = pdfium.PdfDocument(str(pdf_path)) + try: + return int(len(pdf)) + finally: + try: + pdf.close() + except Exception: + pass + except Exception: + pass + + try: + from pypdf import PdfReader + + return int(len(PdfReader(str(pdf_path)).pages)) + 
except Exception as exc: + try: + from PyPDF2 import PdfReader # type: ignore + + return int(len(PdfReader(str(pdf_path)).pages)) + except Exception as exc2: + raise RuntimeError(f"Unable to count PDF pages for {pdf_path}: {exc2}") from exc2 + + +def _sha256_bytes(data: bytes) -> str: + return hashlib.sha256(data).hexdigest() + + +def _markdown_headers(text: str) -> int: + return int(len(HEADER_RE.findall(text or ""))) + + +def _inventory_markdown(markdown_dir: Path, *, pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]: + inventory: Dict[str, Dict[str, Any]] = {} + for pdf_path in pdf_paths: + stem = pdf_path.stem + md_path = markdown_dir / f"{stem}.md" + present = md_path.exists() + payload = md_path.read_bytes() if present else b"" + text = payload.decode("utf-8") if present else "" + inventory[stem] = { + "filename": pdf_path.name, + "markdown_path": str(md_path), + "present": bool(present), + "byte_size": int(len(payload)), + "header_count": _markdown_headers(text), + "sha256": _sha256_bytes(payload) if present else None, + } + return inventory + + +def _compare_inventory( + current_inventory: Dict[str, Dict[str, Any]], + baseline_inventory: Dict[str, Dict[str, Any]], +) -> Dict[str, Any]: + added = [] + missing = [] + byte_size_changed = [] + header_count_changed = [] + sha_changed = [] + for stem, current in sorted(current_inventory.items()): + baseline = baseline_inventory.get(stem) + if baseline is None: + added.append(stem) + continue + if bool(baseline.get("present")) and not bool(current.get("present")): + missing.append(stem) + if int(baseline.get("byte_size", 0)) != int(current.get("byte_size", 0)): + byte_size_changed.append(stem) + if int(baseline.get("header_count", 0)) != int(current.get("header_count", 0)): + header_count_changed.append(stem) + if baseline.get("sha256") != current.get("sha256"): + sha_changed.append(stem) + for stem, baseline in sorted(baseline_inventory.items()): + if stem in current_inventory: + continue + if 
bool(baseline.get("present")): + missing.append(stem) + return { + "added_markdown": added, + "missing_markdown": sorted(set(missing)), + "byte_size_changed": byte_size_changed, + "header_count_changed": header_count_changed, + "sha_changed": sha_changed, + } + + +def _load_baseline_inventory(path: Path) -> Dict[str, Dict[str, Any]]: + payload = json.loads(path.read_text(encoding="utf-8")) + return dict(payload.get("markdown_inventory") or {}) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + _apply_cli_tuning_overrides(args) + input_dir = Path(args.input_dir).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + report_path = Path(args.report_path).expanduser().resolve() + report_path.parent.mkdir(parents=True, exist_ok=True) + + pdf_paths = sorted(input_dir.glob("*.pdf")) + if args.filenames: + selected = {str(name) for name in args.filenames} + pdf_paths = [path for path in pdf_paths if path.name in selected] + if not pdf_paths: + raise SystemExit(f"No PDF files selected under {input_dir}") + + if bool(args.clean_output_dir) and output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + total_pages = int(sum(_count_pdf_pages(path) for path in pdf_paths)) + start_ts = time.time() + start_perf = time.perf_counter() + + corpus = Corpus(input_dir=input_dir, output_dir=output_dir) + corpus.extract( + input_format="pdf", + accel_type=str(args.accel_type), + num_threads=int(args.num_threads), + phase1_backend=str(args.phase1_backend), + use_gpus=str(args.use_gpus), + devices=list(args.devices) if args.devices else None, + workers_per_device=int(args.workers_per_device), + benchmark_mode=bool(args.benchmark_mode), + filenames=[path.name for path in pdf_paths], + ) + + elapsed_sec = float(time.perf_counter() - start_perf) + end_ts = time.time() + markdown_dir = output_dir / "markdown" + inventory = _inventory_markdown(markdown_dir, pdf_paths=pdf_paths) + 
markdown_present = int(sum(1 for item in inventory.values() if bool(item["present"]))) + + report: Dict[str, Any] = { + "input_dir": str(input_dir), + "output_dir": str(output_dir), + "started_at": int(start_ts), + "finished_at": int(end_ts), + "elapsed_sec": elapsed_sec, + "files_total": int(len(pdf_paths)), + "pages_total": int(total_pages), + "pages_per_sec": (float(total_pages) / elapsed_sec) if elapsed_sec > 0 else None, + "phase1_backend": str(args.phase1_backend), + "accel_type": str(args.accel_type), + "num_threads": int(args.num_threads), + "use_gpus": str(args.use_gpus), + "devices": list(args.devices) if args.devices else [], + "workers_per_device": int(args.workers_per_device), + "benchmark_mode": bool(args.benchmark_mode), + "runtime_env": _runtime_env_snapshot(), + "markdown_present": markdown_present, + "markdown_missing": int(len(pdf_paths) - markdown_present), + "markdown_inventory": inventory, + } + + baseline_raw = str(args.baseline_report or "").strip() + if baseline_raw: + baseline_path = Path(baseline_raw).expanduser().resolve() + if baseline_path.exists(): + report["comparison"] = _compare_inventory( + inventory, + _load_baseline_inventory(baseline_path), + ) + else: + report["comparison_error"] = f"Baseline report not found: {baseline_path}" + + report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print(json.dumps({ + "files_total": report["files_total"], + "pages_total": report["pages_total"], + "elapsed_sec": round(report["elapsed_sec"], 3), + "pages_per_sec": round(report["pages_per_sec"], 4) if report["pages_per_sec"] is not None else None, + "markdown_present": report["markdown_present"], + "markdown_missing": report["markdown_missing"], + "report_path": str(report_path), + }, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/full_pipeline_checkpoint.py b/src/glossapi/scripts/full_pipeline_checkpoint.py 
new file mode 100644 index 0000000..406b8ed --- /dev/null +++ b/src/glossapi/scripts/full_pipeline_checkpoint.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import argparse +import json +import shutil +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd + +from glossapi import Corpus +from glossapi.scripts.extract_checkpoint_benchmark import _apply_cli_tuning_overrides + + +def _parse_int_list(values: Optional[List[int]]) -> List[int]: + return list(values or []) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.full_pipeline_checkpoint", + description=( + "Run a sample GlossAPI pipeline checkpoint from extract through JSONL export " + "and write a compact timing/continuity report." + ), + ) + p.add_argument("--input-dir", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--export-path", required=True) + p.add_argument("--report-path", required=True) + p.add_argument("--clean-output-dir", action="store_true") + p.add_argument("--skip-extract", action="store_true") + p.add_argument("--skip-clean", action="store_true") + p.add_argument("--skip-ocr", action="store_true") + + p.add_argument("--phase1-backend", default="docling", choices=["auto", "safe", "docling"]) + p.add_argument("--accel-type", default="CUDA") + p.add_argument("--num-threads", type=int, default=1) + p.add_argument("--use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--devices", nargs="*", type=int, default=None) + p.add_argument("--workers-per-device", type=int, default=1) + p.add_argument("--benchmark-mode", action="store_true") + p.add_argument("--filenames", nargs="*", default=[]) + p.add_argument("--drop-bad", action="store_true") + + p.add_argument("--docling-max-batch-files", type=int, default=None) + p.add_argument("--docling-batch-target-pages", type=int, default=None) + 
p.add_argument("--docling-layout-batch-size", type=int, default=None) + p.add_argument("--docling-table-batch-size", type=int, default=None) + p.add_argument("--docling-ocr-batch-size", type=int, default=None) + p.add_argument("--docling-page-batch-size", type=int, default=None) + + p.add_argument("--ocr-backend", default="deepseek") + p.add_argument("--ocr-runtime-backend", default="vllm") + p.add_argument("--ocr-use-gpus", default="single", choices=["single", "multi"]) + p.add_argument("--ocr-devices", nargs="*", type=int, default=None) + p.add_argument("--ocr-workers-per-gpu", type=int, default=1) + p.add_argument("--ocr-vllm-batch-size", type=int, default=None) + p.add_argument("--ocr-repair-exec-batch-target-pages", type=int, default=None) + p.add_argument("--ocr-repair-exec-batch-target-items", type=int, default=None) + p.add_argument("--ocr-target-batch-pages", type=int, default=160) + p.add_argument("--ocr-render-dpi", type=int, default=None) + p.add_argument("--ocr-scheduler", default="auto") + p.add_argument("--ocr-math-enhance", action="store_true") + + p.add_argument("--text-key", default="text") + p.add_argument("--metadata-key", default="pipeline_metadata") + return p.parse_args(argv) + + +def _read_metadata_counts(parquet_path: Path) -> Dict[str, int]: + if not parquet_path.exists(): + return { + "rows_total": 0, + "needs_ocr_true": 0, + "ocr_success_true": 0, + "text_nonempty": 0, + } + df = pd.read_parquet(parquet_path) + if df.empty: + return { + "rows_total": 0, + "needs_ocr_true": 0, + "ocr_success_true": 0, + "text_nonempty": 0, + } + text_series = df["text"] if "text" in df.columns else pd.Series([], dtype=object) + text_nonempty = int( + sum(bool(str(value).strip()) for value in text_series.fillna("").tolist()) + ) if len(text_series) else 0 + needs_ocr_true = int(df["needs_ocr"].fillna(False).astype(bool).sum()) if "needs_ocr" in df.columns else 0 + ocr_success_true = int(df["ocr_success"].fillna(False).astype(bool).sum()) if "ocr_success" 
in df.columns else 0 + return { + "rows_total": int(len(df)), + "needs_ocr_true": needs_ocr_true, + "ocr_success_true": ocr_success_true, + "text_nonempty": text_nonempty, + } + + +def _count_jsonl_records(path: Path) -> int: + if not path.exists(): + return 0 + with path.open("r", encoding="utf-8") as fp: + return sum(1 for line in fp if line.strip()) + + +def _export_jsonl_with_retry( + corpus: Corpus, + *, + export_path: Path, + metadata_path: Path, + text_key: str, + metadata_key: str, + post_ocr_counts: Dict[str, int], + max_attempts: int = 4, + retry_delay_sec: float = 1.0, +) -> int: + needs_retry = int(post_ocr_counts.get("text_nonempty", 0) or 0) > 0 + attempts = max_attempts if needs_retry else 1 + + for attempt in range(attempts): + if export_path.exists(): + export_path.unlink() + corpus.jsonl( + export_path, + text_key=text_key, + metadata_key=metadata_key, + include_remaining_metadata=False, + metadata_path=metadata_path, + ) + export_records = _count_jsonl_records(export_path) + if export_records > 0 or attempt == attempts - 1: + return export_records + time.sleep(retry_delay_sec) + return 0 + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + _apply_cli_tuning_overrides(args) + + input_dir = Path(args.input_dir).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + export_path = Path(args.export_path).expanduser().resolve() + report_path = Path(args.report_path).expanduser().resolve() + + if bool(args.clean_output_dir) and output_dir.exists(): + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + report_path.parent.mkdir(parents=True, exist_ok=True) + export_path.parent.mkdir(parents=True, exist_ok=True) + + corpus = Corpus(input_dir=input_dir, output_dir=output_dir) + metadata_path = output_dir / "download_results" / "download_results.parquet" + + started_at = time.time() + skipped_phases: List[str] = [] + + if bool(args.skip_extract): + 
skipped_phases.append("extract") + extract_elapsed = 0.0 + else: + extract_start = time.perf_counter() + corpus.extract( + input_format="pdf", + accel_type=str(args.accel_type), + num_threads=int(args.num_threads), + phase1_backend=str(args.phase1_backend), + use_gpus=str(args.use_gpus), + devices=_parse_int_list(args.devices), + workers_per_device=int(args.workers_per_device), + benchmark_mode=bool(args.benchmark_mode), + filenames=list(args.filenames or []), + ) + extract_elapsed = float(time.perf_counter() - extract_start) + post_extract_counts = _read_metadata_counts(metadata_path) + + if bool(args.skip_clean): + skipped_phases.append("clean") + clean_elapsed = 0.0 + else: + clean_start = time.perf_counter() + corpus.clean(drop_bad=bool(args.drop_bad)) + clean_elapsed = float(time.perf_counter() - clean_start) + post_clean_counts = _read_metadata_counts(metadata_path) + + if bool(args.skip_ocr): + skipped_phases.append("ocr") + ocr_elapsed = 0.0 + else: + ocr_start = time.perf_counter() + corpus.ocr( + backend=str(args.ocr_backend), + runtime_backend=str(args.ocr_runtime_backend), + use_gpus=str(args.ocr_use_gpus), + devices=_parse_int_list(args.ocr_devices), + workers_per_gpu=int(args.ocr_workers_per_gpu), + vllm_batch_size=args.ocr_vllm_batch_size, + repair_exec_batch_target_pages=args.ocr_repair_exec_batch_target_pages, + repair_exec_batch_target_items=args.ocr_repair_exec_batch_target_items, + target_batch_pages=int(args.ocr_target_batch_pages), + render_dpi=args.ocr_render_dpi, + scheduler=str(args.ocr_scheduler), + math_enhance=bool(args.ocr_math_enhance), + ) + ocr_elapsed = float(time.perf_counter() - ocr_start) + post_ocr_counts = _read_metadata_counts(metadata_path) + + export_start = time.perf_counter() + export_records = _export_jsonl_with_retry( + corpus, + export_path=export_path, + metadata_path=metadata_path, + text_key=str(args.text_key), + metadata_key=str(args.metadata_key), + post_ocr_counts=post_ocr_counts, + ) + export_elapsed = 
float(time.perf_counter() - export_start) + + finished_at = time.time() + report: Dict[str, Any] = { + "input_dir": str(input_dir), + "output_dir": str(output_dir), + "export_path": str(export_path), + "metadata_path": str(metadata_path), + "started_at": int(started_at), + "finished_at": int(finished_at), + "elapsed_total_sec": float(finished_at - started_at), + "skipped_phases": list(skipped_phases), + "extract_elapsed_sec": extract_elapsed, + "clean_elapsed_sec": clean_elapsed, + "ocr_elapsed_sec": ocr_elapsed, + "export_elapsed_sec": export_elapsed, + "post_extract_counts": post_extract_counts, + "post_clean_counts": post_clean_counts, + "post_ocr_counts": post_ocr_counts, + "export_records": int(export_records), + } + report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print( + json.dumps( + { + "extract_elapsed_sec": round(extract_elapsed, 3), + "clean_elapsed_sec": round(clean_elapsed, 3), + "ocr_elapsed_sec": round(ocr_elapsed, 3), + "export_elapsed_sec": round(export_elapsed, 3), + "rows_total": post_ocr_counts["rows_total"], + "needs_ocr_after_clean": post_clean_counts["needs_ocr_true"], + "ocr_success_after_ocr": post_ocr_counts["ocr_success_true"], + "export_records": int(export_records), + "report_path": str(report_path), + }, + indent=2, + sort_keys=True, + ) + ) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/install_glossapi.py b/src/glossapi/scripts/install_glossapi.py new file mode 100644 index 0000000..195d662 --- /dev/null +++ b/src/glossapi/scripts/install_glossapi.py @@ -0,0 +1,230 @@ +"""Guided installer for GlossAPI extras.""" + +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Set + + +PHASE_TO_EXTRAS: Dict[str, Set[str]] = { + 
"download": set(), + "browser_download": {"browser"}, + "extract": {"docling"}, + "ocr": set(), + "docs": {"docs"}, +} + + +@dataclass(frozen=True) +class InstallPlan: + phases: tuple[str, ...] + extras: tuple[str, ...] + editable: bool + include_cuda: bool + needs_deepseek_runtime: bool + + +def _supports_color() -> bool: + return sys.stdout.isatty() and os.environ.get("TERM") not in {"", "dumb", None} + + +def _style(text: str, code: str) -> str: + if not _supports_color(): + return text + return f"\033[{code}m{text}\033[0m" + + +def _prompt_yes_no(question: str, default: bool = False) -> bool: + suffix = "[Y/n]" if default else "[y/N]" + while True: + raw = input(f"{question} {suffix} ").strip().lower() + if not raw: + return default + if raw in {"y", "yes"}: + return True + if raw in {"n", "no"}: + return False + print("Please answer 'y' or 'n'.") + + +def _resolve_phase_selection(tokens: Iterable[str]) -> List[str]: + resolved: List[str] = [] + seen: Set[str] = set() + for token in tokens: + phase = str(token).strip().lower() + if not phase: + continue + if phase not in PHASE_TO_EXTRAS: + raise ValueError(f"Unsupported phase '{token}'. Valid phases: {', '.join(sorted(PHASE_TO_EXTRAS))}") + if phase not in seen: + seen.add(phase) + resolved.append(phase) + return resolved + + +def build_install_plan( + *, + phases: Sequence[str], + editable: bool, + include_cuda: bool, +) -> InstallPlan: + selected = _resolve_phase_selection(phases) + extras: Set[str] = set() + for phase in selected: + extras.update(PHASE_TO_EXTRAS[phase]) + if include_cuda: + extras.add("cuda") + return InstallPlan( + phases=tuple(selected), + extras=tuple(sorted(extras)), + editable=bool(editable), + include_cuda=bool(include_cuda), + needs_deepseek_runtime=("ocr" in selected), + ) + + +def build_pip_command(plan: InstallPlan, repo_root: Path) -> List[str]: + target = "." 
+ if plan.extras: + target = f".[{','.join(plan.extras)}]" + cmd = [sys.executable, "-m", "pip", "install"] + if plan.editable: + cmd.append("-e") + cmd.append(target) + return cmd + + +def build_deepseek_command(repo_root: Path) -> Optional[List[str]]: + script = repo_root / "dependency_setup" / "setup_deepseek_uv.sh" + if not script.exists(): + return None + shell = shutil.which("bash") or shutil.which("sh") + if not shell: + return None + return [shell, str(script)] + + +def _interactive_plan(default_editable: bool) -> InstallPlan: + print(_style("GlossAPI Installer", "1;36")) + print("Select only the phases you plan to use so optional dependencies stay minimal.\n") + + selected: List[str] = ["download"] + print(_style("Core", "1;37")) + print(" download: base downloader/data pipeline dependencies") + if _prompt_yes_no("Add browser-gated download support?", default=False): + selected.append("browser_download") + if _prompt_yes_no("Add extraction support (Docling)?", default=False): + selected.append("extract") + if _prompt_yes_no("Add OCR support (DeepSeek backend)?", default=False): + selected.append("ocr") + if _prompt_yes_no("Add docs tooling?", default=False): + selected.append("docs") + include_cuda = _prompt_yes_no("Include CUDA extras where relevant?", default=False) + editable = _prompt_yes_no("Install in editable mode?", default=default_editable) + return build_install_plan(phases=selected, editable=editable, include_cuda=include_cuda) + + +def _plan_summary(plan: InstallPlan, command: Sequence[str]) -> str: + extras = ", ".join(plan.extras) if plan.extras else "(none)" + phases = ", ".join(plan.phases) if plan.phases else "(none)" + return "\n".join( + [ + _style("Install plan", "1;32"), + f" phases: {phases}", + f" extras: {extras}", + f" editable: {'yes' if plan.editable else 'no'}", + f" command: {shlex.join(command)}", + f" deepseek runtime: {'separate setup required' if plan.needs_deepseek_runtime else 'not requested'}", + ] + ) + + +def 
build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="python install_glossapi.py", + description="Guided installer for GlossAPI optional dependency groups.", + ) + parser.add_argument( + "--phases", + default="", + help=( + "Comma-separated phases to install. Valid values: " + + ", ".join(sorted(PHASE_TO_EXTRAS)) + + ". If omitted, an interactive wizard is shown." + ), + ) + parser.add_argument( + "--cuda", + action="store_true", + help="Include the CUDA extra.", + ) + parser.add_argument( + "--editable", + dest="editable", + action="store_true", + help="Install in editable mode.", + ) + parser.add_argument( + "--no-editable", + dest="editable", + action="store_false", + help="Install as a regular package.", + ) + parser.set_defaults(editable=True) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the computed pip command without running it.", + ) + parser.add_argument( + "--yes", + action="store_true", + help="Skip confirmation prompts in non-interactive mode.", + ) + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + args = build_arg_parser().parse_args(argv) + repo_root = Path(__file__).resolve().parents[3] + + if args.phases.strip(): + plan = build_install_plan( + phases=[token for token in args.phases.split(",") if token.strip()], + editable=args.editable, + include_cuda=bool(args.cuda), + ) + else: + plan = _interactive_plan(default_editable=bool(args.editable)) + + command = build_pip_command(plan, repo_root) + print(_plan_summary(plan, command)) + deepseek_command = build_deepseek_command(repo_root) if plan.needs_deepseek_runtime else None + if deepseek_command: + print(f" deepseek command: {shlex.join(deepseek_command)}") + + if args.dry_run: + return 0 + if not args.yes and not args.phases.strip(): + if not _prompt_yes_no("Run this install command now?", default=True): + print("Aborted.") + return 1 + + completed = subprocess.run(command, cwd=repo_root) + if 
completed.returncode != 0: + return int(completed.returncode) + if plan.needs_deepseek_runtime and deepseek_command: + print(_style("Provisioning dedicated DeepSeek runtime…", "1;33")) + completed = subprocess.run(deepseek_command, cwd=repo_root) + return int(completed.returncode) + + +if __name__ == "__main__": # pragma: no cover - CLI entrypoint + raise SystemExit(main()) diff --git a/src/glossapi/scripts/ocr_gpu_batch.py b/src/glossapi/scripts/ocr_gpu_batch.py index 2183664..2646baa 100644 --- a/src/glossapi/scripts/ocr_gpu_batch.py +++ b/src/glossapi/scripts/ocr_gpu_batch.py @@ -115,15 +115,21 @@ def main(argv: Optional[List[str]] = None) -> int: "--force-ocr", dest="force_ocr", action="store_true", - help="Force GPU OCR during extraction (default).", + help="Deprecated no-op retained for compatibility; OCR now runs through Corpus.ocr(...).", ) parser.add_argument( "--no-force-ocr", dest="force_ocr", action="store_false", - help="Skip forced OCR (only run math/layout).", + help="Explicitly disable the deprecated Phase-1 OCR flag.", + ) + parser.set_defaults(force_ocr=False) + parser.add_argument( + "--workers-per-device", + type=int, + default=1, + help="Number of extraction workers to bind to each visible GPU (default: 1).", ) - parser.set_defaults(force_ocr=True) parser.add_argument( "--dry-run", action="store_true", @@ -182,6 +188,7 @@ def main(argv: Optional[List[str]] = None) -> int: export_doc_json=True, emit_formula_index=emit_formula_index, phase1_backend=args.phase1_backend, + workers_per_device=max(1, int(args.workers_per_device)), ) print("[ocr_gpu_batch] Extraction complete.") @@ -190,4 +197,3 @@ def main(argv: Optional[List[str]] = None) -> int: if __name__ == "__main__": # pragma: no cover - CLI entrypoint raise SystemExit(main()) - diff --git a/src/glossapi/scripts/openarchives_download_freeze.py b/src/glossapi/scripts/openarchives_download_freeze.py new file mode 100644 index 0000000..e358781 --- /dev/null +++ 
b/src/glossapi/scripts/openarchives_download_freeze.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import argparse +import logging +from pathlib import Path +from typing import List, Optional + +from glossapi import Corpus +from glossapi.scripts.openarchives_ocr_run_node import ( + DEFAULT_DOWNLOAD_CONCURRENCY, + DEFAULT_DOWNLOAD_TIMEOUT, + _load_frame, + _normalize_download_results, + _prepare_download_input, + _write_canonical_metadata, +) + + +def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_freeze", + description=( + "Materialize one OpenArchives manifest into a canonical GlossAPI downloads root " + "without starting OCR. This is the reproducible PDF-freeze entrypoint." + ), + ) + p.add_argument("--input-parquet", required=True) + p.add_argument("--work-root", required=True) + p.add_argument("--python-log-level", default="INFO") + p.add_argument("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY) + p.add_argument("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT) + p.add_argument("--download-mode", default="auto") + p.add_argument("--download-scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--download-policy-file", default="") + p.add_argument("--supported-formats", default="pdf") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def main(argv: Optional[List[str]] = None) -> int: + args = _parse_args(argv) + input_path = Path(args.input_parquet).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + work_root.mkdir(parents=True, exist_ok=True) + manifests_dir = work_root / "manifests" + manifests_dir.mkdir(parents=True, exist_ok=True) + + manifest_df = _prepare_download_input(_load_frame(input_path)) + download_input = manifests_dir / "download_input.parquet" + 
manifest_df.to_parquet(download_input, index=False) + + metadata_path = work_root / "download_results" / "download_results.parquet" + if not metadata_path.exists(): + metadata_path.parent.mkdir(parents=True, exist_ok=True) + _write_canonical_metadata(work_root, manifest_df) + + if args.dry_run: + return 0 + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + download_mode=str(args.download_mode), + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), + request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), + supported_formats=[part.strip() for part in str(args.supported_formats).split(",") if part.strip()], + download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), + ) + canonical_df = _normalize_download_results(shard_df=manifest_df, download_results_df=dl_df, url_column="url") + _write_canonical_metadata(work_root, canonical_df) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_download_probe.py b/src/glossapi/scripts/openarchives_download_probe.py new file mode 100644 index 0000000..d253b9b --- /dev/null +++ b/src/glossapi/scripts/openarchives_download_probe.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable, Optional +from urllib.parse import urlparse + +import pandas as pd + +from glossapi import Corpus + + +def _parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_download_probe", + description=( + "Sample OpenArchives 
OCR-target PDFs by host, run a controlled download probe, " + "and write per-host success summaries." + ), + ) + p.add_argument("--parquet", required=True, help="needs_ocr_enriched parquet with pdf_url and filename columns") + p.add_argument("--output-dir", required=True) + p.add_argument("--policy-file", default="") + p.add_argument("--samples-per-host", type=int, default=12) + p.add_argument("--max-hosts", type=int, default=12) + p.add_argument("--seed", type=int, default=42) + p.add_argument("--concurrency", type=int, default=12) + p.add_argument("--request-timeout", type=int, default=60) + p.add_argument("--scheduler-mode", default="per_domain") + p.add_argument("--download-group-by", default="base_domain") + p.add_argument("--hosts", nargs="*", default=None, help="Optional explicit host allowlist") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _host_from_url(url: str) -> str: + try: + return (urlparse(str(url)).hostname or "").lower() + except Exception: + return "" + + +def _prepare_probe_frame( + df: pd.DataFrame, + *, + samples_per_host: int, + max_hosts: int, + seed: int, + hosts: Optional[Iterable[str]] = None, +) -> pd.DataFrame: + frame = df.copy() + if "pdf_url" not in frame.columns or "filename" not in frame.columns: + raise SystemExit("Probe parquet must include at least 'pdf_url' and 'filename' columns") + frame["host"] = frame["pdf_url"].astype(str).map(_host_from_url) + frame = frame[frame["host"].astype(bool)].copy() + if hosts: + allowed = {str(h).strip().lower() for h in hosts if str(h).strip()} + frame = frame[frame["host"].isin(allowed)].copy() + ranked_hosts = ( + frame.groupby("host", dropna=False) + .size() + .sort_values(ascending=False) + .head(max(1, int(max_hosts))) + .index.tolist() + ) + probe = frame[frame["host"].isin(ranked_hosts)].copy() + sampled = ( + probe.groupby("host", group_keys=True) + .apply( + lambda grp: grp.sample(n=min(len(grp), int(samples_per_host)), 
random_state=int(seed)), + include_groups=False, + ) + .reset_index(level=0) + .reset_index(drop=True) + ) + sampled["url"] = sampled["pdf_url"].astype(str) + sampled["base_domain"] = sampled["pdf_url"].astype(str).map( + lambda s: f"{urlparse(str(s)).scheme or 'https'}://{(urlparse(str(s)).netloc or '').lower()}".rstrip("/") + if _host_from_url(str(s)) + else "" + ) + return sampled + + +def _summary_payload(df: pd.DataFrame, *, source_rows: int) -> dict: + out = df.copy() + if "download_success" not in out.columns: + out["download_success"] = False + grouped = ( + out.groupby("host", dropna=False) + .agg( + docs=("host", "size"), + successes=("download_success", lambda s: int(pd.Series(s).fillna(False).sum())), + failures=("download_success", lambda s: int((~pd.Series(s).fillna(False)).sum())), + ) + .reset_index() + .sort_values(["docs", "successes"], ascending=[False, False]) + ) + return { + "source_rows": int(source_rows), + "probe_rows": int(len(out)), + "hosts": grouped.to_dict(orient="records"), + } + + +def main(argv: Optional[list[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + source_df = pd.read_parquet(parquet_path) + probe_df = _prepare_probe_frame( + source_df, + samples_per_host=int(args.samples_per_host), + max_hosts=int(args.max_hosts), + seed=int(args.seed), + hosts=args.hosts, + ) + probe_input = output_dir / "probe_input.parquet" + probe_df.to_parquet(probe_input, index=False) + + if args.dry_run: + summary = _summary_payload(probe_df, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + corpus = Corpus( + input_dir=output_dir / "downloads", + output_dir=output_dir, + log_level="INFO", + verbose=False, + ) + results = corpus.download( + input_parquet=probe_input, + 
links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.concurrency), + request_timeout=int(args.request_timeout), + scheduler_mode=str(args.scheduler_mode), + download_policy_file=(str(args.policy_file) if str(args.policy_file or "").strip() else None), + ) + merged = results.merge( + probe_df[["url", "host", "filename"]], + on="url", + how="left", + suffixes=("", "_probe"), + ) + merged_path = output_dir / "probe_results.parquet" + merged.to_parquet(merged_path, index=False) + summary = _summary_payload(merged, source_rows=len(source_df)) + (output_dir / "probe_summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_hf_refresh.py b/src/glossapi/scripts/openarchives_hf_refresh.py new file mode 100644 index 0000000..133852f --- /dev/null +++ b/src/glossapi/scripts/openarchives_hf_refresh.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import argparse +import io +import json +import re +from pathlib import Path +from typing import Dict, Iterable, Optional, Sequence + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import _resolve_jsonl_path + + +PIPELINE_FIELDS = ( + "greek_badness_score", + "mojibake_badness_score", + "latin_percentage", + "polytonic_ratio", + "char_count_no_comments", + "is_empty", + "filter", + "needs_ocr", + "ocr_success", + "quality_method", + "reevaluated_at", +) + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_hf_refresh", + description=( + "Refresh the canonical OpenArchives HF jsonl.zst shards in place from a refreshed " + "document-level parquet and update the dataset card counts." 
+ ), + ) + p.add_argument("--dataset-root", required=True, help="Local clone/snapshot root of the HF dataset repo.") + p.add_argument("--metadata-parquet", required=True, help="Refreshed document-level parquet with source_jsonl/doc ids.") + p.add_argument("--output-root", default="", help="Optional separate output root. Defaults to in-place dataset-root.") + p.add_argument("--readme-path", default="README.md", help="Dataset card path relative to dataset-root/output-root.") + p.add_argument("--dry-run", action="store_true") + return p.parse_args(argv) + + +def _normalize_source_key(dataset_root: Path, recorded_path: str) -> str: + resolved = _resolve_jsonl_path(dataset_root, recorded_path) + return str(resolved.relative_to(dataset_root)) + + +def _clean_value(value: object) -> object: + if pd.isna(value): # type: ignore[arg-type] + return None + if isinstance(value, pd.Timestamp): + return value.isoformat() + if hasattr(value, "item"): + try: + return value.item() + except Exception: + return value + return value + + +def _build_update_index(metadata_df: pd.DataFrame, *, dataset_root: Path) -> Dict[str, Dict[str, dict]]: + required = {"source_doc_id", "source_jsonl"} + missing = sorted(required - set(metadata_df.columns)) + if missing: + raise SystemExit(f"Metadata parquet missing required column(s): {', '.join(missing)}") + updates: Dict[str, Dict[str, dict]] = {} + work = metadata_df.copy() + work["_source_key"] = work["source_jsonl"].astype(str).map(lambda p: _normalize_source_key(dataset_root, p)) + for _, row in work.iterrows(): + source_key = str(row["_source_key"]) + doc_id = str(row["source_doc_id"] or "") + payload = {field: _clean_value(row[field]) for field in PIPELINE_FIELDS if field in row.index} + updates.setdefault(source_key, {})[doc_id] = payload + return updates + + +def _iter_jsonl_rows(path: Path) -> Iterable[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text_reader = 
io.TextIOWrapper(reader, encoding="utf-8") + for line in text_reader: + yield json.loads(line) + + +def _write_jsonl_rows(path: Path, rows: Iterable[dict]) -> int: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + count = 0 + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for row in rows: + payload = (json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8") + writer.write(payload) + count += 1 + return count + + +def _refresh_readme(readme_text: str, *, total_docs: int, needs_ocr_docs: int) -> str: + title_text = f"OpenArchives.gr {total_docs:,} docs".replace(",", ",") + percent = (100.0 * needs_ocr_docs / total_docs) if total_docs else 0.0 + pct_text = f"{percent:.2f}%" + + replacements = [ + (r"pretty_name:\s*OpenArchives\.gr [^\n]+", f"pretty_name: {title_text}"), + (r"# OpenArchives\.gr [^\n]+", f"# {title_text}"), + ( + r"- Σύνολο markdown αρχείων: \*\*[0-9,]+\*\* (?:from|από) openarchives\.gr", + f"- Σύνολο markdown αρχείων: **{total_docs:,}** από openarchives.gr", + ), + ( + r"- Total markdown files: \*\*[0-9,]+\*\* from openarchives\.gr", + f"- Total markdown files: **{total_docs:,}** from openarchives.gr", + ), + ( + r"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ( + r"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: \*\*[0-9,]+ / [0-9,]+ \([0-9.]+%\)\*\*", + f"- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **{needs_ocr_docs:,} / {total_docs:,} ({pct_text})**", + ), + ] + updated = readme_text + for pattern, replacement in replacements: + updated = re.sub(pattern, replacement, updated) + return updated + + 
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Refresh the HF jsonl.zst shards and dataset card from a metadata parquet.

    Every shard referenced by the metadata parquet is rewritten (unless
    ``--dry-run`` is set) with the refreshed pipeline fields, the dataset
    card counts are updated, and a JSON summary is printed to stdout.

    Args:
        argv: Command-line arguments; ``None`` lets argparse use ``sys.argv``.

    Returns:
        Process exit code (always ``0``; failures surface as exceptions).
    """
    args = _parse_args(argv)
    dataset_root = Path(args.dataset_root).expanduser().resolve()
    if str(args.output_root).strip():
        output_root = Path(args.output_root).expanduser().resolve()
    else:
        # No separate output root: shards are refreshed in place.
        output_root = dataset_root
    output_root.mkdir(parents=True, exist_ok=True)
    metadata_path = Path(args.metadata_parquet).expanduser().resolve()

    metadata_df = pd.read_parquet(metadata_path).copy()
    updates_by_shard = _build_update_index(metadata_df, dataset_root=dataset_root)

    # ``needs_ocr`` is optional in the metadata parquet; compute its count
    # once so the README refresh and the summary agree and neither raises
    # KeyError when the column is missing.
    metadata_needs_ocr: Optional[int] = None
    if "needs_ocr" in metadata_df.columns:
        metadata_needs_ocr = int(metadata_df["needs_ocr"].fillna(False).sum())

    summaries: list[dict] = []
    total_rows = 0
    matched_rows = 0
    needs_ocr_rows = 0
    for rel_key, updates in sorted(updates_by_shard.items()):
        shard_summary = _refresh_shard(
            input_path=dataset_root / rel_key,
            output_path=output_root / rel_key,
            updates=updates,
            dry_run=bool(args.dry_run),
        )
        summaries.append(shard_summary)
        total_rows += int(shard_summary["total_rows"])
        matched_rows += int(shard_summary["matched_rows"])
        needs_ocr_rows += int(shard_summary["needs_ocr_rows"])

    readme_rel = Path(args.readme_path)
    readme_in = dataset_root / readme_rel
    readme_out = output_root / readme_rel
    if readme_in.exists() and not args.dry_run:
        readme_text = readme_in.read_text(encoding="utf-8")
        # Without a metadata count, fall back to what the refreshed shards
        # actually contain.
        needs_ocr_docs = metadata_needs_ocr if metadata_needs_ocr is not None else needs_ocr_rows
        readme_out.parent.mkdir(parents=True, exist_ok=True)
        readme_out.write_text(
            _refresh_readme(readme_text, total_docs=matched_rows, needs_ocr_docs=needs_ocr_docs),
            encoding="utf-8",
        )

    summary = {
        "dataset_root": str(dataset_root),
        "output_root": str(output_root),
        "metadata_parquet": str(metadata_path),
        "shards_touched": len(summaries),
        "total_rows_seen": total_rows,
        "matched_rows": matched_rows,
        "unmatched_rows": total_rows - matched_rows,
        "needs_ocr_rows_after_refresh": needs_ocr_rows,
        "metadata_rows": int(len(metadata_df)),
        "metadata_needs_ocr_rows": metadata_needs_ocr,
        "sample_shards": summaries[:5],
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
glossapi.scripts.openarchives_ocr_cutoff_shards", + description=( + "Build OCR shard manifests from the materialized local PDFs available at a cutoff, " + "plus residual manifests for missing OCR targets." + ), + ) + p.add_argument("--parquet", required=True) + p.add_argument("--output-dir", required=True) + p.add_argument("--local-download-root", action="append", default=[]) + p.add_argument("--nodes", type=int, default=4) + p.add_argument("--pages-per-hour-per-node", type=float, default=50700.0) + p.add_argument("--filename-column", default="filename") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument("--page-column", default=None) + p.add_argument("--allow-threshold-derive", action="store_true") + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + p.add_argument("--key-column", default="source_doc_id") + p.add_argument("--cutoff-id", default="") + return p.parse_args(argv) + + +def _canonical_stem_from_row(row: pd.Series, filename_column: str) -> str: + if "filename_base" in row.index and str(row.get("filename_base") or "").strip(): + return str(row.get("filename_base")).strip() + return Path(str(row.get(filename_column) or "")).stem + + +def _scan_local_pdfs(roots: Sequence[Path]) -> Dict[str, Tuple[Path, Path]]: + available: Dict[str, Tuple[Path, Path]] = {} + for root in roots: + root = root.expanduser().resolve() + if not root.exists(): + continue + for pdf in sorted(p for p in root.rglob("*.pdf") if p.is_file()): + stem = pdf.stem + if stem not in available: + available[stem] = (root, pdf) + return available + + +def _stable_item_id(cutoff_id: str, key_value: str, stem: str) -> str: + payload = f"{cutoff_id}|{key_value}|{stem}" + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def main(argv: Optional[Sequence[str]] = None) -> int: + args = _parse_args(argv) + parquet_path = Path(args.parquet).expanduser().resolve() + output_dir = 
Path(args.output_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + local_roots = [Path(p).expanduser().resolve() for p in (args.local_download_root or [])] + if not local_roots: + raise SystemExit("Pass at least one --local-download-root.") + + df = pd.read_parquet(parquet_path).copy() + if args.filename_column not in df.columns: + raise SystemExit(f"Filename column '{args.filename_column}' not found in parquet.") + + page_column = _resolve_page_column(df, args.page_column) + target_mask = _resolve_targets( + df, + needs_ocr_column=str(args.needs_ocr_column), + allow_threshold_derive=bool(args.allow_threshold_derive), + greek_threshold=float(args.greek_threshold), + mojibake_threshold=float(args.mojibake_threshold), + ) + target_df = df.loc[target_mask].copy() + if target_df.empty: + raise SystemExit("No OCR target rows selected at cutoff.") + + cutoff_id = str(args.cutoff_id or pd.Timestamp.utcnow().strftime("%Y%m%dT%H%M%SZ")) + target_df["filename_base"] = target_df.apply( + lambda row: _canonical_stem_from_row(row, str(args.filename_column)), + axis=1, + ) + available = _scan_local_pdfs(local_roots) + + rows_available: List[Dict[str, object]] = [] + rows_missing: List[Dict[str, object]] = [] + key_column = str(args.key_column) + preserve_columns = [c for c in target_df.columns if c not in {"filename_base"}] + + for row in target_df.to_dict(orient="records"): + stem = str(row.get("filename_base") or "") + key_value = str(row.get(key_column) or stem or row.get(args.filename_column) or "") + base = {col: row.get(col) for col in preserve_columns} + item_id = _stable_item_id(cutoff_id, key_value, stem) + if stem in available: + root, pdf_path = available[stem] + rel_path = pdf_path.relative_to(root) + out = dict(base) + out["source_filename"] = str(row.get(args.filename_column) or "") + out["filename"] = pdf_path.name + out["md_filename"] = f"{stem}.md" + out["filename_base"] = stem + out["ocr_item_id"] = item_id + out["ocr_cutoff_id"] = 
cutoff_id + out["local_pdf_path"] = str(pdf_path) + out["local_pdf_root"] = str(root) + out["local_pdf_relpath"] = str(rel_path) + out["available_at_cutoff"] = True + rows_available.append(out) + else: + out = dict(base) + out["filename_base"] = stem + out["ocr_item_id"] = item_id + out["ocr_cutoff_id"] = cutoff_id + out["available_at_cutoff"] = False + rows_missing.append(out) + + available_df = pd.DataFrame(rows_available) + missing_df = pd.DataFrame(rows_missing) + available_path = output_dir / "openarchives_ocr_available_at_cutoff.parquet" + missing_path = output_dir / "openarchives_ocr_missing_at_cutoff.parquet" + if not available_df.empty: + bins = _assign_rows(available_df, page_column=page_column, node_count=int(args.nodes)) + else: + bins = [] + + summaries: List[Dict[str, object]] = [] + total_pages = 0 + total_docs = 0 + for node in bins: + node_id = int(node["node_id"]) + node_df = pd.DataFrame(list(node["rows"])) + if "_pages_int" in node_df.columns: + node_df = node_df.drop(columns=["_pages_int"]) + node_df["shard_id"] = f"node-{node_id:02d}" + node_df["node_id"] = node_id + out_path = output_dir / f"openarchives_ocr_shard_node_{node_id:02d}.parquet" + node_df.to_parquet(out_path, index=False) + node_pages = int(node["pages_total"]) + node_docs = int(node["docs_total"]) + total_pages += node_pages + total_docs += node_docs + summaries.append( + { + "node_id": node_id, + "manifest_path": str(out_path), + "docs_total": node_docs, + "pages_total": node_pages, + "eta_hours_at_validated_speed": float(node_pages / float(args.pages_per_hour_per_node)), + } + ) + + available_df.to_parquet(available_path, index=False) + missing_df.to_parquet(missing_path, index=False) + overall = { + "source_parquet": str(parquet_path), + "cutoff_id": cutoff_id, + "nodes": int(args.nodes), + "key_column": key_column, + "filename_column": str(args.filename_column), + "page_column": str(page_column), + "available_docs_total": int(len(available_df)), + "available_pages_total": 
int(total_pages), + "missing_docs_total": int(len(missing_df)), + "missing_pages_total": int(pd.to_numeric(missing_df.get(page_column, pd.Series(dtype=float)), errors="coerce").fillna(0).sum()) if not missing_df.empty else 0, + "pages_per_hour_per_node": float(args.pages_per_hour_per_node), + "eta_hours_one_node": float(total_pages / float(args.pages_per_hour_per_node)) if total_pages else 0.0, + "eta_hours_all_nodes": float(total_pages / (float(args.pages_per_hour_per_node) * max(1, int(args.nodes)))) if total_pages else 0.0, + "available_manifest_path": str(available_path), + "missing_manifest_path": str(missing_path), + "node_summaries": summaries, + } + (output_dir / "openarchives_ocr_cutoff_summary.json").write_text(json.dumps(overall, indent=2), encoding="utf-8") + print(json.dumps(overall, indent=2)) + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_enrich.py b/src/glossapi/scripts/openarchives_ocr_enrich.py new file mode 100644 index 0000000..7bfd767 --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_enrich.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import argparse +import io +import json +from pathlib import Path +from typing import Dict, Iterable, Optional, Sequence + +import pandas as pd +import zstandard as zstd + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_enrich", + description="Enrich OpenArchives OCR routing rows with page counts and PDF URLs from raw JSONL shards.", + ) + p.add_argument("--parquet", required=True, help="Canonical parquet after OpenArchives cleaning/fill.") + p.add_argument("--raw-repo-root", required=True, help="Local root of the raw HF OpenArchives dataset.") + p.add_argument("--output-parquet", required=True, help="Where the enriched parquet will be written.") + p.add_argument("--filename-column", 
default="filename") + p.add_argument("--doc-id-column", default="source_doc_id") + p.add_argument("--source-jsonl-column", default="source_jsonl") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument( + "--allow-threshold-derive", + action="store_true", + help="If needs_ocr is missing, derive targets from greek/mojibake thresholds.", + ) + p.add_argument("--greek-threshold", type=float, default=60.0) + p.add_argument("--mojibake-threshold", type=float, default=0.1) + return p.parse_args(argv) + + +def _coerce_bool_series(series: pd.Series) -> pd.Series: + if series.dtype == bool: + return series.fillna(False) + lowered = series.astype(str).str.strip().str.lower() + return lowered.isin({"1", "true", "t", "yes", "y"}) + + +def _resolve_targets( + df: pd.DataFrame, + *, + needs_ocr_column: str, + allow_threshold_derive: bool, + greek_threshold: float, + mojibake_threshold: float, +) -> pd.Series: + if needs_ocr_column in df.columns: + return _coerce_bool_series(df[needs_ocr_column]) + if not allow_threshold_derive: + raise SystemExit( + f"Column '{needs_ocr_column}' not found and threshold derivation is disabled." + ) + greek = pd.to_numeric(df.get("greek_badness_score"), errors="coerce") + moj = pd.to_numeric(df.get("mojibake_badness_score"), errors="coerce") + if greek is None and moj is None: + raise SystemExit( + "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present." 
+ ) + greek_mask = (greek > float(greek_threshold)).fillna(False) if greek is not None else False + moj_mask = (moj > float(mojibake_threshold)).fillna(False) if moj is not None else False + return greek_mask | moj_mask + + +def _resolve_jsonl_path(raw_repo_root: Path, recorded_path: str) -> Path: + candidate = Path(recorded_path) + if candidate.exists(): + return candidate + + marker = "data/openarchives/" + text = str(recorded_path) + idx = text.find(marker) + if idx != -1: + rel = Path(text[idx:]) + rewritten = raw_repo_root / rel + if rewritten.exists(): + return rewritten + + name = Path(recorded_path).name + matches = list((raw_repo_root / "data" / "openarchives").glob(f"**/{name}")) + if len(matches) == 1: + return matches[0] + raise FileNotFoundError(f"could not resolve JSONL path for {recorded_path}") + + +def _pick_pdf_url(source_meta: dict) -> str: + for key in ("refined_pdf_links_json", "pdf_links_json"): + value = source_meta.get(key) + url = _normalize_pdf_link(value) + if url: + return url + for key in ("external_link", "handle_url", "url"): + value = source_meta.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + return "" + + +def _normalize_pdf_link(value: object) -> str: + if value is None: + return "" + if isinstance(value, str): + text = value.strip() + if not text: + return "" + if text.startswith("http://") or text.startswith("https://"): + return text + try: + parsed = json.loads(text) + except Exception: + return text + return _normalize_pdf_link(parsed) + if isinstance(value, list): + for item in value: + normalized = _normalize_pdf_link(item) + if normalized: + return normalized + return "" + if isinstance(value, dict): + for key in ("url", "href", "pdf_url", "link"): + if key in value: + normalized = _normalize_pdf_link(value[key]) + if normalized: + return normalized + return "" + return "" + + +def _coerce_page_count(value: object) -> Optional[int]: + if value is None: + return None + try: + return max(1, 
def _enrich_targets(
    targets: pd.DataFrame,
    *,
    raw_repo_root: Path,
    doc_id_column: str,
    source_jsonl_column: str,
) -> pd.DataFrame:
    """Attach page counts, PDF URLs and source fields from the raw JSONL shards.

    Each distinct shard referenced by *targets* is streamed once; records
    whose ``doc_id`` matches a target row fill ``page_count_source``,
    ``pages_total_source``, ``pdf_url``, ``source_collection_slug`` and
    ``source_language_code`` on that row.

    Args:
        targets: OCR target rows.
        raw_repo_root: Local root of the raw dataset, used to resolve the
            recorded shard paths.
        doc_id_column: Column with the document id matched against the
            shard records' ``doc_id``.
        source_jsonl_column: Column with the recorded shard path.

    Returns:
        A copy of *targets* with the enrichment columns. The columns are
        always present, even when no shard record matches any row.

    Raises:
        FileNotFoundError: Propagated from ``_resolve_jsonl_path`` when a
            recorded shard path cannot be resolved.
    """
    work = targets.copy()

    # Create the output columns up front so callers can rely on them even
    # if nothing matches. Text columns must be object dtype so string
    # assignments below do not hit a float column.
    for column in ("page_count_source", "pages_total_source"):
        if column not in work.columns:
            work[column] = float("nan")
    for column in ("pdf_url", "source_collection_slug", "source_language_code"):
        if column not in work.columns:
            work[column] = pd.Series(float("nan"), index=work.index, dtype=object)

    # Resolve each distinct recorded path once; resolution touches the
    # filesystem and may fall back to a recursive glob.
    recorded = work[source_jsonl_column].astype(str)
    resolved_by_recorded = {
        path: str(_resolve_jsonl_path(raw_repo_root, path)) for path in recorded.unique()
    }
    resolved = recorded.map(resolved_by_recorded)

    # shard path -> {doc_id -> row label in ``work``}
    grouped: Dict[str, Dict[str, object]] = {}
    for row_label, doc_id, jsonl_path in zip(work.index, work[doc_id_column], resolved):
        grouped.setdefault(jsonl_path, {})[str(doc_id)] = row_label

    dctx = zstd.ZstdDecompressor()
    for jsonl_path, doc_map in grouped.items():
        with Path(jsonl_path).open("rb") as fh, dctx.stream_reader(fh) as reader:
            text_reader = io.TextIOWrapper(reader, encoding="utf-8")
            for line in text_reader:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                record = json.loads(line)
                doc_id = str(record.get("doc_id") or "")
                row_label = doc_map.get(doc_id)
                if row_label is None:
                    continue
                pipeline = record.get("pipeline_metadata") or {}
                source_meta = record.get("source_metadata") or {}
                page_count = _coerce_page_count(pipeline.get("page_count"))
                pages_total = _coerce_page_count(pipeline.get("pages_total"))
                # Either count stands in for the other when only one is set.
                if page_count is None:
                    page_count = pages_total
                if pages_total is None:
                    pages_total = page_count
                work.at[row_label, "page_count_source"] = page_count
                work.at[row_label, "pages_total_source"] = pages_total
                work.at[row_label, "pdf_url"] = _pick_pdf_url(source_meta)
                work.at[row_label, "source_collection_slug"] = source_meta.get("collection_slug") or ""
                work.at[row_label, "source_language_code"] = source_meta.get("language_code") or ""

    return work
# Matches per-page-range OCR shard outputs named ``<stem>__p<start>-<end>.md``.
# The groups must be *named*: callers read ``match.group("stem")`` and
# ``match.group("start")``.  A bare ``(?P...)`` without ``<name>`` is not valid
# regex syntax and fails at import time.
# NOTE(review): the third group's name is not referenced in the visible code;
# ``end`` is assumed -- confirm against the shard writer.
_MARKDOWN_SHARD_RE = re.compile(r"^(?P<stem>.+)__p(?P<start>\d+)-(?P<end>\d+)\.md$")
def _merge_markdown_parts(parts: List[str]) -> str:
    """Concatenate markdown fragments, separating them by at least one newline.

    Empty fragments are dropped.  Every kept fragment except the last is
    newline-terminated if it is not already; the last fragment is appended
    verbatim, so no trailing newline is ever added to the result.
    """
    pieces = [chunk for chunk in parts if chunk]
    if not pieces:
        return ""
    *leading, final = pieces
    terminated = [chunk if chunk.endswith("\n") else chunk + "\n" for chunk in leading]
    return "".join(terminated) + final
"markdown").glob(f"{stem}__p*.md")): + match = _MARKDOWN_SHARD_RE.match(candidate.name) + if not match or match.group("stem") != stem: + continue + shard_sources.append((int(match.group("start")), candidate)) + if not shard_sources: + continue + + shard_sources.sort(key=lambda item: item[0]) + payload = _merge_markdown_parts([path.read_text(encoding="utf-8") for _, path in shard_sources]) + if markdown_out is not None: + destination = markdown_out / md_name + destination.parent.mkdir(parents=True, exist_ok=True) + destination.write_text(payload, encoding="utf-8") + if shard_out is not None: + for _, shard_path in shard_sources: + _copy_once(shard_path, shard_out / shard_path.name) + return payload, str(Path("markdown") / md_name) + return payload, None + return None, None + + +def _collect_artifact_updates( + *, + shard_rows: pd.DataFrame, + work_roots: List[Path], + output_root: Optional[Path], +) -> tuple[int, pd.DataFrame]: + copied = 0 + markdown_out = output_root / "markdown" if output_root is not None else None + metrics_out = output_root / "json" / "metrics" if output_root is not None else None + if metrics_out is not None: + metrics_out.mkdir(parents=True, exist_ok=True) + updates: List[Dict[str, object]] = [] + for row in shard_rows.to_dict(orient="records"): + merge_key = str(row.get("_merge_key") or "").strip() + stem = str(row.get("filename_base") or Path(str(row.get("filename") or "")).stem).strip() + if not stem: + continue + md_name = str(row.get("md_filename") or f"{stem}.md") + md_payload, md_relpath = _resolve_markdown_payload( + stem=stem, + md_name=md_name, + work_roots=work_roots, + output_root=output_root, + ) + if md_payload is not None and markdown_out is not None: + copied += 1 + metrics_relpath = None + for suffix in (".metrics.json", ".per_page.metrics.json"): + for root in work_roots: + src = root / "json" / "metrics" / f"{stem}{suffix}" + if src.exists(): + if metrics_out is not None: + _copy_once(src, metrics_out / src.name) + copied 
+= 1 + metrics_relpath = str(Path("json") / "metrics" / src.name) + break + if metrics_relpath is not None: + break + updates.append( + { + "_merge_key": merge_key, + "text": md_payload, + "ocr_markdown_relpath": md_relpath, + "ocr_metrics_relpath": metrics_relpath, + "ocr_text_sha256": ( + hashlib.sha256(md_payload.encode("utf-8")).hexdigest() + if isinstance(md_payload, str) + else None + ), + } + ) + return copied, pd.DataFrame(updates) + + +def main(argv: List[str] | None = None) -> int: + args = _parse_args(argv) + master_path = Path(args.master_parquet).expanduser().resolve() + out_path = Path(args.output_parquet).expanduser().resolve() + out_path.parent.mkdir(parents=True, exist_ok=True) + + preserve_master_columns = [c.strip() for c in str(args.preserve_master_columns or "").split(",") if c.strip()] + master = pd.read_parquet(master_path).copy() + master["_merge_key"] = _normalize_key(master, str(args.key_column)) + + shard_frames: List[pd.DataFrame] = [] + for shard in args.shard_parquets: + shard_df = pd.read_parquet(Path(shard).expanduser().resolve()).copy() + shard_df["_merge_key"] = _normalize_key(shard_df, str(args.key_column)) + shard_frames.append(shard_df) + shards = pd.concat(shard_frames, ignore_index=True) + shards = shards.drop_duplicates(subset=["_merge_key"], keep="last") + + master = master.set_index("_merge_key", drop=False) + shards = shards.set_index("_merge_key", drop=False) + + for column in shards.columns: + if column == "_merge_key": + continue + if column in preserve_master_columns: + continue + master.loc[shards.index, column] = shards[column] + + copied = 0 + if args.artifact_work_roots: + roots = [Path(p).expanduser().resolve() for p in args.artifact_work_roots] + artifact_output_root = ( + Path(args.artifact_output_root).expanduser().resolve() + if str(args.artifact_output_root or "").strip() + else None + ) + copied, artifact_updates = _collect_artifact_updates( + shard_rows=shards.reset_index(drop=True), + work_roots=roots, + 
# Defaults for the download stage and the heartbeat writer; referenced by the
# argument parser below.
DEFAULT_DOWNLOAD_CONCURRENCY = 24
DEFAULT_DOWNLOAD_TIMEOUT = 60
DEFAULT_HEARTBEAT_INTERVAL = 60


def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command line of the single-node OpenArchives OCR runner.

    ``--shard-parquet`` and ``--work-root`` are required; every other option
    has a default.  ``argv=None`` makes argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        prog="python -m glossapi.scripts.openarchives_ocr_run_node",
        description=(
            "Materialize one OpenArchives OCR shard into a normal GlossAPI corpus root, "
            "download its PDFs, and run DeepSeek OCR with the standardized settings."
        ),
    )
    add = parser.add_argument

    # Inputs and logging.
    add("--shard-parquet", required=True)
    add("--work-root", required=True)
    add("--python-log-level", default="INFO")

    # Download stage.
    add("--download-concurrency", type=int, default=DEFAULT_DOWNLOAD_CONCURRENCY)
    add("--download-timeout", type=int, default=DEFAULT_DOWNLOAD_TIMEOUT)
    add("--download-scheduler-mode", default="per_domain")
    add("--download-group-by", default="base_domain")
    add("--download-policy-file", default="")

    # Heartbeat and node identity.
    add("--heartbeat-path")
    add("--heartbeat-interval", type=int, default=DEFAULT_HEARTBEAT_INTERVAL)
    add("--instance-id", default="")
    add("--node-id", default="")

    # Run-mode switches.
    add("--dry-run", action="store_true")
    add("--skip-download", action="store_true")

    # OCR scheduling and runtime.
    add("--scheduler", default="whole_doc")
    add("--target-batch-pages", type=int, default=160)
    add("--shard-pages", type=int, default=0)
    add("--shard-threshold-pages", type=int, default=0)
    add("--workers-per-gpu", type=int, default=1)
    add("--runtime-backend", default="vllm")
    add("--ocr-profile", default="markdown_grounded")
    add("--max-new-tokens", type=int, default=2048)
    add("--render-dpi", type=int, default=144)
    add("--repair-mode", default="auto")
    add("--repair-exec-batch-target-pages", type=int, default=None)
    add("--repair-exec-batch-target-items", type=int, default=None)
    add("--gpu-memory-utilization", type=float, default=0.9)

    return parser.parse_args(argv)
def _prepare_download_input(df: pd.DataFrame) -> pd.DataFrame:
    """Build the download manifest from a shard frame.

    Returns a copy of *df* with two derived columns:

    * ``url`` -- ``pdf_url`` as text.  Missing values become ``""`` rather than
      the literal strings ``"nan"`` / ``"None"``, which would otherwise be
      handed on as if they were links.  This matches how
      ``_normalize_materialized_results`` derives ``url``.
    * ``filename_base`` -- the stem of ``filename``.

    Raises ``SystemExit`` when ``filename`` or ``pdf_url`` is absent.
    """
    required = {"filename", "pdf_url"}
    missing = sorted(required - set(df.columns))
    if missing:
        raise SystemExit(f"Shard parquet missing required column(s): {', '.join(missing)}")
    out = df.copy()
    # fillna before astype: astype(str) alone stringifies NaN/None.
    out["url"] = out["pdf_url"].fillna("").astype(str)
    out["filename_base"] = out["filename"].astype(str).map(lambda s: Path(s).stem)
    return out
def _normalize_materialized_results(
    *,
    shard_df: pd.DataFrame,
    downloads_dir: Path,
) -> pd.DataFrame:
    """Derive download-status columns for PDFs that are already on disk.

    Works on a copy of *shard_df* and returns it.  A row counts as downloaded
    when its file exists: the path in ``local_pdf_path`` if that column is
    present, otherwise ``downloads_dir / filename``.

    Columns written: ``download_success`` and ``download_error`` (always);
    ``filename_base``, ``needs_ocr`` (``True``), ``ocr_success`` (``False``)
    and ``url`` (from ``pdf_url`` when available, else ``""``) only when they
    are not already present.
    """

    def _error_for(ok: bool) -> str:
        return "" if ok else "materialized_pdf_missing"

    result = shard_df.copy()
    present = result.columns

    if "filename_base" not in present:
        result["filename_base"] = result["filename"].astype(str).map(lambda name: Path(name).stem)

    if "local_pdf_path" in present:
        on_disk = result["local_pdf_path"].astype(str).map(lambda raw: Path(raw).exists())
    else:
        on_disk = result["filename"].astype(str).map(lambda name: (downloads_dir / name).exists())

    result["download_success"] = on_disk.astype(bool)
    result["download_error"] = result["download_success"].map(_error_for)

    # Flags that are only defaulted, never overwritten.
    for flag, default in (("needs_ocr", True), ("ocr_success", False)):
        if flag not in result.columns:
            result[flag] = default

    if "url" not in result.columns:
        has_pdf_url = "pdf_url" in result.columns
        result["url"] = result["pdf_url"].fillna("").astype(str) if has_pdf_url else ""
    return result
class _HeartbeatThread(threading.Thread):
    """Daemon thread that periodically rewrites a JSON heartbeat file.

    Each beat serializes the static ``context``, the current ``stage`` and
    ``error``, host name, uptime, and the progress figures read from
    ``parquet_path``.  Write failures are swallowed so that a broken heartbeat
    never interrupts the run.  One final beat is written after ``stop()``.
    """

    def __init__(
        self,
        *,
        heartbeat_path: Path,
        interval: int,
        parquet_path: Path,
        context: Dict[str, Any],
    ) -> None:
        super().__init__(daemon=True)
        self.started_at = time.time()
        self.stop_event = threading.Event()
        self.heartbeat_path = heartbeat_path
        self.parquet_path = parquet_path
        # Intervals below 10 seconds are raised to 10.
        self.interval = max(10, int(interval))
        # Copy so later mutation by the caller does not leak into the payload.
        self.context = dict(context)
        self.stage = "init"
        self.error = ""

    def set_stage(self, stage: str) -> None:
        """Record the pipeline stage reported by subsequent beats."""
        self.stage = str(stage)

    def set_error(self, error: str) -> None:
        """Record the error text reported by subsequent beats."""
        self.error = str(error)

    def stop(self) -> None:
        """Ask the loop in ``run`` to exit; it wakes from its wait immediately."""
        self.stop_event.set()

    def _payload(self) -> Dict[str, Any]:
        """Assemble one heartbeat document; progress keys are applied last."""
        status: Dict[str, Any] = {
            "timestamp": int(time.time()),
            "hostname": _hostname(),
            "stage": self.stage,
            "error": self.error,
            "uptime_sec": round(time.time() - self.started_at, 1),
            "parquet_path": str(self.parquet_path),
        }
        return {**self.context, **status, **_read_progress(self.parquet_path)}

    def _beat(self) -> None:
        """Write a single heartbeat, ignoring any failure (best effort)."""
        try:
            _atomic_write_json(self.heartbeat_path, self._payload())
        except Exception:
            pass

    def run(self) -> None:
        """Beat every ``interval`` seconds until stopped, then beat once more."""
        while not self.stop_event.is_set():
            self._beat()
            self.stop_event.wait(self.interval)
        self._beat()
_prepare_materialized_input(raw_shard_df) + if args.skip_download + else _prepare_download_input(raw_shard_df) + ) + download_input = manifests_dir / "download_input.parquet" + if not args.skip_download: + shard_df.to_parquet(download_input, index=False) + + metadata_path = work_root / "download_results" / "download_results.parquet" + if not metadata_path.exists(): + metadata_path.parent.mkdir(parents=True, exist_ok=True) + if args.skip_download: + _write_canonical_metadata( + work_root, + _normalize_materialized_results(shard_df=shard_df, downloads_dir=work_root / "downloads"), + ) + else: + _write_canonical_metadata(work_root, shard_df) + + heartbeat: Optional[_HeartbeatThread] = None + if args.heartbeat_path: + heartbeat = _HeartbeatThread( + heartbeat_path=Path(args.heartbeat_path).expanduser().resolve(), + interval=int(args.heartbeat_interval), + parquet_path=metadata_path, + context={ + "instance_id": str(args.instance_id or ""), + "node_id": str(args.node_id or ""), + "shard_parquet": str(shard_path), + "work_root": str(work_root), + }, + ) + heartbeat.start() + + try: + if args.dry_run: + if heartbeat: + heartbeat.set_stage("dry_run") + return 0 + + if args.skip_download: + if heartbeat: + heartbeat.set_stage("materialized") + canonical_df = _normalize_materialized_results( + shard_df=shard_df, + downloads_dir=work_root / "downloads", + ) + metadata_path = _write_canonical_metadata(work_root, canonical_df) + else: + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + + if heartbeat: + heartbeat.set_stage("download") + dl_df = corpus.download( + input_parquet=download_input, + links_column="url", + parallelize_by=str(args.download_group_by), + concurrency=int(args.download_concurrency), + request_timeout=int(args.download_timeout), + scheduler_mode=str(args.download_scheduler_mode), + 
download_policy_file=(str(args.download_policy_file) if str(args.download_policy_file or "").strip() else None), + ) + canonical_df = _normalize_download_results(shard_df=shard_df, download_results_df=dl_df, url_column="url") + metadata_path = _write_canonical_metadata(work_root, canonical_df) + if heartbeat: + heartbeat.parquet_path = metadata_path + heartbeat.set_stage("ocr") + + corpus = Corpus( + input_dir=work_root / "downloads", + output_dir=work_root, + metadata_path=metadata_path, + log_level=getattr(logging, str(args.python_log_level).upper(), logging.INFO), + verbose=False, + ) + corpus.ocr( + fix_bad=True, + mode="ocr_bad", + backend="deepseek", + runtime_backend=str(args.runtime_backend), + ocr_profile=str(args.ocr_profile), + use_gpus="multi", + workers_per_gpu=int(args.workers_per_gpu), + render_dpi=int(args.render_dpi), + max_new_tokens=int(args.max_new_tokens), + repair_mode=str(args.repair_mode), + repair_exec_batch_target_pages=args.repair_exec_batch_target_pages, + repair_exec_batch_target_items=args.repair_exec_batch_target_items, + scheduler=str(args.scheduler), + target_batch_pages=int(args.target_batch_pages), + shard_pages=int(args.shard_pages), + shard_threshold_pages=int(args.shard_threshold_pages), + gpu_memory_utilization=float(args.gpu_memory_utilization), + math_enhance=False, + ) + if heartbeat: + heartbeat.set_stage("done") + return 0 + except Exception as exc: + if heartbeat: + heartbeat.set_stage("failed") + heartbeat.set_error(str(exc)) + raise + finally: + if heartbeat: + heartbeat.stop() + heartbeat.join(timeout=5) + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/src/glossapi/scripts/openarchives_ocr_shards.py b/src/glossapi/scripts/openarchives_ocr_shards.py new file mode 100644 index 0000000..e68833c --- /dev/null +++ b/src/glossapi/scripts/openarchives_ocr_shards.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from 
typing import Dict, List, Optional, Sequence + +import pandas as pd + + +PAGE_COLUMN_CANDIDATES: Sequence[str] = ( + "page_count_source", + "pages_total_source", + "pages_total", + "page_count", + "total_pages", + "num_pages", + "pages", +) + + +def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: + p = argparse.ArgumentParser( + prog="python -m glossapi.scripts.openarchives_ocr_shards", + description="Create page-balanced OCR shard manifests from a canonical GlossAPI parquet.", + ) + p.add_argument("--parquet", required=True, help="Canonical download_results parquet with needs_ocr flags.") + p.add_argument("--output-dir", required=True, help="Directory where shard manifests and summaries will be written.") + p.add_argument("--nodes", type=int, default=4, help="Number of OCR nodes to shard across.") + p.add_argument( + "--pages-per-hour-per-node", + type=float, + default=50700.0, + help="Validated throughput per OCR node, used for ETA calculations.", + ) + p.add_argument("--filename-column", default="filename") + p.add_argument("--needs-ocr-column", default="needs_ocr") + p.add_argument( + "--page-column", + default=None, + help="Explicit page-count column. 
def _resolve_targets(
    df: pd.DataFrame,
    *,
    needs_ocr_column: str,
    allow_threshold_derive: bool,
    greek_threshold: float,
    mojibake_threshold: float,
) -> pd.Series:
    """Return a boolean mask selecting the rows that should be OCR'd.

    When *needs_ocr_column* exists it is coerced to booleans and returned.
    Otherwise, if *allow_threshold_derive* is set, a row is selected when its
    ``greek_badness_score`` exceeds *greek_threshold* or its
    ``mojibake_badness_score`` exceeds *mojibake_threshold*.  Either score
    column may be absent; unparsable or missing scores never select a row.

    Raises ``SystemExit`` when the flag column is missing and derivation is
    disabled, or when derivation is requested but neither score column exists.
    """
    if needs_ocr_column in df.columns:
        return _coerce_bool_series(df[needs_ocr_column])
    if not allow_threshold_derive:
        raise SystemExit(
            f"Column '{needs_ocr_column}' not found and threshold derivation is disabled."
        )

    # Check column presence directly.  Feeding the ``None`` that ``df.get``
    # returns for a missing column through ``pd.to_numeric`` does not yield a
    # Series, so both the "neither column" guard and ``.fillna`` misbehaved.
    available = [
        (column, float(threshold))
        for column, threshold in (
            ("greek_badness_score", greek_threshold),
            ("mojibake_badness_score", mojibake_threshold),
        )
        if column in df.columns
    ]
    if not available:
        raise SystemExit(
            "Cannot derive OCR targets: neither needs_ocr nor greek/mojibake badness columns are present."
        )

    mask = pd.Series(False, index=df.index)
    for column, threshold in available:
        scores = pd.to_numeric(df[column], errors="coerce")
        mask = mask | (scores > threshold).fillna(False)
    return mask
SystemExit("No OCR target rows selected; shard manifests were not created.") + + copy_columns = [c.strip() for c in str(args.copy_columns or "").split(",") if c.strip()] + selected_columns = [args.filename_column, page_column] + for optional in [ + "needs_ocr", + "greek_badness_score", + "mojibake_badness_score", + "ocr_success", + "source_row", + "document_type", + ] + copy_columns: + if optional in shard_df.columns and optional not in selected_columns: + selected_columns.append(optional) + shard_df = shard_df[selected_columns].copy() + + bins = _assign_rows(shard_df, page_column=page_column, node_count=int(args.nodes)) + summaries: List[Dict[str, object]] = [] + total_pages = 0 + total_docs = 0 + for node in bins: + node_id = int(node["node_id"]) + rows = list(node["rows"]) + node_df = pd.DataFrame(rows) + if "_pages_int" in node_df.columns: + node_df = node_df.drop(columns=["_pages_int"]) + node_df["shard_id"] = f"node-{node_id:02d}" + node_df["node_id"] = node_id + out_path = output_dir / f"openarchives_ocr_shard_node_{node_id:02d}.parquet" + node_df.to_parquet(out_path, index=False) + + node_pages = int(node["pages_total"]) + node_docs = int(node["docs_total"]) + total_pages += node_pages + total_docs += node_docs + summaries.append( + { + "node_id": node_id, + "manifest_path": str(out_path), + "docs_total": node_docs, + "pages_total": node_pages, + "eta_hours_at_validated_speed": float(node_pages / float(args.pages_per_hour_per_node)), + } + ) + + overall = { + "source_parquet": str(parquet_path), + "nodes": int(args.nodes), + "filename_column": str(args.filename_column), + "page_column": str(page_column), + "docs_total": int(total_docs), + "pages_total": int(total_pages), + "pages_per_hour_per_node": float(args.pages_per_hour_per_node), + "eta_hours_one_node": float(total_pages / float(args.pages_per_hour_per_node)), + "eta_hours_all_nodes": float(total_pages / (float(args.pages_per_hour_per_node) * max(1, int(args.nodes)))), + "node_summaries": summaries, + 
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command line of the resumable OpenArchives PDF stage pull.

    ``--manifest`` and ``--work-root`` are required.  ``argv=None`` makes
    argparse read ``sys.argv``.
    """
    # (flag, keyword arguments) in the order they appear in ``--help``.
    option_table = (
        (
            "--manifest",
            dict(
                required=True,
                help="TSV manifest with canonical_filename, remote_path, remote_size_bytes, remote_name.",
            ),
        ),
        (
            "--work-root",
            dict(required=True, help="Root directory for downloads, partials, logs, and state."),
        ),
        ("--remote-host", dict(default="debian@83.212.80.170")),
        (
            "--password-env",
            dict(
                default="GREECE_BOX_PASSWORD",
                help="Environment variable containing the remote SSH password.",
            ),
        ),
        ("--transport", dict(choices=("sftp", "rsync"), default="sftp")),
        ("--max-attempts", dict(type=int, default=20)),
        ("--connect-timeout", dict(type=int, default=30)),
        ("--io-timeout", dict(type=int, default=180)),
        ("--sleep-after-failure", dict(type=float, default=10.0)),
        ("--summary-interval-seconds", dict(type=float, default=5.0)),
        ("--limit", dict(type=int, default=0, help="Optional limit for testing.")),
        (
            "--priority-dir",
            dict(
                default=None,
                help="Directory of dynamic priority files or filename lists. Items here are transferred first.",
            ),
        ),
        (
            "--priority-only",
            dict(
                action="store_true",
                help="Transfer only files currently present in the priority set; do not fall through to the rest of the manifest.",
            ),
        ),
    )

    parser = argparse.ArgumentParser(
        prog="python -m glossapi.scripts.openarchives_pdf_stage_pull",
        description="Resumable staged pull of OpenArchives PDFs from the Greece storage box.",
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
updates = [] + for canonical_filename, remote_size_bytes, status in cur.fetchall(): + final_path = downloads_dir / canonical_filename + if final_path.exists() and final_path.stat().st_size == int(remote_size_bytes): + updates.append((int(remote_size_bytes), utc_now(), canonical_filename)) + continue + part_path = partial_dir / f"{canonical_filename}.part" + if part_path.exists() and status == "completed": + self.conn.execute( + """ + UPDATE transfer_items + SET status='pending', + last_error='Final file missing; resuming from partial', + transfer_finished_at=NULL + WHERE canonical_filename=? + """, + (canonical_filename,), + ) + if updates: + self.conn.executemany( + """ + UPDATE transfer_items + SET status='completed', + last_seen_size_bytes=?, + transfer_finished_at=?, + last_error='' + WHERE canonical_filename=? + """, + updates, + ) + self.conn.commit() + + def next_item(self, *, max_attempts: int, priority_only: bool = False) -> Optional[sqlite3.Row]: + self.conn.row_factory = sqlite3.Row + if priority_only: + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? + AND priority_rank > 0 + ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) + else: + cur = self.conn.execute( + """ + SELECT * + FROM transfer_items + WHERE status IN ('pending', 'failed') + AND attempts < ? + ORDER BY priority_rank DESC, attempts ASC, canonical_filename ASC + LIMIT 1 + """, + (max_attempts,), + ) + return cur.fetchone() + + def mark_in_progress(self, canonical_filename: str, current_size: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='in_progress', + attempts=attempts+1, + transfer_started_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? 
+ """, + (utc_now(), int(current_size), canonical_filename), + ) + self.conn.commit() + + def mark_completed(self, canonical_filename: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='completed', + transfer_finished_at=?, + last_seen_size_bytes=?, + last_error='' + WHERE canonical_filename=? + """, + (utc_now(), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def mark_failed(self, canonical_filename: str, error: str, size_bytes: int) -> None: + self.conn.execute( + """ + UPDATE transfer_items + SET status='failed', + last_error=?, + last_seen_size_bytes=? + WHERE canonical_filename=? + """, + (str(error), int(size_bytes), canonical_filename), + ) + self.conn.commit() + + def counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT status, COUNT(*) AS c + FROM transfer_items + GROUP BY status + """ + ) + counts = {"pending": 0, "in_progress": 0, "completed": 0, "failed": 0} + for status, count in cur.fetchall(): + counts[str(status)] = int(count) + counts["total"] = sum(counts.values()) + return counts + + def byte_counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT + COALESCE(SUM(remote_size_bytes), 0) AS bytes_total, + COALESCE(SUM(CASE WHEN status = 'completed' THEN remote_size_bytes ELSE 0 END), 0) AS bytes_completed, + COALESCE(SUM(CASE WHEN status = 'in_progress' THEN last_seen_size_bytes ELSE 0 END), 0) AS bytes_in_progress + FROM transfer_items + """ + ) + row = cur.fetchone() + bytes_total = int(row[0] or 0) + bytes_completed = int(row[1] or 0) + bytes_in_progress = int(row[2] or 0) + bytes_remaining = max(0, bytes_total - bytes_completed) + return { + "bytes_total": bytes_total, + "bytes_completed": bytes_completed, + "bytes_in_progress": bytes_in_progress, + "bytes_remaining": bytes_remaining, + } + + def set_priorities(self, canonical_filenames: set[str]) -> None: + self.conn.execute("UPDATE transfer_items SET priority_rank=0 WHERE priority_rank != 
0") + if canonical_filenames: + batch = [] + for name in sorted(canonical_filenames): + batch.append(name) + if len(batch) >= 500: + placeholders = ",".join("?" for _ in batch) + self.conn.execute( + f"UPDATE transfer_items SET priority_rank=100 WHERE canonical_filename IN ({placeholders})", + batch, + ) + batch.clear() + if batch: + placeholders = ",".join("?" for _ in batch) + self.conn.execute( + f"UPDATE transfer_items SET priority_rank=100 WHERE canonical_filename IN ({placeholders})", + batch, + ) + self.conn.commit() + + def priority_counts(self) -> dict[str, int]: + cur = self.conn.execute( + """ + SELECT + COALESCE(SUM(CASE WHEN priority_rank > 0 THEN 1 ELSE 0 END), 0) AS priority_total, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='pending' THEN 1 ELSE 0 END), 0) AS priority_pending, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='completed' THEN 1 ELSE 0 END), 0) AS priority_completed, + COALESCE(SUM(CASE WHEN priority_rank > 0 AND status='failed' THEN 1 ELSE 0 END), 0) AS priority_failed + FROM transfer_items + """ + ) + row = cur.fetchone() + return { + "priority_total": int(row[0] or 0), + "priority_pending": int(row[1] or 0), + "priority_completed": int(row[2] or 0), + "priority_failed": int(row[3] or 0), + } + + +def read_manifest(path: Path) -> list[TransferItem]: + items: list[TransferItem] = [] + with path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle, delimiter="\t") + required = {"canonical_filename", "remote_path", "remote_size_bytes", "remote_name"} + if not required.issubset(reader.fieldnames or set()): + raise SystemExit(f"Manifest missing required columns: {sorted(required)}") + for row in reader: + items.append( + TransferItem( + canonical_filename=str(row["canonical_filename"]).strip(), + remote_path=str(row["remote_path"]).strip(), + remote_size_bytes=int(row["remote_size_bytes"]), + remote_name=str(row["remote_name"]).strip(), + ) + ) + return items + + +def write_json(path: Path, 
payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + os.replace(tmp, path) + + +def append_event(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload, ensure_ascii=False) + "\n") + + +def sshpass_env(password_env: str) -> dict[str, str]: + env = os.environ.copy() + secret = env.get(password_env) + if not secret: + raise SystemExit(f"Password env var '{password_env}' is not set.") + env["SSHPASS"] = secret + return env + + +def ssh_transport_options(connect_timeout: int) -> list[str]: + return [ + "-o", + "BatchMode=no", + "-o", + "PreferredAuthentications=password", + "-o", + "PubkeyAuthentication=no", + "-o", + "KbdInteractiveAuthentication=yes", + "-o", + f"ConnectTimeout={int(connect_timeout)}", + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", + "-o", + "ConnectionAttempts=3", + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/greece_box_known_hosts", + ] + + +def canonicalize_pdf_name(raw: str) -> Optional[str]: + text = os.path.basename(str(raw).strip()) + if not text: + return None + lower = text.lower() + marker = ".pdf." 
+ if marker in lower: + idx = lower.index(marker) + return text[: idx + 4] + if lower.endswith(".pdf"): + return text + return None + + +def _walk_json_strings(obj) -> Iterable[str]: + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(key, str): + yield key + yield from _walk_json_strings(value) + elif isinstance(obj, list): + for item in obj: + yield from _walk_json_strings(item) + elif isinstance(obj, str): + yield obj + + +def _extract_priority_filenames_from_csv(path: Path) -> set[str]: + results: set[str] = set() + with path.open("r", encoding="utf-8", errors="ignore", newline="") as handle: + reader = csv.DictReader(handle) + fields = {field.strip() for field in (reader.fieldnames or []) if field} + keyed = any(key in fields for key in FILENAME_KEYS) + for row in reader: + if keyed: + for key in FILENAME_KEYS: + value = row.get(key) + if value: + canonical = canonicalize_pdf_name(value) + if canonical is not None: + results.add(canonical) + break + else: + for value in row.values(): + if not value: + continue + for match in PDF_NAME_PATTERN.findall(str(value)): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + +def _extract_priority_filenames_from_json(path: Path) -> set[str]: + results: set[str] = set() + data = json.loads(path.read_text(encoding="utf-8", errors="ignore")) + for text in _walk_json_strings(data): + canonical = canonicalize_pdf_name(text) + if canonical is not None: + results.add(canonical) + continue + for match in PDF_NAME_PATTERN.findall(text): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + +def _extract_priority_filenames_from_text(path: Path) -> set[str]: + results: set[str] = set() + text = path.read_text(encoding="utf-8", errors="ignore") + for line in text.splitlines(): + canonical = canonicalize_pdf_name(line) + if canonical is not None: + results.add(canonical) + for match 
in PDF_NAME_PATTERN.findall(text): + canonical = canonicalize_pdf_name(match) + if canonical is not None: + results.add(canonical) + return results + + +def load_priority_filenames(priority_dir: Path) -> set[str]: + results: set[str] = set() + if not priority_dir.exists(): + return results + for path in sorted(priority_dir.rglob("*")): + if not path.is_file(): + continue + direct = canonicalize_pdf_name(path.name) + if direct is not None: + results.add(direct) + continue + suffix = path.suffix.lower() + try: + if suffix == ".csv": + results.update(_extract_priority_filenames_from_csv(path)) + elif suffix == ".json": + results.update(_extract_priority_filenames_from_json(path)) + elif suffix in {".txt", ".list", ".lst", ".log"}: + results.update(_extract_priority_filenames_from_text(path)) + else: + continue + except Exception: + continue + return results + + +def rsync_one( + *, + remote_host: str, + remote_path: str, + temp_path: Path, + password_env: str, + connect_timeout: int, + io_timeout: int, +) -> subprocess.CompletedProcess[str]: + ssh_cmd = ( + "ssh " + "-o BatchMode=no " + "-o PreferredAuthentications=password " + "-o PubkeyAuthentication=no " + "-o KbdInteractiveAuthentication=yes " + f"-o ConnectTimeout={int(connect_timeout)} " + "-o ServerAliveInterval=15 " + "-o ServerAliveCountMax=3 " + "-o ConnectionAttempts=3 " + "-o StrictHostKeyChecking=no " + "-o UserKnownHostsFile=/tmp/greece_box_known_hosts" + ) + cmd = [ + "sshpass", + "-e", + "rsync", + "-av", + "--partial", + "--append-verify", + "--inplace", + f"--timeout={int(io_timeout)}", + "-e", + ssh_cmd, + f"{remote_host}:{remote_path}", + str(temp_path), + ] + return subprocess.run(cmd, capture_output=True, text=True, env=sshpass_env(password_env)) + + +def sftp_one( + *, + remote_host: str, + remote_path: str, + temp_path: Path, + password_env: str, + connect_timeout: int, + io_timeout: int, +) -> subprocess.CompletedProcess[str]: + cmd = [ + "sshpass", + "-e", + "sftp", + 
*ssh_transport_options(connect_timeout), + "-b", + "-", + remote_host, + ] + batch = f'reget "{remote_path}" "{temp_path}"\n' + return subprocess.run(cmd, capture_output=True, text=True, env=sshpass_env(password_env), input=batch) + + +def run(argv: Optional[Sequence[str]] = None) -> int: + args = parse_args(argv) + manifest_path = Path(args.manifest).expanduser().resolve() + work_root = Path(args.work_root).expanduser().resolve() + priority_dir = Path(args.priority_dir).expanduser().resolve() if args.priority_dir else (work_root / "unreachable_from_source_20260331") + downloads_dir = work_root / "downloads" + partial_dir = work_root / "partials" + logs_dir = work_root / "logs" + state_dir = work_root / "state" + downloads_dir.mkdir(parents=True, exist_ok=True) + partial_dir.mkdir(parents=True, exist_ok=True) + logs_dir.mkdir(parents=True, exist_ok=True) + state_dir.mkdir(parents=True, exist_ok=True) + + state = TransferState(state_dir / "transfer_state.sqlite3") + items = read_manifest(manifest_path) + if args.limit and int(args.limit) > 0: + items = items[: int(args.limit)] + state.sync_manifest(items) + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads_dir, partial_dir) + manifest_names = {item.canonical_filename for item in items} + + stop_requested = False + + def _handle_signal(signum, _frame) -> None: + nonlocal stop_requested + stop_requested = True + print(f"[transfer] signal {signum} received; stopping after current file", file=sys.stderr) + + signal.signal(signal.SIGINT, _handle_signal) + signal.signal(signal.SIGTERM, _handle_signal) + + last_summary_ts = 0.0 + current_path = state_dir / "current_transfer.json" + summary_path = state_dir / "summary.json" + events_path = logs_dir / "events.jsonl" + priority_summary_path = state_dir / "priority_summary.json" + priority_available_path = state_dir / "priority_available_in_manifest.txt" + priority_missing_path = state_dir / "priority_missing_in_manifest.txt" + last_priority_set: 
Optional[set[str]] = None + + def refresh_priorities() -> dict[str, int]: + nonlocal last_priority_set + requested = load_priority_filenames(priority_dir) + if last_priority_set is None or requested != last_priority_set: + available = requested & manifest_names + missing = requested - manifest_names + state.set_priorities(available) + priority_available_path.write_text( + "".join(f"{name}\n" for name in sorted(available)), + encoding="utf-8", + ) + priority_missing_path.write_text( + "".join(f"{name}\n" for name in sorted(missing)), + encoding="utf-8", + ) + write_json( + priority_summary_path, + { + "updated_at": utc_now(), + "priority_dir": str(priority_dir), + "priority_only": bool(args.priority_only), + "requested_total": len(requested), + "available_in_manifest_total": len(available), + "missing_in_manifest_total": len(missing), + }, + ) + last_priority_set = requested + return state.priority_counts() + + priority_counts = refresh_priorities() + + while not stop_requested: + priority_counts = refresh_priorities() + row = state.next_item(max_attempts=int(args.max_attempts), priority_only=bool(args.priority_only)) + if row is None: + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": True}) + break + + canonical = str(row["canonical_filename"]) + remote_path = str(row["remote_path"]) + remote_size = int(row["remote_size_bytes"]) + final_path = downloads_dir / canonical + temp_path = partial_dir / f"{canonical}.part" + current_size = temp_path.stat().st_size if temp_path.exists() else 0 + + state.mark_in_progress(canonical, current_size) + write_json( + current_path, + { + "updated_at": utc_now(), + "transport": str(args.transport), + "canonical_filename": canonical, + "remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_path": str(temp_path), + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + 
) + append_event( + events_path, + { + "ts": utc_now(), + "event": "start", + "transport": str(args.transport), + "canonical_filename": canonical, + "remote_path": remote_path, + "remote_size_bytes": remote_size, + "partial_size_bytes": current_size, + "attempt_number": int(row["attempts"]) + 1, + }, + ) + + transfer_kwargs = { + "remote_host": str(args.remote_host), + "remote_path": remote_path, + "temp_path": temp_path, + "password_env": str(args.password_env), + "connect_timeout": int(args.connect_timeout), + "io_timeout": int(args.io_timeout), + } + if str(args.transport) == "rsync": + result = rsync_one(**transfer_kwargs) + else: + result = sftp_one(**transfer_kwargs) + + if result.returncode == 0 and temp_path.exists(): + actual_size = temp_path.stat().st_size + if remote_size > 0 and actual_size != remote_size: + state.mark_failed( + canonical, + f"Size mismatch after transfer: expected {remote_size}, got {actual_size}", + actual_size, + ) + else: + final_path.parent.mkdir(parents=True, exist_ok=True) + os.replace(temp_path, final_path) + state.mark_completed(canonical, actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "completed", + "transport": str(args.transport), + "canonical_filename": canonical, + "size_bytes": actual_size, + }, + ) + else: + actual_size = temp_path.stat().st_size if temp_path.exists() else 0 + error = (result.stderr or result.stdout or "").strip()[-4000:] + state.mark_failed(canonical, error or f"transfer failed with code {result.returncode}", actual_size) + append_event( + events_path, + { + "ts": utc_now(), + "event": "failed", + "transport": str(args.transport), + "canonical_filename": canonical, + "return_code": int(result.returncode), + "partial_size_bytes": actual_size, + "error": error or f"transfer failed with code {result.returncode}", + }, + ) + time.sleep(float(args.sleep_after_failure)) + + now = time.time() + if now - last_summary_ts >= float(args.summary_interval_seconds): + priority_counts = 
refresh_priorities() + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": False}) + last_summary_ts = now + + if current_path.exists(): + try: + current_path.unlink() + except Exception: + pass + + priority_counts = refresh_priorities() + write_json(summary_path, {"updated_at": utc_now(), **state.counts(), **state.byte_counts(), **priority_counts, "priority_only": bool(args.priority_only), "done": True}) + state.close() + return 0 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(run()) diff --git a/src/glossapi/scripts/review_manifest_materialize.py b/src/glossapi/scripts/review_manifest_materialize.py new file mode 100644 index 0000000..56fc7b1 --- /dev/null +++ b/src/glossapi/scripts/review_manifest_materialize.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import argparse +import json +import re +import shutil +from collections import Counter +from pathlib import Path +from typing import Dict, Iterable, List + + +_SAFE_LABEL_RE = re.compile(r"[^a-z0-9._-]+") + + +def _slugify_label(value: object) -> str: + text = str(value).strip().lower() + text = text.replace(" ", "_") + text = _SAFE_LABEL_RE.sub("_", text) + text = text.strip("._-") + return text or "unlabeled" + + +def _format_metadata_lines(row: Dict[str, object], source_field: str, label_field: str, category_name: str) -> List[str]: + lines = [ + f"REVIEW_CATEGORY: {category_name}", + f"REVIEW_LABEL: {row.get(label_field, '')}", + ] + for key, value in row.items(): + if key in {source_field, label_field}: + continue + if isinstance(value, (dict, list)): + rendered = json.dumps(value, ensure_ascii=False) + else: + rendered = str(value) + lines.append(f"{key.upper()}: {rendered}") + return lines + + +def _read_manifest_rows(path: Path) -> List[Dict[str, object]]: + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + 
+ +def _write_review_copy( + src: Path, + dest: Path, + row: Dict[str, object], + source_field: str, + label_field: str, + category_name: str, +) -> None: + body = src.read_text(encoding="utf-8", errors="ignore") + header = "\n".join(_format_metadata_lines(row, source_field, label_field, category_name)) + dest.write_text(f"{header}\n\n=== REVIEW_SOURCE_CONTENT ===\n{body}", encoding="utf-8") + + +def materialize_manifest_categories( + manifest_path: Path, + output_dir: Path, + *, + source_field: str = "path", + label_field: str = "label", + category_name: str | None = None, +) -> Dict[str, object]: + rows = _read_manifest_rows(manifest_path) + category_name = category_name or label_field + + if output_dir.exists(): + for stale in output_dir.rglob("*.txt"): + stale.unlink() + for stale in output_dir.rglob("*.json"): + stale.unlink() + for stale in output_dir.rglob("*.jsonl"): + stale.unlink() + output_dir.mkdir(parents=True, exist_ok=True) + + labels_dir = output_dir / "by_label" + labels_dir.mkdir(parents=True, exist_ok=True) + + label_counts: Counter[str] = Counter() + written_rows: List[Dict[str, object]] = [] + + for row in rows: + if source_field not in row or label_field not in row: + raise KeyError(f"Manifest row missing required fields: {source_field!r}, {label_field!r}") + + src = Path(str(row[source_field])) + label = str(row[label_field]) + label_slug = _slugify_label(label) + dest_dir = labels_dir / label_slug + dest_dir.mkdir(parents=True, exist_ok=True) + dest = dest_dir / src.name + if dest.exists(): + stem = dest.stem + suffix = dest.suffix + counter = 2 + while True: + candidate = dest_dir / f"{stem}__dup{counter}{suffix}" + if not candidate.exists(): + dest = candidate + break + counter += 1 + + _write_review_copy(src, dest, row, source_field, label_field, category_name) + label_counts[label] += 1 + written_rows.append( + { + "label": label, + "label_slug": label_slug, + "source_path": str(src), + "copied_path": str(dest), + } + ) + + manifest_out 
= output_dir / "materialized_manifest.jsonl" + with manifest_out.open("w", encoding="utf-8") as handle: + for row in written_rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + summary = { + "manifest_path": str(manifest_path), + "output_dir": str(output_dir), + "category_name": category_name, + "source_field": source_field, + "label_field": label_field, + "row_count": len(rows), + "label_counts": dict(label_counts), + "label_dirs": { + _slugify_label(label): str(labels_dir / _slugify_label(label)) + for label in sorted(label_counts) + }, + } + (output_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary + + +def main() -> None: + parser = argparse.ArgumentParser(description="Materialize categorized review copies from a JSONL manifest.") + parser.add_argument("--manifest", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--source-field", default="path") + parser.add_argument("--label-field", default="label") + parser.add_argument("--category-name", default=None) + args = parser.parse_args() + + materialize_manifest_categories( + args.manifest, + args.output_dir, + source_field=args.source_field, + label_field=args.label_field, + category_name=args.category_name, + ) + + +if __name__ == "__main__": + main() diff --git a/src/glossapi/scripts/table_markdown_audit.py b/src/glossapi/scripts/table_markdown_audit.py new file mode 100644 index 0000000..1bba05d --- /dev/null +++ b/src/glossapi/scripts/table_markdown_audit.py @@ -0,0 +1,522 @@ +from __future__ import annotations + +import argparse +import html +import json +import re +from collections import Counter +from dataclasses import dataclass +from html.parser import HTMLParser +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + + +TABLE_BLOCK_RE = re.compile(r"(?is)") +ROW_RE = re.compile(r"(?is).*?") +CELL_RE = 
re.compile(r"(?is)<(td|th)\b(.*?)>(.*?)") +ATTR_RE = re.compile(r'([A-Za-z_:][-A-Za-z0-9_:.]*)\s*=\s*(".*?"|\'.*?\'|[^\s>]+)', re.S) +TAG_RE = re.compile(r"(?is)<[^>]+>") +DISALLOWED_TAG_RE = re.compile(r"(?is)]*>") +BREAK_TAG_RE = re.compile(r"(?is)") + + +@dataclass +class ParsedCell: + tag: str + text: str + rowspan: int + colspan: int + + +@dataclass +class TableAudit: + source_path: str + source_stem: str + table_index_in_doc: int + global_index: int + html: str + status: str + convertible: bool + broken: bool + reasons: List[str] + row_count: int + col_count: int + nonempty_ratio: float + duplicate_rows: int + header_mode: str + spans_present: bool + markdown: Optional[str] + + +class _CellHTMLNormalizer(HTMLParser): + def __init__(self) -> None: + super().__init__(convert_charrefs=True) + self.parts: List[str] = [] + self.link_stack: List[Optional[str]] = [] + + def _append_break(self) -> None: + if self.parts and not self.parts[-1].endswith("\n"): + self.parts.append("\n") + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + tag = tag.lower() + attr_map = {key.lower(): (value or "") for key, value in attrs} + if tag == "br": + self._append_break() + return + if tag in {"p", "div", "li"}: + self._append_break() + if tag == "li": + self.parts.append("- ") + return + if tag in {"sub", "sup"}: + self.parts.append(f"<{tag}>") + return + if tag == "img": + alt = " ".join(attr_map.get("alt", "").split()) + if alt: + self.parts.append(alt) + return + if tag == "a": + href = attr_map.get("href", "").strip() + self.link_stack.append(href or None) + self.parts.append("[") + return + + def handle_endtag(self, tag: str) -> None: + tag = tag.lower() + if tag in {"p", "div", "li"}: + self._append_break() + return + if tag in {"sub", "sup"}: + self.parts.append(f"") + return + if tag == "a": + href = self.link_stack.pop() if self.link_stack else None + if href: + self.parts.append(f"]({href})") + else: + self.parts.append("]") + + def 
handle_data(self, data: str) -> None: + self.parts.append(data) + + def get_text(self) -> str: + return "".join(self.parts) + + +def _parse_attrs(attr_text: str) -> Dict[str, str]: + attrs: Dict[str, str] = {} + for key, raw_value in ATTR_RE.findall(attr_text): + value = raw_value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}: + value = value[1:-1] + attrs[key.lower()] = html.unescape(value) + return attrs + + +def _normalize_cell_html(cell_html: str) -> str: + parser = _CellHTMLNormalizer() + parser.feed(cell_html) + parser.close() + text = parser.get_text() + text = BREAK_TAG_RE.sub("\n", text) + text = DISALLOWED_TAG_RE.sub(" ", text) + text = html.unescape(text) + lines = [" ".join(line.split()) for line in text.splitlines()] + return "\n".join(line for line in lines if line).strip() + + +def _parse_table_rows(table_html: str) -> Tuple[List[List[ParsedCell]], List[str]]: + reasons: List[str] = [] + if re.search(r"(?is) Tuple[Optional[List[List[str]]], List[str]]: + reasons: List[str] = [] + active_rowspans: Dict[int, int] = {} + expanded_rows: List[List[str]] = [] + max_cols = 0 + + for parsed_row in parsed_rows: + row: List[str] = [] + col_idx = 0 + + def fill_active_until_free() -> None: + nonlocal col_idx + while active_rowspans.get(col_idx, 0) > 0: + row.append("") + active_rowspans[col_idx] -= 1 + if active_rowspans[col_idx] <= 0: + del active_rowspans[col_idx] + col_idx += 1 + + fill_active_until_free() + for cell in parsed_row: + fill_active_until_free() + row.append(cell.text) + if cell.rowspan > 1: + active_rowspans[col_idx] = max(active_rowspans.get(col_idx, 0), cell.rowspan - 1) + start_col = col_idx + col_idx += 1 + for extra in range(1, cell.colspan): + row.append("") + if cell.rowspan > 1: + active_rowspans[start_col + extra] = max( + active_rowspans.get(start_col + extra, 0), cell.rowspan - 1 + ) + col_idx += 1 + fill_active_until_free() + + max_cols = max(max_cols, len(row)) + expanded_rows.append(row) + + 
while active_rowspans: + row: List[str] = [] + col_idx = 0 + max_active_col = max(active_rowspans) + while col_idx <= max_active_col: + if active_rowspans.get(col_idx, 0) > 0: + row.append("") + active_rowspans[col_idx] -= 1 + if active_rowspans[col_idx] <= 0: + del active_rowspans[col_idx] + else: + row.append("") + col_idx += 1 + max_cols = max(max_cols, len(row)) + expanded_rows.append(row) + + if max_cols == 0 or not expanded_rows: + reasons.append("empty_grid") + return None, reasons + + for row in expanded_rows: + if len(row) < max_cols: + row.extend([""] * (max_cols - len(row))) + return expanded_rows, reasons + + +def _markdown_escape(text: str) -> str: + text = text.replace("\\", "\\\\") + text = text.replace("|", "\\|") + text = text.replace("\n", "
") + return text + + +def _format_markdown_row(values: Sequence[str], widths: Sequence[int]) -> str: + padded = [value.ljust(width) for value, width in zip(values, widths)] + return "| " + " | ".join(padded) + " |" + + +def _should_infer_header_row(grid: Sequence[Sequence[str]]) -> bool: + if len(grid) < 2: + return False + first_row = grid[0] + if not first_row: + return False + return all(any(ch.isalnum() for ch in cell) for cell in first_row) + + +def _grid_to_markdown(grid: Sequence[Sequence[str]], header_mode: str) -> str: + if not grid: + return "" + cols = len(grid[0]) + if header_mode in {"explicit_first_row", "inferred_first_row"}: + header = [_markdown_escape(cell) for cell in grid[0]] + data_rows = list(grid[1:]) + else: + header = [""] * cols + data_rows = list(grid) + escaped_rows = [[_markdown_escape(cell) for cell in row] for row in data_rows] + sep = ["---"] * cols + widths = [ + max( + len(header[idx]), + len(sep[idx]), + *(len(row[idx]) for row in escaped_rows), + ) + for idx in range(cols) + ] + + lines = [ + _format_markdown_row(header, widths), + _format_markdown_row(sep, widths), + ] + for row in escaped_rows: + lines.append(_format_markdown_row(row, widths)) + return "\n".join(lines) + + +def _assess_content( + grid: Sequence[Sequence[str]], + *, + spans_present: bool, +) -> Tuple[bool, List[str], float, int]: + total_cells = sum(len(row) for row in grid) + nonempty_cells = sum(1 for row in grid for cell in row if any(ch.isalnum() for ch in cell)) + nonempty_ratio = (nonempty_cells / total_cells) if total_cells else 0.0 + + row_keys = [] + for row in grid: + normalized = tuple(" ".join(cell.split()).casefold() for cell in row) + nonempty_in_row = sum(1 for cell in normalized if any(ch.isalnum() for ch in cell)) + if nonempty_in_row >= 2: + row_keys.append(normalized) + duplicate_rows = sum(freq - 1 for freq in Counter(row_keys).values() if freq >= 2) + + reasons: List[str] = [] + broken = False + if total_cells >= 18 and nonempty_ratio <= 
0.15: + broken = True + reasons.append("near_empty_table") + if spans_present and total_cells >= 4 and nonempty_ratio <= 0.34: + broken = True + reasons.append("sparse_span_shell") + if len(grid) >= 4 and duplicate_rows >= 2: + broken = True + reasons.append("repeated_rows") + return broken, reasons, round(nonempty_ratio, 4), duplicate_rows + + +def audit_table(source_path: Path, table_index_in_doc: int, global_index: int, table_html: str) -> TableAudit: + parsed_rows, parse_reasons = _parse_table_rows(table_html) + spans_present = any(cell.rowspan > 1 or cell.colspan > 1 for row in parsed_rows for cell in row) + explicit_header = bool(parsed_rows and any(cell.tag == "th" for cell in parsed_rows[0])) + grid, expand_reasons = _expand_rows(parsed_rows) + reasons = list(dict.fromkeys(parse_reasons + expand_reasons)) + + if grid is None: + return TableAudit( + source_path=str(source_path), + source_stem=source_path.stem, + table_index_in_doc=table_index_in_doc, + global_index=global_index, + html=table_html, + status="broken_or_ambiguous", + convertible=False, + broken=True, + reasons=reasons or ["parse_failure"], + row_count=0, + col_count=0, + nonempty_ratio=0.0, + duplicate_rows=0, + header_mode="none", + spans_present=spans_present, + markdown=None, + ) + + broken, content_reasons, nonempty_ratio, duplicate_rows = _assess_content( + grid, + spans_present=spans_present, + ) + reasons = list(dict.fromkeys(reasons + content_reasons)) + if explicit_header: + header_mode = "explicit_first_row" + elif _should_infer_header_row(grid): + header_mode = "inferred_first_row" + else: + header_mode = "blank_first_row" + markdown = _grid_to_markdown(grid, header_mode=header_mode) + + if any(reason in {"nested_table", "invalid_rowspan", "invalid_colspan"} for reason in reasons): + status = "broken_or_ambiguous" + convertible = False + markdown = None + broken = True + else: + status = "convertible_but_broken" if broken else "convertible_clean" + convertible = True + + return 
TableAudit( + source_path=str(source_path), + source_stem=source_path.stem, + table_index_in_doc=table_index_in_doc, + global_index=global_index, + html=table_html, + status=status, + convertible=convertible, + broken=broken, + reasons=reasons, + row_count=len(grid), + col_count=len(grid[0]) if grid else 0, + nonempty_ratio=nonempty_ratio, + duplicate_rows=duplicate_rows, + header_mode=header_mode, + spans_present=spans_present, + markdown=markdown, + ) + + +def iter_tables(markdown_dir: Path): + global_index = 0 + for source_path in sorted(markdown_dir.glob("*.md")): + text = source_path.read_text(encoding="utf-8", errors="ignore") + table_index = 0 + for match in TABLE_BLOCK_RE.finditer(text): + table_index += 1 + global_index += 1 + yield source_path, table_index, global_index, match.group(0) + + +def write_review_file(output_dir: Path, audit: TableAudit) -> str: + filename = f"{audit.global_index:05d}__{audit.source_stem}__table_{audit.table_index_in_doc:03d}.txt" + output_path = output_dir / filename + lines = [ + f"SOURCE_PATH: {audit.source_path}", + f"SOURCE_STEM: {audit.source_stem}", + f"TABLE_INDEX_IN_DOC: {audit.table_index_in_doc}", + f"GLOBAL_INDEX: {audit.global_index}", + f"STATUS: {audit.status}", + f"CONVERTIBLE: {audit.convertible}", + f"BROKEN: {audit.broken}", + f"REASONS: {', '.join(audit.reasons) if audit.reasons else 'none'}", + f"ROWS: {audit.row_count}", + f"COLS: {audit.col_count}", + f"NONEMPTY_RATIO: {audit.nonempty_ratio}", + f"DUPLICATE_ROWS: {audit.duplicate_rows}", + f"HEADER_MODE: {audit.header_mode}", + f"SPANS_PRESENT: {audit.spans_present}", + "", + "=== HTML ===", + audit.html, + "", + "=== GITHUB_MD ===", + audit.markdown if audit.markdown is not None else "UNAVAILABLE", + "", + ] + output_path.write_text("\n".join(lines), encoding="utf-8") + return str(output_path) + + +def write_clean_markdown_file(output_dir: Path, audit: TableAudit) -> Optional[str]: + if audit.markdown is None: + return None + filename = 
f"{audit.global_index:05d}__{audit.source_stem}__table_{audit.table_index_in_doc:03d}.md" + output_path = output_dir / filename + output_path.write_text( + "\n".join( + [ + "## ORIGINAL_HTML", + "", + audit.html, + "", + "## GITHUB_MD", + "", + audit.markdown, + "", + ] + ), + encoding="utf-8", + ) + return str(output_path) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Audit HTML tables and export GitHub Markdown conversions.") + parser.add_argument("--input-dir", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + parser.add_argument("--max-tables", type=int, default=1000) + args = parser.parse_args() + + output_dir = args.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + tables_dir = output_dir / "tables" + tables_dir.mkdir(parents=True, exist_ok=True) + clean_md_dir = output_dir / "github_md_tables" + clean_md_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = output_dir / "manifest.jsonl" + summary_path = output_dir / "summary.json" + if manifest_path.exists(): + manifest_path.unlink() + if summary_path.exists(): + summary_path.unlink() + for stale in tables_dir.glob("*.txt"): + stale.unlink() + for stale in clean_md_dir.glob("*.md"): + stale.unlink() + + rows = [] + audited = 0 + for source_path, table_index, global_index, table_html in iter_tables(args.input_dir): + audited += 1 + audit = audit_table(source_path, table_index, global_index, table_html) + output_path = write_review_file(tables_dir, audit) + markdown_path = write_clean_markdown_file(clean_md_dir, audit) + row = { + "source_path": audit.source_path, + "source_stem": audit.source_stem, + "table_index_in_doc": audit.table_index_in_doc, + "global_index": audit.global_index, + "status": audit.status, + "convertible": audit.convertible, + "broken": audit.broken, + "reasons": audit.reasons, + "row_count": audit.row_count, + "col_count": audit.col_count, + "nonempty_ratio": audit.nonempty_ratio, + "duplicate_rows": 
audit.duplicate_rows, + "header_mode": audit.header_mode, + "spans_present": audit.spans_present, + "output_path": output_path, + "markdown_output_path": markdown_path, + } + rows.append(row) + if audited >= args.max_tables: + break + + with manifest_path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + reason_counts = Counter(reason for row in rows for reason in row["reasons"]) + status_counts = Counter(row["status"] for row in rows) + summary = { + "input_dir": str(args.input_dir), + "output_dir": str(output_dir), + "github_md_dir": str(clean_md_dir), + "audited_table_count": len(rows), + "convertible_count": sum(1 for row in rows if row["convertible"]), + "broken_count": sum(1 for row in rows if row["broken"]), + "status_counts": dict(status_counts), + "reason_counts": dict(reason_counts), + } + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/src/glossapi/scripts/table_sentence_context_review.py b/src/glossapi/scripts/table_sentence_context_review.py new file mode 100644 index 0000000..6e2a074 --- /dev/null +++ b/src/glossapi/scripts/table_sentence_context_review.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +import argparse +import importlib.util +import json +import re +import sys +from collections import Counter +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +PAGE_SPLIT_MARKER = "<--- Page Split --->" +TABLE_BLOCK_RE = re.compile(r"(?is)") +WORD_RE = re.compile(r"[^\W\d_]+", re.UNICODE) + + +_TABLE_AUDIT_PATH = Path(__file__).with_name("table_markdown_audit.py") +_TABLE_AUDIT_SPEC = importlib.util.spec_from_file_location("table_markdown_audit_local", _TABLE_AUDIT_PATH) +assert _TABLE_AUDIT_SPEC and _TABLE_AUDIT_SPEC.loader +_TABLE_AUDIT_MODULE = importlib.util.module_from_spec(_TABLE_AUDIT_SPEC) 
+sys.modules[_TABLE_AUDIT_SPEC.name] = _TABLE_AUDIT_MODULE +_TABLE_AUDIT_SPEC.loader.exec_module(_TABLE_AUDIT_MODULE) +_expand_rows = _TABLE_AUDIT_MODULE._expand_rows +_parse_table_rows = _TABLE_AUDIT_MODULE._parse_table_rows + + +def _extract_review_html(review_text: str) -> str: + return review_text.split("=== HTML ===\n", 1)[1].split("\n\n=== GITHUB_MD ===", 1)[0] + + +def _flatten_nonempty_cells(table_html: str) -> List[str]: + parsed_rows, _ = _parse_table_rows(table_html) + grid, _ = _expand_rows(parsed_rows) + if not grid: + return [] + nonempty: List[str] = [] + for row in grid: + for cell in row: + normalized = " ".join(cell.split()) + if any(ch.isalnum() for ch in normalized): + nonempty.append(normalized) + return nonempty + + +def _is_sentence_shell_candidate(review_row: Dict[str, object], table_html: str) -> Tuple[bool, Dict[str, int]]: + nonempty_cells = _flatten_nonempty_cells(table_html) + word_count = sum(len(WORD_RE.findall(cell)) for cell in nonempty_cells) + max_cell_len = max((len(cell) for cell in nonempty_cells), default=0) + metrics = { + "nonempty_cell_count": len(nonempty_cells), + "word_count": word_count, + "max_cell_len": max_cell_len, + } + is_candidate = ( + bool(review_row.get("broken")) + and "sparse_span_shell" in list(review_row.get("reasons", [])) + and len(nonempty_cells) == 1 + and word_count >= 6 + and max_cell_len >= 40 + ) + return is_candidate, metrics + + +def _find_table_page_context( + source_path: Path, + table_index_in_doc: int, +) -> Tuple[int, int, int, int, str, str, str]: + text = source_path.read_text(encoding="utf-8", errors="ignore") + pages = text.split(PAGE_SPLIT_MARKER) + seen = 0 + for page_idx, page in enumerate(pages): + matches = list(TABLE_BLOCK_RE.finditer(page)) + if seen + len(matches) < table_index_in_doc: + seen += len(matches) + continue + local_idx = table_index_in_doc - seen - 1 + match = matches[local_idx] + prev_page = pages[page_idx - 1] if page_idx > 0 else "" + curr_page = page + next_page = 
pages[page_idx + 1] if page_idx + 1 < len(pages) else "" + return page_idx, match.start(), match.end(), len(pages), prev_page, curr_page, next_page + raise ValueError(f"Could not find table {table_index_in_doc} in {source_path}") + + +def _smart_join(before_text: str, inline_text: str, after_text: str) -> str: + left = before_text.rstrip() + right = after_text.lstrip() + insertion = inline_text.strip() + + if left and not left.endswith(("\n", " ", "(", "[", "{", "“", "\"", "'")): + if left[-1].isalnum() and insertion and insertion[0].isalnum(): + left += " " + if right and not right.startswith(("\n", " ", ".", ",", ";", ":", "!", "?", ")", "]", "}", "”", "\"", "'")): + if insertion and insertion[-1].isalnum() and right[0].isalnum(): + insertion += " " + return left + insertion + right + + +def _context_fit_guess(before_text: str, inline_text: str, after_text: str) -> Tuple[bool, List[str]]: + reasons: List[str] = [] + word_count = len(WORD_RE.findall(inline_text)) + if word_count < 6: + reasons.append("short_inline_text") + left_window = before_text[-4:] + right_window = after_text[:4] + left_blockish = (not before_text) or ("\n" in left_window) or before_text.endswith((" ", "\t")) + right_blockish = (not after_text) or ("\n" in right_window) or after_text.startswith((" ", "\t")) + if not left_blockish: + reasons.append("not_block_isolated_left") + if not right_blockish: + reasons.append("not_block_isolated_right") + fit = word_count >= 6 and left_blockish and right_blockish + return fit, reasons + + +def _format_three_page_context( + prev_page: str, + curr_page: str, + next_page: str, + start: int, + end: int, + inline_text: str, +) -> Tuple[str, str]: + tagged_current = curr_page[:start] + "[[[TABLE_START]]]" + curr_page[start:end] + "[[[TABLE_END]]]" + curr_page[end:] + replaced_current = ( + curr_page[:start] + + "[[[INLINE_TEXT_START]]]" + + inline_text + + "[[[INLINE_TEXT_END]]]" + + curr_page[end:] + ) + original_context = ( + f"=== PAGE -1 
===\n{prev_page}\n\n" + f"=== PAGE 0 ===\n{tagged_current}\n\n" + f"=== PAGE +1 ===\n{next_page}\n" + ) + replaced_context = ( + f"=== PAGE -1 ===\n{prev_page}\n\n" + f"=== PAGE 0 ===\n{replaced_current}\n\n" + f"=== PAGE +1 ===\n{next_page}\n" + ) + return original_context, replaced_context + + +def main() -> None: + parser = argparse.ArgumentParser(description="Export 3-page context review files for sentence-in-table shells.") + parser.add_argument("--audit-dir", required=True, type=Path) + parser.add_argument("--output-dir", required=True, type=Path) + args = parser.parse_args() + + audit_dir = args.audit_dir + manifest_path = audit_dir / "manifest.jsonl" + output_dir = args.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + contexts_dir = output_dir / "contexts" + contexts_dir.mkdir(parents=True, exist_ok=True) + + summary_path = output_dir / "summary.json" + review_manifest_path = output_dir / "manifest.jsonl" + if summary_path.exists(): + summary_path.unlink() + if review_manifest_path.exists(): + review_manifest_path.unlink() + for stale in contexts_dir.glob("*.txt"): + stale.unlink() + + rows = [json.loads(line) for line in manifest_path.read_text(encoding="utf-8").splitlines() if line.strip()] + review_rows: List[Dict[str, object]] = [] + + for row in rows: + review_text = Path(str(row["output_path"])).read_text(encoding="utf-8") + table_html = _extract_review_html(review_text) + is_candidate, metrics = _is_sentence_shell_candidate(row, table_html) + if not is_candidate: + continue + + inline_text = _flatten_nonempty_cells(table_html)[0] + page_idx, start, end, page_count, prev_page, curr_page, next_page = _find_table_page_context( + Path(str(row["source_path"])), + int(row["table_index_in_doc"]), + ) + fit_guess, fit_reasons = _context_fit_guess(curr_page[:start], inline_text, curr_page[end:]) + original_context, replaced_context = _format_three_page_context( + prev_page, + curr_page, + next_page, + start, + end, + inline_text, + ) + filename = 
f"{int(row['global_index']):05d}__{row['source_stem']}__table_{int(row['table_index_in_doc']):03d}.txt" + output_path = contexts_dir / filename + output_path.write_text( + "\n".join( + [ + f"SOURCE_PATH: {row['source_path']}", + f"SOURCE_STEM: {row['source_stem']}", + f"TABLE_INDEX_IN_DOC: {row['table_index_in_doc']}", + f"GLOBAL_INDEX: {row['global_index']}", + f"PAGE_INDEX_ZERO_BASED: {page_idx}", + f"PAGE_NUMBER_ONE_BASED: {page_idx + 1}", + f"PAGE_COUNT: {page_count}", + f"FIT_GUESS: {fit_guess}", + f"FIT_REASONS: {', '.join(fit_reasons) if fit_reasons else 'none'}", + f"INLINE_TEXT_WORDS: {metrics['word_count']}", + f"INLINE_TEXT_CHARS: {metrics['max_cell_len']}", + "", + "=== INLINE_TEXT ===", + inline_text, + "", + "=== ORIGINAL_CONTEXT_3P ===", + original_context, + "", + "=== REPLACED_CONTEXT_3P ===", + replaced_context, + "", + ] + ), + encoding="utf-8", + ) + review_rows.append( + { + "source_path": row["source_path"], + "source_stem": row["source_stem"], + "table_index_in_doc": row["table_index_in_doc"], + "global_index": row["global_index"], + "page_number": page_idx + 1, + "page_count": page_count, + "fit_guess": fit_guess, + "fit_reasons": fit_reasons, + "inline_text_words": metrics["word_count"], + "inline_text_chars": metrics["max_cell_len"], + "output_path": str(output_path), + } + ) + + with review_manifest_path.open("w", encoding="utf-8") as handle: + for row in review_rows: + handle.write(json.dumps(row, ensure_ascii=False)) + handle.write("\n") + + fit_counter = Counter(bool(row["fit_guess"]) for row in review_rows) + reason_counter = Counter(reason for row in review_rows for reason in row["fit_reasons"]) + summary = { + "audit_dir": str(audit_dir), + "output_dir": str(output_dir), + "candidate_count": len(review_rows), + "fit_guess_count": fit_counter.get(True, 0), + "fit_guess_rate": round((fit_counter.get(True, 0) / len(review_rows)), 4) if review_rows else 0.0, + "fit_reason_counts": dict(reason_counter), + } + 
summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/tests/test_browser_gloss_downloader.py b/tests/test_browser_gloss_downloader.py new file mode 100644 index 0000000..ab94f15 --- /dev/null +++ b/tests/test_browser_gloss_downloader.py @@ -0,0 +1,477 @@ +import asyncio +import io + +import pandas as pd +from PIL import Image + +from glossapi import Corpus +from glossapi.download_policy import build_download_policy +from glossapi.gloss_browser_downloader import BrowserGlossDownloader, BrowserSessionState +import glossapi.corpus.phase_download as phase_download_mod + + +def test_browser_downloader_skips_viewer_interstitial(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + called = False + + async def _fake_browser_download(**kwargs): + nonlocal called + called = True + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + monkeypatch.setattr(downloader, "_download_via_browser_session", _fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue=( + "HTML document viewer returned instead of a downloadable file; " + "a source-specific fetcher with persisted cookies/redirect handling is required" + ), + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result is None + assert called is False + + +def test_browser_downloader_recovers_challenge_page(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + async def _fake_browser_download(**kwargs): + return ( + b"%PDF-1.7\n%dummy\n", + {"Content-Type": "application/pdf"}, + {"candidate_url": "https://example.org/file.pdf"}, + ) + + monkeypatch.setattr(downloader, "_download_via_browser_session", 
_fake_browser_download) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://example.org/file.pdf", + headers={"Content-Type": "text/html"}, + content=b"challenge", + html_issue=( + "HTML challenge page returned instead of a document; " + "browser automation or cookie bootstrap is required" + ), + retry_count=1, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 1) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.7") + assert not (tmp_path / "downloads" / ".part_browser_0").exists() + + +def test_browser_downloader_detects_anubis_challenge(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + issue = downloader._detect_html_interstitial( + "https://dias.library.tuc.gr/view/view/manf/77495", + {"Content-Type": "text/html"}, + b"Making sure you're not a bot!" + b"anubis /.within.website/", + ) + + assert issue is not None + assert "challenge page returned" in issue.lower() + + +def test_infer_file_extension_prefers_html_magic_over_pdf_url(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + file_ext = downloader.infer_file_extension( + "https://repository.academyofathens.gr/document/43963.pdf", + {"Content-Type": "text/html"}, + b"spa shell", + ) + + assert file_ext == "html" + + +def test_infer_file_extension_accepts_pdf_header_after_small_prefix(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + file_ext = downloader.infer_file_extension( + "https://pergamos.lib.uoa.gr/uoa/dl/object/1316268/file.pdf", + {"Content-Type": "application/pdf"}, + b"test123%PDF-1.5\nrest", + ) + + assert file_ext == "pdf" + + +def test_finalize_download_result_rejects_invalid_pdf_payload(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + result = asyncio.run( + downloader._finalize_download_result( + row_index=0, + url="https://example.org/file.pdf", + 
resp_headers={"Content-Type": "application/pdf"}, + content=b"this is not a pdf payload", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result[0] is False + assert result[2] == "pdf" + assert "invalid pdf signature" in result[3].lower() + assert not (tmp_path / "downloads" / "AAA_000.pdf").exists() + + +def test_browser_downloader_recovers_academy_bookreader_pdf(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="standard") + + async def _fake_download_academy(url: str): + return b"%PDF-1.4\n%academy\n" + + monkeypatch.setattr(downloader, "_download_academy_bookreader_pdf", _fake_download_academy) + + result = asyncio.run( + downloader._recover_html_interstitial( + row_index=0, + url="https://repository.academyofathens.gr/document/43963.pdf", + headers={"Content-Type": "text/html"}, + content=b"", + html_issue="Expected a file-like response but received HTML instead", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert (tmp_path / "downloads" / "AAA_000.pdf").read_bytes().startswith(b"%PDF-1.4") + + +def test_academy_images_to_pdf_bytes_builds_pdf(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + blobs = [] + for color in ("red", "blue"): + image = Image.new("RGB", (16, 16), color=color) + buf = io.BytesIO() + image.save(buf, format="JPEG") + blobs.append(buf.getvalue()) + + pdf_bytes = downloader._academy_images_to_pdf_bytes(blobs) + + assert pdf_bytes.startswith(b"%PDF-") + + +def test_browser_downloader_domain_cookie_lookup(tmp_path): + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + domain_cookies={"eur-lex.europa.eu": {"token": "abc123"}}, + ) + + cookies = downloader._domain_cookies_for_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) + + assert cookies == {"token": "abc123"} + + +def 
test_browser_downloader_bootstrap_url_uses_base_for_file_endpoints(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._choose_browser_bootstrap_url( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + ) == "https://eur-lex.europa.eu" + + +def test_browser_downloader_ignores_err_aborted_for_file_navigation(tmp_path): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path)) + + assert downloader._should_ignore_navigation_exception( + "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + assert not downloader._should_ignore_navigation_exception( + "https://example.org/article", + RuntimeError("Page.goto: net::ERR_ABORTED"), + ) + + +def test_browser_downloader_uses_default_browser_route_for_preflight(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="browser") + + async def _fake_download_browser_route(**kwargs): + return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://example.org/file.pdf", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + + +def test_browser_downloader_reuses_cached_domain_session(tmp_path, monkeypatch): + downloader = BrowserGlossDownloader(output_dir=str(tmp_path), default_download_route="auto") + bootstraps = 0 + fetches = 0 + + async def _fake_fetch_with_browser_session_state(**kwargs): + nonlocal fetches + fetches += 1 + return b"%PDF-1.7\n", {"Content-Type": "application/pdf"}, {"candidate_url": kwargs["url"]} + + async def _bootstrap(**kwargs): + nonlocal bootstraps + bootstraps += 1 + return BrowserSessionState(user_agent="UA", cookie_header="a=b", cached_at=10_000.0), [] + + 
monkeypatch.setattr(downloader, "_bootstrap_browser_session_state", _bootstrap) + monkeypatch.setattr(downloader, "_fetch_with_browser_session_state", _fake_fetch_with_browser_session_state) + monkeypatch.setattr("glossapi.gloss_browser_downloader.time.time", lambda: 10_100.0) + + first = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file.pdf", referer=None) + ) + second = asyncio.run( + downloader._download_via_browser_session(url="https://eur-lex.europa.eu/file2.pdf", referer=None) + ) + + assert first[0].startswith(b"%PDF") + assert second[0].startswith(b"%PDF") + assert bootstraps == 1 + assert fetches == 2 + + +def test_browser_downloader_policy_routes_domain_to_browser(tmp_path, monkeypatch): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["eur-lex.europa.eu"]}, + "downloader": "browser", + "browser_timeout_ms": 1234, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + observed = {} + + async def _fake_download_browser_route(**kwargs): + observed.update(kwargs) + return True, "AAA_000.pdf", "pdf", "", 0 + + monkeypatch.setattr(downloader, "_download_browser_route", _fake_download_browser_route) + + result = asyncio.run( + downloader._preflight_download( + row_index=0, + url="https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360", + retry_count=0, + filename_base="AAA_000", + referer=None, + ) + ) + + assert result == (True, "AAA_000.pdf", "pdf", "", 0) + assert observed["route_options"]["browser_timeout_ms"] == 1234 + + +def test_download_policy_preserves_transport_and_scheduler_options(): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ikee.lib.auth.gr"]}, + "downloader": "standard", + "request_timeout": 120, + "ssl_verify": False, + 
"per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 3, + "skip_failed_after": 5, + "domain_cookies": {"sessionid": "abc"}, + } + ], + } + ) + + route, options = policy.resolve("https://ikee.lib.auth.gr/record/123/files/file.pdf") + + assert route == "standard" + assert options["request_timeout"] == 120 + assert options["ssl_verify"] is False + assert options["per_domain_concurrency"] == 2 + assert options["domain_concurrency_floor"] == 1 + assert options["domain_concurrency_ceiling"] == 3 + assert options["skip_failed_after"] == 5 + assert options["domain_cookies"] == {"sessionid": "abc"} + + +def test_browser_downloader_route_options_apply_standard_transport_settings(tmp_path): + policy = build_download_policy( + { + "default": {"downloader": "standard"}, + "rules": [ + { + "match": {"domains": ["ktisis.cut.ac.cy"]}, + "downloader": "standard", + "request_timeout": 90, + "ssl_verify": False, + "per_domain_concurrency": 2, + "domain_concurrency_floor": 1, + "domain_concurrency_ceiling": 2, + "skip_failed_after": 4, + "domain_cookies": {"sessionid": "abc"}, + } + ], + } + ) + downloader = BrowserGlossDownloader( + output_dir=str(tmp_path), + download_policy=policy, + default_download_route="standard", + ) + + async def _build_connector(): + return downloader._build_session_connector( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + + route, route_options = downloader._resolve_route("https://ktisis.cut.ac.cy/items/123/file.pdf") + timeout = downloader._build_request_timeout(0, route_options=route_options) + connector = asyncio.run(_build_connector()) + cookies = downloader._resolve_request_cookies( + "https://ktisis.cut.ac.cy/items/123/file.pdf", + route_options=route_options, + ) + floor, ceiling, start, skip_after = downloader._resolve_domain_scheduler_settings(route_options) + + assert route == "standard" + assert timeout.total == 90 + assert connector is not None + assert 
cookies["sessionid"] == "abc" + assert (floor, ceiling, start, skip_after) == (1, 2, 2, 4) + + +def test_corpus_download_mode_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + result = corpus.download(input_parquet=input_parquet, download_mode="browser") + + assert observed["cls"] == "browser" + assert observed["kwargs"]["default_download_route"] == "browser" + assert bool(result["download_success"].iloc[0]) is True + assert (tmp_path / "download_results" / f"download_results_{input_parquet.name}").exists() + + +def test_corpus_browser_mode_alias_selects_browser_downloader(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://example.org/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["cls"] = "browser" + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://example.org/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + 
corpus.download(input_parquet=input_parquet, browser_mode=True) + + assert observed["cls"] == "browser" + + +def test_corpus_policy_file_selects_browser_router(tmp_path, monkeypatch): + input_df = pd.DataFrame({"url": ["https://eur-lex.europa.eu/file.pdf"]}) + input_parquet = tmp_path / "urls.parquet" + input_df.to_parquet(input_parquet, index=False) + policy_path = tmp_path / "download_policy.yml" + policy_path.write_text( + "default:\n downloader: standard\nrules:\n - match:\n domains: [eur-lex.europa.eu]\n downloader: browser\n", + encoding="utf-8", + ) + + observed = {} + + class DummyBrowserDownloader: + def __init__(self, *args, **kwargs): + observed["kwargs"] = kwargs + + def download_files(self, input_parquet: str, **kwargs): + return pd.DataFrame( + { + "url": ["https://eur-lex.europa.eu/file.pdf"], + "filename": ["AAA_000.pdf"], + "download_success": [True], + "download_error": [""], + } + ) + + monkeypatch.setattr(phase_download_mod, "BrowserGlossDownloader", DummyBrowserDownloader) + + corpus = Corpus(input_dir=tmp_path, output_dir=tmp_path) + corpus.download(input_parquet=input_parquet, download_policy_file=policy_path) + + assert observed["kwargs"]["download_policy_file"] == policy_path.resolve() + assert observed["kwargs"]["default_download_route"] == "standard" diff --git a/tests/test_corpus_clean_enhancements.py b/tests/test_corpus_clean_enhancements.py index b876a20..8ef8926 100644 --- a/tests/test_corpus_clean_enhancements.py +++ b/tests/test_corpus_clean_enhancements.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os from pathlib import Path @@ -7,6 +8,14 @@ import pytest from glossapi import Corpus +from glossapi.corpus.phase_clean import ( + _find_word_repeat_spans, + _find_word_repeat_spans_python, + _merge_labeled_raw_spans, + _normalize_alnum_with_map_skip_tags, +) +from glossapi.scripts.table_markdown_audit import audit_table, write_clean_markdown_file +from glossapi.scripts.review_manifest_materialize import 
materialize_manifest_categories LATEX_MOJIBAKE_MD = """# Sample Document @@ -55,6 +64,144 @@ def _run_clean_and_read_row( return row.iloc[0] +def _run_clean_ocr_and_read_row( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + drop_bad: bool = False, +) -> pd.Series: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + corpus.clean_ocr(drop_bad=drop_bad) + parquet = corpus.output_dir / "download_results" / "download_results.parquet" + df = pd.read_parquet(parquet) + row = df[df["filename"] == f"{stem}.pdf"] + assert not row.empty, "Expected OCR metrics entry for generated markdown" + return row.iloc[0] + + +def _run_clean_ocr_and_read_cleaned_text( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + write_cleaned_files: bool = True, +) -> str: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + corpus.clean_ocr(write_cleaned_files=write_cleaned_files) + cleaned_path = corpus.cleaned_markdown_dir / f"{stem}.md" + assert cleaned_path.exists(), f"Expected cleaned markdown output at {cleaned_path}" + return cleaned_path.read_text(encoding="utf-8") + + +def _run_clean_ocr_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_pages: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_debug" + rows = corpus.clean_ocr_debug(debug_dir, max_pages=max_pages) + return rows, debug_dir + + +def _run_clean_ocr_numeric_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_pages: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_numeric_debug" + rows = corpus.clean_ocr_numeric_debug(debug_dir, 
max_pages=max_pages) + return rows, debug_dir + + +def _run_clean_ocr_numeric_word_debug_docs( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 100, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_numeric_word_debug" + rows = corpus.clean_ocr_numeric_word_debug_docs(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def _run_clean_ocr_hybrid_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 100, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_hybrid_debug" + rows = corpus.clean_ocr_hybrid_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def _run_clean_ocr_latex_slot_progression_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_latex_slot_progression_debug" + rows = corpus.clean_ocr_latex_slot_progression_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def _run_clean_ocr_latex_debug_export( + corpus: Corpus, + markdown_text: str, + *, + stem: str = "sample", + max_docs: int | None = 1000, +) -> tuple[list[dict], Path]: + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(markdown_text, encoding="utf-8") + debug_dir = corpus.output_dir / "ocr_latex_debug" + rows = corpus.clean_ocr_latex_debug(debug_dir, max_docs=max_docs) + return rows, debug_dir + + +def test_merge_labeled_raw_spans_merges_same_type_with_gap_of_40() -> None: + text = "A" * 10 + ("x" * 40) + "B" * 10 + spans = [ + {"start": 0, "end": 10, "match_types": ["word_repeat"], 
"category": "word"}, + {"start": 50, "end": 60, "match_types": ["word_repeat"], "category": "word"}, + ] + merged = _merge_labeled_raw_spans(text, spans) + assert len(merged) == 1 + assert merged[0]["start"] == 0 + assert merged[0]["end"] == 60 + + +def test_merge_labeled_raw_spans_does_not_merge_same_type_with_gap_of_41() -> None: + text = "A" * 10 + ("x" * 41) + "B" * 10 + spans = [ + {"start": 0, "end": 10, "match_types": ["word_repeat"], "category": "word"}, + {"start": 51, "end": 61, "match_types": ["word_repeat"], "category": "word"}, + ] + merged = _merge_labeled_raw_spans(text, spans) + assert len(merged) == 2 + + def test_clean_skips_latex_blocks_for_mojibake(tmp_path: Path) -> None: corpus = _build_corpus(tmp_path) row = _run_clean_and_read_row(corpus, LATEX_MOJIBAKE_MD, stem="latex-case") @@ -88,3 +235,1490 @@ def test_clean_flags_uppercase_glyph_noise(tmp_path: Path) -> None: filter_value = row.get("filter") or "" assert "mojibake>0.1" in filter_value or "non_greek_text" in filter_value assert bool(row.get("needs_ocr", False)) + + +def test_clean_ocr_populates_script_metrics(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "Αυτή είναι η πρώτη σελίδα.\n<--- Page Split --->\nΚαὶ αὕτη εἶναι ἡ δευτέρα.", + stem="ocr-script-metrics", + ) + assert float(row.get("percentage_greek") or 0.0) > 70.0 + assert float(row.get("latin_percentage") or 0.0) < 5.0 + assert float(row.get("polytonic_ratio") or 0.0) > 0.0 + assert not bool(row.get("ocr_noise_suspect", False)) + assert (row.get("filter") or "") == "ok" + + +def test_clean_ocr_writes_cleaned_markdown_with_combined_loop(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + content = _run_clean_ocr_and_read_cleaned_text( + corpus, + ( + "1111 1 1 1 1 1 1 1 1 1 1\n" + "<--- Page Split --->\n" + "1. Από το 2020, η αγορά των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + "
NameScore
Alice10
\n" + ), + stem="ocr-clean-shared-loop", + ) + assert "<--- Page Split --->" in content + assert "" not in content + assert "| Name" in content + assert "| Alice" in content + assert corpus.markdown_dir == corpus.cleaned_markdown_dir + + +def test_clean_ocr_drops_sentence_shell_and_repeated_row_tables(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + content = _run_clean_ocr_and_read_cleaned_text( + corpus, + ( + "Πρόλογος\n" + "
Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας
\n" + "
StateValue
Alpha10
Beta20
Alpha10
Beta20
\n" + "Επίλογος\n" + ), + stem="ocr-clean-drop-tables", + ) + assert "" not in content + assert "Η οινοφόρος άμπελος" not in content + assert "| Alpha" not in content + assert "Πρόλογος" in content + assert "Επίλογος" in content + + +def test_clean_ocr_supports_score_only_mode(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + md_path = corpus.markdown_dir / "ocr-clean-score-only.md" + md_path.write_text("Κανονικό περιεχόμενο.\n", encoding="utf-8") + corpus.clean_ocr(write_cleaned_files=False) + assert not any(corpus.cleaned_markdown_dir.glob("*.md")) + assert corpus.markdown_dir == corpus.output_dir / "markdown" + + +def test_clean_ocr_supports_combined_clean_and_debug_outputs(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + stem = "ocr-clean-debug" + source_text = ( + "Πρόλογος\n" + "
Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας
\n" + "<--- Page Split --->\n" + "των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + ) + md_path = corpus.markdown_dir / f"{stem}.md" + md_path.write_text(source_text, encoding="utf-8") + + corpus.clean_ocr(write_cleaned_files=True, write_debug_files=True) + + cleaned_path = corpus.cleaned_markdown_dir / f"{stem}.md" + debug_dir = corpus.output_dir / "debug" + debug_path = debug_dir / f"{stem}.md" + assert cleaned_path.exists() + assert debug_path.exists() + + cleaned_text = cleaned_path.read_text(encoding="utf-8") + debug_text = debug_path.read_text(encoding="utf-8") + assert "= 2 + + match_rows = [ + json.loads(line) + for line in (debug_dir / "match_index.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert len(match_rows) >= 2 + source_pages = source_text.split("<--- Page Split --->") + for row in match_rows: + page_text = source_pages[int(row["page_number"]) - 1] + assert page_text[int(row["start_char"]):int(row["end_char"])] == row["matched_text"] + word_row = next(row for row in match_rows if row["match_type"] == "word_repeat") + assert int(word_row["repeat_count"]) >= 3 + assert int(word_row["period"]) > 0 + + page_metrics_rows = (debug_dir / "page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert len(page_metrics_rows) == 2 + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["doc_count"] == 1 + assert summary["match_count"] >= 2 + + +def test_clean_ocr_ignores_numeric_lists_and_dotted_values(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "1. 2. 3. 4. 5. 6. 
7.\n9.9.9.9.9\n", + stem="ocr-non-repeat-noise", + drop_bad=True, + ) + assert not bool(row.get("ocr_noise_suspect", False)) + assert int(row.get("ocr_repeat_phrase_run_max") or 0) == 0 + assert int(row.get("ocr_repeat_line_run_max") or 0) == 0 + flags = row.get("ocr_noise_flags") or "" + assert flags == "" + assert "ocr_noise" not in (row.get("filter") or "") + assert "ocr-non-repeat-noise" in corpus.good_files + + +def test_clean_ocr_flags_repeated_phrase_noise(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + row = _run_clean_ocr_and_read_row( + corpus, + "0 0 0 0 0 0\n1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n", + stem="ocr-repeat-noise", + drop_bad=True, + ) + assert bool(row.get("ocr_noise_suspect", False)) + assert int(row.get("ocr_repeat_phrase_run_max") or 0) >= 6 + assert int(row.get("ocr_repeat_line_run_max") or 0) >= 6 + flags = row.get("ocr_noise_flags") or "" + assert "repeat_phrase_run" in flags + assert "repeat_line_run" in flags + assert "ocr_noise" in (row.get("filter") or "") + assert "ocr-repeat-noise" not in corpus.good_files + + +def test_clean_ocr_debug_exports_annotated_pages(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_debug_export( + corpus, + ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 7.\n" + "0 0 0 0 0 0\n" + "1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n" + ), + stem="ocr-debug-source", + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert row["page_index_in_file"] == 2 + assert row["match_count"] >= 2 + assert "repeat_phrase_run" in row["match_types"] + assert "repeat_line_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "1. 2. 3. 4. 5. 6. 7." 
in content + assert "0 0 0 0 0 0" in content + assert "1.1" in content + + manifest = debug_dir / "manifest.jsonl" + lines = manifest.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 1 + + +def test_clean_ocr_debug_respects_sample_limit(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + md_path = corpus.markdown_dir / "ocr-debug-many.md" + md_path.write_text( + ( + "0 0 0 0 0 0\n" + "<--- Page Split --->\n" + "0 0 0 0 0 0\n" + "<--- Page Split --->\n" + "0 0 0 0 0 0\n" + ), + encoding="utf-8", + ) + debug_dir = corpus.output_dir / "ocr_debug" + rows = corpus.clean_ocr_debug(debug_dir, max_pages=2, sample_seed=0) + assert len(rows) == 2 + manifest = debug_dir / "manifest.jsonl" + lines = manifest.read_text(encoding="utf-8").strip().splitlines() + assert len(lines) == 2 + + +def test_clean_ocr_numeric_debug_flags_ascending_sequences(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + ( + "Κανονικό κείμενο.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 7. 8. 9. 10.\n" + ), + stem="ocr-numeric-progress", + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert "ascending_numeric_sequence" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert ( + "1. 2. 3. 4. 5. 6. 7. 8. 9. 
10" + in content + ) + + +def test_clean_ocr_numeric_debug_flags_compact_repeated_numbers(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "2.2.2.2.2.2.2.2.\n", + stem="ocr-numeric-compact-repeat", + ) + assert len(rows) == 1 + row = rows[0] + assert "repeat_numeric_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "2.2.2.2.2.2.2.2" in content + + +def test_clean_ocr_numeric_debug_flags_same_digit_runs(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "1111 1 1 1 111 11 1 111 1 11\n", + stem="ocr-numeric-same-digit", + ) + assert len(rows) == 1 + row = rows[0] + assert "same_digit_numeric_run" in row["match_types"] + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert ( + "1111 1 1 1 111 11 1 111 1 11" + in content + ) + + +def test_clean_ocr_numeric_debug_merges_close_same_category_spans(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + "1111 1 1 1 1 1 1 1 1 1 1 xy 1111 1 1 1 1 1 1 1 1 1 1\n", + stem="ocr-numeric-gap-merge", + ) + assert len(rows) == 1 + exported = Path(rows[0]["output_path"]) + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert content.count("") == 1 + assert ( + "" + "1111 1 1 1 1 1 1 1 1 1 1 xy 1111 1 1 1 1 1 1 1 1 1 1" + "" + in content + ) + + +def test_clean_ocr_numeric_debug_flags_numeric_page_collapse(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + tokens = ("22 2 22 6 22 8 22 1 22 7 22 5 " * 12).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + tokens + "\n", + 
stem="ocr-numeric-page-collapse", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_ignores_punctuation_only_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + tokens = ("1 1 . 1 1 . 2 2 . 2 2 . " * 16).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + tokens + "\n", + stem="ocr-numeric-page-collapse-punct", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_ignores_container_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + numeric_body = ("11 11 11 22 22 22 33 33 33 44 44 44 " * 8).strip() + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + f"```\n( {numeric_body} )\n```\n", + stem="ocr-numeric-page-collapse-fenced", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert numeric_body in content + + +def test_clean_ocr_numeric_debug_page_collapse_accepts_dotted_numeric_tokens( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + dotted_tokens = " ".join(f"{major}.{minor}." 
for major in range(1, 6) for minor in range(1, 21)) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + dotted_tokens + "\n", + stem="ocr-numeric-page-collapse-dotted", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert dotted_tokens in content + + +def test_clean_ocr_numeric_debug_page_collapse_accepts_compact_numeric_atom_pages( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + compact_tokens = " ".join(["1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1."] * 20) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + compact_tokens + "\n", + stem="ocr-numeric-page-collapse-compact-atoms", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "" in content + assert compact_tokens in content + + +def test_clean_ocr_numeric_debug_flags_numeric_block_after_heading(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + numeric_block = "\n\n".join( + f"{i}.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.{i}.1.1.1.1.1.1.1.1.1.1.1.1.1.1" + for i in range(1, 27) + ) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + f"1\n\n## ΑΠΡΙΛΙΟΣ\n\n1\n\n{numeric_block}\n", + stem="ocr-numeric-block-heading", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_block_collapse" in row["match_types"] + assert row["match_count"] == 1 + + exported = Path(row["output_path"]) + assert exported.exists() + assert exported.parent == debug_dir + content = exported.read_text(encoding="utf-8") + assert "## ΑΠΡΙΛΙΟΣ" in content + assert "" in content + 
assert numeric_block in content + + +def test_clean_ocr_numeric_word_debug_docs_runs_numeric_then_word(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "1111 1 1 1 1 1 1 1 1 1 1\n" + "<--- Page Split --->\n" + "1. Από το 2020, η αγορά των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ\n" + "
Standard nameStandard nameStandard name
\n" + ), + stem="ocr-number-word-doc", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_count"] == 2 + assert row["matched_page_count"] == 2 + assert row["numeric_match_count"] >= 1 + assert row["word_match_count"] >= 1 + assert "word_repeat" in row["match_types"] + + exported = debug_dir / "ocr-number-word-doc.md" + content = exported.read_text(encoding="utf-8") + assert "<--- Page Split --->" in content + assert content.count("") == 1 + assert "" in content + assert "Standard name" not in content + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["doc_count"] == 1 + assert summary["match_count"] >= 2 + assert summary["numeric_match_count"] >= 1 + assert summary["word_match_count"] >= 1 + + page_metrics = (debug_dir / "page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + assert len(page_metrics) == 2 + match_index = [ + json.loads(line) + for line in (debug_dir / "match_index.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert any(row["match_type"] == "same_digit_numeric_run" for row in match_index) + assert any(row["match_type"] == "word_repeat" for row in match_index) + + +def test_rust_word_repeat_spans_match_python_reference(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + corpus._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("find_word_repeat_spans",), + ) + cases = [ + "των εργασιών των εργασιών των εργασιών των εργασιών των εργασιώ", + "1.1 Hypergeometric function 1.1.1 Hypergeometric function 1.1.2 Hypergeometric function 1.1.3 Hypergeometric function", + r"\Delta \Delta \Delta \Delta \Delta", + "το σημείο 1, το σημείο 2, το σημείο 3, το σημείο 4, το σημείο 5, το σημείο 6", + ] + for text in cases: + normalized, _ = _normalize_alnum_with_map_skip_tags(text) + assert _find_word_repeat_spans( + normalized, + rep_threshold=4, + min_period=3, + window=96, + ) == 
_find_word_repeat_spans_python( + normalized, + rep_threshold=4, + min_period=3, + window=96, + ) + + +def test_clean_ocr_numeric_word_debug_docs_flags_empty_html_table_collapse(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + empty_table = ( + "" + "" + "" + "" + "" + "" + "" + "
\n" + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + empty_table, + stem="ocr-empty-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-empty-table.md").read_text(encoding="utf-8") + assert "" not in content + assert "|" in content + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["table_match_count"] == 1 + + +def test_clean_ocr_numeric_word_debug_docs_flags_repeated_html_table_rows(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + repeated_table = ( + "" + "" + "" + "" + "" + "" + "
StateValue
Alpha10
Beta20
Alpha10
Beta20
\n" + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated_table, + stem="ocr-repeated-table-rows", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-repeated-table-rows.md").read_text(encoding="utf-8") + assert "" not in content + assert "| Alpha" in content or "| Beta" in content + + +def test_clean_ocr_numeric_word_debug_docs_ignores_small_distinct_html_table(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "" + "" + "" + "" + "
NameScore
Alice10
Bob11
\n" + ), + stem="ocr-distinct-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 0 + assert "table_repeat" not in row["match_types"] + + content = (debug_dir / "ocr-distinct-table.md").read_text(encoding="utf-8") + assert "" not in content + assert "| Name" in content + assert "| Alice" in content + + +def test_clean_ocr_numeric_word_debug_docs_flags_sentence_shell_table(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + "
" + "Η οινοφόρος άμπελος αναπτύχθηκε στην Αρμενία, νότια της Κασπίας" + "
\n" + ), + stem="ocr-sentence-shell-table", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["table_match_count"] == 1 + assert "table_repeat" in row["match_types"] + + content = (debug_dir / "ocr-sentence-shell-table.md").read_text(encoding="utf-8") + assert "" not in content + + +def test_clean_ocr_numeric_word_debug_docs_transfers_pure_numeric_repeats_to_numeric( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "12 12 12 12 12 12 12 12 12 12 12 12\n", + stem="ocr-number-transfer", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["numeric_match_count"] >= 1 + assert row["word_match_count"] == 0 + assert "numeric_repeat" in row["match_types"] + assert "word_repeat" not in row["match_types"] + + content = (debug_dir / "ocr-number-transfer.md").read_text(encoding="utf-8") + assert "12 12 12 12 12 12 12 12 12 12 12 12" in content + + +def test_clean_ocr_numeric_word_debug_docs_flags_hybrid_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "1.1 Hypergeometric function 1.1.1 Hypergeometric function 1.1.2 Hypergeometric function 1.1.3 Hypergeometric function 1.1.4 Hypergeometric function\n", + stem="ocr-combined-hybrid", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["hybrid_match_count"] >= 1 + assert "hybrid_repeat" in row["match_types"] + + content = (debug_dir / "ocr-combined-hybrid.md").read_text(encoding="utf-8") + assert "= 1 + + +def test_clean_ocr_numeric_word_debug_docs_ignores_latex_in_shared_repeat(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ S=\frac{1}{16\pi}\int\sqrt{-g}d^{4}x\left[\phi R-\frac{\omega(\phi)}{\phi}\phi_{,a}\phi^{,a}+2\phi\lambda(\phi)\right]+S_{M} \quad (149) \]" + "\n", + stem="ocr-latex-ignore", + 
max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["word_match_count"] == 0 + assert row["latex_match_count"] == 0 + assert "word_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\( \varepsilon_{H} = \frac{1}{2} \left( \frac{1}{2} \left( \frac{1}{2} \left( \frac{1}{2} \left( x \right) \right) \right) \right) \)" + + "\n", + stem="ocr-latex-structural-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-structural-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ uαuαuαuαuαuαuαuαuα \]" + + "\n", + stem="ocr-latex-markup-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-markup-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ K:\mathrm{\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa} \]" + + "\n", + stem="ocr-latex-text-wrapper-noise", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-text-wrapper-noise.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ 
K:\mathrm{\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa\iota\kappa \]" + + "\n", + stem="ocr-latex-unclosed-text-wrapper-noise", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + assert "latex_repeat" in row["match_types"] + content = (debug_dir / "ocr-latex-unclosed-text-wrapper-noise.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"\[ \delta R^{\mu\nu}=g^{\mu\alpha}g^{\nu\beta}\left(\nabla_{\kappa}\left(\delta g_{\nu\alpha}\right)\right). \]" + + "\n", + stem="ocr-latex-bookkeeping-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + assert "latex_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-bookkeeping-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + r"\[ N_{bd} = \frac{f_{bk} \cdot l_e \cdot \pi \cdot d_b}{\gamma_b} \]", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-consecutive-exact", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-consecutive-exact.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,02m}{1,5} = 10,05KN \)", + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,03m}{1,5} = 15,07KN \)", + r"\( N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,04m}{1,5} = 20,10KN \)", + r"\( 
N_{bd} = \frac{0,6Mpa \cdot 0,4m \cdot \pi \cdot 0,05m}{1,5} = 25,12KN \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-consecutive-template", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-consecutive-template.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join([r"\( \Delta \)", r"\( \Delta \)", r"\( \Delta \)", r"\( \Delta \)"]) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-delta-run", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-delta-run.md").read_text(encoding="utf-8") + assert content.count(" None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + ( + r"\( Q^{I} \) : \( \uparrow\uparrow\uparrow \) + \( \uparrow\downarrow\downarrow \) + ..." + "\n\n" + r"\( Q^{IV} \) : \( \uparrow\uparrow\uparrow \) + \( \downarrow\downarrow\downarrow \) + ..." 
+ "\n" + ), + stem="ocr-latex-diagram-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + assert "latex_repeat" not in row["match_types"] + content = (debug_dir / "ocr-latex-diagram-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + warmup = [r"\( \alpha \)", r"\( \beta \)", r"\( \gamma \)", r"\( \gamma \)"] + block = [ + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \beta \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \gamma \)", + ] + repeated = " ".join(warmup + block + block) + " \\( \\alpha" + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-block", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-block.md").read_text(encoding="utf-8") + assert " \( \alpha" not in content + assert r"\( \alpha" in content + + +def test_clean_ocr_numeric_word_debug_docs_ignores_nonrepeating_short_atom_inventory( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + r"\( \alpha \)", + r"\( \beta \)", + r"\( \gamma \)", + r"\( \delta \)", + r"\( \omega \)", + r"\( \mu \)", + r"\( \nu \)", + r"\( \lambda \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-inventory-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-short-atom-inventory-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta \)", + r"\( \alpha\beta\gamma\delta 
\)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + r"\( \alpha\beta\gamma \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-segment-repeat", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-segment-repeat.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = r"\( \Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta_{i}\Delta \)" + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-short-atom-chain-segment", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-short-atom-chain-segment.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + command_run = (r"\cdots" * 18) + (r"\vdots") + (r"\cdots" * 18) + (r"\ddots") + (r"\cdots" * 18) + repeated = rf"\[ \begin{{aligned}}{command_run}\end{{aligned}} \]" + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-internal-small-vocab-command-run", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-internal-small-vocab-command-run.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr_*^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr_*^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ 
\frac{d^5\Psi}{dr_*^5} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^5\Psi}{dr^5} + (\omega^2 - V(r))\Psi = 0 \]", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-derivative-ladder", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-derivative-ladder.md").read_text(encoding="utf-8") + assert content.count("") + + +def test_clean_ocr_numeric_word_debug_docs_ignores_small_parameterized_formula_family( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( f_{11}(k) = (1 - 0.0561)^{k-1}0.0561 \)", + r"\( f_{12}(k) = (1 - 0.0617)^{k-1}0.0617 \)", + r"\( f_{21}(k) = (1 - 0.1057)^{k-1}0.1057 \)", + r"\( f_{22}(k) = (1 - 0.1724)^{k-1}0.1724 \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-parameter-family-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-parameter-family-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = " ".join( + [ + r"\( \tilde{p}_{(1,1)(1,2)}^{\prime} \)", + r"\( \tilde{p}_{(1,1)(2,0)}^{\prime} \)", + r"\( \tilde{p}_{(1,1)(1,0)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(1,0)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(2,1)}^{\prime} \)", + r"\( \tilde{p}_{(2,0)(2,0)}^{\prime} \)", + ] + ) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + repeated + "\n", + stem="ocr-latex-symbol-inventory-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-symbol-inventory-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + r"where \( \Delta \) CFF 
= \( \Delta \) CFF(t) - \( \Delta \) CFF(t-1)." + "\n", + stem="ocr-latex-delta-definition-ignore", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["latex_match_count"] == 0 + content = (debug_dir / "ocr-latex-delta-definition-ignore.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + vertical_numbers = "\n\n".join(str(i) for i in range(0, 121)) + rows, debug_dir = _run_clean_ocr_numeric_debug_export( + corpus, + vertical_numbers + "\n", + stem="ocr-vertical-numeric-page", + ) + assert len(rows) == 1 + row = rows[0] + assert "numeric_page_collapse" in row["match_types"] + + content = Path(row["output_path"]).read_text(encoding="utf-8") + assert "" in content + assert "100" in content + assert "120" in content + + +def test_clean_ocr_numeric_word_debug_docs_records_bad_char_metrics(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_numeric_word_debug_docs( + corpus, + "Κανονική γραμμή\n<--- Page Split --->\n## \x01\x02\x00 漢 \uf0b7\n", + stem="ocr-bad-char-metrics", + max_docs=1, + ) + assert len(rows) == 1 + + page_metric_rows = [ + json.loads(line) + for line in (debug_dir / "page_metrics.jsonl").read_text(encoding="utf-8").strip().splitlines() + ] + assert len(page_metric_rows) == 2 + second_page = page_metric_rows[1] + assert second_page["bad_char_count"] >= 4 + assert second_page["bad_char_ratio"] > 0.0 + assert second_page["control_count"] >= 3 + assert second_page["cjk_count"] >= 1 + assert second_page["private_use_count"] >= 1 + + summary = json.loads((debug_dir / "summary.json").read_text(encoding="utf-8")) + assert summary["bad_char_ratio"]["max"] > 0.0 + + +def test_clean_ocr_numeric_word_debug_docs_respects_doc_offset(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + (corpus.markdown_dir / "a-first.md").write_text("χωρίς επανάληψη\n", encoding="utf-8") + (corpus.markdown_dir / "b-second.md").write_text( + r"\( \Delta \)" + "\n\n" + r"\( 
\Delta \)" + "\n\n" + r"\( \Delta \)" + "\n\n" + r"\( \Delta \)" + "\n", + encoding="utf-8", + ) + + debug_dir = corpus.output_dir / "ocr_numeric_word_debug" + rows = corpus.clean_ocr_numeric_word_debug_docs(debug_dir, max_docs=1, doc_offset=1) + + assert len(rows) == 1 + row = rows[0] + assert row["source_stem"] == "b-second" + assert row["latex_match_count"] >= 1 + assert not (debug_dir / "a-first.md").exists() + assert (debug_dir / "b-second.md").exists() + + +def test_clean_ocr_hybrid_debug_flags_same_body_numbered_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Απόκτηση της αξίας του αξιώματος. " + "2. Απόκτηση της αξίας του αξιώματος. " + "3. Απόκτηση της αξίας του αξιώματος. " + "4. Απόκτηση της αξίας του αξιώματος. " + "5. Απόκτηση της αξίας του αξιώματος.\n" + ), + stem="ocr-hybrid-same-body", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-same-body__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1.1 Hypergeometric function " + "1.1.1 Hypergeometric function " + "1.1.2 Hypergeometric function " + "1.1.3 Hypergeometric function " + "1.1.4 Hypergeometric function " + "1.1.5 Hypergeometric function\n" + ), + stem="ocr-hybrid-hierarchical", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-hierarchical__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Σχεδία 1.1. Σχεδία 1.2. Σχεδία 1.3. Σχεδία 1.4. Σχεδία 1.5. Σχεδ\n" + ), + stem="ocr-hybrid-partial-tail", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-partial-tail__debug_page_00001.md").read_text(encoding="utf-8") + assert "1.5. 
Σχεδ" in content + assert content.index("1.5. Σχεδ") < content.index("") + + +def test_clean_ocr_hybrid_debug_flags_body_cycle_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Εισαγωγή 2. Φυσικοχημικές ιδιότητες 3. Φάσεις 4. Επιπλοκές " + "5. Εισαγωγή 6. Φυσικοχημικές ιδιότητες 7. Φάσεις 8. Επιπλοκές " + "9. Εισαγωγή 10. Φυσικοχημικές ιδιότητες 11. Φάσεις 12. Επιπλοκές\n" + ), + stem="ocr-hybrid-cycle", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-cycle__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Από το σημείο 1, το σημείο 2, το σημείο 3, " + "το σημείο 4, το σημείο 5, το σημείο 6, το σημείο 7.\n" + ), + stem="ocr-hybrid-inline-progress", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-hybrid-inline-progress__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Από το σημείο 1, το σημείο 2, το σημείο 3, " + "το σημείο 4, το σημείο 5.\n" + ), + stem="ocr-hybrid-inline-short-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_hybrid_debug_ignores_diverse_numbered_list(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + "1. Εισαγωγή 2. Μέθοδοι 3. Αποτελέσματα 4. Συζήτηση 5. 
Συμπεράσματα\n" + ), + stem="ocr-hybrid-diverse-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_hybrid_debug_ignores_markup_number_progression(tmp_path: Path) -> None: + corpus = _build_corpus(tmp_path) + rows, debug_dir = _run_clean_ocr_hybrid_debug_export( + corpus, + ( + 'Π ' + 'Π ' + 'Π ' + 'Π\n' + ), + stem="ocr-hybrid-markup-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_clean_ocr_latex_debug_exports_short_atom_block_pages( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + warmup = [r"\( \alpha \)", r"\( \beta \)", r"\( \gamma \)", r"\( \gamma \)"] + block = [ + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \beta \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \alpha \)", + r"\( \gamma \)", + ] + markdown_text = ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + + " ".join(warmup + block + block) + + " \\( \\alpha" + + "\n" + "<--- Page Split --->\n" + "Κανονική τρίτη σελίδα.\n" + ) + rows, debug_dir = _run_clean_ocr_latex_debug_export( + corpus, + markdown_text, + stem="ocr-latex-debug-short-atom", + max_docs=1, + ) + assert len(rows) == 1 + row = rows[0] + assert row["page_number"] == 2 + assert row["latex_match_count"] >= 1 + content = (debug_dir / "ocr-latex-debug-short-atom__debug_page_00002.md").read_text(encoding="utf-8") + assert " \( \alpha" not in content + assert r"\( \alpha" in content + + +def test_clean_ocr_latex_slot_progression_debug_flags_derivative_ladder( + tmp_path: Path, +) -> None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\[ \frac{d^2\Psi}{dr_*^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^2\Psi}{dr^2} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr_*^3} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^3\Psi}{dr^3} + (\omega^2 - V(r))\Psi = 0 
\]", + r"\[ \frac{d^4\Psi}{dr_*^4} + (\omega^2 - V(r))\Psi = 0 \]", + r"\[ \frac{d^4\Psi}{dr^4} + (\omega^2 - V(r))\Psi = 0 \]", + ] + ) + rows, debug_dir = _run_clean_ocr_latex_slot_progression_debug_export( + corpus, + repeated + "\n", + stem="ocr-latex-slot-derivative", + max_docs=1, + ) + assert len(rows) == 1 + content = (debug_dir / "ocr-latex-slot-derivative__debug_page_00001.md").read_text(encoding="utf-8") + assert " None: + corpus = _build_corpus(tmp_path) + repeated = "\n\n".join( + [ + r"\( f_{11}(k) = (1 - 0.0561)^{k-1}0.0561 \)", + r"\( f_{12}(k) = (1 - 0.0617)^{k-1}0.0617 \)", + r"\( f_{21}(k) = (1 - 0.1057)^{k-1}0.1057 \)", + r"\( f_{22}(k) = (1 - 0.1724)^{k-1}0.1724 \)", + ] + ) + rows, debug_dir = _run_clean_ocr_latex_slot_progression_debug_export( + corpus, + repeated + "\n", + stem="ocr-latex-slot-parameter-family-ignore", + max_docs=1, + ) + assert rows == [] + assert not any(debug_dir.glob("*.md")) + + +def test_review_manifest_materialize_creates_labeled_copies(tmp_path: Path) -> None: + source_dir = tmp_path / "contexts" + source_dir.mkdir() + first = source_dir / "case_001.txt" + second = source_dir / "case_002.txt" + first.write_text("alpha body\n", encoding="utf-8") + second.write_text("beta body\n", encoding="utf-8") + + manifest = tmp_path / "semantic_review_manifest.jsonl" + manifest.write_text( + "\n".join( + [ + json.dumps( + { + "path": str(first), + "label": "fits_semantically", + "confidence": "high", + "notes": "complete", + }, + ensure_ascii=False, + ), + json.dumps( + { + "path": str(second), + "label": "fits_but_truncated_or_incomplete", + "confidence": "medium", + "notes": "cut off", + }, + ensure_ascii=False, + ), + ] + ) + + "\n", + encoding="utf-8", + ) + + output_dir = tmp_path / "categorized" + summary = materialize_manifest_categories( + manifest, + output_dir, + category_name="semantic_fit", + ) + + assert summary["row_count"] == 2 + fit_copy = output_dir / "by_label" / "fits_semantically" / "case_001.txt" + trunc_copy 
= output_dir / "by_label" / "fits_but_truncated_or_incomplete" / "case_002.txt" + assert fit_copy.exists() + assert trunc_copy.exists() + + fit_text = fit_copy.read_text(encoding="utf-8") + assert "REVIEW_CATEGORY: semantic_fit" in fit_text + assert "REVIEW_LABEL: fits_semantically" in fit_text + assert "=== REVIEW_SOURCE_CONTENT ===" in fit_text + assert "alpha body" in fit_text + + +def test_table_markdown_audit_preserves_semantic_inline_html() -> None: + audit = audit_table( + Path("/tmp/demo.md"), + 1, + 1, + ( + "" + "" + "" + "
Line A
Line B
xi2source\"diagram\"
" + ), + ) + assert audit.convertible is True + assert audit.markdown is not None + assert "Line A
Line B" in audit.markdown + assert "xi2" in audit.markdown + assert "[source](https://example.com)" in audit.markdown + assert "diagram" in audit.markdown + + +def test_table_markdown_audit_writes_clean_markdown_file(tmp_path: Path) -> None: + audit = audit_table( + Path("/tmp/demo.md"), + 1, + 7, + "
ΑΒ
12
", + ) + output = write_clean_markdown_file(tmp_path, audit) + assert output is not None + path = Path(output) + assert path.exists() + text = path.read_text(encoding="utf-8") + assert text.startswith("## ORIGINAL_HTML") + assert "## GITHUB_MD" in text + assert "" in text + assert "Α" in text + assert "1" in text diff --git a/tests/test_corpus_guards.py b/tests/test_corpus_guards.py index 29db5be..d6911ee 100644 --- a/tests/test_corpus_guards.py +++ b/tests/test_corpus_guards.py @@ -50,12 +50,6 @@ def make_corpus(tmp_path): return Corpus(input_dir=input_dir, output_dir=output_dir) -def set_onnx_providers(monkeypatch, providers): - stub = SimpleNamespace(get_available_providers=lambda: providers) - monkeypatch.setitem(sys.modules, "onnxruntime", stub) - return stub - - def set_torch_stub(monkeypatch, *, available: bool, device_count: int): cuda_ns = SimpleNamespace( is_available=lambda: available, @@ -66,22 +60,23 @@ def set_torch_stub(monkeypatch, *, available: bool, device_count: int): return torch_ns -def test_prime_extractor_requires_cuda_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_force_ocr_is_ignored_for_backend_selection(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() - set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CPUExecutionProvider"]) + set_torch_stub(monkeypatch, available=False, device_count=0) - with pytest.raises(RuntimeError) as exc: - corpus.prime_extractor( - input_format="pdf", - accel_type="CUDA", - force_ocr=True, - phase1_backend="docling", - ) + corpus.prime_extractor( + input_format="pdf", + accel_type="CPU", + force_ocr=True, + phase1_backend="auto", + ) - assert "CUDAExecutionProvider" in str(exc.value) + assert corpus.extractor.last_policy == "safe" + ensure_kwargs = corpus.extractor.ensure_calls[0] + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def 
test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch): @@ -89,8 +84,6 @@ def test_prime_extractor_requires_cuda_for_docling_backend(tmp_path, monkeypatch corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", @@ -106,8 +99,6 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=True, device_count=1) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CPU", @@ -120,26 +111,24 @@ def test_prime_extractor_configures_safe_backend_for_text_layer(tmp_path, monkey assert corpus.extractor.ensure_calls[0]["enable_ocr"] is False -def test_prime_extractor_configures_docling_backend_for_ocr(tmp_path, monkeypatch): +def test_prime_extractor_configures_docling_backend_explicitly(tmp_path, monkeypatch): corpus = make_corpus(tmp_path) corpus.extractor = DummyExtractor() + monkeypatch.delenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", raising=False) set_torch_stub(monkeypatch, available=True, device_count=2) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - corpus.prime_extractor( input_format="pdf", accel_type="CUDA", - force_ocr=True, - phase1_backend="auto", + phase1_backend="docling", ) assert corpus.extractor.last_policy == "docling" assert corpus.extractor.last_max_batch_files == 1 assert corpus.extractor.last_prefer_safe_backend is False ensure_kwargs = corpus.extractor.ensure_calls[0] - assert ensure_kwargs["enable_ocr"] is True - assert ensure_kwargs["force_full_page_ocr"] is True + assert ensure_kwargs["enable_ocr"] is False + assert ensure_kwargs["force_full_page_ocr"] is False def test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypatch): @@ -147,8 +136,6 @@ def 
test_prime_extractor_requires_cuda_for_formula_enrichment(tmp_path, monkeypa corpus.extractor = DummyExtractor() set_torch_stub(monkeypatch, available=False, device_count=0) - set_onnx_providers(monkeypatch, ["CUDAExecutionProvider"]) - with pytest.raises(RuntimeError) as exc: corpus.prime_extractor( input_format="pdf", @@ -203,6 +190,8 @@ def extract(self, *, file_paths=None, **kwargs): with pytest.raises(SystemExit) as exit_info: corpus_mod.gpu_extract_worker_queue( device_id=0, + worker_slot=0, + worker_key="gpu0-w0", in_dir=str(tmp_path), out_dir=str(tmp_path), work_q=work_q, @@ -240,4 +229,116 @@ def extract(self, *, file_paths=None, **kwargs): assert processed_batches == [["doc.pdf"]] assert work_q.empty() + + +def test_gpu_worker_accepts_bundled_work_items(tmp_path, monkeypatch): + import glossapi.corpus as corpus_mod + + processed_batches = [] + + class FakeCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + self.extractor = SimpleNamespace(max_batch_files=1) + + def prime_extractor(self, *args, **kwargs): + return None + + def extract(self, *, file_paths=None, **kwargs): + processed_batches.append(list(file_paths or [])) + return None + + monkeypatch.setattr(corpus_mod, "Corpus", FakeCorpus) + monkeypatch.setattr("glossapi.Corpus", FakeCorpus) + monkeypatch.delenv("GLOSSAPI_WORKER_LOG_DIR", raising=False) + + work_q = queue.Queue() + work_q.put(["doc-a.pdf", "doc-b.pdf"]) + result_q = queue.Queue() + status_map: dict = {} + + with pytest.raises(SystemExit) as exit_info: + corpus_mod.gpu_extract_worker_queue( + device_id=0, + worker_slot=0, + worker_key="gpu0-w0", + in_dir=str(tmp_path), + out_dir=str(tmp_path), + work_q=work_q, + force=False, + fe=False, + ce=False, + use_cls_w=False, + skip=False, + input_fmt="pdf", + threads=1, + benchmark=False, + export_json=False, + emit_index=False, + backend="safe", + result_q=result_q, + status_map=status_map, + marker_dir=None, + ) + + assert 
exit_info.value.code == 0 + assert processed_batches == [["doc-a.pdf", "doc-b.pdf"]] + assert status_map == {} + + +def test_gpu_worker_keeps_singleton_queue_items_separate(tmp_path, monkeypatch): + import glossapi.corpus as corpus_mod + + processed_batches = [] + + class FakeCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + self.extractor = SimpleNamespace(max_batch_files=2) + + def prime_extractor(self, *args, **kwargs): + return None + + def extract(self, *, file_paths=None, **kwargs): + processed_batches.append(list(file_paths or [])) + return None + + monkeypatch.setattr(corpus_mod, "Corpus", FakeCorpus) + monkeypatch.setattr("glossapi.Corpus", FakeCorpus) + monkeypatch.delenv("GLOSSAPI_WORKER_LOG_DIR", raising=False) + + work_q = queue.Queue() + work_q.put("doc-a.pdf") + work_q.put("doc-b.pdf") + result_q = queue.Queue() + status_map: dict = {} + + with pytest.raises(SystemExit) as exit_info: + corpus_mod.gpu_extract_worker_queue( + device_id=0, + worker_slot=0, + worker_key="gpu0-w0", + in_dir=str(tmp_path), + out_dir=str(tmp_path), + work_q=work_q, + force=False, + fe=False, + ce=False, + use_cls_w=False, + skip=False, + input_fmt="pdf", + threads=1, + benchmark=False, + export_json=False, + emit_index=False, + backend="docling", + result_q=result_q, + status_map=status_map, + marker_dir=None, + ) + + assert exit_info.value.code == 0 + assert processed_batches == [["doc-a.pdf"], ["doc-b.pdf"]] assert status_map == {} diff --git a/tests/test_corpus_ocr_modules.py b/tests/test_corpus_ocr_modules.py new file mode 100644 index 0000000..4d5fedd --- /dev/null +++ b/tests/test_corpus_ocr_modules.py @@ -0,0 +1,211 @@ +import json +from pathlib import Path + +import pandas as pd + +from glossapi import Corpus +from glossapi.corpus.ocr.artifacts import apply_ocr_success_updates +from glossapi.corpus.ocr.config import normalize_ocr_request +from glossapi.corpus.ocr.targets import build_ocr_selection 
+from glossapi.ocr.deepseek.defaults import DEFAULT_GPU_MEMORY_UTILIZATION, DEFAULT_RENDER_DPI + + +def _mk_corpus(tmp_path: Path) -> Corpus: + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_normalize_ocr_request_uses_shared_vllm_defaults(tmp_path): + corpus = _mk_corpus(tmp_path) + + request = normalize_ocr_request( + logger=corpus.logger, + fix_bad=True, + mode="ocr_bad", + backend="deepseek", + device=None, + model_dir=None, + max_pages=None, + persist_engine=True, + precision=None, + runtime_backend="vllm", + render_dpi=None, + gpu_memory_utilization=None, + math_enhance=False, + force=None, + reprocess_completed=None, + skip_existing=None, + ) + + assert request is not None + assert request.render_dpi == DEFAULT_RENDER_DPI + assert request.gpu_memory_utilization == DEFAULT_GPU_MEMORY_UTILIZATION + + +def test_build_ocr_selection_collapses_chunk_rows_and_skips_completed(tmp_path): + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "needs.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + {"filename": "needs.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + {"filename": "done.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": True}, + ] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + + selection = build_ocr_selection( + corpus, + mode="ocr_bad", + reprocess_completed=False, + ) + + assert selection.bad_files == ["needs.pdf"] + assert selection.ocr_candidates_initial == 2 + assert selection.skipped_completed == 1 + assert selection.skipped_skiplist == 0 + assert selection.ocr_done_stems == {"done"} + + +def test_apply_ocr_success_updates_maps_canonical_artifacts_by_stem(tmp_path): + markdown_dir = tmp_path / "markdown" + metrics_dir = 
tmp_path / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + + (markdown_dir / "needs.md").write_text("fixed markdown\n", encoding="utf-8") + (metrics_dir / "needs.metrics.json").write_text('{"page_count": 1}\n', encoding="utf-8") + + df = pd.DataFrame( + [ + {"filename": "needs.pdf", "needs_ocr": True, "ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", "needs_ocr": True, "ocr_success": False}, + ] + ) + + updated = apply_ocr_success_updates( + df, + filenames=["needs.pdf"], + markdown_dir=markdown_dir, + metrics_dir=metrics_dir, + backend_norm="deepseek", + ).set_index("filename") + + assert bool(updated.loc["needs.pdf", "ocr_success"]) is True + assert bool(updated.loc["needs__p0001-0002.pdf", "ocr_success"]) is True + assert updated.loc["needs.pdf", "text"] == "fixed markdown\n" + assert updated.loc["needs__p0001-0002.pdf", "text"] == "fixed markdown\n" + assert updated.loc["needs.pdf", "ocr_markdown_relpath"] == "markdown/needs.md" + assert updated.loc["needs__p0001-0002.pdf", "ocr_metrics_relpath"] == "json/metrics/needs.metrics.json" + assert updated.loc["needs.pdf", "extraction_mode"] == "deepseek" + + +def test_ocr_pipeline_exports_cleaned_and_raw_text_side_by_side(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + { + "filename": "doc.pdf", + corpus.url_column: "https://example.com/doc.pdf", + "needs_ocr": True, + "ocr_success": False, + } + ] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + raw_markdown = ( + "Κανονική πρώτη σελίδα.\n" + "<--- Page Split --->\n" + "1. 2. 3. 4. 5. 6. 
7.\n" + "0 0 0 0 0 0\n" + "1.1\n1.1\n1.1\n1.1\n1.1\n1.1\n" + ) + + from glossapi.ocr.deepseek import runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for current in files: + stem = Path(current).stem + (markdown_dir / f"{stem}.md").write_text(raw_markdown, encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + json.dumps( + { + "page_count": 2, + "pages": [ + {"page_no": 1, "formula_count": 0, "code_count": 0}, + {"page_no": 2, "formula_count": 0, "code_count": 0}, + ], + } + ) + + "\n", + encoding="utf-8", + ) + return {"doc": {"page_count": 2}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + calls = [] + original_clean_ocr = corpus.clean_ocr + original_clean = corpus.clean + original_markdown_dir = corpus.markdown_dir + original_cleaned_markdown_dir = corpus.cleaned_markdown_dir + + def record_clean_ocr(*args, **kwargs): + calls.append( + ( + "clean_ocr", + Path(str(kwargs.get("input_dir"))), + kwargs.get("write_cleaned_files", True), + ) + ) + return original_clean_ocr(*args, **kwargs) + + def record_clean(*args, **kwargs): + calls.append( + ( + "clean", + Path(str(kwargs.get("input_dir"))), + kwargs.get("write_cleaned_files", True), + ) + ) + return original_clean(*args, **kwargs) + + monkeypatch.setattr(corpus, "clean_ocr", record_clean_ocr) + monkeypatch.setattr(corpus, "clean", record_clean) + + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + assert calls[0] == ("clean_ocr", original_markdown_dir, True) + assert calls[1] == ("clean", original_cleaned_markdown_dir, False) + + raw_text = (original_markdown_dir / "doc.md").read_text(encoding="utf-8") + cleaned_text = (original_cleaned_markdown_dir / "doc.md").read_text(encoding="utf-8") + assert raw_text == raw_markdown + assert 
cleaned_text != raw_text + assert "1.1\n1.1" in raw_text + assert "1.1\n1.1" not in cleaned_text + + out_path = corpus.output_dir / "export.jsonl" + corpus.jsonl(out_path) + records = [json.loads(line) for line in out_path.read_text(encoding="utf-8").splitlines() if line] + assert len(records) == 1 + record = records[0] + + assert record["document"] == cleaned_text + assert record["text"] == raw_text + assert record["filename"] == "doc" + assert record["url"] == "https://example.com/doc.pdf" + assert record["ocr_success"] is True + assert record["extraction_mode"] == "deepseek" + assert record["page_count"] == 2 diff --git a/tests/test_deepseek_multi_gpu_runtime.py b/tests/test_deepseek_multi_gpu_runtime.py new file mode 100644 index 0000000..e465949 --- /dev/null +++ b/tests/test_deepseek_multi_gpu_runtime.py @@ -0,0 +1,515 @@ +import json +from pathlib import Path +from types import SimpleNamespace + + +def test_build_env_adds_wheel_managed_cuda_lib_dirs(tmp_path): + from glossapi.ocr.deepseek import runner + + venv_root = tmp_path / "venv" + python_bin = venv_root / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("") + cuda_runtime_lib = venv_root / "lib" / "python3.11" / "site-packages" / "nvidia" / "cuda_runtime" / "lib" + cublas_lib = venv_root / "lib" / "python3.11" / "site-packages" / "nvidia" / "cublas" / "lib" + cuda_runtime_lib.mkdir(parents=True, exist_ok=True) + cublas_lib.mkdir(parents=True, exist_ok=True) + + env = runner._build_env(python_bin=python_bin, visible_device=1, script=None) + + assert env["CUDA_VISIBLE_DEVICES"] == "1" + ld_entries = env["LD_LIBRARY_PATH"].split(":") + assert str(cuda_runtime_lib) in ld_entries + assert str(cublas_lib) in ld_entries + + +def test_build_env_uses_virtualenv_path_when_python_bin_is_symlink(tmp_path): + from glossapi.ocr.deepseek import runner + + venv_root = tmp_path / "venv" + python_bin = venv_root / "bin" / "python" + python_bin.parent.mkdir(parents=True, 
exist_ok=True) + python_bin.symlink_to("/usr/bin/python3") + cuda_runtime_lib = venv_root / "lib" / "python3.11" / "site-packages" / "nvidia" / "cuda_runtime" / "lib" + cuda_runtime_lib.mkdir(parents=True, exist_ok=True) + + env = runner._build_env(python_bin=python_bin, visible_device=0, script=None) + + ld_entries = env["LD_LIBRARY_PATH"].split(":") + assert str(cuda_runtime_lib) in ld_entries + + +def test_resolve_deepseek_python_prefers_repo_local_runtime(tmp_path): + from glossapi.ocr.deepseek import runtime_paths + + repo_root = tmp_path / "repo" + python_bin = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("", encoding="utf-8") + + resolved = runtime_paths.resolve_deepseek_python(env={}, repo_root=repo_root) + + assert resolved == python_bin + + +def test_resolve_deepseek_python_prefers_versioned_runtime_over_generic_alias(tmp_path): + from glossapi.ocr.deepseek import runtime_paths + + repo_root = tmp_path / "repo" + generic = repo_root / "dependency_setup" / ".venvs" / "deepseek" / "bin" / "python" + versioned = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / "python" + generic.parent.mkdir(parents=True, exist_ok=True) + versioned.parent.mkdir(parents=True, exist_ok=True) + generic.write_text("", encoding="utf-8") + versioned.write_text("", encoding="utf-8") + + resolved = runtime_paths.resolve_deepseek_python(env={}, repo_root=repo_root) + + assert resolved == versioned + + +def test_work_queue_requeues_stale_running_batch(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 0, + "pages": 12, + "files": ["a.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-a", + stale_after_sec=30.0, + now_ts=100.0, + ) + + assert claimed["batch_id"] == 
0 + assert work_queue.work_queue_counts(db_path)["running"] == 1 + + requeued = work_queue.requeue_stale_running_batches( + db_path, + stale_after_sec=30.0, + now_ts=200.0, + ) + + assert requeued == 1 + assert work_queue.work_queue_counts(db_path)["pending"] == 1 + + +def test_work_queue_mark_done_persists_result(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 1, + "pages": 8, + "files": [], + "page_ranges": ["b.pdf:1:8"], + "items": [], + } + ], + ) + + work_queue.claim_next_batch( + db_path, + worker_id="worker-b", + stale_after_sec=60.0, + now_ts=50.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=1, + worker_id="worker-b", + result={"pages": 8, "first_infer_started_at": "2026-04-02T10:00:00Z"}, + now_ts=75.0, + ) + + items = list(work_queue.iter_work_items(db_path)) + + assert items[0]["status"] == work_queue.STATUS_DONE + assert items[0]["result"]["pages"] == 8 + assert work_queue.work_queue_counts(db_path)["done"] == 1 + + +def test_work_queue_repair_enqueue_reuses_queue_key(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db(db_path, batches=[]) + + inserted = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:5:doc", + "stem": "doc", + "repair_page_numbers": [2, 5], + "pages": 2, + } + ], + ) + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-r", + stale_after_sec=60.0, + queue_name=work_queue.QUEUE_REPAIR, + now_ts=10.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=claimed["batch_id"], + worker_id="worker-r", + result={"pages": 2}, + now_ts=12.0, + ) + + inserted_again = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:5:doc", + "stem": "doc", + "repair_page_numbers": [2], + "pages": 1, + } + ], + ) 
+ repair_item = [ + item + for item in work_queue.iter_work_items(db_path) + if item["queue_name"] == work_queue.QUEUE_REPAIR + ][0] + + assert inserted_again == inserted + assert repair_item["batch_id"] == inserted[0] + assert repair_item["status"] == work_queue.STATUS_PENDING + assert repair_item["repair_page_numbers"] == [2] + + +def test_work_queue_marks_batch_failed_after_one_retry(tmp_path): + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 2, + "pages": 4, + "files": ["c.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + + first = work_queue.claim_next_batch( + db_path, + worker_id="worker-a", + stale_after_sec=60.0, + now_ts=10.0, + ) + work_queue.mark_batch_failed( + db_path, + batch_id=first["batch_id"], + worker_id="worker-a", + error="first failure", + max_attempts=2, + now_ts=20.0, + ) + + second = work_queue.claim_next_batch( + db_path, + worker_id="worker-b", + stale_after_sec=60.0, + now_ts=30.0, + ) + work_queue.mark_batch_failed( + db_path, + batch_id=second["batch_id"], + worker_id="worker-b", + error="second failure", + max_attempts=2, + now_ts=40.0, + ) + + item = list(work_queue.iter_work_items(db_path))[0] + + assert item["attempt_count"] == 2 + assert item["status"] == work_queue.STATUS_FAILED + assert item["worker_id"] == "worker-b" + assert item["last_error"] == "second failure" + + +def test_claim_additional_repair_batches_packs_multiple_items(tmp_path): + from glossapi.ocr.deepseek import run_pdf_ocr_vllm + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db(db_path, batches=[]) + inserted = work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + {"queue_key": "repair:1:a", "batch_id": 10, "stem": "a", "repair_page_numbers": [1, 2], "pages": 2}, + {"queue_key": "repair:1:b", "batch_id": 11, "stem": "b", "repair_page_numbers": [3, 4], 
"pages": 2}, + {"queue_key": "repair:1:c", "batch_id": 12, "stem": "c", "repair_page_numbers": [5], "pages": 1}, + ], + ) + assert inserted == [10, 11, 12] + + first = work_queue.claim_next_batch( + db_path, + worker_id="worker-pack", + stale_after_sec=60.0, + queue_name=work_queue.QUEUE_REPAIR, + now_ts=10.0, + ) + packed = run_pdf_ocr_vllm._claim_additional_repair_batches( + db_path, + worker_id="worker-pack", + stale_after_sec=60.0, + first_batch=first, + target_pages=4, + target_items=8, + ) + + assert [int(batch["batch_id"]) for batch in packed] == [10, 11] + counts = work_queue.work_queue_counts(db_path) + assert counts["by_queue"][work_queue.QUEUE_REPAIR][work_queue.STATUS_RUNNING] == 2 + assert counts["by_queue"][work_queue.QUEUE_REPAIR][work_queue.STATUS_PENDING] == 1 + + +def test_claim_next_phase_batch_switches_to_repair_after_main_drains(tmp_path): + from glossapi.ocr.deepseek import run_pdf_ocr_vllm + from glossapi.ocr.deepseek import work_queue + + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + { + "batch_id": 0, + "pages": 8, + "files": ["a.pdf"], + "page_ranges": [], + "items": [], + } + ], + ) + claimed = work_queue.claim_next_batch( + db_path, + worker_id="worker-main", + stale_after_sec=60.0, + now_ts=10.0, + ) + work_queue.mark_batch_done( + db_path, + batch_id=claimed["batch_id"], + worker_id="worker-main", + result={"pages": 8}, + now_ts=20.0, + ) + work_queue.enqueue_batches( + db_path, + queue_name=work_queue.QUEUE_REPAIR, + batches=[ + { + "queue_key": "repair:0:doc", + "stem": "doc", + "repair_page_numbers": [2, 5], + "pages": 2, + } + ], + ) + + queue_name, batch, should_wait = run_pdf_ocr_vllm._claim_next_phase_batch( + db_path, + worker_id="worker-repair", + stale_after_sec=60.0, + ) + + assert queue_name == work_queue.QUEUE_REPAIR + assert batch is not None + assert batch["queue_key"] == "repair:0:doc" + assert should_wait is False + + +def 
test_runner_runtime_summary_reports_steady_state_windows(tmp_path): + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek import work_queue + + runtime_dir = tmp_path / "runtime" + runtime_dir.mkdir(parents=True, exist_ok=True) + (runtime_dir / "worker_00.runtime.json").write_text( + json.dumps( + { + "worker_id": "worker_00", + "engine_ready_at": "2026-04-02T10:00:10Z", + "first_batch_started_at": "2026-04-02T10:00:20Z", + "last_batch_finished_at": "2026-04-02T10:05:20Z", + } + ), + encoding="utf-8", + ) + (runtime_dir / "worker_01.runtime.json").write_text( + json.dumps( + { + "worker_id": "worker_01", + "engine_ready_at": "2026-04-02T10:00:12Z", + "first_batch_started_at": "2026-04-02T10:00:24Z", + "last_batch_finished_at": "2026-04-02T10:04:20Z", + } + ), + encoding="utf-8", + ) + db_path = tmp_path / "work.sqlite" + work_queue.init_work_db( + db_path, + batches=[ + {"batch_id": 0, "pages": 50, "files": ["a.pdf"], "page_ranges": [], "items": []}, + {"batch_id": 1, "pages": 50, "files": ["b.pdf"], "page_ranges": [], "items": []}, + ], + ) + work_queue.claim_next_batch(db_path, worker_id="worker_00", stale_after_sec=60.0, now_ts=1.0) + work_queue.mark_batch_done(db_path, batch_id=0, worker_id="worker_00", now_ts=2.0) + work_queue.claim_next_batch(db_path, worker_id="worker_01", stale_after_sec=60.0, now_ts=3.0) + work_queue.mark_batch_done(db_path, batch_id=1, worker_id="worker_01", now_ts=4.0) + + summary_path = runner._write_runtime_summary(runtime_dir=runtime_dir, db_path=db_path) + summary = json.loads(summary_path.read_text(encoding="utf-8")) + + assert summary["queue_counts"]["done"] == 2 + assert summary["steady_state"]["first_batch_started_at"] == "2026-04-02T10:00:20Z" + assert summary["steady_state"]["all_workers_ready_at"] == "2026-04-02T10:00:12Z" + assert summary["steady_state"]["last_batch_finished_at"] == "2026-04-02T10:05:20Z" + assert summary["steady_state"]["first_batch_to_last_batch_window_sec"] == 300.0 + assert 
summary["steady_state"]["all_workers_ready_to_last_batch_window_sec"] == 308.0 + assert summary["queue_counts"]["by_queue"]["main"]["done"] == 2 + assert summary["queue_counts"]["by_queue"]["repair"]["done"] == 0 + + +def test_runner_preflight_can_ensure_persistence_mode(monkeypatch): + from glossapi.ocr.deepseek import runner + + responses = [ + [{"index": "0", "persistence_mode": "Disabled"}], + [{"index": "0", "persistence_mode": "Enabled"}], + ] + + monkeypatch.setattr(runner, "_query_persistence_mode", lambda *, visible_devices: responses.pop(0)) + + calls = {} + + def fake_run(cmd, check, capture_output, text): + calls["cmd"] = cmd + return SimpleNamespace(returncode=0) + + monkeypatch.setattr(runner.subprocess, "run", fake_run) + + status = runner._ensure_gpu_preflight(visible_devices=[0], mode="ensure") + + assert calls["cmd"] == ["sudo", "-n", "nvidia-smi", "-pm", "1"] + assert status["changed"] is True + assert status["after"] == [{"index": "0", "persistence_mode": "Enabled"}] + + +def test_build_cli_command_includes_work_queue_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=[], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="vllm", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=144, + max_new_tokens=2048, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=96, + gpu_memory_utilization=0.9, + disable_fp8_kv=False, + repair_mode="auto", + repair_exec_batch_target_pages=48, + repair_exec_batch_target_items=32, + work_db=tmp_path / "work.sqlite", + worker_id="worker_00_gpu0", + worker_runtime_file=tmp_path / "worker_00.runtime.json", + 
work_stale_after_sec=900.0, + work_heartbeat_sec=10.0, + work_max_attempts=2, + ) + + assert "--work-db" in cmd + assert str(tmp_path / "work.sqlite") in cmd + assert "--worker-id" in cmd and "worker_00_gpu0" in cmd + assert "--worker-runtime-file" in cmd and str(tmp_path / "worker_00.runtime.json") in cmd + assert "--work-stale-after-sec" in cmd and "900.0" in cmd + assert "--work-heartbeat-sec" in cmd and "10.0" in cmd + assert "--work-max-attempts" in cmd and "2" in cmd + assert "--repair-exec-batch-target-pages" in cmd and "48" in cmd + assert "--repair-exec-batch-target-items" in cmd and "32" in cmd + + +def test_launch_worker_process_uses_start_new_session(monkeypatch): + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_popen(cmd, stdout, stderr, env, start_new_session): + calls["cmd"] = cmd + calls["start_new_session"] = start_new_session + return SimpleNamespace(pid=1234) + + monkeypatch.setattr(runner.subprocess, "Popen", fake_popen) + + proc = runner._launch_worker_process(["python", "worker.py"], fh=object(), env={"A": "1"}) + + assert calls["cmd"] == ["python", "worker.py"] + assert calls["start_new_session"] is True + assert proc.pid == 1234 + + +def test_terminate_worker_process_group_signals_group(monkeypatch): + from glossapi.ocr.deepseek import runner + + signals = [] + monkeypatch.setattr(runner.os, "killpg", lambda pgid, sig: signals.append((pgid, sig))) + monkeypatch.setattr(runner, "_wait_for_process_group_exit", lambda pgid, *, timeout_sec: True) + + ok = runner._terminate_worker_process_group( + { + "worker_id": "worker_00_gpu0", + "proc": SimpleNamespace(pid=4321), + } + ) + + assert ok is True + assert signals == [(4321, runner.signal.SIGTERM)] diff --git a/tests/test_deepseek_preflight.py b/tests/test_deepseek_preflight.py index 1900a2b..73e761d 100644 --- a/tests/test_deepseek_preflight.py +++ b/tests/test_deepseek_preflight.py @@ -1,5 +1,4 @@ import sys -from pathlib import Path from glossapi.ocr.deepseek.preflight 
import check_deepseek_env @@ -9,45 +8,34 @@ def test_preflight_reports_missing_components(tmp_path): "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "0", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "1", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": str(tmp_path / "missing_python"), - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(tmp_path / "missing_script.py"), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(tmp_path / "missing_script.py"), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(tmp_path / "missing_model"), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(tmp_path / "missing_lib"), - "PATH": str(tmp_path), # no cc1plus here } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) names = {c.name for c in report.errors} + assert "allow_cli" in names + assert "allow_stub" in names assert "deepseek_python" in names - assert "vllm_script" in names + assert "runner_script" in names assert "model_dir" in names - assert "ld_library_path" in names - assert "cc1plus" in names assert not report.ok def test_preflight_passes_with_complete_env(tmp_path): - script = tmp_path / "run_pdf_ocr_vllm.py" + script = tmp_path / "run_pdf_ocr_transformers.py" script.write_text("#!/usr/bin/env python3\n", encoding="utf-8") - model_dir = tmp_path / "DeepSeek-OCR" + model_dir = tmp_path / "DeepSeek-OCR-2" model_dir.mkdir() (model_dir / "config.json").write_text("{}", encoding="utf-8") (model_dir / "model-00001-of-000001.safetensors").write_bytes(b"stub") - lib_dir = tmp_path / "libjpeg" - lib_dir.mkdir() - fake_bin = tmp_path / "bin" - fake_bin.mkdir() - cc1plus = fake_bin / "cc1plus" - cc1plus.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") - cc1plus.chmod(0o755) env = { "GLOSSAPI_DEEPSEEK_ALLOW_CLI": "1", "GLOSSAPI_DEEPSEEK_ALLOW_STUB": "0", "GLOSSAPI_DEEPSEEK_TEST_PYTHON": sys.executable, - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT": str(script), + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT": str(script), "GLOSSAPI_DEEPSEEK_MODEL_DIR": str(model_dir), - "GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH": str(lib_dir), - "PATH": 
str(fake_bin), } - report = check_deepseek_env(env, check_flashinfer=False) + report = check_deepseek_env(env, check_torch=False) assert report.ok assert not report.errors diff --git a/tests/test_deepseek_runner_contract.py b/tests/test_deepseek_runner_contract.py new file mode 100644 index 0000000..1c7fef6 --- /dev/null +++ b/tests/test_deepseek_runner_contract.py @@ -0,0 +1,826 @@ +import json +import sys +from pathlib import Path +from types import SimpleNamespace + +import pandas as pd +import pytest + + +def _mk_corpus(tmp_path: Path): + from glossapi import Corpus + + root = tmp_path / "corpus" + root.mkdir() + return Corpus(input_dir=root, output_dir=root) + + +def test_deepseek_backend_rejects_stub_mode(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + df = pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ) + parquet_path = dl_dir / "download_results.parquet" + df.to_parquet(parquet_path, index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%real\n") + + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "1") + + with pytest.raises(RuntimeError, match="stub execution has been removed"): + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) + + updated = pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc[fname, "ocr_success"]) is False + assert bool(updated.loc[fname, "needs_ocr"]) is True + + +def test_progress_artifacts_stay_out_of_canonical_markdown(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _write_outputs, _write_progress + + output_dir = tmp_path / "output" + _write_progress( + output_dir=output_dir, + stem="doc", + page_outputs=["page one"], + total_pages=5, + completed_pages=1, + ) + + canonical_markdown = output_dir / "markdown" / "doc.md" + progress_markdown = output_dir / "sidecars" / 
"ocr_progress" / "doc.partial.md" + progress_json = output_dir / "json" / "metrics" / "doc.progress.json" + + assert not canonical_markdown.exists() + assert progress_markdown.exists() + assert progress_json.exists() + + _write_outputs(output_dir=output_dir, stem="doc", markdown="final", page_count=5) + + assert canonical_markdown.exists() + assert canonical_markdown.read_text(encoding="utf-8") == "final\n" + assert not progress_markdown.exists() + + +def test_page_output_helpers_roundtrip_numbered_blank_pages(): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs + + page_outputs = ["page one", "", "page three"] + + markdown = _join_page_outputs(page_outputs) + + assert markdown == ( + "page one\n" + "\n" + "<--- Page Split --->\n" + "\n" + "\n" + "<--- Page Split --->\n" + "page three" + ) + assert _split_page_outputs(markdown) == page_outputs + + +def test_write_outputs_preserves_blank_first_page_structure(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + + output_dir = tmp_path / "output" + markdown = _join_page_outputs(["", "page two"]) + + _write_outputs(output_dir=output_dir, stem="doc", markdown=markdown, page_count=2) + + written = (output_dir / "markdown" / "doc.md").read_text(encoding="utf-8") + assert written == ( + "\n" + "<--- Page Split --->\n" + "page two\n" + ) + assert _split_page_outputs(written) == ["", "page two"] + + +def test_auto_attn_backend_prefers_eager_when_flash_attn_is_unavailable(monkeypatch): + import builtins + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _resolve_attn_backend + + original_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "flash_attn": + raise ImportError("flash_attn unavailable") + return original_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", fake_import) + 
assert _resolve_attn_backend("auto") == "eager" + + +def test_runner_uses_downloads_subdir_when_present(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["input_dir"] = input_dir + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_transformers.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"]) + + assert calls["input_dir"] == downloads_dir.resolve() + assert result["doc"]["page_count"] == 1 + + +def test_build_cli_command_includes_speed_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="plain_ocr", + prompt_override="custom prompt", + attn_backend="flash_attention_2", + base_size=768, + image_size=512, + crop_mode=True, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=1.05, + no_repeat_ngram_size=12, + runtime_backend="transformers", + vllm_batch_size=None, + 
gpu_memory_utilization=None, + disable_fp8_kv=False, + repair_mode=None, + ) + + assert "--ocr-profile" in cmd and "plain_ocr" in cmd + assert "--prompt-override" in cmd and "custom prompt" in cmd + assert "--attn-backend" in cmd and "flash_attention_2" in cmd + assert "--base-size" in cmd and "768" in cmd + assert "--image-size" in cmd and "512" in cmd + assert "--crop-mode" in cmd + assert "--render-dpi" in cmd and "144" in cmd + assert "--max-new-tokens" in cmd and "1024" in cmd + + +def test_deepseek_default_max_new_tokens_is_standardized(): + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import DEFAULT_MAX_NEW_TOKENS + + assert DEFAULT_MAX_NEW_TOKENS == 2048 + assert runner.DEFAULT_MAX_NEW_TOKENS == 2048 + + +def test_build_cli_command_includes_vllm_flags(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=["a.pdf"], + page_ranges=None, + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=1, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=110, + max_new_tokens=768, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=16, + gpu_memory_utilization=0.92, + disable_fp8_kv=True, + repair_mode="auto", + ) + + assert "--batch-size" in cmd and "16" in cmd + assert "--gpu-memory-utilization" in cmd and "0.92" in cmd + assert "--disable-fp8-kv" in cmd + assert "--repair-mode" in cmd and "auto" in cmd + + +def test_build_env_prepends_script_src_to_pythonpath(tmp_path, monkeypatch): + import os + + from glossapi.ocr.deepseek.runner import _build_env + + repo_root = tmp_path / "repo" + script = repo_root / "src" / "glossapi" / "ocr" / "deepseek" / 
"run_pdf_ocr_vllm.py" + script.parent.mkdir(parents=True, exist_ok=True) + script.write_text("# stub\n", encoding="utf-8") + (repo_root / "src" / "glossapi").mkdir(parents=True, exist_ok=True) + + monkeypatch.setenv("PYTHONPATH", os.pathsep.join(["/tmp/old-a", "/tmp/old-b"])) + env = _build_env( + python_bin=Path("/usr/bin/python3"), + visible_device=1, + script=script, + ) + + assert env["PYTHONPATH"].split(os.pathsep) == [ + str((repo_root / "src").resolve()), + "/tmp/old-a", + "/tmp/old-b", + ] + assert env["CUDA_VISIBLE_DEVICES"] == "1" + + +def test_build_cli_command_includes_page_ranges(tmp_path): + from glossapi.ocr.deepseek.runner import _build_cli_command + + cmd = _build_cli_command( + input_dir=tmp_path / "in", + output_dir=tmp_path / "out", + files=[], + page_ranges=["a.pdf:1:64", "b.pdf:65:128"], + model_dir=tmp_path / "model", + python_bin=Path("/usr/bin/python3"), + script=tmp_path / "run_vllm.py", + max_pages=None, + content_debug=False, + device="cuda", + ocr_profile="markdown_grounded", + prompt_override=None, + attn_backend="auto", + base_size=None, + image_size=None, + crop_mode=None, + render_dpi=144, + max_new_tokens=1024, + repetition_penalty=None, + no_repeat_ngram_size=None, + runtime_backend="vllm", + vllm_batch_size=32, + gpu_memory_utilization=0.9, + disable_fp8_kv=False, + repair_mode="auto", + ) + + assert "--page-ranges" in cmd + assert "a.pdf:1:64" in cmd + assert "b.pdf:65:128" in cmd + + +def test_vllm_empty_page_detector_is_conservative(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _is_effectively_empty_page + + empty_page = { + "top_dark_ratio": 0.0004, + "bottom_dark_ratio": 0.0006, + "top_third_dark_ratio": 0.0002, + "middle_third_dark_ratio": 0.0005, + "bottom_third_dark_ratio": 0.0007, + "overall_dark_ratio": 0.0008, + } + non_empty_sparse_page = { + "top_dark_ratio": 0.003, + "bottom_dark_ratio": 0.004, + "top_third_dark_ratio": 0.0028, + "middle_third_dark_ratio": 0.0031, + "bottom_third_dark_ratio": 0.0042, + 
"overall_dark_ratio": 0.0022, + } + assert _is_effectively_empty_page(empty_page, "auto") is True + assert _is_effectively_empty_page(non_empty_sparse_page, "auto") is False + assert _is_effectively_empty_page(empty_page, "off") is False + + +def test_repair_disposition_drops_repeat_garbage_cutoff(): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _resolve_repair_disposition + + disposition = _resolve_repair_disposition( + repair_text="garbage", + repair_postprocess={"early_stops": 1}, + ) + + assert disposition == { + "final_text": "", + "repair_applied": False, + "page_dropped_after_repair": True, + "drop_reason": "repeat_garbage_cutoff", + } + + +def test_repair_batch_updates_persisted_outputs_with_repeat_cutoff_drop(tmp_path, monkeypatch): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _run_repair_batch_to_outputs + + output_dir = tmp_path / "output" + _write_outputs( + output_dir=output_dir, + stem="doc", + markdown=_join_page_outputs(["bad first page", "page two"]), + page_count=2, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 1.0, + "raw_chars": 20, + "final_chars": 14, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + }, + { + "page_number": 2, + "infer_sec": 0.5, + "raw_chars": 8, + "final_chars": 8, + "repair_strategy": "none", + "repair_reason": None, + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + }, + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + + monkeypatch.setattr( + 
"glossapi.ocr.deepseek.run_pdf_ocr_vllm._iter_selected_rendered_pages", + lambda pdf_path, *, render_dpi, source_page_numbers: [(1, Image.new("RGB", (4, 4), "white"))], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._generate_batch_outputs", + lambda llm, *, jobs, prompt, batch_size, sampling_params: [ + {"item": jobs[0], "raw_text": "still broken", "infer_sec": 0.25} + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._postprocess_page_text", + lambda text, *, prompt, content_debug: ("garbage", {"early_stops": 1}), + ) + + result = _run_repair_batch_to_outputs( + SimpleNamespace(render_dpi=144, batch_size=8, content_debug=False, repair_mode="auto"), + batch={ + "stem": "doc", + "pdf_path": str(tmp_path / "doc.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + }, + output_dir=output_dir, + llm=object(), + plain_prompt="plain prompt", + sampling_params=object(), + ) + + markdown = (output_dir / "markdown" / "doc.md").read_text(encoding="utf-8") + metrics = json.loads((output_dir / "json" / "metrics" / "doc.metrics.json").read_text(encoding="utf-8")) + + assert result["pages"] == 1 + assert _split_page_outputs(markdown) == ["", "page two"] + assert metrics["repair_summary"]["pages_dropped_after_repeat_cutoff"] == 1 + + +def test_repair_batch_pack_updates_multiple_stems(tmp_path, monkeypatch): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _split_page_outputs, _write_outputs + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _run_repair_batches_to_outputs + + output_dir = tmp_path / "output" + _write_outputs( + output_dir=output_dir, + stem="doc_a", + markdown=_join_page_outputs(["bad a", "page two a"]), + page_count=2, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 1.0, + "raw_chars": 10, + "final_chars": 5, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + 
"repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + }, + { + "page_number": 2, + "infer_sec": 0.5, + "raw_chars": 9, + "final_chars": 9, + "repair_strategy": "none", + "repair_reason": None, + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": False, + }, + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + _write_outputs( + output_dir=output_dir, + stem="doc_b", + markdown=_join_page_outputs(["bad b"]), + page_count=1, + extra_metrics={ + "repair_mode": "auto", + "page_metrics": [ + { + "page_number": 1, + "infer_sec": 0.7, + "raw_chars": 8, + "final_chars": 5, + "repair_strategy": "plain", + "repair_reason": "early_stop_markdown_garbage", + "repair_attempted": False, + "repair_applied": False, + "page_dropped_after_repair": False, + "empty_page_skipped": False, + "garbage_early_stop_applied": True, + } + ], + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 0}, + }, + ) + + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._iter_selected_rendered_pages", + lambda pdf_path, *, render_dpi, source_page_numbers: [ + (page_number, Image.new("RGB", (4, 4), "white")) for page_number in source_page_numbers + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._generate_batch_outputs", + lambda llm, *, jobs, prompt, batch_size, sampling_params: [ + {"item": job, "raw_text": f"fixed-{job['stem']}-{job['page_number']}", "infer_sec": 0.25} + for job in jobs + ], + ) + monkeypatch.setattr( + "glossapi.ocr.deepseek.run_pdf_ocr_vllm._postprocess_page_text", + lambda text, *, prompt, content_debug: (text, {"early_stops": 0}), + ) + + result = _run_repair_batches_to_outputs( + SimpleNamespace(render_dpi=144, batch_size=8, content_debug=False, 
repair_mode="auto"), + batches=[ + { + "batch_id": 10, + "stem": "doc_a", + "pdf_path": str(tmp_path / "doc_a.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + "pages": 1, + }, + { + "batch_id": 11, + "stem": "doc_b", + "pdf_path": str(tmp_path / "doc_b.pdf"), + "source_start_page": 1, + "repair_page_numbers": [1], + "pages": 1, + }, + ], + output_dir=output_dir, + llm=object(), + plain_prompt="plain prompt", + sampling_params=object(), + ) + + markdown_a = (output_dir / "markdown" / "doc_a.md").read_text(encoding="utf-8") + markdown_b = (output_dir / "markdown" / "doc_b.md").read_text(encoding="utf-8") + metrics_a = json.loads((output_dir / "json" / "metrics" / "doc_a.metrics.json").read_text(encoding="utf-8")) + metrics_b = json.loads((output_dir / "json" / "metrics" / "doc_b.metrics.json").read_text(encoding="utf-8")) + + assert result["pages"] == 2 + assert result["docs"] == 2 + assert set(result["per_batch_results"]) == {10, 11} + assert _split_page_outputs(markdown_a)[0] == "fixed-doc_a-1" + assert _split_page_outputs(markdown_b)[0] == "fixed-doc_b-1" + assert metrics_a["repair_summary"]["pages_repaired"] == 1 + assert metrics_b["repair_summary"]["pages_repaired"] == 1 + + +def test_vllm_progress_sidecar_keeps_absolute_page_numbers(tmp_path): + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _emit_progress + + state = { + "page_outputs": ["", "page two"], + "total_pages": 2, + "completed_pages": 2, + } + + _emit_progress(tmp_path / "output", "doc", state) + + partial_markdown = (tmp_path / "output" / "sidecars" / "ocr_progress" / "doc.partial.md").read_text( + encoding="utf-8" + ) + assert partial_markdown == ( + "\n" + "<--- Page Split --->\n" + "page two\n" + ) + + +def test_early_stop_detects_symbol_and_numeric_list_garbage(): + from glossapi.ocr.utils.cleaning import detect_early_stop_index + + symbol_garbage = "Κανονικό κείμενο\n" + (" " * 20) + numeric_list_garbage = "Πρόλογος\n" + " ".join(f"{idx}." 
for idx in range(1, 20)) + + symbol_cut = detect_early_stop_index(symbol_garbage) + numeric_cut = detect_early_stop_index(numeric_list_garbage) + + assert symbol_cut is not None + assert "Κανονικό κείμενο" in symbol_garbage[:symbol_cut] + assert numeric_cut is not None + assert "Πρόλογος" in numeric_list_garbage[:numeric_cut] + + +def test_runner_selects_vllm_script_when_requested(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["script"] = kwargs["script"] + calls["runtime_backend"] = kwargs["runtime_backend"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["runtime_backend"] == "vllm" + assert Path(calls["script"]).name == "run_pdf_ocr_vllm.py" + assert result["doc"]["page_count"] == 1 + + +def test_runner_prefers_repo_local_deepseek_runtime_when_env_missing(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner, runtime_paths + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + repo_root = tmp_path / "repo" + python_bin = repo_root / "dependency_setup" / ".venvs" / "deepseek31111" / "bin" / "python" + python_bin.parent.mkdir(parents=True, exist_ok=True) + python_bin.write_text("", encoding="utf-8") + monkeypatch.setattr(runtime_paths, "REPO_ROOT", 
repo_root) + + calls = {} + + def fake_run_cli(input_dir, output_dir, **kwargs): + calls["python_bin"] = kwargs["python_bin"] + md_dir = output_dir / "markdown" + metrics_dir = output_dir / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_cli", fake_run_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.delenv("GLOSSAPI_DEEPSEEK_PYTHON", raising=False) + monkeypatch.delenv("GLOSSAPI_DEEPSEEK_TEST_PYTHON", raising=False) + + result = runner.run_for_files(corpus, ["doc.pdf"], runtime_backend="vllm") + + assert calls["python_bin"] == python_bin + assert result["doc"]["page_count"] == 1 + + +def test_runner_forwards_scheduler_controls_to_multi_cli(tmp_path, monkeypatch): + from glossapi.ocr.deepseek import runner + + corpus = _mk_corpus(tmp_path) + (corpus.input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + calls = {} + + def fake_run_multi_cli(**kwargs): + calls.update(kwargs) + md_dir = kwargs["out_root"] / "markdown" + metrics_dir = kwargs["out_root"] / "json" / "metrics" + md_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (md_dir / "doc.md").write_text("ok\n", encoding="utf-8") + (metrics_dir / "doc.metrics.json").write_text('{"page_count": 1}', encoding="utf-8") + + monkeypatch.setattr(runner, "_run_multi_cli", fake_run_multi_cli) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files( + corpus, + ["doc.pdf"], + runtime_backend="vllm", + use_gpus="multi", + devices=[0, 1], + scheduler="exact_fill", + target_batch_pages=196, + shard_pages=64, + shard_threshold_pages=256, + ) + + assert calls["scheduler"] == "exact_fill" + assert 
calls["target_batch_pages"] == 196 + assert calls["shard_pages"] == 64 + assert calls["shard_threshold_pages"] == 256 + assert result["doc"]["page_count"] == 1 + + + +def test_runner_reassembles_exact_fill_shards_into_canonical_outputs(tmp_path, monkeypatch): + import json + + from glossapi.ocr.deepseek import runner + from glossapi.ocr.deepseek.run_pdf_ocr_transformers import _join_page_outputs, _write_outputs + + corpus = _mk_corpus(tmp_path) + downloads_dir = corpus.input_dir / "downloads" + downloads_dir.mkdir(parents=True, exist_ok=True) + (downloads_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n%real\n") + + def fake_run_multi_cli(*, out_root, **kwargs): + del kwargs + common_metrics = { + "source_file": "doc.pdf", + "source_stem": "doc", + "ocr_profile": "markdown_grounded", + "attn_backend": "vllm", + "runtime_backend": "vllm", + "batch_size": 96, + "repair_mode": "auto", + } + _write_outputs( + output_dir=out_root, + stem="doc__p00001-00002", + markdown=_join_page_outputs(["page one", "page two"]), + page_count=2, + extra_metrics={ + **common_metrics, + "source_start_page": 1, + "source_end_page": 2, + "render_sec": 1.5, + "infer_sec_total": 2.5, + "wall_time_sec": 3.5, + "repair_summary": {"repair_mode": "auto", "pages_flagged": 1, "pages_repaired": 1}, + "page_metrics": [ + {"page_number": 1, "infer_sec": 1.0, "repair_strategy": "none", "repair_applied": False}, + {"page_number": 2, "infer_sec": 1.5, "repair_strategy": "plain", "repair_applied": True}, + ], + }, + ) + _write_outputs( + output_dir=out_root, + stem="doc__p00003-00004", + markdown=_join_page_outputs(["page three", "page four"]), + page_count=2, + extra_metrics={ + **common_metrics, + "source_start_page": 3, + "source_end_page": 4, + "render_sec": 0.5, + "infer_sec_total": 1.5, + "wall_time_sec": 2.0, + "repair_summary": {"repair_mode": "auto", "pages_flagged": 0, "pages_repaired": 0}, + "page_metrics": [ + {"page_number": 1, "infer_sec": 0.7, "repair_strategy": "none", "repair_applied": False}, 
+ {"page_number": 2, "infer_sec": 0.8, "repair_strategy": "none", "repair_applied": False}, + ], + }, + ) + + monkeypatch.setattr(runner, "_run_multi_cli", fake_run_multi_cli) + monkeypatch.setattr(runner, "_page_count", lambda path: 4) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(tmp_path)) + monkeypatch.setenv( + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + str(Path(runner.__file__).resolve().parent / "run_pdf_ocr_vllm.py"), + ) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", sys.executable) + + result = runner.run_for_files( + corpus, + ["doc.pdf"], + use_gpus="multi", + devices=[0, 1], + runtime_backend="vllm", + scheduler="exact_fill", + target_batch_pages=2, + ) + + canonical_md = corpus.output_dir / "markdown" / "doc.md" + canonical_metrics = corpus.output_dir / "json" / "metrics" / "doc.metrics.json" + assert canonical_md.exists() + assert canonical_metrics.exists() + assert canonical_md.read_text(encoding="utf-8") == _join_page_outputs( + ["page one", "page two", "page three", "page four"] + ) + "\n" + + metrics = json.loads(canonical_metrics.read_text(encoding="utf-8")) + assert metrics["reassembled_from_shards"] is True + assert metrics["reassembled_shard_count"] == 2 + assert [item["page_number"] for item in metrics["page_metrics"]] == [1, 2, 3, 4] + assert metrics["repair_summary"]["pages_flagged"] == 1 + assert metrics["repair_summary"]["pages_repaired"] == 1 + assert result["doc"]["page_count"] == 4 + + assert not (corpus.output_dir / "markdown" / "doc__p00001-00002.md").exists() + assert (corpus.output_dir / "sidecars" / "ocr_shards" / "markdown" / "doc__p00001-00002.md").exists() + assert (corpus.output_dir / "sidecars" / "ocr_shards" / "json" / "metrics" / "doc__p00003-00004.metrics.json").exists() + + +def test_vllm_batch_outputs_accept_in_memory_images_without_disk_roundtrip(): + from PIL import Image + + from glossapi.ocr.deepseek.run_pdf_ocr_vllm import _generate_batch_outputs + + class FakeOutput: + def __init__(self, text): + 
self.outputs = [type("TokenOutput", (), {"text": text})()] + + class FakeLLM: + def generate(self, prompt_batch, sampling_params=None): + del sampling_params + assert len(prompt_batch) == 2 + assert all(item["multi_modal_data"]["image"].mode == "RGB" for item in prompt_batch) + return [FakeOutput("alpha"), FakeOutput("beta")] + + jobs = [ + {"stem": "doc", "page_number": 1, "image": Image.new("RGB", (4, 4), color="white")}, + {"stem": "doc", "page_number": 2, "image": Image.new("RGB", (4, 4), color="black")}, + ] + outputs = _generate_batch_outputs( + FakeLLM(), + jobs=jobs, + prompt="prompt", + batch_size=2, + sampling_params=object(), + ) + + assert [item["raw_text"] for item in outputs] == ["alpha", "beta"] + assert jobs[0]["image"].size == (4, 4) + assert jobs[1]["image"].size == (4, 4) + for item in jobs: + item["image"].close() diff --git a/tests/test_deepseek_runner_stub.py b/tests/test_deepseek_runner_stub.py deleted file mode 100644 index aee5177..0000000 --- a/tests/test_deepseek_runner_stub.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -import pandas as pd - - -def _mk_corpus(tmp_path: Path): - from glossapi import Corpus - - root = tmp_path / "corpus" - root.mkdir() - return Corpus(input_dir=root, output_dir=root) - - -def test_deepseek_backend_stub_runs_and_updates_parquet(tmp_path, monkeypatch): - corpus = _mk_corpus(tmp_path) - - # Seed a minimal metadata parquet with one bad file - dl_dir = corpus.output_dir / "download_results" - dl_dir.mkdir(parents=True, exist_ok=True) - fname = "doc.pdf" - df = pd.DataFrame( - [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] - ) - parquet_path = dl_dir / "download_results.parquet" - df.to_parquet(parquet_path, index=False) - - # Create an empty placeholder file for the PDF - (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") - - # Monkeypatch the runner internal to avoid heavy imports - from glossapi.ocr.deepseek import runner - - def 
fake_run_one(pdf_path, md_out, metrics_out, cfg): - md_out.parent.mkdir(parents=True, exist_ok=True) - metrics_out.parent.mkdir(parents=True, exist_ok=True) - md_out.write_text("deepseek stub output\n", encoding="utf-8") - metrics_out.write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") - return {"page_count": 1} - - monkeypatch.setattr(runner, "_run_one_pdf", fake_run_one) - - # Run OCR via dispatcher - corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) - - # Artifacts exist - stem = "doc" - md = corpus.output_dir / "markdown" / f"{stem}.md" - metrics = corpus.output_dir / "json" / "metrics" / f"{stem}.metrics.json" - assert md.exists(), "Markdown output should be created by deepseek stub" - assert metrics.exists(), "Metrics JSON should be created by deepseek stub" - - # Parquet updated - updated = pd.read_parquet(parquet_path).set_index("filename") - row = updated.loc[fname] - assert bool(row["ocr_success"]) is True - assert bool(row["needs_ocr"]) is False - # extraction_mode is optional; if present assert value - if "extraction_mode" in updated.columns: - assert updated.loc[fname, "extraction_mode"] == "deepseek" diff --git a/tests/test_deepseek_scheduling.py b/tests/test_deepseek_scheduling.py new file mode 100644 index 0000000..25983a8 --- /dev/null +++ b/tests/test_deepseek_scheduling.py @@ -0,0 +1,238 @@ +from pathlib import Path + + +def _touch_files(root: Path, names: list[str]) -> None: + root.mkdir(parents=True, exist_ok=True) + for name in names: + (root / name).write_bytes(b"%PDF-1.4\n%stub\n") + + +def test_plan_lanes_balances_weighted_docs_greedily(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "huge.pdf": 500, + "mid_a.pdf": 300, + "mid_b.pdf": 300, + "small_a.pdf": 200, + "tiny_a.pdf": 100, + "tiny_b.pdf": 100, + } + _touch_files(tmp_path, list(weights)) + + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + lanes = runner._plan_lanes( + file_list=["tiny_b.pdf", 
"mid_a.pdf", "huge.pdf", "small_a.pdf", "tiny_a.pdf", "mid_b.pdf"], + input_root=tmp_path, + lane_devices=[0, 1, 2], + workers_per_gpu=1, + max_pages=None, + ) + + assert [int(lane["weight"]) for lane in lanes] == [500, 500, 500] + assigned = [name for lane in lanes for name in lane["files"]] + assert sorted(assigned) == sorted(weights) + assert len(assigned) == len(set(assigned)) + + +def test_auto_vllm_batch_size_caps_total_pages(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "a.pdf": 90, + "b.pdf": 120, + "c.pdf": 400, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + capped = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=None, + ) + reduced = runner._auto_vllm_batch_size( + runtime_backend="vllm", + file_list=list(weights), + input_root=tmp_path, + max_pages=20, + ) + + assert capped == 160 + assert reduced == 60 + + +def test_auto_scheduler_prefers_exact_fill_for_multi_gpu_vllm(): + from glossapi.ocr.deepseek import runner + + assert runner._resolve_scheduler( + scheduler="auto", + runtime_backend="vllm", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "exact_fill" + assert runner._resolve_scheduler( + scheduler="auto", + runtime_backend="transformers", + lane_devices=[0, 1], + workers_per_gpu=1, + ) == "whole_doc" + + +def test_fixed_shard_builder_only_splits_large_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument, build_fixed_shard_slices + + documents = [ + SourceDocument(name="huge.pdf", pages=310), + SourceDocument(name="mid.pdf", pages=120), + SourceDocument(name="small.pdf", pages=40), + ] + + slices = build_fixed_shard_slices(documents, shard_pages=128, shard_threshold_pages=200) + + assert [item.item_id for item in slices] == [ + "huge.pdf:1:128", + "huge.pdf:129:256", + "huge.pdf:257:310", + "mid.pdf", + "small.pdf", + ] + + +def 
test_exact_fill_batches_split_documents_to_fill_target(): + from glossapi.ocr.deepseek.scheduling import SourceDocument, build_exact_fill_batches + + documents = [ + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=60), + SourceDocument(name="c.pdf", pages=60), + SourceDocument(name="d.pdf", pages=20), + ] + + batches = build_exact_fill_batches(documents, target_batch_pages=160) + + assert [batch.pages for batch in batches] == [160, 160, 20] + assert [item.item_id for item in batches[0].items] == ["a.pdf:1:160"] + assert set(item.item_id for item in batches[1].items) == {"a.pdf:161:200", "b.pdf", "c.pdf"} + assert [item.item_id for item in batches[2].items] == ["d.pdf"] + + +def test_assign_batches_to_lanes_balances_full_batches(): + from glossapi.ocr.deepseek.scheduling import ( + BatchPlan, + WorkSlice, + assign_batches_to_lanes, + ) + + batches = [ + BatchPlan(batch_id=0, items=[WorkSlice("a.pdf", 160, 1, 160)]), + BatchPlan(batch_id=1, items=[WorkSlice("b.pdf", 160, 1, 160)]), + BatchPlan(batch_id=2, items=[WorkSlice("c.pdf", 160, 1, 160)]), + BatchPlan(batch_id=3, items=[WorkSlice("d.pdf", 20, 1, 20)]), + ] + + lanes = assign_batches_to_lanes(batches, devices=[0, 1], workers_per_gpu=1) + + assert sorted(lane.assigned_pages for lane in lanes) == [180, 320] + assert [len(lane.batches) for lane in lanes] == [2, 2] + + +def test_benchmark_planner_exact_fill_mixes_ranges_and_whole_docs(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=200), + SourceDocument(name="tiny.pdf", pages=20), + SourceDocument(name="mid.pdf", pages=60), + SourceDocument(name="mid2.pdf", pages=60), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch 
in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", "mid2.pdf", "tiny.pdf"] + + +def test_benchmark_planner_whole_doc_preserves_whole_files(): + from glossapi.ocr.deepseek.scheduling import SourceDocument + from glossapi.scripts.deepseek_pipeline_benchmark import _plan_lanes + + lanes = _plan_lanes( + documents=[ + SourceDocument(name="monster.pdf", pages=1085), + SourceDocument(name="a.pdf", pages=200), + SourceDocument(name="b.pdf", pages=200), + ], + devices=[0, 1], + workers_per_gpu=1, + scheduler="whole_doc", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + assigned = [name for lane in lanes for batch in lane["batches"] for name in batch["files"]] + assert sorted(assigned) == ["a.pdf", "b.pdf", "monster.pdf"] + + +def test_runner_lane_batches_exact_fill_split_large_docs(monkeypatch, tmp_path): + from glossapi.ocr.deepseek import runner + + weights = { + "monster.pdf": 200, + "mid.pdf": 60, + "mid2.pdf": 60, + "tiny.pdf": 20, + } + _touch_files(tmp_path, list(weights)) + monkeypatch.setattr(runner, "_page_count", lambda path: weights[path.name]) + + lanes = runner._plan_lane_batches( + file_list=list(weights), + input_root=tmp_path, + lane_devices=[0, 1], + workers_per_gpu=1, + max_pages=None, + runtime_backend="vllm", + scheduler="exact_fill", + target_batch_pages=160, + shard_pages=0, + shard_threshold_pages=0, + ) + + all_ranges = [ + spec + for lane in lanes + for batch in lane["batches"] + for spec in batch.get("page_ranges", []) + ] + all_files = [ + name + for lane in lanes + for batch in lane["batches"] + for name in batch.get("files", []) + ] + assert "monster.pdf:1:160" in all_ranges + assert "monster.pdf:161:200" in all_ranges + assert sorted(all_files) == ["mid.pdf", 
"mid2.pdf", "tiny.pdf"] diff --git a/tests/test_docling_pipeline_tuning.py b/tests/test_docling_pipeline_tuning.py new file mode 100644 index 0000000..d57aadb --- /dev/null +++ b/tests/test_docling_pipeline_tuning.py @@ -0,0 +1,35 @@ +from glossapi.ocr.docling import pipeline as docling_pipeline + + +def test_apply_common_pdf_options_prefers_threaded_pipeline_options_when_available(): + acc, _ = docling_pipeline._resolve_accelerator("cuda:0") + opts = docling_pipeline._apply_common_pdf_options( + acc=acc, + images_scale=1.25, + formula_enrichment=False, + code_enrichment=False, + ) + + expected_cls = docling_pipeline.ThreadedPdfPipelineOptions or docling_pipeline.PdfPipelineOptions + assert isinstance(opts, expected_cls) + + +def test_apply_runtime_overrides_updates_docling_page_batch_size(monkeypatch): + class Perf: + page_batch_size = 4 + + class Settings: + perf = Perf() + + monkeypatch.setenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", "8") + monkeypatch.setattr(docling_pipeline, "docling_settings", Settings(), raising=False) + + acc, _ = docling_pipeline._resolve_accelerator("cuda:0") + docling_pipeline._apply_common_pdf_options( + acc=acc, + images_scale=1.25, + formula_enrichment=False, + code_enrichment=False, + ) + + assert Settings.perf.page_batch_size == 8 diff --git a/tests/test_extract_checkpoint_benchmark.py b/tests/test_extract_checkpoint_benchmark.py new file mode 100644 index 0000000..aefa3e5 --- /dev/null +++ b/tests/test_extract_checkpoint_benchmark.py @@ -0,0 +1,85 @@ +import json +from pathlib import Path + +from glossapi.scripts import extract_checkpoint_benchmark as benchmark + + +def test_markdown_headers_counts_markdown_headings(): + text = "# Title\n\ntext\n## Subtitle\n\nnot a header\n### Third\n" + assert benchmark._markdown_headers(text) == 3 + + +def test_compare_inventory_detects_presence_size_header_and_sha_changes(): + baseline = { + "a": {"present": True, "byte_size": 10, "header_count": 1, "sha256": "old"}, + "b": {"present": True, 
"byte_size": 20, "header_count": 0, "sha256": "same"}, + } + current = { + "a": {"present": True, "byte_size": 12, "header_count": 2, "sha256": "new"}, + "c": {"present": True, "byte_size": 5, "header_count": 0, "sha256": "other"}, + } + diff = benchmark._compare_inventory(current, baseline) + assert diff["added_markdown"] == ["c"] + assert diff["missing_markdown"] == ["b"] + assert diff["byte_size_changed"] == ["a"] + assert diff["header_count_changed"] == ["a"] + assert diff["sha_changed"] == ["a"] + + +def test_load_baseline_inventory_reads_report_payload(tmp_path): + report_path = tmp_path / "baseline.json" + report_path.write_text( + json.dumps({"markdown_inventory": {"doc": {"present": True, "byte_size": 1, "header_count": 0}}}), + encoding="utf-8", + ) + assert benchmark._load_baseline_inventory(report_path)["doc"]["present"] is True + + +def test_inventory_markdown_marks_missing_files(tmp_path): + input_pdf = tmp_path / "sample.pdf" + input_pdf.write_bytes(b"%PDF-1.4\n") + markdown_dir = tmp_path / "markdown" + markdown_dir.mkdir() + inventory = benchmark._inventory_markdown(markdown_dir, pdf_paths=[input_pdf]) + assert inventory["sample"]["present"] is False + assert inventory["sample"]["byte_size"] == 0 + assert inventory["sample"]["header_count"] == 0 + + +def test_runtime_env_snapshot_captures_docling_batch_knobs(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + monkeypatch.setenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", "384") + monkeypatch.setenv("GLOSSAPI_DOCLING_PAGE_BATCH_SIZE", "8") + + snapshot = benchmark._runtime_env_snapshot() + + assert snapshot["GLOSSAPI_DOCLING_MAX_BATCH_FILES"] == "2" + assert snapshot["GLOSSAPI_DOCLING_BATCH_TARGET_PAGES"] == "384" + assert snapshot["GLOSSAPI_DOCLING_PAGE_BATCH_SIZE"] == "8" + + +def test_apply_cli_tuning_overrides_sets_docling_env(monkeypatch): + for env_name in benchmark.TUNING_ENV_VARS: + monkeypatch.delenv(env_name, raising=False) + + args = benchmark._parse_args( + [ + 
"--input-dir", + "/tmp/in", + "--output-dir", + "/tmp/out", + "--report-path", + "/tmp/report.json", + "--docling-max-batch-files", + "2", + "--docling-batch-target-pages", + "512", + "--docling-page-batch-size", + "8", + ] + ) + benchmark._apply_cli_tuning_overrides(args) + + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_MAX_BATCH_FILES"] == "2" + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_BATCH_TARGET_PAGES"] == "512" + assert benchmark._runtime_env_snapshot()["GLOSSAPI_DOCLING_PAGE_BATCH_SIZE"] == "8" diff --git a/tests/test_full_pipeline_checkpoint.py b/tests/test_full_pipeline_checkpoint.py new file mode 100644 index 0000000..5c36250 --- /dev/null +++ b/tests/test_full_pipeline_checkpoint.py @@ -0,0 +1,271 @@ +import json + +import pandas as pd + +from glossapi.scripts import full_pipeline_checkpoint as checkpoint + + +def test_read_metadata_counts_handles_missing_and_populated_parquet(tmp_path): + missing = checkpoint._read_metadata_counts(tmp_path / "missing.parquet") + assert missing["rows_total"] == 0 + + parquet_path = tmp_path / "download_results.parquet" + pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}, + {"filename": "b.pdf", "needs_ocr": False, "ocr_success": True, "text": "hello"}, + ] + ).to_parquet(parquet_path, index=False) + + counts = checkpoint._read_metadata_counts(parquet_path) + assert counts == { + "rows_total": 2, + "needs_ocr_true": 1, + "ocr_success_true": 1, + "text_nonempty": 1, + } + + +def test_full_pipeline_checkpoint_main_writes_summary(tmp_path, monkeypatch): + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + md = self.output_dir / "markdown" + md.mkdir(parents=True, exist_ok=True) 
+ (md / "doc.md").write_text("raw text", encoding="utf-8") + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def ocr(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + (input_dir / "doc.pdf").write_bytes(b"%PDF-1.4\n") + + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + ] + ) + + assert rc == 0 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert report["post_clean_counts"]["needs_ocr_true"] == 1 + assert report["post_ocr_counts"]["ocr_success_true"] == 1 + assert report["export_records"] == 1 + + +def test_full_pipeline_checkpoint_can_resume_from_ocr_phase(tmp_path, monkeypatch): + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + raise AssertionError("extract should have been skipped") + + def clean(self, **kwargs): + raise AssertionError("clean should have been skipped") + + def ocr(self, 
**kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + metadata_path = output_dir / "download_results" / "download_results.parquet" + metadata_path.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(metadata_path, index=False) + + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + "--skip-extract", + "--skip-clean", + ] + ) + + assert rc == 0 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert report["skipped_phases"] == ["extract", "clean"] + assert report["post_extract_counts"]["needs_ocr_true"] == 1 + assert report["post_ocr_counts"]["ocr_success_true"] == 1 + assert report["export_records"] == 1 + + +def test_full_pipeline_checkpoint_forwards_repair_exec_batch_controls(tmp_path, monkeypatch): + captured = {} + + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + return None + + def ocr(self, **kwargs): + 
captured.update(kwargs) + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + output_path.write_text(json.dumps({"text": "fixed text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + "--ocr-repair-exec-batch-target-pages", + "64", + "--ocr-repair-exec-batch-target-items", + "24", + ] + ) + + assert rc == 0 + assert captured["repair_exec_batch_target_pages"] == 64 + assert captured["repair_exec_batch_target_items"] == 24 + + +def test_full_pipeline_checkpoint_retries_empty_export_when_ocr_text_exists(tmp_path, monkeypatch): + calls = {"jsonl": 0} + + class DummyCorpus: + def __init__(self, input_dir, output_dir): + self.input_dir = input_dir + self.output_dir = output_dir + + def _metadata_path(self): + path = self.output_dir / "download_results" / "download_results.parquet" + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def extract(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": True, "ocr_success": False, "text": ""}] + ).to_parquet(self._metadata_path(), index=False) + + def clean(self, **kwargs): + return None + + def ocr(self, **kwargs): + pd.DataFrame( + [{"filename": "doc.pdf", "needs_ocr": False, "ocr_success": True, "text": "fixed text"}] + ).to_parquet(self._metadata_path(), index=False) + + def jsonl(self, output_path, **kwargs): + calls["jsonl"] += 1 + if calls["jsonl"] == 1: + output_path.write_text("", encoding="utf-8") + return + output_path.write_text(json.dumps({"text": "fixed 
text"}) + "\n", encoding="utf-8") + + monkeypatch.setattr(checkpoint, "Corpus", DummyCorpus) + + input_dir = tmp_path / "in" + input_dir.mkdir() + output_dir = tmp_path / "out" + export_path = tmp_path / "export.jsonl" + report_path = tmp_path / "report.json" + + rc = checkpoint.main( + [ + "--input-dir", + str(input_dir), + "--output-dir", + str(output_dir), + "--export-path", + str(export_path), + "--report-path", + str(report_path), + ] + ) + + assert rc == 0 + assert calls["jsonl"] == 2 + report = json.loads(report_path.read_text(encoding="utf-8")) + assert report["post_ocr_counts"]["text_nonempty"] == 1 + assert report["export_records"] == 1 diff --git a/tests/test_gloss_downloader_dynamic_html.py b/tests/test_gloss_downloader_dynamic_html.py new file mode 100644 index 0000000..a1bd678 --- /dev/null +++ b/tests/test_gloss_downloader_dynamic_html.py @@ -0,0 +1,53 @@ +from glossapi.gloss_downloader import GlossDownloader + + +def test_detects_waf_challenge_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360" + headers = { + "Content-Type": "text/html; charset=UTF-8", + "x-amzn-waf-action": "challenge", + } + body = b""" + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "challenge page" in error.lower() + + +def test_detects_js_document_viewer_html(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b""" + + + + + """ + + assert downloader.infer_file_extension(url, headers, body) == "html" + error = downloader._detect_html_interstitial(url, headers, body) + + assert error is not None + assert "document viewer" in error.lower() + + +def 
test_regular_html_document_is_still_allowed(tmp_path): + downloader = GlossDownloader(output_dir=str(tmp_path)) + url = "https://example.org/article" + headers = { + "Content-Type": "text/html; charset=UTF-8", + } + body = b"""Article +

Normal HTML document

Body text.

""" + + assert downloader.infer_file_extension(url, headers, body) == "html" + assert downloader._detect_html_interstitial(url, headers, body) is None diff --git a/tests/test_install_glossapi.py b/tests/test_install_glossapi.py new file mode 100644 index 0000000..5226429 --- /dev/null +++ b/tests/test_install_glossapi.py @@ -0,0 +1,51 @@ +from pathlib import Path + +from glossapi.scripts.install_glossapi import ( + build_deepseek_command, + build_install_plan, + build_pip_command, +) + + +def test_build_install_plan_collects_phase_extras(): + plan = build_install_plan( + phases=["download", "browser_download", "extract", "ocr"], + editable=True, + include_cuda=False, + ) + + assert plan.phases == ("download", "browser_download", "extract", "ocr") + assert set(plan.extras) == {"browser", "docling"} + assert plan.editable is True + assert plan.needs_deepseek_runtime is True + + +def test_build_install_plan_adds_cuda_extra(): + plan = build_install_plan( + phases=["download"], + editable=False, + include_cuda=True, + ) + + assert set(plan.extras) == {"cuda"} + assert plan.editable is False + assert plan.needs_deepseek_runtime is False + + +def test_build_pip_command_uses_editable_install(): + plan = build_install_plan( + phases=["download", "browser_download"], + editable=True, + include_cuda=False, + ) + command = build_pip_command(plan, Path("/tmp/repo")) + + assert command[:4] == [command[0], "-m", "pip", "install"] + assert "-e" in command + assert command[-1] == ".[browser]" + + +def test_build_deepseek_command_points_to_setup_script(): + command = build_deepseek_command(Path("/tmp/repo")) + + assert command is None or command[0] diff --git a/tests/test_jsonl_export.py b/tests/test_jsonl_export.py index e05caa0..aecd7a3 100644 --- a/tests/test_jsonl_export.py +++ b/tests/test_jsonl_export.py @@ -458,6 +458,39 @@ def test_jsonl_export_sharded(tmp_path): assert len(seen_doc_ids) == len(texts) +def test_jsonl_prefers_base_markdown_when_chunks_exist(tmp_path): + 
corpus = Corpus(input_dir=tmp_path / "in_chunks", output_dir=tmp_path / "out_chunks") + + base_text = "## Base Title\n\nMerged body from extraction." + base_path = corpus.cleaned_markdown_dir / "chunked.md" + base_path.parent.mkdir(parents=True, exist_ok=True) + base_path.write_text(base_text, encoding="utf-8") + + chunk_dir = corpus.cleaned_markdown_dir / "chunks" / "chunked" + chunk_dir.mkdir(parents=True, exist_ok=True) + (chunk_dir / "chunked__p0001-0002.md").write_text("chunk-one", encoding="utf-8") + (chunk_dir / "chunked__p0003-0004.md").write_text("chunk-two", encoding="utf-8") + + _write_download_results( + corpus.output_dir / "download_results" / "download_results.parquet", + [ + { + "filename": "chunked.pdf", + "filter": "ok", + "needs_ocr": False, + "is_empty": False, + "char_count_no_comments": 10, + } + ], + ) + + out_path = corpus.output_dir / "chunked.jsonl" + corpus.jsonl(out_path) + + record = json.loads(out_path.read_text(encoding="utf-8").strip()) + assert record["document"] == base_text + + @pytest.mark.skipif(not _HAS_DATASETS, reason="datasets package is not installed") def test_hf_streaming_loader_example(tmp_path): corpus = Corpus(input_dir=tmp_path / "in7", output_dir=tmp_path / "out7") @@ -531,5 +564,6 @@ def test_pyarrow_filter_example(tmp_path): table = dataset.to_table(filter=(ds.field("lang") == "el") & (ds.field("year") >= 2019)) assert set(table.column("doc_id").to_pylist()) == {"a"} + def _expected_doc_id(filename: str) -> str: return hashlib.sha256(filename.encode("utf-8")).hexdigest() diff --git a/tests/test_metadata_fallback.py b/tests/test_metadata_fallback.py index f899f17..53524eb 100644 --- a/tests/test_metadata_fallback.py +++ b/tests/test_metadata_fallback.py @@ -210,6 +210,8 @@ def test_canonical_stem_variants(): "beta.metrics.json": "beta", "gamma.per_page.metrics.json": "gamma", "delta.with.dots.pdf": "delta.with.dots", + "needs__p0001-0002.pdf": "needs", + "needs__p00001-00096.md": "needs", } for source, expected in 
cases.items(): assert canonical_stem(source) == expected diff --git a/tests/test_ocr_backends_smoke.py b/tests/test_ocr_backends_smoke.py index 0419ba4..096bf73 100644 --- a/tests/test_ocr_backends_smoke.py +++ b/tests/test_ocr_backends_smoke.py @@ -1,3 +1,4 @@ +import hashlib from pathlib import Path import pandas as pd @@ -11,7 +12,7 @@ def _mk_corpus(tmp_path: Path): return Corpus(input_dir=root, output_dir=root) -def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): +def test_deepseek_ocr_then_math_only_smoke(tmp_path, monkeypatch): corpus = _mk_corpus(tmp_path) # Two PDFs: one needs OCR, one does not (for math-only later) @@ -28,7 +29,7 @@ def test_cross_backend_smoke_with_stubs(tmp_path, monkeypatch): parquet_path = dl_dir / "download_results.parquet" df.to_parquet(parquet_path, index=False) - # DeepSeek stub for OCR + # DeepSeek runner is stubbed here only to avoid the heavy model during unit tests. from glossapi.ocr.deepseek import runner def fake_run_for_files(self_ref, files, **kwargs): @@ -45,7 +46,7 @@ def fake_run_for_files(self_ref, files, **kwargs): # Run DeepSeek OCR for bad files corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=True, mode="ocr_bad_then_math") - # RapidOCR math-only pass: ensure JSON for clean.pdf and run math + # Math-only pass: ensure JSON for clean.pdf and run math json_dir = corpus.output_dir / "json" json_dir.mkdir(parents=True, exist_ok=True) (json_dir / "clean.docling.json").write_text("{}", encoding="utf-8") @@ -58,9 +59,58 @@ def fake_enrich(files=None, **kwargs): monkeypatch.setattr(corpus, "formula_enrich_from_json", fake_enrich) - corpus.ocr(backend="rapidocr", fix_bad=False, math_enhance=True, mode="math_only") + corpus.ocr(backend="deepseek", fix_bad=False, math_enhance=True, mode="math_only") # Verify updated = pd.read_parquet(parquet_path).set_index("filename") assert bool(updated.loc["needs.pdf", "ocr_success"]) is True + assert bool(updated.loc["needs.pdf", "needs_ocr"]) is False + assert 
updated.loc["needs.pdf", "text"] == "ds md\n" + assert updated.loc["needs.pdf", "ocr_markdown_relpath"] == "markdown/needs.md" + assert updated.loc["needs.pdf", "ocr_metrics_relpath"] == "json/metrics/needs.metrics.json" + assert ( + updated.loc["needs.pdf", "ocr_text_sha256"] + == hashlib.sha256(b"ds md\n").hexdigest() + ) assert captured.get("files") == ["clean"], "Math-only should run for non-OCR stem only" + + +def test_deepseek_ocr_normalizes_chunk_rows_to_real_source_pdf(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + (corpus.input_dir / "needs.pdf").write_bytes(b"%PDF-1.4\n%stub\n") + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + parquet_path = dl_dir / "download_results.parquet" + pd.DataFrame( + [ + {"filename": "needs.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + {"filename": "needs__p0001-0002.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False}, + ] + ).to_parquet(parquet_path, index=False) + + from glossapi.ocr.deepseek import runner + + captured = {} + + def fake_run_for_files(self_ref, files, **kwargs): + captured["files"] = list(files) + markdown_dir = corpus.output_dir / "markdown" + metrics_dir = corpus.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + (markdown_dir / "needs.md").write_text("normalized md\n", encoding="utf-8") + (metrics_dir / "needs.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {"needs": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False, mode="ocr_bad") + + assert captured["files"] == ["needs.pdf"] + updated = pd.read_parquet(parquet_path).set_index("filename") + assert bool(updated.loc["needs.pdf", "ocr_success"]) is True + assert bool(updated.loc["needs__p0001-0002.pdf", "ocr_success"]) is True + 
assert updated.loc["needs.pdf", "text"] == "normalized md\n" + assert updated.loc["needs__p0001-0002.pdf", "text"] == "normalized md\n" diff --git a/tests/test_ocr_dispatch_backends.py b/tests/test_ocr_dispatch_backends.py index 965692b..08eb326 100644 --- a/tests/test_ocr_dispatch_backends.py +++ b/tests/test_ocr_dispatch_backends.py @@ -51,29 +51,156 @@ def fail_math(*args, **kwargs): assert calls.get("files") == [fname] -def test_rapidocr_backend_routes_to_extract_with_docling(tmp_path, monkeypatch): +def test_deepseek_backend_forwards_repair_exec_batch_controls(tmp_path, monkeypatch): corpus = _mk_corpus(tmp_path) - # Seed minimal metadata parquet that flags a single file for OCR dl_dir = corpus.output_dir / "download_results" dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" df = pd.DataFrame([ - {"filename": "doc.pdf", corpus.url_column: "", "needs_ocr": True, "ocr_success": False} + {"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False} ]) df.to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + mode="ocr_bad", + repair_exec_batch_target_pages=64, + repair_exec_batch_target_items=24, + ) + + assert calls.get("files") == [fname] + assert calls["kwargs"]["repair_exec_batch_target_pages"] == 64 + assert calls["kwargs"]["repair_exec_batch_target_items"] == 24 + + +def test_invalid_backend_is_rejected(tmp_path): + corpus = _mk_corpus(tmp_path) + with pytest.raises(ValueError, match="backend must be 'deepseek'"): + corpus.ocr(backend="bogus", fix_bad=True, math_enhance=False) + + +def 
test_deepseek_backend_forwards_parallelism_controls(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + calls = {} + + def fake_run_for_files(self_ref, files, **kwargs): + calls["files"] = list(files) + calls["kwargs"] = dict(kwargs) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + corpus.ocr( + backend="deepseek", + fix_bad=True, + math_enhance=False, + use_gpus="multi", + devices=[1, 3], + workers_per_gpu=2, + ocr_profile="plain_ocr", + attn_backend="sdpa", + base_size=640, + image_size=448, + crop_mode=True, + render_dpi=120, + max_pages=7, + max_new_tokens=2048, + repetition_penalty=1.08, + no_repeat_ngram_size=12, + ) + + assert calls["files"] == [fname] + assert calls["kwargs"]["use_gpus"] == "multi" + assert calls["kwargs"]["devices"] == [1, 3] + assert calls["kwargs"]["workers_per_gpu"] == 2 + assert calls["kwargs"]["ocr_profile"] == "plain_ocr" + assert calls["kwargs"]["attn_backend"] == "sdpa" + assert calls["kwargs"]["base_size"] == 640 + assert calls["kwargs"]["image_size"] == 448 + assert calls["kwargs"]["crop_mode"] is True + assert calls["kwargs"]["render_dpi"] == 120 + assert calls["kwargs"]["max_pages"] == 7 + assert calls["kwargs"]["max_new_tokens"] == 2048 + assert calls["kwargs"]["repetition_penalty"] == 1.08 + assert calls["kwargs"]["no_repeat_ngram_size"] == 12 + + +def test_deepseek_rerun_refreshes_with_clean_ocr_then_score_only_clean(tmp_path, monkeypatch): + corpus = _mk_corpus(tmp_path) + + dl_dir = corpus.output_dir / "download_results" + dl_dir.mkdir(parents=True, exist_ok=True) + 
fname = "doc.pdf" + pd.DataFrame( + [{"filename": fname, corpus.url_column: "", "needs_ocr": True, "ocr_success": False}] + ).to_parquet(dl_dir / "download_results.parquet", index=False) + (corpus.input_dir / fname).write_bytes(b"%PDF-1.4\n%stub\n") + + from glossapi.ocr.deepseek import runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for current in files: + stem = Path(current).stem + (markdown_dir / f"{stem}.md").write_text("ocr text\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text( + "{\"page_count\": 1}\n", + encoding="utf-8", + ) + return {"doc": {"page_count": 1}} + + monkeypatch.setattr(runner, "run_for_files", fake_run_for_files) + + calls = [] + original_markdown_dir = corpus.markdown_dir - captured = {} + def fake_clean_ocr(*args, **kwargs): + calls.append( + ("clean_ocr", kwargs.get("input_dir"), kwargs.get("write_cleaned_files", True)) + ) + corpus.cleaned_markdown_dir.mkdir(parents=True, exist_ok=True) + corpus.markdown_dir = corpus.cleaned_markdown_dir - def fake_extract(**kwargs): - captured.update(kwargs) - return None + def fake_clean(*args, **kwargs): + calls.append(("clean", kwargs.get("input_dir"), kwargs.get("write_cleaned_files", True))) - monkeypatch.setattr(corpus, "extract", fake_extract) + monkeypatch.setattr(corpus, "clean_ocr", fake_clean_ocr) + monkeypatch.setattr(corpus, "clean", fake_clean) - corpus.ocr(backend="rapidocr", fix_bad=True, math_enhance=False, use_gpus="single", devices=[0]) + corpus.ocr(backend="deepseek", fix_bad=True, math_enhance=False) - assert captured, "Expected extract() to be called for rapidocr backend" - assert captured.get("force_ocr") is True - assert captured.get("phase1_backend") == "docling" - files = captured.get("filenames") or [] - assert files and files[0] == 
"doc.pdf" + assert calls[0][0] == "clean_ocr" + assert Path(str(calls[0][1])) == original_markdown_dir + assert calls[0][2] is True + assert calls[1][0] == "clean" + assert Path(str(calls[1][1])) == corpus.cleaned_markdown_dir + assert calls[1][2] is False diff --git a/tests/test_ocr_golden_pages.py b/tests/test_ocr_golden_pages.py new file mode 100644 index 0000000..6274d96 --- /dev/null +++ b/tests/test_ocr_golden_pages.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import difflib +import json +from pathlib import Path + +from glossapi import Corpus +from glossapi.corpus.phase_clean import _render_combined_ocr_debug_page + + +GOLDEN_DIR = Path( + "/home/foivos/data/openarchives_ocr_ingest_20260403/debug/ocr_golden_pages_first300_20260410" +) + + +def _load_manifest_rows() -> list[dict]: + manifest_path = GOLDEN_DIR / "manifest.jsonl" + assert manifest_path.exists(), f"Missing OCR golden manifest: {manifest_path}" + return [ + json.loads(line) + for line in manifest_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def _format_diff(case_id: str, expected: str, actual: str) -> str: + diff = list( + difflib.unified_diff( + expected.splitlines(), + actual.splitlines(), + fromfile=f"{case_id}:expected", + tofile=f"{case_id}:actual", + lineterm="", + n=3, + ) + ) + return "\n".join(diff[:120]) + + +def test_combined_ocr_real_goldens_match_exact_output(tmp_path: Path) -> None: + rows = _load_manifest_rows() + assert len(rows) >= 300, f"Expected hundreds of real OCR golden cases, got {len(rows)}" + + corpus = Corpus(input_dir=tmp_path / "input", output_dir=tmp_path / "output") + corpus.input_dir.mkdir(parents=True, exist_ok=True) + corpus.output_dir.mkdir(parents=True, exist_ok=True) + noise_mod = corpus._load_rust_extension( + "glossapi_rs_noise", + "rust/glossapi_rs_noise/Cargo.toml", + required_attrs=("find_numeric_debug_page_spans", "evaluate_page_character_noise"), + ) + + mismatches: list[str] = [] + for row in rows: + case_id = 
str(row["case_id"]) + input_path = Path(str(row["input_path"])) + expected_path = Path(str(row["expected_path"])) + page_text = input_path.read_text(encoding="utf-8") + expected = expected_path.read_text(encoding="utf-8") + actual = _render_combined_ocr_debug_page( + page_text, + noise_mod=noise_mod, + min_progress_steps=10, + min_repeat_steps=8, + min_same_digit_steps=10, + word_rep_threshold=4, + word_min_period=3, + word_window=96, + )["annotated_page"] + if actual != expected: + mismatches.append(_format_diff(case_id, expected, actual)) + if len(mismatches) >= 5: + break + + assert not mismatches, "\n\n".join(mismatches) diff --git a/tests/test_ocr_imports.py b/tests/test_ocr_imports.py index 3487619..094e72b 100644 --- a/tests/test_ocr_imports.py +++ b/tests/test_ocr_imports.py @@ -8,32 +8,19 @@ def test_import_ocr_package_is_lightweight(): import glossapi.ocr as ocr assert hasattr(ocr, "deepseek") - assert hasattr(ocr, "rapidocr") # New subpackages remain importable lazily import glossapi.ocr.deepseek.runner as deepseek_runner - import glossapi.ocr.rapidocr.dispatch as rapid_dispatch assert ocr.deepseek.runner is deepseek_runner - assert ocr.rapidocr.dispatch is rapid_dispatch assert ocr.deepseek_runner is deepseek_runner - assert ocr.rapidocr_dispatch is rapid_dispatch assert hasattr(deepseek_runner, "run_for_files") - assert hasattr(rapid_dispatch, "run_via_extract") # Utilities module always available (pure Python) from glossapi.ocr.utils import json_io as utils_json assert hasattr(utils_json, "export_docling_json") - if importlib.util.find_spec("docling") is not None: - try: - from glossapi.ocr.rapidocr import pool as rapid_pool - except ModuleNotFoundError: - pytest.skip("Docling optional dependencies not available") - else: - assert hasattr(rapid_pool, "GLOBAL_RAPID_OCR_POOL") - if importlib.util.find_spec("docling_core") is not None: try: from glossapi.ocr.math import enrich_from_docling_json, RoiEntry diff --git 
a/tests/test_openarchives_download_freeze.py b/tests/test_openarchives_download_freeze.py new file mode 100644 index 0000000..e76b24e --- /dev/null +++ b/tests/test_openarchives_download_freeze.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +import glossapi.scripts.openarchives_download_freeze as freeze_mod +from glossapi.scripts.openarchives_download_freeze import main + + +def test_download_freeze_dry_run_materializes_manifest(tmp_path: Path) -> None: + src = tmp_path / "input.parquet" + pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ).to_parquet(src, index=False) + + work_root = tmp_path / "work" + rc = main(["--input-parquet", str(src), "--work-root", str(work_root), "--dry-run"]) + assert rc == 0 + assert (work_root / "manifests" / "download_input.parquet").exists() + assert (work_root / "download_results" / "download_results.parquet").exists() + + +def test_download_freeze_uses_pdf_only_auto_mode(tmp_path: Path, monkeypatch) -> None: + src = tmp_path / "input.parquet" + pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ).to_parquet(src, index=False) + + observed = {} + + class DummyCorpus: + def __init__(self, *args, **kwargs): + observed["init"] = kwargs + + def download(self, **kwargs): + observed["download"] = kwargs + return pd.DataFrame( + [ + { + "url": "https://example.com/a.pdf", + "filename": "ABC_001.pdf", + "download_success": True, + "download_error": "", + "file_ext": "pdf", + } + ] + ) + + monkeypatch.setattr(freeze_mod, "Corpus", DummyCorpus) + + work_root = tmp_path / "work" + rc = main(["--input-parquet", str(src), "--work-root", str(work_root)]) + + assert rc == 0 + assert observed["download"]["download_mode"] == "auto" + assert observed["download"]["supported_formats"] == ["pdf"] diff --git 
a/tests/test_openarchives_download_probe.py b/tests/test_openarchives_download_probe.py new file mode 100644 index 0000000..0213438 --- /dev/null +++ b/tests/test_openarchives_download_probe.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_download_probe import _prepare_probe_frame + + +def test_prepare_probe_frame_limits_per_host_and_adds_runtime_columns() -> None: + df = pd.DataFrame( + [ + {"filename": "a.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/a.pdf"}, + {"filename": "b.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/b.pdf"}, + {"filename": "c.pdf", "pdf_url": "https://ikee.lib.auth.gr/file/c.pdf"}, + {"filename": "d.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/d.pdf"}, + {"filename": "e.pdf", "pdf_url": "https://dspace.lib.ntua.gr/file/e.pdf"}, + ] + ) + + out = _prepare_probe_frame( + df, + samples_per_host=2, + max_hosts=2, + seed=7, + ) + + counts = out.groupby("host").size().to_dict() + assert counts["ikee.lib.auth.gr"] == 2 + assert counts["dspace.lib.ntua.gr"] == 2 + assert set(out["url"]) <= set(df["pdf_url"]) + assert set(out["base_domain"]) == {"https://ikee.lib.auth.gr", "https://dspace.lib.ntua.gr"} diff --git a/tests/test_openarchives_hf_refresh.py b/tests/test_openarchives_hf_refresh.py new file mode 100644 index 0000000..81f015e --- /dev/null +++ b/tests/test_openarchives_hf_refresh.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_hf_refresh import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + cctx = zstd.ZstdCompressor(level=3) + with path.open("wb") as fh: + with cctx.stream_writer(fh) as writer: + for row in rows: + writer.write((json.dumps(row, ensure_ascii=False) + "\n").encode("utf-8")) + + +def 
_read_jsonl_zst(path: Path) -> list[dict]: + dctx = zstd.ZstdDecompressor() + with path.open("rb") as fh, dctx.stream_reader(fh) as reader: + text = io.TextIOWrapper(reader, encoding="utf-8").read() + return [json.loads(line) for line in text.splitlines() if line.strip()] + + +def test_openarchives_hf_refresh_updates_pipeline_metadata_and_readme(tmp_path: Path) -> None: + dataset_root = tmp_path / "openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {"filename": "AAA_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 1.0}, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "source_metadata": {"filename": "BBB_000.pdf"}, + "pipeline_metadata": {"needs_ocr": False, "greek_badness_score": 2.0}, + }, + ], + ) + (dataset_root / "README.md").write_text( + "---\npretty_name: OpenArchives.gr 191,000 docs\n---\n\n# OpenArchives.gr 191,000 docs\n\n" + "- Σύνολο markdown αρχείων: **191,301** from openarchives.gr\n" + "- Τα χαμηλής ποιότητας αρχεία που ενδέχεται να χρειάζονται OCR επεξεργασία επισημαίνονται με τη στήλη `needs_ocr`: **23,083 / 191,301 (12.07%)**\n" + "- Total markdown files: **191,301** from openarchives.gr\n" + "- Lower-quality files that may require OCR reprocessing are marked by the `needs_ocr` indicator: **23,083 / 191,301 (12.07%)**\n", + encoding="utf-8", + ) + + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + "ocr_success": False, + "greek_badness_score": 72.0, + "mojibake_badness_score": 0.2, + "latin_percentage": 33.3, + "polytonic_ratio": 0.0, + "char_count_no_comments": 1234.0, + "is_empty": False, + "filter": "ok", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + { + 
"source_doc_id": "doc-b", + "source_jsonl": str(shard_path), + "needs_ocr": False, + "ocr_success": False, + "greek_badness_score": 2.0, + "mojibake_badness_score": 0.0, + "latin_percentage": 22.0, + "polytonic_ratio": 0.0, + "char_count_no_comments": 456.0, + "is_empty": True, + "filter": "empty_text==0", + "quality_method": "refresh", + "reevaluated_at": "2026-03-31T12:00:00+00:00", + }, + ] + ).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + "--output-root", + str(out_root), + ] + ) + assert rc == 0 + + rows = _read_jsonl_zst(out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst") + assert rows[0]["pipeline_metadata"]["needs_ocr"] is True + assert rows[0]["pipeline_metadata"]["greek_badness_score"] == 72.0 + assert rows[1]["pipeline_metadata"]["is_empty"] is True + assert rows[1]["pipeline_metadata"]["filter"] == "empty_text==0" + + readme = (out_root / "README.md").read_text(encoding="utf-8") + assert "OpenArchives.gr 2 docs" in readme + assert "**1 / 2 (50.00%)**" in readme + + +def test_openarchives_hf_refresh_dry_run_does_not_write_outputs(tmp_path: Path) -> None: + dataset_root = tmp_path / "openarchives.gr" + shard_path = dataset_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + shard_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "source_metadata": {}, + "pipeline_metadata": {"needs_ocr": False}, + } + ], + ) + (dataset_root / "README.md").write_text("# OpenArchives.gr 191,000 docs\n", encoding="utf-8") + metadata = tmp_path / "filled_document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "source_jsonl": str(shard_path), + "needs_ocr": True, + } + ] + ).to_parquet(metadata, index=False) + + out_root = tmp_path / "out" + rc = main( + [ + "--dataset-root", + str(dataset_root), + "--metadata-parquet", + str(metadata), + 
"--output-root", + str(out_root), + "--dry-run", + ] + ) + assert rc == 0 + assert not (out_root / "data" / "openarchives" / "shard_001" / "chunk-000.jsonl.zst").exists() diff --git a/tests/test_openarchives_ocr_enrich.py b/tests/test_openarchives_ocr_enrich.py new file mode 100644 index 0000000..16d683a --- /dev/null +++ b/tests/test_openarchives_ocr_enrich.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pandas as pd +import zstandard as zstd + +from glossapi.scripts.openarchives_ocr_enrich import main + + +def _write_jsonl_zst(path: Path, rows: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + payload = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows).encode("utf-8") + cctx = zstd.ZstdCompressor() + path.write_bytes(cctx.compress(payload)) + + +def test_openarchives_ocr_enrich_extracts_page_counts_and_pdf_url(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + jsonl_path = raw_root / "data" / "openarchives" / "shard_01" / "chunk-000.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-a", + "filename": "AAA_000", + "text": "alpha", + "pipeline_metadata": {"page_count": 98, "pages_total": 98}, + "source_metadata": { + "pdf_links_json": "https://example.com/a.pdf", + "collection_slug": "Dione", + "language_code": "el", + }, + }, + { + "doc_id": "doc-b", + "filename": "BBB_000", + "text": "beta", + "pipeline_metadata": {"pages_total": 12}, + "source_metadata": { + "pdf_links_json": json.dumps( + [ + {"url": "https://example.com/b.pdf"}, + {"url": "https://example.com/b2.pdf"}, + ] + ), + "collection_slug": "Pandemos", + "language_code": "el", + }, + }, + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-a", + "filename": "AAA_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-b", + "filename": "BBB_000.pdf", + "source_jsonl": 
str(jsonl_path), + "needs_ocr": True, + }, + { + "source_doc_id": "doc-c", + "filename": "CCC_000.pdf", + "source_jsonl": str(jsonl_path), + "needs_ocr": False, + }, + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output).sort_values("filename").reset_index(drop=True) + assert enriched["filename"].tolist() == ["AAA_000.pdf", "BBB_000.pdf"] + assert enriched["page_count_source"].tolist() == [98, 12] + assert enriched["pages_total_source"].tolist() == [98, 12] + assert enriched["pdf_url"].tolist() == ["https://example.com/a.pdf", "https://example.com/b.pdf"] + assert enriched["source_collection_slug"].tolist() == ["Dione", "Pandemos"] + + +def test_openarchives_ocr_enrich_resolves_rewritten_source_jsonl_path(tmp_path): + raw_root = tmp_path / "raw" / "openarchives.gr" + jsonl_path = raw_root / "data" / "openarchives" / "shard_02" / "chunk-001.jsonl.zst" + _write_jsonl_zst( + jsonl_path, + [ + { + "doc_id": "doc-x", + "filename": "XXX_000", + "text": "x", + "pipeline_metadata": {"page_count": 7}, + "source_metadata": {"external_link": "https://example.com/x"}, + } + ], + ) + + parquet = tmp_path / "document_level.parquet" + pd.DataFrame( + [ + { + "source_doc_id": "doc-x", + "filename": "XXX_000.pdf", + "source_jsonl": "/home/foivos/data/glossapi_raw/hf/openarchives.gr/data/openarchives/shard_02/chunk-001.jsonl.zst", + "needs_ocr": True, + } + ] + ).to_parquet(parquet, index=False) + + output = tmp_path / "enriched.parquet" + rc = main( + [ + "--parquet", + str(parquet), + "--raw-repo-root", + str(raw_root), + "--output-parquet", + str(output), + ] + ) + assert rc == 0 + + enriched = pd.read_parquet(output) + assert int(enriched.loc[0, "page_count_source"]) == 7 + assert enriched.loc[0, "pdf_url"] == "https://example.com/x" diff --git 
a/tests/test_openarchives_ocr_run_node.py b/tests/test_openarchives_ocr_run_node.py new file mode 100644 index 0000000..0b66d52 --- /dev/null +++ b/tests/test_openarchives_ocr_run_node.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from pathlib import Path + +import pandas as pd + +from glossapi.scripts.openarchives_ocr_run_node import ( + _normalize_download_results, + _prepare_download_input, +) + + +def test_prepare_download_input_adds_url_and_filename_base() -> None: + df = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "needs_ocr": True, + } + ] + ) + out = _prepare_download_input(df) + assert out.loc[0, "url"] == "https://example.com/a.pdf" + assert out.loc[0, "filename_base"] == "ABC_001" + + +def test_normalize_download_results_preserves_shard_filename_and_metadata() -> None: + shard = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "pdf_url": "https://example.com/a.pdf", + "filename_base": "ABC_001", + "needs_ocr": True, + "source_doc_id": "doc-1", + } + ] + ) + dl = pd.DataFrame( + [ + { + "filename": "ABC_001.pdf", + "filename_base": "ABC_001", + "download_success": True, + "download_error": "", + "url": "https://example.com/a.pdf", + } + ] + ) + out = _normalize_download_results(shard_df=shard, download_results_df=dl) + assert out.loc[0, "filename"] == "ABC_001.pdf" + assert out.loc[0, "source_doc_id"] == "doc-1" + assert bool(out.loc[0, "download_success"]) is True + assert bool(out.loc[0, "needs_ocr"]) is True diff --git a/tests/test_openarchives_ocr_shards.py b/tests/test_openarchives_ocr_shards.py new file mode 100644 index 0000000..d616225 --- /dev/null +++ b/tests/test_openarchives_ocr_shards.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +import hashlib +import json +from pathlib import Path + +import pandas as pd + +from glossapi.scripts import openarchives_ocr_cutoff_shards, openarchives_ocr_merge, openarchives_ocr_shards + + +def 
test_openarchives_ocr_shards_balances_pages(tmp_path: Path) -> None: + df = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "pages_total": 100}, + {"filename": "b.pdf", "needs_ocr": True, "pages_total": 90}, + {"filename": "c.pdf", "needs_ocr": True, "pages_total": 40}, + {"filename": "d.pdf", "needs_ocr": True, "pages_total": 30}, + {"filename": "skip.pdf", "needs_ocr": False, "pages_total": 999}, + ] + ) + source = tmp_path / "download_results.parquet" + out_dir = tmp_path / "shards" + df.to_parquet(source, index=False) + + rc = openarchives_ocr_shards.main( + [ + "--parquet", + str(source), + "--output-dir", + str(out_dir), + "--nodes", + "2", + ] + ) + assert rc == 0 + + summary = json.loads((out_dir / "openarchives_ocr_shard_summary.json").read_text()) + assert summary["docs_total"] == 4 + assert summary["pages_total"] == 260 + manifests = sorted(out_dir.glob("openarchives_ocr_shard_node_*.parquet")) + assert len(manifests) == 2 + page_totals = [int(pd.read_parquet(path)["pages_total"].sum()) for path in manifests] + assert max(page_totals) - min(page_totals) <= 20 + + +def test_openarchives_ocr_merge_updates_master(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": True, "ocr_success": False}, + {"filename": "b.pdf", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + {"filename": "a.pdf", "needs_ocr": False, "ocr_success": True, "ocr_node_id": 2}, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("filename") + assert bool(merged.loc["a.pdf", "ocr_success"]) is True + assert 
bool(merged.loc["a.pdf", "needs_ocr"]) is False + assert int(merged.loc["a.pdf", "ocr_node_id"]) == 2 + assert bool(merged.loc["b.pdf", "ocr_success"]) is False + + +def test_openarchives_ocr_cutoff_shards_uses_only_available_local_pdfs(tmp_path: Path) -> None: + df = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "filename_base": "A", "needs_ocr": True, "pages_total_source": 100}, + {"source_doc_id": "doc-2", "filename": "b.html", "filename_base": "B", "needs_ocr": True, "pages_total_source": 50}, + {"source_doc_id": "doc-3", "filename": "c.html", "filename_base": "C", "needs_ocr": False, "pages_total_source": 999}, + ] + ) + source = tmp_path / "master.parquet" + downloads = tmp_path / "downloads" + downloads.mkdir() + (downloads / "A.pdf").write_bytes(b"%PDF-1.4\n") + df.to_parquet(source, index=False) + + out_dir = tmp_path / "cutoff" + rc = openarchives_ocr_cutoff_shards.main( + [ + "--parquet", + str(source), + "--output-dir", + str(out_dir), + "--local-download-root", + str(downloads), + "--nodes", + "2", + "--cutoff-id", + "cutoff-x", + ] + ) + assert rc == 0 + summary = json.loads((out_dir / "openarchives_ocr_cutoff_summary.json").read_text()) + assert summary["available_docs_total"] == 1 + assert summary["missing_docs_total"] == 1 + shard = pd.read_parquet(out_dir / "openarchives_ocr_shard_node_00.parquet") + assert shard.loc[0, "source_filename"] == "a.html" + assert shard.loc[0, "filename"] == "A.pdf" + assert shard.loc[0, "md_filename"] == "A.md" + assert bool(shard.loc[0, "available_at_cutoff"]) is True + missing = pd.read_parquet(out_dir / "openarchives_ocr_missing_at_cutoff.parquet") + assert set(missing["source_doc_id"]) == {"doc-2"} + + +def test_openarchives_ocr_merge_copies_markdown_artifacts(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "md_filename": "a.md", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": 
"doc-1", + "filename": "A.pdf", + "filename_base": "A", + "md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + (work_root / "markdown").mkdir(parents=True) + (work_root / "json" / "metrics").mkdir(parents=True) + (work_root / "markdown" / "A.md").write_text("ocr text", encoding="utf-8") + (work_root / "json" / "metrics" / "A.metrics.json").write_text("{}", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--preserve-master-columns", + "filename,md_filename", + "--artifact-work-roots", + str(work_root), + "--artifact-output-root", + str(tmp_path / "final"), + ] + ) + assert rc == 0 + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "filename"] == "a.html" + assert bool(merged.loc["doc-1", "ocr_success"]) is True + assert merged.loc["doc-1", "text"] == "ocr text" + assert merged.loc["doc-1", "ocr_markdown_relpath"] == "markdown/A.md" + assert merged.loc["doc-1", "ocr_metrics_relpath"] == "json/metrics/A.metrics.json" + assert merged.loc["doc-1", "ocr_text_sha256"] == hashlib.sha256(b"ocr text").hexdigest() + assert (tmp_path / "final" / "markdown" / "A.md").exists() + assert (tmp_path / "final" / "json" / "metrics" / "A.metrics.json").exists() + + +def test_openarchives_ocr_merge_embeds_text_without_copy_root(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": "doc-1", + "filename": "A.pdf", + "filename_base": "A", + 
"md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + (work_root / "markdown").mkdir(parents=True) + (work_root / "json" / "metrics").mkdir(parents=True) + (work_root / "markdown" / "A.md").write_text("embedded text", encoding="utf-8") + (work_root / "json" / "metrics" / "A.metrics.json").write_text("{}", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--artifact-work-roots", + str(work_root), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "text"] == "embedded text" + assert pd.isna(merged.loc["doc-1", "ocr_markdown_relpath"]) + + +def test_openarchives_ocr_merge_unifies_markdown_shards(tmp_path: Path) -> None: + master = pd.DataFrame( + [ + {"source_doc_id": "doc-1", "filename": "a.html", "md_filename": "a.md", "needs_ocr": True, "ocr_success": False}, + ] + ) + shard = pd.DataFrame( + [ + { + "source_doc_id": "doc-1", + "filename": "A.pdf", + "filename_base": "A", + "md_filename": "A.md", + "needs_ocr": False, + "ocr_success": True, + }, + ] + ) + master_path = tmp_path / "master.parquet" + shard_path = tmp_path / "shard.parquet" + out_path = tmp_path / "merged.parquet" + work_root = tmp_path / "node00" + markdown_dir = work_root / "markdown" + markdown_dir.mkdir(parents=True) + (markdown_dir / "A__p00001-00096.md").write_text("part one", encoding="utf-8") + (markdown_dir / "A__p00097-00179.md").write_text("part two\n", encoding="utf-8") + master.to_parquet(master_path, index=False) + shard.to_parquet(shard_path, index=False) + + rc = 
openarchives_ocr_merge.main( + [ + "--master-parquet", + str(master_path), + "--shard-parquets", + str(shard_path), + "--output-parquet", + str(out_path), + "--key-column", + "source_doc_id", + "--artifact-work-roots", + str(work_root), + "--artifact-output-root", + str(tmp_path / "final"), + ] + ) + assert rc == 0 + + merged = pd.read_parquet(out_path).set_index("source_doc_id") + assert merged.loc["doc-1", "text"] == "part one\npart two\n" + assert merged.loc["doc-1", "ocr_markdown_relpath"] == "markdown/A.md" + assert (tmp_path / "final" / "markdown" / "A.md").read_text(encoding="utf-8") == "part one\npart two\n" + assert (tmp_path / "final" / "sidecars" / "ocr_shards" / "markdown" / "A__p00001-00096.md").exists() + assert (tmp_path / "final" / "sidecars" / "ocr_shards" / "markdown" / "A__p00097-00179.md").exists() diff --git a/tests/test_openarchives_pdf_stage_pull.py b/tests/test_openarchives_pdf_stage_pull.py new file mode 100644 index 0000000..fbdfbed --- /dev/null +++ b/tests/test_openarchives_pdf_stage_pull.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path + +from glossapi.scripts.openarchives_pdf_stage_pull import ( + TransferItem, + TransferState, + canonicalize_pdf_name, + load_priority_filenames, + read_manifest, + run, +) + + +def _write_manifest(path: Path) -> None: + path.write_text( + "\t".join(["canonical_filename", "remote_path", "remote_size_bytes", "remote_name"]) + + "\n" + + "\t".join(["AAA_456.pdf", "/remote/AAA_456.pdf", "10", "AAA_456.pdf"]) + + "\n" + + "\t".join(["VFK_368.pdf", "/remote/VFK_368.pdf.Ac6Dc3BA", "20", "VFK_368.pdf.Ac6Dc3BA"]) + + "\n", + encoding="utf-8", + ) + + +def test_read_manifest_parses_rows(tmp_path: Path) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + + items = read_manifest(manifest) + + assert items == [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("VFK_368.pdf", 
"/remote/VFK_368.pdf.Ac6Dc3BA", 20, "VFK_368.pdf.Ac6Dc3BA"), + ] + + +def test_transfer_state_resets_stale_and_marks_completed(tmp_path: Path) -> None: + db_path = tmp_path / "state.sqlite3" + downloads = tmp_path / "downloads" + partials = tmp_path / "partials" + downloads.mkdir() + partials.mkdir() + state = TransferState(db_path) + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + + state.mark_in_progress("AAA_456.pdf", 5) + (downloads / "BBB_001.pdf").write_bytes(b"x" * 12) + + state.reset_stale_in_progress() + state.mark_completed_if_present(downloads, partials) + + cur = state.conn.execute( + "SELECT canonical_filename, status, last_seen_size_bytes, last_error FROM transfer_items ORDER BY canonical_filename" + ) + rows = cur.fetchall() + assert rows[0][0] == "AAA_456.pdf" + assert rows[0][1] == "pending" + assert "Recovered from interrupted transfer" in rows[0][3] + assert rows[1][0] == "BBB_001.pdf" + assert rows[1][1] == "completed" + assert rows[1][2] == 12 + + counts = state.counts() + assert counts["pending"] == 1 + assert counts["completed"] == 1 + state.close() + + +def test_transfer_state_next_item_respects_attempt_limit(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=25 WHERE canonical_filename='AAA_456.pdf'" + ) + state.conn.execute( + "UPDATE transfer_items SET status='failed', attempts=2 WHERE canonical_filename='BBB_001.pdf'" + ) + state.conn.commit() + + row = state.next_item(max_attempts=20) + + assert row is not None + assert row["canonical_filename"] == "BBB_001.pdf" + state.close() + + +def 
test_load_priority_filenames_supports_lists_and_suffix_forms(tmp_path: Path) -> None: + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "manual.txt").write_text( + "AAA_456.pdf\n" + "/tmp/VFK_368.pdf.Ac6Dc3BA\n" + "ignore me\n", + encoding="utf-8", + ) + (priority_dir / "BBB_001.pdf").write_text("", encoding="utf-8") + + names = load_priority_filenames(priority_dir) + + assert names == {"AAA_456.pdf", "VFK_368.pdf", "BBB_001.pdf"} + assert canonicalize_pdf_name("VFK_368.pdf.Ac6Dc3BA") == "VFK_368.pdf" + + +def test_transfer_state_priorities_are_selected_first(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + TransferItem("CCC_002.pdf", "/remote/CCC_002.pdf", 14, "CCC_002.pdf"), + ] + ) + state.set_priorities({"CCC_002.pdf"}) + + row = state.next_item(max_attempts=20) + + assert row is not None + assert row["canonical_filename"] == "CCC_002.pdf" + counts = state.priority_counts() + assert counts["priority_total"] == 1 + assert counts["priority_pending"] == 1 + state.close() + + +def test_transfer_state_priority_only_skips_non_priority(tmp_path: Path) -> None: + state = TransferState(tmp_path / "state.sqlite3") + state.sync_manifest( + [ + TransferItem("AAA_456.pdf", "/remote/AAA_456.pdf", 10, "AAA_456.pdf"), + TransferItem("BBB_001.pdf", "/remote/BBB_001.pdf", 12, "BBB_001.pdf"), + ] + ) + state.set_priorities({"BBB_001.pdf"}) + + row = state.next_item(max_attempts=20, priority_only=True) + assert row is not None + assert row["canonical_filename"] == "BBB_001.pdf" + + state.mark_in_progress("BBB_001.pdf", 0) + state.mark_completed("BBB_001.pdf", 12) + + row2 = state.next_item(max_attempts=20, priority_only=True) + assert row2 is None + state.close() + + +def test_load_priority_filenames_ignores_parquet_and_reads_csv_columns(tmp_path: 
Path) -> None: + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "unreachable_from_source_20260331.csv").write_text( + "filename,source_unreachable_reason\n" + "ZFV_051.pdf,connect_timeout\n" + "ZGA_056.pdf,connect_timeout\n", + encoding="utf-8", + ) + (priority_dir / "unreachable_from_source_20260331.parquet").write_bytes(b"PAR1junkZXY_999.pdfjunk") + + names = load_priority_filenames(priority_dir) + + assert names == {"ZFV_051.pdf", "ZGA_056.pdf"} + + +def test_run_uses_rsync_transport_when_requested(tmp_path: Path, monkeypatch) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + work_root = tmp_path / "work" + seen: list[str] = [] + + def _fake_rsync_one(**kwargs): + seen.append("rsync") + Path(kwargs["temp_path"]).parent.mkdir(parents=True, exist_ok=True) + Path(kwargs["temp_path"]).write_bytes(b"x" * 10) + return subprocess.CompletedProcess(args=["rsync"], returncode=0, stdout="", stderr="") + + def _fake_sftp_one(**kwargs): + seen.append("sftp") + return subprocess.CompletedProcess(args=["sftp"], returncode=1, stdout="", stderr="unexpected") + + monkeypatch.setenv("GREECE_BOX_PASSWORD", "secret") + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.rsync_one", _fake_rsync_one) + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.sftp_one", _fake_sftp_one) + + rc = run( + [ + "--manifest", + str(manifest), + "--work-root", + str(work_root), + "--transport", + "rsync", + "--limit", + "1", + "--summary-interval-seconds", + "0", + ] + ) + + assert rc == 0 + assert seen == ["rsync"] + assert (work_root / "downloads" / "AAA_456.pdf").exists() + + +def test_run_priority_only_ignores_non_priority_items(tmp_path: Path, monkeypatch) -> None: + manifest = tmp_path / "manifest.tsv" + _write_manifest(manifest) + work_root = tmp_path / "work" + priority_dir = tmp_path / "priority" + priority_dir.mkdir() + (priority_dir / "priority.csv").write_text("filename\nVFK_368.pdf\n", 
encoding="utf-8") + seen: list[str] = [] + + def _fake_sftp_one(**kwargs): + seen.append(Path(kwargs["remote_path"]).name) + size = 20 if "VFK_368" in kwargs["remote_path"] else 10 + Path(kwargs["temp_path"]).parent.mkdir(parents=True, exist_ok=True) + Path(kwargs["temp_path"]).write_bytes(b"x" * size) + return subprocess.CompletedProcess(args=["sftp"], returncode=0, stdout="", stderr="") + + monkeypatch.setenv("GREECE_BOX_PASSWORD", "secret") + monkeypatch.setattr("glossapi.scripts.openarchives_pdf_stage_pull.sftp_one", _fake_sftp_one) + + rc = run( + [ + "--manifest", + str(manifest), + "--work-root", + str(work_root), + "--priority-dir", + str(priority_dir), + "--priority-only", + "--summary-interval-seconds", + "0", + ] + ) + + assert rc == 0 + assert seen == ["VFK_368.pdf.Ac6Dc3BA"] + assert (work_root / "downloads" / "VFK_368.pdf").exists() + assert not (work_root / "downloads" / "AAA_456.pdf").exists() diff --git a/tests/test_phase_extract_tuning.py b/tests/test_phase_extract_tuning.py new file mode 100644 index 0000000..3b32792 --- /dev/null +++ b/tests/test_phase_extract_tuning.py @@ -0,0 +1,87 @@ +from pathlib import Path + +from glossapi.corpus.phase_extract import ( + _build_extract_work_items, + _resolve_docling_batch_target_pages, + _resolve_docling_max_batch_files, + _resolve_docling_queue_policy, +) + + +def test_resolve_docling_max_batch_files_defaults_to_conservative_batch(monkeypatch): + monkeypatch.delenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", raising=False) + assert _resolve_docling_max_batch_files() == 1 + + +def test_resolve_docling_max_batch_files_accepts_explicit_override(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "4") + assert _resolve_docling_max_batch_files() == 4 + + +def test_resolve_docling_max_batch_files_ignores_invalid_values(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "not-an-int") + assert _resolve_docling_max_batch_files() == 1 + + +def 
test_resolve_docling_batch_target_pages_defaults(monkeypatch): + monkeypatch.delenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", raising=False) + assert _resolve_docling_batch_target_pages() == 256 + + +def test_resolve_docling_batch_target_pages_accepts_override(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_BATCH_TARGET_PAGES", "384") + assert _resolve_docling_batch_target_pages() == 384 + + +def test_resolve_docling_queue_policy_uses_env_when_extractor_is_unprimed(monkeypatch): + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + assert _resolve_docling_queue_policy(None) == (2, 600) + + +def test_resolve_docling_queue_policy_prefers_extractor_values(monkeypatch): + class Extractor: + max_batch_files = 3 + long_pdf_page_threshold = 900 + + monkeypatch.setenv("GLOSSAPI_DOCLING_MAX_BATCH_FILES", "2") + assert _resolve_docling_queue_policy(Extractor()) == (3, 900) + + +def test_build_extract_work_items_packs_smaller_files_by_page_budget(): + paths = [Path("a.pdf"), Path("b.pdf"), Path("c.pdf"), Path("d.pdf")] + pages = { + "a.pdf": 140, + "b.pdf": 120, + "c.pdf": 110, + "d.pdf": 90, + } + + items = _build_extract_work_items( + paths, + max_batch_files=2, + target_batch_pages=250, + long_pdf_page_threshold=600, + page_counter=lambda path: pages[path.name], + ) + + assert [[p.name for p in item] for item in items] == [["a.pdf", "c.pdf"], ["b.pdf", "d.pdf"]] + + +def test_build_extract_work_items_keeps_long_pdf_as_standalone_work_item(): + paths = [Path("huge.pdf"), Path("small-a.pdf"), Path("small-b.pdf")] + pages = { + "huge.pdf": 1200, + "small-a.pdf": 100, + "small-b.pdf": 80, + } + + items = _build_extract_work_items( + paths, + max_batch_files=3, + target_batch_pages=250, + long_pdf_page_threshold=600, + page_counter=lambda path: pages[path.name], + ) + + assert [p.name for p in items[0]] == ["huge.pdf"] + assert sorted(p.name for p in items[1]) == ["small-a.pdf", "small-b.pdf"] diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py 
index 4fe7464..f673a83 100644 --- a/tests/test_pipeline_smoke.py +++ b/tests/test_pipeline_smoke.py @@ -1,4 +1,5 @@ import os +import sys from pathlib import Path import pandas as pd @@ -7,10 +8,6 @@ pytest.importorskip("docling") pytest.importorskip("glossapi_rs_cleaner") -pytest.importorskip( - "onnxruntime", reason="RapidOCR/DeepSeek end-to-end tests require onnxruntime" -) -import onnxruntime as ort # noqa: E402 from glossapi import Corpus from glossapi.corpus import _resolve_skiplist_path @@ -106,11 +103,8 @@ def _assert_dir_contents( pytest.fail(f"Unexpected file {entry} in {root}") -@pytest.mark.rapidocr -def test_pipeline_smoke_and_artifacts(tmp_path): +def test_pipeline_smoke_and_artifacts(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for pipeline smoke test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -132,7 +126,6 @@ def test_pipeline_smoke_and_artifacts(tmp_path): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -146,6 +139,21 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert bool(needs.get("blank.pdf")), "Blank PDF should be flagged for OCR" assert not bool(needs.get("text.pdf")) + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + (markdown_dir / f"{stem}.md").write_text("[[Blank page]]\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: {"page_count": 1} for name in files} + + 
monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( mode="ocr_bad", use_gpus="single", @@ -193,15 +201,8 @@ def test_pipeline_smoke_and_artifacts(tmp_path): assert sections_file.exists() -@pytest.mark.rapidocr def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - - assert torch.cuda.is_available(), "CUDA GPU expected for docling pipeline test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -242,7 +243,6 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -256,6 +256,25 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert bool(greek_row["needs_ocr"]), "Greek consonant doc should require OCR rerun" assert "non_greek_text" in str(greek_row.get("filter", "")), "Filter should record non-Greek text" + from glossapi.ocr.deepseek import runner as deepseek_runner + + def fake_run_for_files(self_ref, files, **kwargs): + markdown_dir = self_ref.output_dir / "markdown" + metrics_dir = self_ref.output_dir / "json" / "metrics" + markdown_dir.mkdir(parents=True, exist_ok=True) + metrics_dir.mkdir(parents=True, exist_ok=True) + for name in files: + stem = Path(name).stem + if stem == "greek_consonants": + text = documents["greek_consonants"] + else: + text = documents.get(stem) or "[[Blank page]]" + (markdown_dir / f"{stem}.md").write_text(f"{text}\n", encoding="utf-8") + (metrics_dir / f"{stem}.metrics.json").write_text("{\n \"page_count\": 1\n}\n", encoding="utf-8") + return {Path(name).stem: 
{"page_count": 1} for name in files} + + monkeypatch.setattr(deepseek_runner, "run_for_files", fake_run_for_files) + corpus.ocr( fix_bad=True, math_enhance=True, @@ -268,6 +287,15 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not bool(greek_after["needs_ocr"]), "Greek consonant doc should be resolved after OCR rerun" assert bool(greek_after.get("ocr_success", False)), "OCR rerun should mark greek consonant doc as success" + corpus.ocr( + backend="deepseek", + fix_bad=False, + math_enhance=True, + mode="math_only", + use_gpus="single", + devices=[device_idx], + ) + json_dir = corpus_dir / "json" assert json_dir.exists(), "Docling JSON directory should exist after extraction" for stem in documents: @@ -304,11 +332,8 @@ def test_docling_math_pipeline_with_mixed_pdfs(tmp_path, monkeypatch): assert not skiplist_path.read_text(encoding="utf-8").strip(), "Fatal skip-list should remain empty" -@pytest.mark.rapidocr def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): assert torch.cuda.is_available(), "CUDA GPU expected for OCR recovery test" - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" device_idx = 0 if torch.cuda.device_count() > 1: @@ -351,7 +376,6 @@ def test_clean_skips_files_with_successful_ocr(tmp_path, monkeypatch): accel_type="CUDA", num_threads=1, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) @@ -384,8 +408,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): script = Path( os.environ.get( - "GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", - Path.cwd() / "deepseek-ocr" / "run_pdf_ocr_vllm.py", + "GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", + Path.cwd() / "src" / "glossapi" / "ocr" / "deepseek" / "run_pdf_ocr_transformers.py", ) ) if not script.exists(): @@ -393,8 +417,8 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): python_bin = Path( os.environ.get( 
- "GLOSSAPI_DEEPSEEK_TEST_PYTHON", - Path("/mnt/data/glossAPI/deepseek_venv/bin/python"), + "GLOSSAPI_DEEPSEEK_PYTHON", + os.environ.get("GLOSSAPI_DEEPSEEK_TEST_PYTHON", sys.executable), ) ) if not python_bin.exists(): @@ -409,29 +433,17 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): if not model_dir.exists(): pytest.skip(f"DeepSeek model directory missing: {model_dir}") - lib_path = os.environ.get("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH") - if not lib_path: - candidate = Path.cwd() / "deepseek-ocr" / "libjpeg-turbo" / "lib" - if candidate.exists(): - lib_path = str(candidate) - if not lib_path or not Path(lib_path).exists(): - pytest.skip("Set GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH to the libjpeg-turbo library directory") - - providers = ort.get_available_providers() - assert "CUDAExecutionProvider" in providers, f"CUDAExecutionProvider missing: {providers}" - device_idx = 0 if torch.cuda.device_count() > 1: device_idx = torch.cuda.current_device() - # Force the CLI path (no stub fallback) and point to the desired interpreter/script. + # Force the real runner path and point to the desired interpreter/script. 
monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_CLI", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_ALLOW_STUB", "0") monkeypatch.setenv("GLOSSAPI_DEEPSEEK_PYTHON", str(python_bin)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_VLLM_SCRIPT", str(script)) - monkeypatch.setenv("GLOSSAPI_DEEPSEEK_LD_LIBRARY_PATH", lib_path) - monkeypatch.setenv("VLLM_ALLOW_REMOTE_CODE", "1") + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_RUNNER_SCRIPT", str(script)) + monkeypatch.setenv("GLOSSAPI_DEEPSEEK_MODEL_DIR", str(model_dir)) existing_py_path = os.environ.get("PYTHONPATH", "") src_path = str(Path.cwd() / "src") if existing_py_path: @@ -439,13 +451,6 @@ def test_deepseek_cli_pipeline_with_synthetic_pdfs(tmp_path, monkeypatch): else: monkeypatch.setenv("PYTHONPATH", src_path) - import glossapi.ocr.deepseek.runner as deepseek_runner - - def _raise_if_stub(*_args, **_kwargs): - raise AssertionError("DeepSeek fallback stub should not run in CLI smoke test") - - monkeypatch.setattr(deepseek_runner, "_run_one_pdf", _raise_if_stub) - corpus_dir = tmp_path / "corpus" corpus_dir.mkdir() @@ -461,7 +466,6 @@ def _raise_if_stub(*_args, **_kwargs): num_threads=1, emit_formula_index=True, phase1_backend="docling", - force_ocr=True, use_gpus="single", devices=[device_idx], ) diff --git a/tests/test_rapidocr_patch.py b/tests/test_rapidocr_patch.py deleted file mode 100644 index 93a8ca5..0000000 --- a/tests/test_rapidocr_patch.py +++ /dev/null @@ -1,368 +0,0 @@ -import importlib -import sys -import types -from pathlib import Path -from types import SimpleNamespace - -import numpy as np -import pytest - - -def _clear_modules(prefix: str) -> None: - for name in list(sys.modules): - if name == prefix or name.startswith(f"{prefix}."): - sys.modules.pop(name, None) - - -def _install_docling_stub(*, supports_injection: bool) -> None: - _clear_modules("docling") - _clear_modules("docling_core") - 
_clear_modules("glossapi") - - def register(name: str) -> types.ModuleType: - module = types.ModuleType(name) - sys.modules[name] = module - return module - - docling = register("docling") - register("docling.backend") - register("docling.backend.docling_parse_backend").DoclingParseDocumentBackend = object - register("docling.backend.docling_parse_v2_backend").DoclingParseV2DocumentBackend = object - register("docling.backend.pypdfium2_backend").PyPdfiumDocumentBackend = object - - base_models = register("docling.datamodel.base_models") - - class InputFormat: - PDF = "pdf" - DOCX = "docx" - XML_JATS = "xml" - HTML = "html" - PPTX = "pptx" - CSV = "csv" - MD = "md" - - class ConversionStatus: - SUCCESS = "success" - PARTIAL_SUCCESS = "partial" - - class Page: - def __init__(self): - self._backend = types.SimpleNamespace( - is_valid=lambda: True, - get_page_image=lambda *args, **kwargs: types.SimpleNamespace() - ) - - base_models.InputFormat = InputFormat - base_models.ConversionStatus = ConversionStatus - base_models.Page = Page - - pipeline_opts = register("docling.datamodel.pipeline_options") - - class AcceleratorDevice: - AUTO = "auto" - CUDA = "cuda" - MPS = "mps" - CPU = "cpu" - - class AcceleratorOptions: - def __init__(self, num_threads=None, device=None): - self.num_threads = num_threads - self.device = device - - class PdfPipelineOptions: - def __init__(self, **_kwargs): - self.ocr_options = None - self.do_ocr = False - - class RapidOcrOptions: - def __init__(self, **kwargs): - for key, value in kwargs.items(): - setattr(self, key, value) - self.rec_keys_path = None - - class OcrOptions: - pass - - class LayoutOptions: - pass - - class TableStructureOptions: - def __init__(self, mode=None): - self.mode = mode - self.do_cell_matching = False - - class TableFormerMode: - ACCURATE = "accurate" - - class PictureDescriptionApiOptions: - pass - - pipeline_opts.AcceleratorDevice = AcceleratorDevice - pipeline_opts.AcceleratorOptions = AcceleratorOptions - 
pipeline_opts.PdfPipelineOptions = PdfPipelineOptions - pipeline_opts.RapidOcrOptions = RapidOcrOptions - pipeline_opts.OcrOptions = OcrOptions - pipeline_opts.LayoutOptions = LayoutOptions - pipeline_opts.TableStructureOptions = TableStructureOptions - pipeline_opts.TableFormerMode = TableFormerMode - pipeline_opts.PictureDescriptionApiOptions = PictureDescriptionApiOptions - - register("docling.datamodel.document").ConversionResult = object - - settings_mod = register("docling.datamodel.settings") - - class _Debug: - def __init__(self): - self.profile_pipeline_timings = False - self.visualize_ocr = False - - class _Settings: - def __init__(self): - self.debug = _Debug() - - settings_mod.settings = _Settings() - - converter_mod = register("docling.document_converter") - - class DocumentConverter: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - class PdfFormatOption: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - converter_mod.DocumentConverter = DocumentConverter - converter_mod.PdfFormatOption = PdfFormatOption - converter_mod.WordFormatOption = object - converter_mod.HTMLFormatOption = object - converter_mod.XMLJatsFormatOption = object - converter_mod.PowerpointFormatOption = object - converter_mod.MarkdownFormatOption = object - converter_mod.CsvFormatOption = object - - register("docling.pipeline.simple_pipeline").SimplePipeline = object - - pipelines_mod = register("docling.pipelines.standard_pdf_pipeline") - pipeline_mod = register("docling.pipeline.standard_pdf_pipeline") - - if supports_injection: - class StandardPdfPipeline: - def __init__(self, opts, ocr_model=None, **_): - self.opts = opts - self.ocr_model = ocr_model - else: - class StandardPdfPipeline: - def __init__(self, opts, **_): - self.opts = opts - - pipelines_mod.StandardPdfPipeline = StandardPdfPipeline - pipeline_mod.StandardPdfPipeline = StandardPdfPipeline - - rapid_module = 
register("docling.models.rapid_ocr_model") - - class DummyReader: - def __call__(self, *_args, **_kwargs): - return [] - - class RapidOcrModel: - def __init__(self, enabled, artifacts_path, options, accelerator_options): - self.enabled = enabled - self.reader = DummyReader() - self.options = options - - def get_ocr_rects(self, _page): - return [] - - def post_process_cells(self, _cells, _page): - pass - - class TextCell: - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - class _Log: - @staticmethod - def warning(_msg, *args, **kwargs): - return None - - rapid_module.RapidOcrModel = RapidOcrModel - rapid_module.TextCell = TextCell - rapid_module._log = _Log() - - utils_mod = register("docling.utils") - profiling_mod = register("docling.utils.profiling") - - class TimeRecorder: - def __init__(self, *_args, **_kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - profiling_mod.TimeRecorder = TimeRecorder - utils_mod.profiling = profiling_mod - - register("docling.models") - - core_doc = register("docling_core.types.doc") - - class BoundingBox: - @staticmethod - def from_tuple(coord, origin=None): - return SimpleNamespace(coord=coord, origin=origin) - - class CoordOrigin: - TOPLEFT = "topleft" - - core_doc.BoundingBox = BoundingBox - core_doc.CoordOrigin = CoordOrigin - - core_page = register("docling_core.types.doc.page") - - class BoundingRectangle: - @staticmethod - def from_bounding_box(box): - return box - - core_page.BoundingRectangle = BoundingRectangle - - -def _install_onnxruntime_stub(): - sys.modules['onnxruntime'] = types.SimpleNamespace( - get_available_providers=lambda: ['CUDAExecutionProvider'] - ) - - -def _make_safe_ocr() -> SimpleNamespace: - """Return an instantiated SafeRapidOcrModel with stubbed dependencies.""" - rapid_opts = sys.modules['docling.datamodel.pipeline_options'].RapidOcrOptions() - accel_opts = 
sys.modules['docling.datamodel.pipeline_options'].AcceleratorOptions(device='cuda:0') - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel - - return SafeRapidOcrModel(enabled=True, artifacts_path=None, options=rapid_opts, accelerator_options=accel_opts) - - -@pytest.fixture(autouse=True) -def _cleanup_modules(): - yield - for name in [n for n in list(sys.modules) if n.startswith('glossapi') and '_rapidocr_paths' not in n]: - if name.startswith('glossapi_rs_'): - continue - sys.modules.pop(name, None) - _clear_modules('docling') - _clear_modules('docling_core') - sys.modules.pop('onnxruntime', None) - - -def test_patch_runs_on_import(): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - rapid_module = sys.modules['docling.models.rapid_ocr_model'] - from glossapi.ocr.rapidocr.safe import SafeRapidOcrModel, patch_docling_rapidocr - - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - patch_docling_rapidocr() - assert rapid_module.RapidOcrModel is SafeRapidOcrModel - - -def test_build_rapidocr_pipeline_injects_when_supported(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - glossapi_mod = importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - captured = {} - - def fake_pool_get(device, opts, factory, expected_type): - model = factory() - assert isinstance(model, pipeline.SafeRapidOcrModel) - assert expected_type is pipeline.SafeRapidOcrModel - captured['device'] = device - captured['opts'] = opts - return SimpleNamespace() - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fake_pool_get)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - assert hasattr(engine, 
'ocr_model') - assert captured['device'] == 'cuda:0' - assert opts.do_ocr is True - - -def test_build_rapidocr_pipeline_falls_back_without_injection(monkeypatch): - _install_docling_stub(supports_injection=False) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - pipeline = importlib.reload(importlib.import_module('glossapi.ocr.rapidocr.pipeline')) - - monkeypatch.setattr( - pipeline, - 'resolve_packaged_onnx_and_keys', - lambda: SimpleNamespace(det='det', rec='rec', cls='cls', keys='keys'), - ) - - def fail_pool(*_args, **_kwargs): - raise AssertionError('Pool should not be used when injection unsupported') - - monkeypatch.setattr(pipeline, 'GLOBAL_RAPID_OCR_POOL', SimpleNamespace(get=fail_pool)) - - engine, opts = pipeline.build_rapidocr_pipeline(device='cuda:0') - converter_mod = importlib.import_module('docling.document_converter') - assert isinstance(engine, converter_mod.DocumentConverter) - assert opts.do_ocr is True - - -def test_safe_rapidocr_normalises_none(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - assert model._normalise_result(None) == [] - - -def test_safe_rapidocr_normalises_incomplete_and_valid_data(monkeypatch): - _install_docling_stub(supports_injection=True) - _install_onnxruntime_stub() - - importlib.import_module('glossapi') - model = _make_safe_ocr() - - class IncompleteResult: - boxes = None - txts = ['foo'] - scores = [0.9] - - assert model._normalise_result(IncompleteResult()) == [] - - box = np.array([ - [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]], - ]) - - class FullResult: - boxes = box - txts = ['foo'] - scores = [0.9] - - output = model._normalise_result(FullResult()) - assert output == [ - (box[0].tolist(), 'foo', 0.9) - ] diff --git a/tests/test_streaming_garbage_detector.py b/tests/test_streaming_garbage_detector.py new file mode 100644 index 0000000..0d12fdd --- /dev/null +++ 
b/tests/test_streaming_garbage_detector.py @@ -0,0 +1,83 @@ +from pathlib import Path + +import pytest + +from glossapi.ocr.utils.cleaning import StreamingGarbageDetector + + +DOWNLOAD_EXPORT = ( + Path.home() + / "Downloads" + / "deepseek_ocr_43pdfs_allpages_20260331" +) + + +def _stream_detect(text: str, *, chunk_size: int) -> tuple[bool, str | None]: + detector = StreamingGarbageDetector() + for idx in range(0, len(text), max(1, int(chunk_size))): + if detector.feed(text[idx : idx + chunk_size]): + return True, detector.triggered_reason + return False, detector.triggered_reason + + +def _load_real_markdown_garbage() -> str: + root = DOWNLOAD_EXPORT / "corrections_markdown_garbage" + if not root.exists(): + pytest.skip(f"missing local export: {root}") + for path in sorted(root.glob("*__markdown_original.md")): + text = path.read_text(encoding="utf-8", errors="ignore") + if "\uf0b7" in text or "" in text or "" in text: + return text + pytest.skip("no local symbol-garbage sample found") + + +def _load_real_empty_page_numeric_garbage() -> str: + if not DOWNLOAD_EXPORT.exists(): + pytest.skip(f"missing local export: {DOWNLOAD_EXPORT}") + preferred = DOWNLOAD_EXPORT / ( + "000008__04afb897cb954a76fe378b2ca22f2f059097876fa60a57666de75e37319e5968__p0008__markdown_original.md" + ) + candidates = [preferred] if preferred.exists() else sorted(DOWNLOAD_EXPORT.glob("*__markdown_original.md")) + for path in candidates: + text = path.read_text(encoding="utf-8", errors="ignore") + if "1. 2. 3." 
in text: + return text + pytest.skip("no local numeric-list garbage sample found") + + +@pytest.mark.parametrize("chunk_size", [1, 2, 5, 17]) +def test_streaming_detector_catches_symbol_garbage_across_chunks(chunk_size): + text = "Κανονικό κείμενο\n" + (" " * 20) + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "symbol_garbage" + + +@pytest.mark.parametrize("chunk_size", [1, 2, 4, 11]) +def test_streaming_detector_catches_numeric_list_garbage_across_chunks(chunk_size): + text = " ".join(f"{idx}." for idx in range(1, 25)) + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "numeric_list_garbage" + + +def test_streaming_detector_ignores_non_ascii_digit_glyphs(): + triggered, reason = _stream_detect("x³ y² z¹", chunk_size=1) + assert triggered is False + assert reason is None + + +@pytest.mark.parametrize("chunk_size", [1, 3, 9, 23]) +def test_streaming_detector_real_faulty_page_from_downloads(chunk_size): + text = _load_real_markdown_garbage() + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "symbol_garbage" + + +@pytest.mark.parametrize("chunk_size", [1, 3, 8, 21]) +def test_streaming_detector_real_empty_page_generation_from_downloads(chunk_size): + text = _load_real_empty_page_numeric_garbage() + triggered, reason = _stream_detect(text, chunk_size=chunk_size) + assert triggered is True + assert reason == "numeric_list_garbage"