diff --git a/.github/workflows/pii.yml b/.github/workflows/pii.yml new file mode 100644 index 00000000..37909611 --- /dev/null +++ b/.github/workflows/pii.yml @@ -0,0 +1,46 @@ +name: PII / Presidio + +# Opt-in job for the heavy Presidio PII path: it installs presidio + spaCy, which +# is minutes-long, so it is kept OUT of the main CI (which excludes the pii extra +# and exercises the regex engine + fallback). It runs on manual dispatch and +# automatically only when the redaction code or this workflow changes. +on: + workflow_dispatch: + push: + branches: [main] + paths: + - "src/pyfly/logging/redaction/**" + - "src/pyfly/config/properties/logging.py" + - "tests/logging/test_redaction_presidio.py" + - ".github/workflows/pii.yml" + pull_request: + branches: [main] + paths: + - "src/pyfly/logging/redaction/**" + - "src/pyfly/config/properties/logging.py" + - "tests/logging/test_redaction_presidio.py" + - ".github/workflows/pii.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + presidio: + name: Presidio redaction (end-to-end) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + - uses: actions/setup-python@v6 + with: + python-version: "3.13" + # All extras (incl. pii=presidio and cli=click, which spaCy's CLI needs) + dev. + # This is the dedicated heavy job, so the full install is acceptable here. + - run: uv sync --all-extras --group dev + # Tiny spaCy model so the run stays fast; PresidioProperties.model selects it. + - run: uv run python -m spacy download en_core_web_sm + # Exercise the real Presidio NER path end-to-end + the engine fallback logic. 
+ - run: uv run pytest tests/logging/test_redaction_presidio.py tests/logging/test_redaction_engine.py tests/logging/test_redaction_patterns.py -q diff --git a/CHANGELOG.md b/CHANGELOG.md index 67a3a5f4..70c211b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). --- +## v26.06.03 (2026-06-05) + +### Presidio PII path — now functional + CI-covered + +- **`PresidioRedactor` now actually uses Presidio.** It previously passed pyfly's + regex-oriented entity names (`EMAIL`, `IBAN`, `PHONE`…) to Presidio, whose + recognizers use different names (`EMAIL_ADDRESS`, `IBAN_CODE`, `PHONE_NUMBER`…), + so detection found almost nothing and always fell back to regex. It now detects + with Presidio's **full recognizer set** (including NER for free-text **names**, + locations, etc.) and then runs the regex pass over the result, so token-types + Presidio has no recognizer for (JWT, bearer tokens, URL credentials) are still + masked. +- **Configurable spaCy model** — new `pyfly.logging.redaction.presidio.model` + (default `en_core_web_lg`). Set a lighter model (e.g. `en_core_web_sm`) where the + full model is too heavy. If the model isn't installed, redaction falls back to + regex rather than failing. +- **Opt-in CI job** (`.github/workflows/pii.yml`, `PII / Presidio`) installs + `pyfly[pii]` + a small spaCy model and exercises the Presidio NER path + end-to-end. It runs on manual dispatch and automatically only when the redaction + code changes — the main CI stays fast (it excludes the heavy `pii` extra). 
+ +--- + ## v26.06.02 (2026-06-05) ### Unified logging, Spring-style configuration & PII redaction diff --git a/README.md b/README.md index 9e1a696e..787dd044 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Firefly Framework Python 3.12+ License: Apache 2.0 - Version: 26.06.02 + Version: 26.06.03 Type Checked: mypy strict Code Style: Ruff Async First diff --git a/docs/modules/logging.md b/docs/modules/logging.md index 3e5b5541..fb6befbf 100644 --- a/docs/modules/logging.md +++ b/docs/modules/logging.md @@ -199,6 +199,7 @@ message for sensitive entities before writing to any output. | `redaction.streams.enabled` | bool | `false` | Opt-in: wrap `sys.stdout`/`sys.stderr` with the redactor | | `redaction.presidio.languages` | list[string] | `["en"]` | Languages passed to Presidio's `AnalyzerEngine` | | `redaction.presidio.score-threshold` | float | `0.5` | Minimum Presidio confidence score to trigger redaction | +| `redaction.presidio.model` | string | `en_core_web_lg` | spaCy model Presidio's NLP engine loads (download it with `python -m spacy download <model>`; use a smaller model like `en_core_web_sm` for lighter footprints) | Default entities detected by the regex engine: @@ -217,9 +218,18 @@ pip install "pyfly[pii]" uv add "pyfly[pii]" ``` -Then set `engine: auto` (default) or `engine: presidio`. Presidio uses -named-entity recognition models and catches PII that regex cannot, such as -free-text names and addresses. +You also need a spaCy model (Presidio defaults to `en_core_web_lg`): + +```bash +python -m spacy download en_core_web_lg # or en_core_web_sm for a lighter footprint +``` + +Then set `engine: auto` (default) or `engine: presidio`. Presidio detects with its +full recognizer set (named-entity recognition), catching PII that regex cannot — +free-text **names**, locations, etc. — and pyfly then runs the regex pass over the +result so token-types Presidio has no recognizer for (JWT, bearer tokens, URL +credentials) are still masked. 
If the model isn't installed, redaction falls back +to the regex engine rather than failing. ```yaml pyfly: diff --git a/pyproject.toml b/pyproject.toml index c14d1772..5c5fde86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "pyfly" # CalVer YY.MM.PATCH — package metadata uses PEP 440 normalized form (26.5.4); # git tag, GitHub release and human-readable display use leading-zero form # (v26.05.04) to match the Java/.NET/Go siblings. -version = "26.6.2" +version = "26.6.3" description = "The official Python implementation of the Firefly Framework — DI, CQRS, EDA, hexagonal architecture, and more." readme = "README.md" license = "Apache-2.0" diff --git a/src/pyfly/__init__.py b/src/pyfly/__init__.py index b3348236..e914f98e 100644 --- a/src/pyfly/__init__.py +++ b/src/pyfly/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. """PyFly — Enterprise Python Framework.""" -__version__ = "26.06.02" +__version__ = "26.06.03" diff --git a/src/pyfly/config/properties/logging.py b/src/pyfly/config/properties/logging.py index 699d214e..70154864 100644 --- a/src/pyfly/config/properties/logging.py +++ b/src/pyfly/config/properties/logging.py @@ -59,6 +59,7 @@ class PresidioProperties: languages: list[str] = field(default_factory=lambda: ["en"]) score_threshold: float = 0.5 + model: str = "en_core_web_lg" # spaCy model Presidio's NLP engine loads @dataclass diff --git a/src/pyfly/logging/redaction/engine.py b/src/pyfly/logging/redaction/engine.py index 3a88eccb..6e4614ba 100644 --- a/src/pyfly/logging/redaction/engine.py +++ b/src/pyfly/logging/redaction/engine.py @@ -88,32 +88,49 @@ class PresidioRedactor: def __init__(self, props: RedactionProperties) -> None: from presidio_analyzer import AnalyzerEngine # type: ignore[import-not-found, import-untyped, unused-ignore] + from presidio_analyzer.nlp_engine import ( # type: ignore[import-not-found, import-untyped, unused-ignore] + NlpEngineProvider, + ) from presidio_anonymizer import ( # type: 
ignore[import-not-found, import-untyped, unused-ignore] AnonymizerEngine, ) - # Typed as Any so the redact() body is checked independently of presidio's - # own type surface (its two packages even use different RecognizerResult - # classes); the no-untyped-call ignore covers presidio's untyped __init__, - # and unused-ignore covers the env where presidio isn't installed (Any). - self._analyzer: Any = AnalyzerEngine() # type: ignore[no-untyped-call, unused-ignore] + language = (props.presidio.languages or ["en"])[0] + # Build the NLP engine for the configured spaCy model (default + # en_core_web_lg) so the model is explicit and swappable (e.g. CI uses the + # tiny en_core_web_sm). Typed as Any so redact() type-checks independently + # of presidio's own type surface; the no-untyped-call ignores cover + # presidio's untyped constructors, unused-ignore covers the not-installed + # env (where the imported names are Any). supported_languages must match lang_code. + nlp_engine: Any = NlpEngineProvider( # type: ignore[no-untyped-call, unused-ignore] + nlp_configuration={ + "nlp_engine_name": "spacy", + "models": [{"lang_code": language, "model_name": props.presidio.model}], + } + ).create_engine() + self._analyzer: Any = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=[language]) # type: ignore[no-untyped-call, unused-ignore] self._anonymizer: Any = AnonymizerEngine() # type: ignore[no-untyped-call, unused-ignore] - self._language = (props.presidio.languages or ["en"])[0] + self._language = language self._threshold = props.presidio.score_threshold - self._entities = props.entities or None + # Regex pass runs after presidio to also mask token-types presidio has no + # recognizer for (JWT, bearer tokens, URL credentials). 
self._fallback = RegexRedactor(props.entities, props.mask, props.extra_patterns) def redact(self, text: Any) -> Any: if not isinstance(text, str) or not text: return text + result = text try: - results = self._analyzer.analyze(text=text, language=self._language, entities=self._entities) - results = [r for r in results if r.score >= self._threshold] - if not results: - return self._fallback.redact(text) - return self._anonymizer.anonymize(text=text, analyzer_results=results).text + # Detect with presidio's full recognizer set (its entity names differ + # from the regex engine's, so we do NOT restrict to props.entities). + findings = self._analyzer.analyze(text=result, language=self._language) + findings = [r for r in findings if r.score >= self._threshold] + if findings: + result = self._anonymizer.anonymize(text=result, analyzer_results=findings).text except Exception: # noqa: BLE001 — never let redaction crash logging - return self._fallback.redact(text) + pass + # Always run the regex pass too, for the token-types presidio misses. + return self._fallback.redact(result) def build_redactor(props: RedactionProperties) -> Redactor | None: diff --git a/tests/logging/test_redaction_presidio.py b/tests/logging/test_redaction_presidio.py new file mode 100644 index 00000000..8fd13be6 --- /dev/null +++ b/tests/logging/test_redaction_presidio.py @@ -0,0 +1,66 @@ +# Copyright 2026 Firefly Software Foundation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""End-to-end tests for the Presidio PII redaction path. + +Skipped unless the optional ``pyfly[pii]`` extra (Presidio) AND a spaCy model are +installed — the dedicated ``PII / Presidio`` CI job installs both. Locally: +``uv pip install presidio-analyzer presidio-anonymizer && python -m spacy download +en_core_web_sm``. +""" + +from __future__ import annotations + +import pytest + +pytest.importorskip("presidio_analyzer") +pytest.importorskip("presidio_anonymizer") + +from pyfly.config.properties.logging import PresidioProperties, RedactionProperties # noqa: E402 +from pyfly.logging.redaction.engine import PresidioRedactor, build_redactor # noqa: E402 + +_MODEL = "en_core_web_sm" # tiny model — keeps CI fast + + +@pytest.fixture +def presidio_redactor() -> PresidioRedactor: + props = RedactionProperties(engine="presidio", presidio=PresidioProperties(model=_MODEL)) + try: + return PresidioRedactor(props) + except Exception as exc: # noqa: BLE001 — model not downloaded -> skip, don't fail + pytest.skip(f"presidio spaCy model '{_MODEL}' unavailable: {exc}") + + +def test_presidio_masks_person_and_email(presidio_redactor: PresidioRedactor) -> None: + # PERSON detection is the value-add over the regex engine (NER, no fixed pattern). + out = presidio_redactor.redact("My name is John Smith and my email is john.smith@acme.io") + assert "John Smith" not in out + assert "john.smith@acme.io" not in out + + +def test_presidio_plus_regex_masks_card_and_token(presidio_redactor: PresidioRedactor) -> None: + out = presidio_redactor.redact("card 4111111111111111 and jwt eyJabc123.dEf456.GhIjk789") + assert "4111111111111111" not in out + assert "eyJabc123.dEf456.GhIjk789" not in out + + +def test_build_redactor_selects_presidio_when_model_present(presidio_redactor: PresidioRedactor) -> None: + # Reaching here means the model loaded (the fixture didn't skip), so build_redactor + # must choose Presidio rather than falling back to regex. 
+ props = RedactionProperties(engine="presidio", presidio=PresidioProperties(model=_MODEL)) + assert isinstance(build_redactor(props), PresidioRedactor) + + +def test_auto_engine_selects_presidio_when_available(presidio_redactor: PresidioRedactor) -> None: + props = RedactionProperties(engine="auto", presidio=PresidioProperties(model=_MODEL)) + assert isinstance(build_redactor(props), PresidioRedactor) diff --git a/uv.lock b/uv.lock index 145671f4..8e9423eb 100644 --- a/uv.lock +++ b/uv.lock @@ -1967,7 +1967,7 @@ wheels = [ [[package]] name = "pyfly" -version = "26.6.2" +version = "26.6.3" source = { editable = "." } dependencies = [ { name = "pydantic" },