diff --git a/.github/workflows/pii.yml b/.github/workflows/pii.yml
new file mode 100644
index 00000000..37909611
--- /dev/null
+++ b/.github/workflows/pii.yml
@@ -0,0 +1,58 @@
+name: PII / Presidio
+
+# Opt-in job for the heavy Presidio PII path: it installs presidio + spaCy, which
+# is minutes-long, so it is kept OUT of the main CI (which excludes the pii extra
+# and exercises the regex engine + fallback). It runs on manual dispatch and
+# automatically only when the redaction code, the redaction tests this job runs,
+# or this workflow changes.
+on:
+  workflow_dispatch:
+  push:
+    branches: [main]
+    paths:
+      - "src/pyfly/logging/redaction/**"
+      - "src/pyfly/config/properties/logging.py"
+      - "tests/logging/test_redaction_presidio.py"
+      - "tests/logging/test_redaction_engine.py"
+      - "tests/logging/test_redaction_patterns.py"
+      - ".github/workflows/pii.yml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "src/pyfly/logging/redaction/**"
+      - "src/pyfly/config/properties/logging.py"
+      - "tests/logging/test_redaction_presidio.py"
+      - "tests/logging/test_redaction_engine.py"
+      - "tests/logging/test_redaction_patterns.py"
+      - ".github/workflows/pii.yml"
+
+# Least privilege: the job only checks out and tests the code.
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  presidio:
+    name: Presidio redaction (end-to-end)
+    runs-on: ubuntu-latest
+    # Bound the heavy install / model download so a hung step cannot hold a
+    # runner for the 360-minute default.
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@v6
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+      # All extras (incl. pii=presidio and cli=click, which spaCy's CLI needs) + dev.
+      # This is the dedicated heavy job, so the full install is acceptable here.
+      - run: uv sync --all-extras --group dev
+      # Tiny spaCy model so the run stays fast; PresidioProperties.model selects it.
+      - run: uv run python -m spacy download en_core_web_sm
+      # Exercise the real Presidio NER path end-to-end + the engine fallback logic.
+      - run: uv run pytest tests/logging/test_redaction_presidio.py tests/logging/test_redaction_engine.py tests/logging/test_redaction_patterns.py -q
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67a3a5f4..70c211b6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
---
+## v26.06.03 (2026-06-05)
+
+### Presidio PII path — now functional + CI-covered
+
+- **`PresidioRedactor` now actually uses Presidio.** It previously passed pyfly's
+ regex-oriented entity names (`EMAIL`, `IBAN`, `PHONE`…) to Presidio, whose
+ recognizers use different names (`EMAIL_ADDRESS`, `IBAN_CODE`, `PHONE_NUMBER`…),
+ so detection found almost nothing and always fell back to regex. It now detects
+ with Presidio's **full recognizer set** (including NER for free-text **names**,
+ locations, etc.) and then runs the regex pass over the result, so token-types
+ Presidio has no recognizer for (JWT, bearer tokens, URL credentials) are still
+ masked.
+- **Configurable spaCy model** — new `pyfly.logging.redaction.presidio.model`
+ (default `en_core_web_lg`). Set a lighter model (e.g. `en_core_web_sm`) where the
+ full model is too heavy. If the model isn't installed, redaction falls back to
+ regex rather than failing.
+- **Opt-in CI job** (`.github/workflows/pii.yml`, `PII / Presidio`) installs
+ `pyfly[pii]` + a small spaCy model and exercises the Presidio NER path
+ end-to-end. It runs on manual dispatch and automatically only when the redaction
+ code changes — the main CI stays fast (it excludes the heavy `pii` extra).
+
+---
+
## v26.06.02 (2026-06-05)
### Unified logging, Spring-style configuration & PII redaction
diff --git a/README.md b/README.md
index 9e1a696e..787dd044 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
-
+
diff --git a/docs/modules/logging.md b/docs/modules/logging.md
index 3e5b5541..fb6befbf 100644
--- a/docs/modules/logging.md
+++ b/docs/modules/logging.md
@@ -199,6 +199,7 @@ message for sensitive entities before writing to any output.
| `redaction.streams.enabled` | bool | `false` | Opt-in: wrap `sys.stdout`/`sys.stderr` with the redactor |
| `redaction.presidio.languages` | list[string] | `["en"]` | Languages passed to Presidio's `AnalyzerEngine` |
| `redaction.presidio.score-threshold` | float | `0.5` | Minimum Presidio confidence score to trigger redaction |
+| `redaction.presidio.model` | string | `en_core_web_lg` | spaCy model Presidio's NLP engine loads (download it with `python -m spacy download <model>`; use a smaller model like `en_core_web_sm` for lighter footprints) |
Default entities detected by the regex engine:
@@ -217,9 +218,18 @@ pip install "pyfly[pii]"
uv add "pyfly[pii]"
```
-Then set `engine: auto` (default) or `engine: presidio`. Presidio uses
-named-entity recognition models and catches PII that regex cannot, such as
-free-text names and addresses.
+You also need a spaCy model (Presidio defaults to `en_core_web_lg`):
+
+```bash
+python -m spacy download en_core_web_lg # or en_core_web_sm for a lighter footprint
+```
+
+Then set `engine: auto` (default) or `engine: presidio`. Presidio detects with its
+full recognizer set (including named-entity recognition), catching PII that regex
+cannot — free-text **names**, locations, etc. — and pyfly then runs the regex pass
+over the result so token types Presidio has no recognizer for (JWT, bearer tokens,
+URL credentials) are still masked. If the model isn't installed, redaction falls
+back to the regex engine rather than failing.
```yaml
pyfly:
diff --git a/pyproject.toml b/pyproject.toml
index c14d1772..5c5fde86 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "pyfly"
# CalVer YY.MM.PATCH — package metadata uses PEP 440 normalized form (26.5.4);
# git tag, GitHub release and human-readable display use leading-zero form
# (v26.05.04) to match the Java/.NET/Go siblings.
-version = "26.6.2"
+version = "26.6.3"
description = "The official Python implementation of the Firefly Framework — DI, CQRS, EDA, hexagonal architecture, and more."
readme = "README.md"
license = "Apache-2.0"
diff --git a/src/pyfly/__init__.py b/src/pyfly/__init__.py
index b3348236..e914f98e 100644
--- a/src/pyfly/__init__.py
+++ b/src/pyfly/__init__.py
@@ -13,4 +13,4 @@
# limitations under the License.
"""PyFly — Enterprise Python Framework."""
-__version__ = "26.06.02"
+__version__ = "26.06.03"
diff --git a/src/pyfly/config/properties/logging.py b/src/pyfly/config/properties/logging.py
index 699d214e..70154864 100644
--- a/src/pyfly/config/properties/logging.py
+++ b/src/pyfly/config/properties/logging.py
@@ -59,6 +59,7 @@ class PresidioProperties:
languages: list[str] = field(default_factory=lambda: ["en"])
score_threshold: float = 0.5
+ model: str = "en_core_web_lg" # spaCy model Presidio's NLP engine loads
@dataclass
diff --git a/src/pyfly/logging/redaction/engine.py b/src/pyfly/logging/redaction/engine.py
index 3a88eccb..6e4614ba 100644
--- a/src/pyfly/logging/redaction/engine.py
+++ b/src/pyfly/logging/redaction/engine.py
@@ -88,32 +88,49 @@ class PresidioRedactor:
 def __init__(self, props: RedactionProperties) -> None:
 from presidio_analyzer import AnalyzerEngine # type: ignore[import-not-found, import-untyped, unused-ignore]
+ from presidio_analyzer.nlp_engine import ( # type: ignore[import-not-found, import-untyped, unused-ignore]
+ NlpEngineProvider,
+ )
 from presidio_anonymizer import ( # type: ignore[import-not-found, import-untyped, unused-ignore]
 AnonymizerEngine,
 )
- # Typed as Any so the redact() body is checked independently of presidio's
- # own type surface (its two packages even use different RecognizerResult
- # classes); the no-untyped-call ignore covers presidio's untyped __init__,
- # and unused-ignore covers the env where presidio isn't installed (Any).
- self._analyzer: Any = AnalyzerEngine() # type: ignore[no-untyped-call, unused-ignore]
+ language = (props.presidio.languages or ["en"])[0]  # only the first configured language is used; "en" if the list is empty
+ # Build the NLP engine for the configured spaCy model (default
+ # en_core_web_lg) so the model is explicit and swappable (e.g. CI uses the
+ # tiny en_core_web_sm). Typed as Any so redact() type-checks independently
+ # of presidio's own type surface; the no-untyped-call ignores cover
+ # presidio's untyped constructors, unused-ignore covers the not-installed
+ # env (where the imported names are Any).
+ nlp_engine: Any = NlpEngineProvider( # type: ignore[no-untyped-call, unused-ignore]
+ nlp_configuration={
+ "nlp_engine_name": "spacy",
+ "models": [{"lang_code": language, "model_name": props.presidio.model}],
+ }
+ ).create_engine()
+ self._analyzer: Any = AnalyzerEngine(nlp_engine=nlp_engine) # type: ignore[no-untyped-call, unused-ignore]
 self._anonymizer: Any = AnonymizerEngine() # type: ignore[no-untyped-call, unused-ignore]
- self._language = (props.presidio.languages or ["en"])[0]
+ self._language = language
 self._threshold = props.presidio.score_threshold
- self._entities = props.entities or None
+ # Regex pass runs after presidio to also mask token-types presidio has no
+ # recognizer for (JWT, bearer tokens, URL credentials).
 self._fallback = RegexRedactor(props.entities, props.mask, props.extra_patterns)
 def redact(self, text: Any) -> Any:
 if not isinstance(text, str) or not text:
 return text
+ result = text  # stays the raw input unless presidio returns findings at or above the threshold
 try:
- results = self._analyzer.analyze(text=text, language=self._language, entities=self._entities)
- results = [r for r in results if r.score >= self._threshold]
- if not results:
- return self._fallback.redact(text)
- return self._anonymizer.anonymize(text=text, analyzer_results=results).text
+ # Detect with presidio's full recognizer set (its entity names differ
+ # from the regex engine's, so we do NOT restrict to props.entities).
+ findings = self._analyzer.analyze(text=result, language=self._language)
+ findings = [r for r in findings if r.score >= self._threshold]
+ if findings:
+ result = self._anonymizer.anonymize(text=result, analyzer_results=findings).text
 except Exception: # noqa: BLE001 — never let redaction crash logging
- return self._fallback.redact(text)
+ pass  # keep whatever `result` holds; the regex pass below still runs on it
+ # Always run the regex pass too, for the token-types presidio misses.
+ return self._fallback.redact(result)
def build_redactor(props: RedactionProperties) -> Redactor | None:
diff --git a/tests/logging/test_redaction_presidio.py b/tests/logging/test_redaction_presidio.py
new file mode 100644
index 00000000..8fd13be6
--- /dev/null
+++ b/tests/logging/test_redaction_presidio.py
@@ -0,0 +1,66 @@
+# Copyright 2026 Firefly Software Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""End-to-end tests for the Presidio PII redaction path.
+
+Skipped unless the optional ``pyfly[pii]`` extra (Presidio) AND a spaCy model are
+installed — the dedicated ``PII / Presidio`` CI job installs both. Locally:
+``uv pip install presidio-analyzer presidio-anonymizer && python -m spacy download
+en_core_web_sm``.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+pytest.importorskip("presidio_analyzer")
+pytest.importorskip("presidio_anonymizer")
+
+from pyfly.config.properties.logging import PresidioProperties, RedactionProperties # noqa: E402
+from pyfly.logging.redaction.engine import PresidioRedactor, build_redactor # noqa: E402
+
+_MODEL = "en_core_web_sm" # tiny model — keeps CI fast
+
+
+@pytest.fixture
+def presidio_redactor() -> PresidioRedactor:
+    """Real ``PresidioRedactor`` on the tiny model; skipped only when that model package is not installed."""
+    # Deliberately no try/except: a constructor failure must fail the run, not skip the whole suite.
+    if not pytest.importorskip("spacy.util").is_package(_MODEL):
+        pytest.skip(f"presidio spaCy model '{_MODEL}' is not installed")
+    return PresidioRedactor(RedactionProperties(engine="presidio", presidio=PresidioProperties(model=_MODEL)))
+
+
+def test_presidio_masks_person_and_email(presidio_redactor: PresidioRedactor) -> None:
+    """A free-text name (the NER value-add over fixed regex patterns) and the email are both masked."""
+    redacted = presidio_redactor.redact("My name is John Smith and my email is john.smith@acme.io")
+    for secret in ("John Smith", "john.smith@acme.io"):
+        assert secret not in redacted
+
+
+def test_presidio_plus_regex_masks_card_and_token(presidio_redactor: PresidioRedactor) -> None:
+    """Neither the card number nor the JWT may survive redaction."""
+    redacted = presidio_redactor.redact("card 4111111111111111 and jwt eyJabc123.dEf456.GhIjk789")
+    assert not any(secret in redacted for secret in ("4111111111111111", "eyJabc123.dEf456.GhIjk789"))
+
+
+def test_build_redactor_selects_presidio_when_model_present(presidio_redactor: PresidioRedactor) -> None:
+    """The fixture did not skip, so the model loaded: an explicit ``engine="presidio"``
+    must yield a PresidioRedactor rather than falling back to regex."""
+    chosen = build_redactor(RedactionProperties(engine="presidio", presidio=PresidioProperties(model=_MODEL)))
+    assert type(chosen).__name__ == "PresidioRedactor"
+
+
+def test_auto_engine_selects_presidio_when_available(presidio_redactor: PresidioRedactor) -> None:
+    chosen = build_redactor(RedactionProperties(engine="auto", presidio=PresidioProperties(model=_MODEL)))
+    assert type(chosen).__name__ == "PresidioRedactor"  # engine="auto" must resolve to presidio here
diff --git a/uv.lock b/uv.lock
index 145671f4..8e9423eb 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1967,7 +1967,7 @@ wheels = [
[[package]]
name = "pyfly"
-version = "26.6.2"
+version = "26.6.3"
source = { editable = "." }
dependencies = [
{ name = "pydantic" },