From 0a0f7d67ec458571f0af62060f58893957be9bf0 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 23:05:30 -0400 Subject: [PATCH 1/3] fix: correct pii_screening config-key namespace typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A documentation audit found that config.py read PII_SCREENING from ``ckanext.datastore_plus.pii_screening`` — a typo: ``datastore_plus`` instead of ``datapusher_plus``. Every other DP+ setting, the ``config_declaration.yaml`` declaration, and the README all use the ``datapusher_plus`` namespace. Practical effect: operators enabling PII screening via the documented key ``ckanext.datapusher_plus.pii_screening`` had no effect — ``PII_SCREENING`` silently stayed at its ``False`` fallback, so the PII-screening feature could not be turned on as documented. Only someone who happened to set the (undocumented, typo'd) key ``ckanext.datastore_plus.pii_screening`` would have enabled it. Fix: read the documented ``ckanext.datapusher_plus.pii_screening`` key. Adds tests/test_pii_screening_config_key.py with 3 regression tests: the documented key now drives PII_SCREENING, the old typo'd key no longer does, and an AST-based drift guard that pins the config.py key string to the config_declaration.yaml declaration so a reintroduced namespace typo fails CI. Co-Authored-By: Claude Opus 4.7 (1M context) --- ckanext/datapusher_plus/config.py | 4 +- tests/test_pii_screening_config_key.py | 146 +++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 tests/test_pii_screening_config_key.py diff --git a/ckanext/datapusher_plus/config.py b/ckanext/datapusher_plus/config.py index 4f89146..f06d6ae 100644 --- a/ckanext/datapusher_plus/config.py +++ b/ckanext/datapusher_plus/config.py @@ -46,7 +46,9 @@ FORMATS = FORMATS.split() # PII screening settings -PII_SCREENING = tk.asbool(tk.config.get("ckanext.datastore_plus.pii_screening", False)) +PII_SCREENING = tk.asbool( + tk.config.get("ckanext.datapusher_plus.pii_screening", False) +) PII_FOUND_ABORT = tk.asbool( tk.config.get("ckanext.datapusher_plus.pii_found_abort", False) ) diff --git a/tests/test_pii_screening_config_key.py b/tests/test_pii_screening_config_key.py new file mode 100644 index 0000000..6d2f1f1 --- /dev/null +++ b/tests/test_pii_screening_config_key.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +""" +Regression coverage for the ``pii_screening`` config-key typo fix. + +A documentation audit (2026-05-19) found that ``config.py`` read +``PII_SCREENING`` from ``ckanext.datastore_plus.pii_screening`` — a +typo: ``datastore_plus`` instead of ``datapusher_plus``. Every other +DP+ setting, the ``config_declaration.yaml`` declaration, and the +README all use the ``datapusher_plus`` namespace. The practical +effect of the typo: operators enabling PII screening via the +documented key ``ckanext.datapusher_plus.pii_screening`` had no +effect — ``PII_SCREENING`` silently stayed at its ``False`` fallback, +so the PII-screening feature could not be turned on as documented. + +These tests pin three properties: + +1. The documented key ``ckanext.datapusher_plus.pii_screening`` + actually drives ``PII_SCREENING`` — the functional fix. +2. The old typo'd key ``ckanext.datastore_plus.pii_screening`` does + NOT drive it — proves the read moved off the wrong namespace and + would catch a regression that reintroduced the typo. +3. The key string read in ``config.py`` matches the one declared in + ``config_declaration.yaml`` — a #179-style declaration-vs-code + drift guard (AST-parsed to avoid the CKAN bootstrap). +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def _reload_config(monkeypatch, key, value): + """Set ``key`` on ``tk.config`` and reload ``config.py`` so its + import-time ``PII_SCREENING`` constant is recomputed. Mirrors the + reload pattern used by ``test_file_hash_algorithm`` / the #61 + config tests.""" + import importlib + + pytest.importorskip("ckan") + import ckan.plugins.toolkit as tk + + monkeypatch.setitem(tk.config, key, value) + import ckanext.datapusher_plus.config as conf + + return importlib.reload(conf) + + +def test_documented_key_enables_pii_screening(monkeypatch): + # The core fix: the key the README + config_declaration.yaml + # document must actually flip PII_SCREENING on. + conf = _reload_config( + monkeypatch, "ckanext.datapusher_plus.pii_screening", True + ) + assert conf.PII_SCREENING is True + + +def test_old_typo_key_has_no_effect(monkeypatch): + # The pre-fix typo'd key must NOT drive PII_SCREENING anymore. + # Set ONLY the typo'd key to True, with the correct key absent, + # and PII_SCREENING must stay at its False default. A regression + # that reinstated ``datastore_plus`` would flip this to True. + pytest.importorskip("ckan") + import ckan.plugins.toolkit as tk + + # Ensure the correct key is absent so the assertion is unambiguous + # (config_declaration.yaml declares it with default false, so the + # test env may have it pre-populated). + monkeypatch.delitem( + tk.config, "ckanext.datapusher_plus.pii_screening", raising=False + ) + conf = _reload_config( + monkeypatch, "ckanext.datastore_plus.pii_screening", True + ) + assert conf.PII_SCREENING is False + + +def test_config_py_key_matches_declaration(): + """The key string read in ``config.py`` must match the one + declared in ``config_declaration.yaml``. A namespace typo in + either source is exactly the bug this test exists to catch.""" + import ast + + # PyYAML guarded the same way the #112 / #142 / #61 drift-guards + # are — degrade to a skip in dep-light CI rather than ImportError. + try: + import yaml + except ImportError: + pytest.skip("PyYAML not available") + + # config_declaration.yaml side — iterate all groups/options so the + # guard survives a future declaration restructure. + yaml_path = ( + REPO_ROOT + / "ckanext" + / "datapusher_plus" + / "config_declaration.yaml" + ) + decl = yaml.safe_load(yaml_path.read_text(encoding="utf-8")) + declared_keys = { + opt.get("key") + for group in decl.get("groups", []) + for opt in group.get("options", []) + } + assert "ckanext.datapusher_plus.pii_screening" in declared_keys, ( + "config_declaration.yaml must declare " + "ckanext.datapusher_plus.pii_screening." + ) + + # config.py side — AST-find the PII_SCREENING assignment and the + # config key string it reads. + config_py = ( + REPO_ROOT / "ckanext" / "datapusher_plus" / "config.py" + ).read_text() + tree = ast.parse(config_py) + found_key = None + for node in ast.walk(tree): + if not ( + isinstance(node, ast.Assign) + and any( + getattr(t, "id", None) == "PII_SCREENING" + for t in node.targets + ) + ): + continue + for call in ast.walk(node): + if ( + isinstance(call, ast.Call) + and getattr(call.func, "attr", None) == "get" + and call.args + and isinstance(call.args[0], ast.Constant) + ): + found_key = call.args[0].value + break + break + assert found_key == "ckanext.datapusher_plus.pii_screening", ( + f"config.py reads PII_SCREENING from {found_key!r}; must be " + "'ckanext.datapusher_plus.pii_screening' to match the " + "config_declaration.yaml declaration and the README " + "(the 'datastore_plus' typo was the documented-but-dead " + "key bug)." + ) From fd51a748f29699991bf2e0447b027c6a34c5a02a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 23:08:03 -0400 Subject: [PATCH 2/3] docs: correct documentation-audit findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the documentation audit. Addresses the doc-side findings (the pii_screening code bug is fixed separately): * **config.py — preview_rows fallback**: aligned the inline ``tk.config.get`` fallback from ``"1000"`` to ``"0"`` so it agrees with ``config_declaration.yaml`` (default ``0``). Under CKAN 2.10+ declarative config the declared default already wins, so ``0`` was the effective runtime default all along — the ``"1000"`` fallback was stale, never-reached, and contradicted the declaration. No runtime behavior change. * **README.md — stale describeGPT_api_key**: removed the ``ckanext.datapusher_plus.describeGPT_api_key`` example line. No such config key exists; v3.0 moved AI-suggestion credentials out of ckan.ini into qsv's own config (``describegpt_config_path``) / the ``OPENAI_API_KEY`` env var. The README's own v3.0 config reference table already omitted it. * **README.md — formats example**: dropped ``xlsxb`` (a typo — the extension is ``xlsb``) and ``xlsm`` from both ``formats`` example lines so they match ``config.py``'s actual FORMATS default. Note: ``format_converter.py`` *can* convert ``.xlsm`` / ``.xlsb`` (they're in ``SPREADSHEET_EXTENSIONS``), but the ``FORMATS`` gate doesn't list them, so DP+ won't pick those files up today — adding them to FORMATS is a separate feature decision, not a doc fix. * **README.md — illustrative-values note**: both ckan.ini example config blocks silently contradicted the actual defaults for several keys (preview_rows, chunk_size, dedup, auto_index_threshold, ignore_file_hash, auto_alias_unique). Added a note above each block clarifying the values are illustrative examples, not defaults, and pointing to config_declaration.yaml as authoritative. The individual stale values were left as-is because some non-defaults may be deliberate install-guide recommendations — a value-by-value reconciliation is left to a maintainer pass. * **CONFIG.md — DRUF template overrides**: added the two ``resource_form.html`` override entries (plain + scheming) that were missing from the list — DP+ ships 5 DRUF override templates, the doc listed only 3. Co-Authored-By: Claude Opus 4.7 (1M context) --- CONFIG.md | 2 ++ README.md | 11 +++++++---- ckanext/datapusher_plus/config.py | 6 +++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CONFIG.md b/CONFIG.md index e0ebc1e..d6ffd3d 100644 --- a/CONFIG.md +++ b/CONFIG.md @@ -49,7 +49,9 @@ ckanext.datapusher_plus.enable_druf = true When DRUF is enabled, the following templates are overridden: - `snippets/add_dataset.html`: Changes "Add Dataset" to redirect to resource upload - `package/snippets/package_form.html`: Modifies form stages to show "Add data" first +- `package/snippets/resource_form.html`: Modifies resource form stages for the resource-first flow - `scheming/package/snippets/package_form.html`: Modifies scheming form stages +- `scheming/package/snippets/resource_form.html`: Modifies scheming resource form stages **Requirements:** - No special CKAN version requirements diff --git a/README.md b/README.md index d288339..2dfa1bf 100644 --- a/README.md +++ b/README.md @@ -303,6 +303,8 @@ ckan config-tool /etc/ckan/default/ckan.ini "ckanext.datapusher_plus.api_token=$ 7. Add the rest of the DP+ config to your CKAN config (e.g. `/etc/ckan/default/ckan.ini`): +> **Note:** The block below is an illustrative example, not a list of defaults. Several values (e.g. `preview_rows`, `chunk_size`, `dedup`, `auto_index_threshold`, `ignore_file_hash`) differ from DP+'s actual defaults. The authoritative defaults live in [`config_declaration.yaml`](ckanext/datapusher_plus/config_declaration.yaml). Only set the keys you actually want to override. + ```ini # datapusher-plus settings ckanext.datapusher_plus.use_proxy = false @@ -310,7 +312,7 @@ ckanext.datapusher_plus.download_proxy = ckanext.datapusher_plus.ssl_verify = false # supports INFO, DEBUG, TRACE - use DEBUG or TRACE when debugging scheming Formulas ckanext.datapusher_plus.upload_log_level = INFO -ckanext.datapusher_plus.formats = csv tsv tab ssv xls xlsx xlsxb xlsm ods geojson shp qgis zip +ckanext.datapusher_plus.formats = csv tsv tab ssv xls xlsx ods geojson shp qgis zip ckanext.datapusher_plus.pii_screening = false ckanext.datapusher_plus.pii_found_abort = false ckanext.datapusher_plus.pii_regex_resource_id_or_alias = @@ -408,7 +410,9 @@ Use a DP+ extended scheming schema: scheming.dataset_schemas = ckanext.datapusher_plus:dataset-druf.yaml ``` -Configure DP+ numerous settings. See [config.py](ckanext/datapusher_plus/config.py) for details. +Configure DP+ numerous settings. See [config.py](ckanext/datapusher_plus/config.py) and [`config_declaration.yaml`](ckanext/datapusher_plus/config_declaration.yaml) for details. + +> **Note:** The block below is an illustrative example, not a list of defaults. Several values differ from DP+'s actual defaults — see [`config_declaration.yaml`](ckanext/datapusher_plus/config_declaration.yaml) for the authoritative defaults. Only set the keys you actually want to override. >```ini > ckanext.datapusher_plus.use_proxy = false @@ -416,7 +420,7 @@ Configure DP+ numerous settings. See [config.py](ckanext/datapusher_plus/config. > ckanext.datapusher_plus.ssl_verify = false > # supports INFO, DEBUG, TRACE - use DEBUG or TRACE when debugging scheming Formulas > ckanext.datapusher_plus.upload_log_level = INFO -> ckanext.datapusher_plus.formats = csv tsv tab ssv xls xlsx xlsxb xlsm ods geojson shp qgis zip +> ckanext.datapusher_plus.formats = csv tsv tab ssv xls xlsx ods geojson shp qgis zip > ckanext.datapusher_plus.pii_screening = false > ckanext.datapusher_plus.pii_found_abort = false > ckanext.datapusher_plus.pii_regex_resource_id_or_alias = @@ -454,7 +458,6 @@ Configure DP+ numerous settings. See [config.py](ckanext/datapusher_plus/config. > ckanext.datapusher_plus.jinja2_bytecode_cache_dir = /tmp/jinja2_butecode_cache > ckanext.datapusher_plus.auto_unzip_one_file = true > ckanext.datapusher_plus.api_token = ->ckanext.datapusher_plus.describeGPT_api_key = >``` > >and add this entry to your CKAN's `resource_formats.json` file. diff --git a/ckanext/datapusher_plus/config.py b/ckanext/datapusher_plus/config.py index f06d6ae..93bab8a 100644 --- a/ckanext/datapusher_plus/config.py +++ b/ckanext/datapusher_plus/config.py @@ -74,7 +74,11 @@ ) # Data processing settings -PREVIEW_ROWS = tk.asint(tk.config.get("ckanext.datapusher_plus.preview_rows", "1000")) +# Fallback aligned to the config_declaration.yaml default (0 = load all +# rows). Under CKAN 2.10+ declarative config the declared default wins, +# so 0 is already the effective runtime default; this just removes a +# stale, never-reached "1000" fallback that contradicted the declaration. +PREVIEW_ROWS = tk.asint(tk.config.get("ckanext.datapusher_plus.preview_rows", "0")) TIMEOUT = tk.asint(tk.config.get("ckanext.datapusher_plus.download_timeout", "300")) QSV_COMMAND_TIMEOUT = tk.asint( tk.config.get("ckanext.datapusher_plus.qsv_command_timeout", "1800") From 7c1f00f5bb39ced2350bc94ee613253d895717f3 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 23:33:37 -0400 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20address=20Copilot=20review=20on=20PR?= =?UTF-8?q?=20#324=20=E2=80=94=20config-reload=20test=20isolation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot flagged that ``_reload_config`` uses ``importlib.reload`` on ``ckanext.datapusher_plus.config``, which mutates the module object in place. The recomputed constants (from a monkeypatched ``tk.config``) persist in ``sys.modules`` after the test — once ``monkeypatch`` restores ``tk.config`` it does NOT re-run ``config.py`` — so a non-default config snapshot could leak into later tests and make the suite order-dependent. Fix: added a ``_reset_config_module`` autouse fixture that reloads ``config`` on teardown. As an autouse fixture it is set up before the test's ``monkeypatch`` fixture, so its finalizer runs *after* ``monkeypatch`` has restored ``tk.config`` — the teardown reload therefore recomputes the module constants from the pristine test-environment config, leaving no order dependence. Also corrected the ``_reload_config`` docstring, which had cited ``test_file_hash_algorithm`` / the #61 tests as a reload precedent — both actually read ``tk.config`` live and do NOT reload. Verified: the config-adjacent suites (``test_file_hash_algorithm``, ``test_issue_61_download_always_whitelist``) run green alongside this file, and the full suite stays at 277 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_pii_screening_config_key.py | 43 ++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/tests/test_pii_screening_config_key.py b/tests/test_pii_screening_config_key.py index 6d2f1f1..b116970 100644 --- a/tests/test_pii_screening_config_key.py +++ b/tests/test_pii_screening_config_key.py @@ -34,11 +34,48 @@ REPO_ROOT = Path(__file__).resolve().parent.parent +@pytest.fixture(autouse=True) +def _reset_config_module(): + """Reload ``ckanext.datapusher_plus.config`` after each test so a + ``_reload_config`` call here can't leak a monkeypatched config + snapshot into later tests via ``sys.modules``. + + ``importlib.reload`` mutates the module in place; without this + teardown the last reload's constants would persist for the rest + of the session, making unrelated tests order-dependent. + + As an autouse fixture this is set up before the test's + ``monkeypatch`` fixture, so its finalizer runs *after* + ``monkeypatch`` has restored ``tk.config`` — the reload therefore + recomputes the module constants from the pristine + test-environment config. + """ + yield + try: + import importlib + + import ckanext.datapusher_plus.config as conf + + importlib.reload(conf) + except Exception: + # A reload failure during teardown must not mask the actual + # test result; the next test that needs a clean config does + # its own reload via ``_reload_config`` anyway. + pass + + def _reload_config(monkeypatch, key, value): """Set ``key`` on ``tk.config`` and reload ``config.py`` so its - import-time ``PII_SCREENING`` constant is recomputed. Mirrors the - reload pattern used by ``test_file_hash_algorithm`` / the #61 - config tests.""" + import-time ``PII_SCREENING`` constant is recomputed. + + ``importlib.reload`` mutates the module object in place, so the + recomputed constants would otherwise persist past this test — + ``monkeypatch`` restores ``tk.config`` but does not re-run + ``config.py``. The ``_reset_config_module`` autouse fixture below + reloads the module once more on teardown (after ``tk.config`` is + restored) so the pristine state is back for later tests regardless + of suite order. + """ import importlib pytest.importorskip("ckan")