Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions tests/test_artifact_registry_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""
Artifact-registry coverage CI guard.

Phase 4 of the artifact-freshness-monitor arc (plan doc:
``~/Development/alpha-engine-docs/private/artifact-freshness-monitor-260527.md``).
PR 4 of the 4-PR cascade across ae-data / ae-research / ae-predictor /
ae-backtester. Mirrors the
``alpha-engine-data/tests/test_schema_contract.py`` precedent for
forcing operator attention at every new producer addition.

**What this catches.** A new ``s3.put_object(...)`` or
``s3.upload_file(...)`` site in ae-data production code that hasn't
been registered in
``alpha-engine-config/private-docs/ARTIFACT_REGISTRY.yaml`` (or
explicitly grandfathered via the registry's ``grandfathered_paths``
block). The producer chokepoint at PR time means the silent
absence-of-artifact bug class (e.g., 2026-05-17→27 pit_parity.json)
can't slip through new code without the operator first deciding
whether the artifact is load-bearing enough to warrant an SLA.

**Why per-file count rather than per-key-template extraction.**
Statically extracting key templates from arbitrary f-string
``put_object(Key=...)`` calls is fragile — keys are often
constructed from surrounding context (loop variables, function args,
config values). Per-file count is **stable across refactors** and
sufficient to force operator review at every new addition. The
operator is the source of truth for "which key does this PUT emit";
the test's job is to ensure the operator can't sleepwalk past adding
one without thinking.

**Sister coverage in the freshness-monitor itself.** This test is
the PR-time chokepoint; runtime validation lives in
``infrastructure/lambdas/freshness-monitor/index.py::load_registry``
(which parses the live registry from S3 and fails the Lambda on a
malformed entry) and in
``alpha-engine-config/scripts/validate_artifact_registry.py`` (PR-time
schema validator on the YAML SoT).
"""

from __future__ import annotations

import re
import subprocess
from pathlib import Path

import pytest


REPO_ROOT = Path(__file__).resolve().parents[1]

# Per-file PUT-site counts. Pinning enforces operator attention on
# every new producer addition. When a file gains/loses a PUT site:
# 1. Decide whether the new artifact is load-bearing.
# 2. Register it in alpha-engine-config/private-docs/ARTIFACT_REGISTRY.yaml
# (or add the prefix to grandfathered_paths with a one-line reason).
# 3. Bump the count here.
# When a file is added/removed wholesale: add/remove the entry below
# AND mirror the registry change.
#
# Captured 2026-05-27. See ``_enumerate_put_sites`` for the scan
# semantics (skips tests/, infrastructure/lambdas/, .claude/,
# rag/pipelines/ ingest-side scripts are scope-exempt — they write
# to RAG-corpus S3 not the freshness-monitored production bucket).
EXPECTED_PER_FILE_PUT_COUNTS: dict[str, int] = {
"builders/_price_cache_writeboth.py": 1,
"builders/daily_append.py": 1,
"builders/migrate_universe_feature_order.py": 1,
"builders/migrate_universe_vwap.py": 1,
"builders/prune_delisted_tickers.py": 1,
"collectors/alternative.py": 3,
"collectors/constituents.py": 2,
"collectors/daily_closes.py": 1,
"collectors/daily_closes_fred_repair.py": 1,
"collectors/fred_history.py": 1,
"collectors/fundamentals.py": 1,
"collectors/macro.py": 1,
"collectors/prices.py": 1,
"collectors/short_interest.py": 1,
"collectors/signal_returns.py": 1,
"collectors/universe_returns.py": 1,
"data/cache.py": 1,
"data/derived/analyst_revisions.py": 2,
"data/derived/news_aggregates.py": 2,
"data/snapshotter/analyst_daily.py": 2,
"features/compute.py": 1,
"features/registry.py": 1,
"features/writer.py": 1,
"lambda/handler.py": 1,
"preflight.py": 1,
"rag/pipelines/emit_manifest.py": 2,
"rag/pipelines/filing_change_detection.py": 2,
"rag/pipelines/ingest_form4.py": 2,
"weekly_collector.py": 6,
}


# Path-prefix exemptions. These directories are not part of the
# freshness-monitored production-artifact surface:
# - tests/ ........................ test code, not production producers
# - infrastructure/lambdas/ ....... Lambda code; tested per-Lambda
# (sf-telegram-notifier and the
# freshness-monitor itself emit
# observational artifacts already
# covered in their own tests)
# - .claude/ ...................... worktrees + agent-managed scratch
# - .venv/, build/ ............... local-dev only
_SCAN_EXEMPT_PREFIXES: tuple[str, ...] = (
"tests/",
"infrastructure/lambdas/",
".claude/",
".venv/",
"build/",
"scripts/_", # operator scratch scripts
)


# ── PUT-site enumeration ────────────────────────────────────────────────────


def _enumerate_put_sites() -> dict[str, int]:
"""Return a ``{relative_path: count}`` mapping of every production
file containing ``put_object`` or ``upload_file`` literal references.

Uses ``git grep`` for tracked-file discipline (matches CI-time
behavior — untracked scratch files don't pollute the count).
"""
result = subprocess.run(
[
"git", "grep", "-l", "-E",
r"(put_object|upload_file)\(",
"--", "*.py",
],
cwd=REPO_ROOT,
capture_output=True,
text=True,
check=True,
)
files = [
line for line in result.stdout.splitlines()
if line and not any(line.startswith(p) for p in _SCAN_EXEMPT_PREFIXES)
]

counts: dict[str, int] = {}
for rel in files:
path = REPO_ROOT / rel
text = path.read_text(encoding="utf-8", errors="ignore")
# Count literal call-site occurrences. We use a regex for
# ``put_object(`` and ``upload_file(`` rather than counting
# the bare words so docstring / comment mentions don't inflate.
matches = re.findall(r"\b(?:put_object|upload_file)\(", text)
counts[rel] = len(matches)
return counts


# ── Tests ───────────────────────────────────────────────────────────────────


def test_every_producer_file_is_pinned():
"""Every file containing a PUT site is enumerated in
:data:`EXPECTED_PER_FILE_PUT_COUNTS`. A new file with a PUT site
forces the operator to (a) register the artifact in
``alpha-engine-config/private-docs/ARTIFACT_REGISTRY.yaml`` or
grandfather it, then (b) add the file to the pinned set."""
actual = _enumerate_put_sites()
unpinned = sorted(set(actual.keys()) - set(EXPECTED_PER_FILE_PUT_COUNTS.keys()))
assert not unpinned, (
"New producer file(s) with S3 PUT sites detected but not pinned:\n"
+ "\n".join(f" - {f} ({actual[f]} PUT call(s))" for f in unpinned)
+ "\n\nResolution:\n"
" 1. Register the new artifact(s) in alpha-engine-config/"
"private-docs/ARTIFACT_REGISTRY.yaml (or add the prefix to "
"grandfathered_paths with a one-line reason).\n"
" 2. Add the file(s) to EXPECTED_PER_FILE_PUT_COUNTS in "
"tests/test_artifact_registry_coverage.py with the per-file count.\n"
" 3. Re-run this test. Plan doc: "
"~/Development/alpha-engine-docs/private/artifact-freshness-monitor-260527.md"
)


def test_every_pinned_file_still_exists():
"""The pinned set must stay aligned with the actual repo state.
A file removed without updating this set is a stale pin."""
actual = _enumerate_put_sites()
stale = sorted(set(EXPECTED_PER_FILE_PUT_COUNTS.keys()) - set(actual.keys()))
assert not stale, (
"Pinned file(s) no longer have PUT sites (or no longer exist):\n"
+ "\n".join(f" - {f}" for f in stale)
+ "\n\nResolution: remove the file from EXPECTED_PER_FILE_PUT_COUNTS. "
"If the artifact was retired, also retire its row in "
"alpha-engine-config/private-docs/ARTIFACT_REGISTRY.yaml."
)


def test_pinned_counts_match_actual():
"""Per-file PUT-site counts must match the pinned values. A
delta forces operator review: new PUT site needs registry entry;
removed PUT site may need registry retirement."""
actual = _enumerate_put_sites()
deltas = []
for path, expected_count in sorted(EXPECTED_PER_FILE_PUT_COUNTS.items()):
actual_count = actual.get(path, 0)
if actual_count != expected_count:
deltas.append(f" - {path}: expected={expected_count}, actual={actual_count}")
assert not deltas, (
"PUT-site count drift detected:\n"
+ "\n".join(deltas)
+ "\n\nResolution: for each delta, either (a) the PUT count changed "
"legitimately — register the new artifact in alpha-engine-config/"
"private-docs/ARTIFACT_REGISTRY.yaml (or grandfather), then bump "
"the pinned count; or (b) the change was inadvertent — revert."
)
Loading