Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 46 additions & 9 deletions builders/daily_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ def daily_append(
date_str: str | None = None,
bucket: str = DEFAULT_BUCKET,
dry_run: bool = False,
skip_if_exists: bool = False,
) -> dict:
"""
Append today's features to ArcticDB universe.
Expand All @@ -329,6 +330,25 @@ def daily_append(
3. Compute features for the combined series
4. Extract the last row (today) and append to ArcticDB

Parameters
----------
skip_if_exists
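When True, tickers whose ``date_str`` row is already in ArcticDB
skip the feature compute and write steps (counted as ``n_skip``).
The ticker's history is still consulted first, because the skip
decision is made by checking whether the target date is in its index.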
Use for re-runs of EOD post-market (yfinance source) where today's
row is final and re-writing it is a wasteful full-series rewrite
via the backfill path. Always leave False for MorningEnrich
(polygon source) — that path must overwrite to apply polygon's
true volume-weighted VWAP over yfinance's NaN.

Background: a re-run with ``skip_if_exists=False`` enters
``_write_row_backfill_safe``'s backfill branch on every ticker
(target_ts == existing.index.max()), which calls
``lib.write(combined, prune_previous_versions=True)`` per ticker.
904 × ~1.5s = ~22 min — over the SSM 1200s timeout. The 2026-05-01
EOD SF rerun timed out exactly here after our manual recovery
run had already written today's rows.

Returns summary dict.
"""
s3 = boto3.client("s3")
Expand Down Expand Up @@ -683,16 +703,23 @@ def daily_append(
continue

# Re-running daily_append for the same date MUST overwrite the
# existing row, not skip it. universe_lib.update() is idempotent
# (same-date rows replace instead of accumulate), so there's
# nothing to guard against.
# existing row by default — universe_lib.update() is idempotent
# for same-date rows, but the 2026-04-17 polygon-label incident
# showed the path matters: when MorningEnrich's polygon refresh
# arrives, it must overwrite yfinance's NaN-VWAP row with
# polygon's true volume-weighted VWAP.
#
# Prior to 2026-04-18, a `today_ts in hist.index: skip` guard
# silently no-op'd every re-run. That masked the 2026-04-17
# label incident: after Polygon returned T-1 data stamped as T
# in the morning DailyData run, a re-run with fresh polygon
# data couldn't repair the poisoned rows. Removing the guard
# restores the idempotency guarantee that update() provides.
# ``skip_if_exists`` is the source-aware opt-out: EOD post-market
# passes True (yfinance, immutable once written), MorningEnrich
# leaves False (polygon, must overwrite). Without this, an EOD
# re-run on a day whose row already exists hits the backfill
# branch in ``_write_row_backfill_safe`` (target_ts ==
# existing.index.max()) and rewrites the full series per ticker
# — 904 × ~1.5s blew the 1200s SSM timeout on the 2026-05-01
# EOD recovery rerun.
if skip_if_exists and today_ts in hist.index:
n_skip += 1
continue

# Build today's OHLCV row
bar = closes[ticker]
Expand Down Expand Up @@ -958,6 +985,15 @@ def main():
parser.add_argument("--dry-run", action="store_true", help="Compute but skip ArcticDB writes")
parser.add_argument("--bucket", default=DEFAULT_BUCKET, help=f"S3 bucket (default: {DEFAULT_BUCKET})")
parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
parser.add_argument(
"--skip-if-exists",
action="store_true",
help=(
"Skip tickers whose target-date row is already in ArcticDB. "
"Use for EOD post-market re-runs (yfinance, immutable). Leave "
"off for MorningEnrich runs (polygon must overwrite)."
),
)

args = parser.parse_args()

Expand All @@ -971,6 +1007,7 @@ def main():
date_str=args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d"),
bucket=args.bucket,
dry_run=args.dry_run,
skip_if_exists=args.skip_if_exists,
)

if result["status"] != "ok":
Expand Down
62 changes: 34 additions & 28 deletions tests/test_daily_append_semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,37 +244,43 @@ def test_short_history_matches_stored_dtype():
)


def test_no_skip_guard_on_existing_today_row():
"""daily_append must NOT skip tickers whose history already contains today_ts.

Regression for 2026-04-18: a `if today_ts in hist.index: skip` guard
defeated the idempotency guarantee that update() provides. Symptom was
discovered during the 2026-04-17 incident recovery — the poisoned
morning run had already written T-1 data under index=T, and a re-run
with correct polygon data couldn't overwrite because every ticker
tripped the skip guard.

update() is explicitly chosen (see the comment at the update call site)
BECAUSE it replaces same-date rows. The guard was redundant at best,
actively harmful at worst. This test locks the removal so a future
well-intentioned refactor doesn't re-introduce it.
def test_no_unconditional_skip_guard_on_existing_today_row():
"""daily_append must NOT *unconditionally* skip tickers whose history
already contains today_ts.

Regression for 2026-04-18: an unconditional ``if today_ts in
hist.index: skip`` guard defeated the idempotency guarantee that
update() provides. Symptom surfaced during the 2026-04-17 incident
recovery — the poisoned morning run had written T-1 data under
index=T, and a re-run with correct polygon data couldn't overwrite
because every ticker tripped the skip guard.

The 2026-05-01 follow-up introduced an opt-in gate
(``skip_if_exists`` parameter) so EOD post-market re-runs don't
redundantly rewrite all 904 tickers via the slow lib.write backfill
path (see test_daily_append_skip_if_exists.py for that contract).
The opt-in form ``if skip_if_exists and today_ts in hist.index:``
is allowed; an unconditional ``if today_ts in hist.index:`` is not.
"""
src = _source()
# Must not have the exact skip pattern. Allow comments that document
# why the guard was removed (they reference today_ts in hist.index).
# The test looks for the executable pattern: an `if today_ts in hist.index`
# immediately followed by `n_skip += 1` in the next 2 lines.
lines = src.splitlines()
for i, line in enumerate(lines):
stripped = line.strip()
if stripped.startswith("#"):
continue # skip comments
if "today_ts in hist.index" in stripped and stripped.startswith("if "):
# Check if this is followed by `n_skip += 1 ... continue` (the
# skip pattern). If so, the guard was reintroduced.
following = "\n".join(lines[i:i+4])
assert "n_skip" not in following, (
f"Found skip-on-existing-today guard at line {i+1}. Remove it — "
"update() already handles same-date idempotency. See "
"2026-04-17 label-bug incident."
)
continue
if "today_ts in hist.index" not in stripped:
continue
if not stripped.startswith("if "):
continue
# Allow the explicit opt-in gate: caller has to pass skip_if_exists=True.
if "skip_if_exists" in stripped:
continue
# Bare ``if today_ts in hist.index:`` followed by skip is the
# forbidden pattern — the 2026-04-17 polygon-relabel bug recurs
# if a future PR reintroduces it without gating.
following = "\n".join(lines[i:i+4])
assert "n_skip" not in following, (
f"Found UNCONDITIONAL skip-on-existing-today guard at line "
f"{i+1}. Gate it behind ``skip_if_exists`` (see the 2026-05-01 "
f"design note in daily_append.py) or remove it."
)
Loading
Loading