# 04 · Diagnostics & Validation

Run automated checks to ensure data quality, determinism, and strategy consistency.

**Validation Tests**
- No NaNs or duplicates in price data
- UPRO/SPXU prices properly aligned
- Winner timestamps exist in data
- PDT counter behaves correctly
- Entry/exit timestamps are deterministic
- Backtest produces identical results on re-run (regression test)

In [None]:
# Notebook setup: make project modules importable, load third-party libraries,
# and create the shared `config` / `data_dir` objects used by the test cells.
#
# Add parent directory to path for module imports.
# NOTE(review): this relies on the current working directory being one level
# below the project root (e.g. a notebooks/ folder) — confirm how it is launched.
import sys
from pathlib import Path

# Parent of the current working directory; inserted at the front of sys.path
# (only once) so the imports further down can be resolved from there.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import time

# Presumably project-local modules found via the sys.path entry added above;
# they must be imported after that insert.
from config import get_config
from momentum_lib import run_backtest, PDTTracker

sns.set_theme(style="ticks")

# Shared by the cells below: `config` is passed to run_backtest, and
# `data_dir` (relative to the working directory) is where the price CSVs are read from.
config = get_config()
data_dir = Path("../data")

print("Modules loaded.")

In [None]:
# Test 1: load both price CSVs and assert that they contain no NaNs, no
# duplicate timestamps, and share exactly the same timestamp index.
rule = "=" * 60
print(rule)
print("TEST 1: DATA QUALITY CHECKS")
print(rule)

# Input files expected under the data directory
upro_file = data_dir / "upro_prices.csv"
spxu_file = data_dir / "spxu_prices.csv"

# Stop early, with a hint on how to generate the data, if either file is missing
if not (upro_file.exists() and spxu_file.exists()):
    print("[SKIP] Data files not found. Please run 01_feature_engineering.ipynb first.")
    print(f"       Expected files: {upro_file} and {spxu_file}")
    print(rule)
    raise FileNotFoundError("Data files not found. Run 01_feature_engineering.ipynb to generate data.")

# Read each CSV with its "timestamp" column parsed and used as the index
csv_options = {"parse_dates": ["timestamp"], "index_col": "timestamp"}
upro_df = pd.read_csv(upro_file, **csv_options)
spxu_df = pd.read_csv(spxu_file, **csv_options)

# Naive indexes are localized to New York time; aware indexes are left as-is
for frame in (upro_df, spxu_df):
    if frame.index.tz is None:
        frame.index = frame.index.tz_localize("America/New_York")

# NaN check across every column of each frame
nan_checks = (
    (upro_df, "UPRO data contains NaNs"),
    (spxu_df, "SPXU data contains NaNs"),
)
for frame, message in nan_checks:
    assert not frame.isna().any().any(), message
print("[OK] No NaNs in price data")

# Each timestamp may appear only once per frame
duplicate_checks = (
    (upro_df, "UPRO data contains duplicate timestamps"),
    (spxu_df, "SPXU data contains duplicate timestamps"),
)
for frame, message in duplicate_checks:
    assert not frame.index.duplicated().any(), message
print("[OK] No duplicate timestamps")

# Alignment: the shared timestamps must account for every row of both frames
common_index = upro_df.index.intersection(spxu_df.index)
n_common, n_upro, n_spxu = len(common_index), len(upro_df), len(spxu_df)
assert n_common == n_upro and n_upro == n_spxu, "UPRO/SPXU data not aligned"
print("[OK] UPRO/SPXU data properly aligned")

first_bar = upro_df.index.min()
last_bar = upro_df.index.max()
print(f"\nData shape: {upro_df.shape}")
print(f"Date range: {first_bar.date()} to {last_bar.date()}")
print(rule)

In [None]:
# Test 2: spot-check that the 09:30-09:35 "winner window" bars exist.
# A seeded random sample of up to five trading days is inspected; missing
# bars are reported as warnings rather than failures.
print("\n" + "=" * 60)
print("TEST 2: WINNER WINDOW TIMESTAMPS")
print("=" * 60)

# Check that 09:30-09:35 timestamps exist for each trading day
days_with_opens = upro_df[upro_df.index.time == time(9, 30)].index.date

# Sort the unique days before sampling. Set iteration order for date objects
# is not guaranteed and can change between interpreter sessions, so sampling
# straight from the set would pick different days on each run even with a
# fixed random_state.
trading_days = sorted(set(days_with_opens))
print(f"Total trading days with 09:30 bar: {len(trading_days)}")

# Sample a few days and verify 5-minute window exists
sample_days = pd.Series(trading_days).sample(min(5, len(trading_days)), random_state=42)

# 09:30 to 09:35 inclusive (six one-minute marks); same for every day
expected_times = [time(9, 30 + i) for i in range(6)]

for day in sample_days:
    day_data = upro_df[upro_df.index.date == day]
    times = set(day_data.index.time)

    # Check 09:30-09:35 exists
    missing_times = [t for t in expected_times if t not in times]

    if missing_times:
        print(f"[WARNING] {day}: Missing timestamps {missing_times}")
    else:
        print(f"[OK] {day}: All winner window timestamps present")

print("=" * 60)

In [None]:
# Test 3: exercise PDTTracker with three day trades on consecutive dates,
# then check the reported count and the can_trade decision on either side
# of the equity threshold.
rule = "=" * 60
print(f"\n{rule}")
print("TEST 3: PDT TRACKER VALIDATION")
print(rule)

# Tracker under test, capped at three trades
pdt = PDTTracker(max_trades=3)

# Reference date for the simulated trades
test_date = pd.Timestamp("2024-01-15", tz="America/New_York")

# Record one day trade on each of three consecutive calendar dates
for day_offset in range(3):
    pdt.add_day_trade(test_date + pd.Timedelta(days=day_offset), 1)

# The count queried on the last trade date should include all three
last_trade_day = test_date + pd.Timedelta(days=2)
count = pdt.get_count(last_trade_day)
assert count == 3, f"Expected 3 trades, got {count}"
print(f"[OK] PDT counter shows {count} trades in rolling window")

# Query can_trade on the following date, below and above the threshold
next_day = test_date + pd.Timedelta(days=3)

can_trade_under = pdt.can_trade(next_day, equity=20000, threshold=25000)
assert not can_trade_under, "Should not allow trading with 3 trades and equity < 25k"
print("[OK] PDT correctly blocks trading when limit reached and equity < threshold")

can_trade_over = pdt.can_trade(next_day, equity=30000, threshold=25000)
assert can_trade_over, "Should allow trading with equity >= 25k"
print("[OK] PDT correctly allows trading when equity >= threshold")

print(rule)

In [None]:
# Test 4: run the backtest twice on the same inputs and require the final
# equity, the equity curve and every recorded trade to match exactly.
print("\n" + "=" * 60)
print("TEST 4: DETERMINISM & REGRESSION TEST")
print("=" * 60)

# Run backtest twice and verify identical results
print("Running backtest #1...")
state1, equity1, trades1 = run_backtest(upro_df, spxu_df, config)

print("Running backtest #2...")
state2, equity2, trades2 = run_backtest(upro_df, spxu_df, config)

# Compare results
assert state1.equity == state2.equity, "Final equity differs between runs"
print(f"[OK] Final equity identical: ${state1.equity:,.2f}")

assert len(trades1) == len(trades2), "Trade count differs between runs"
print(f"[OK] Trade count identical: {len(trades1)}")

assert equity1.equals(equity2), "Equity curves differ between runs"
print("[OK] Equity curves identical")

# Compare the details of every trade, not just the first few, and compare
# prices exactly: a determinism check should not tolerate any drift between
# runs, matching the exact comparison used for the equity curves.
compared_fields = ("timestamp", "ticker", "shares", "price")
for i in range(len(trades1)):
    t1 = trades1.iloc[i]
    t2 = trades2.iloc[i]
    for field in compared_fields:
        assert t1[field] == t2[field], f"Trade {i} {field} mismatch"

print(f"[OK] All {len(trades1)} trades identical")
print("\n[SUCCESS] Backtest is fully deterministic")
print("=" * 60)