# 04 · Diagnostics & Regression Tests

Run automated checks to catch data issues, drift, and regression failures before going live.


**Checklist**
- Validate cached datasets (no NaNs/duplicates)
- Recompute headline metrics vs. stored baselines
- Run unit tests (`pytest`)
- Produce drift report for key features


In [None]:
# Add parent directory to path for module imports
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from momentum_lib import bootstrap_env, load_prices_from_data_dir

sns.set_theme(style="ticks")
bootstrap_path = PROJECT_ROOT / ".env"
bootstrap_env(bootstrap_path)
data_dir = PROJECT_ROOT / "data"

# Load prices from year-based files (prices_YYYY.csv)
prices = load_prices_from_data_dir(data_dir)

# Load features (still using single file for now)
features = pd.read_csv(data_dir / "features.csv", index_col=0)

print(features.shape, prices.shape)

# Data health checks. Each assertion carries a message so that a failure says
# which dataset and which columns are affected; the message expressions are
# only evaluated when the corresponding check fails.
assert features.notna().all().all(), (
    "features contain missing values in columns: "
    f"{features.isna().any().loc[lambda flagged: flagged].index.tolist()}"
)
assert not prices.index.duplicated().any(), (
    f"prices index has {int(prices.index.duplicated().sum())} duplicated entries"
)
assert not prices.isna().any().any(), (
    "prices contain missing values in columns: "
    f"{prices.isna().any().loc[lambda flagged: flagged].index.tolist()}"
)
print("Data health checks passed.")


In [None]:
# Visual sanity check: line plot of the last 300 rows of `prices`.
recent_prices = prices.tail(300)
fig, ax = plt.subplots(figsize=(10, 4))
sns.lineplot(data=recent_prices, ax=ax)
ax.set_title("Recent price sanity check")
plt.show()



In [None]:
# Mean and standard deviation of each described feature column, one row per
# column; the trailing expression displays the first rows in the notebook.
summary = features.describe()
feature_stats = summary.loc[['mean', 'std']].T
feature_stats.head()


In [None]:
# Pairwise plot of up to four columns whose names end in "ret_1", drawn from a
# seeded random sample of at most 500 rows of `features`.
n_rows = min(500, len(features))
sample = features.sample(n_rows, random_state=42)
ret_cols = [name for name in sample.columns if name.endswith("ret_1")]
sns.pairplot(sample[ret_cols[:4]])
plt.suptitle("Return-feature relationships", y=1.02)
plt.show()



In [None]:
# Run pytest in quiet mode (-q) through IPython's `!` shell escape.
!pytest -q
