In [None]:
# Setup: imports, repository path bootstrap, project helpers, seeding, output directory
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Put the repository root on sys.path so the project packages below can be imported.
# The root is taken to be three directory levels above the current working
# directory, so this only works when the kernel is started from the expected folder.
# NOTE(review): presumably the notebook's own directory -- confirm against the repo layout.
repo_root = Path().resolve().parents[2]
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from modules._import_helper import safe_import_from
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

# safe_import_from is a project helper called with a dotted module path followed by
# one or more attribute names. As used here it yields a single object for one name
# and an unpackable sequence for several.
# NOTE(review): presumably needed because these package names start with a digit and
# cannot appear in a regular import statement -- confirm in modules/_import_helper.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
load_data, get_feature_columns, split_data = safe_import_from(
    '03_ml_tabular_foundations.src.data',
    'load_data', 'get_feature_columns', 'split_data'
)

# Seed once, up front, so the notebook is reproducible under Restart & Run All.
# NOTE(review): which RNGs set_seed covers is not visible here -- check mlphys_core.seeding.
set_seed(42)

# Output folder for the figure and checklist written later; the path is relative to
# the working directory and is created if it does not exist yet.
reports_dir = Path("../reports")
reports_dir.mkdir(exist_ok=True)

print("‚úÖ Setup complete")

## 1. The Importance of Proper Splitting

**Why splits matter**:
- **Train set**: Learn patterns (fit model parameters)
- **Validation set**: Tune hyperparameters, select models
- **Test set**: Final unbiased evaluation (touch ONCE!)

**Common mistakes**:
1. ❌ Training on test data
2. ❌ Using full data to compute statistics (mean, std) before splitting
3. ❌ Tuning hyperparameters on the test set
4. ❌ Not stratifying when classes are imbalanced
5. ❌ Data leakage through preprocessing

**Golden rule**: Test set is sacred. Pretend it doesn't exist until final evaluation.

In [None]:
# Read the dataset, then pull out the feature matrix and the binary target vector
df = load_data()
feature_cols = get_feature_columns(df)
X, y = df[feature_cols].values, df['is_signal'].values

# One-glance summary: dataset size and the fraction of positive (signal) labels
print(
    f"Dataset: {X.shape[0]:,} samples, {X.shape[1]} features",
    f"Class balance: {y.mean():.1%} signal",
    sep="\n",
)

## 2. Split Strategy: Train/Val/Test

**Recommended split**: 60% train / 20% val / 20% test

**Why this ratio?**:
- Train: Need enough data to learn patterns
- Val: Need enough to estimate generalization reliably (not too small)
- Test: Final evaluation, keep same size as val for fair comparison

**Stratification**: Maintain class balance across all splits.

In [None]:
# Split using module's function (proper stratified split)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y, test_size=0.2, val_size=0.2, random_state=42)

print("Split Verification:")
print(f"Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/X.shape[0]:.1%}), signal rate: {y_train.mean():.1%}")
print(f"Val:   {X_val.shape[0]:,} samples ({X_val.shape[0]/X.shape[0]:.1%}), signal rate: {y_val.mean():.1%}")
print(f"Test:  {X_test.shape[0]:,} samples ({X_test.shape[0]/X.shape[0]:.1%}), signal rate: {y_test.mean():.1%}")

# The three splits must partition the dataset: every sample ends up in exactly one of them.
assert len(X_train) + len(X_val) + len(X_test) == len(X), \
    "train/val/test sizes do not add up to the full dataset"


def _row_signatures(rows):
    """Return the set of raw-byte signatures of the rows of a 2-D numeric array."""
    # ascontiguousarray guarantees a well-defined byte layout even for strided views.
    # NOTE(review): assumes a numeric dtype; for object arrays the bytes are not meaningful.
    return {np.ascontiguousarray(row).tobytes() for row in rows}


# Verify no overlap by comparing actual row CONTENTS between splits.
# (Intersecting index ranges such as range(0, n_train) and range(n_train, n_train + n_val)
# is empty by construction and would verify nothing.)
# Caveat: genuinely duplicated feature vectors in the raw data also count as overlap.
train_rows = _row_signatures(X_train)
val_rows = _row_signatures(X_val)
test_rows = _row_signatures(X_test)
train_val_overlap = len(train_rows & val_rows)
train_test_overlap = len(train_rows & test_rows)
val_test_overlap = len(val_rows & test_rows)

if train_val_overlap == 0:
    print(f"\n‚úÖ No data overlap: train ‚à© val = {train_val_overlap}")
else:
    print(f"\nWARNING: {train_val_overlap} identical rows shared by train and val")
if train_test_overlap or val_test_overlap:
    print(f"WARNING: identical rows shared with test "
          f"(train/test = {train_test_overlap}, val/test = {val_test_overlap})")

# Stratification check: each split's signal rate should sit within one percentage
# point of the full-dataset rate.
max_rate_shift = max(abs(labels.mean() - y.mean()) for labels in (y_train, y_val, y_test))
if max_rate_shift < 0.01:
    print(f"‚úÖ Stratification preserved: all splits have ~{y.mean():.1%} signal rate")
else:
    print(f"WARNING: a split's signal rate differs from the full dataset by {max_rate_shift:.1%}")

## 3. Cross-Validation for Robust Evaluation

**Problem**: Single train/val split may be lucky or unlucky.

**Solution**: k-fold CV averages over multiple splits.

**Stratified k-Fold CV**:
1. Split data into k folds
2. For each fold i:
   - Train on k-1 folds
   - Validate on fold i
3. Average validation metrics across all folds

**When to use CV**:
- ✅ Model selection (compare architectures)
- ✅ Hyperparameter tuning
- ❌ Final test evaluation (use held-out test set)

In [None]:
# Demonstrate stratified k-fold CV
from sklearn.model_selection import cross_val_score
from scipy import stats

# Use train+val data for CV (hold out test set for final eval)
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

# Shuffled, stratified folds with a fixed seed so the fold assignment is reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The scaler is fit INSIDE each fold, on that fold's training portion only.
# Fitting it once on all of X_train_val before the loop would leak each
# validation fold's statistics into training.

print("5-Fold Stratified Cross-Validation:")
print("=" * 60)

fold_scores = []
for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val[train_idx], X_train_val[val_idx]
    y_fold_train, y_fold_val = y_train_val[train_idx], y_train_val[val_idx]

    # CORRECT: fit the scaler on the train fold only, then reuse it on the val fold
    scaler = StandardScaler()
    X_fold_train_scaled = scaler.fit_transform(X_fold_train)
    X_fold_val_scaled = scaler.transform(X_fold_val)

    # Train model
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_fold_train_scaled, y_fold_train)

    # Evaluate on the held-out fold using the positive-class probability
    y_pred = model.predict_proba(X_fold_val_scaled)[:, 1]
    auc = roc_auc_score(y_fold_val, y_pred)
    fold_scores.append(auc)

    print(f"Fold {fold+1}: AUC = {auc:.4f} (train: {len(y_fold_train):,}, val: {len(y_fold_val):,})")

# Summarise across folds.
# - std_auc is the SAMPLE standard deviation (ddof=1) of the fold scores.
# - The confidence interval is for the MEAN score, so it uses the standard error
#   (std / sqrt(k)) and a Student-t critical value with k-1 degrees of freedom;
#   with only k=5 folds the normal value 1.96 would be too narrow.
# - Folds share training data, so the scores are not independent and the interval
#   is approximate.
n_folds = len(fold_scores)
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores, ddof=1)
sem_auc = std_auc / np.sqrt(n_folds)
t_crit = stats.t.ppf(0.975, df=n_folds - 1)
print(f"\n{'='*60}")
print(f"Mean AUC: {mean_auc:.4f} ¬± {std_auc:.4f}")
print(f"95% CI: [{mean_auc - t_crit*sem_auc:.4f}, {mean_auc + t_crit*sem_auc:.4f}]")

## 4. Data Leakage: The Silent Killer

**Definition**: Using information from validation/test sets during training.

**Why it's dangerous**:
- Model sees "future" information
- Overestimates performance
- Fails in production

**Common leakage sources**:
1. **Preprocessing on full data before split** ← Most common!
2. Target encoding using global statistics
3. Feature selection using full data
4. Normalization using test set statistics

## 5. Demonstration: Leaky Pipeline (❌ WRONG)

Let's deliberately create a leaky pipeline to see the effect.

**Mistake**: Standardize using statistics from ALL data (train + val + test).

In [None]:
# WRONG ON PURPOSE: fit the scaler on ALL data (leakage!)
print("üö® LEAKY PIPELINE DEMONSTRATION")
print("=" * 60)

# Step 1: Fit scaler on ENTIRE dataset (WRONG!)
scaler_leaky = StandardScaler()
X_all_scaled = scaler_leaky.fit_transform(X)  # Using val/test data to compute mean/std!

# Step 2: Apply the leaky scaler to the SAME train/val/test rows that the correct
# pipeline uses. This is equivalent to "scale everything, then split", but it
# guarantees both pipelines are evaluated on identical samples, so any difference
# in AUC comes from the leaked scaling statistics and not from a different split.
X_train_leaky = scaler_leaky.transform(X_train)
X_val_leaky = scaler_leaky.transform(X_val)
X_test_leaky = scaler_leaky.transform(X_test)
y_train_leaky, y_val_leaky, y_test_leaky = y_train, y_val, y_test

# Step 3: Train model
model_leaky = LogisticRegression(max_iter=1000, random_state=42)
model_leaky.fit(X_train_leaky, y_train_leaky)

# Step 4: Evaluate (positive-class probability -> AUC) on every split
y_pred_train_leaky = model_leaky.predict_proba(X_train_leaky)[:, 1]
y_pred_val_leaky = model_leaky.predict_proba(X_val_leaky)[:, 1]
y_pred_test_leaky = model_leaky.predict_proba(X_test_leaky)[:, 1]

auc_train_leaky = roc_auc_score(y_train_leaky, y_pred_train_leaky)
auc_val_leaky = roc_auc_score(y_val_leaky, y_pred_val_leaky)
auc_test_leaky = roc_auc_score(y_test_leaky, y_pred_test_leaky)

print("‚ùå LEAKY RESULTS:")
print(f"  Train AUC: {auc_train_leaky:.4f}")
print(f"  Val AUC:   {auc_val_leaky:.4f}")
print(f"  Test AUC:  {auc_test_leaky:.4f}")
print("\n‚ö†Ô∏è Problem: Val/test were used to compute scaling statistics!")
print("   ‚Üí Model has seen test data indirectly through mean/std")

## 6. Fixed Pipeline: sklearn Pipelines (✅ CORRECT)

**Solution**: Use `sklearn.pipeline.Pipeline` to encapsulate preprocessing.

**Why Pipelines prevent leakage**:
- `.fit()` on train data only
- `.transform()` applied consistently to val/test
- No manual bookkeeping of scalers

In [None]:
# Leak-free reference: scaler and classifier bundled in one Pipeline
print("‚úÖ CORRECT PIPELINE (NO LEAKAGE)")
print("=" * 60)

# Bundle scaling and classification so that fitting learns both from the same data
pipeline_correct = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Only the training split is ever passed to fit()
pipeline_correct.fit(X_train, y_train)

# Positive-class probabilities for each split; val/test are scaled with train statistics
y_pred_train_correct, y_pred_val_correct, y_pred_test_correct = (
    pipeline_correct.predict_proba(features)[:, 1]
    for features in (X_train, X_val, X_test)
)

# AUC per split
auc_train_correct, auc_val_correct, auc_test_correct = (
    roc_auc_score(labels, scores)
    for labels, scores in (
        (y_train, y_pred_train_correct),
        (y_val, y_pred_val_correct),
        (y_test, y_pred_test_correct),
    )
)

print("‚úÖ CORRECT RESULTS:")
print(f"  Train AUC: {auc_train_correct:.4f}")
print(f"  Val AUC:   {auc_val_correct:.4f}")
print(f"  Test AUC:  {auc_test_correct:.4f}")
print("\n‚úÖ Scaler fit on train data only")
print("‚úÖ Val/test transformed using train statistics")

## 7. Comparison: Leaky vs. Correct

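**Key observation**: A leaky pipeline tends to report **optimistically biased** performance. With plain standardization on a large dataset the bias is often tiny (it can even be lost in split-to-split noise); it becomes much larger for data-dependent steps such as feature selection or target encoding.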

In [None]:
# Compare results: one row per pipeline, plus how much the leaky one exceeds the correct one
comparison = pd.DataFrame({
    'Pipeline': ['Leaky ‚ùå', 'Correct ‚úÖ'],
    'Train AUC': [auc_train_leaky, auc_train_correct],
    'Val AUC': [auc_val_leaky, auc_val_correct],
    'Test AUC': [auc_test_leaky, auc_test_correct],
    'Val Overestimate': [auc_val_leaky - auc_val_correct, 0],
    'Test Overestimate': [auc_test_leaky - auc_test_correct, 0]
})

print("\nüìä Leaky vs. Correct Pipeline Comparison:")
print(comparison.to_string(index=False))

# Visualize: grouped bars (train / val / test) for each pipeline
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(2)
width = 0.25

ax.bar(x - width, [auc_train_leaky, auc_train_correct], width,
       label='Train', alpha=0.8, edgecolor='black', linewidth=1.5)
ax.bar(x, [auc_val_leaky, auc_val_correct], width,
       label='Val', alpha=0.8, edgecolor='black', linewidth=1.5)
ax.bar(x + width, [auc_test_leaky, auc_test_correct], width,
       label='Test', alpha=0.8, edgecolor='black', linewidth=1.5)

ax.set_ylabel('AUC-ROC', fontsize=12)
ax.set_title('Leaky vs. Correct Pipeline: Performance Comparison',
             fontsize=13, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(['Leaky ‚ùå', 'Correct ‚úÖ'])
ax.legend()
ax.grid(alpha=0.3, axis='y')

# Zoom the y-axis around the plotted values instead of a fixed window, so the bars
# stay visible whatever AUC the model reaches. The axis is deliberately truncated
# (differences are small); limits are clamped to the valid AUC range [0, 1].
plotted_aucs = [
    auc_train_leaky, auc_train_correct,
    auc_val_leaky, auc_val_correct,
    auc_test_leaky, auc_test_correct,
]
y_pad = max(0.01, 0.25 * (max(plotted_aucs) - min(plotted_aucs)))
ax.set_ylim([max(0.0, min(plotted_aucs) - y_pad), min(1.0, max(plotted_aucs) + y_pad)])

plt.tight_layout()
plt.savefig(reports_dir / '02_leaky_vs_correct.png', dpi=150, bbox_inches='tight')
plt.show()

# A negative number below means the leaky pipeline did NOT score higher on that split.
print(f"\n‚ö†Ô∏è Leaky pipeline overestimates:")
print(f"  Val AUC by {(auc_val_leaky - auc_val_correct)*100:.2f} percentage points")
print(f"  Test AUC by {(auc_test_leaky - auc_test_correct)*100:.2f} percentage points")
print("\nüí° Lesson: Always use Pipelines to prevent accidental leakage!")

## 8. Leakage Detection: Sanity Checks

How to catch leakage in your own pipelines:

**Red flags**:
1. Val/test AUC ≈ Train AUC (suspiciously good generalization)
2. Performance drops dramatically in production
3. Preprocessor fit on anything other than train data

In [None]:
# Sanity check implementation
def check_for_leakage(model, X_train, X_val, y_train, y_val,
                      min_gap=0.01, max_gap=0.10, max_val_auc=0.99):
    """Run heuristic sanity checks that may hint at data leakage.

    These are rules of thumb, not proofs: passing every check does not guarantee a
    leak-free workflow, and a failed check (for example a very small train-val gap
    on a simple, well-regularised model) is only a prompt to investigate.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``; if it also has
        ``named_steps`` (an sklearn Pipeline) it is checked for a step named 'scaler'.
    X_train, X_val : feature matrices for the training and validation splits.
    y_train, y_val : binary labels matching ``X_train`` / ``X_val``.
    min_gap, max_gap : the train-minus-val AUC gap counts as "reasonable" when it
        lies strictly between these two bounds.
    max_val_auc : validation AUC above this value is flagged as suspiciously high.

    Returns
    -------
    bool
        True when every check passed, False otherwise. A report is also printed.
    """
    checks = []

    # Check 1: Pipeline structure -- preprocessing should live inside the estimator
    if hasattr(model, 'named_steps'):
        scaler_in_pipeline = 'scaler' in model.named_steps
        checks.append(('Pipeline contains scaler', scaler_in_pipeline, '‚úÖ' if scaler_in_pipeline else '‚ùå'))
    else:
        checks.append(('Pipeline structure', False, '‚ùå Not using Pipeline'))

    # Check 2: Performance gap between train and validation AUC
    y_pred_train = model.predict_proba(X_train)[:, 1]
    y_pred_val = model.predict_proba(X_val)[:, 1]
    auc_train = roc_auc_score(y_train, y_pred_train)
    auc_val = roc_auc_score(y_val, y_pred_val)
    gap = auc_train - auc_val

    reasonable_gap = min_gap < gap < max_gap
    checks.append(('Train-val gap reasonable', reasonable_gap,
                   f"‚úÖ Gap={gap:.4f}" if reasonable_gap else f"‚ö†Ô∏è Gap={gap:.4f}"))

    # Check 3: Val performance not suspiciously high
    suspiciously_high = auc_val > max_val_auc
    checks.append(('Val AUC realistic', not suspiciously_high,
                   f'‚úÖ AUC < {max_val_auc}' if not suspiciously_high else f'‚ö†Ô∏è AUC={auc_val:.4f} (too good?)'))

    print("üîç Leakage Detection Checks:")
    print("=" * 60)
    for check_name, passed, message in checks:
        print(f"  {message} {check_name}")

    # Overall verdict: every individual check must have passed
    all_passed = all(passed for _, passed, _ in checks)
    return all_passed

# Run checks
print("Checking CORRECT pipeline:")
check_for_leakage(pipeline_correct, X_train, X_val, y_train, y_val)

## 9. Leakage Checklist

Save this checklist to `reports/` for reference in all future projects.

In [None]:
# Generate leakage prevention checklist (markdown text, saved to reports/ and echoed below)
leakage_checklist = """
# Data Leakage Prevention Checklist

## Before Training
- [ ] Split data BEFORE any preprocessing
- [ ] Use stratified splits for imbalanced data
- [ ] Set random_state for reproducibility
- [ ] Verify no data overlap between train/val/test

## During Preprocessing
- [ ] Use sklearn Pipelines for all preprocessing
- [ ] Fit preprocessors (scaler, imputer, encoder) on train data ONLY
- [ ] Transform val/test using train-fitted preprocessors
- [ ] Never use global statistics (mean, std) computed on full data

## Feature Engineering
- [ ] Create features using train data only
- [ ] No target-derived features (e.g., target mean encoding without CV)
- [ ] No future information (temporal leakage)
- [ ] No identifiers that proxy for target

## Model Training
- [ ] Train on train set only
- [ ] Tune hyperparameters using validation set
- [ ] Use cross-validation for robust tuning
- [ ] Never touch test set until final evaluation

## Evaluation
- [ ] Report train/val/test metrics separately
- [ ] Check for reasonable train-val gap (0.01-0.10 typical)
- [ ] Investigate if val AUC > 0.99 (suspiciously high)
- [ ] Verify performance is consistent across CV folds

## Code Review
- [ ] All preprocessing inside Pipeline
- [ ] No .fit() calls on val/test data
- [ ] No global scaling before split
- [ ] Random states fixed for reproducibility

## Production Deployment
- [ ] Save entire pipeline (not just model)
- [ ] Use pipeline.predict() / pipeline.predict_proba() on raw new data (the pipeline applies its fitted preprocessing)
- [ ] Monitor for distribution shift
- [ ] Retrain periodically with proper splits
"""

# Save checklist. The encoding is given explicitly so the file is written the same
# way on every platform regardless of the locale default.
checklist_path = reports_dir / '02_leakage_checklist.md'
checklist_path.write_text(leakage_checklist, encoding='utf-8')

print("üìã Leakage Prevention Checklist")
print("=" * 60)
print(leakage_checklist)
print(f"\n‚úÖ Checklist saved to: {reports_dir / '02_leakage_checklist.md'}")

## 10. Time-Based Splits (Bonus)

**When temporal ordering matters** (time series, transactions), use time-based splits.

**Our dataset**: Particle collisions are i.i.d. (no temporal ordering), so random stratified split is appropriate.

**For time series**:
```python
# Example (not run):
# from sklearn.model_selection import TimeSeriesSplit
# tscv = TimeSeriesSplit(n_splits=5)
# for train_idx, val_idx in tscv.split(X):
#     # Train on past, validate on future
#     X_train, X_val = X[train_idx], X[val_idx]
```

**Key difference**: Don't shuffle! Preserve temporal order.

## 11. Exercises

**Exercise 1**: Identify the leakage

Which of these pipelines has data leakage? Explain why.

```python
# Pipeline A
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Full data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)
model = LogisticRegression()
model.fit(X_train, y_train)

# Pipeline B
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])
pipeline.fit(X_train, y_train)
```

In [None]:
# Your answer here:
# 


**Exercise 2**: Implement a safe preprocessing pipeline

Create a sklearn Pipeline that:
1. Imputes missing values with median (train-fitted)
2. Standardizes features (train-fitted)
3. Trains a logistic regression model

Verify with assertions that no leakage occurs.

In [None]:
# Your implementation here:
# 


**Exercise 3**: Design a split strategy for a time-series problem

You have transaction fraud data with timestamps:
- 1M transactions over 2 years
- Goal: Predict fraud in next month
- Transactions have temporal autocorrelation (fraud waves)

Design a proper train/val/test split strategy. Consider:
1. Should you shuffle?
2. How to split temporally?
3. How to handle data drift?

In [None]:
# Your answer here:
# 


---
## Solutions

**Solution 1**: Identify the leakage

**Answer**: Pipeline A has leakage ❌

**Explanation**:
```python
# Pipeline A (LEAKY ❌)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # ← LEAKAGE HERE!
# Problem: Scaler computes mean/std using ENTIRE dataset (train + test)
# Test data influences scaling parameters → indirect information leak

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)
# Even though we split after, scaling already "saw" test data
```

**Pipeline B is correct** ✅:
```python
# Pipeline B (CORRECT ✅)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Split BEFORE any preprocessing

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Fits on train only
    ('model', LogisticRegression())
])
pipeline.fit(X_train, y_train)
# pipeline.fit() calls scaler.fit_transform(X_train), not X_test
# pipeline.predict() calls scaler.transform(X_test) using train statistics
```

**The fix**: Always split BEFORE preprocessing, or use Pipeline.

In [None]:
# Solution 1 (demonstration)
from sklearn.impute import SimpleImputer

# Print the leak-free workflow next to the leaky one, one step per line
correct_order = (
    "‚úÖ CORRECT ORDER:",
    "1. Split data",
    "2. Create Pipeline",
    "3. Fit Pipeline on train",
    "4. Evaluate on val/test",
)
wrong_order = (
    "\n‚ùå WRONG ORDER:",
    "1. Preprocess full data",
    "2. Split preprocessed data",
    "3. Train model",
    "   ‚Üí Leakage: preprocessing used test data!",
)
print(*correct_order, *wrong_order, sep="\n")

**Solution 2**: Implement safe preprocessing pipeline

In [None]:
# Solution 2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Create safe pipeline: every data-dependent step lives inside the Pipeline
safe_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Fit on train
    ('scaler', StandardScaler()),                    # Fit on train
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Split data BEFORE anything is fitted
X_train_ex, X_test_ex, y_train_ex, y_test_ex = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit pipeline on the training split only
safe_pipeline.fit(X_train_ex, y_train_ex)

# Verify no leakage with assertions
print("üîç Leakage Verification:")

fitted_imputer = safe_pipeline.named_steps['imputer']
fitted_scaler = safe_pipeline.named_steps['scaler']

# Check 1: Imputer statistics computed from train only.
# The fitted medians must equal the per-column medians of the TRAIN split
# (assert_allclose treats NaN as equal to NaN, which covers all-missing columns).
imputer_stats = fitted_imputer.statistics_
np.testing.assert_allclose(
    imputer_stats, np.nanmedian(X_train_ex, axis=0),
    err_msg="Imputer medians do not match the training split",
)
print(f"‚úÖ Imputer median computed from {X_train_ex.shape[0]} train samples")

# Check 2: Scaler statistics computed from train only.
# The scaler must have seen exactly as many rows as the train split, and its mean
# and scale must match a fresh scaler fitted on the imputed train data.
scaler_mean = fitted_scaler.mean_
assert np.all(np.asarray(fitted_scaler.n_samples_seen_) == X_train_ex.shape[0]), \
    "Scaler was fitted on a different number of rows than the training split"
reference_scaler = StandardScaler().fit(fitted_imputer.transform(X_train_ex))
np.testing.assert_allclose(scaler_mean, reference_scaler.mean_,
                           err_msg="Scaler mean does not match the training split")
np.testing.assert_allclose(fitted_scaler.scale_, reference_scaler.scale_,
                           err_msg="Scaler scale does not match the training split")
print(f"‚úÖ Scaler mean computed from {X_train_ex.shape[0]} train samples")

# Snapshot the fitted statistics so we can prove that predicting does not refit them
imputer_stats_before = imputer_stats.copy()
scaler_mean_before = scaler_mean.copy()

# Check 3: Predictions work on test
y_pred_test = safe_pipeline.predict_proba(X_test_ex)[:, 1]
auc_test = roc_auc_score(y_test_ex, y_pred_test)
print(f"‚úÖ Test AUC: {auc_test:.4f}")

# Check 4: Verify imputer/scaler were not refit on test.
# Scoring the test split must leave the fitted statistics untouched.
np.testing.assert_array_equal(fitted_imputer.statistics_, imputer_stats_before)
np.testing.assert_array_equal(fitted_scaler.mean_, scaler_mean_before)
assert hasattr(safe_pipeline, 'named_steps'), "Pipeline structure exists"
assert 'imputer' in safe_pipeline.named_steps, "Imputer in pipeline"
assert 'scaler' in safe_pipeline.named_steps, "Scaler in pipeline"

print("\n‚úÖ All assertions passed: No leakage detected!")

**Solution 3**: Time-series split strategy

**Strategy for fraud detection with temporal data**:

```
Timeline: 2021-01 to 2022-12 (24 months)

Train:    2021-01 to 2022-06 (18 months) ━━━━━━━━━━━━━━━━━━┓
Val:      2022-07 to 2022-09 (3 months)                    ━━━┓
Test:     2022-10 to 2022-12 (3 months)                        ━━━
                                                        (predict)
```

**Key decisions**:

1. **No shuffling** ❌
   - Preserve temporal order
   - Train on past, predict future
   
2. **Temporal split**:
   ```python
   # Sort by timestamp
   df_sorted = df.sort_values('timestamp')
   
   # Split by date ranges
   train_end = '2022-06-30'
   val_end = '2022-09-30'
   
   train = df_sorted[df_sorted['timestamp'] <= train_end]
   val = df_sorted[(df_sorted['timestamp'] > train_end) & 
                   (df_sorted['timestamp'] <= val_end)]
   test = df_sorted[df_sorted['timestamp'] > val_end]
   ```

3. **Handling data drift**:
   - Monitor train vs. val distribution shift
   - Use expanding window: retrain monthly with all past data
   - Track feature importance changes over time
   
4. **Validation strategy**:
   - Use TimeSeriesSplit for hyperparameter tuning
   - Multiple temporal folds to catch seasonality
   
5. **Production considerations**:
   - Retrain model monthly with expanding window
   - Monitor for distribution shift (PSI, KS test)
   - Have fallback to rules if model degrades

In [None]:
# Solution 3 (code template)
from sklearn.model_selection import TimeSeriesSplit

# Conceptual illustration: treat the row order of X as if it were time order
print("Time-Series Split Strategy:")
print("=" * 60)

# Pretend the samples are spread evenly over a 24-month timeline
n_samples = len(X)
time_indices = np.arange(n_samples)
samples_per_month = n_samples / 24

# Forward-chaining folds: each fold trains on an initial segment and validates on
# the block that follows it
tscv = TimeSeriesSplit(n_splits=5)

print("TimeSeriesSplit Cross-Validation Folds:")
for i, (train_idx, val_idx) in enumerate(tscv.split(time_indices)):
    # Express fold sizes in approximate months of the simulated timeline
    train_months = len(train_idx) / samples_per_month
    val_months = len(val_idx) / samples_per_month
    print(
        f"Fold {i+1}:",
        f"  Train: {len(train_idx):,} samples (~{train_months:.1f} months)",
        f"  Val:   {len(val_idx):,} samples (~{val_months:.1f} months)",
        sep="\n",
    )

print(
    "\nüîë Key Principles:",
    "  1. ‚úÖ No shuffling (preserve temporal order)",
    "  2. ‚úÖ Train on past, validate on future",
    "  3. ‚úÖ Expanding window (not sliding)",
    "  4. ‚úÖ Monitor distribution shift",
    "  5. ‚úÖ Retrain periodically",
    sep="\n",
)

---

## ✅ Notebook Complete!

**What you learned**:
1. ✅ Proper train/val/test splits with stratification
2. ✅ K-fold cross-validation for robust evaluation
3. ✅ Data leakage: how it happens and how to prevent it
4. ✅ sklearn Pipelines as leakage prevention tool
5. ✅ Sanity checks to detect leakage
6. ✅ Time-based splits for temporal data

**Outputs saved**:
- `reports/02_leaky_vs_correct.png`
- `reports/02_leakage_checklist.md`

**Key takeaway**: Always use `sklearn.pipeline.Pipeline` to encapsulate preprocessing. Split BEFORE preprocessing.

**Next notebook**: `03_baselines_and_metrics_that_matter.ipynb` — Learn to establish baselines and choose metrics that align with business goals.