In [None]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import json

# Put the directory three levels above the current working directory on
# sys.path so that the `modules` package imported below can be found.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder — confirm before running from elsewhere.
repo_root = Path().resolve().parents[2]
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

# Must stay after the sys.path adjustment above.
from modules._import_helper import safe_import_from

# Project helpers are fetched by dotted module path + attribute name(s).
# NOTE(review): presumably needed because these package names start with a
# digit and cannot appear in a plain `import` statement — verify.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
load_data, get_feature_columns = safe_import_from(
    '03_ml_tabular_foundations.src.data',
    'load_data', 'get_feature_columns'
)

# Fix the seed once, up front (which RNGs it touches is decided by set_seed,
# which is not visible in this notebook).
set_seed(42)
# Output folder for the figures and the JSON summary written by later cells:
# a `reports` directory beside the current working directory. Only the leaf
# directory is created (no parents=True).
reports_dir = Path("../reports")
reports_dir.mkdir(exist_ok=True)

print("‚úÖ Setup complete")

## 1. Problem Framing

**Dataset**: Synthetic particle collision events (100K samples)

**Task**: Binary classification to identify signal events (rare particle decays) from background noise.

**Physics Context**:
- Signal rate: ~10% (class imbalance typical in HEP)
- Features: 16 kinematic variables (momentum, energy, angles, mass)
- No time-ordering (each event is independent)

**What Makes This Hard**:
1. Class imbalance (1:9 signal:background)
2. Correlated features (physics constraints)
3. Rare events → need high precision to avoid false positives

In [None]:
# Load the event table through the project loader
df = load_data()

# Headline numbers in a single print call, one item per output line; the
# Series of class counts is rendered with its normal text representation.
print(
    "Dataset Statistics:",
    f"  Total samples: {len(df):,}",
    f"  Features: {len(get_feature_columns(df))}",
    "  Target: 'is_signal' (binary)",
    "\nClass distribution:",
    df['is_signal'].value_counts(),
    f"\nSignal rate: {df['is_signal'].mean():.1%}",
    sep="\n",
)

## 2. Data Schema Validation

**Professional ML Mindset**: Always validate your data schema before analysis.

**Checks**:
- Column names match expectation
- Data types are correct
- No unexpected nulls
- Value ranges are plausible (physics constraints)

In [None]:
# Schema validation
#
# Each check is evaluated first and then printed with a status glyph that
# reflects its actual outcome, so a failing check is visibly marked instead of
# always being shown next to a check mark. On passing data the text is
# unchanged apart from the repaired glyphs.
expected_features = {
    'p_T', 'eta', 'phi', 'E_total', 'm_inv', 'missing_E_T',
    'n_jets', 'b_tag_score', 'lepton_iso', 'delta_R', 'm_T',
    'E_ratio', 'sphericity', 'aplanarity', 'centrality', 'H_T'
}
expected_target = 'is_signal'

actual_cols = set(df.columns)
feature_cols = get_feature_columns(df)


def _status(ok):
    """Return the pass/fail glyph for a boolean check outcome."""
    return "✅" if ok else "❌"


# Column-name checks
features_match = expected_features == set(feature_cols)
target_present = expected_target in actual_cols

print("Schema Validation:")
print(f"{_status(features_match)} All expected features present: {features_match}")
print(f"{_status(target_present)} Target column present: {target_present}")

# Data type checks
all_numeric = df[feature_cols].select_dtypes(include=[np.number]).shape[1] == len(feature_cols)
target_binary = set(df[expected_target].unique()) == {0, 1}
print(f"\n{_status(all_numeric)} All features numeric: {all_numeric}")
print(f"{_status(target_binary)} Target is binary: {target_binary}")

# Missingness check
missing = df.isnull().sum()
no_missing = missing.sum() == 0
print(f"\n{_status(no_missing)} No missing values: {no_missing}")

# Physics sanity checks: label -> whether every row satisfies the range
physics_checks = {
    'p_T ≥ 0': (df['p_T'] >= 0).all(),
    'E_total ≥ 0': (df['E_total'] >= 0).all(),
    '0 ≤ b_tag_score ≤ 1': ((df['b_tag_score'] >= 0) & (df['b_tag_score'] <= 1)).all(),
    '-π ≤ phi ≤ π': ((df['phi'] >= -np.pi) & (df['phi'] <= np.pi)).all(),
}
print("\nPhysics Constraints:")
for label, ok in physics_checks.items():
    print(f"  {label}: {ok} {_status(ok)}")

## 3. Target Distribution Analysis

**Critical First Step**: Understand your target distribution.

For classification:
- Class balance affects metric choice
- Imbalance affects model training (need stratified sampling)
- Baseline accuracy = majority class rate

In [None]:
# Target distribution: absolute counts (left panel) and shares (right panel)
class_names = ['Background', 'Signal']
class_colors = ['steelblue', 'coral']
outline = {'edgecolor': 'black', 'linewidth': 1.5}

# sort_index puts class 0 (background) before class 1 (signal)
counts = df['is_signal'].value_counts().sort_index()
n_events = len(df)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: bars, each annotated with its count and percentage
ax1.bar(class_names, counts.values, alpha=0.7, color=class_colors, **outline)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Class Distribution', fontsize=13, fontweight='bold')
ax1.grid(alpha=0.3, axis='y')
for position, class_count in enumerate(counts.values):
    annotation = f'{class_count:,}\n({class_count/n_events:.1%})'
    # Label sits a fixed 1000 counts above the top of the bar
    ax1.text(position, class_count + 1000, annotation,
             ha='center', fontsize=11, fontweight='bold')

# Right panel: the same numbers as proportions
ax2.pie(counts.values, labels=class_names, autopct='%1.1f%%',
        colors=class_colors, startangle=90, wedgeprops=outline)
ax2.set_title('Class Proportions', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.savefig(reports_dir / '01_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

signal_rate = df['is_signal'].mean()
majority_rate = df['is_signal'].value_counts().max() / len(df)
print(
    "\nüìä Key Observations:",
    f"  ‚Ä¢ Imbalanced dataset: {signal_rate:.1%} signal",
    f"  ‚Ä¢ Baseline accuracy (predict majority): {majority_rate:.1%}",
    "  ‚Ä¢ ‚ö†Ô∏è Implications:",
    "    - Need stratified splits to maintain class balance",
    "    - Accuracy is misleading (use AUC-ROC, PR-AUC)",
    "    - May need class weights or resampling",
    sep="\n",
)

## 4. Feature Type Analysis

**Categorize features** by type for appropriate preprocessing:
- Continuous: normalize/standardize
- Categorical: one-hot encode
- Ordinal: encode preserving order
- Counts: may need log-transform or binning

In [None]:
# Feature type analysis: group the columns by how they should be preprocessed
feature_types = dict(
    continuous=['p_T', 'eta', 'phi', 'E_total', 'm_inv', 'missing_E_T',
                'delta_R', 'm_T', 'H_T'],
    bounded_continuous=['b_tag_score', 'lepton_iso', 'E_ratio',
                        'sphericity', 'aplanarity', 'centrality'],
    discrete_count=['n_jets'],
)

print("Feature Types:")
for ftype, features in feature_types.items():
    # Group header followed by one bullet per member, one item per line
    header = f"\n{ftype.upper()} ({len(features)}):"
    bullets = [f"  ‚Ä¢ {name}" for name in features]
    print(header, *bullets, sep="\n")

# Per-feature location/spread, one row per feature
print("\n\nDistribution Summary:")
summary_columns = ['mean', 'std', 'min', 'max']
feature_stats = df[feature_cols].describe().T
print(feature_stats[summary_columns])

## 5. Missing Value Analysis

**Best Practice**: Analyze missing patterns, don't just impute blindly.

Questions to ask:
- Are missings random or systematic?
- Do missing indicators correlate with target?
- Should we impute or use as a feature?

In [None]:
# Missing value analysis: per-column null counts and their share of all rows
missing_counts = df.isna().sum()
missing_pct = missing_counts.div(len(df)).mul(100).round(2)

missing_table = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing %': missing_pct
})
missing_df = missing_table.sort_values('Missing Count', ascending=False)

# Only columns that actually contain nulls are listed
print("Missing Value Summary:")
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

total_missing = missing_df['Missing Count'].sum()
if total_missing != 0:
    print(f"\n‚ö†Ô∏è Total missing: {total_missing:,}")
else:
    print("\n‚úÖ No missing values detected")
    print("   ‚Üí This is unusual for real-world data but expected for synthetic datasets")
    print("   ‚Üí In production, monitor for sudden changes in missingness patterns")

## 6. Outlier Detection

**Purpose**: Identify extreme values that may indicate:
- Data quality issues
- Rare but valid events
- Need for robust scaling/capping

**Method**: Use IQR (Interquartile Range) method for each feature.

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(data, multiplier=1.5):
    """Count the values of a Series that fall outside its IQR fences.

    The fences sit ``multiplier`` interquartile ranges below the first
    quartile and above the third quartile; values strictly beyond either
    fence are counted.

    Args:
        data: Series supporting ``quantile`` and element-wise comparison.
        multiplier: Fence width expressed in IQR units.

    Returns:
        Tuple ``(n_outliers, lower_bound, upper_bound)``.
    """
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    fence_width = multiplier * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    beyond_fences = data.lt(lower_bound) | data.gt(upper_bound)
    return beyond_fences.sum(), lower_bound, upper_bound

# Run the IQR rule on every feature, then format one table row per feature
iqr_results = [(col, *detect_outliers_iqr(df[col])) for col in feature_cols]
outlier_summary = [
    {
        'Feature': col,
        'Outliers': n_outliers,
        'Outlier %': f"{n_outliers/len(df)*100:.2f}%",
        'Lower Bound': f"{lower:.2f}",
        'Upper Bound': f"{upper:.2f}"
    }
    for col, n_outliers, lower, upper in iqr_results
]

# Features with the most flagged values first
outlier_table = pd.DataFrame(outlier_summary)
outlier_df = outlier_table.sort_values('Outliers', ascending=False)
print("Outlier Summary (Top 10):")
print(outlier_df.head(10).to_string(index=False))

# Boxplot for the single feature with the most flagged values
top_feature = outlier_df['Feature'].iloc[0]
fig, ax = plt.subplots(figsize=(10, 4))
ax.boxplot(df[top_feature], vert=False, widths=0.5)
ax.set_xlabel(top_feature, fontsize=12)
ax.set_title(f'Outlier Detection: {top_feature}', fontsize=13, fontweight='bold')
ax.grid(alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(reports_dir / '01_outliers_boxplot.png', dpi=150, bbox_inches='tight')
plt.show()

print(
    "\nüìä Interpretation:",
    f"  ‚Ä¢ Feature with most outliers: {top_feature}",
    "  ‚Ä¢ In physics, 'outliers' may be real high-energy events",
    "  ‚Ä¢ ‚ö†Ô∏è Decision: Keep outliers (physics-motivated), but use robust scaling",
    sep="\n",
)

## 7. Feature Distributions by Class

**Goal**: Understand which features discriminate between signal and background.

**Good discriminators**:
- Show clear separation between classes
- Have different means/medians
- Exhibit different shapes

**Poor discriminators**:
- Overlap heavily between classes
- Similar distributions

In [None]:
# Feature distributions by class (top 6 features)
key_features = ['m_inv', 'missing_E_T', 'b_tag_score', 'lepton_iso', 'p_T', 'E_total']

# Settings shared by every histogram; density=True makes the two classes
# comparable despite their very different sizes.
hist_kwargs = dict(bins=50, alpha=0.5, density=True, edgecolor='black', linewidth=0.5)
# (target value, legend label, fill colour) — background is drawn first
class_layers = [(0, 'Background', 'steelblue'), (1, 'Signal', 'coral')]

fig, axes = plt.subplots(2, 3, figsize=(15, 9))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    # Overlapping histograms, one layer per class
    for target_value, label, color in class_layers:
        class_values = df.loc[df['is_signal'] == target_value, feature]
        ax.hist(class_values, label=label, color=color, **hist_kwargs)

    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('Density', fontsize=11)
    ax.set_title(feature, fontsize=12, fontweight='bold')
    ax.legend(fontsize=9, loc='upper right')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '01_feature_distributions_by_class.png', dpi=150, bbox_inches='tight')
plt.show()

print(
    "üìä Visual Assessment:",
    "  ‚Ä¢ m_inv: Signal has clear peak at 125 GeV (Higgs mass) ‚úÖ Strong discriminator",
    "  ‚Ä¢ missing_E_T: Signal has higher missing energy ‚úÖ Good discriminator",
    "  ‚Ä¢ b_tag_score: Signal enriched in high b-tag ‚úÖ Good discriminator",
    "  ‚Ä¢ lepton_iso: Signal has better isolated leptons ‚úÖ Good discriminator",
    sep="\n",
)

## 8. Feature Correlations

**Purpose**: Identify multicollinearity and feature relationships.

**Why it matters**:
- High correlation → redundant features (consider dropping)
- Physics correlations (e.g., E ∝ p) are expected
- Unexpected correlations may indicate leakage

In [None]:
# Pairwise correlations between all features
corr_matrix = df[feature_cols].corr()
n_features = len(feature_cols)
tick_positions = np.arange(n_features)

# Heatmap on a fixed [-1, 1] colour scale so the diverging map is centred at 0
fig, ax = plt.subplots(figsize=(12, 10))
im = ax.imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')

ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(feature_cols, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(feature_cols, fontsize=9)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)

cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Correlation', fontsize=11)

plt.tight_layout()
plt.savefig(reports_dir / '01_correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Scan the upper triangle only, so each unordered pair is visited once and
# the diagonal is skipped.
high_corr_threshold = 0.7
high_corr_pairs = [
    (feature_cols[row], feature_cols[col], corr_matrix.iloc[row, col])
    for row in range(n_features)
    for col in range(row + 1, n_features)
    if abs(corr_matrix.iloc[row, col]) > high_corr_threshold
]

print(f"\nHighly Correlated Pairs (|r| > {high_corr_threshold}):")
if not high_corr_pairs:
    print("  ‚úÖ No highly correlated pairs (good for linear models)")
else:
    # Strongest relationships first, regardless of sign
    ranked_pairs = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
    for f1, f2, corr in ranked_pairs:
        print(f"  ‚Ä¢ {f1} <-> {f2}: r = {corr:.3f}")

print("\n‚ö†Ô∏è Note: Moderate correlations (0.3-0.7) are expected in physics data")
print("   E.g., E_total and p_T are physically related via E¬≤ = (pc)¬≤ + (mc¬≤)¬≤")

## 9. Potential Data Leakage Detection

**Critical Check**: Identify features that may leak target information.

**Red flags**:
- Perfect or near-perfect correlation with target
- Features computed using target
- Future information in past predictions
- Identifiers that proxy for target

**Our dataset**: Synthetic, so leakage unlikely, but let's check systematically.

In [None]:
# Leakage detection: strength of each feature's relationship with the target
#
# mutual_info_classif lives in sklearn.feature_selection; importing it from
# sklearn.metrics raises ImportError and stops the notebook at this cell.
from sklearn.feature_selection import mutual_info_classif

# Absolute linear correlation of every feature with the target, strongest first
target_correlations = df[feature_cols].corrwith(df['is_signal']).abs().sort_values(ascending=False)

# Mutual information (handles non-linear relationships); random_state pins
# the estimator's internal randomness so the ranking is reproducible
mi_scores = mutual_info_classif(df[feature_cols], df['is_signal'], random_state=42)
mi_df = pd.DataFrame({
    'Feature': feature_cols,
    'Mutual Info': mi_scores
}).sort_values('Mutual Info', ascending=False)

print("Feature-Target Relationship Strength:")
print("\nTop 5 by Absolute Correlation:")
print(target_correlations.head())

print("\nTop 5 by Mutual Information:")
print(mi_df.head().to_string(index=False))

# Leakage red flags: any feature whose |corr| with the target exceeds the threshold
leakage_threshold = 0.9
suspicious_features = target_correlations[target_correlations > leakage_threshold]

if len(suspicious_features) > 0:
    print(f"\n🚨 LEAKAGE ALERT: Features with |corr| > {leakage_threshold}:")
    for feat, corr in suspicious_features.items():
        print(f"  • {feat}: {corr:.4f}")
    print("  → These features may be computed using the target!")
else:
    print(f"\n✅ No obvious leakage detected (no |corr| > {leakage_threshold})")
    print("  → But always review feature engineering pipeline manually")

print("\n📋 Leakage Checklist:")
print("  1. ✅ No features with perfect correlation to target")
print("  2. ✅ All features are kinematic (physics-based, not target-derived)")
print("  3. ✅ No temporal leakage (events are independent)")
print("  4. ✅ No identifiers that might proxy for target")

## 10. Summary Report

Generate a compact EDA report for stakeholders.

In [None]:
# Generate summary report
#
# pandas reductions such as .sum() return numpy integers, which json.dump
# refuses to serialize (TypeError); every count is converted to a built-in
# int before it goes into the report.
total_missing = int(missing_df['Missing Count'].sum())
total_duplicates = int(df.duplicated().sum())
total_outliers = int(outlier_df['Outliers'].sum())

# Derived from the columns actually present instead of being hard-coded to
# True; uses the expectations declared in the schema validation cell.
schema_valid = bool(
    set(feature_cols) == expected_features
    and expected_target in df.columns
)

eda_summary = {
    'dataset': {
        'name': 'Particle Collision Classification',
        'total_samples': len(df),
        'n_features': len(feature_cols),
        'target': 'is_signal (binary)',
        'signal_rate': f"{df['is_signal'].mean():.1%}",
        'class_balance': '10% signal / 90% background (imbalanced)'
    },
    'data_quality': {
        'missing_values': total_missing,
        'duplicate_rows': total_duplicates,
        'schema_valid': schema_valid,
        'outliers_detected': total_outliers
    },
    'feature_insights': {
        'top_discriminators': ['m_inv', 'missing_E_T', 'b_tag_score', 'lepton_iso'],
        'highly_correlated_pairs': len(high_corr_pairs),
        'leakage_suspects': len(suspicious_features)
    },
    'recommendations': {
        'split_strategy': 'Stratified train/val/test (maintain 10% signal rate)',
        # StandardScaler uses mean/std and is sensitive to outliers; the
        # outlier section decided on robust scaling, i.e. RobustScaler.
        'preprocessing': 'RobustScaler (robust to outliers)',
        'metrics': 'AUC-ROC, PR-AUC (not accuracy due to imbalance)',
        'modeling': 'Gradient boosting (handles feature interactions)',
        'calibration': 'Required (check reliability diagram)'
    }
}

# Save report
report_path = reports_dir / '01_eda_summary.json'
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(eda_summary, f, indent=2)

print("📊 EDA Summary Report")
print("=" * 60)
print(f"\nDataset: {eda_summary['dataset']['name']}")
print(f"  Samples: {eda_summary['dataset']['total_samples']:,}")
print(f"  Features: {eda_summary['dataset']['n_features']}")
print(f"  Signal rate: {eda_summary['dataset']['signal_rate']}")

# Status glyphs follow the actual values, so a non-zero count or an invalid
# schema is shown with a warning instead of a check mark.
ok_mark, warn_mark = "✅", "⚠️"
quality = eda_summary['data_quality']
print(f"\nData Quality:")
print(f"  {ok_mark if quality['missing_values'] == 0 else warn_mark} Missing values: {quality['missing_values']}")
print(f"  {ok_mark if quality['duplicate_rows'] == 0 else warn_mark} Duplicate rows: {quality['duplicate_rows']}")
print(f"  {ok_mark if quality['schema_valid'] else warn_mark} Schema valid: {quality['schema_valid']}")
print(f"  ⚠️ Outliers detected: {quality['outliers_detected']:,} (expected in physics)")

print(f"\nTop Discriminators:")
for feat in eda_summary['feature_insights']['top_discriminators']:
    print(f"  • {feat}")

print(f"\n🔑 Key Recommendations:")
for key, value in eda_summary['recommendations'].items():
    print(f"  • {key.replace('_', ' ').title()}: {value}")

print(f"\n✅ Report saved to: {report_path}")

## 11. Exercises

Complete these exercises to reinforce your learning:

**Exercise 1**: Identify potential leakage columns

Given a hypothetical dataset with these columns:
- `transaction_id` (unique identifier)
- `user_age`, `user_income` (demographics)
- `transaction_amount` (purchase amount)
- `fraud_investigation_opened` (binary: was fraud investigation started?)
- **Target**: `is_fraud` (binary: was transaction fraudulent?)

Which column(s) would cause data leakage? Why?

In [None]:
# Your answer here:
# 


**Exercise 2**: Propose a split strategy

For our particle collision dataset:
1. What split ratio would you use (train/val/test)?
2. Should you use stratification? Why or why not?
3. Should you shuffle before splitting? Why or why not?
4. Write pseudocode for the splitting logic.

In [None]:
# Your answer here:
# 


**Exercise 3**: Detect outliers vs. valid rare events

In particle physics, extremely high `p_T` (transverse momentum) events are rare but physically valid.

How would you distinguish between:
1. **Data quality outliers** (measurement errors, detector glitches)
2. **Valid rare events** (high-energy physics processes)

What checks would you implement?

In [None]:
# Your answer here:
# 


---
## Solutions

**Solution 1**: Leakage columns

**Leakage column**: `fraud_investigation_opened`

**Why**: This column is **computed after** the target is known. Banks only open fraud investigations after detecting fraudulent activity. Using this feature gives the model perfect hindsight.

**Other potential issues**:
- `transaction_id`: Could leak if IDs are assigned sequentially and fraud rate changes over time
- `user_age`, `user_income`: ✅ Safe (demographics known before transaction)
- `transaction_amount`: ✅ Safe (observed at transaction time)

**General rule**: Ask "Was this feature available at prediction time?" If no → leakage.

In [None]:
# Solution 1 (code demonstration): verdict per column, printed line by line
solution_1_lines = (
    "Leakage Check:",
    "  fraud_investigation_opened ‚Üí TARGET: ‚ùå LEAKAGE",
    "  Reason: Investigation happens AFTER fraud detection",
    "",
    "Safe Features:",
    "  user_age ‚Üí Target: ‚úÖ Known before transaction",
    "  user_income ‚Üí Target: ‚úÖ Known before transaction",
    "  transaction_amount ‚Üí Target: ‚úÖ Observed at transaction time",
)
for line in solution_1_lines:
    print(line)

**Solution 2**: Split strategy

```python
# Recommended split strategy for particle collision dataset

# 1. Split ratio: 60/20/20 (train/val/test)
train_size = 0.60  # 60K samples for training
val_size = 0.20    # 20K samples for validation (hyperparameter tuning, calibration)
test_size = 0.20   # 20K samples for final evaluation (touch once!)

# 2. Stratification: YES
# Reason: Class imbalance (10% signal). Without stratification,
# random split might give 9% signal in train and 11% in test → biased evaluation.
stratify = True

# 3. Shuffling: YES
# Reason: No temporal ordering in particle collisions (events are i.i.d.).
# Shuffling ensures random split, not based on data collection order.
shuffle = True

# Pseudocode:
# from sklearn.model_selection import train_test_split
# 
# # First split: separate test set
# train_val, test = train_test_split(data, test_size=0.2, 
#                                     stratify=data['is_signal'], 
#                                     shuffle=True, random_state=42)
# 
# # Second split: separate validation from train
# train, val = train_test_split(train_val, test_size=0.25,  # 0.25 * 0.8 = 0.2
#                               stratify=train_val['is_signal'],
#                               shuffle=True, random_state=42)
```

**Critical**: Fix `random_state=42` for reproducibility!

In [None]:
# Solution 2 (code verification)
from sklearn.model_selection import train_test_split

# Two-stage stratified split: hold out 20% as the test set, then take 25% of
# what is left (0.25 * 0.8 = 0.2 of the total) as the validation set.
split_seed = 42
train_val, test = train_test_split(
    df, test_size=0.2, stratify=df['is_signal'], random_state=split_seed
)
train, val = train_test_split(
    train_val, test_size=0.25, stratify=train_val['is_signal'], random_state=split_seed
)

# Size and signal rate of each partition; labels are padded so columns align
print("Split Verification:")
for label, part in (("Train:", train), ("Val:  ", val), ("Test: ", test)):
    print(f"{label} {len(part):,} samples, {part['is_signal'].mean():.1%} signal")
print("\n‚úÖ Stratification preserved class balance across all splits")

**Solution 3**: Outliers vs. valid rare events

**Strategy**: Use domain knowledge + statistical checks.

**Approach**:

1. **Physics Constraints**:
   ```python
   # Hard constraints (if violated → data error)
   assert (df['E_total'] >= 0).all(), "Energy cannot be negative"
   assert (df['p_T'] >= 0).all(), "Momentum cannot be negative"
   assert (df['phi'].between(-np.pi, np.pi)).all(), "Angle out of range"
   ```

2. **Extreme Value Analysis**:
   ```python
   # Flag events beyond 5-sigma (extreme but rare)
   z_scores = (df['p_T'] - df['p_T'].mean()) / df['p_T'].std()
   extreme_events = df[np.abs(z_scores) > 5]
   
   # Manual review: Are these:
   # - High-energy physics processes (valid)
   # - Detector saturation (error)
   # - Pile-up contamination (error)
   ```

3. **Consistency Checks**:
   ```python
   # Check energy-momentum relationship: E² ≈ (pc)² + (mc²)²
   # If violated significantly → measurement error
   energy_momentum_consistent = check_energy_momentum_relation(df)
   ```

4. **Decision Rule**:
   - **Keep** if passes physics constraints + consistency checks
   - **Flag** if extreme + inconsistent (review manually)
   - **Cap** if using in production (e.g., `p_T_capped = min(p_T, 99th_percentile)`)

**In this dataset**: All events pass physics checks → keep all data points.

In [None]:
# Solution 3 (implementation)
def check_outliers_vs_rare_events(df):
    """Print hard physics-range checks and a >5-sigma tail count for a frame.

    Args:
        df: DataFrame with the columns ``E_total``, ``p_T``, ``missing_E_T``,
            ``b_tag_score`` and ``phi``.

    Returns:
        True when every range constraint holds for all rows, else False.
        The extreme-value counts are reported only; they do not affect the
        return value.
    """
    # Constraint label -> whether every row satisfies it
    constraints_passed = {
        'E_total ‚â• 0': df['E_total'].ge(0).all(),
        'p_T ‚â• 0': df['p_T'].ge(0).all(),
        'missing_E_T ‚â• 0': df['missing_E_T'].ge(0).all(),
        '0 ‚â§ b_tag_score ‚â§ 1': (df['b_tag_score'].ge(0) & df['b_tag_score'].le(1)).all(),
        '-œÄ ‚â§ phi ‚â§ œÄ': (df['phi'].ge(-np.pi) & df['phi'].le(np.pi)).all()
    }

    print("Physics Constraint Checks:")
    glyph_for = {True: "‚úÖ", False: "‚ùå"}
    for constraint, passed in constraints_passed.items():
        print(f"  {glyph_for[bool(passed)]} {constraint}")

    # Count rows whose standardized value lies beyond the z threshold
    z_threshold = 5
    extreme_features = []
    for col in ('p_T', 'E_total', 'missing_E_T'):
        values = df[col]
        standardized = (values - values.mean()) / values.std()
        n_extreme = (standardized.abs() > z_threshold).sum()
        if n_extreme > 0:
            extreme_features.append((col, n_extreme))

    print(f"\nExtreme Value Analysis (>{z_threshold}œÉ):")
    if not extreme_features:
        print("  ‚úÖ No extreme outliers detected")
    else:
        for feat, count in extreme_features:
            print(f"  ‚ö†Ô∏è {feat}: {count} extreme events ({count/len(df)*100:.2f}%)")
        print("  ‚Üí Decision: Keep (physically valid high-energy events)")

    return all(constraints_passed.values())

# Run the check on the full dataset and report the overall verdict
valid = check_outliers_vs_rare_events(df)
verdict_glyph = '‚úÖ' if valid else '‚ùå'
print(f"\n{verdict_glyph} Dataset passes all physics constraints")

---

## ✅ Notebook Complete!

**What you learned**:
1. ✅ Systematic EDA workflow (schema → target → features → quality)
2. ✅ Data quality checks (missing values, outliers, constraints)
3. ✅ Leakage detection (correlation with target, temporal logic)
4. ✅ Feature analysis (distributions, correlations, discriminators)
5. ✅ Split strategy design (stratification, shuffling, reproducibility)

**Outputs saved**:
- `reports/01_target_distribution.png`
- `reports/01_outliers_boxplot.png`
- `reports/01_feature_distributions_by_class.png`
- `reports/01_correlation_matrix.png`
- `reports/01_eda_summary.json`

**Next notebook**: `02_splitting_cv_and_leakage.ipynb` — Learn proper train/val/test splits and demonstrate leakage with wrong pipelines.