In [None]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Put the directory three levels above the current working directory at the
# front of sys.path so the project packages imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder - confirm when running from elsewhere.
repo_root = Path().resolve().parents[2]
repo_root_entry = str(repo_root)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

# Must come after the sys.path tweak above; `modules` is looked up from there.
from modules._import_helper import safe_import_from

# Project packages are fetched by dotted-path string through the helper
# (presumably because names such as `00_repo_standards` start with a digit and
# cannot appear in a regular import statement - TODO confirm).
set_seed = safe_import_from(
    '00_repo_standards.src.mlphys_core.seeding',
    'set_seed',
)
load_data, get_feature_columns = safe_import_from(
    '03_ml_tabular_foundations.src.data',
    'load_data',
    'get_feature_columns',
)

# Seed first, then load, so the order matches the original cell.
set_seed(42)
df = load_data()

# Report the sample count and the per-class breakdown of the label column.
labels = df['is_signal']
n_signal = labels.sum()
n_background = (~labels.astype(bool)).sum()

print(f"✅ Loaded {len(df):,} samples")
print(f"   Signal: {n_signal:,} ({labels.mean():.1%})")
print(f"   Background: {n_background:,}")

## 1. Class Distribution

In [None]:
# Count events per class in a fixed order (0 = background, 1 = signal).
# value_counts() sorts by frequency, so without the reindex the hard-coded
# tick labels and colours below would silently swap whenever signal is the
# majority class, and a missing class would shift the labels.
class_counts = (
    df['is_signal']
    .astype(int)
    .value_counts()
    .reindex([0, 1], fill_value=0)
)
# Signal fraction measured from the data rather than hard-coded in the title.
signal_fraction = df['is_signal'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', ax=ax, alpha=0.7, color=['steelblue', 'coral'])
ax.set_xticklabels(['Background', 'Signal'], rotation=0)
ax.set_ylabel('Count', fontsize=12)
ax.set_title(f'Class Distribution (Imbalanced: {signal_fraction:.0%} signal)', fontsize=14)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Class imbalance ratio: {signal_fraction:.1%} signal")

## 2. Feature Distributions by Class

In [None]:
# Plot key physics features: per-class density histograms, one panel each.
key_features = ['m_inv', 'missing_E_T', 'p_T', 'b_tag_score', 'lepton_iso']

# Size the grid from the feature list so adding or removing a feature does
# not break the layout (five features -> 2 x 3, same as before).
n_cols = 3
n_rows = max(1, -(-len(key_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.ravel()

# Split the frame by class once instead of re-filtering for every feature.
signal_rows = df[df['is_signal'] == 1]
background_rows = df[df['is_signal'] == 0]

for idx, feature in enumerate(key_features):
    ax = axes[idx]

    # Separate by class
    signal_data = signal_rows[feature]
    background_data = background_rows[feature]

    # Use one set of bin edges for both classes. Passing bins=50 to each
    # hist() call separately gives each class its own range and bin width,
    # so the overlaid bars would not line up.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([background_data.to_numpy(), signal_data.to_numpy()]),
        bins=50,
    )

    # Plot histograms (density=True normalises each class to unit area)
    ax.hist(background_data, bins=bin_edges, alpha=0.5, label='Background',
            color='steelblue', density=True, edgecolor='black', linewidth=0.5)
    ax.hist(signal_data, bins=bin_edges, alpha=0.5, label='Signal',
            color='coral', density=True, edgecolor='black', linewidth=0.5)

    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('Density', fontsize=11)
    ax.set_title(f'{feature} Distribution', fontsize=12)
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

# Hide every unused panel, not just the last one.
for unused_ax in axes[len(key_features):]:
    unused_ax.axis('off')

plt.tight_layout()
# Create the output directory first; savefig raises if it does not exist.
report_path = Path('../reports/eda_feature_distributions.png')
report_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(report_path, dpi=150, bbox_inches='tight')
plt.show()

# NOTE(review): the observations below are fixed text, not derived from the
# plots above - confirm they still hold if the dataset changes.
print("✅ Key observations:")
print("   - m_inv (invariant mass): Signal peaked at 125 GeV (Higgs-like)")
print("   - missing_E_T: Signal has higher missing energy (neutrinos)")
print("   - b_tag_score: Signal enriched in b-jets")
print("   - lepton_iso: Signal has well-isolated leptons")

## 3. Feature Correlations

In [None]:
# Correlation matrix of the feature columns (DataFrame.corr default method).
feature_cols = get_feature_columns(df)
corr_matrix = df[feature_cols].corr()

# Label the axes from the matrix itself so ticks and labels always agree with
# what was actually correlated.
corr_labels = list(corr_matrix.columns)
n_features = len(corr_labels)

fig, ax = plt.subplots(figsize=(12, 10))
im = ax.imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')

ax.set_xticks(np.arange(n_features))
ax.set_yticks(np.arange(n_features))
ax.set_xticklabels(corr_labels, rotation=45, ha='right', fontsize=9)
ax.set_yticklabels(corr_labels, fontsize=9)
ax.set_title('Feature Correlation Matrix', fontsize=14)

plt.colorbar(im, ax=ax, label='Correlation')
plt.tight_layout()
# Create the output directory first; savefig raises if it does not exist.
report_path = Path('../reports/eda_correlations.png')
report_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(report_path, dpi=150, bbox_inches='tight')
plt.show()

# Find highly correlated pairs. Only the strict upper triangle (j > i) is
# scanned, so each pair is reported once and the diagonal is skipped.
corr_threshold = 0.7
corr_values = corr_matrix.to_numpy()
high_corr_pairs = [
    (corr_labels[i], corr_labels[j], corr_values[i, j])
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(corr_values[i, j]) > corr_threshold
]

print(f"\nHighly correlated feature pairs (|r| > {corr_threshold}):")
if not high_corr_pairs:
    # Say so explicitly rather than printing a bare header.
    print("   (none)")
for f1, f2, corr in high_corr_pairs:
    print(f"   {f1} <-> {f2}: {corr:.3f}")

## 4. Class Separability Analysis

In [None]:
# Compute ROC AUC for each individual feature, using the raw feature value
# as the classifier score.
from sklearn.metrics import roc_auc_score

feature_auc = {}
for col in feature_cols:
    try:
        auc = roc_auc_score(df['is_signal'], df[col])
        if not np.isfinite(auc):
            raise ValueError("AUC is not finite")
    except (ValueError, TypeError) as exc:
        # Keep the best-effort fallback to "random" (0.5), but report which
        # feature failed and why. A bare `except:` would also hide
        # KeyboardInterrupt and genuine bugs.
        print(f"   Warning: AUC undefined for {col} ({exc}); using 0.5")
        feature_auc[col] = 0.5
        continue
    # Convert to discrimination (max(auc, 1-auc)) so a feature that is
    # anti-correlated with the label ranks as high as a correlated one.
    feature_auc[col] = max(auc, 1 - auc)

# Sort by discrimination power (strongest first)
feature_auc_sorted = sorted(feature_auc.items(), key=lambda x: x[1], reverse=True)

# Plot
fig, ax = plt.subplots(figsize=(10, 8))
features = [x[0] for x in feature_auc_sorted]
aucs = [x[1] for x in feature_auc_sorted]

ax.barh(features, aucs, alpha=0.7, color='teal')
ax.set_xlabel('AUC-ROC (discrimination power)', fontsize=12)
ax.set_title('Single-Feature Discrimination Power', fontsize=14)
ax.axvline(0.5, color='red', linestyle='--', linewidth=1, label='Random (0.5)')
ax.legend(fontsize=10)
ax.grid(alpha=0.3, axis='x')
# barh draws the first entry at the bottom; invert so the best is on top.
ax.invert_yaxis()

plt.tight_layout()
# Create the output directory first; savefig raises if it does not exist.
report_path = Path('../reports/eda_feature_discrimination.png')
report_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(report_path, dpi=150, bbox_inches='tight')
plt.show()

print("\nTop 5 discriminative features:")
for feat, auc in feature_auc_sorted[:5]:
    print(f"   {feat}: AUC = {auc:.3f}")

## 5. Summary Statistics

In [None]:
print("\nDataset Summary:")
print(df[feature_cols].describe())

# Derive the headline numbers from the data instead of hard-coding them, so
# the summary cannot drift out of sync with the dataset actually loaded.
signal_fraction = df['is_signal'].mean()
n_missing = int(df.isna().sum().sum())
# feature_auc_sorted comes from the class-separability cell (strongest first).
top_feature, top_auc = feature_auc_sorted[0]

print("\n✅ EDA Complete!")
print("\nKey Insights:")
print(f"  1. Class imbalance: {signal_fraction:.0%} signal, {1 - signal_fraction:.0%} background")
print(f"  2. Strongest single discriminator: {top_feature} (AUC = {top_auc:.3f})")
# NOTE(review): insights 3 and 4 are qualitative statements that this
# notebook does not compute - confirm against the plots above.
print("  3. Missing E_T and b-tagging important for signal identification")
print("  4. Some feature correlations exist (e.g., E_total with p_T)")
if n_missing == 0:
    print("  5. No missing values - complete dataset")
else:
    print(f"  5. {n_missing:,} missing values found - handle before training")
print("\nNext: Train baseline and production models")