# Feature Analysis for Steel Casting Defect Prediction

**Date:** 2025-01-21  
**Author:** Steel Defect Demo Team  
**Purpose:** Comprehensive analysis of features for defect prediction

## Overview
This notebook provides a comprehensive analysis of the steel casting dataset, focusing on:
- Feature distributions comparing normal vs defect casts
- Correlation analysis with visualization
- Feature importance ranking using multiple methods
- Univariate and bivariate feature analysis
- Feature selection recommendations
- Publication-quality visualizations

## Objectives
- Understand feature distributions and their relationship to defect prediction
- Identify the most important features for model training
- Detect multicollinearity and provide feature selection guidance
- Generate actionable insights for stakeholders

In [None]:
# Feature Analysis for Steel Casting Defect Prediction
# Date: 2025-01-21
# Author: Steel Defect Demo Team
# Purpose: Comprehensive analysis of features for defect prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency, pearsonr, spearmanr
from sklearn.feature_selection import (
    mutual_info_classif, SelectKBest, f_classif,
    chi2, RFE
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score

# Custom modules
import sys
sys.path.append('../src')
from data.data_loader import DataLoader
from features.feature_engineer import CastingFeatureEngineer

import warnings
# Keep cell output readable. These filters are process-wide: every
# DeprecationWarning and UserWarning raised after this point is hidden.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)

# Set up plotting
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so use whichever name this
# installation actually ships.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")

# Configure pandas display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.4f}'.format)

print("Feature Analysis Notebook - Steel Casting Defect Prediction")
print("="*60)

In [None]:
# Load the cleaned data
data_loader = DataLoader(data_dir='../data')
df = data_loader.load_cleaned_data('../data/processed/cleaned_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{df['defect'].value_counts()}")
print(f"Target proportions:\n{df['defect'].value_counts(normalize=True)}")

# Basic dataset information
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Check for missing values
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
    print(f"\nMissing values:\n{missing_data[missing_data > 0]}")
else:
    print("\nNo missing values found")

# Separate features and target
X = df.drop('defect', axis=1)
y = df['defect']

# Feature types analysis
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
# Count pandas 'category' columns as categorical too, not only raw object columns.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

In [None]:
# === FEATURE DISTRIBUTION ANALYSIS ===
print("1. FEATURE DISTRIBUTION ANALYSIS")
print("="*50)

def analyze_feature_distributions(df, target_col='defect', max_features=20):
    """
    Compare every numeric feature between normal and defect casts.

    Rows with ``target_col == 0`` form the normal group and rows with
    ``target_col == 1`` the defect group. For each numeric feature the
    function runs a two-sided Mann-Whitney U test and computes Cohen's d
    (defect mean minus normal mean, divided by the pooled standard deviation).

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the binary target column.
        max_features: Upper bound on the number of numeric columns analysed.
            When there are more, only the first ``max_features`` columns in
            column order are kept (they are not ranked beforehand).

    Returns:
        DataFrame indexed by feature name with the columns ``normal_mean``,
        ``defect_mean``, ``normal_std``, ``defect_std``,
        ``mannwhitney_statistic``, ``mannwhitney_pvalue``, ``cohens_d`` and
        ``effect_size`` ('small' for |d| < 0.5, 'medium' for |d| < 0.8,
        otherwise 'large'). Numeric columns keep a numeric dtype.
    """
    results = {}

    # Separate normal and defect samples
    normal_data = df[df[target_col] == 0]
    defect_data = df[df[target_col] == 1]

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in numeric_cols:
        numeric_cols.remove(target_col)

    # Limit number of features for display
    if len(numeric_cols) > max_features:
        print(f"Analyzing top {max_features} features (out of {len(numeric_cols)})")
        numeric_cols = numeric_cols[:max_features]

    for feature in numeric_cols:
        # Drop missing values once so the test, the moments and the sample
        # sizes used for pooling all describe the same observations.
        normal_values = normal_data[feature].dropna()
        defect_values = defect_data[feature].dropna()
        n_normal = len(normal_values)
        n_defect = len(defect_values)

        # Statistical tests
        stat_test = stats.mannwhitneyu(
            normal_values,
            defect_values,
            alternative='two-sided'
        )

        # Effect size (Cohen's d)
        normal_mean = normal_values.mean()
        defect_mean = defect_values.mean()
        pooled_std = np.sqrt(
            ((n_normal - 1) * normal_values.var() +
             (n_defect - 1) * defect_values.var()) /
            (n_normal + n_defect - 2)
        )
        # A zero or undefined pooled deviation gives no usable effect size.
        cohens_d = (defect_mean - normal_mean) / pooled_std if pooled_std > 0 else 0

        results[feature] = {
            'normal_mean': normal_mean,
            'defect_mean': defect_mean,
            'normal_std': normal_values.std(),
            'defect_std': defect_values.std(),
            'mannwhitney_statistic': stat_test.statistic,
            'mannwhitney_pvalue': stat_test.pvalue,
            'cohens_d': cohens_d,
            'effect_size': 'small' if abs(cohens_d) < 0.5 else 'medium' if abs(cohens_d) < 0.8 else 'large'
        }

    # orient='index' builds the frame column by column, so the numeric columns
    # are not degraded to object dtype by the string-valued 'effect_size'.
    return pd.DataFrame.from_dict(results, orient='index')

# Perform distribution analysis
distribution_results = analyze_feature_distributions(df)

# Sort by effect size
# to_numeric keeps this working even if 'cohens_d' arrives as an object column.
distribution_results['abs_cohens_d'] = pd.to_numeric(distribution_results['cohens_d']).abs()
distribution_results = distribution_results.sort_values('abs_cohens_d', ascending=False)

print("Top 10 features with largest effect sizes:")
display(distribution_results.head(10))

# Visualize top discriminative features
top_features = distribution_results.head(8).index.tolist()

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for ax, feature in zip(axes, top_features):
    # Plot distributions; missing values are dropped because the histogram
    # cannot derive a bin range from data containing NaN.
    normal_data = df[df['defect'] == 0][feature].dropna()
    defect_data = df[df['defect'] == 1][feature].dropna()

    ax.hist(normal_data, alpha=0.7, label='Normal', bins=30, density=True)
    ax.hist(defect_data, alpha=0.7, label='Defect', bins=30, density=True)

    ax.set_title(f'{feature}\nCohen\'s d = {distribution_results.loc[feature, "cohens_d"]:.3f}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the panels that stay empty when fewer than eight features exist.
for unused_ax in axes[len(top_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Feature Distributions: Normal vs Defect Casts', fontsize=16, y=1.02)
plt.show()

In [None]:
# === CORRELATION ANALYSIS ===
print("2. CORRELATION ANALYSIS")
print("="*30)

def create_correlation_analysis(df, target_col='defect'):
    """
    Build Pearson and Spearman correlation matrices and rank features by target correlation.

    Only numeric columns of ``df`` take part. The ranking table holds one row
    per numeric column other than ``target_col`` with the columns ``pearson``,
    ``spearman`` and ``abs_pearson``, sorted by ``abs_pearson`` descending.

    Returns:
        Tuple ``(pearson_corr, spearman_corr, target_correlations)``.
    """
    numeric_only = df.select_dtypes(include=[np.number])

    # One full correlation matrix per method, keyed by the method name
    matrices = {
        method: numeric_only.corr(method=method)
        for method in ('pearson', 'spearman')
    }

    # Correlation of every feature with the target (the target's own row is dropped)
    by_target = pd.DataFrame({
        method: matrix[target_col].drop(target_col)
        for method, matrix in matrices.items()
    })
    by_target['abs_pearson'] = by_target['pearson'].abs()
    ranked = by_target.sort_values('abs_pearson', ascending=False)

    return matrices['pearson'], matrices['spearman'], ranked

pearson_corr, spearman_corr, target_correlations = create_correlation_analysis(df)

print("Top 15 features correlated with defect:")
display(target_correlations.head(15))

# Static heatmap (seaborn)
plt.figure(figsize=(16, 12))

# Restrict the plot to the 20 features most correlated with the target, plus the target
top_corr_features = target_correlations.index[:20].tolist() + ['defect']
corr_subset = pearson_corr.loc[top_corr_features, top_corr_features]

# Mask the diagonal and upper triangle so each pair is drawn once
mask = np.triu(np.ones(corr_subset.shape, dtype=bool))
heatmap_options = {
    'mask': mask,
    'annot': True,
    'cmap': 'RdBu_r',
    'center': 0,
    'square': True,
    'fmt': '.2f',
    'cbar_kws': {"shrink": .8},
}
sns.heatmap(corr_subset, **heatmap_options)

plt.title('Feature Correlation Matrix (Top 20 + Target)', fontsize=14)
plt.tight_layout()
plt.show()

# Interactive version of the same matrix (Plotly)
fig = px.imshow(
    corr_subset,
    title="Interactive Correlation Heatmap",
    color_continuous_scale='RdBu_r',
    color_continuous_midpoint=0,
    aspect="auto",
    text_auto=True,
)
fig.update_layout(width=800, height=600)
fig.show()

# High correlation pairs (potential multicollinearity)
def find_high_correlations(corr_matrix, threshold=0.8):
    """
    Find pairs of columns whose absolute correlation exceeds ``threshold``.

    Each unordered pair above the diagonal is examined once, in row-major
    order. The comparison is strict (``|r| > threshold``), and NaN
    correlations never qualify.

    Args:
        corr_matrix: Square correlation DataFrame whose rows follow the same
            order as its columns.
        threshold: Absolute correlation a pair must exceed to be reported.

    Returns:
        DataFrame with the columns ``feature1``, ``feature2`` and
        ``correlation``. The columns are present even when no pair
        qualifies, so callers can rely on the schema.
    """
    names = corr_matrix.columns.to_numpy()
    values = corr_matrix.to_numpy(dtype=float)

    # Index arrays for the strict upper triangle: every (i, j) with i < j
    first, second = np.triu_indices(len(names), k=1)
    pair_values = values[first, second]
    strong = np.abs(pair_values) > threshold

    return pd.DataFrame({
        'feature1': names[first[strong]],
        'feature2': names[second[strong]],
        'correlation': pair_values[strong],
    })

# Multicollinearity concerns feature-to-feature relationships only. Leave the
# target out of the scan, otherwise a strong predictor would be reported as a
# "highly correlated pair" together with 'defect'.
feature_corr = pearson_corr.drop(index='defect', columns='defect', errors='ignore')
high_corr_pairs = find_high_correlations(feature_corr, threshold=0.8)
if not high_corr_pairs.empty:
    print(f"\nHigh correlation pairs (|r| > 0.8): {len(high_corr_pairs)}")
    display(high_corr_pairs.head(10))
else:
    print("\nNo high correlation pairs found (|r| > 0.8)")

In [None]:
# === FEATURE IMPORTANCE ANALYSIS ===
print("3. FEATURE IMPORTANCE ANALYSIS")
print("="*35)

def calculate_feature_importance(X, y, random_state=42):
    """
    Score every column of ``X`` with several importance measures.

    Measures: random-forest impurity importance, mutual information, the
    ANOVA F-statistic and the absolute coefficient of a logistic regression
    fitted on standardized features. Each score column is divided by its own
    maximum to give a ``<name>_normalized`` column.

    Args:
        X: Feature DataFrame. The estimators are fitted on it as-is, so it is
            assumed to be fully numeric with no missing values (TODO confirm
            against the cleaned dataset).
        y: Class labels aligned with ``X``.
        random_state: Seed passed to the stochastic estimators.

    Returns:
        DataFrame indexed by ``X.columns`` with the raw scores, ``f_pvalue``,
        the normalized scores and ``composite_importance`` (mean of the four
        distinct normalized measures).
    """
    importance_results = pd.DataFrame(index=X.columns)

    # 1. Random Forest Importance
    rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rf.fit(X, y)
    importance_results['random_forest'] = rf.feature_importances_

    # 2. Mutual Information
    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    importance_results['mutual_information'] = mi_scores

    # 3. F-statistics
    f_scores, f_pvalues = f_classif(X, y)
    importance_results['f_statistic'] = f_scores
    importance_results['f_pvalue'] = f_pvalues

    # 4. Univariate feature selection
    # SelectKBest(score_func=f_classif) exposes the f_classif scores unchanged,
    # so the column is filled from the scores above instead of fitting the
    # same statistic a second time. It is kept for existing consumers.
    importance_results['univariate_score'] = f_scores

    # 5. Logistic Regression coefficients (with standardized features)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    lr = LogisticRegression(random_state=random_state, max_iter=1000)
    lr.fit(X_scaled, y)
    importance_results['logistic_coef'] = abs(lr.coef_[0])

    # Normalize importance scores to 0-1 range for comparison
    score_cols = [col for col in importance_results.columns if col != 'f_pvalue']
    for col in score_cols:
        importance_results[f'{col}_normalized'] = (
            importance_results[col] / importance_results[col].max()
        )

    # Calculate composite importance score
    # 'univariate_score' duplicates 'f_statistic'; averaging both would count
    # the F-test twice, so only the four distinct measures are combined.
    composite_cols = [
        f'{name}_normalized'
        for name in ('random_forest', 'mutual_information', 'f_statistic', 'logistic_coef')
    ]
    importance_results['composite_importance'] = importance_results[composite_cols].mean(axis=1)

    return importance_results

# Score all features, then rank them by the composite score
importance_results = calculate_feature_importance(X, y)
importance_results = importance_results.sort_values('composite_importance', ascending=False)

print("Top 15 most important features:")
display(importance_results[['random_forest', 'mutual_information', 'f_statistic', 'logistic_coef', 'composite_importance']].head(15))

# Static comparison: one horizontal bar chart per importance measure
top_features = importance_results.head(15).index.tolist()
importance_subset = importance_results.loc[top_features]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (axis, score column, panel title, x-axis label)
panels = [
    (axes[0, 0], 'random_forest', 'Random Forest Feature Importance', 'Importance'),
    (axes[0, 1], 'mutual_information', 'Mutual Information Scores', 'MI Score'),
    (axes[1, 0], 'f_statistic', 'F-statistic Scores', 'F-score'),
    (axes[1, 1], 'composite_importance', 'Composite Importance Score', 'Composite Score'),
]
bar_positions = range(len(top_features))

for ax, column, title, xlabel in panels:
    ax.barh(bar_positions, importance_subset[column])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.suptitle('Feature Importance Comparison (Top 15 Features)', fontsize=16, y=1.02)
plt.show()

# Interactive comparison: grouped bars of the normalized scores
fig = go.Figure()

methods = ['random_forest', 'mutual_information', 'f_statistic', 'logistic_coef']
colors = ['blue', 'red', 'green', 'orange']

for method, color in zip(methods, colors):
    trace = go.Bar(
        name=method.replace('_', ' ').title(),
        x=top_features,
        y=importance_subset[f'{method}_normalized'],
        marker_color=color,
        opacity=0.7
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Feature Importance Comparison (Normalized)',
    xaxis_title='Features',
    yaxis_title='Normalized Importance',
    xaxis_tickangle=-45,
    barmode='group',
    height=500
)
fig.show()

In [None]:
# === UNIVARIATE ANALYSIS ===
print("4. UNIVARIATE ANALYSIS")
print("="*25)

def _shapiro_pvalue(values, max_samples=5000, random_state=42):
    """Return the Shapiro-Wilk p-value of ``values``, or NaN with fewer than 3 observations."""
    if len(values) < 3:
        return np.nan
    # Sub-sample with a fixed seed so repeated runs report the same p-value.
    sample = values.sample(min(max_samples, len(values)), random_state=random_state)
    _, p_value = stats.shapiro(sample)
    return p_value

def perform_univariate_analysis(df, target_col='defect', top_n=10, feature_ranking=None):
    """
    Compute descriptive statistics and group-comparison tests for the top features.

    Rows with ``target_col == 0`` form the normal group and rows with
    ``target_col == 1`` the defect group. Missing values are removed from
    each group before any test is run.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the binary target column.
        top_n: Number of leading features taken from the ranking.
        feature_ranking: DataFrame whose index lists feature names from most
            to least important. Defaults to the notebook-level
            ``importance_results``.

    Returns:
        DataFrame indexed by feature with overall statistics (``mean``,
        ``std``, ``median``, ``min``, ``max``, ``skewness``, ``kurtosis``),
        per-group means and standard deviations, and the p-values
        ``normal_shapiro_p``, ``defect_shapiro_p``, ``levene_p``, ``ttest_p``
        and ``mannwhitney_p``.
    """
    if feature_ranking is None:
        # Fall back to the ranking computed earlier in the notebook.
        feature_ranking = importance_results
    top_features = feature_ranking.head(top_n).index.tolist()

    univariate_stats = {}

    for feature in top_features:
        stats_dict = {}
        values = df[feature]

        # Overall statistics
        stats_dict['mean'] = values.mean()
        stats_dict['std'] = values.std()
        stats_dict['median'] = values.median()
        stats_dict['min'] = values.min()
        stats_dict['max'] = values.max()
        stats_dict['skewness'] = values.skew()
        stats_dict['kurtosis'] = values.kurtosis()

        # Group statistics (NaN removed so the scipy tests get clean samples)
        normal_data = df.loc[df[target_col] == 0, feature].dropna()
        defect_data = df.loc[df[target_col] == 1, feature].dropna()

        stats_dict['normal_mean'] = normal_data.mean()
        stats_dict['defect_mean'] = defect_data.mean()
        stats_dict['normal_std'] = normal_data.std()
        stats_dict['defect_std'] = defect_data.std()

        # Normality tests
        stats_dict['normal_shapiro_p'] = _shapiro_pvalue(normal_data)
        stats_dict['defect_shapiro_p'] = _shapiro_pvalue(defect_data)

        # Variance test
        _, levene_p = stats.levene(normal_data, defect_data)
        stats_dict['levene_p'] = levene_p

        # Mean difference test: pooled-variance t-test only when Levene does
        # not reject equal variances, Welch's t-test otherwise.
        _, ttest_p = stats.ttest_ind(normal_data, defect_data, equal_var=(levene_p > 0.05))
        stats_dict['ttest_p'] = ttest_p

        # Mann-Whitney U test (non-parametric)
        _, mannwhitney_p = stats.mannwhitneyu(normal_data, defect_data, alternative='two-sided')
        stats_dict['mannwhitney_p'] = mannwhitney_p

        univariate_stats[feature] = stats_dict

    return pd.DataFrame(univariate_stats).T

# Perform univariate analysis
univariate_stats = perform_univariate_analysis(df, top_n=15)

print("Univariate Statistics for Top 15 Features:")
display(univariate_stats[['mean', 'std', 'skewness', 'kurtosis', 'normal_mean', 'defect_mean', 'ttest_p', 'mannwhitney_p']].round(4))

# Visualize distributions of top features
top_univariate_features = univariate_stats.index[:12].tolist()

fig, axes = plt.subplots(3, 4, figsize=(20, 15))
axes = axes.ravel()

for ax, feature in zip(axes, top_univariate_features):
    # Create box plots
    box_data = [
        df[df['defect'] == 0][feature].dropna(),
        df[df['defect'] == 1][feature].dropna()
    ]

    # Tick labels are set separately: the 'labels' keyword of boxplot is
    # deprecated in recent Matplotlib, while set_xticklabels works everywhere.
    bp = ax.boxplot(box_data, patch_artist=True)
    ax.set_xticklabels(['Normal', 'Defect'])
    bp['boxes'][0].set_facecolor('lightblue')
    bp['boxes'][1].set_facecolor('lightcoral')

    ax.set_title(f'{feature}\np-value: {univariate_stats.loc[feature, "mannwhitney_p"]:.4f}')
    ax.set_ylabel(feature)
    ax.grid(True, alpha=0.3)

# Hide the panels that stay empty when fewer than twelve features exist.
for unused_ax in axes[len(top_univariate_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Univariate Analysis: Box Plots for Top Features', fontsize=16, y=1.02)
plt.show()

# Statistical significance summary
significant_features = univariate_stats[univariate_stats['mannwhitney_p'] < 0.05]
# Guard the ratio so an empty statistics table does not divide by zero.
significant_share = len(significant_features) / len(univariate_stats) * 100 if len(univariate_stats) else 0.0
print(f"\nStatistically significant features (p < 0.05): {len(significant_features)}")
print(f"Percentage of significant features: {significant_share:.1f}%")

In [None]:
# === BIVARIATE ANALYSIS ===
print("5. BIVARIATE ANALYSIS")
print("="*25)

def perform_bivariate_analysis(df, target_col='defect', top_n=8):
    """
    Plot pairwise relationships of the top features and compare their correlation by class.

    The top ``top_n`` features are read from the notebook-level
    ``importance_results``. A seaborn pair plot coloured by class is shown,
    then for every feature pair the Pearson correlation is computed
    separately within the normal (``target_col == 0``) and defect
    (``target_col == 1``) rows.

    Args:
        df: DataFrame holding the features and a 0/1 target column.
        target_col: Name of the target column.
        top_n: Number of leading features to include.

    Returns:
        DataFrame indexed by ``"<feature1>_x_<feature2>"`` with the columns
        ``normal_correlation``, ``defect_correlation`` and
        ``correlation_difference`` (absolute difference of the two).
    """
    from itertools import combinations

    top_features = importance_results.head(top_n).index.tolist()

    # Create pair plots for top features
    plot_df = df[top_features + [target_col]].copy()
    plot_df[target_col] = plot_df[target_col].map({0: 'Normal', 1: 'Defect'})

    # Seaborn pairplot. It creates its own figure, so no plt.figure() call is
    # made beforehand (that would only leave an empty extra figure behind).
    g = sns.pairplot(
        plot_df,
        hue=target_col,
        diag_kind='hist',
        plot_kws={'alpha': 0.6},
        diag_kws={'alpha': 0.7}
    )
    g.fig.suptitle('Bivariate Analysis: Feature Pairs Colored by Target', fontsize=16, y=1.02)
    plt.show()

    # Feature interaction analysis
    # The class split does not depend on the feature pair, so do it once.
    normal_data = df[df[target_col] == 0]
    defect_data = df[df[target_col] == 1]

    interaction_results = {}

    for feature1, feature2 in combinations(top_features, 2):
        # Correlation within groups
        normal_corr = normal_data[[feature1, feature2]].corr().iloc[0, 1]
        defect_corr = defect_data[[feature1, feature2]].corr().iloc[0, 1]

        interaction_results[f"{feature1}_x_{feature2}"] = {
            'normal_correlation': normal_corr,
            'defect_correlation': defect_corr,
            'correlation_difference': abs(defect_corr - normal_corr)
        }

    return pd.DataFrame(interaction_results).T

from itertools import combinations

# Reset target column if changed
df['defect'] = y  # Ensure numeric target

bivariate_top_n = 6
bivariate_results = perform_bivariate_analysis(df, top_n=bivariate_top_n)

print("Feature Interaction Analysis (Top 6 Features):")
bivariate_sorted = bivariate_results.sort_values('correlation_difference', ascending=False)
display(bivariate_sorted.head(10))

# Scatter plots for most interesting pairs
top_interactions = bivariate_sorted.head(4).index.tolist()

# Map every interaction label back to its feature pair. Splitting the label
# on '_x_' alone would fail for feature names that themselves contain '_x_'.
pair_by_label = {
    f"{first}_x_{second}": (first, second)
    for first, second in combinations(importance_results.head(bivariate_top_n).index, 2)
}

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

# Scatter plot colored by target
normal_idx = df['defect'] == 0
defect_idx = df['defect'] == 1

for ax, interaction in zip(axes, top_interactions):
    feature1, feature2 = pair_by_label.get(interaction) or interaction.split('_x_')

    ax.scatter(df.loc[normal_idx, feature1], df.loc[normal_idx, feature2],
               alpha=0.6, label='Normal', color='blue')
    ax.scatter(df.loc[defect_idx, feature1], df.loc[defect_idx, feature2],
               alpha=0.6, label='Defect', color='red')

    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)
    ax.set_title(f'{feature1} vs {feature2}\nCorr Diff: {bivariate_results.loc[interaction, "correlation_difference"]:.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the panels that stay empty when fewer than four pairs exist.
for unused_ax in axes[len(top_interactions):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Bivariate Analysis: Feature Interactions', fontsize=16, y=1.02)
plt.show()

In [None]:
# === FEATURE SELECTION RECOMMENDATIONS ===
print("6. FEATURE SELECTION RECOMMENDATIONS")
print("="*40)

def generate_feature_recommendations(importance_results, correlation_results, univariate_stats, threshold_importance=0.1, threshold_correlation=0.8):
    """
    Combine importance, significance and redundancy into feature recommendations.

    Args:
        importance_results: DataFrame indexed by feature with a
            ``composite_importance`` column.
        correlation_results: Square correlation matrix; it is scanned with
            ``find_high_correlations``.
        univariate_stats: DataFrame indexed by feature with a
            ``mannwhitney_p`` column.
        threshold_importance: A feature counts as highly important when its
            composite importance is strictly above this value.
        threshold_correlation: Two features count as redundant when their
            absolute correlation is strictly above this value.

    Returns:
        Dict with the lists ``high_importance``,
        ``statistically_significant`` (p < 0.05), ``low_multicollinearity``
        and ``final_recommended`` (the intersection of the three, ordered by
        descending composite importance).
    """
    recommendations = {}
    composite = importance_results['composite_importance']

    # 1. High importance features
    recommendations['high_importance'] = importance_results[composite > threshold_importance].index.tolist()

    # 2. Statistically significant features
    significant_features = univariate_stats[univariate_stats['mannwhitney_p'] < 0.05].index.tolist()
    recommendations['statistically_significant'] = significant_features

    # 3. Low correlation features (avoid multicollinearity)
    high_corr_pairs = find_high_correlations(correlation_results, threshold=threshold_correlation)

    # Record, per feature, which other ranked features it is highly
    # correlated with. Pairs involving unranked columns are ignored.
    ranked = set(importance_results.index)
    correlated_with = {}
    for _, row in high_corr_pairs.iterrows():
        feature1, feature2 = row['feature1'], row['feature2']
        if feature1 in ranked and feature2 in ranked:
            correlated_with.setdefault(feature1, set()).add(feature2)
            correlated_with.setdefault(feature2, set()).add(feature1)

    # Visit features from most to least important and drop one only when it
    # is redundant with a feature that is actually kept. Comparing pairs in
    # isolation would also drop features whose only correlated partner had
    # itself been dropped already.
    kept = set()
    features_to_remove = set()
    for feature in composite.sort_values(ascending=False, kind='mergesort').index:
        if correlated_with.get(feature, set()) & kept:
            features_to_remove.add(feature)
        else:
            kept.add(feature)

    low_corr_features = [f for f in importance_results.index if f not in features_to_remove]
    recommendations['low_multicollinearity'] = low_corr_features

    # 4. Combined recommendations
    # Filtering an ordered list (instead of intersecting sets) keeps the
    # order of equally important features reproducible between runs.
    important = set(recommendations['high_importance'])
    significant = set(recommendations['statistically_significant'])
    final_features = [f for f in low_corr_features if f in important and f in significant]

    recommendations['final_recommended'] = sorted(final_features,
                                                 key=lambda x: importance_results.loc[x, 'composite_importance'],
                                                 reverse=True)

    return recommendations

# Generate recommendations
feature_recommendations = generate_feature_recommendations(
    importance_results, 
    pearson_corr, 
    univariate_stats,
    threshold_importance=0.05,
    threshold_correlation=0.8
)

print("FEATURE SELECTION RECOMMENDATIONS")
print("="*40)

max_listed = 20  # longer lists are truncated with a "... and N more" line

for category, features in feature_recommendations.items():
    print(f"\n{category.replace('_', ' ').title()}: {len(features)} features")
    for i, feature in enumerate(features[:max_listed], 1):
        # Format the score only when one exists; applying ':.3f' to the
        # 'N/A' placeholder would raise a ValueError.
        if feature in importance_results.index:
            importance_text = f"{importance_results.loc[feature, 'composite_importance']:.3f}"
        else:
            importance_text = 'N/A'
        print(f"  {i:2d}. {feature} (importance: {importance_text})")
    if len(features) > max_listed:
        print(f"      ... and {len(features) - max_listed} more")
# Feature selection performance comparison
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def evaluate_feature_sets(X, y, feature_sets, cv=5):
    """
    Cross-validate a random forest on each named feature subset.

    Args:
        X: Feature DataFrame containing every column named in ``feature_sets``.
        y: Target labels aligned with ``X``.
        feature_sets: Mapping of set name to a list of column names.
            Empty lists are skipped and get no entry in the result.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        Dict mapping set name to ``n_features``, ``mean_auc``, ``std_auc``
        and the per-fold ``scores`` (ROC AUC).
    """
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    summary = {}

    for set_name, features in feature_sets.items():
        if len(features) == 0:
            continue

        fold_scores = cross_val_score(model, X[features], y, cv=cv, scoring='roc_auc')
        summary[set_name] = dict(
            n_features=len(features),
            mean_auc=fold_scores.mean(),
            std_auc=fold_scores.std(),
            scores=fold_scores,
        )

    return summary

# Candidate feature sets to compare
evaluation_sets = dict(
    all_features=X.columns.tolist(),
    top_10_importance=importance_results.head(10).index.tolist(),
    top_20_importance=importance_results.head(20).index.tolist(),
    final_recommended=feature_recommendations['final_recommended'],
)

evaluation_results = evaluate_feature_sets(X, y, evaluation_sets)

print("\n\nFEATURE SET PERFORMANCE COMPARISON")
print("="*40)

# One row per feature set, holding only the scalar summary fields
summary_fields = ('n_features', 'mean_auc', 'std_auc')
performance_df = pd.DataFrame({
    name: {field: results[field] for field in summary_fields}
    for name, results in evaluation_results.items()
}).T

performance_df = performance_df.sort_values('mean_auc', ascending=False)
display(performance_df)

# Visualize performance comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: mean AUC per feature set, with the fold spread as error bars
bar_positions = range(len(performance_df))
ax1.bar(bar_positions, performance_df['mean_auc'],
        yerr=performance_df['std_auc'], capsize=5)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(performance_df.index, rotation=45)
ax1.set_ylabel('Cross-Validation AUC')
ax1.set_title('Feature Set Performance Comparison')
ax1.grid(True, alpha=0.3)

# Right panel: feature count against AUC, each point labelled with its set name
ax2.scatter(performance_df['n_features'], performance_df['mean_auc'], s=100)
labelled_points = zip(performance_df.index, performance_df['n_features'], performance_df['mean_auc'])
for set_name, n_features, mean_auc in labelled_points:
    ax2.annotate(set_name, (n_features, mean_auc),
                 xytext=(5, 5), textcoords='offset points')
ax2.set_xlabel('Number of Features')
ax2.set_ylabel('Cross-Validation AUC')
ax2.set_title('Features vs Performance Trade-off')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# === SUMMARY AND CONCLUSIONS ===
print("7. SUMMARY AND CONCLUSIONS")
print("="*30)

print("FEATURE ANALYSIS SUMMARY")
print("="*25)

# Key findings
print("📊 Dataset Overview:")
print(f"   • Total features analyzed: {len(X.columns)}")
print(f"   • Numeric features: {len(numeric_features)}")
print(f"   • Categorical features: {len(categorical_features)}")
print(f"   • Class distribution: {(y==0).sum()} normal, {(y==1).sum()} defect")

print("\n🔍 Distribution Analysis:")
# Use the same 0.8 cut-off that labels an effect 'large' in the distribution
# analysis; counting from 0.5 would also include the 'medium' effects.
large_effect_count = int((distribution_results['abs_cohens_d'] >= 0.8).sum())
print(f"   • Features with large effect sizes (|Cohen's d| >= 0.8): {large_effect_count}")
print(f"   • Most discriminative feature: {distribution_results.index[0]} (Cohen's d = {distribution_results.iloc[0]['cohens_d']:.3f})")

print("\n📈 Correlation Analysis:")
print(f"   • Features highly correlated with target (|r| > 0.1): {len(target_correlations[target_correlations['abs_pearson'] > 0.1])}")
print(f"   • Highest target correlation: {target_correlations.index[0]} (r = {target_correlations.iloc[0]['pearson']:.3f})")
print(f"   • High correlation pairs (potential multicollinearity): {len(high_corr_pairs)}")

print("\n🎯 Feature Importance:")
print(f"   • Top feature by composite importance: {importance_results.index[0]} (score: {importance_results.iloc[0]['composite_importance']:.3f})")
print(f"   • Features with high importance (score > 0.1): {len(importance_results[importance_results['composite_importance'] > 0.1])}")

print("\n📊 Statistical Significance:")
print(f"   • Statistically significant features (p < 0.05): {len(univariate_stats[univariate_stats['mannwhitney_p'] < 0.05])}")
print(f"   • Features with very high significance (p < 0.001): {len(univariate_stats[univariate_stats['mannwhitney_p'] < 0.001])}")

print("\n✅ Final Recommendations:")
print(f"   • Recommended feature set size: {len(feature_recommendations['final_recommended'])}")
# evaluate_feature_sets skips empty feature sets, so an entry may be missing.
recommended_eval = evaluation_results.get('final_recommended')
baseline_eval = evaluation_results.get('all_features')
if recommended_eval is None:
    print("   • Expected performance with recommended features: not evaluated (no features were recommended)")
else:
    print(f"   • Expected performance with recommended features: {recommended_eval['mean_auc']:.3f} ± {recommended_eval['std_auc']:.3f} AUC")
    if baseline_eval is not None:
        print(f"   • Performance vs all features: {recommended_eval['mean_auc'] - baseline_eval['mean_auc']:+.3f}")

print("\n📋 Top 10 Recommended Features:")
for i, feature in enumerate(feature_recommendations['final_recommended'][:10], 1):
    importance_score = importance_results.loc[feature, 'composite_importance']
    print(f"   {i:2d}. {feature}")
    print(f"       • Importance: {importance_score:.3f}")
    if feature in target_correlations.index:
        print(f"       • Target correlation: {target_correlations.loc[feature, 'pearson']:.3f}")
    else:
        print("       • Target correlation: N/A")

print("\n🎯 Next Steps:")
print("   1. Use recommended feature set for model training")
print("   2. Consider feature engineering for top features")
print("   3. Monitor for concept drift in important features")
print("   4. Investigate domain meaning of top features")
print("   5. Consider interaction terms for bivariate analysis")
print("   6. Validate findings on holdout test set")
print("   7. Document feature importance for model interpretability")

print(f"\n📊 Analysis Complete: {len(X.columns)} features analyzed, {len(feature_recommendations['final_recommended'])} recommended for modeling")