> **Update:** Feature analysis now uses the real MFCC features generated from the autism recordings. Execute the helper cell below to build `X_full` and `y_full` before running the rest of the notebook.


In [None]:
import os
from pathlib import Path

import numpy as np

# Directory layout is derived from the current working directory:
# <project root>/<ASD root>/<notebook dir>, with features stored at
# <project root>/features.
NOTEBOOK_DIR = Path(".").resolve()
ASD_ROOT = NOTEBOOK_DIR.parent
PROJECT_ROOT = ASD_ROOT.parent
FEATURES_DIR = PROJECT_ROOT / "features"

# List the directory once (sorted), then split by filename prefix.
# Filtering an already-sorted list keeps each group in sorted order.
_feature_file_names = sorted(os.listdir(FEATURES_DIR))
AUT_FILES = [fname for fname in _feature_file_names if fname.startswith("aut_")]
NON_FILES = [fname for fname in _feature_file_names if fname.startswith("split-")]


def load_features(file_list):
    """Load feature arrays by name and reduce each one to a single row.

    Every entry of ``file_list`` is loaded with ``np.load`` from
    ``FEATURES_DIR``, averaged over axis 1, and the resulting vectors are
    stacked row-wise into one matrix.

    Args:
        file_list: Iterable of file names located inside ``FEATURES_DIR``.

    Returns:
        np.ndarray with one row per file.

    Raises:
        ValueError: If ``file_list`` is empty (nothing to stack).
    """
    names = list(file_list)
    if not names:
        # Without this guard np.vstack([]) fails with the much less helpful
        # "need at least one array to concatenate".
        raise ValueError(
            "load_features() received an empty file list: "
            f"no feature files matched in {FEATURES_DIR}"
        )

    # NOTE(review): presumably each file holds a (coefficients, frames) array,
    # so the axis-1 mean yields one value per coefficient -- confirm.
    rows = [np.mean(np.load(FEATURES_DIR / name), axis=1) for name in names]
    return np.vstack(rows)

# Per-class feature matrices: one row per file.
X_aut = load_features(AUT_FILES)
X_non = load_features(NON_FILES)

# Stack the classes (autism rows first) and build matching labels:
# 1.0 for every X_aut row, 0.0 for every X_non row.
X_full = np.vstack((X_aut, X_non))
y_full = np.hstack((np.ones(len(X_aut)), np.zeros(len(X_non))))

n_autism = int(y_full.sum())
n_non_autism = len(y_full) - n_autism
print(
    f"Feature matrix shape: {X_full.shape}\n"
    f"Label distribution: Autism={n_autism}, Non-autism={n_non_autism}"
)



# Notebook 05: Feature Analysis & Selection
## Feature Importance, Correlation Analysis, and Feature Selection

This notebook demonstrates **feature engineering and analysis** for the ASD/ADHD detection model.

### Objectives
- Analyze feature statistics and distributions
- Compute feature correlations with target variable
- Perform permutation importance analysis
- Apply feature selection techniques (SelectKBest, RFE)
- Evaluate model performance with reduced features
- Visualize feature importance rankings

### Content
- Feature statistics and distributions
- Correlation heatmaps
- Permutation-based importance
- Wrapper-based feature selection
- Model comparison with/without feature selection
- Feature-target relationship analysis

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global RNG so stochastic steps are repeatable.
np.random.seed(42)

# Make the project's `src` package importable.
# NOTE(review): this is a fixed absolute location built from the drive of '/';
# confirm it matches the machine the notebook runs on.
project_root = f"{Path('/').drive}/AIML/ASD_ADHD_Detection"
src_dir = Path(project_root) / 'src'
sys.path.insert(0, str(src_dir))

print("✓ Environment setup complete")

In [None]:
# Deep-learning stack
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers

# Classical ML utilities
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
# permutation_importance is provided by sklearn.inspection; importing it from
# sklearn.feature_selection raises ImportError and stops the notebook here.
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Correlation statistics
from scipy.stats import spearmanr, pearsonr

print("✓ All imports successful")

## Section 1: Load Data

In [None]:
# Directory holding the pre-split arrays (kept as a plain string).
data_dir = Path('/').drive + '/AIML/data'

# Read the three splits in order: all feature arrays first, then all labels.
split_names = ('train', 'val', 'test')
X_train, X_val, X_test = [np.load(f"{data_dir}/X_{split}.npy") for split in split_names]
y_train, y_val, y_test = [np.load(f"{data_dir}/y_{split}.npy") for split in split_names]

# Stack the splits in train -> val -> test order for whole-dataset analysis.
X_combined = np.vstack((X_train, X_val, X_test))
y_combined = np.concatenate((y_train, y_val, y_test))

n_features = X_train.shape[1]
n_samples = X_combined.shape[0]

print("Data loaded:")
for label, value in (
    ("Features", n_features),
    ("Train samples", len(X_train)),
    ("Val samples", len(X_val)),
    ("Test samples", len(X_test)),
    ("Total samples", n_samples),
):
    print(f"  {label}: {value}")

## Section 2: Feature Statistics & Distributions

In [None]:
def _describe_column(col_idx):
    """Return summary statistics for one feature column of X_combined."""
    column = X_combined[:, col_idx]
    return {
        'feature_idx': col_idx,
        'mean': np.mean(column),
        'std': np.std(column),
        'min': np.min(column),
        'max': np.max(column),
        'median': np.median(column),
    }


# One summary row per feature.
feature_stats = [_describe_column(col_idx) for col_idx in range(n_features)]
df_stats = pd.DataFrame(feature_stats)

print("Feature Statistics:")
print(df_stats.to_string(index=False))

# Histograms for up to the first 9 features on a 3x3 grid; zip() stops at
# whichever runs out first, the features or the nine axes.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(14, 10))
axes = axes.ravel()

for col_idx, panel in zip(range(n_features), axes):
    panel.hist(X_combined[:, col_idx], bins=30, alpha=0.7, edgecolor='black')
    panel.set_title(f'Feature {col_idx} Distribution', fontsize=11, fontweight='bold')
    panel.set_xlabel('Value', fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Feature distributions visualized")

## Section 3: Feature Importance Analysis (ANOVA F-test, Mutual Information, Random Forest)

In [None]:
# Standardise the features. X_combined is stacked as [train, val, test], so the
# first len(X_train) rows are the training split. The scaler is fitted on those
# rows only and then applied to every row, so held-out rows do not influence
# the scaling statistics.
n_train = len(X_train)
scaler = StandardScaler()
scaler.fit(X_combined[:n_train])
X_combined_norm = scaler.transform(X_combined)

# Rank features using the training rows only. A later cell scores models on the
# rows after len(X_train); ranking on all rows would let those held-out rows
# choose the features they are then evaluated with.
X_rank = X_combined_norm[:n_train]
y_rank = y_combined[:n_train]


def _seeded_mutual_info(X, y):
    """mutual_info_classif with a fixed seed so the ranking is reproducible."""
    return mutual_info_classif(X, y, random_state=42)


def _scale_to_unit_max(scores):
    """Divide scores by their maximum, counting NaN scores as zero importance.

    f_classif can yield NaN (e.g. for a constant column); a plain np.max would
    propagate that NaN into every averaged importance value.
    """
    cleaned = np.nan_to_num(np.asarray(scores, dtype=float), nan=0.0)
    peak = cleaned.max() if cleaned.size else 0.0
    return cleaned / peak if peak > 0 else np.zeros_like(cleaned)


# F-score based feature selection
selector_f = SelectKBest(score_func=f_classif, k='all')
selector_f.fit(X_rank, y_rank)
scores_f = selector_f.scores_

# Mutual Information based selection
selector_mi = SelectKBest(score_func=_seeded_mutual_info, k='all')
selector_mi.fit(X_rank, y_rank)
scores_mi = selector_mi.scores_

# Random Forest importance
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_rank, y_rank)
scores_rf = rf_model.feature_importances_

# Aggregate importance scores: each metric is rescaled to a maximum of 1 and
# the three rescaled scores are averaged. Raw scores are kept in their columns.
importance_df = pd.DataFrame({
    'feature_idx': range(n_features),
    'f_score': scores_f,
    'mutual_info': scores_mi,
    'rf_importance': scores_rf,
    'avg_importance': (_scale_to_unit_max(scores_f)
                       + _scale_to_unit_max(scores_mi)
                       + _scale_to_unit_max(scores_rf)) / 3,
})

importance_df = importance_df.sort_values('avg_importance', ascending=False)

print("\nTop 15 Important Features:")
print(importance_df.head(15).to_string(index=False))

# Visualize top 20 features. All four panels show the same 20 features (the
# top 20 by averaged importance); only the plotted metric changes per panel.
top_features = importance_df.head(20)
feature_labels = top_features['feature_idx'].astype(str)

# (column, x-axis label, title, bar colour); None keeps matplotlib's default.
panels = [
    ('f_score', 'F-Score', 'Top 20 Features (F-Score)', None),
    ('mutual_info', 'Mutual Information', 'Top 20 Features (Mutual Information)', 'orange'),
    ('rf_importance', 'RF Importance', 'Top 20 Features (Random Forest)', 'green'),
    ('avg_importance', 'Average Importance', 'Top 20 Features (Averaged)', 'red'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, xlabel, title, color) in zip(axes.flat, panels):
    ax.barh(feature_labels, top_features[column], alpha=0.7, color=color)
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.invert_yaxis()  # highest-ranked feature at the top

plt.tight_layout()
plt.show()

print("✓ Feature importance analysis complete")

## Section 4: Feature Selection - Compare Models with Different Feature Counts

In [None]:
def train_model_with_features(X_tr, X_te, y_tr, y_te, selected_features):
    """Fit a random forest on a subset of feature columns and score it.

    Args:
        X_tr: Training feature matrix.
        X_te: Evaluation feature matrix.
        y_tr: Training labels.
        y_te: Evaluation labels.
        selected_features: Column indices to keep from both matrices.

    Returns:
        Tuple ``(train_acc, test_acc)`` of accuracies from ``score`` on the
        training and evaluation data respectively.
    """
    train_subset, test_subset = (
        split[:, selected_features] for split in (X_tr, X_te)
    )

    # Fixed random_state keeps the forest reproducible between runs.
    forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(train_subset, y_tr)

    return forest.score(train_subset, y_tr), forest.score(test_subset, y_te)

# Test different feature counts
feature_count_results = []
candidate_counts = [5, 10, 15, 20, 25, 30, 35, 40]

# importance_df.head(k) silently returns fewer than k rows when k exceeds the
# number of ranked features, which would repeat the same model under several
# different k values. Cap every candidate at the number of available features
# and drop the resulting duplicates.
n_ranked = len(importance_df)
feature_counts = sorted({min(k, n_ranked) for k in candidate_counts})

# The split does not depend on k, so slice once outside the loop.
# X_combined_norm / y_combined are stacked as [train, val, test]; everything
# after the training rows (val + test together) is used as the evaluation set.
X_fit, X_eval = X_combined_norm[:len(X_train)], X_combined_norm[len(X_train):]
y_fit, y_eval = y_combined[:len(y_train)], y_combined[len(y_train):]

for k in feature_counts:
    # Select top k features
    top_k_features = importance_df.head(k)['feature_idx'].values

    train_acc, test_acc = train_model_with_features(
        X_fit, X_eval, y_fit, y_eval, top_k_features
    )

    feature_count_results.append({
        'n_features': k,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        # NOTE(review): 0.5 is chance level only for a balanced binary
        # problem -- confirm the class balance before relying on this column.
        'improvement': test_acc - 0.5  # baseline
    })

    print(f"k={k:2d}: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}")

df_counts = pd.DataFrame(feature_count_results)

# Plot results
plt.figure(figsize=(12, 6))
plt.plot(df_counts['n_features'], df_counts['train_accuracy'], marker='o', label='Train Accuracy', linewidth=2)
plt.plot(df_counts['n_features'], df_counts['test_accuracy'], marker='s', label='Test Accuracy', linewidth=2)
plt.xlabel('Number of Features', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Model Performance vs. Feature Count', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✓ Feature selection analysis complete")