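> **Update:** The cross-validation workflow below now draws from the real autism MFCC features stored in the project-level `features` directory (`FEATURES_DIR` in the helper cell, i.e. two levels above the notebook's working directory: `../../features`). Execute the helper cell that follows to build `X_full` and `y_full` from the recordings before running the rest of the notebook.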


In [None]:
import os
from pathlib import Path

import numpy as np

# All locations are derived from the directory the notebook is executed in,
# walking up two levels to reach the folder that contains "features".
NOTEBOOK_DIR = Path(".").resolve()
ASD_ROOT = NOTEBOOK_DIR.parent
PROJECT_ROOT = ASD_ROOT.parent
FEATURES_DIR = PROJECT_ROOT.joinpath("features")

# List the directory once, then split the names by prefix. Files starting
# with "aut_" and "split-" feed the two groups loaded further down.
_feature_entries = os.listdir(FEATURES_DIR)
AUT_FILES = sorted(name for name in _feature_entries if name.startswith("aut_"))
NON_FILES = sorted(name for name in _feature_entries if name.startswith("split-"))


def load_features(file_list):
    """Load stored feature arrays and reduce each one to a single row.

    Every name in ``file_list`` is read from ``FEATURES_DIR`` with ``np.load``,
    averaged along axis 1, and the resulting vectors are stacked row-wise.

    Args:
        file_list: Iterable of file names located inside ``FEATURES_DIR``.

    Returns:
        A 2-D array with one row per entry of ``file_list``.

    Raises:
        ValueError: If ``file_list`` is empty (nothing matched the expected
            prefix), instead of the opaque error ``np.vstack`` would raise.
    """
    file_list = list(file_list)
    if not file_list:
        raise ValueError(
            "No feature files to load: file_list is empty "
            "(check FEATURES_DIR and the file-name prefixes)."
        )
    # assumes each stored array is laid out as (coefficients, frames), so the
    # axis-1 mean collapses the time axis -- TODO confirm against the extractor
    return np.vstack([
        np.mean(np.load(FEATURES_DIR / name), axis=1)
        for name in file_list
    ])

# Load both groups, then stack them into one matrix: autism rows first,
# non-autism rows second.
X_aut, X_non = (load_features(names) for names in (AUT_FILES, NON_FILES))
X_full = np.concatenate((X_aut, X_non), axis=0)

# Labels follow the same row order: 1.0 for autism, 0.0 for non-autism.
y_full = np.concatenate((np.ones(len(X_aut)), np.zeros(len(X_non))))

print(
    f"Loaded {len(X_full)} samples from {FEATURES_DIR}\n"
    f"  Autism: {len(X_aut)} | Non-autism: {len(X_non)}"
)


# Notebook 03: K-Fold Cross-Validation
## Robust Model Evaluation with Multiple Folds

This notebook demonstrates **K-Fold cross-validation** for the ASD/ADHD detection model.

### Objectives
- Split data into K folds for robust evaluation
- Train independent models on each fold
- Compute fold-specific and aggregated metrics
- Calculate confidence intervals
- Visualize per-fold performance
- Generate comparative analysis across folds

### What You'll Learn
- How K-Fold improves model reliability
- Per-fold performance interpretation
- Aggregated metrics and confidence bounds
- Statistical significance of results

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import json
import pickle
import warnings

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices emitted by the ML libraries.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# (only NumPy's global RNG is seeded by this line).
np.random.seed(42)

# Add paths
# The project root defaults to the original location but can be overridden
# with the ASD_PROJECT_ROOT environment variable, so the notebook is not tied
# to one machine's drive layout.
project_root = os.environ.get('ASD_PROJECT_ROOT', r'f:\AIML\ASD_ADHD_Detection')
src_path = os.path.join(project_root, 'src')
# Insert only once so re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

print("Environment setup complete!")
print(f"Project root: {project_root}")

In [None]:
# Deep-learning stack: TensorFlow and its bundled Keras API are used to build
# and train the per-fold networks.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks
# scikit-learn supplies the fold splitter, feature scaling and the metrics.
# NOTE(review): cross_validate, accuracy_score and f1_score are not referenced
# in the cells visible in this review -- confirm whether they are still needed.
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support, f1_score
)

print("✓ All imports successful!")

## Section 1: Load Data
Load the precomputed data splits from disk.

In [None]:
# Directory holding the precomputed split arrays. It defaults to the original
# location; set the ASD_DATA_DIR environment variable to point elsewhere
# without editing the notebook.
# NOTE(review): the X_full / y_full arrays built by the helper cell at the top
# of the notebook are not used here -- confirm which data source is intended.
data_dir = os.environ.get('ASD_DATA_DIR', r'f:\AIML\data')


def _load_split(name):
    """Load ``<name>.npy`` from ``data_dir``."""
    return np.load(os.path.join(data_dir, f'{name}.npy'))


X_train = _load_split('X_train')
X_val = _load_split('X_val')
X_test = _load_split('X_test')
y_train = _load_split('y_train')
y_val = _load_split('y_val')
y_test = _load_split('y_test')

# Combine training and validation for K-Fold
X_combined = np.vstack([X_train, X_val])
y_combined = np.concatenate([y_train, y_val])

# Fail fast if the files on disk do not belong together.
assert len(X_combined) == len(y_combined), "feature/label row counts differ (train+val)"
assert len(X_test) == len(y_test), "feature/label row counts differ (test)"
assert X_test.shape[1:] == X_combined.shape[1:], "train and test feature dimensions differ"

# Count every class in a single pass instead of rescanning the labels per class.
classes, class_counts = np.unique(y_combined, return_counts=True)

print(f"Combined training data shape: {X_combined.shape}")
print(f"Combined labels shape: {y_combined.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Number of classes: {len(classes)}")
print(f"\nClass distribution (combined):")
for cls, count in zip(classes, class_counts):
    pct = 100 * count / len(y_combined)
    print(f"  Class {cls}: {count} samples ({pct:.1f}%)")

## Section 2: Setup K-Fold Cross-Validation
Configure and initialize Stratified K-Fold for robust evaluation.

In [None]:
# K-Fold configuration
n_splits = 5
random_state = 42

# Shuffled, stratified splitter; the fixed random_state makes every call to
# skf.split() yield the same partitions.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

print(f"K-Fold Configuration:")
print(f"  Number of splits: {n_splits}")
print(f"  Stratified: Yes (maintains class distribution)")
print(f"  Random state: {random_state}")
print(f"\nFold details:")

# Dry run over the splitter purely to record and report each partition's size.
fold_info = [
    {'fold': fold_number, 'train_size': len(train_part), 'val_size': len(val_part)}
    for fold_number, (train_part, val_part) in enumerate(skf.split(X_combined, y_combined), start=1)
]
for info in fold_info:
    train_size, val_size = info['train_size'], info['val_size']
    print(f"  Fold {info['fold']}: train={train_size}, val={val_size} (split={train_size/(train_size+val_size):.1%})")

## Section 3: Model Builder Function
Define a function that builds and compiles a fresh model for each fold (training happens in Section 4).

In [None]:
def build_model(input_dim, n_classes):
    """Build and compile a fully connected softmax classifier.

    The network stacks three Dense -> BatchNormalization -> ReLU -> Dropout
    blocks (256, 128 and 64 units, each Dense layer L2-regularised with 0.001)
    and ends in a softmax layer with one unit per class.

    Args:
        input_dim: Number of input features per sample.
        n_classes: Number of output units. Labels must be integer class
            indices because the loss is sparse categorical cross-entropy.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 0.001) and
        reporting accuracy.
    """
    # (units, dropout rate) of each hidden block, in order.
    hidden_blocks = [(256, 0.3), (128, 0.3), (64, 0.2)]

    # Declare the input with an explicit Input object instead of the
    # deprecated ``input_dim`` keyword on the first Dense layer.
    model = models.Sequential([keras.Input(shape=(input_dim,))])

    for units, dropout_rate in hidden_blocks:
        model.add(layers.Dense(units, kernel_regularizer=keras.regularizers.l2(0.001)))
        model.add(layers.BatchNormalization())
        # The activation is applied after normalisation, hence a separate layer.
        model.add(layers.Activation('relu'))
        model.add(layers.Dropout(dropout_rate))

    model.add(layers.Dense(n_classes, activation='softmax'))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

print("✓ Model builder function defined")

## Section 4: Train Models Across K Folds
Train an independent model on each fold and evaluate on the held-out validation set.

In [None]:
%%time

# Storage for results
fold_results = []
fold_models = []
fold_histories = []

input_dim = X_combined.shape[1]
n_classes = len(np.unique(y_combined))

print(f"Training {n_splits} models across K folds...\n")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_combined, y_combined)):
    print(f"\n{'='*70}")
    print(f"FOLD {fold_idx + 1}/{n_splits}")
    print(f"{'='*70}")
    
    # Split data
    X_train_fold = X_combined[train_idx]
    y_train_fold = y_combined[train_idx]
    X_val_fold = X_combined[val_idx]
    y_val_fold = y_combined[val_idx]
    
    # Normalize
    scaler = StandardScaler()
    X_train_fold_norm = scaler.fit_transform(X_train_fold)
    X_val_fold_norm = scaler.transform(X_val_fold)
    
    # Build model
    model = build_model(input_dim, n_classes)
    
    # Train
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0)
    lr_schedule = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=0)
    
    history = model.fit(
        X_train_fold_norm, y_train_fold,
        validation_data=(X_val_fold_norm, y_val_fold),
        epochs=50,
        batch_size=32,
        callbacks=[early_stop, lr_schedule],
        verbose=0
    )
    
    # Evaluate
    val_loss, val_acc = model.evaluate(X_val_fold_norm, y_val_fold, verbose=0)
    y_pred = model.predict(X_val_fold_norm, verbose=0).argmax(axis=1)
    
    # Metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_val_fold, y_pred, average='weighted')
    
    result = {
        'fold': fold_idx + 1,
        'train_size': len(train_idx),
        'val_size': len(val_idx),
        'epochs_trained': len(history.history['loss']),
        'val_accuracy': float(val_acc),
        'val_loss': float(val_loss),
        'val_precision': float(precision),
        'val_recall': float(recall),
        'val_f1': float(f1),
    }
    
    fold_results.append(result)
    fold_models.append(model)
    fold_histories.append(history)
    
    print(f"Epochs: {result['epochs_trained']} | Accuracy: {val_acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

print(f"\n{'='*70}")
print(f"K-Fold Training Complete!")
print(f"{'='*70}")

## Section 5: Fold Results Summary
Display aggregated metrics across all folds.

In [None]:
from scipy import stats

# Convert to DataFrame for easy viewing
df_results = pd.DataFrame(fold_results)

print("\nPer-Fold Results:")
print(df_results.to_string(index=False))

# Aggregate metrics
print(f"\n{'='*70}")
print("AGGREGATED METRICS ACROSS K FOLDS")
print(f"{'='*70}")

metrics_to_agg = ['val_accuracy', 'val_precision', 'val_recall', 'val_f1']
for metric in metrics_to_agg:
    values = df_results[metric].values
    n_folds = len(values)
    mean = np.mean(values)
    if n_folds > 1:
        # Sample standard deviation (ddof=1): the folds are a small sample,
        # and this matches pandas' default .std().
        std = np.std(values, ddof=1)
        # With only a handful of folds the normal quantile (1.96) gives an
        # interval that is too narrow; use the Student-t quantile instead.
        t_crit = stats.t.ppf(0.975, df=n_folds - 1)
        half_width = t_crit * std / np.sqrt(n_folds)
    else:
        # A single fold carries no spread information.
        std = float('nan')
        half_width = float('nan')
    # NOTE: fold scores share training data and are not independent, so this
    # interval is an approximation rather than an exact 95% bound.
    ci_lower = mean - half_width
    ci_upper = mean + half_width
    print(f"\n{metric}:")
    print(f"  Mean:              {mean:.4f}")
    print(f"  Std Dev:           {std:.4f}")
    print(f"  95% CI:            [{ci_lower:.4f}, {ci_upper:.4f}]")
    print(f"  Per-fold values:   {[f'{v:.4f}' for v in values]}")

## Section 6: Visualizations
Create visualizations for per-fold performance and cross-fold comparisons.

In [None]:
sns.set_style("whitegrid")

# Plot 1: Per-Fold Metrics Comparison
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(fold_results))
width = 0.2

# One bar group per fold; each entry is
# (results column, legend label, horizontal offset in units of `width`).
bar_specs = [
    ('val_accuracy', 'Accuracy', -1.5),
    ('val_precision', 'Precision', -0.5),
    ('val_recall', 'Recall', 0.5),
    ('val_f1', 'F1-Score', 1.5),
]
for column, label, offset in bar_specs:
    ax.bar(x + offset*width, df_results[column], width, label=label, alpha=0.8)

ax.set_title('Per-Fold Performance Metrics', fontsize=14, fontweight='bold')
ax.set_xlabel('Fold', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels([f'Fold {i+1}' for i in range(len(fold_results))])
ax.set_ylim([0, 1.0])
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

print("✓ Per-fold metrics visualization complete")

In [None]:
# Plot 2: Distribution with Error Bars
metrics = ['val_accuracy', 'val_precision', 'val_recall', 'val_f1']
labels = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

# Mean and (pandas default, sample) standard deviation of each metric.
means, stds = [], []
for m in metrics:
    column = df_results[m]
    means.append(column.mean())
    stds.append(column.std())

fig, ax = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(labels))
bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
ax.bar(x_pos, means, yerr=stds, capsize=10, alpha=0.7, color=bar_colors)

ax.set_title('Mean Metrics Across K-Folds (with Std Dev)', fontsize=14, fontweight='bold')
ax.set_ylabel('Score', fontsize=12)
ax.set_xticks(x_pos)
ax.set_xticklabels(labels, fontsize=11)
ax.set_ylim([0, 1.0])
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars, placed just above the top of each error bar
for i in range(len(means)):
    mean, std = means[i], stds[i]
    ax.text(i, mean + std + 0.02, f'{mean:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

print("✓ Aggregated metrics visualization complete")

In [None]:
# Plot 3: Training history for all folds
# Size the grid from the number of recorded histories instead of assuming
# exactly five folds; with five folds this is still a 2x3 grid of 16x8 inches.
n_histories = len(fold_histories)
n_cols = 3
n_rows = max(1, -(-n_histories // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `axes` 2-D even for a single row, so flatten() always works.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for fold_idx, history in enumerate(fold_histories):
    ax = axes[fold_idx]
    ax.plot(history.history['accuracy'], label='Train Accuracy', linewidth=2)
    ax.plot(history.history['val_accuracy'], label='Val Accuracy', linewidth=2)
    ax.set_xlabel('Epoch', fontsize=11)
    ax.set_ylabel('Accuracy', fontsize=11)
    ax.set_title(f'Fold {fold_idx + 1} Training History', fontsize=12, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

# Hide every subplot that has no fold to show
for unused_ax in axes[n_histories:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

print("✓ Training history visualizations complete")

## Section 7: Best Model Evaluation on Test Set
Identify the best-performing fold, then retrain a fresh model with the same architecture on all combined training data and evaluate it on the held-out test set.

In [None]:
# Select best fold
# idxmax() returns the row label; with the default 0-based index the fold
# number is that label plus one.
best_fold_idx = df_results['val_accuracy'].idxmax()
best_fold_num = best_fold_idx + 1
best_accuracy = df_results.loc[best_fold_idx, 'val_accuracy']

print(f"Best fold: Fold {best_fold_num} (accuracy: {best_accuracy:.4f})")

# Seed TensorFlow's global RNG so this final training run (weight
# initialisation, dropout, batch shuffling) is repeatable on its own, no
# matter how many models were trained earlier in the session.
tf.random.set_seed(random_state)

# Re-train best model on all combined training data for final test evaluation
# NOTE(review): the fold ranking above is only reported. The model trained
# here is a fresh network with the same architecture, not the best fold's
# fitted model.
best_model_final = build_model(input_dim, n_classes)

# Normalize combined data (the test set reuses the training statistics)
scaler_final = StandardScaler()
X_combined_norm = scaler_final.fit_transform(X_combined)
X_test_norm = scaler_final.transform(X_test)

# No validation data is passed to fit(), so early stopping watches the
# training loss.
early_stop = callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True, verbose=0)
history_final = best_model_final.fit(
    X_combined_norm, y_combined,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Evaluate on test set
test_loss, test_acc = best_model_final.evaluate(X_test_norm, y_test, verbose=0)
y_pred_test = best_model_final.predict(X_test_norm, verbose=0).argmax(axis=1)

print(f"\n{'='*70}")
print("TEST SET EVALUATION (Best Model Trained on All K-Fold Data)")
print(f"{'='*70}")
print(f"Test Loss:     {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

print("\nTest Classification Report:")
print(classification_report(y_test, y_pred_test))

cm_test = confusion_matrix(y_test, y_pred_test)
print("\nConfusion Matrix (Test):")
print(cm_test)

In [None]:
# Test confusion matrix visualization
# Drawn through an explicit figure/axes pair; rows are true classes and
# columns are predicted classes, both labelled by class index.
fig, ax = plt.subplots(figsize=(10, 8))
class_ticks = range(n_classes)
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=class_ticks, yticklabels=class_ticks, ax=ax)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('True', fontsize=12)
ax.set_title('Confusion Matrix (Test Set - Best K-Fold Model)', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()