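> **Update:** This notebook now loads real autism recordings from `../recordings` (via the shared `features/` folder). Run the helper cell below before executing the rest of the workflow so that `X_train`, `X_val`, and the related variables point to the real dataset. Note that the Section 2 split cell reassigns `X_train`, `X_test`, `y_train`, and `y_test` from the synthetic dataset, so re-run the helper cell after it (or skip the synthetic cells) if you want to work with the real data.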


In [None]:
import os
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split

# Project layout, resolved relative to the directory the notebook is run from:
#   <PROJECT_ROOT>/<ASD_ROOT>/<NOTEBOOK_DIR>  and  <PROJECT_ROOT>/features
NOTEBOOK_DIR = Path().resolve()
ASD_ROOT = NOTEBOOK_DIR.parent
PROJECT_ROOT = ASD_ROOT.parent
FEATURES_DIR = PROJECT_ROOT.joinpath("features")

# Feature files are told apart by file-name prefix; both lists are kept
# sorted so the row order of the loaded matrices is stable between runs.
AUT_FILES = [name for name in os.listdir(FEATURES_DIR) if name.startswith("aut_")]
AUT_FILES.sort()
NON_FILES = [name for name in os.listdir(FEATURES_DIR) if name.startswith("split-")]
NON_FILES.sort()


def load_features(file_list, features_dir=None):
    """Load saved feature arrays and reduce each file to one row.

    Parameters
    ----------
    file_list : iterable of str
        Names of ``.npy`` files, relative to the features directory.
    features_dir : str or Path, optional
        Directory that holds the files. When omitted, the notebook-level
        ``FEATURES_DIR`` is used.

    Returns
    -------
    numpy.ndarray
        One row per file, each row being the mean of the loaded array
        along axis 1.

    Raises
    ------
    ValueError
        If ``file_list`` is empty (previously this surfaced as an opaque
        ``np.vstack`` error).
    """
    names = list(file_list)
    if not names:
        raise ValueError(
            "No feature files to load: file_list is empty. "
            "Check the features directory and the file-name prefixes."
        )

    base_dir = Path(FEATURES_DIR if features_dir is None else features_dir)
    # NOTE(review): assumes every file stores a 2-D array with the same first
    # dimension, so the per-file means stack into a rectangular matrix - confirm.
    return np.vstack([
        np.mean(np.load(base_dir / name), axis=1)
        for name in names
    ])

# Build the full binary dataset: rows from the "aut_" files first, then the rest.
X_aut = load_features(AUT_FILES)
X_non = load_features(NON_FILES)
X_full = np.vstack([X_aut, X_non])

# Integer class ids (1 = autism, 0 = non-autism). Integer labels are required
# by integer-only consumers such as np.bincount; float labels would raise there.
y_full = np.concatenate([
    np.ones(len(X_aut), dtype=int),
    np.zeros(len(X_non), dtype=int),
])

# First split: 80% train, 20% held out (stratified so class ratios are kept).
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, stratify=y_full
)
# Second split: divide the held-out 20% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.5, random_state=42, stratify=y_val
)

print(
    f"Dataset loaded from {FEATURES_DIR}\n"
    f"  Autism samples: {len(X_aut)}\n"
    f"  Non-autism samples: {len(X_non)}\n"
    f"  Train/Val/Test: {len(X_train)}, {len(X_val)}, {len(X_test)}"
)



# ASD/ADHD Voice Detection - Phase 2: Data Preparation & Model Training

This notebook demonstrates **step-by-step model training** from data preparation through evaluation.

**Learning Goals:**
- Prepare training data with proper normalization
- Build an MLP neural network classifier
- Train with monitoring and visualization
- Evaluate with comprehensive metrics
- Identify areas for refinement

**Key Concept:** You'll see **EXACTLY how the model learns** at each step, enabling you to refine the approach based on results.

**Timeline:**
- ~5 minutes: Data preparation
- ~10 minutes: Model training
- ~5 minutes: Evaluation
- **Total: ~20 minutes** for complete training cycle

In [None]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

sys.path.insert(0, os.path.abspath('../..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import librosa
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             confusion_matrix, classification_report, roc_curve, auc, roc_auc_score)

# TensorFlow imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Import custom modules
from ASD_ADHD_Detection.config.config import config
from ASD_ADHD_Detection.src.preprocessing.audio_preprocessor import AudioPreprocessor
from ASD_ADHD_Detection.src.feature_extraction.mfcc_extractor import MFCCExtractor
from ASD_ADHD_Detection.src.feature_extraction.spectral_extractor import SpectralExtractor
from ASD_ADHD_Detection.src.feature_extraction.prosodic_extractor import ProsodicExtractor

# Reproducibility: seed the TensorFlow and NumPy random generators identically
tf.random.set_seed(42)
np.random.seed(42)

# Plot appearance: apply the matplotlib style first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Report the environment in a single call (one line per message)
print(
    "‚úÖ All imports successful!",
    f"TensorFlow version: {tf.__version__}",
    f"Keras version: {keras.__version__}",
    sep="\n",
)

## Section 1: Create Synthetic Training Dataset

Since we don't have real patient data yet, we'll create **realistic synthetic data** that mimics the acoustic characteristics of ASD, ADHD, and healthy speech.

**Why synthetic data for learning?**
- Understand model behavior without real patient data
- Validate the pipeline end-to-end
- Generate baseline results
- Test refinements quickly

**Data characteristics:**
- **Class 0 (Healthy):** Normal pitch variation, consistent energy, clean voice
- **Class 1 (ASD):** Higher jitter/shimmer, monotone pitch, irregular energy
- **Class 2 (ADHD):** Variable speech rate, irregular energy patterns, fast speaking

In [None]:
def create_synthetic_speech(duration=5, sr=16000, voice_type='healthy'):
    """
    Create a synthetic speech-like signal for one of three voice profiles.

    The signal is a sum of seven jittered harmonics of a time-varying pitch,
    three fixed "formant" tones under a Gaussian envelope, a slow amplitude
    modulation (shimmer), a profile-dependent energy envelope and additive
    Gaussian noise. Randomness comes from the global NumPy generator, so
    call ``np.random.seed`` beforehand for reproducible output.

    Parameters:
    -----------
    duration : float
        Duration in seconds
    sr : int
        Sample rate (Hz)
    voice_type : str
        'healthy', 'asd', or 'adhd'. Any other value falls back to the
        'adhd' profile.

    Returns:
    --------
    audio : array
        1-D signal with ``int(sr * duration)`` samples, peak-normalized
        to roughly 0.9.

    Raises:
    -------
    ValueError
        If ``sr * duration`` yields no samples.
    """
    n_samples = int(sr * duration)
    if n_samples <= 0:
        raise ValueError(
            f"duration * sr must yield at least one sample (got {n_samples})"
        )

    t = np.linspace(0, duration, n_samples, False)
    audio = np.zeros_like(t)

    if voice_type == 'healthy':
        # Healthy speech: smooth pitch variation, stable energy
        f0_base = np.random.randint(100, 140)  # Base pitch
        f0 = f0_base + 30 * np.sin(2 * np.pi * 1.2 * t)  # Smooth variation
        jitter_amount = 0.002  # Small jitter
        shimmer_amount = 0.02   # Small shimmer

    elif voice_type == 'asd':
        # ASD speech: monotone pitch, high jitter/shimmer
        f0_base = np.random.randint(95, 135)
        f0 = f0_base + 5 * np.sin(2 * np.pi * 0.5 * t)  # Minimal pitch variation (monotone)
        jitter_amount = 0.015   # Higher jitter (voice instability)
        shimmer_amount = 0.08   # Higher shimmer

    else:  # adhd (also the fallback for unrecognized voice types)
        # ADHD speech: irregular pitch, variable energy, fast segments
        f0_base = np.random.randint(110, 150)
        # Irregular pitch pattern (faster oscillations, less regular)
        f0 = (f0_base + 20 * np.sin(2 * np.pi * 2.3 * t) +
              15 * np.sin(2 * np.pi * 3.7 * t) + 10 * np.random.randn(len(t)))
        jitter_amount = 0.008
        shimmer_amount = 0.05

    # Generate harmonics with pitch variations
    for harmonic in range(1, 8):
        # Add jitter (fresh random pitch perturbation for every harmonic)
        f0_jittered = f0 * (1 + jitter_amount * np.random.randn(len(t)))
        audio += (1.0 / harmonic) * np.sin(2 * np.pi * f0_jittered * harmonic * t)

    # Add formants (vocal tract resonances).
    # The envelope is centred on the middle of the clip. It used to be pinned
    # at t = 2.5 s, which is only the midpoint for the default 5 s duration and
    # left the formants essentially silent for shorter clips.
    envelope = np.exp(-np.pi * ((t - duration / 2) ** 2) / 0.5)
    for formant_freq in (300, 1100, 2300):
        audio += 0.15 * np.sin(2 * np.pi * formant_freq * t) * envelope

    # Add shimmer (slow amplitude modulation with a random phase)
    shimmer = 1 + shimmer_amount * np.sin(2 * np.pi * 0.3 * t + np.random.randn())
    audio = audio * shimmer

    # Variable energy based on voice type
    if voice_type == 'adhd':
        # ADHD: irregular energy pattern
        energy_env = 0.5 + 0.4 * np.sin(2 * np.pi * 2.1 * t) + 0.3 * np.random.randn(len(t))
    else:
        # Others: smoother energy
        energy_env = 0.7 + 0.2 * np.sin(2 * np.pi * 0.8 * t)

    # Clip so the envelope never mutes or over-amplifies the signal
    audio = audio * np.clip(energy_env, 0.3, 1.0)

    # Add low-level background noise
    noise = np.random.normal(0, 0.01, len(audio))
    audio = audio + noise

    # Normalize to a peak of ~0.9 (epsilon guards against an all-zero signal)
    audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.9

    return audio

# Smoke-test the generator and define the dataset-wide settings used later
print("üéµ CREATING SYNTHETIC TRAINING DATASET\n")

# Audio settings come from the shared project configuration
sr, duration = config.audio.SAMPLE_RATE, config.audio.DURATION

# Class layout: list position in voice_types matches the id in class_labels
voice_types = ['healthy', 'asd', 'adhd']
class_labels = [0, 1, 2]
n_samples_per_class = 60  # 60 per class -> 180 samples, perfectly balanced

print(f"Creating {n_samples_per_class} samples per class...")
print(f"Total samples: {n_samples_per_class * 3}\n")

# Generate a single clip per class and report its shape and RMS level
for label, voice_type in zip(class_labels, voice_types):
    audio = create_synthetic_speech(duration, sr, voice_type)
    rms = np.sqrt(np.mean(np.square(audio)))
    print(f"‚úì {voice_type.upper():8s} (Class {label}): shape={audio.shape}, RMS={rms:.4f}")

In [None]:
# Extract features from synthetic dataset
print("üìä EXTRACTING FEATURES FROM SYNTHETIC DATA\n")

# Initialize feature extractors
mfcc_extractor = MFCCExtractor(config)
spectral_extractor = SpectralExtractor(config)
prosodic_extractor = ProsodicExtractor(config)

# Store all features and labels
X_data = []
y_data = []

# Placeholder length used when extraction fails before any real feature
# vector has been produced; afterwards the real vector length is reused.
FALLBACK_FEATURE_DIM = 106
feature_dim = None
failed_counts = {}

# Create dataset (re-seed so the generated audio is reproducible)
np.random.seed(42)
for label, voice_type in enumerate(voice_types):
    print(f"Extracting {voice_type.upper()} features ({n_samples_per_class} samples)...")
    failed_counts[voice_type] = 0

    for sample_idx in range(n_samples_per_class):
        # Generate synthetic audio
        audio = create_synthetic_speech(duration, sr, voice_type)

        try:
            # Extract features
            mfcc_feat = mfcc_extractor.extract(audio, sr)
            spectral_feat = spectral_extractor.extract(audio, sr)
            prosodic_feat = prosodic_extractor.extract(audio, sr)

            # Combine all features
            features = np.concatenate([mfcc_feat, spectral_feat, prosodic_feat])
        except Exception as e:
            # Previously only the first sample of a class received a fallback
            # and every later failure was dropped silently, which left the
            # classes unbalanced. Now every failed sample gets a placeholder
            # and the warning is printed once per class.
            if failed_counts[voice_type] == 0:
                print(f"   ‚ö†Ô∏è  Error in feature extraction: {str(e)[:50]}")
            failed_counts[voice_type] += 1
            # Random placeholder features, for demonstration only
            features = np.random.randn(feature_dim or FALLBACK_FEATURE_DIM)
        else:
            feature_dim = features.shape[0]

        X_data.append(features)
        y_data.append(label)

    if failed_counts[voice_type]:
        print(f"   {failed_counts[voice_type]}/{n_samples_per_class} {voice_type.upper()} "
              f"samples use random placeholder features")

print(f"\n‚úÖ Feature extraction complete!")

# Convert to numpy arrays
X = np.array(X_data)
y = np.array(y_data)

print(f"\nüìä DATASET SUMMARY:")
print(f"   X shape: {X.shape}  (samples, features)")
print(f"   y shape: {y.shape}  (samples,)")
print(f"\n   Class distribution:")
for label, voice_type in zip(class_labels, voice_types):
    count = np.sum(y == label)
    pct = 100 * count / len(y)
    print(f"     Class {label} ({voice_type:8s}): {count:3d} samples ({pct:5.1f}%)")
# One boxplot panel per class, covering at most the first ten feature columns
n_shown = min(10, X.shape[1])
tick_names = [f'F{i}' for i in range(n_shown)]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('Feature Distributions by Class (First 10 Features)', fontsize=12, fontweight='bold')

for class_idx, (label, voice_type) in enumerate(zip(class_labels, voice_types)):
    ax = axes[class_idx]
    class_features = X[y == label]  # rows that belong to this class only

    columns = [class_features[:, i] for i in range(n_shown)]
    bp = ax.boxplot(columns, labels=tick_names, patch_artist=True)

    # Fill every box of a panel with the same per-class colour
    fill = plt.cm.Set1(class_idx)
    for patch in bp['boxes']:
        patch.set_facecolor(fill)

    ax.set_title(f'{voice_type.upper()} (Class {label})', fontsize=11, fontweight='bold')
    ax.set_ylabel('Feature Value')
    ax.grid(True, alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Section 2: Data Preprocessing & Normalization

Before training, we must **normalize features** because:
- MFCC features have different scale than spectral features
- Neural networks train better with normalized inputs (mean ≈ 0, std ≈ 1)
- Prevents large feature values from dominating training

**Normalization approach:**
- **StandardScaler (Z-score):** Features → (feature - mean) / std
- Fit on training data only
- Apply same transformation to validation/test data
- Prevents data leakage

In [None]:
# Step 1: hold out a stratified 20% test set
print("üìä SPLITTING DATA INTO TRAIN/TEST SETS\n")

# Stratifying on y keeps the class proportions equal in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples ({100*X_train.shape[0]/len(X):.1f}%)")
print(f"Test set:     {X_test.shape[0]} samples ({100*X_test.shape[0]/len(X):.1f}%)")

# Per-class counts and percentages for each subset
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set class distribution:")
    for label in class_labels:
        count = np.sum(split_labels == label)
        pct = 100 * count / len(split_labels)
        print(f"  Class {label}: {count:3d} ({pct:5.1f}%)")

# Step 2: z-score normalization
print("\nüîß NORMALIZING FEATURES\n")

# The scaler learns mean/std from the training rows only and then applies
# the same transformation to the test rows.
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Compare the first five feature columns before and after scaling
for lead, stage, matrix in (("", "BEFORE", X_train), ("\n", "AFTER", X_train_normalized)):
    print(f"{lead}Feature statistics {stage} normalization (training set):")
    print(f"   Mean: {np.mean(matrix, axis=0)[:5]}")
    print(f"   Std:  {np.std(matrix, axis=0)[:5]}")

print("\n‚úÖ Data preprocessing complete!")

# Side-by-side heatmaps (features x first 30 samples), raw vs. normalized
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
heatmap_panels = (
    ('Features BEFORE Normalization (First 30 samples)', X_train),
    ('Features AFTER Normalization (First 30 samples)', X_train_normalized),
)
for ax, (panel_title, matrix) in zip(axes, heatmap_panels):
    image = ax.imshow(matrix[:30, :].T, aspect='auto', cmap='viridis', interpolation='nearest')
    ax.set_title(panel_title, fontsize=11, fontweight='bold')
    ax.set_xlabel('Sample Index')
    ax.set_ylabel('Feature Index')
    plt.colorbar(image, ax=ax)

plt.tight_layout()
plt.show()

print("‚úÖ Data visualization complete!")

## Section 3: Build MLP Neural Network Model

**Model Architecture:**

```
Input (106 features)
  ↓
Dense(128) + BatchNorm + ReLU + Dropout(0.3)
  ↓
Dense(64) + BatchNorm + ReLU + Dropout(0.3)
  ↓
Dense(32) + BatchNorm + ReLU + Dropout(0.2)
  ↓
Dense(3, softmax)  → Output (3 classes)
```

**Why this architecture?**
- **128-64-32:** Gradually reduces dimensionality from 106 → 3
- **Batch Normalization:** Stabilizes training, faster convergence
- **ReLU activation:** Non-linearity, handles complex patterns
- **Dropout:** Prevents overfitting by randomly disabling neurons
- **L2 Regularization:** Penalizes large weights
- **Softmax output:** Produces probability distribution over classes

**Key hyperparameters (you can adjust these):**
- Learning rate: 0.001 (lower = slower but more stable)
- Batch size: 32 (smaller = noisier updates, faster)
- Epochs: 100 (max iterations, stopped early if no improvement)

In [None]:
# Build the MLP model
print("üèóÔ∏è  BUILDING MLP MODEL\n")


def _one_hot_labels(y_true, y_pred):
    """Expand sparse integer labels to one-hot rows matching y_pred's class axis."""
    flat_labels = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    return tf.one_hot(flat_labels, depth=y_pred.shape[-1])


# keras.metrics.Precision / Recall compare y_true and y_pred element-wise, so
# they need labels with the same shape as the softmax output. The model is
# trained with sparse integer labels, so these thin subclasses one-hot encode
# the labels before delegating to the stock implementation.
# NOTE: reloading a saved model that uses them requires passing these classes
# via custom_objects.
class SparsePrecision(keras.metrics.Precision):
    """Precision (default 0.5 threshold) that accepts sparse integer labels."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(
            _one_hot_labels(y_true, y_pred), y_pred, sample_weight=sample_weight
        )


class SparseRecall(keras.metrics.Recall):
    """Recall (default 0.5 threshold) that accepts sparse integer labels."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        return super().update_state(
            _one_hot_labels(y_true, y_pred), y_pred, sample_weight=sample_weight
        )


model = keras.Sequential([
    # Input layer: width follows the normalized training matrix
    layers.Input(shape=(X_train_normalized.shape[1],)),

    # First hidden layer
    layers.Dense(
        128,
        kernel_regularizer=regularizers.l2(1e-4),
        name='hidden_1'
    ),
    layers.BatchNormalization(name='batch_norm_1'),
    layers.Activation('relu', name='activation_1'),
    layers.Dropout(0.3, name='dropout_1'),

    # Second hidden layer
    layers.Dense(
        64,
        kernel_regularizer=regularizers.l2(1e-4),
        name='hidden_2'
    ),
    layers.BatchNormalization(name='batch_norm_2'),
    layers.Activation('relu', name='activation_2'),
    layers.Dropout(0.3, name='dropout_2'),

    # Third hidden layer
    layers.Dense(
        32,
        kernel_regularizer=regularizers.l2(1e-4),
        name='hidden_3'
    ),
    layers.BatchNormalization(name='batch_norm_3'),
    layers.Activation('relu', name='activation_3'),
    layers.Dropout(0.2, name='dropout_3'),

    # Output layer: one probability per class
    layers.Dense(3, activation='softmax', name='output')
])

# Compile model. The metrics are named explicitly so the history keys stay
# 'precision' / 'recall' (and 'val_precision' / 'val_recall') even when this
# cell is re-run in the same session.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=[
        'accuracy',
        SparsePrecision(name='precision'),
        SparseRecall(name='recall'),
    ]
)

print("‚úÖ Model built successfully!\n")
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# Calculate model parameters
total_params = model.count_params()
print(f"\nüìä Model Statistics:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {sum(tf.size(w).numpy() for w in model.trainable_weights):,}")

## Section 4: Train the Model with Detailed Monitoring

**What you'll see during training:**
- **Epoch:** Current iteration through the data
- **Loss:** How far predictions are from actual labels (lower is better)
- **Accuracy:** Percentage of correct predictions
- **Precision:** Of positive predictions, how many were correct?
- **Recall:** Of actual positives, how many did we find?

**Training behavior:**
- Loss should decrease over epochs (model learning)
- Accuracy should increase
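- Early stopping prevents overfitting (stops if validation loss has not improved for 15 epochs, then restores the best weights)
- Learning rate reduction: halves the learning rate when validation loss has not improved for 5 epochs, which helps when training is stuck on a plateau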

**Watch for:**
- ✅ Smooth loss decrease = good learning
- ⚠️ Flat/increasing loss = learning problems
- ⚠️ Training accuracy 100% but test accuracy low = overfitting

In [None]:
# Callbacks: both watch the validation loss
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,                 # epochs without improvement before stopping
    restore_best_weights=True,   # roll back to the best epoch's weights
    verbose=1,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,                  # halve the learning rate on a plateau
    patience=5,
    min_lr=1e-6,
    verbose=1,
)
training_callbacks = [early_stopping, reduce_lr]

# Run the training loop
separator = "=" * 80
print("üöÄ TRAINING THE MODEL\n")
print(separator)

history = model.fit(
    X_train_normalized,
    y_train,
    validation_split=0.2,  # 20% of the training rows are held out for validation
    epochs=100,
    batch_size=32,
    callbacks=training_callbacks,
    verbose=1,  # per-epoch progress output
)

print(separator)
print("\n‚úÖ Training complete!")

# Summarize the values logged for the last epoch that was run
logs = history.history
print(f"   Total epochs trained: {len(logs['loss'])}")
for description, key in (
    ("training loss", "loss"),
    ("validation loss", "val_loss"),
    ("training accuracy", "accuracy"),
    ("validation accuracy", "val_accuracy"),
):
    print(f"   Final {description}: {logs[key][-1]:.4f}")

In [None]:
# Plot the four logged metrics on a 2x2 grid, training vs. validation curves
logs = history.history

# (history key, display name, y-axis limits or None to autoscale)
panels = [
    ('loss', 'Loss', None),
    ('accuracy', 'Accuracy', [0, 1.05]),
    ('precision', 'Precision', [0, 1.05]),
    ('recall', 'Recall', [0, 1.05]),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Model Training History', fontsize=14, fontweight='bold')

# axes.flat walks the grid row by row, matching the order of `panels`
for ax, (key, title, y_limits) in zip(axes.flat, panels):
    ax.plot(logs[key], label=f'Training {title}', linewidth=2, marker='o', markersize=4)
    ax.plot(logs[f'val_{key}'], label=f'Validation {title}', linewidth=2, marker='s', markersize=4)
    ax.set_title(f'{title} Over Epochs', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    if y_limits is not None:
        ax.set_ylim(y_limits)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("üìä Training history visualization complete!")

## Section 5: Evaluate Model on Test Set

Now we evaluate the **trained model** on unseen test data. This shows **real-world performance**.

**Key metrics:**
- **Accuracy:** (TP + TN) / Total - Overall correctness
- **Precision:** TP / (TP + FP) - When we predict positive, are we right?
- **Recall:** TP / (TP + FN) - Do we find all positives?
- **F1-Score:** Harmonic mean of precision & recall

**Confusion Matrix interpretation** (rows and columns follow the class order used in the code: Healthy = 0, ASD = 1, ADHD = 2):
```
                    Predicted
                 Healthy  ASD  ADHD
Actual  Healthy    [a]    [b]   [c]
        ASD        [d]    [e]   [f]
        ADHD       [g]    [h]   [i]
```
- Diagonal = correct predictions
- Off-diagonal = confusions (which classes are mixed up?)

In [None]:
# Make predictions on test set
print("üìä EVALUATING MODEL ON TEST SET\n")

y_pred_prob = model.predict(X_test_normalized, verbose=0)  # Predictions (probabilities)
y_pred = np.argmax(y_pred_prob, axis=1)  # Most probable class per sample

# Calculate metrics (weighted average: each class weighted by its support)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("TEST SET PERFORMANCE:")
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1-Score:  {f1:.4f}")

# Class ids are listed explicitly so the report and the confusion matrix
# always cover all three classes. Without `labels`, a class that is absent
# from both y_test and y_pred shrinks the matrix and breaks the indexing
# below (and the target_names lookup).
class_names = ['Healthy', 'ASD', 'ADHD']
class_ids = list(range(len(class_names)))

# Classification report
print("\nüìã CLASSIFICATION REPORT:\n")
report = classification_report(
    y_test, y_pred,
    labels=class_ids, target_names=class_names, digits=4, zero_division=0
)
print(report)

# Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print("\nüìä CONFUSION MATRIX:\n")
print("                 Predicted")
print("              Healthy  ASD  ADHD")
for i, class_name in enumerate(class_names):
    print(f"Actual  {class_name:8s}   {cm[i, 0]:5d}  {cm[i, 1]:5d}  {cm[i, 2]:5d}")

# Visualize results on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Model Evaluation Results', fontsize=14, fontweight='bold')

# Plot 1: Confusion Matrix heatmap
ax = axes[0, 0]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax, cbar_kws={'label': 'Count'})
ax.set_title('Confusion Matrix', fontsize=12, fontweight='bold')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')

# Plot 2: Metrics comparison
ax = axes[0, 1]
metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1
}
colors_metrics = ['steelblue', 'coral', 'lightgreen', 'mediumpurple']
bars = ax.bar(metrics.keys(), metrics.values(), color=colors_metrics, alpha=0.7, edgecolor='black', linewidth=2)
ax.set_ylim([0, 1.1])
ax.set_ylabel('Score')
ax.set_title('Overall Metrics', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, val in zip(bars, metrics.values()):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f'{val:.3f}', ha='center', fontweight='bold')

# Plot 3: Per-class recall (diagonal of the confusion matrix / row totals)
ax = axes[1, 0]
row_totals = cm.sum(axis=1)
# A class with no test samples has a zero row total; report 0 for it instead
# of dividing by zero (which would produce NaN and a runtime warning).
per_class_accuracy = np.divide(
    cm.diagonal(), row_totals,
    out=np.zeros(len(row_totals), dtype=float), where=row_totals > 0
)
bars = ax.bar(class_names, per_class_accuracy, color=['steelblue', 'coral', 'lightgreen'], alpha=0.7, edgecolor='black', linewidth=2)
ax.set_ylim([0, 1.1])
ax.set_ylabel('Recall (Per-class Accuracy)')
ax.set_title('Per-Class Recall', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar, val in zip(bars, per_class_accuracy):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f'{val:.3f}', ha='center', fontweight='bold')

# Plot 4: Prediction distribution
ax = axes[1, 1]
# np.bincount only accepts integer input; labels loaded from disk or built
# with np.ones/np.zeros may be float, so cast explicitly before counting.
pred_counts = np.bincount(np.asarray(y_pred).astype(int).ravel(), minlength=len(class_names))
actual_counts = np.bincount(np.asarray(y_test).astype(int).ravel(), minlength=len(class_names))
x = np.arange(len(class_names))
width = 0.35
ax.bar(x - width/2, actual_counts, width, label='Actual', alpha=0.7, edgecolor='black')
ax.bar(x + width/2, pred_counts, width, label='Predicted', alpha=0.7, edgecolor='black')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Actual vs Predicted Distribution', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(class_names)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("‚úÖ Evaluation complete!")

## Appendix: Reuse project scripts & real data

This notebook can be upgraded to use your repository's real data and helper scripts. Below we'll attempt to:
- Detect repository scripts in `f:/AIML`
- If available, load features saved by `ser_preprocessing.py` or `mfcc_extract.py`
- If `Data/` splits already exist (e.g., X_train.npy), load them to skip synthetic generation

If the files exist we'll demonstrate how to switch from synthetic data to real dataset with minimal changes.

In [None]:
# Detect repository-level artifacts and optionally load them
import importlib.util
import joblib
root_dir = r'f:/AIML'

# Pre-saved dataset splits and serialized models that may exist under root_dir
split_files = [
    'Data/X_train.npy', 'Data/X_val.npy', 'Data/X_test.npy',
    'Data/y_train.npy', 'Data/y_val.npy', 'Data/y_test.npy',
]
model_files = ['rf.pkl', 'svm.pkl', 'ann.pkl', 'model.pkl']
candidates = split_files + model_files + ['model.json']

found = {c: os.path.exists(os.path.join(root_dir, c)) for c in candidates}

print('Detected dataset/model artifacts:')
for k, v in found.items():
    print(f' - {k}: {v}')

# If pre-saved Data splits exist, load them to skip synthetic generation.
# All six files are required: checking only X_train.npy (as before) crashed
# with FileNotFoundError when the split set was incomplete.
missing_splits = [c for c in split_files if not found[c]]
if not missing_splits:
    print('\nLoading pre-saved Data splits from f:/AIML/Data...')
    X_train_disk = np.load(os.path.join(root_dir, 'Data/X_train.npy'))
    X_val_disk = np.load(os.path.join(root_dir, 'Data/X_val.npy'))
    X_test_disk = np.load(os.path.join(root_dir, 'Data/X_test.npy'))
    y_train_disk = np.load(os.path.join(root_dir, 'Data/y_train.npy'))
    y_val_disk = np.load(os.path.join(root_dir, 'Data/y_val.npy'))
    y_test_disk = np.load(os.path.join(root_dir, 'Data/y_test.npy'))
    print(f'  Shapes: X_train={X_train_disk.shape}, X_val={X_val_disk.shape}, X_test={X_test_disk.shape}')
    # Optionally override our current X_train/X_test to use disk versions
    use_disk = False  # Set True to override synthetic data
    if use_disk:
        # Reuse the already-fitted scaler when one exists in the notebook
        X_train_normalized = scaler.transform(X_train_disk) if 'scaler' in globals() else X_train_disk
        X_test_normalized = scaler.transform(X_test_disk) if 'scaler' in globals() else X_test_disk
        y_train = y_train_disk
        y_test = y_test_disk
        print('  ‚úÖ Replaced in-memory dataset with disk dataset')
elif found['Data/X_train.npy']:
    print(f'\nSkipping disk splits; missing files: {missing_splits}')

# If saved models exist, try to load and run a quick inference on one sample.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load model files that come from a trusted source.
for mname in model_files:
    mpath = os.path.join(root_dir, mname)
    if os.path.exists(mpath):
        try:
            mdl = joblib.load(mpath)
            print(f'\nLoaded model: {mname} (type={type(mdl)})')
            # Pick one sample, preferring normalized test data when available
            sample = None
            if 'X_test_normalized' in globals():
                sample = X_test_normalized[0:1]
            elif 'X_train_normalized' in globals():
                sample = X_train_normalized[0:1]
            elif 'X' in globals():
                sample = X[0:1]
            if sample is not None:
                try:
                    pred = mdl.predict(sample)
                    print(f' Quick prediction with {mname}: {pred}')
                except Exception as e:
                    # Best effort: e.g. the model may expect a different feature count
                    print(f'  ‚ö†Ô∏è  Model loaded but prediction failed: {e}')
            else:
                print('  ‚ö†Ô∏è  No sample available in notebook to run quick inference')
        except Exception as e:
            print(f'  ‚ö†Ô∏è  Failed to load {mname}: {e}')

print('\nFinished repository artifact detection.\n')
print('If you want, I can:')
print(' - wire mfcc_extract outputs into the feature pipeline (example cell)')
print(' - configure the notebook to load Data splits from disk (set use_disk=True)')
print(' - run model.py to (re)train classical models (random forest, svm) and save outputs')


In [None]:
# --- Integration helper: switch to precomputed disk splits (no retraining) ---
# When all six split files exist, the in-memory (synthetic) training and test
# variables are replaced by the arrays stored on disk.

use_disk = True  # <-- set True to use f:/AIML/data/*.npy splits
root_dir = r'f:/AIML'

if use_disk:
    data_dir = os.path.join(root_dir, 'data')
    expected = ['X_train.npy','X_val.npy','X_test.npy','y_train.npy','y_val.npy','y_test.npy']
    missing = [fname for fname in expected if not os.path.exists(os.path.join(data_dir, fname))]

    if not missing:
        # Load the six arrays in the order listed in `expected`
        (X_train_disk, X_val_disk, X_test_disk,
         y_train_disk, y_val_disk, y_test_disk) = (
            np.load(os.path.join(data_dir, fname)) for fname in expected
        )
        print('‚úÖ Loaded disk splits:')
        print(f'   X_train: {X_train_disk.shape}, y_train: {y_train_disk.shape}')
        print(f'   X_val:   {X_val_disk.shape}, y_val:   {y_val_disk.shape}')
        print(f'   X_test:  {X_test_disk.shape}, y_test:  {y_test_disk.shape}')

        # Merge train and validation rows: the training cell carves out its
        # own validation set through validation_split.
        X_train_full = np.concatenate((X_train_disk, X_val_disk))
        y_train_full = np.concatenate((y_train_disk, y_val_disk))

        # Point the notebook-level variables at the disk data
        X_train, y_train = X_train_full, y_train_full
        X_test, y_test = X_test_disk, y_test_disk

        # Fit a fresh scaler on the merged training rows, then apply it to the test rows
        scaler = StandardScaler()
        X_train_normalized = scaler.fit_transform(X_train)
        X_test_normalized = scaler.transform(X_test)

        print('\nüîß Data normalization complete:')
        print(f'   X_train_normalized: {X_train_normalized.shape}')
        print(f'   X_test_normalized:  {X_test_normalized.shape}')

        # Brief sanity check on the first normalized training row
        first_row = X_train_normalized[0]
        print('\nSample (first training) feature stats:')
        print(f"   mean={np.mean(first_row):.4f}, std={np.std(first_row):.4f}")

        # The training cell reads X_train_normalized / X_test_normalized.
        # Set run_quick_train=True for an optional two-epoch verification run
        # on a copy of the current model.
        run_quick_train = False
        if run_quick_train:
            quick_model = keras.models.clone_model(model)
            quick_model.set_weights(model.get_weights())
            quick_model.compile(
                optimizer=Adam(learning_rate=1e-3),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'],
            )
            print('\nRunning a 2-epoch quick verification training (this is optional)...')
            quick_model.fit(X_train_normalized, y_train, epochs=2, batch_size=32, validation_split=0.1)
            print('Quick verification done.')
    else:
        # Incomplete split set: leave the in-memory dataset untouched
        print('‚ö†Ô∏è  Some expected Data splits are missing in f:/AIML/data:', missing)
        print('   Notebook will continue using in-memory dataset (synthetic).')
