# Notebook 03: Baseline Models for Instrument Family Classification

## Overview
This notebook implements two baseline models to establish performance benchmarks for instrument family classification. The first is a classical machine learning pipeline (MFCC summary statistics fed to a Support Vector Machine); the second is a lightweight convolutional neural network trained on mel-spectrograms. Both are designed to be computationally efficient.

## Workflow
1. **Data Loading and Setup** — Load train/validation/test splits and configure label mappings
2. **MFCC + SVM Baseline** — Extract Mel-Frequency Cepstral Coefficients and train Support Vector Machine classifier
3. **Mel-Spectrogram + CNN Baseline** — Train lightweight Convolutional Neural Network on mel-spectrogram features
4. **Model Training** — Train both models with appropriate hyperparameters
5. **Evaluation and Comparison** — Generate confusion matrices and performance metrics for test set
6. **Results Export** — Save model predictions and visualizations to Results directory

---

In [1]:
# --- Imports and configuration ---

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import soundfile as sf
import librosa

# Bokeh imports
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import row
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, ColorBar, BasicTicker
from bokeh.transform import transform
from bokeh.palettes import Blues9
from bokeh.io import push_notebook

# Initialize Bokeh for notebook
output_notebook()

# Reproducibility: seed NumPy and PyTorch so weight initialisation and
# DataLoader shuffling repeat across Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device configuration: prefer CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

# Project configuration
# Set the ELEC5305_PROJECT_ROOT environment variable to run the notebook from
# another checkout; the default keeps the original location.
PROJECT_ROOT = Path(os.environ.get(
    "ELEC5305_PROJECT_ROOT",
    "/Users/dghifari/02-University/SEM-2-2025/elec5305-project-520140154",
))
manifests_dir = PROJECT_ROOT / "Manifests"
train_csv = manifests_dir / "train.csv"
val_csv   = manifests_dir / "val.csv"
test_csv  = manifests_dir / "test.csv"

# Audio processing parameters
FAMILY_COLNAME = "family_label"
SAMPLE_RATE = 16000
DURATION_SECONDS = 3
TARGET_NUM_SAMPLES = SAMPLE_RATE * DURATION_SECONDS
BATCH_SIZE = 32
EPOCHS = 25

# Feature extraction parameters
N_MFCC = 13
N_MELS = 64
HOP_LENGTH = 512
N_FFT = 2048

# Data loading and label mappings
print("\nLoading data and creating label mappings...")

# Read each manifest once; the class list is derived from the training split only.
df_train = pd.read_csv(train_csv)
df_val = pd.read_csv(val_csv)
df_test = pd.read_csv(test_csv)

families = sorted(df_train[FAMILY_COLNAME].unique())
family_to_idx = {f:i for i,f in enumerate(families)}
idx_to_family = {i:f for f,i in family_to_idx.items()}
num_classes = len(family_to_idx)

print(f"Classes: {families}")
print(f"Train: {len(df_train)} | Val: {len(df_val)} | Test: {len(df_test)}")

Device: mps

Loading data and creating label mappings...
Classes: ['keyboards', 'percussion', 'strings', 'voice', 'winds']
Train: 685 | Val: 79 | Test: 110


In [2]:
# --- Audio processing utilities ---

class Normalize:
    """Callable that peak-normalises a waveform and then rescales it to an RMS of 0.1."""

    def __call__(self, x: torch.Tensor):
        # Peak-normalise; the epsilon keeps an all-zero signal from dividing by zero.
        peak = x.abs().max()
        scaled = x / (peak + 1e-9)

        # A silent signal has no RMS to rescale, so return it after peak scaling only.
        rms = torch.sqrt(torch.mean(scaled ** 2))
        if not rms > 0:
            return scaled
        return scaled / (rms + 1e-9) * 0.1

class AudioDatasetForFeatures(Dataset):
    """Dataset that reads audio files listed in a manifest CSV and returns extracted features.

    Each item is a ``(features, label, filepath)`` tuple, where ``features`` is either an
    MFCC summary vector (``feature_type='mfcc'``) or a dB-scaled mel-spectrogram
    (``feature_type='mel'``).

    Args:
        csv_path: Manifest CSV; rows are read for their ``'filepath'`` column and the
            column named by ``FAMILY_COLNAME``.
        label_map: Mapping from family name to integer class index.
        feature_type: ``'mfcc'`` or ``'mel'``.

    Raises:
        ValueError: If ``feature_type`` is not a supported value.
    """

    # Feature types that __getitem__ knows how to compute.
    SUPPORTED_FEATURES = ('mfcc', 'mel')

    def __init__(self, csv_path, label_map, feature_type='mfcc'):
        # Fail fast: reject a bad feature type before any file is read, rather
        # than on the first __getitem__ call.
        if feature_type not in self.SUPPORTED_FEATURES:
            raise ValueError(f"Unknown feature type: {feature_type}")

        self.df = pd.read_csv(csv_path)
        self.label_map = label_map
        self.norm = Normalize()
        self.target_length = TARGET_NUM_SAMPLES
        self.feature_type = feature_type

    def _fix_length(self, wav: torch.Tensor, target_len: int):
        """Return ``wav`` with its last axis exactly ``target_len`` long.

        Longer inputs are centre-cropped; shorter inputs are zero-padded on the right.
        """
        T = wav.shape[-1]
        if T == target_len:
            return wav
        if T > target_len:
            start = (T - target_len) // 2
            return wav[..., start:start + target_len]
        pad_len = target_len - T
        return torch.nn.functional.pad(wav, (0, pad_len))

    def _extract_mfcc(self, wav):
        """Extract MFCC features with temporal statistics.

        Returns the per-coefficient mean and standard deviation (taken over axis 1
        of the MFCC matrix) concatenated into one vector.
        """
        wav_np = wav.numpy()
        mfcc = librosa.feature.mfcc(y=wav_np, sr=SAMPLE_RATE, n_mfcc=N_MFCC,
                                    hop_length=HOP_LENGTH, n_fft=N_FFT)
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_std = np.std(mfcc, axis=1)
        return np.concatenate([mfcc_mean, mfcc_std])

    def _extract_melspectrogram(self, wav):
        """Extract a mel-spectrogram converted to dB relative to its own maximum."""
        wav_np = wav.numpy()
        mel_spec = librosa.feature.melspectrogram(y=wav_np, sr=SAMPLE_RATE,
                                                  n_mels=N_MELS, hop_length=HOP_LENGTH,
                                                  n_fft=N_FFT)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec_db

    def __getitem__(self, idx):
        """Load, preprocess and featurise the clip at row ``idx`` of the manifest."""
        row = self.df.iloc[idx]
        wav_np, sr = sf.read(row['filepath'], dtype='float32')

        # Arrange the samples as (channels, frames).
        wav = torch.from_numpy(wav_np)
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        else:
            wav = wav.T

        # Down-mix multi-channel audio to mono.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        if sr != SAMPLE_RATE:
            # Imported lazily so torchaudio is only needed when a file requires
            # resampling. The alias avoids `F`, which this notebook uses for
            # torch.nn.functional.
            import torchaudio.functional as torchaudio_F
            wav = torchaudio_F.resample(wav, sr, SAMPLE_RATE)

        wav = self._fix_length(wav, self.target_length)
        wav = self.norm(wav)
        wav = wav.squeeze(0)

        if self.feature_type == 'mfcc':
            features = self._extract_mfcc(wav)
        elif self.feature_type == 'mel':
            features = self._extract_melspectrogram(wav)
        else:
            # Still reachable if the attribute is changed after construction.
            raise ValueError(f"Unknown feature type: {self.feature_type}")

        label = self.label_map[row[FAMILY_COLNAME]]
        return features, label, row['filepath']

    def __len__(self):
        """Number of rows in the manifest."""
        return len(self.df)

def extract_features_dataset(csv_path, label_map, feature_type='mfcc'):
    """Extract features for an entire dataset.

    Args:
        csv_path: Manifest CSV passed to ``AudioDatasetForFeatures``.
        label_map: Mapping from family name to integer class index.
        feature_type: ``'mfcc'`` or ``'mel'``.

    Returns:
        ``(features, labels, filepaths)``: two NumPy arrays stacked in manifest
        order and the list of file paths in the same order.
    """
    dataset = AudioDatasetForFeatures(csv_path, label_map, feature_type)
    # One clip per batch and no shuffling, so the outputs follow manifest order.
    loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    all_features = []
    all_labels = []
    all_filepaths = []

    for features, labels, filepaths in tqdm(loader, desc=f"Extracting {feature_type}"):
        # Drop only the batch axis; a bare squeeze() would also collapse any
        # feature axis that happens to have length 1.
        all_features.append(features.squeeze(0).numpy())
        all_labels.extend(labels.numpy())
        all_filepaths.extend(filepaths)

    all_features = np.array(all_features)
    all_labels = np.array(all_labels)

    return all_features, all_labels, all_filepaths

In [3]:
# --- Baseline 1: MFCC + SVM ---

print("\nBaseline 1: MFCC + SVM")
print("-" * 50)

# Create Results directory if it doesn't exist (including missing parents)
results_dir = PROJECT_ROOT / "Results"
results_dir.mkdir(parents=True, exist_ok=True)

# Extract MFCC features
print("Extracting MFCC features...")
train_mfcc, train_labels, train_paths = extract_features_dataset(train_csv, family_to_idx, 'mfcc')
val_mfcc, val_labels, val_paths = extract_features_dataset(val_csv, family_to_idx, 'mfcc')
test_mfcc, test_labels, test_paths = extract_features_dataset(test_csv, family_to_idx, 'mfcc')

print(f"MFCC features shape: {train_mfcc.shape}")
print(f"Feature dimension: {train_mfcc.shape[1]}")

# Standardize features and train SVM
print("\nTraining SVM classifier...")

# Standardize features: statistics come from the training split only and are
# reused unchanged for validation and test.
scaler = StandardScaler()
train_mfcc_scaled = scaler.fit_transform(train_mfcc)
val_mfcc_scaled = scaler.transform(val_mfcc)
test_mfcc_scaled = scaler.transform(test_mfcc)

# Train SVM with RBF kernel
svm_classifier = SVC(kernel='rbf', C=10.0, gamma='scale', random_state=42)
svm_classifier.fit(train_mfcc_scaled, train_labels)

# Pin the label set for the reports: without `labels`, classification_report
# raises ValueError when a class is missing from both the true and predicted
# labels of a split, because `target_names` no longer matches.
class_indices = list(range(len(families)))

# Evaluate on validation set
val_pred = svm_classifier.predict(val_mfcc_scaled)
val_acc = accuracy_score(val_labels, val_pred)
val_f1 = f1_score(val_labels, val_pred, average='macro')

print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation Macro F1: {val_f1:.4f}")
print("\nValidation Classification Report:")
print(classification_report(val_labels, val_pred, labels=class_indices,
                            target_names=families, digits=4))

# Final test evaluation
print("\nFinal test evaluation...")
test_pred = svm_classifier.predict(test_mfcc_scaled)
test_acc = accuracy_score(test_labels, test_pred)
test_f1 = f1_score(test_labels, test_pred, average='macro')

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Macro F1: {test_f1:.4f}")
print("\nTest Classification Report:")
print(classification_report(test_labels, test_pred, labels=class_indices,
                            target_names=families, digits=4))

# Confusion matrix with Bokeh
# Pin the label set so both matrices are always n_classes x n_classes; without
# `labels`, a class missing from the test labels and predictions shrinks the
# matrix and the indexing loop below raises IndexError.
n_classes = len(families)
cm_labels = list(range(n_classes))
cm_normalized = confusion_matrix(test_labels, test_pred, labels=cm_labels, normalize='true')
cm_counts = confusion_matrix(test_labels, test_pred, labels=cm_labels)

# Prepare data: one record per (true, predicted) cell
true_labels, pred_labels, values, counts, pct_text, count_text, text_colors = [], [], [], [], [], [], []

for i in range(n_classes):
    for j in range(n_classes):
        true_labels.append(families[i])
        pred_labels.append(families[j])
        val = cm_normalized[i, j]   # row-normalised fraction
        cnt = int(cm_counts[i, j])  # raw count
        values.append(val)
        counts.append(cnt)
        pct_text.append(f'{val:.0%}')
        count_text.append(f'({cnt})')

        # Text color for readability: white on the darkest cells, dark otherwise
        text_colors.append('white' if val > 0.7 else '#2b2b2b')

source = ColumnDataSource(data=dict(
    true_label=true_labels,
    pred_label=pred_labels,
    value=values,
    count=counts,
    pct_text=pct_text,
    count_text=count_text,
    text_color=text_colors
))

# Color mapper - Blues gradient (light to dark)
palette = list(reversed(Blues9))
mapper = LinearColorMapper(palette=palette, low=0, high=1)

# Create figure; the y-range is reversed so the first class sits on the top row
p = figure(
    title="SVM Confusion Matrix (Test Set)",
    x_range=families, 
    y_range=list(reversed(families)),
    width=750, 
    height=600,
    tools="hover,save,reset"
)

p.rect(x="pred_label", y="true_label", width=1, height=1, source=source,
       fill_color=transform('value', mapper), line_color='white', line_width=2)

# Percentage and raw count are drawn as two text glyphs with opposite offsets
p.text(x='pred_label', y='true_label', text='pct_text', source=source,
       text_align='center', text_baseline='middle', text_font_size='14pt',
       text_font_style='bold', text_color='text_color', y_offset=6)

p.text(x='pred_label', y='true_label', text='count_text', source=source,
       text_align='center', text_baseline='middle', text_font_size='10pt',
       text_color='text_color', y_offset=-8)

# Color bar
color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(desired_num_ticks=10),
                     label_standoff=12, border_line_color=None, location=(0, 0),
                     title='Accuracy', title_text_font_style='bold')
p.add_layout(color_bar, 'right')

# Hover
hover = p.select_one(HoverTool)
hover.tooltips = [("True", "@true_label"), ("Predicted", "@pred_label"), 
                  ("Accuracy", "@value{0.1%}"), ("Count", "@count")]

# Styling
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.xaxis.axis_label = 'Predicted Label'
p.yaxis.axis_label = 'True Label'
p.xaxis.axis_label_text_font_style = "bold"
p.yaxis.axis_label_text_font_style = "bold"

show(p)

# Store SVM results
svm_results = {
    'test_accuracy': test_acc,
    'test_f1_macro': test_f1,
    'confusion_matrix': cm_counts,
    'confusion_matrix_normalized': cm_normalized
}


Baseline 1: MFCC + SVM
--------------------------------------------------
Extracting MFCC features...


Extracting mfcc: 100%|██████████| 685/685 [00:05<00:00, 114.48it/s]
Extracting mfcc: 100%|██████████| 79/79 [00:00<00:00, 133.27it/s]
Extracting mfcc: 100%|██████████| 110/110 [00:00<00:00, 133.12it/s]


MFCC features shape: (685, 26)
Feature dimension: 26

Training SVM classifier...
Validation Accuracy: 0.7975
Validation Macro F1: 0.6694

Validation Classification Report:
              precision    recall  f1-score   support

   keyboards     0.9167    0.5500    0.6875        20
  percussion     0.8235    0.9333    0.8750        15
     strings     0.0000    0.0000    0.0000         5
       voice     0.8947    1.0000    0.9444        17
       winds     0.7500    0.9545    0.8400        22

    accuracy                         0.7975        79
   macro avg     0.6770    0.6876    0.6694        79
weighted avg     0.7898    0.7975    0.7773        79


Final test evaluation...
Test Accuracy: 0.6636
Test Macro F1: 0.6386

Test Classification Report:
              precision    recall  f1-score   support

   keyboards     0.6429    0.8571    0.7347        21
  percussion     0.8333    0.5882    0.6897        17
     strings     0.7143    0.2381    0.3571        21
       voice     0.8182

In [4]:
# --- Baseline 2: Mel-Spectrogram + CNN ---

print("\nBaseline 2: Mel-Spectrogram + CNN")
print("-" * 50)

# Extract mel-spectrogram features for each split, in train -> val -> test order
print("Extracting mel-spectrogram features...")
(
    (train_mel, train_labels, train_paths),
    (val_mel, val_labels, val_paths),
    (test_mel, test_labels, test_paths),
) = [
    extract_features_dataset(split_csv, family_to_idx, 'mel')
    for split_csv in (train_csv, val_csv, test_csv)
]

print(f"Mel-spectrogram features shape: {train_mel.shape}")
print(f"Spectrogram shape: {train_mel[0].shape}")

# Dataset class for mel-spectrograms
class MelSpectrogramDataset(Dataset):
    """In-memory dataset pairing mel-spectrograms with integer class labels."""

    def __init__(self, mel_specs, labels):
        # Insert a singleton channel axis at dim 1 so each item has a channel dimension.
        self.mel_specs = torch.FloatTensor(mel_specs).unsqueeze(1)
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        spectrogram = self.mel_specs[idx]
        target = self.labels[idx]
        return spectrogram, target


Baseline 2: Mel-Spectrogram + CNN
--------------------------------------------------
Extracting mel-spectrogram features...


Extracting mel: 100%|██████████| 685/685 [00:02<00:00, 240.48it/s]
Extracting mel: 100%|██████████| 79/79 [00:00<00:00, 205.07it/s]
Extracting mel: 100%|██████████| 110/110 [00:00<00:00, 229.57it/s]

Mel-spectrogram features shape: (685, 64, 94)
Spectrogram shape: (64, 94)





In [5]:
# --- CNN model architecture ---

class MelCNN(nn.Module):
    """
    Lightweight CNN classifier for mel-spectrograms (MPS compatible).

    A fixed-size AvgPool2d is used instead of AdaptiveAvgPool2d to avoid the MPS
    limitation where input sizes must be divisible by output sizes.

    Shape walk-through for a (batch, 1, 64, 94) mel-spectrogram:
        three conv blocks, each ending in MaxPool2d(2, 2) -> (batch, 128, 8, 11)
        AvgPool2d((2, 3))                                 -> (batch, 128, 4, 3)
        flatten                                           -> (batch, 1536)
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Layers of one block: 3x3 conv -> batch norm -> ReLU -> 2x2 max-pool -> dropout."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
        ]

    def __init__(self, num_classes=5, input_channels=1):
        super().__init__()

        # Channel widths through the three conv blocks.
        widths = (input_channels, 32, 64, 128)
        stacked_layers = []
        for in_channels, out_channels in zip(widths[:-1], widths[1:]):
            stacked_layers.extend(self._conv_block(in_channels, out_channels))
        # A single flat Sequential keeps parameter names as conv_layers.<index>.*
        self.conv_layers = nn.Sequential(*stacked_layers)

        # MPS compatible: fixed (2, 3) kernel instead of adaptive pooling.
        # (batch, 128, 8, 11) -> (batch, 128, 4, 3)
        self.avg_pool = nn.AvgPool2d(kernel_size=(2, 3))

        # Classifier head sized for the 128 * 4 * 3 = 1536 flattened features
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(128 * 4 * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        pooled = self.avg_pool(self.conv_layers(x))
        flat = pooled.view(pooled.size(0), -1)  # one feature row per sample
        return self.classifier(flat)

In [6]:
# --- CNN training setup ---

# Wrap each split's spectrograms and labels in an in-memory dataset
train_dataset = MelSpectrogramDataset(train_mel, train_labels)
val_dataset = MelSpectrogramDataset(val_mel, val_labels)
test_dataset = MelSpectrogramDataset(test_mel, test_labels)

# Only the training loader shuffles; validation and test keep manifest order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Build the model on the selected device and report its parameter count
cnn_model = MelCNN(num_classes=num_classes).to(DEVICE)
print(f"CNN Model | Parameters: {sum(param.numel() for param in cnn_model.parameters()):,}")

CNN Model | Parameters: 487,877


In [7]:
# --- CNN training functions ---

def train_cnn_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Returns:
        ``(mean_loss, accuracy_pct)``: the mean of the per-batch losses and the
        training accuracy in percent.
    """
    model.train()
    running_loss, n_correct, n_seen = 0, 0, 0

    for batch_x, batch_y in tqdm(loader, leave=False, desc="Training"):
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)

        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_seen += batch_y.size(0)
        n_correct += (logits.max(1)[1] == batch_y).sum().item()

    mean_loss = running_loss / len(loader)
    accuracy_pct = 100. * n_correct / n_seen
    return mean_loss, accuracy_pct

def evaluate_cnn(model, loader):
    """Evaluate a CNN model on every batch of ``loader``.

    Returns:
        ``(f1, acc, report)``: macro-averaged F1, accuracy in percent, and the
        text classification report with one row per entry of ``families``.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for mel_specs, labels in tqdm(loader, leave=False, desc="Evaluating"):
            mel_specs, labels = mel_specs.to(DEVICE), labels.to(DEVICE)
            outputs = model(mel_specs)
            preds = outputs.argmax(1).cpu().numpy()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds)

    f1 = f1_score(y_true, y_pred, average='macro')
    acc = 100. * sum(1 for t, p in zip(y_true, y_pred) if t == p) / len(y_true)
    # Pin the label set: without `labels`, classification_report raises
    # ValueError when a class is absent from both y_true and y_pred, because
    # `target_names` no longer matches the labels it infers.
    report = classification_report(
        y_true,
        y_pred,
        labels=list(range(len(families))),
        target_names=families,
        digits=4,
    )
    return f1, acc, report

In [8]:
# --- CNN training loop ---

print("\nTraining CNN...")

# Create Results directory if it doesn't exist (including missing parents)
results_dir = PROJECT_ROOT / "Results"
results_dir.mkdir(parents=True, exist_ok=True)
checkpoint_path = results_dir / "model_baseline_cnn.pt"

optimizer = torch.optim.AdamW(cnn_model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
# mode='max' because the scheduler is stepped with validation F1 (higher is better)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

# Start below any attainable F1 so the first epoch always writes a checkpoint.
# Starting at 0 with a strict `>` could leave a stale file from an earlier run
# to be loaded at the end of this cell.
best_cnn_f1 = float("-inf")
train_losses, train_accs, val_f1s, val_accs = [], [], [], []

for epoch in range(EPOCHS):
    train_loss, train_acc = train_cnn_one_epoch(cnn_model, train_loader, criterion, optimizer)
    val_f1, val_acc, report = evaluate_cnn(cnn_model, val_loader)
    
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_f1s.append(val_f1)
    val_accs.append(val_acc)
    
    scheduler.step(val_f1)
    
    # Save best model (selected on validation macro F1)
    if val_f1 > best_cnn_f1:
        best_cnn_f1 = val_f1
        torch.save(cnn_model.state_dict(), checkpoint_path)
        checkpoint = '✓'
    else:
        checkpoint = ''
    
    # Log every 5th epoch and the final one
    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        print(f"Epoch {epoch+1:2d}/{EPOCHS} | Loss: {train_loss:.4f} | Train: {train_acc:5.1f}% | Val: {val_acc:5.1f}% | F1: {val_f1:.4f} {checkpoint}")
    
    # Full per-class report every 10th epoch
    if epoch % 10 == 9:
        print(report)

print(f"Best CNN F1: {best_cnn_f1:.4f}")
print(f"Model saved to: {checkpoint_path}")

# Load best model for evaluation; map_location puts the tensors on the current
# device regardless of the device they were saved from.
cnn_model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))


Training CNN...


                                                         

Epoch  1/25 | Loss: 1.5183 | Train:  38.4% | Val:  50.6% | F1: 0.4365 ✓


                                                         

Epoch  6/25 | Loss: 0.4073 | Train:  86.1% | Val:  75.9% | F1: 0.7376 


                                                         

              precision    recall  f1-score   support

   keyboards     1.0000    0.4500    0.6207        20
  percussion     0.7895    1.0000    0.8824        15
     strings     0.4444    0.8000    0.5714         5
       voice     0.8667    0.7647    0.8125        17
       winds     0.6667    0.8182    0.7347        22

    accuracy                         0.7468        79
   macro avg     0.7535    0.7666    0.7243        79
weighted avg     0.8033    0.7468    0.7403        79



                                                         

Epoch 11/25 | Loss: 0.2897 | Train:  89.8% | Val:  73.4% | F1: 0.7067 


                                                         

Epoch 16/25 | Loss: 0.2371 | Train:  91.1% | Val:  75.9% | F1: 0.7194 


                                                         

              precision    recall  f1-score   support

   keyboards     1.0000    0.4500    0.6207        20
  percussion     0.8333    1.0000    0.9091        15
     strings     0.4000    0.8000    0.5333         5
       voice     1.0000    0.7647    0.8667        17
       winds     0.6552    0.8636    0.7451        22

    accuracy                         0.7595        79
   macro avg     0.7777    0.7757    0.7350        79
weighted avg     0.8344    0.7595    0.7575        79



                                                         

Epoch 21/25 | Loss: 0.1662 | Train:  94.0% | Val:  73.4% | F1: 0.6678 


                                                         

Epoch 25/25 | Loss: 0.1222 | Train:  95.5% | Val:  75.9% | F1: 0.7105 
Best CNN F1: 0.7787
Model saved to: /Users/dghifari/02-University/SEM-2-2025/elec5305-project-520140154/Results/model_baseline_cnn.pt




<All keys matched successfully>

In [9]:
# --- CNN final evaluation ---

print("\nFinal test evaluation...")
test_f1, test_acc, test_report = evaluate_cnn(cnn_model, test_loader)

# evaluate_cnn returns accuracy in percent; it is printed here as a fraction
print(
    f"Test Accuracy: {test_acc/100:.4f}",
    f"Test Macro F1: {test_f1:.4f}",
    "\nTest Classification Report:",
    test_report,
    sep="\n",
)

# Second pass over the test set to collect labels and predictions for the confusion matrix
cnn_model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for mel_specs, labels in test_loader:
        mel_specs = mel_specs.to(DEVICE)
        labels = labels.to(DEVICE)
        outputs = cnn_model(mel_specs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds)

# Confusion matrix with Bokeh
# Pin the label set so both matrices are always n_classes x n_classes; without
# `labels`, a class missing from the test labels and predictions shrinks the
# matrix and the indexing loop below raises IndexError.
n_classes = len(families)
cm_labels = list(range(n_classes))
cm_normalized = confusion_matrix(y_true, y_pred, labels=cm_labels, normalize='true')
cm_counts = confusion_matrix(y_true, y_pred, labels=cm_labels)

# Prepare data: one record per (true, predicted) cell
true_labels, pred_labels, values, counts, pct_text, count_text, text_colors = [], [], [], [], [], [], []

for i in range(n_classes):
    for j in range(n_classes):
        true_labels.append(families[i])
        pred_labels.append(families[j])
        val = cm_normalized[i, j]   # row-normalised fraction
        cnt = int(cm_counts[i, j])  # raw count
        values.append(val)
        counts.append(cnt)
        pct_text.append(f'{val:.0%}')
        count_text.append(f'({cnt})')

        # Text color for readability: white on the darkest cells, dark otherwise
        text_colors.append('white' if val > 0.7 else '#2b2b2b')

source = ColumnDataSource(data=dict(
    true_label=true_labels,
    pred_label=pred_labels,
    value=values,
    count=counts,
    pct_text=pct_text,
    count_text=count_text,
    text_color=text_colors
))

# Color mapper - Blues gradient (light to dark)
palette = list(reversed(Blues9))
mapper = LinearColorMapper(palette=palette, low=0, high=1)

# Create figure; the y-range is reversed so the first class sits on the top row
p = figure(
    title="CNN Confusion Matrix (Test Set)",
    x_range=families, 
    y_range=list(reversed(families)),
    width=750, 
    height=600,
    tools="hover,save,reset"
)

p.rect(x="pred_label", y="true_label", width=1, height=1, source=source,
       fill_color=transform('value', mapper), line_color='white', line_width=2)

# Percentage and raw count are drawn as two text glyphs with opposite offsets
p.text(x='pred_label', y='true_label', text='pct_text', source=source,
       text_align='center', text_baseline='middle', text_font_size='14pt',
       text_font_style='bold', text_color='text_color', y_offset=6)

p.text(x='pred_label', y='true_label', text='count_text', source=source,
       text_align='center', text_baseline='middle', text_font_size='10pt',
       text_color='text_color', y_offset=-8)

# Color bar
color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(desired_num_ticks=10),
                     label_standoff=12, border_line_color=None, location=(0, 0),
                     title='Accuracy', title_text_font_style='bold')
p.add_layout(color_bar, 'right')

# Hover
hover = p.select_one(HoverTool)
hover.tooltips = [("True", "@true_label"), ("Predicted", "@pred_label"), 
                  ("Accuracy", "@value{0.1%}"), ("Count", "@count")]

# Styling
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.xaxis.axis_label = 'Predicted Label'
p.yaxis.axis_label = 'True Label'
p.xaxis.axis_label_text_font_style = "bold"
p.yaxis.axis_label_text_font_style = "bold"

show(p)

# Store CNN results (accuracy converted from percent to a fraction, matching the SVM entry)
cnn_results = {
    'test_accuracy': test_acc/100,
    'test_f1_macro': test_f1,
    'confusion_matrix': cm_counts,
    'confusion_matrix_normalized': cm_normalized
}


Final test evaluation...


                                                 

Test Accuracy: 0.6182
Test Macro F1: 0.5750

Test Classification Report:
              precision    recall  f1-score   support

   keyboards     1.0000    0.9524    0.9756        21
  percussion     0.3696    1.0000    0.5397        17
     strings     0.6250    0.2381    0.3448        21
       voice     0.7500    0.1111    0.1935        27
       winds     0.7188    0.9583    0.8214        24

    accuracy                         0.6182       110
   macro avg     0.6927    0.6520    0.5750       110
weighted avg     0.7083    0.6182    0.5622       110



In [10]:
# --- Results summary ---

print("\nSummary of Baseline Models")
print("-" * 50)
# One line per baseline, in the order the models were trained
print("\n".join(
    f"{model_name} - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['test_f1_macro']:.4f}"
    for model_name, results in (("MFCC + SVM", svm_results), ("Mel-Spec + CNN", cnn_results))
))


Summary of Baseline Models
--------------------------------------------------
MFCC + SVM - Test Accuracy: 0.6636, F1: 0.6386
Mel-Spec + CNN - Test Accuracy: 0.6182, F1: 0.5750
