In [None]:
import numpy as np
import os
import pandas as pd
import random
import torch
import torchaudio

from time import time
from torch import nn
from torch.utils.data import DataLoader

from msc_dataset import MSCDataset

# For data augmentation
import torch.nn.functional as F

In [2]:
# Device setup for Mac M4 Pro (MPS), CUDA (NVIDIA), or CPU fallback
if torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

print(f"Using device: {DEVICE}")

Using device: mps


In [3]:
# ==================== CONFIGURATION ====================
CFG = {
    'sampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
    'n_mels': 40,
    'f_min': 20,  # Ottimizzato per voce umana
    'f_max': 4000,
    'seed': 42,
    'train_steps': 5000,  # Pi√π step per epoch
    'train_batch_size': 64,  # Batch pi√π piccolo per better convergence
    'learning_rate': 0.001,
    'epochs': 50,  # Pi√π epochs con early stopping
    # Data Augmentation
    'time_shift_ms': 100,  # shift audio di ¬±100ms
    'noise_level': 0.005,   # background noise
    'time_stretch_factor': 0.1,  # speed variation
}

# Define the set of target classes
CLASSES = ['stop', 'up']

# Set Deterministic Behaviour
torch.manual_seed(CFG['seed'])
np.random.seed(CFG['seed'])
random.seed(CFG['seed'])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# ==================== MEL-SPECTROGRAM FEATURE EXTRACTOR ====================
class MelSpectrogramExtractor(nn.Module):
    """ONNX-compatible Mel-Spectrogram feature extractor"""
    def __init__(self):
        super().__init__()
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=CFG['sampling_rate'],
            n_fft=int(CFG['frame_length_in_s'] * CFG['sampling_rate']),
            hop_length=int(CFG['frame_step_in_s'] * CFG['sampling_rate']),
            n_mels=CFG['n_mels'],
            f_min=CFG['f_min'],
            f_max=CFG['f_max'],
            window_fn=torch.hann_window,
            power=2.0,
            normalized=False,
            center=True,
            pad_mode="reflect"
        )
        
    def forward(self, waveform):
        # waveform: (batch, samples)
        mel_spec = self.mel_transform(waveform)  # (batch, n_mels, time)
        
        # Log scale
        log_mel = torch.log(mel_spec + 1e-9)
        
        # Normalize to [-1, 1] range (per-sample)
        log_mel = (log_mel - log_mel.mean(dim=[1, 2], keepdim=True)) / (log_mel.std(dim=[1, 2], keepdim=True) + 1e-9)
        
        return log_mel.unsqueeze(1)  # (batch, 1, n_mels, time)

In [5]:
# ==================== DATA AUGMENTATION ====================
class AudioAugmentation:
    """Data augmentation per audio waveforms"""
    def __init__(self, config, training=True):
        self.config = config
        self.training = training
        
    def time_shift(self, waveform):
        """Shift audio randomly in time"""
        if not self.training or random.random() > 0.5:
            return waveform
            
        shift_samples = int(random.uniform(-self.config['time_shift_ms'], 
                                          self.config['time_shift_ms']) 
                          * self.config['sampling_rate'] / 1000)
        return torch.roll(waveform, shifts=shift_samples, dims=-1)
    
    def add_noise(self, waveform):
        """Add background white noise"""
        if not self.training or random.random() > 0.5:
            return waveform
            
        noise = torch.randn_like(waveform) * self.config['noise_level']
        return waveform + noise
    
    def time_stretch(self, waveform):
        """Slightly speed up or slow down audio"""
        if not self.training or random.random() > 0.5:
            return waveform
            
        rate = 1.0 + random.uniform(-self.config['time_stretch_factor'], 
                                    self.config['time_stretch_factor'])
        
        # Simple resampling-based time stretch
        stretched = F.interpolate(
            waveform.unsqueeze(0), 
            size=int(waveform.shape[-1] * rate),
            mode='linear',
            align_corners=False
        ).squeeze(0)
        
        # Pad or crop to original length
        target_len = waveform.shape[-1]
        if stretched.shape[-1] < target_len:
            stretched = F.pad(stretched, (0, target_len - stretched.shape[-1]))
        else:
            stretched = stretched[..., :target_len]
            
        return stretched
    
    def __call__(self, waveform):
        """Apply all augmentations in sequence"""
        waveform = self.time_shift(waveform)
        waveform = self.add_noise(waveform)
        waveform = self.time_stretch(waveform)
        return waveform

In [6]:
# ==================== CNN MODEL (IMPROVED) ====================
class KeywordSpotter(nn.Module):
    """Enhanced CNN for Up/Stop classification with Dropout"""
    def __init__(self, num_classes=2, dropout=0.3):
        super().__init__()
        
        # Block 1: 1 ‚Üí 64 channels
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout2d(dropout * 0.5)  # Light dropout early
        
        # Block 2: 64 ‚Üí 64 channels, downsample
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout2d(dropout * 0.5)
        
        # Block 3: 64 ‚Üí 128 channels, downsample
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.dropout3 = nn.Dropout2d(dropout)
        
        # Block 4: 128 ‚Üí 128 channels
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn4 = nn.BatchNorm2d(128)
        self.relu4 = nn.ReLU(inplace=True)
        
        # Block 5: 128 ‚Üí 256 channels (NEW)
        self.conv5 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn5 = nn.BatchNorm2d(256)
        self.relu5 = nn.ReLU(inplace=True)
        self.dropout5 = nn.Dropout2d(dropout)
        
        # Global Average Pooling
        self.gap = nn.AdaptiveAvgPool2d(1)
        
        # Classifier with dropout
        self.dropout_fc = nn.Dropout(dropout)
        self.fc = nn.Linear(256, num_classes, bias=True)
        
    def forward(self, x):
        # x: (batch, 1, 40, 49)
        x = self.dropout1(self.relu1(self.bn1(self.conv1(x))))  # (batch, 64, 40, 49)
        x = self.dropout2(self.relu2(self.bn2(self.conv2(x))))  # (batch, 64, 20, 25)
        x = self.dropout3(self.relu3(self.bn3(self.conv3(x))))  # (batch, 128, 10, 13)
        x = self.relu4(self.bn4(self.conv4(x)))                 # (batch, 128, 10, 13)
        x = self.dropout5(self.relu5(self.bn5(self.conv5(x))))  # (batch, 256, 10, 13)
        
        x = self.gap(x)  # (batch, 256, 1, 1)
        x = x.view(x.size(0), -1)  # (batch, 256)
        x = self.dropout_fc(x)
        x = self.fc(x)  # (batch, 2)
        
        return x

In [7]:
# ==================== MAIN TRAINING PIPELINE ====================
print("=" * 60)
print("UP/STOP KEYWORD SPOTTER - TRAINING PIPELINE (ENHANCED)")
print("=" * 60)
print(f"Device: {DEVICE}")
print(f"Mel-Spectrogram config: n_mels={CFG['n_mels']}, n_fft={int(CFG['frame_length_in_s'] * CFG['sampling_rate'])}, hop={int(CFG['frame_step_in_s'] * CFG['sampling_rate'])}")
print(f"Training config: epochs={CFG['epochs']}, batch_size={CFG['train_batch_size']}, lr={CFG['learning_rate']}")
print(f"Data Augmentation: time_shift=¬±{CFG['time_shift_ms']}ms, noise={CFG['noise_level']}, stretch=¬±{CFG['time_stretch_factor']*100}%")
print("=" * 60)

# Create Mel-Spectrogram transform
transform = MelSpectrogramExtractor()

# Create augmentation
train_augmentation = AudioAugmentation(CFG, training=True)
val_augmentation = AudioAugmentation(CFG, training=False)

# Create datasets
print("\nüìÅ Loading datasets...")
train_dataset = MSCDataset(
    root='.',
    classes=CLASSES,
    split='training',
    preprocess=None,
)

val_dataset = MSCDataset(
    root='.',
    classes=CLASSES,
    split='validation',
    preprocess=None,
)

test_dataset = MSCDataset(
    root='.',
    classes=CLASSES,
    split='testing',
    preprocess=None,
)

# Create dataloaders with RandomSampler for training
sampler = torch.utils.data.RandomSampler(
    train_dataset,
    replacement=True,
    num_samples=CFG['train_steps'] * CFG['train_batch_size'],
)
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['train_batch_size'],
    sampler=sampler,
    num_workers=0,  # Set to 0 for macOS compatibility
)

val_loader = DataLoader(val_dataset, batch_size=100, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=100, num_workers=0)

# Initialize models
print("\nüèóÔ∏è  Initializing models...")
feature_extractor = MelSpectrogramExtractor().to(DEVICE)
model = KeywordSpotter(num_classes=len(CLASSES), dropout=0.3).to(DEVICE)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params:,}")
print(f"Estimated size (float32): {total_params * 4 / 1024:.2f} KB")
print(f"Estimated size (int8): {total_params / 1024:.2f} KB")

# Loss and optimizer
loss_module = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG['learning_rate'], weight_decay=1e-4)

# Scheduler - More gradual decay
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=5, min_lr=1e-6
)

# Early stopping setup
best_val_acc = 0
best_val_loss = float('inf')
patience = 15  # More patience
patience_counter = 0
best_model_state = None

UP/STOP KEYWORD SPOTTER - TRAINING PIPELINE (ENHANCED)
Device: mps
Mel-Spectrogram config: n_mels=40, n_fft=640, hop=320
Training config: epochs=50, batch_size=64, lr=0.001
Data Augmentation: time_shift=¬±100ms, noise=0.005, stretch=¬±10.0%

üìÅ Loading datasets...
Using data folder: ./msc-training
Loaded 1600 samples from ./msc-training for classes ['stop', 'up']
Using data folder: ./msc-validation
Loaded 200 samples from ./msc-validation for classes ['stop', 'up']
Using data folder: ./msc-testing
Loaded 200 samples from ./msc-testing for classes ['stop', 'up']

üèóÔ∏è  Initializing models...
Model parameters: 555,330
Estimated size (float32): 2169.26 KB
Estimated size (int8): 542.31 KB


In [8]:
# ==================== TRAINING & EVALUATION FUNCTIONS ====================

def evaluate(model, feature_extractor, loader, device):
    """Evaluate model on validation/test set"""
    model.eval()
    correct = total = 0
    total_loss = 0
    loss_module = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch in loader:
            x = batch['x'].squeeze(1).to(device)
            y = batch['y'].to(device)
            features = feature_extractor(x)
            output = model(features)
            predictions = output.argmax(dim=1)
            correct += (predictions == y).sum().item()
            total += y.size(0)
            total_loss += loss_module(output, y).item()
    
    accuracy = (correct / total) * 100
    avg_loss = total_loss / len(loader)
    return accuracy, avg_loss


def train_epoch(model, feature_extractor, train_loader, optimizer, loss_module, device, 
                steps_per_epoch, current_epoch, augmentation):
    """Train model for one epoch with data augmentation"""
    model.train()
    
    start_step = current_epoch * steps_per_epoch
    end_step = start_step + steps_per_epoch
    
    epoch_loss = 0
    step_count = 0
    
    for step, batch in enumerate(train_loader):
        if step < start_step:
            continue
        if step >= end_step:
            break
            
        x = batch['x'].squeeze(1)  # Keep on CPU for augmentation
        y = batch['y'].to(device)
        
        # Apply data augmentation on CPU
        x_augmented = []
        for i in range(x.shape[0]):
            aug_sample = augmentation(x[i:i+1])
            x_augmented.append(aug_sample)
        x = torch.cat(x_augmented, dim=0).to(device)
        
        # Extract features
        with torch.no_grad():
            features = feature_extractor(x)
        
        # Forward pass
        output = model(features)
        loss = loss_module(output, y)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        step_count += 1
        
        if ((step + 1) % 100) == 0 or step == 0:
            print(f'Step={step}; Training Loss={loss.item():.4f}')
    
    avg_epoch_loss = epoch_loss / step_count if step_count > 0 else 0
    return avg_epoch_loss


def test_model(model, feature_extractor, test_loader, device):
    """Test model on test dataset"""
    model.eval()
    correct = total = 0
    
    with torch.no_grad():
        for batch in test_loader:
            x = batch['x'].squeeze(1).to(device)
            y = batch['y'].to(device)
            
            # Extract features
            features = feature_extractor(x)
            
            output = model(features)
            predictions = output.argmax(dim=1)
            
            correct += (predictions == y).sum().item()
            total += y.size(0)
    
    accuracy = (correct / total) * 100
    return accuracy

In [9]:
# ==================== TRAINING LOOP ====================
print("\nüöÄ Starting training...")

steps_per_epoch = len(train_loader) // CFG['epochs']
current_epoch = 0

train_history = {'epoch': [], 'train_loss': [], 'val_acc': [], 'val_loss': [], 'lr': []}

for epoch in range(CFG['epochs']):
    print(f"\n{'='*60}")
    print(f"EPOCH {epoch+1}/{CFG['epochs']}")
    print(f"{'='*60}")
    
    # Train for one epoch
    train_loss = train_epoch(
        model, feature_extractor, train_loader, optimizer, loss_module, 
        DEVICE, steps_per_epoch, epoch, train_augmentation
    )
    
    current_epoch += 1
    
    # Evaluate on validation set
    val_acc, val_loss = evaluate(model, feature_extractor, val_loader, DEVICE)
    
    # Get current learning rate
    current_lr = optimizer.param_groups[0]['lr']
    
    print(f'\nüìä Epoch {current_epoch} Summary:')
    print(f'   Train Loss: {train_loss:.4f}')
    print(f'   Val Acc: {val_acc:.2f}%')
    print(f'   Val Loss: {val_loss:.4f}')
    print(f'   Learning Rate: {current_lr:.6f}')
    
    # Save history
    train_history['epoch'].append(current_epoch)
    train_history['train_loss'].append(train_loss)
    train_history['val_acc'].append(val_acc)
    train_history['val_loss'].append(val_loss)
    train_history['lr'].append(current_lr)
    
    # Early stopping check
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_val_loss = val_loss
        best_model_state = model.state_dict().copy()
        patience_counter = 0
        print(f'‚úÖ New best model! Val Acc={val_acc:.2f}%')
    else:
        patience_counter += 1
        print(f'‚è≥ No improvement. Patience: {patience_counter}/{patience}')
    
    if patience_counter >= patience:
        print(f'\nüõë Early stopping at epoch {current_epoch}')
        break
    
    # Learning rate scheduler step (ReduceLROnPlateau needs metric)
    scheduler.step(val_acc)

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f'\n‚úÖ Loaded best model with Val Acc={best_val_acc:.2f}%')

print(f"\n{'='*60}")
print(f"Training completed after {current_epoch} epochs")
print(f"Best validation accuracy: {best_val_acc:.2f}%")
print(f"{'='*60}")


üöÄ Starting training...

EPOCH 1/50
Step=0; Training Loss=0.7418
Step=0; Training Loss=0.7418
Step=99; Training Loss=0.4684

üìä Epoch 1 Summary:
   Train Loss: 0.5945
   Val Acc: 78.00%
   Val Loss: 0.4290
   Learning Rate: 0.001000
‚úÖ New best model! Val Acc=78.00%

EPOCH 2/50
Step=99; Training Loss=0.4684

üìä Epoch 1 Summary:
   Train Loss: 0.5945
   Val Acc: 78.00%
   Val Loss: 0.4290
   Learning Rate: 0.001000
‚úÖ New best model! Val Acc=78.00%

EPOCH 2/50
Step=199; Training Loss=0.4439

üìä Epoch 2 Summary:
   Train Loss: 0.4131
   Val Acc: 88.00%
   Val Loss: 0.2403
   Learning Rate: 0.001000
‚úÖ New best model! Val Acc=88.00%

EPOCH 3/50
Step=199; Training Loss=0.4439

üìä Epoch 2 Summary:
   Train Loss: 0.4131
   Val Acc: 88.00%
   Val Loss: 0.2403
   Learning Rate: 0.001000
‚úÖ New best model! Val Acc=88.00%

EPOCH 3/50
Step=299; Training Loss=0.3500

üìä Epoch 3 Summary:
   Train Loss: 0.3266
   Val Acc: 92.50%
   Val Loss: 0.2239
   Learning Rate: 0.001000
‚úÖ New

In [10]:
# ==================== TEST EVALUATION ====================
print("\nüìä Evaluating model on test set...")

test_accuracy = test_model(model, feature_extractor, test_loader, DEVICE)
print(f'\nüéØ Test Accuracy: {test_accuracy:.2f}%')

if test_accuracy > 99.4:
    print("‚úÖ PASSED: Accuracy > 99.4%")
else:
    print("‚ùå FAILED: Accuracy <= 99.4%")



üìä Evaluating model on test set...

üéØ Test Accuracy: 99.00%
‚ùå FAILED: Accuracy <= 99.4%


In [11]:
# ==================== SAVE MODEL ====================
print("\n" + "="*60)
print("SAVING MODEL")
print("="*60)

timestamp = int(time())
saved_model_dir = './saved_models/'
if not os.path.exists(saved_model_dir):
    os.makedirs(saved_model_dir)

print(f'Model Timestamp: {timestamp}')

model.eval()
feature_extractor.eval()

# Move models to CPU for ONNX export (MPS not supported for export)
print("\nüîÑ Moving models to CPU for ONNX export...")
model_cpu = model.cpu()
feature_extractor_cpu = feature_extractor.cpu()

# Export Feature Extractor to ONNX
print("\nüì¶ Exporting Feature Extractor to ONNX...")
torch.onnx.export(
    feature_extractor_cpu,  # model to export
    torch.randn(1, 16000),  # inputs of the model (waveform)
    f'{saved_model_dir}/{timestamp}_frontend.onnx',  # filename of the ONNX model
    input_names=['input'],  # input name in the ONNX model
    dynamo=True,
    optimize=True,
    report=False,
    external_data=False,
)
print(f"‚úÖ Feature extractor saved: {saved_model_dir}/{timestamp}_frontend.onnx")

# Export Model to ONNX
print("\nüì¶ Exporting Model to ONNX...")
# Get a sample waveform from training dataset and extract features
sample_waveform = train_dataset[0]['x'].squeeze(0).unsqueeze(0).cpu()  # (1, 16000)
sample_features = feature_extractor_cpu(sample_waveform)  # (1, 1, n_mels, time)
torch.onnx.export(
    model_cpu,  # model to export
    sample_features,  # inputs of the model (mel-spectrogram features)
    f'{saved_model_dir}/{timestamp}_model.onnx',  # filename of the ONNX model
    input_names=['input'],  # input name in the ONNX model
    dynamo=True,
    optimize=True,
    report=False,
    external_data=False,
)
print(f"‚úÖ Model saved: {saved_model_dir}/{timestamp}_model.onnx")

# Check sizes
fe_size = os.path.getsize(f'{saved_model_dir}/{timestamp}_frontend.onnx') / 1024
model_size = os.path.getsize(f'{saved_model_dir}/{timestamp}_model.onnx') / 1024
total_size = fe_size + model_size

print("\n" + "="*60)
print("SIZE REPORT (ONNX - Float32)")
print("="*60)
print(f"Feature Extractor: {fe_size:.2f} KB")
print(f"Model: {model_size:.2f} KB")
print(f"Total: {total_size:.2f} KB")

if total_size < 300:
    print("‚úÖ PASSED: Total size < 300 KB (before quantization)")
else:
    print("‚ö†Ô∏è  WARNING: Size > 300 KB - quantization required!")

# Save Hyperparameters & Results
print("\nüìù Saving hyperparameters and results...")
output_dict = {
    'timestamp': timestamp,
    **CFG,
    'test_accuracy': test_accuracy
}

df = pd.DataFrame([output_dict])
output_path = './keyword_spotter_results.csv'
df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)
print(f"‚úÖ Results saved to {output_path}")

print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)



SAVING MODEL
Model Timestamp: 1764345312

üîÑ Moving models to CPU for ONNX export...

üì¶ Exporting Feature Extractor to ONNX...


W1128 16:55:12.471000 36687 torch/onnx/_internal/exporter/_registration.py:107] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Obtain model graph for `MelSpectrogramExtractor([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `MelSpectrogramExtractor([...]` with `torch.export.export(..., strict=False)`... ‚úÖ
[torch.onnx] Run decomposition...


W1128 16:55:12.911000 36687 torch/onnx/_internal/exporter/_registration.py:107] torchvision is not installed. Skipping torchvision::nms


[torch.onnx] Run decomposition... ‚úÖ
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ‚úÖ
Applied 4 of general pattern rewrite rules.
‚úÖ Feature extractor saved: ./saved_models//1764345312_frontend.onnx

üì¶ Exporting Model to ONNX...
[torch.onnx] Obtain model graph for `KeywordSpotter([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `KeywordSpotter([...]` with `torch.export.export(..., strict=False)`... ‚úÖ
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ‚úÖ
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ‚úÖ
Applied 10 of general pattern rewrite rules.
‚úÖ Model saved: ./saved_models//1764345312_model.onnx

SIZE REPORT (ONNX - Float32)
Feature Extractor: 332.49 KB
Model: 2178.31 KB
Total: 2510.80 KB

üìù Saving hyperparameters and results...
‚úÖ Results saved to ./keyword_spotter_results.csv

TRAINING COMPLETE
[torch.onnx] Obtain mo