Multi-Modal Mood Matcher - Audio Feature Extraction Training Notebook
======================================================================
This notebook trains audio emotion recognition models using:
1. Wav2Vec2/HuBERT for self-supervised speech features
2. CNN on mel-spectrograms with temporal modeling
3. ECAPA-TDNN for speaker-aware emotion features

Target: Extract emotion-aware temporal embeddings from speech segments

In [1]:
# ============================================================================
# 1. IMPORTS AND SETUP
# ============================================================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import json
import random
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Audio Processing
import librosa
import librosa.display
import soundfile as sf
import torchaudio
import torchaudio.transforms as T

# Transformers for Wav2Vec2/HuBERT
from transformers import (
    Wav2Vec2Model, 
    Wav2Vec2Processor,
    Wav2Vec2Config,
    HubertModel,
    AutoProcessor,
    AutoModel
)

# SpeechBrain for ECAPA-TDNN (optional)
try:
    from speechbrain.pretrained import EncoderClassifier
    SPEECHBRAIN_AVAILABLE = True
    print("✓ SpeechBrain available")
except (ImportError, AttributeError) as e:
    SPEECHBRAIN_AVAILABLE = False
    print(f"⚠️ SpeechBrain not available: {type(e).__name__}")
    print("   This is optional - Wav2Vec2 and CNN models will still work fine.")

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split


⚠️ SpeechBrain not available: AttributeError
   This is optional - Wav2Vec2 and CNN models will still work fine.


In [2]:
# Set random seeds
def set_seed(seed=42):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy, and PyTorch
            (CPU generator and all CUDA generators).

    Side effects:
        Switches cuDNN to deterministic mode and turns off its autotuner.
    """
    # Same seed for each generator, applied in the order stdlib -> NumPy -> torch CPU -> torch CUDA.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # The two cuDNN flags are independent of each other.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

In [3]:
# Device configuration
# Select the compute device once; the trainer and the embedding extractor
# further down move the model and each batch onto this global `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    # Report the name of CUDA device 0 in the run log.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


In [4]:
# ============================================================================
# 2. CONFIGURATION
# ============================================================================

class Config:
    """Central paths and hyperparameters for the audio emotion pipeline.

    Every setting is declared as a class-level attribute; nothing is assigned
    per instance, so an instance carries no attributes of its own.
    """
    # Paths
    DATA_ROOT = "./data/ravdess"  # RAVDESS dataset directory
    OUTPUT_DIR = "./outputs/audio_emotion_model"
    CHECKPOINT_DIR = "./checkpoints/audio"
    
    # Model Architecture
    MODEL_TYPE = "wav2vec2"  # Options: wav2vec2, hubert, cnn_spectrogram
    PRETRAINED_MODEL = "facebook/wav2vec2-base"  # or "facebook/hubert-base-ls960"
    
    # Audio Parameters
    SAMPLE_RATE = 16000  # Standard for speech models
    MAX_DURATION = 10.0  # seconds
    MIN_DURATION = 1.0  # presumably seconds as well — TODO confirm against the code that consumes it
    
    # Feature Extraction (for CNN-based models)
    N_MELS = 128
    N_MFCC = 40
    HOP_LENGTH = 512
    N_FFT = 2048
    
    # Model Hyperparameters
    EMBEDDING_DIM = 768  # Wav2Vec2: 768, HuBERT: 768
    HIDDEN_DIM = 512
    NUM_CLASSES = 4  # RAVDESS 4-class: neutral, happy, sad, angry
    USE_LSTM = True  # Add LSTM for temporal modeling
    LSTM_LAYERS = 2
    LSTM_BIDIRECTIONAL = True
    
    # Training Hyperparameters
    BATCH_SIZE = 16  # Audio models are memory intensive
    NUM_EPOCHS = 30
    LEARNING_RATE = 5e-5  # Lower LR for fine-tuning pretrained models
    WEIGHT_DECAY = 1e-4
    WARMUP_STEPS = 500
    SCHEDULER = "linear_warmup"
    
    # Training Strategy
    FREEZE_ENCODER_EPOCHS = 3  # Freeze Wav2Vec2 encoder initially
    GRAD_CLIP = 1.0  # max gradient norm passed to clip_grad_norm_; a falsy value disables clipping
    MIXED_PRECISION = True
    ACCUMULATION_STEPS = 2  # Gradient accumulation for larger effective batch size
    
    # Data Augmentation
    USE_AUGMENTATION = True
    AUGMENT_PROB = 0.5  # probability that a training waveform receives one random augmentation
    
    # Early Stopping
    PATIENCE = 8
    MIN_DELTA = 0.001
    
    # Logging
    USE_WANDB = False
    WANDB_PROJECT = "audio-emotion-embedding"
    
    # RAVDESS Emotion Labels (4-class mapping)
    # RAVDESS codes: 01=neutral, 03=happy, 04=sad, 05=angry
    EMOTION_LABELS = {
        0: 'neutral',
        1: 'happy',
        2: 'sad',
        3: 'angry'
    }
    
    # RAVDESS to our label mapping
    RAVDESS_EMOTION_MAP = {
        '01': 0,  # neutral
        '03': 1,  # happy
        '04': 2,  # sad
        '05': 3,  # angry
        # Excluded: '02': calm, '06': fearful, '07': disgust, '08': surprised
    }

# Shared settings object read as a global by the classes defined in later cells.
config = Config()
# Create the output and checkpoint folders up front (no error if they already exist).
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
os.makedirs(config.CHECKPOINT_DIR, exist_ok=True)

In [5]:
# ============================================================================
# 3. AUDIO PREPROCESSING AND AUGMENTATION
# ============================================================================

class AudioPreprocessor:
    """Load raw audio, bring it to a fixed length, and compute spectral features.

    Args:
        sample_rate: Target sampling rate in Hz; loaded audio is resampled to it.
        max_duration: Clip length in seconds enforced by `pad_or_truncate`.

    Attributes:
        max_samples: `int(sample_rate * max_duration)`, the fixed clip length
            in samples.

    Note:
        The mel/MFCC settings (`N_FFT`, `HOP_LENGTH`, `N_MELS`, `N_MFCC`) are
        read from the module-level `config` object at call time.
    """

    def __init__(self, sample_rate=16000, max_duration=10.0):
        self.sample_rate = sample_rate
        self.max_duration = max_duration
        self.max_samples = int(sample_rate * max_duration)

    def load_audio(self, path):
        """Read an audio file and return a mono 1-D float tensor at `sample_rate`.

        Args:
            path: Location of the audio file, passed straight to `soundfile.read`.

        Returns:
            1-D `torch.float32` tensor of samples.
        """
        # soundfile is used directly to avoid torchaudio backend issues.
        waveform_np, sr = sf.read(path)
        waveform = torch.from_numpy(waveform_np).float()

        if waveform.dim() == 1:
            # Mono file: add a leading channel axis.
            waveform = waveform.unsqueeze(0)
        elif waveform.dim() == 2:
            # Multi-channel file: transpose to (channels, samples).
            waveform = waveform.T

        # Down-mix any multi-channel audio to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        if sr != self.sample_rate:
            waveform = T.Resample(sr, self.sample_rate)(waveform)

        return waveform.squeeze(0)

    def pad_or_truncate(self, waveform):
        """Force the last (time) axis of `waveform` to exactly `max_samples`.

        Works on a single 1-D waveform as well as on batched `(..., time)`
        tensors: the length is measured on the last axis, the same axis that
        is sliced or right-padded with zeros.

        Args:
            waveform: Tensor whose last dimension is time.

        Returns:
            Tensor with the same leading dimensions and `max_samples` time steps.
        """
        num_samples = waveform.shape[-1]
        if num_samples > self.max_samples:
            return waveform[..., :self.max_samples]
        if num_samples < self.max_samples:
            return F.pad(waveform, (0, self.max_samples - num_samples))
        return waveform

    def normalize(self, waveform):
        """Peak-normalise `waveform` by its maximum absolute value.

        A small epsilon keeps all-zero input finite. An empty tensor is
        returned unchanged because it has no peak to reduce over.
        """
        if waveform.numel() == 0:
            return waveform
        return waveform / (torch.max(torch.abs(waveform)) + 1e-8)

    def extract_mel_spectrogram(self, waveform):
        """Compute a dB-scaled mel-spectrogram of `waveform`.

        The transform is moved to the waveform's device so that tensors
        already placed on the GPU do not hit a CPU/GPU mismatch.

        Args:
            waveform: Tensor whose last dimension is time.

        Returns:
            Mel-spectrogram in decibels, as produced by
            `MelSpectrogram` followed by `AmplitudeToDB`.
        """
        mel_transform = T.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_fft=config.N_FFT,
            hop_length=config.HOP_LENGTH,
            n_mels=config.N_MELS
        ).to(waveform.device)

        mel_spec = mel_transform(waveform)
        return T.AmplitudeToDB()(mel_spec)

    def extract_mfcc(self, waveform):
        """Compute MFCC features of `waveform` on the waveform's own device.

        Args:
            waveform: Tensor whose last dimension is time.

        Returns:
            MFCC tensor produced by `torchaudio.transforms.MFCC`.
        """
        mfcc_transform = T.MFCC(
            sample_rate=self.sample_rate,
            n_mfcc=config.N_MFCC,
            melkwargs={
                'n_fft': config.N_FFT,
                'hop_length': config.HOP_LENGTH,
                'n_mels': config.N_MELS
            }
        ).to(waveform.device)

        return mfcc_transform(waveform)

class AudioAugmentation:
    """Collection of random waveform perturbations used during training.

    Args:
        sample_rate: Sampling rate in Hz, forwarded to librosa's pitch shifter.
    """

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def add_noise(self, waveform, noise_factor=0.005):
        """Return `waveform` plus zero-mean Gaussian noise scaled by `noise_factor`."""
        return waveform + torch.randn_like(waveform) * noise_factor

    def time_shift(self, waveform, shift_max=0.2):
        """Circularly roll the waveform by a random fraction of its length.

        The fraction is drawn uniformly from `[-shift_max, shift_max]`.
        """
        fraction = np.random.uniform(-shift_max, shift_max)
        offset = int(fraction * len(waveform))
        return waveform.roll(offset)

    def pitch_shift(self, waveform, n_steps=2):
        """Shift pitch with librosa by a random amount in `[-n_steps, n_steps]`."""
        samples = waveform.numpy()
        semitones = np.random.uniform(-n_steps, n_steps)
        shifted = librosa.effects.pitch_shift(samples, sr=self.sample_rate, n_steps=semitones)
        return torch.from_numpy(shifted).float()

    def time_stretch(self, waveform, rate_range=(0.8, 1.2)):
        """Stretch the waveform by a random rate, then restore the input length.

        The stretched signal is cut or zero-padded on the right so the result
        has exactly as many samples as the input.
        """
        low, high = rate_range
        rate = np.random.uniform(low, high)
        stretched = librosa.effects.time_stretch(waveform.numpy(), rate=rate)

        target_len = len(waveform)
        missing = target_len - len(stretched)
        if missing < 0:
            stretched = stretched[:target_len]
        else:
            stretched = np.pad(stretched, (0, missing))

        return torch.from_numpy(stretched).float()

    def apply_augmentation(self, waveform, prob=0.5):
        """With probability `prob`, apply one uniformly chosen augmentation.

        Returns the input object itself when no augmentation is drawn.
        """
        if not (np.random.random() < prob):
            return waveform

        chosen = np.random.choice(['noise', 'time_shift', 'pitch_shift', 'time_stretch'])
        handlers = {
            'noise': self.add_noise,
            'time_shift': self.time_shift,
            'pitch_shift': self.pitch_shift,
            'time_stretch': self.time_stretch,
        }
        return handlers[str(chosen)](waveform)

# Module-level singletons: AudioEmotionDataset.__getitem__ and
# AudioEmbeddingExtractor call these directly instead of receiving them as arguments.
preprocessor = AudioPreprocessor(config.SAMPLE_RATE, config.MAX_DURATION)
augmenter = AudioAugmentation(config.SAMPLE_RATE)


In [6]:
# ============================================================================
# 4. DATASET CLASS
# ============================================================================

class AudioEmotionDataset(Dataset):
    """
    Dataset for audio emotion recognition

    Expected data structure:
    data_root/
        train/
            emotion_0/
                audio1.wav
                audio2.wav
            emotion_1/
                ...
        val/
            ...

    OR CSV format with columns: audio_path, emotion_label

    Args:
        data_root: Root folder containing one sub-folder per split.
        split: Split name; also the sub-folder read in directory mode.
        config: Settings object providing `EMOTION_LABELS` and `AUGMENT_PROB`.
            A fresh `Config()` is used when omitted.
        csv_path: Optional CSV manifest; when given, the directory is not scanned.
        use_augmentation: Request augmentation; only honoured for `split == 'train'`.

    Each item is a `(waveform, label)` pair: the waveform is loaded,
    peak-normalised, optionally augmented, then padded/truncated by the
    module-level `preprocessor` / `augmenter`; the label is an `int`.
    """

    def __init__(self, data_root, split='train', config=None,
                 csv_path=None, use_augmentation=True):
        self.data_root = Path(data_root)
        self.split = split
        self.config = config or Config()
        # Augmentation is a training-only behaviour regardless of the flag.
        self.use_augmentation = use_augmentation and (split == 'train')

        if csv_path:
            self.samples = self._load_from_csv(csv_path)
        else:
            self.samples = self._load_from_directory()

        print(f"{split} dataset: {len(self.samples)} samples")
        self._print_distribution()

    def _load_from_csv(self, csv_path):
        """Build the sample list from a CSV with `audio_path` and `emotion_label` columns."""
        df = pd.read_csv(csv_path)
        # Zipping the two columns avoids the per-row Series that iterrows() builds.
        return [
            {'path': path, 'label': int(label)}
            for path, label in zip(df['audio_path'], df['emotion_label'])
        ]

    def _load_from_directory(self):
        """Build the sample list from `<data_root>/<split>/<name>_<label>/*.wav`.

        Folders and files are visited in sorted order so the sample order
        (and therefore any seeded shuffle) is reproducible across
        filesystems.

        Raises:
            ValueError: If the split folder is missing, or a class folder
                name does not end in `_<integer>`.
        """
        split_dir = self.data_root / self.split
        if not split_dir.exists():
            raise ValueError(f"Directory {split_dir} does not exist")

        samples = []
        for emotion_dir in sorted(split_dir.iterdir()):
            if not emotion_dir.is_dir():
                continue

            suffix = emotion_dir.name.split('_')[-1]
            try:
                label = int(suffix)
            except ValueError:
                raise ValueError(
                    f"Cannot parse a class label from directory {emotion_dir}; "
                    f"expected a name ending in '_<int>'"
                ) from None

            samples.extend(
                {'path': str(audio_path), 'label': label}
                for audio_path in sorted(emotion_dir.glob('*.wav'))
            )

        return samples

    def _print_distribution(self):
        """Print per-class sample counts and percentages for this split."""
        labels = [s['label'] for s in self.samples]
        unique, counts = np.unique(labels, return_counts=True)
        print(f"\nLabel distribution in {self.split}:")
        for label, count in zip(unique, counts):
            emotion_name = self.config.EMOTION_LABELS.get(label, f"class_{label}")
            print(f"  {emotion_name}: {count} ({100*count/len(labels):.1f}%)")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the preprocessed `(waveform, label)` pair at position `idx`."""
        sample = self.samples[idx]

        waveform = preprocessor.load_audio(sample['path'])
        waveform = preprocessor.normalize(waveform)

        if self.use_augmentation:
            # Use this dataset's own settings rather than the module-level
            # config, so a config passed to the constructor is respected.
            waveform = augmenter.apply_augmentation(waveform, self.config.AUGMENT_PROB)

        # Fix the length last so augmentations cannot change the output size.
        waveform = preprocessor.pad_or_truncate(waveform)

        return waveform, sample['label']


In [7]:
# ============================================================================
# 5. MODEL ARCHITECTURES
# ============================================================================

class Wav2Vec2EmotionModel(nn.Module):
    """
    Emotion classifier built on a pretrained Wav2Vec2 encoder.

    Pipeline:
    - Wav2Vec2 encoder -> frame-level hidden states
      (can be frozen / unfrozen with `freeze_encoder` / `unfreeze_encoder`)
    - Optional LSTM over the frame sequence
    - Attention pooling -> utterance-level embedding
    - Classification head -> emotion logits

    Args:
        model_name: Identifier handed to `from_pretrained` for both the
            encoder and the processor.
        num_classes: Number of output classes.
        use_lstm: Insert an LSTM between the encoder and the pooling.
        hidden_dim: LSTM hidden size per direction.

    LSTM depth and directionality come from the module-level `config`
    (`LSTM_LAYERS`, `LSTM_BIDIRECTIONAL`).
    """

    def __init__(self, model_name="facebook/wav2vec2-base",
                 num_classes=4, use_lstm=True, hidden_dim=512):
        super().__init__()

        self.num_classes = num_classes
        self.use_lstm = use_lstm

        # Pretrained encoder and its matching processor.
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(model_name)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)

        # Width of the encoder's hidden states (768 for the base model).
        self.embedding_dim = self.wav2vec2.config.hidden_size

        if not use_lstm:
            self.lstm = None
            feature_dim = self.embedding_dim
        else:
            num_layers = config.LSTM_LAYERS
            bidirectional = config.LSTM_BIDIRECTIONAL
            self.lstm = nn.LSTM(
                input_size=self.embedding_dim,
                hidden_size=hidden_dim,
                num_layers=num_layers,
                batch_first=True,
                bidirectional=bidirectional,
                # Inter-layer dropout only applies to stacked LSTMs.
                dropout=0.3 if num_layers > 1 else 0
            )
            feature_dim = hidden_dim * 2 if bidirectional else hidden_dim

        # Maps the pooled utterance embedding to class logits.
        self.classifier = nn.Sequential(
            nn.LayerNorm(feature_dim),
            nn.Dropout(0.3),
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

        # Scores each frame; the softmax of these scores weights the pooling.
        self.attention = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.Tanh(),
            nn.Linear(128, 1)
        )

    def forward(self, waveform, return_embedding=False, return_temporal=False):
        """
        Run the model on a batch of raw waveforms.

        Args:
            waveform: Input tensor (B, seq_len)
            return_embedding: Return the pooled utterance-level embedding
            return_temporal: Return the per-frame feature sequence; takes
                precedence over `return_embedding`

        Returns:
            (B, T, D) frame features, (B, D) utterance embedding, or
            (B, num_classes) logits, depending on the flags
        """
        frame_states = self.wav2vec2(waveform).last_hidden_state  # (B, T, embedding_dim)

        if self.lstm is not None:
            features, _ = self.lstm(frame_states)  # (B, T, feature_dim)
        else:
            features = frame_states

        if return_temporal:
            return features

        # Softmax over the time axis turns frame scores into pooling weights.
        weights = self.attention(features).softmax(dim=1)  # (B, T, 1)
        utterance_embedding = (features * weights).sum(dim=1)  # (B, feature_dim)

        if return_embedding:
            return utterance_embedding

        return self.classifier(utterance_embedding)

    def _set_encoder_trainable(self, trainable):
        """Toggle `requires_grad` on every Wav2Vec2 encoder parameter."""
        for weight in self.wav2vec2.parameters():
            weight.requires_grad = trainable

    def freeze_encoder(self):
        """Stop gradients from flowing into the Wav2Vec2 encoder."""
        self._set_encoder_trainable(False)
        print("Wav2Vec2 encoder frozen")

    def unfreeze_encoder(self):
        """Make the Wav2Vec2 encoder trainable again."""
        self._set_encoder_trainable(True)
        print("Wav2Vec2 encoder unfrozen")

    def get_embedding(self, waveform):
        """Return the utterance-level embedding without tracking gradients."""
        with torch.no_grad():
            return self.forward(waveform, return_embedding=True)

    def get_temporal_embeddings(self, waveform):
        """Return the per-frame feature sequence without tracking gradients."""
        with torch.no_grad():
            return self.forward(waveform, return_temporal=True)

class CNNSpectrogramModel(nn.Module):
    """
    Emotion classifier operating on mel-spectrograms.

    Pipeline:
    - Four convolutional blocks; the last one averages away the frequency axis
    - Optional 2-layer bidirectional LSTM over the remaining time axis
    - Mean pooling over time
    - Classification head

    Args:
        num_classes: Number of output classes.
        n_mels: Accepted for interface compatibility; not read by the layers.
        use_lstm: Insert the LSTM between the CNN and the pooling.
    """

    def __init__(self, num_classes=4, n_mels=128, use_lstm=True):
        super().__init__()

        self.num_classes = num_classes
        self.use_lstm = use_lstm

        # (in_channels, out_channels, dropout) for the three max-pooled blocks.
        pooled_blocks = ((1, 64, 0.2), (64, 128, 0.2), (128, 256, 0.3))

        stack = []
        for in_ch, out_ch, drop_p in pooled_blocks:
            stack.extend([
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
                nn.Dropout(drop_p),
            ])

        # Final block: no max-pool/dropout; collapse frequency to a single bin.
        stack.extend([
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, None)),
        ])
        self.conv_layers = nn.Sequential(*stack)

        if use_lstm:
            self.lstm = nn.LSTM(
                input_size=512,
                hidden_size=256,
                num_layers=2,
                batch_first=True,
                bidirectional=True,
                dropout=0.3
            )
        else:
            self.lstm = None
        # 256 * 2 directions with the LSTM, 512 CNN channels without it.
        head_in = 512

        self.classifier = nn.Sequential(
            nn.Linear(head_in, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, mel_spec, return_embedding=False):
        """
        Run the model on a batch of mel-spectrograms.

        Args:
            mel_spec: Mel-spectrogram (B, 1, n_mels, time)
            return_embedding: Return the (B, 512) embedding instead of logits
        """
        conv_out = self.conv_layers(mel_spec)            # (B, 512, 1, T)
        sequence = conv_out.squeeze(2).transpose(1, 2)   # (B, T, 512)

        if self.lstm is not None:
            sequence, _ = self.lstm(sequence)            # (B, T, 512)

        # Average over the time axis to get one vector per clip.
        embedding = sequence.mean(dim=1)                 # (B, 512)

        if return_embedding:
            return embedding

        return self.classifier(embedding)


In [8]:
# ============================================================================
# 6. TRAINING UTILITIES
# ============================================================================

class FocalLoss(nn.Module):
    """Focal Loss for handling class imbalance.

    Per sample the loss is `alpha[target] * (1 - p_t) ** gamma * CE`, where
    `p_t` is the softmax probability of the true class and `CE` the plain
    cross-entropy.

    Args:
        alpha: Optional per-class weights (tensor or sequence of floats).
            `None` disables class weighting.
        gamma: Focusing exponent; 0 reduces the loss to (weighted) cross-entropy.
        reduction: `'mean'`, `'sum'`, or anything else for the unreduced
            per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept plain Python sequences as well as tensors.
        self.alpha = None if alpha is None else torch.as_tensor(alpha, dtype=torch.float32)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Raw logits, in any layout accepted by `F.cross_entropy`.
            targets: Class indices matching `inputs`.

        Returns:
            Scalar tensor for `'mean'` / `'sum'`, otherwise the per-sample loss.
        """
        # p_t must come from the UNWEIGHTED cross-entropy: exp(-alpha * CE)
        # is not a probability and would distort the focusing term.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        if self.alpha is not None:
            # Keep the class weights on the same device/dtype as the logits.
            alpha = self.alpha.to(device=inputs.device, dtype=inputs.dtype)
            weighted_ce = F.cross_entropy(inputs, targets, reduction='none', weight=alpha)
        else:
            weighted_ce = ce_loss

        focal_loss = ((1 - pt) ** self.gamma) * weighted_ce

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

class AverageMeter:
    """Running weighted average of a scalar, plus the most recent value.

    Attributes:
        val: Last value passed to `update`.
        sum: Weighted sum of all values seen since the last reset.
        count: Total weight seen since the last reset.
        avg: `sum / count`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

class EarlyStopping:
    """Signal when a monitored score has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            the stop flag is raised.
        min_delta: Margin by which a score must beat the best one to count as
            an improvement.
        mode: `'min'` if lower scores are better; any other value treats
            higher scores as better.
    """

    def __init__(self, patience=10, min_delta=0.001, mode='min'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score):
        """Record `score` and return whether training should stop."""
        is_first = self.best_score is None
        if is_first or self._is_improvement(score):
            self.best_score = score
            # The very first observation only seeds the baseline; the
            # counter is left as it is.
            if not is_first:
                self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return self.early_stop

    def _is_improvement(self, score):
        """True when `score` beats the best score by more than `min_delta`."""
        if self.mode == 'min':
            return score < (self.best_score - self.min_delta)
        return score > (self.best_score + self.min_delta)

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a scheduler with linear warmup followed by linear decay to zero.

    Args:
        optimizer: Optimizer whose learning rate is scaled.
        num_warmup_steps: Steps over which the multiplier rises from 0 to 1.
        num_training_steps: Step at which the multiplier reaches 0.

    Returns:
        A `LambdaLR` scheduler applying the multiplier to the base learning rate.
    """
    # Both spans are clamped to at least 1 to avoid dividing by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        in_warmup = current_step < num_warmup_steps
        if in_warmup:
            return float(current_step) / warmup_span
        remaining = float(num_training_steps - current_step)
        # Never go negative once the planned number of steps is exceeded.
        return max(0.0, remaining / decay_span)

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [9]:
# ============================================================================
# 7. TRAINING LOOP
# ============================================================================

class AudioEmotionTrainer:
    """Training and validation logic for audio emotion recognition.

    Args:
        model: Network to train; moved to the module-level `device`.
        train_loader: Sized loader yielding `(waveforms, labels)` batches.
        val_loader: Loader yielding `(waveforms, labels)` batches for validation.
        config: Settings object providing the training hyperparameters
            (`LEARNING_RATE`, `NUM_EPOCHS`, `ACCUMULATION_STEPS`, ...).

    Attributes:
        history: Per-epoch metrics (`train_*`, `val_*`, `lr`).
        best_val_acc: Best validation accuracy seen so far.
        global_step: Number of optimizer updates performed.
    """

    def __init__(self, model, train_loader, val_loader, config):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config

        # Loss function
        self.criterion = FocalLoss(alpha=None, gamma=2.0)

        # Optimizer
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=config.LEARNING_RATE,
            weight_decay=config.WEIGHT_DECAY
        )

        # Guard against 0/None so the modulo and division below stay valid.
        self.accumulation_steps = max(1, int(config.ACCUMULATION_STEPS or 1))

        # The scheduler advances once per optimizer update, not once per
        # batch, so its horizon is counted in updates (ceil division).
        updates_per_epoch = -(-len(train_loader) // self.accumulation_steps)
        num_training_steps = updates_per_epoch * config.NUM_EPOCHS
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=config.WARMUP_STEPS,
            num_training_steps=num_training_steps
        )

        # CUDA AMP only makes sense on a CUDA device; on CPU fall back to
        # the plain full-precision path.
        use_amp = bool(config.MIXED_PRECISION) and device.type == 'cuda'
        self.scaler = torch.cuda.amp.GradScaler() if use_amp else None

        # Early stopping
        self.early_stopping = EarlyStopping(
            patience=config.PATIENCE,
            min_delta=config.MIN_DELTA,
            mode='max'  # Maximize accuracy
        )

        # Training history
        self.history = {
            'train_loss': [], 'train_acc': [], 'train_f1': [],
            'val_loss': [], 'val_acc': [], 'val_f1': [],
            'lr': []
        }

        self.best_val_acc = 0.0
        self.global_step = 0

    def _optimizer_step(self):
        """Clip gradients, apply one optimizer update, and advance the LR schedule."""
        if self.scaler is not None:
            if self.config.GRAD_CLIP:
                # Gradients must be unscaled before their norm is measured.
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.GRAD_CLIP)
            self.scaler.step(self.optimizer)
            self.scaler.update()
        else:
            if self.config.GRAD_CLIP:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.GRAD_CLIP)
            self.optimizer.step()

        self.optimizer.zero_grad()
        self.scheduler.step()
        self.global_step += 1

    def _config_snapshot(self):
        """Collect the config values to store alongside a checkpoint.

        Settings declared as class attributes do not appear in
        `vars(instance)`, so upper-case attributes are gathered through
        `dir()` and then overlaid with any public per-instance attributes.
        """
        snapshot = {
            name: getattr(self.config, name)
            for name in dir(self.config)
            if name.isupper()
        }
        instance_attrs = getattr(self.config, '__dict__', {})
        snapshot.update(
            {name: value for name, value in instance_attrs.items() if not name.startswith('_')}
        )
        return snapshot

    def train_epoch(self, epoch):
        """Train for one epoch.

        Args:
            epoch: Zero-based epoch index (used for the progress-bar label).

        Returns:
            `(mean_loss, mean_accuracy, weighted_f1)` over the epoch.
        """
        self.model.train()
        # Start from clean gradients; nothing may leak in from a prior epoch.
        self.optimizer.zero_grad()

        losses = AverageMeter()
        accs = AverageMeter()

        all_preds = []
        all_labels = []

        num_batches = len(self.train_loader)
        pbar = tqdm(self.train_loader, desc=f'Epoch {epoch+1}/{self.config.NUM_EPOCHS}')

        for batch_idx, (waveforms, labels) in enumerate(pbar):
            waveforms = waveforms.to(device)
            labels = labels.to(device)

            # Forward + backward; the loss is divided so that accumulated
            # gradients average over the accumulation window.
            if self.scaler is not None:
                with torch.cuda.amp.autocast():
                    outputs = self.model(waveforms)
                    loss = self.criterion(outputs, labels) / self.accumulation_steps
                self.scaler.scale(loss).backward()
            else:
                outputs = self.model(waveforms)
                loss = self.criterion(outputs, labels) / self.accumulation_steps
                loss.backward()

            # Update on every full window and on the final batch, so a
            # trailing partial window is applied rather than carried over.
            window_full = (batch_idx + 1) % self.accumulation_steps == 0
            last_batch = (batch_idx + 1) == num_batches
            if window_full or last_batch:
                self._optimizer_step()

            # Calculate metrics
            _, preds = torch.max(outputs, 1)
            acc = (preds == labels).float().mean()

            # Undo the accumulation scaling so the logged loss is per batch.
            losses.update(loss.item() * self.accumulation_steps, waveforms.size(0))
            accs.update(acc.item(), waveforms.size(0))

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # Update progress bar
            pbar.set_postfix({'loss': losses.avg, 'acc': accs.avg})

        # Plain float keeps the checkpointed history free of NumPy scalars.
        f1 = float(f1_score(all_labels, all_preds, average='weighted'))

        return losses.avg, accs.avg, f1

    def validate(self):
        """Evaluate on the validation loader without gradient tracking.

        Returns:
            `(mean_loss, mean_accuracy, weighted_f1, predictions, labels)`.
        """
        self.model.eval()

        losses = AverageMeter()
        accs = AverageMeter()

        all_preds = []
        all_labels = []

        with torch.no_grad():
            for waveforms, labels in tqdm(self.val_loader, desc='Validation'):
                waveforms = waveforms.to(device)
                labels = labels.to(device)

                outputs = self.model(waveforms)
                loss = self.criterion(outputs, labels)

                _, preds = torch.max(outputs, 1)
                acc = (preds == labels).float().mean()

                losses.update(loss.item(), waveforms.size(0))
                accs.update(acc.item(), waveforms.size(0))

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        f1 = float(f1_score(all_labels, all_preds, average='weighted'))

        return losses.avg, accs.avg, f1, all_preds, all_labels

    def train(self):
        """Full training loop.

        Runs up to `NUM_EPOCHS` epochs, keeps the encoder frozen for the
        first `FREEZE_ENCODER_EPOCHS` epochs when the model supports it,
        saves the best and periodic checkpoints, and stops early when the
        validation accuracy stalls.

        Returns:
            The `history` dictionary of per-epoch metrics.
        """
        print("\n" + "="*80)
        print("STARTING AUDIO EMOTION TRAINING")
        print("="*80 + "\n")

        # Freeze encoder initially
        if self.config.FREEZE_ENCODER_EPOCHS > 0 and hasattr(self.model, 'freeze_encoder'):
            self.model.freeze_encoder()

        for epoch in range(self.config.NUM_EPOCHS):
            # Unfreeze encoder after specified epochs
            if epoch == self.config.FREEZE_ENCODER_EPOCHS and hasattr(self.model, 'unfreeze_encoder'):
                self.model.unfreeze_encoder()

            # Train
            train_loss, train_acc, train_f1 = self.train_epoch(epoch)

            # Validate (predictions/labels are not needed here)
            val_loss, val_acc, val_f1, _, _ = self.validate()

            # Record history
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            self.history['train_f1'].append(train_f1)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc'].append(val_acc)
            self.history['val_f1'].append(val_f1)
            self.history['lr'].append(self.scheduler.get_last_lr()[0])

            # Print summary
            print(f"\nEpoch {epoch+1}/{self.config.NUM_EPOCHS}")
            print(f"Train - Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
            print(f"Val   - Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

            # Save best model
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                self.save_checkpoint(epoch, is_best=True)
                print(f"✓ New best model! Val Acc: {val_acc:.4f}")

            # Regular checkpoint
            if (epoch + 1) % 5 == 0:
                self.save_checkpoint(epoch, is_best=False)

            # Early stopping
            if self.early_stopping(val_acc):
                print(f"\nEarly stopping at epoch {epoch+1}")
                break

            print("-" * 80)

        print("\n" + "="*80)
        print("TRAINING COMPLETED")
        print(f"Best Validation Accuracy: {self.best_val_acc:.4f}")
        print("="*80 + "\n")

        return self.history

    def save_checkpoint(self, epoch, is_best=False):
        """Write model, optimizer, scheduler, history and config to disk.

        Args:
            epoch: Zero-based epoch index stored in the checkpoint.
            is_best: Save as `best_audio_model.pth` instead of a
                per-epoch file.
        """
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_val_acc': self.best_val_acc,
            'history': self.history,
            'config': self._config_snapshot()
        }

        if is_best:
            path = os.path.join(self.config.CHECKPOINT_DIR, 'best_audio_model.pth')
        else:
            path = os.path.join(self.config.CHECKPOINT_DIR, f'audio_checkpoint_epoch_{epoch+1}.pth')

        torch.save(checkpoint, path)


In [10]:
# ============================================================================
# 8. EMBEDDING EXTRACTION
# ============================================================================

class AudioEmbeddingExtractor:
    """Extract embeddings from a trained audio emotion model.

    The constructor rebuilds the architecture selected by ``model_type``, restores
    its weights from the ``'model_state_dict'`` entry of a checkpoint file and puts
    the model in eval mode. Audio loading and preprocessing are delegated to the
    notebook-level ``preprocessor`` object, and the target device is taken from the
    notebook-level ``device`` variable.
    """

    SUPPORTED_MODEL_TYPES = ('wav2vec2', 'cnn_spectrogram')

    def __init__(self, model_path, config, model_type='wav2vec2'):
        """
        Args:
            model_path: Path to a checkpoint containing ``'model_state_dict'``.
            config: Configuration object providing the model hyper-parameters.
            model_type: ``'wav2vec2'`` or ``'cnn_spectrogram'``.

        Raises:
            ValueError: If ``model_type`` is not one of the supported types.
        """
        # Fail fast: previously an unknown type left self.model undefined and only
        # surfaced later as an AttributeError after the checkpoint had been read.
        if model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {self.SUPPORTED_MODEL_TYPES}"
            )

        self.config = config
        self.device = device
        self.model_type = model_type

        # Load model
        if model_type == 'wav2vec2':
            self.model = Wav2Vec2EmotionModel(
                model_name=config.PRETRAINED_MODEL,
                num_classes=config.NUM_CLASSES,
                use_lstm=config.USE_LSTM,
                hidden_dim=config.HIDDEN_DIM
            ).to(self.device)
        else:  # 'cnn_spectrogram'
            self.model = CNNSpectrogramModel(
                num_classes=config.NUM_CLASSES,
                n_mels=config.N_MELS,
                use_lstm=config.USE_LSTM
            ).to(self.device)

        # Load checkpoint
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        print(f"Audio model loaded from {model_path}")

    def _load_waveform(self, audio_path):
        """Load, normalise and pad/truncate one file; return it with a batch dim on ``self.device``."""
        waveform = preprocessor.load_audio(audio_path)
        waveform = preprocessor.normalize(waveform)
        waveform = preprocessor.pad_or_truncate(waveform)
        return waveform.unsqueeze(0).to(self.device)  # Add batch dim

    def extract_utterance_embedding(self, audio_path):
        """
        Extract utterance-level embedding from audio file

        Args:
            audio_path: Path to audio file

        Returns:
            embedding: numpy array (embedding_dim,)
        """
        waveform = self._load_waveform(audio_path)

        # Extract embedding
        with torch.no_grad():
            if self.model_type == 'cnn_spectrogram':
                # The spectrogram model consumes a mel spectrogram rather than the
                # raw waveform; two leading singleton dims are added before the call
                # (presumably batch and channel - confirm against the model).
                mel_spec = preprocessor.extract_mel_spectrogram(waveform.squeeze(0))
                mel_spec = mel_spec.unsqueeze(0).unsqueeze(0).to(self.device)
                embedding = self.model(mel_spec, return_embedding=True)
            else:
                embedding = self.model.get_embedding(waveform)

        return embedding.cpu().numpy().squeeze()

    def extract_temporal_embeddings(self, audio_path):
        """
        Extract temporal sequence of embeddings (for multi-modal alignment)

        Args:
            audio_path: Path to audio file

        Returns:
            embeddings: numpy array (num_timesteps, embedding_dim). When the model
                has no ``get_temporal_embeddings`` method the utterance embedding
                is returned as a single timestep, i.e. shape (1, embedding_dim).
        """
        waveform = self._load_waveform(audio_path)

        # Extract temporal embeddings
        with torch.no_grad():
            if hasattr(self.model, 'get_temporal_embeddings'):
                embeddings = self.model.get_temporal_embeddings(waveform)
            else:
                # Fallback: get utterance-level embedding as a one-step sequence.
                # NOTE(review): this passes the raw waveform to get_embedding even
                # for 'cnn_spectrogram', whereas extract_utterance_embedding feeds
                # that model a mel spectrogram - confirm the CNN model supports it.
                embeddings = self.model.get_embedding(waveform).unsqueeze(1)

        # Drop only the batch axis. A bare squeeze() would also remove a time axis
        # of length 1 and break the documented (num_timesteps, embedding_dim) shape.
        return embeddings.squeeze(0).cpu().numpy()

    def extract_batch_embeddings(self, audio_paths):
        """Extract utterance embeddings for each path and stack them into one array."""
        embeddings = []
        for path in tqdm(audio_paths, desc='Extracting embeddings'):
            embeddings.append(self.extract_utterance_embedding(path))
        return np.array(embeddings)


In [11]:
# ============================================================================
# 9. EVALUATION AND VISUALIZATION
# ============================================================================

def plot_training_history(history, save_path=None):
    """Show loss, accuracy and F1 curves for training and validation side by side.

    Args:
        history: Mapping with the keys ``train_loss``, ``val_loss``, ``train_acc``,
            ``val_acc``, ``train_f1`` and ``val_f1``, each holding one value per epoch.
        save_path: Optional file path; when given the figure is also written there.
    """
    # One row per panel: (train key, val key, y-label / title, train legend, val legend)
    panel_specs = (
        ('train_loss', 'val_loss', 'Loss', 'Train', 'Val'),
        ('train_acc', 'val_acc', 'Accuracy', 'Train', 'Val'),
        ('train_f1', 'val_f1', 'F1 Score', 'Train F1', 'Val F1'),
    )

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for panel, (train_key, val_key, caption, train_legend, val_legend) in zip(axes, panel_specs):
        panel.plot(history[train_key], label=train_legend, marker='o')
        panel.plot(history[val_key], label=val_legend, marker='s')
        panel.set_xlabel('Epoch')
        panel.set_ylabel(caption)
        panel.set_title(caption)
        panel.legend()
        panel.grid(True)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

def plot_confusion_matrix(y_true, y_pred, labels, save_path=None):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        labels: Tick labels used on both heatmap axes.
        save_path: Optional file path; when given the figure is also written there.
    """
    counts = confusion_matrix(y_true, y_pred)

    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

def evaluate_model(model, test_loader, config):
    """Run the model over a loader and report accuracy, F1, per-class metrics and a confusion matrix.

    Args:
        model: Classifier called as ``model(batch)`` and returning per-class scores
            with classes along dim 1.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.
        config: Object providing ``EMOTION_LABELS`` (class id -> name) and
            ``OUTPUT_DIR`` (where ``confusion_matrix_audio.png`` is written).

    Returns:
        (all_preds, all_labels): flat lists of predicted and true class ids.
    """
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for waveforms, labels in tqdm(test_loader, desc='Evaluating'):
            waveforms = waveforms.to(device)
            outputs = model(waveforms)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            # .cpu() keeps this working if the loader ever yields labels on an accelerator
            all_labels.extend(labels.cpu().numpy())

    # Metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"\nTest Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    class_ids = list(config.EMOTION_LABELS.keys())
    class_names = list(config.EMOTION_LABELS.values())

    # Classification report
    # Passing the class ids explicitly keeps target_names aligned with the classes
    # even if one of them never occurs in the labels or the predictions; without
    # it scikit-learn raises on the length mismatch.
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds,
                                labels=class_ids,
                                target_names=class_names))

    # Confusion matrix
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)  # savefig does not create directories
    plot_confusion_matrix(all_labels, all_preds,
                         class_names,
                         save_path=os.path.join(config.OUTPUT_DIR, 'confusion_matrix_audio.png'))

    return all_preds, all_labels

# RAVDESS Dataset Preparation
Process the downloaded RAVDESS dataset and create train/val/test splits.

In [12]:
# ============================================================================
# RAVDESS DATASET PREPARATION
# ============================================================================

def _collect_ravdess_samples(actor_dirs, emotion_map):
    """Scan actor directories and return one metadata dict per usable audio clip."""
    samples = []

    for actor_dir in sorted(actor_dirs):
        if not actor_dir.is_dir():
            continue

        # Sorted so the row order of the saved CSVs does not depend on the
        # platform-specific order in which the file system lists entries.
        for audio_file in sorted(actor_dir.glob("*.wav")):
            parts = audio_file.stem.split('-')
            if len(parts) != 7:
                continue

            (modality, vocal_channel, emotion_code, intensity,
             _statement, _repetition, actor) = parts

            # Filter: only audio-only (03) and speech (01)
            if modality != '03' or vocal_channel != '01':
                continue

            # Filter: only emotions that are part of the configured mapping
            if emotion_code not in emotion_map:
                continue

            # Skip files whose actor field is not a number instead of crashing
            try:
                actor_number = int(actor)
            except ValueError:
                continue

            samples.append({
                'path': str(audio_file.absolute()),
                'emotion_code': emotion_code,
                'emotion_label': emotion_map[emotion_code],
                'intensity': intensity,
                'actor': actor,
                'gender': 'M' if actor_number % 2 == 1 else 'F'
            })

    return samples


def prepare_ravdess_dataset(ravdess_dir="./data/ravdess", output_dir="./data/processed", seed=42):
    """
    Prepare RAVDESS dataset for training

    Scans ``ravdess_dir`` for ``Actor_*`` folders, keeps the audio-only speech
    clips whose emotion code appears in ``config.RAVDESS_EMOTION_MAP``, splits
    them by actor into train/val/test and writes ``train.csv``, ``val.csv`` and
    ``test.csv`` (columns ``audio_path``, ``emotion_label``) to ``output_dir``.

    RAVDESS filename format: 03-01-06-01-02-01-12.wav
    - Modality (01 = full-AV, 02 = video-only, 03 = audio-only)
    - Vocal channel (01 = speech, 02 = song)
    - Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised)
    - Emotional intensity (01 = normal, 02 = strong)
    - Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door")
    - Repetition (01 = 1st repetition, 02 = 2nd repetition)
    - Actor (01 to 24, odd numbered actors are male, even numbered actors are female)

    Args:
        ravdess_dir: Root of the extracted dataset. An ``Audio_Speech_Actors_01-24``
            subdirectory is used when present, otherwise the root itself.
        output_dir: Directory that receives the split CSV files (created if missing).
        seed: Seed for the actor shuffle that defines the split.

    Returns:
        Dict with the keys ``'train'``, ``'val'`` and ``'test'`` mapping to the
        per-split DataFrames, or ``None`` when the directory, the actor folders or
        usable audio files cannot be found.
    """

    ravdess_path = Path(ravdess_dir)

    if not ravdess_path.exists():
        print(f"❌ RAVDESS directory not found: {ravdess_path.absolute()}")
        print("\nPlease ensure RAVDESS dataset is extracted to:")
        print(f"  {ravdess_path.absolute()}")
        print("\nExpected structure:")
        print("  data/ravdess/")
        print("    Audio_Speech_Actors_01-24/")
        print("      Actor_01/")
        print("        03-01-01-01-01-01-01.wav")
        print("        ...")
        print("      Actor_02/")
        print("        ...")
        return None

    print("RAVDESS Dataset Preparation")
    print("=" * 60)

    # Emotion mapping (RAVDESS code -> our label)
    # We'll use 4-class: neutral, happy, sad, angry
    emotion_map = config.RAVDESS_EMOTION_MAP

    print("\nEmotion Mapping (4-class):")
    for ravdess_code, label in emotion_map.items():
        emotion_name = config.EMOTION_LABELS[label]
        print(f"  RAVDESS {ravdess_code} -> {label}: {emotion_name}")

    # Check for Audio_Speech_Actors_01-24 subdirectory
    audio_speech_dir = ravdess_path / "Audio_Speech_Actors_01-24"
    if audio_speech_dir.exists():
        print("\n✓ Found Audio_Speech_Actors_01-24 subdirectory")
        search_path = audio_speech_dir
    else:
        print(f"\nSearching in: {ravdess_path}")
        search_path = ravdess_path

    actor_dirs = list(search_path.glob("Actor_*"))
    if len(actor_dirs) == 0:
        print(f"❌ No Actor_* directories found in {search_path}")
        print("\nAvailable directories:")
        for item in search_path.iterdir():
            if item.is_dir():
                print(f"  - {item.name}")
        return None

    print(f"\n✓ Found {len(actor_dirs)} Actor directories")

    # Collect all audio files
    samples = _collect_ravdess_samples(actor_dirs, emotion_map)

    if len(samples) == 0:
        print(f"❌ No valid audio files found in {search_path}")
        print("\nPlease check:")
        print("  1. Audio files exist in Actor_* directories")
        print("  2. Files follow RAVDESS naming convention (03-01-XX-XX-XX-XX-XX.wav)")
        print("  3. Files are audio-only (03) and speech (01)")
        return None

    print(f"\n✓ Found {len(samples)} audio files")

    # Create DataFrame
    df = pd.DataFrame(samples)

    # Print emotion distribution
    print("\nEmotion Distribution:")
    for label in sorted(df['emotion_label'].unique()):
        count = int((df['emotion_label'] == label).sum())
        emotion_name = config.EMOTION_LABELS[label]
        print(f"  {emotion_name}: {count} samples ({100*count/len(df):.1f}%)")

    # Split into train/val/test (roughly 70/15/15 of the actors).
    # The split is made by actor only, so no speaker appears in more than one
    # split; emotions are not stratified explicitly.
    actors = np.array(df['actor'].unique(), dtype=object)
    # A private RandomState yields the same permutation as seeding the global
    # NumPy generator, without resetting the global random state as a side effect.
    rng = np.random.RandomState(seed)
    rng.shuffle(actors)

    n_actors = len(actors)
    n_train = int(0.7 * n_actors)
    n_val = int(0.15 * n_actors)

    train_actors = actors[:n_train]
    val_actors = actors[n_train:n_train+n_val]
    test_actors = actors[n_train+n_val:]  # whatever is left after rounding down

    train_df = df[df['actor'].isin(train_actors)].copy()
    val_df = df[df['actor'].isin(val_actors)].copy()
    test_df = df[df['actor'].isin(test_actors)].copy()

    print("\nDataset Splits (Speaker-Independent):")
    print(f"  Train: {len(train_df)} samples ({len(train_actors)} actors)")
    print(f"  Val:   {len(val_df)} samples ({len(val_actors)} actors)")
    print(f"  Test:  {len(test_df)} samples ({len(test_actors)} actors)")

    # Save split CSVs
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save with required columns for dataset loader
    for split_name, split_df in [('train', train_df), ('val', val_df), ('test', test_df)]:
        split_df_save = split_df[['path', 'emotion_label']].copy()
        split_df_save.columns = ['audio_path', 'emotion_label']
        split_df_save.to_csv(output_dir / f'{split_name}.csv', index=False)

    print(f"\n✓ Saved processed CSVs to: {output_dir.absolute()}")
    print("\nFiles created:")
    print(f"  - {output_dir / 'train.csv'}")
    print(f"  - {output_dir / 'val.csv'}")
    print(f"  - {output_dir / 'test.csv'}")

    return {
        'train': train_df,
        'val': val_df,
        'test': test_df
    }

# Build the split CSVs once and tell the user whether training can proceed
print("Preparing RAVDESS dataset...")
print("=" * 80)
dataset_splits = prepare_ravdess_dataset()

if not dataset_splits:
    # Preparation returned nothing usable: repeat the expected directory layout
    print("\n" + "=" * 80)
    print("⚠️ DATASET PREPARATION FAILED")
    print("=" * 80)
    print("\nPlease ensure RAVDESS dataset is properly extracted to:")
    print("\n".join([
        "  ./data/ravdess/Audio_Speech_Actors_01-24/Actor_01/",
        "  ./data/ravdess/Audio_Speech_Actors_01-24/Actor_02/",
        "  ...",
        "  ./data/ravdess/Audio_Speech_Actors_01-24/Actor_24/",
    ]))
else:
    print("\n" + "=" * 80)
    print("✓ RAVDESS DATASET READY FOR TRAINING!")
    print("=" * 80)
    print("\nYou can now run the main training cell below.")


Preparing RAVDESS dataset...
RAVDESS Dataset Preparation

Emotion Mapping (4-class):
  RAVDESS 01 -> 0: neutral
  RAVDESS 03 -> 1: happy
  RAVDESS 04 -> 2: sad
  RAVDESS 05 -> 3: angry

✓ Found Audio_Speech_Actors_01-24 subdirectory

✓ Found 24 Actor directories

✓ Found 672 audio files

Emotion Distribution:
  neutral: 96 samples (14.3%)
  happy: 192 samples (28.6%)
  sad: 192 samples (28.6%)
  angry: 192 samples (28.6%)

Dataset Splits (Speaker-Independent):
  Train: 448 samples (16 actors)
  Val:   84 samples (3 actors)
  Test:  140 samples (5 actors)

✓ Saved processed CSVs to: /Users/guptatilak/Desktop/multi-modal-mood-matcher/audio/data/processed

Files created:
  - data/processed/train.csv
  - data/processed/val.csv
  - data/processed/test.csv

✓ RAVDESS DATASET READY FOR TRAINING!

You can now run the main training cell below.


In [13]:
# ============================================================================
# 10. MAIN EXECUTION
# ============================================================================

def main():
    """Main training pipeline for RAVDESS dataset

    Loads the train/val CSVs from ``./data/processed``, builds the model selected
    by ``config.MODEL_TYPE``, trains it, plots the training history, reloads the
    best checkpoint for an evaluation on the validation split and finally
    demonstrates embedding extraction on the first validation sample.

    Raises:
        ValueError: If ``config.MODEL_TYPE`` is not a supported model type.
    """

    print("="*80)
    print("AUDIO EMOTION EMBEDDING MODEL TRAINING - RAVDESS")
    print("="*80)

    # Load processed CSV files
    processed_dir = Path("./data/processed")
    train_csv = str(processed_dir / 'train.csv')
    val_csv = str(processed_dir / 'val.csv')

    print("\n✓ Loading RAVDESS datasets from processed CSV files...")

    # Create datasets from CSV
    train_dataset = AudioEmotionDataset(
        data_root="./data",
        split='train',
        config=config,
        csv_path=train_csv,
        use_augmentation=config.USE_AUGMENTATION
    )

    val_dataset = AudioEmotionDataset(
        data_root="./data",
        split='val',
        config=config,
        csv_path=val_csv,
        use_augmentation=False
    )

    # Data loaders - USE num_workers=0 to avoid multiprocessing issues
    # (librosa operations in augmentation don't work well with multiprocessing)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=0,  # Changed from 4 to 0 to avoid worker process issues
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=0,  # Changed from 4 to 0 to avoid worker process issues
        pin_memory=True
    )

    # Initialize model
    print(f"\nInitializing {config.MODEL_TYPE} model...")
    if config.MODEL_TYPE == 'wav2vec2':
        model = Wav2Vec2EmotionModel(
            model_name=config.PRETRAINED_MODEL,
            num_classes=config.NUM_CLASSES,
            use_lstm=config.USE_LSTM,
            hidden_dim=config.HIDDEN_DIM
        )
    elif config.MODEL_TYPE == 'cnn_spectrogram':
        model = CNNSpectrogramModel(
            num_classes=config.NUM_CLASSES,
            n_mels=config.N_MELS,
            use_lstm=config.USE_LSTM
        )
    else:
        # Without this branch an unknown type only fails later with an
        # unbound-variable error on `model`.
        raise ValueError(
            f"Unsupported MODEL_TYPE {config.MODEL_TYPE!r}; "
            "expected 'wav2vec2' or 'cnn_spectrogram'"
        )

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Train
    trainer = AudioEmotionTrainer(model, train_loader, val_loader, config)
    history = trainer.train()

    # Plot history
    plot_training_history(
        history,
        save_path=os.path.join(config.OUTPUT_DIR, 'audio_training_history.png')
    )

    # Final evaluation (on the validation split, using the best checkpoint)
    print("\nFinal evaluation...")
    model_path = os.path.join(config.CHECKPOINT_DIR, 'best_audio_model.pth')
    # map_location mirrors AudioEmbeddingExtractor so the checkpoint loads on the
    # active device regardless of where it was saved.
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    evaluate_model(model, val_loader, config)

    print("\n" + "="*80)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(f"Best model: {model_path}")
    print("="*80)

    # Demonstration
    print("\n" + "="*80)
    print("DEMONSTRATION: EMBEDDING EXTRACTION")
    print("="*80)

    extractor = AudioEmbeddingExtractor(model_path, config, config.MODEL_TYPE)

    # Example: first validation sample (only its label is needed here)
    _, sample_label = val_dataset[0]
    # NOTE(review): the split CSVs are written with an 'audio_path' column; this
    # assumes AudioEmotionDataset exposes it under the 'path' key - confirm.
    sample_audio_path = val_dataset.samples[0]['path']

    # Utterance-level embedding
    utterance_emb = extractor.extract_utterance_embedding(sample_audio_path)
    print(f"\nUtterance embedding shape: {utterance_emb.shape}")
    # int() accepts both a plain int and a 0-d tensor label; a tensor cannot be
    # used directly as a key into EMOTION_LABELS.
    print(f"True emotion: {config.EMOTION_LABELS[int(sample_label)]}")

    # Temporal embeddings
    if config.MODEL_TYPE == 'wav2vec2':
        temporal_emb = extractor.extract_temporal_embeddings(sample_audio_path)
        print(f"Temporal embeddings shape: {temporal_emb.shape}")
        print(f"Approximate frames per second: {temporal_emb.shape[0] / config.MAX_DURATION:.1f}")

if __name__ == '__main__':
    main()

AUDIO EMOTION EMBEDDING MODEL TRAINING - RAVDESS

✓ Loading RAVDESS datasets from processed CSV files...
train dataset: 448 samples

Label distribution in train:
  neutral: 64 (14.3%)
  happy: 128 (28.6%)
  sad: 128 (28.6%)
  angry: 128 (28.6%)
val dataset: 84 samples

Label distribution in val:
  neutral: 12 (14.3%)
  happy: 24 (28.6%)
  sad: 24 (28.6%)
  angry: 24 (28.6%)

Initializing wav2vec2 model...
Total parameters: 106,319,237
Trainable parameters: 106,319,237

STARTING AUDIO EMOTION TRAINING

Wav2Vec2 encoder frozen


Epoch 1/30: 100%|██████████| 28/28 [19:11<00:00, 41.12s/it, loss=0.873, acc=0.192]
Validation: 100%|██████████| 6/6 [00:49<00:00,  8.25s/it]



Epoch 1/30
Train - Loss: 0.8732 | Acc: 0.1920 | F1: 0.1622
Val   - Loss: 0.7875 | Acc: 0.3452 | F1: 0.2710
✓ New best model! Val Acc: 0.3452
--------------------------------------------------------------------------------


Epoch 2/30: 100%|██████████| 28/28 [19:05<00:00, 40.90s/it, loss=0.838, acc=0.241]
Validation: 100%|██████████| 6/6 [00:49<00:00,  8.25s/it]



Epoch 2/30
Train - Loss: 0.8380 | Acc: 0.2411 | F1: 0.2151
Val   - Loss: 0.7627 | Acc: 0.2976 | F1: 0.2118
--------------------------------------------------------------------------------


Epoch 3/30: 100%|██████████| 28/28 [20:37<00:00, 44.18s/it, loss=0.82, acc=0.248] 
Validation: 100%|██████████| 6/6 [01:05<00:00, 10.86s/it]



Epoch 3/30
Train - Loss: 0.8198 | Acc: 0.2478 | F1: 0.2381
Val   - Loss: 0.7465 | Acc: 0.2976 | F1: 0.2222
--------------------------------------------------------------------------------
Wav2Vec2 encoder unfrozen


Epoch 4/30: 100%|██████████| 28/28 [29:22<00:00, 62.93s/it, loss=0.764, acc=0.286]
Validation: 100%|██████████| 6/6 [00:44<00:00,  7.44s/it]



Epoch 4/30
Train - Loss: 0.7637 | Acc: 0.2857 | F1: 0.2636
Val   - Loss: 0.7279 | Acc: 0.3571 | F1: 0.2901
✓ New best model! Val Acc: 0.3571
--------------------------------------------------------------------------------


Epoch 5/30: 100%|██████████| 28/28 [29:53<00:00, 64.04s/it, loss=0.73, acc=0.364] 
Validation: 100%|██████████| 6/6 [00:44<00:00,  7.48s/it]



Epoch 5/30
Train - Loss: 0.7297 | Acc: 0.3638 | F1: 0.3453
Val   - Loss: 0.6931 | Acc: 0.3214 | F1: 0.2284
--------------------------------------------------------------------------------


Epoch 6/30: 100%|██████████| 28/28 [25:25<00:00, 54.46s/it, loss=0.692, acc=0.411]
Validation: 100%|██████████| 6/6 [00:47<00:00,  7.97s/it]



Epoch 6/30
Train - Loss: 0.6915 | Acc: 0.4107 | F1: 0.3919
Val   - Loss: 0.6388 | Acc: 0.4524 | F1: 0.4044
✓ New best model! Val Acc: 0.4524
--------------------------------------------------------------------------------


Epoch 7/30: 100%|██████████| 28/28 [24:21<00:00, 52.18s/it, loss=0.606, acc=0.429]
Validation: 100%|██████████| 6/6 [00:45<00:00,  7.53s/it]



Epoch 7/30
Train - Loss: 0.6060 | Acc: 0.4286 | F1: 0.4117
Val   - Loss: 0.6273 | Acc: 0.4643 | F1: 0.4127
✓ New best model! Val Acc: 0.4643
--------------------------------------------------------------------------------


Epoch 8/30: 100%|██████████| 28/28 [24:45<00:00, 53.05s/it, loss=0.548, acc=0.48] 
Validation: 100%|██████████| 6/6 [00:48<00:00,  8.05s/it]



Epoch 8/30
Train - Loss: 0.5483 | Acc: 0.4799 | F1: 0.4549
Val   - Loss: 0.5257 | Acc: 0.5000 | F1: 0.4450
✓ New best model! Val Acc: 0.5000
--------------------------------------------------------------------------------


Epoch 9/30: 100%|██████████| 28/28 [24:09<00:00, 51.77s/it, loss=0.503, acc=0.52] 
Validation: 100%|██████████| 6/6 [00:45<00:00,  7.55s/it]



Epoch 9/30
Train - Loss: 0.5029 | Acc: 0.5201 | F1: 0.5036
Val   - Loss: 0.4479 | Acc: 0.5238 | F1: 0.4756
✓ New best model! Val Acc: 0.5238
--------------------------------------------------------------------------------


Epoch 10/30: 100%|██████████| 28/28 [24:31<00:00, 52.55s/it, loss=0.482, acc=0.542]
Validation: 100%|██████████| 6/6 [00:48<00:00,  8.01s/it]



Epoch 10/30
Train - Loss: 0.4818 | Acc: 0.5424 | F1: 0.5242
Val   - Loss: 0.6036 | Acc: 0.4286 | F1: 0.3611
--------------------------------------------------------------------------------


Epoch 11/30: 100%|██████████| 28/28 [24:37<00:00, 52.77s/it, loss=0.416, acc=0.616]
Validation: 100%|██████████| 6/6 [00:45<00:00,  7.56s/it]



Epoch 11/30
Train - Loss: 0.4158 | Acc: 0.6161 | F1: 0.6027
Val   - Loss: 0.4999 | Acc: 0.4881 | F1: 0.4411
--------------------------------------------------------------------------------


Epoch 12/30: 100%|██████████| 28/28 [24:47<00:00, 53.11s/it, loss=0.391, acc=0.562]
Validation: 100%|██████████| 6/6 [00:59<00:00,  9.96s/it]



Epoch 12/30
Train - Loss: 0.3913 | Acc: 0.5625 | F1: 0.5572
Val   - Loss: 0.4241 | Acc: 0.5952 | F1: 0.5334
✓ New best model! Val Acc: 0.5952
--------------------------------------------------------------------------------


Epoch 13/30: 100%|██████████| 28/28 [24:59<00:00, 53.56s/it, loss=0.321, acc=0.674]
Validation: 100%|██████████| 6/6 [00:49<00:00,  8.22s/it]



Epoch 13/30
Train - Loss: 0.3208 | Acc: 0.6741 | F1: 0.6724
Val   - Loss: 0.3794 | Acc: 0.6071 | F1: 0.5260
✓ New best model! Val Acc: 0.6071
--------------------------------------------------------------------------------


Epoch 14/30:  61%|██████    | 17/28 [15:56<10:18, 56.26s/it, loss=0.308, acc=0.68] 


KeyboardInterrupt: 