In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoProcessor, AutoModelForAudioClassification

from datasets import load_dataset, Audio # <-- Key new imports
import numpy as np

import os
import time
import copy
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score # For better metrics

print("All libraries imported.")


In [None]:
# --- 1. Setup Device ---
# Prefer the first CUDA GPU; fall back to CPU when none is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1b. Fix random seeds ---
# Training is stochastic (shuffled batches, freshly initialised weights), so
# seed numpy and torch up front to make Restart & Run All reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- 2. Define Model Name ---
# Hugging Face Hub id of the pretrained speech-emotion checkpoint to fine-tune.
model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"

# --- 3. Define Data Paths ---
# CSV metadata files, resolved relative to the notebook's working directory.
DATA_DIR = '../data/MELD_processed/'
TRAIN_FILE = os.path.join(DATA_DIR, 'train_text.csv')
VAL_FILE = os.path.join(DATA_DIR, 'dev_text.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test_text.csv') # We'll load this later for final testing

print(f"Train file: {TRAIN_FILE}")
print(f"Val file: {VAL_FILE}")

In [None]:
# --- 4. Load CSV metadata ---
# Each split is read from its own CSV file into a single DatasetDict.
data_files = dict(train=TRAIN_FILE, val=VAL_FILE)
dataset = load_dataset("csv", data_files=data_files)

# --- 5. Audio column ---
# The "audio_path" column is intentionally NOT cast to Audio(sampling_rate=16000):
# that cast was removed to avoid a torchcodec error.

# --- 6. Rename columns and encode labels ---
dataset = dataset.rename_column("emotion", "label")

# Turn the string labels into a ClassLabel feature (integer ids + name list).
print("Encoding labels...")
dataset = dataset.class_encode_column("label")

# Label names as stored on the train split's ClassLabel feature.
labels = dataset["train"].features["label"].names
num_labels = len(labels)

# Name <-> id lookup tables, built from the position of each name in `labels`.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

print(f"Loaded dataset with {num_labels} labels:")
print(labels)
print(f"\nExample of label2id: {label2id}")

In [None]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# --- 7. Load Processor & Model ---
print(f"Loading processor and model for: {model_name}...")

# A feature extractor is used (not a full processor) because this model
# doesn't have a text tokenizer.
processor = AutoFeatureExtractor.from_pretrained(model_name)

# Load the checkpoint with our own label set. ignore_mismatched_sizes lets the
# load proceed when checkpoint tensor shapes differ from the requested config.
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    use_safetensors=True,
    ignore_mismatched_sizes=True,
)
# Move the model onto the selected device.
model = model.to(device)

print("‚úÖ Processor and model loaded successfully.")

In [None]:
import librosa # <-- Make sure to import librosa

# --- 8. Preprocessing Function ---
# This function will be applied to every audio file
def preprocess_function(batch, sampling_rate=16000, max_length=80000):
    """Load a batch of audio files and convert them to fixed-length model inputs.

    Args:
        batch: Mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``. Must contain ``"audio_path"``
            (paths readable by librosa) and ``"label"``.
        sampling_rate: Rate (Hz) the audio is resampled to and that is
            reported to the feature extractor.
        max_length: Number of samples every clip is truncated/padded to.
            The default of 80000 is 5 seconds at 16 kHz.

    Returns:
        The feature extractor's output for the batch with the incoming
        ``"label"`` values attached unchanged under ``"label"``.
    """
    # Load every file with librosa, resampling to the target rate.
    audio_arrays = [
        librosa.load(path, sr=sampling_rate)[0] for path in batch["audio_path"]
    ]

    # Pad to a FIXED length rather than to the longest clip of this chunk.
    # "longest" is evaluated per .map() chunk, so different chunks could end
    # up with different lengths, which a default-collated DataLoader cannot
    # stack into one tensor.
    processed_batch = processor(
        audio_arrays,
        sampling_rate=sampling_rate,
        truncation=True,       # cut clips longer than max_length
        padding="max_length",  # pad shorter clips up to max_length
        max_length=max_length,
    )

    # Labels are passed through untouched.
    processed_batch["label"] = batch["label"]
    return processed_batch

# --- 9. Apply Preprocessing ---
print(
    "Applying preprocessing to the dataset (using librosa)...",
    "This may take a few minutes...",
    sep="\n",
)

# Run preprocess_function over every split, 100 examples per call.
processed_dataset = dataset.map(preprocess_function, batched=True, batch_size=100)

print("‚úÖ Preprocessing complete.")
print(processed_dataset)

In [None]:
import torch
from dataclasses import dataclass
from transformers.feature_extraction_utils import BatchFeature
import torch.optim as optim

# --- 10. Define Custom Data Collator ---
# This class fixes the "KeyError: 'label'"
from typing import Any


@dataclass
class CustomAudioDataCollator:
    """Collate per-example feature dicts into one padded batch with labels.

    Each incoming feature dict is expected to carry a ``"label"`` entry next
    to the model inputs. The label is split off, the remaining entries are
    padded through ``processor.pad`` and the labels are added back as a
    tensor under the plural key ``"labels"``.
    """

    # Object exposing ``pad(features, padding=..., return_tensors=...)``.
    processor: Any
    # Forwarded unchanged as the ``padding`` argument of ``processor.pad``.
    padding: bool = True

    def __call__(self, features):
        """Build a batch from ``features``.

        Args:
            features: List of dicts, e.g.
                ``[{'input_values': ..., 'label': ...}, ...]``.

        Returns:
            Whatever ``processor.pad`` returns, with an extra ``"labels"``
            tensor holding one label per example.

        Raises:
            KeyError: If a feature dict has no ``"label"`` entry.
        """
        # 1. Read the labels WITHOUT mutating the caller's dicts, so the same
        #    examples can be collated again later.
        labels = [feature["label"] for feature in features]
        inputs = [
            {key: value for key, value in feature.items() if key != "label"}
            for feature in features
        ]

        # 2. Pad with the processor held by this instance (not a module-level
        #    global), so the collator works with whatever it was built with.
        batch = self.processor.pad(
            inputs,
            padding=self.padding,
            return_tensors="pt",
        )

        # 3. Add the labels back into the batch as a tensor ('labels', plural).
        batch["labels"] = torch.tensor(labels)

        return batch

# --- 11. Initialize the Collator, Optimizer, and Loss ---
# Collator that pads with the loaded processor.
data_collator = CustomAudioDataCollator(processor, True)
# AdamW over every model parameter.
optimizer = optim.AdamW(params=model.parameters(), lr=3e-5)
# Plain (unweighted) cross-entropy.
criterion = nn.CrossEntropyLoss()

print("‚úÖ Custom Collator, Optimizer, and Loss defined.")

In [None]:
import time
import copy
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.amp import autocast, GradScaler
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# --- 10. Define Optimizer and Loss ---
optimizer = optim.AdamW(params=model.parameters(), lr=3e-5)

# --- Class weights to counter label imbalance ---
# Integer labels of the training split as a NumPy array.
train_labels = np.array(processed_dataset["train"]["label"])
# Class names, in id order, from the ClassLabel feature.
class_names = processed_dataset["train"].features["label"].names

# Enumerate every class id explicitly (0 .. number of classes - 1) instead of
# deriving the list from np.unique(train_labels).
classes_list = np.arange(len(class_names))

# "balanced" weighting over the training labels.
class_weights = compute_class_weight(
    'balanced',
    classes=classes_list,
    y=train_labels,
)
# Float tensor placed on the training device.
class_weights = torch.tensor(class_weights, dtype=torch.float, device=device)

print(f"Class weights: {class_weights}")
print(f"Applying weights to: {class_names}")

# Cross-entropy that scales each class by its weight.
criterion = nn.CrossEntropyLoss(weight=class_weights)

print("‚úÖ Optimizer and Weighted Loss defined.")

# --- 11. Create DataLoaders (using batch_size=4) ---
# Expose only the model inputs and the label, returned as torch tensors.
processed_dataset.set_format(
    type="torch",
    columns=["input_values", "attention_mask", "label"],
)

# Training batches are shuffled; validation batches keep dataset order.
train_dataloader = DataLoader(processed_dataset["train"], batch_size=4, shuffle=True)
val_dataloader = DataLoader(processed_dataset["val"], batch_size=4)

# Per-phase lookups used by the training loop.
dataloaders = dict(train=train_dataloader, val=val_dataloader)
dataset_sizes = {split: len(processed_dataset[split]) for split in ("train", "val")}
print("‚úÖ DataLoaders created successfully (batch_size=4).")

# --- 12. Define train_model function ---
def train_model(model, criterion, optimizer, num_epochs=3):
    """Fine-tune ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase followed by a 'val' phase over the
    module-level ``dataloaders``. Training uses gradient accumulation and,
    on CUDA, float16 autocast with gradient scaling. The weights with the
    highest weighted validation F1 are restored before returning.

    Relies on the module-level names ``dataloaders``, ``dataset_sizes`` and
    ``device``.

    Args:
        model: Model called as ``model(input_values=..., attention_mask=...)``
            whose output exposes ``.logits``.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of train/val epochs to run.

    Returns:
        The same ``model`` object, with the best validation-F1 weights loaded.
    """
    since = time.time()
    # Mixed precision and gradient scaling are only enabled on CUDA; on CPU
    # both are switched off so the loop runs in full precision.
    use_amp = device.type == 'cuda'
    scaler = GradScaler('cuda', enabled=use_amp)
    accumulation_steps = 4 # Effective batch size = 4 * 4 = 16

    best_model_wts = copy.deepcopy(model.state_dict())
    best_f1 = 0.0

    for epoch in range(num_epochs):
        print(f'\n--- Epoch {epoch+1}/{num_epochs} ---')
        print('-' * 20)

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)
            if is_train:
                # Start every training phase from clean gradients.
                optimizer.zero_grad(set_to_none=True)
            running_loss, all_preds, all_labels = 0.0, [], []
            num_batches = len(dataloaders[phase])

            for i, batch in enumerate(tqdm(dataloaders[phase], desc=f"{phase.title()} Batches", leave=False)):
                input_values = batch['input_values'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Autograd is only needed while training; disabling it during
                # validation avoids building a graph for every batch.
                with torch.set_grad_enabled(is_train):
                    with autocast(device.type, dtype=torch.float16, enabled=use_amp):
                        outputs = model(input_values=input_values, attention_mask=attention_mask)
                        logits = outputs.logits
                        loss = criterion(logits, labels)

                    if is_train:
                        # Only the value that is back-propagated is divided by
                        # accumulation_steps; `loss` itself stays unscaled so
                        # the reported train loss is comparable to the val loss.
                        scaler.scale(loss / accumulation_steps).backward()
                        # Step once per accumulation window and on the final
                        # (possibly partial) window of the epoch.
                        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                            scaler.step(optimizer)
                            scaler.update()
                            optimizer.zero_grad(set_to_none=True)

                running_loss += loss.item() * input_values.size(0)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.detach().cpu().numpy())
                all_labels.extend(labels.detach().cpu().numpy())

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = accuracy_score(all_labels, all_preds)
            epoch_f1 = f1_score(all_labels, all_preds, average="weighted")

            print(f"{phase.capitalize()} ‚Üí Loss: {epoch_loss:.4f} | Acc: {epoch_acc:.4f} | F1: {epoch_f1:.4f}")

            # Snapshot the weights whenever validation F1 improves.
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                best_model_wts = copy.deepcopy(model.state_dict())
                print(f"‚ú® New best val F1: {best_f1:.4f}")

    total_time = time.time() - since
    print(f"\nüèÅ Training complete in {total_time/60:.1f} min | Best F1: {best_f1:.4f}")
    model.load_state_dict(best_model_wts)
    return model

# --- 13. START TRAINING ---
print("‚öôÔ∏è Starting fast fine-tuning loop (with weighted loss)...")
model_ft = train_model(model=model, criterion=criterion, optimizer=optimizer, num_epochs=3)

# --- 14. Save Model ---
# Only the state dict is written; the target folder is created if missing.
SAVE_PATH = "../models/ser_model_finetuned_weighted.pth"
os.makedirs(os.path.split(SAVE_PATH)[0], exist_ok=True)
torch.save(model_ft.state_dict(), SAVE_PATH)
print(f"\n‚úÖ Training Finished! Model saved to: {SAVE_PATH}")