<a href="https://colab.research.google.com/github/dineshRaja29/Learning-From-Others/blob/main/Rec0_2025_Spring_paper2code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recitation 0: Paper2Code

Prepared by: Massa Baali (mbaali@andrew.cmu.edu)



### Dataset

In [None]:
# dataloader.py
import os
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd

class VoxCelebDataset(Dataset):
    """Utterance-level dataset backed by a CSV of audio paths and speaker labels.

    The CSV must provide two columns:
        * ``utt_paths``: path of each audio file, passed to ``torchaudio.load``.
        * ``utt_spk_int_labels``: the label stored under ``'mapped_id'``.
    """

    def __init__(self, train_csv_path, transform=None):
        """
        Args:
            train_csv_path (str): Path to the dataset csv file.
            transform (callable, optional): Optional transforms to apply to the audio data.
                Called as ``transform(waveform)`` and expected to return the
                transformed waveform.
        """
        df = pd.read_csv(train_csv_path)
        self.labels = df["utt_spk_int_labels"].values
        self.paths = df["utt_paths"].values
        # Keep the transform so __getitem__ can apply it (it used to be dropped).
        self.transform = transform

    def __len__(self):
        """Return the number of utterances listed in the CSV."""
        return len(self.paths)

    def __getitem__(self, index):
        """Load one utterance.

        Args:
            index (int): Row index into the CSV.

        Returns:
            dict: ``'waveform'`` (the loaded, optionally transformed audio),
            ``'path'`` (source file path), ``'mapped_id'`` (label from the CSV)
            and ``'lens'`` (size of the last dimension of ``'waveform'``).
        """
        # torchaudio.load returns a (waveform, sample_rate) pair; only the
        # waveform is exposed so the sample dictionary keeps its original keys.
        waveform, _sample_rate = torchaudio.load(self.paths[index])

        if self.transform is not None:
            waveform = self.transform(waveform)

        # Measured after the transform so 'lens' always describes 'waveform'.
        waveform_length = waveform.shape[-1]

        sample = {
            'waveform': waveform,
            'path': self.paths[index],
            'mapped_id': self.labels[index],
            'lens': waveform_length,
        }
        return sample

# Path to the VoxCeleb metadata CSV (despite its name, `data_dir` is a file path)
data_dir = "/path/to/voxceleb.csv"

# Build the full dataset from the CSV
voxceleb_dataset = VoxCelebDataset(data_dir)

# 80% train / 10% validation / whatever remains goes to test,
# so the three sizes always add up to the dataset length
num_utterances = len(voxceleb_dataset)
train_size = int(0.8 * num_utterances)
val_size = int(0.1 * num_utterances)
test_size = num_utterances - (train_size + val_size)

split_lengths = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = random_split(voxceleb_dataset, split_lengths)

# One DataLoader per split; only the training loader shuffles
batch_size = 16
voxceleb_dataloader_train = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
voxceleb_dataloader_val = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)
voxceleb_dataloader_test = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)


### Model


In [None]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiFrameAttention(nn.Module):
    """Multi-head self-attention followed by a linear projection.

    Args:
        embed_dim (int): Size of the last dimension of the input and output.
        num_heads (int): Number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, sequence, embed_dim)
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=embed_dim, out_features=embed_dim)

    def forward(self, x):
        """Attend over ``x`` with itself as query, key and value.

        Args:
            x: Tensor of shape (batch_size, sequence_length, embed_dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights (second return value) are not needed here.
        attended = self.attention(query=x, key=x, value=x)[0]
        return self.linear(attended)

class ConformerBlock(nn.Module):
    """One block made of attention, depthwise convolution and feed-forward stages.

    Each stage is wrapped in a residual connection with dropout; the first two
    stages are followed by layer normalisation, the last one is not.

    Args:
        embed_dim (int): Feature size of the input and output.
        num_heads (int): Number of attention heads.
        ff_dim (int): Hidden size of the feed-forward stage.
        dropout (float): Dropout probability shared by all three stages.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Sub-modules are created in this exact order: it fixes both the
        # parameter initialisation sequence and the state_dict layout.
        self.mfa = MultiFrameAttention(embed_dim, num_heads)
        self.ff1 = nn.Linear(embed_dim, ff_dim)
        self.ff2 = nn.Linear(ff_dim, embed_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)
        # Depthwise conv (groups == channels); kernel 31 with padding 15
        # leaves the sequence length unchanged.
        self.conv = nn.Conv1d(
            in_channels=embed_dim,
            out_channels=embed_dim,
            kernel_size=31,
            padding=15,
            groups=embed_dim,
        )

    def _attention_stage(self, x):
        """Attention with residual connection, dropout and LayerNorm."""
        return self.norm1(x + self.dropout(self.mfa(x)))

    def _conv_stage(self, x):
        """Depthwise convolution + GELU with residual, dropout and LayerNorm."""
        # Conv1d wants (batch, channels, time), so swap the last two axes
        # around the convolution and swap them back afterwards.
        channels_first = x.transpose(1, 2)
        activated = F.gelu(self.conv(channels_first))
        time_first = activated.transpose(1, 2)
        return self.norm2(x + self.dropout(time_first))

    def _feedforward_stage(self, x):
        """Two-layer GELU feed-forward network with residual and dropout."""
        hidden = F.gelu(self.ff1(x))
        return x + self.dropout(self.ff2(hidden))

    def forward(self, x):
        """Apply the three stages in order.

        Args:
            x: Tensor of shape (batch_size, sequence_length, embed_dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = self._attention_stage(x)
        x = self._conv_stage(x)
        return self._feedforward_stage(x)

class ConformerMFA(nn.Module):
    """Stack of ``ConformerBlock`` layers with mean pooling and a linear classifier.

    Args:
        num_blocks (int): Number of stacked blocks.
        embed_dim (int): Feature size of the input sequence.
        num_heads (int): Attention heads per block.
        ff_dim (int): Feed-forward hidden size per block.
        num_classes (int): Size of the output layer.
        dropout (float): Dropout probability passed to every block.
    """

    def __init__(self, num_blocks, embed_dim, num_heads, ff_dim, num_classes, dropout=0.1):
        super().__init__()
        self.blocks = nn.ModuleList(
            ConformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_blocks)
        )
        self.fc_out = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, sequence_length, embed_dim).

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden)
        # Collapse the sequence axis by averaging -> (batch_size, embed_dim)
        pooled = hidden.mean(dim=1)
        return self.fc_out(pooled)

# Example Usage
if __name__ == "__main__":
    # Hyperparameters of the demo model, gathered in one place
    demo_config = dict(
        num_blocks=4,
        embed_dim=256,
        num_heads=8,
        ff_dim=1024,
        num_classes=10,
        dropout=0.1,
    )
    model = ConformerMFA(**demo_config)

    # Random stand-in features: 16 sequences of 100 frames, embed_dim values each
    demo_batch, demo_frames = 16, 100
    dummy_input = torch.randn(demo_batch, demo_frames, demo_config["embed_dim"])
    output = model(dummy_input)

    # Expected: (16, num_classes)
    print("Output shape:", output.shape)


Output shape: torch.Size([16, 10])


### Train


In [None]:
# train.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Network to train; switched to training mode here.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple: ``(avg_loss, accuracy)`` averaged over all samples seen.
        Both values are ``nan`` when the dataloader yields no samples, so an
        empty split no longer raises ``ZeroDivisionError``.
    """
    model.train()
    epoch_loss, correct, total = 0.0, 0, 0

    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics: the batch loss is weighted by batch size before averaging
        # (assumes the criterion returns a per-batch mean — confirm if a
        # different reduction is used).
        epoch_loss += loss.item() * inputs.size(0)
        preds = outputs.max(dim=1).indices
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    if total == 0:
        # Nothing was processed; report "undefined" rather than dividing by zero.
        return float("nan"), float("nan")

    avg_loss = epoch_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

def validate_one_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its parameters.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple: ``(avg_loss, accuracy)`` averaged over all samples seen.
        Both values are ``nan`` when the dataloader yields no samples (for
        example a validation split of size zero on a very small dataset), so
        that case no longer raises ``ZeroDivisionError``.
    """
    model.eval()
    epoch_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Validation", leave=False):
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Metrics: the batch loss is weighted by batch size before
            # averaging (assumes the criterion returns a per-batch mean —
            # confirm if a different reduction is used).
            epoch_loss += loss.item() * inputs.size(0)
            preds = outputs.max(dim=1).indices
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was processed; report "undefined" rather than dividing by zero.
        return float("nan"), float("nan")

    avg_loss = epoch_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

def save_checkpoint(model, optimizer, epoch, file_path):
    """Serialise model weights, optimizer state and epoch number with ``torch.save``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        epoch: Value stored under ``'epoch'``.
        file_path: Destination handed to ``torch.save``.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
    }
    torch.save(checkpoint, file_path)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, checkpoint_path):
    """Train for ``num_epochs`` epochs, validating and checkpointing after each one.

    Two kinds of checkpoint files are written:
        * ``{checkpoint_path}_epoch_{epoch}.pth`` after every epoch.
        * ``{checkpoint_path}_best.pth`` whenever the validation accuracy
          strictly improves on the best value seen so far, so the best model
          can be reloaded later under a fixed name.

    Args:
        model: Network to train.
        train_loader: DataLoader for the training split.
        val_loader: DataLoader for the validation split.
        criterion: Loss function.
        optimizer: Optimizer updating the parameters of ``model``.
        num_epochs (int): Number of epochs; epochs are numbered from 1.
        device: Device the batches are moved to.
        checkpoint_path (str): Filename prefix for all checkpoint files.
    """
    best_val_accuracy = float("-inf")

    for epoch in range(1, num_epochs + 1):
        print(f"\nEpoch {epoch}/{num_epochs}")

        # Training phase
        train_loss, train_accuracy = train_one_epoch(model, train_loader, criterion, optimizer, device)
        print(f"Training Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")

        # Validation phase
        val_loss, val_accuracy = validate_one_epoch(model, val_loader, criterion, device)
        print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

        # Save the per-epoch checkpoint
        save_checkpoint(model, optimizer, epoch, f"{checkpoint_path}_epoch_{epoch}.pth")

        # Keep a copy of the best model so far, judged by validation accuracy.
        # A strict comparison keeps the earliest epoch on ties.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            save_checkpoint(model, optimizer, epoch, f"{checkpoint_path}_best.pth")

# Example Usage
if __name__ == "__main__":
    # `model` and the VoxCeleb dataloaders are the objects created in the
    # model and dataset cells above.
    # Pick the device here: it was previously used without being defined.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The model must live on the same device the batches are moved to.
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_model(model, voxceleb_dataloader_train, voxceleb_dataloader_val, criterion, optimizer, num_epochs=5, device=device, checkpoint_path="model_checkpoint")


### Eval


In [None]:
#eval.py
import torch
from sklearn.metrics import accuracy_score

def evaluate_model(model, test_loader, device):
    """
    Compute and print the classification accuracy of a model on a test set.

    Args:
        model: Trained MFA Conformer model.
        test_loader: Voxceleb DataLoader for the test dataset, yielding
            ``(inputs, labels)`` batches.
        device: Device to run the evaluation (CPU or GPU).

    Returns:
        float: Accuracy score on the test set.
    """
    # Evaluation mode, and no gradient bookkeeping during inference
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # Predicted class = index of the largest output along dim 1
            logits = model(batch_inputs)
            batch_preds = logits.max(dim=1).indices

            # Collect everything on the CPU for the final metric
            predicted += list(batch_preds.cpu().numpy())
            expected += list(batch_labels.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

# Example Usage
if __name__ == "__main__":

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The checkpoint file is a dictionary (see save_checkpoint), not a model
    # object. map_location lets a checkpoint saved on GPU load on a CPU-only
    # machine. The file "model_checkpoint_best.pth" must already exist.
    checkpoint = torch.load("model_checkpoint_best.pth", map_location=device)

    # Rebuild the architecture with the same hyperparameters used for
    # training, then restore the saved weights into it. Previously the raw
    # state_dict was used as if it were the model, which fails on `.to()`.
    model = ConformerMFA(
        num_blocks=4,
        embed_dim=256,
        num_heads=8,
        ff_dim=1024,
        num_classes=10,
        dropout=0.1
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    # Evaluate the model
    evaluate_model(model, voxceleb_dataloader_test, device)


# References and further reading

1.   https://arxiv.org/abs/2203.15249
2.   https://youtu.be/fdJxIqVBImU?list=PLp-0K3kfddPzbe1JqsQ7nmzZ38joYelj6
