# UrbanSound8K Audio Classifier (Sentinel)

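This notebook trains a five-class audio classifier (Siren, Jackhammer, Gunshot, Street Music, Background) on log-mel spectrograms computed from the UrbanSound8K dataset.
It fine-tunes an ImageNet-pretrained ResNet18 CNN and evaluates it with 10-fold cross-validation over the dataset's predefined folds.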

**Requirements**:
- Google Drive must contain the `UrbanSound8K` folder at `My Drive/UrbanSound8K`.
- GPU Runtime is recommended.

In [None]:
# Mount Google Drive
# Colab-only step: exposes Drive under /content/drive, which is the prefix
# the dataset paths in the configuration cell are built from.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as T
from torchvision.models import resnet18
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import librosa
import librosa.display
from sklearn.utils.class_weight import compute_class_weight

# Print the installed PyTorch / TorchAudio versions for this runtime.
print("PyTorch Version:", torch.__version__)
print("TorchAudio Version:", torchaudio.__version__)

In [None]:
# Configuration & Constants

# Dataset locations (Google Drive must already be mounted).
DATA_ROOT = '/content/drive/My Drive/UrbanSound8K'
AUDIO_DIR = os.path.join(DATA_ROOT, 'audio')
METADATA_PATH = os.path.join(DATA_ROOT, 'metadata', 'UrbanSound8K.csv')

# Every clip is resampled to SAMPLE_RATE and padded/truncated to DURATION.
SAMPLE_RATE = 22050  # Hz
DURATION = 4  # seconds
TARGET_SAMPLE_COUNT = SAMPLE_RATE * DURATION

# Mel-spectrogram parameters.
N_MELS = 128
HOP_LENGTH = 512
N_FFT = 2048

# Training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 0.001
FOLDS = 10

# Class Mapping (original UrbanSound8K classID in parentheses)
# 0: Siren (8)
# 1: Jackhammer (7)
# 2: Gunshot (6)
# 3: Street Music (9)
# 4: Background (0, 1, 2, 3, 4, 5)
CLASS_MAP = {
    8: 0, 7: 1, 6: 2, 9: 3,
    0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4,
}
CLASS_NAMES = ['Siren', 'Jackhammer', 'Gunshot', 'Street Music', 'Background']

# Derived rather than hard-coded so the model head, the plots and the label
# names cannot drift apart when the mapping is edited.
NUM_CLASSES = len(CLASS_NAMES)

if set(CLASS_MAP.values()) != set(range(NUM_CLASSES)):
    raise ValueError(
        "CLASS_MAP targets must cover exactly the indices of CLASS_NAMES"
    )

In [None]:
# Helper to apply mapping
def map_classes(df, mapping):
    """Return a copy of *df* with a ``new_label`` column derived from ``classID``.

    Args:
        df: DataFrame containing a ``classID`` column.
        mapping: Dict from original class ID to the remapped label.

    Returns:
        A new DataFrame with the extra ``new_label`` column; the frame passed
        in is left unmodified.

    Raises:
        ValueError: If any ``classID`` value has no entry in *mapping*.
    """
    new_labels = df['classID'].map(mapping)

    # Series.map yields NaN for keys missing from the dict, which would turn
    # the label column into floats; fail here with a clear message instead.
    unmapped = new_labels.isna()
    if unmapped.any():
        missing = df.loc[unmapped, 'classID'].unique().tolist()
        raise ValueError(f"No mapping for classID value(s): {missing}")

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(new_label=new_labels)

In [None]:
# Data Exploration: Load Metadata & Visualize Distribution
if os.path.exists(METADATA_PATH):
    metadata = pd.read_csv(METADATA_PATH)
    print("Original Metadata Info:")
    print(metadata.head())

    # Apply Mapping
    metadata = map_classes(metadata, CLASS_MAP)
    print("\nMetadata after mapping:")
    print(metadata[['slice_file_name', 'fold', 'classID', 'class', 'new_label']].head())

    # Visualize Class Distribution
    # Reindex over every label id so a class with no rows still gets a
    # zero-height bar and the tick labels stay aligned with CLASS_NAMES.
    label_ids = list(range(len(CLASS_NAMES)))
    label_counts = metadata['new_label'].value_counts().reindex(label_ids, fill_value=0)

    plt.figure(figsize=(10, 5))
    label_counts.plot(kind='bar')
    plt.title('Class Distribution (Mapped)')
    plt.xticks(ticks=label_ids, labels=CLASS_NAMES, rotation=45)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.show()
else:
    print("WARNING: Metadata file not found. Ensure Drive is mounted.")

In [None]:
# Dataset Class
class UrbanSoundDataset(Dataset):
    """Turn rows of the (label-mapped) metadata into ``(spectrogram, label)`` pairs.

    Each item is read from ``<audio_dir>/fold<fold>/<slice_file_name>``,
    resampled to ``target_sample_rate`` if needed, mixed down to one channel,
    padded/truncated to ``num_samples`` and passed through ``transformation``.

    Args:
        df: DataFrame with ``fold``, ``slice_file_name`` and ``new_label`` columns.
        audio_dir: Directory containing the ``fold<N>`` sub-folders.
        transformation: Callable applied to the fixed-length waveform.
        target_sample_rate: Sample rate every clip is converted to.
        num_samples: Number of samples every clip is padded/truncated to.
    """

    def __init__(self, df, audio_dir, transformation, target_sample_rate, num_samples):
        self.df = df
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # A resampler depends only on the source rate, so build one per rate
        # instead of one per item.
        self._resamplers = {}

    def __len__(self):
        return len(self.df)

    def _resample(self, signal, sr):
        """Return *signal* converted from rate *sr* to the target rate."""
        if sr == self.target_sample_rate:
            return signal
        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = T.Resample(sr, self.target_sample_rate)
            self._resamplers[sr] = resampler
        return resampler(signal)

    def _fit_length(self, signal):
        """Truncate or right-pad *signal* with zeros to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal > self.num_samples:
            return signal[:, :self.num_samples]
        if length_signal < self.num_samples:
            num_missing = self.num_samples - length_signal
            return torch.nn.functional.pad(signal, (0, num_missing))
        return signal

    def _placeholder(self):
        """Return an all-zero tensor shaped like a real transformed clip.

        The shape is taken from the transformation itself (applied to a
        silent clip) so it stays correct whatever spectrogram parameters the
        transformation was built with.
        """
        silent_clip = torch.zeros(1, self.num_samples)
        return torch.zeros_like(self.transformation(silent_clip))

    def __getitem__(self, index):
        """Return ``(spectrogram, label)`` for the row at position *index*.

        If loading or processing the clip fails, the error is printed and an
        all-zero placeholder is returned with the row's label, so one bad
        file does not abort a whole epoch.
        """
        row = self.df.iloc[index]
        # Construct path: audio/foldX/filename
        audio_path = os.path.join(self.audio_dir, f"fold{row['fold']}", row['slice_file_name'])
        # Use the mapped label; plain int so it is usable as a class index.
        label = int(row['new_label'])

        try:
            signal, sr = torchaudio.load(audio_path)
            signal = self._resample(signal, sr)

            # Mix down to mono if the clip has more than one channel
            if signal.shape[0] > 1:
                signal = torch.mean(signal, dim=0, keepdim=True)

            signal = self._fit_length(signal)
            return self.transformation(signal), label

        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            return self._placeholder(), label

In [None]:
# Audio Transforms
# Mel-spectrogram front end; the sample rate, FFT size, hop length and number
# of mel bands all come from the configuration constants.
mel_spectrogram = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS
)

class LogMelSpectrogram(nn.Module):
    """Chain a mel-spectrogram transform with an amplitude-to-dB conversion.

    Args:
        mel_spectrogram: Callable/module applied first to the input.
    """

    def __init__(self, mel_spectrogram):
        super().__init__()
        self.mel_spectrogram = mel_spectrogram
        self.amplitude_to_db = T.AmplitudeToDB()

    def forward(self, x):
        """Return ``amplitude_to_db(mel_spectrogram(x))``."""
        return self.amplitude_to_db(self.mel_spectrogram(x))

# Single waveform -> log-mel transform instance, passed to every dataset
# created in the visualization and training cells.
audio_transform = LogMelSpectrogram(mel_spectrogram)

In [None]:
# Visualization: Check Spectrograms
if os.path.exists(METADATA_PATH):
    # Build a small dataset holding the first sample of each mapped class.
    print("Loading samples for visualization...")
    # groupby().head(1) keeps every column (including 'new_label') with its
    # original dtype; groupby().apply() may omit the grouping column on newer
    # pandas versions and returns object-typed columns.
    vis_df = (
        metadata.groupby('new_label')
        .head(1)
        .sort_values('new_label')
        .reset_index(drop=True)
    )
    vis_ds = UrbanSoundDataset(vis_df, AUDIO_DIR, audio_transform, SAMPLE_RATE, TARGET_SAMPLE_COUNT)

    # Size the grid from the number of samples instead of assuming 2x3.
    n_cols = 3
    n_rows = max(1, -(-len(vis_ds) // n_cols))  # ceiling division

    plt.figure(figsize=(15, 10))

    spec = None
    for i in range(len(vis_ds)):
        spec, label = vis_ds[i]

        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(spec.squeeze().numpy(), aspect='auto', origin='lower')
        plt.title(f"Class: {CLASS_NAMES[label]}")
        plt.axis('off')
    plt.show()

    # Guard against an empty dataset, where no spectrogram was produced.
    if spec is not None:
        print("Spectrogram shape:", spec.shape)

In [None]:
# Model
class AudioClassifier(nn.Module):
    """ImageNet-pretrained ResNet18 adapted to single-channel spectrograms.

    The first convolution is replaced by a 1-input-channel layer and the final
    fully connected layer by one with ``num_classes`` outputs.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        try:
            # `weights=` supersedes the deprecated `pretrained=` flag;
            # IMAGENET1K_V1 is the checkpoint `pretrained=True` refers to.
            self.model = resnet18(weights="IMAGENET1K_V1")
        except TypeError:
            # Older torchvision releases that do not know the `weights` keyword.
            self.model = resnet18(pretrained=True)

        rgb_conv = self.model.conv1
        mono_conv = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Keep the pretrained first-layer filters instead of discarding them:
        # summing over the RGB axis gives the response the original layer
        # would have to the mono input replicated on all three channels.
        with torch.no_grad():
            mono_conv.weight.copy_(rgb_conv.weight.sum(dim=1, keepdim=True))
        self.model.conv1 = mono_conv

        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet for *x*."""
        return self.model(x)

In [None]:
# Training Functions
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train *model* for one pass over *dataloader*.

    Returns:
        Tuple ``(loss, accuracy)`` where ``loss`` is the sum of per-batch
        losses divided by the number of batches and ``accuracy`` is the
        fraction of correctly predicted samples.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    progress = tqdm(dataloader, desc='Training', leave=False)
    for batch_inputs, batch_labels in progress:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        batch_pred = logits.max(1)[1]
        n_seen += batch_labels.size(0)
        n_correct += batch_pred.eq(batch_labels).sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def validate_epoch(model, dataloader, criterion, device):
    """Evaluate *model* on *dataloader* without gradient tracking.

    Returns:
        Tuple ``(loss, accuracy, macro_f1)`` where ``loss`` is the sum of
        per-batch losses divided by the number of batches.
    """
    model.eval()
    loss_sum = 0.0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_labels).item()

            batch_pred = logits.max(1)[1]
            predictions.extend(batch_pred.cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

    mean_loss = loss_sum / len(dataloader)
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions, average='macro')
    return mean_loss, accuracy, f1

In [None]:
# Main Loop
# Cross-validation over the dataset's predefined folds: for each fold, train a
# fresh model on the other folds and evaluate on the held-out one.
if not os.path.exists(METADATA_PATH):
    print(f"ERROR: Metadata file not found at {METADATA_PATH}")
else:
    # Reload the metadata here so this cell also works if the exploration
    # cell was skipped.
    metadata = pd.read_csv(METADATA_PATH)
    metadata = map_classes(metadata, CLASS_MAP)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    fold_accuracies = []
    fold_f1s = []

    for fold in range(1, FOLDS + 1):
        print(f"\n{'='*20} Fold {fold}/{FOLDS} {'='*20}")

        train_df = metadata[metadata['fold'] != fold]
        val_df = metadata[metadata['fold'] == fold]

        # Class weights computed from the training folds only, to counter the
        # imbalance between the mapped classes.
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_df['new_label']), y=train_df['new_label'])
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

        train_ds = UrbanSoundDataset(train_df, AUDIO_DIR, audio_transform, SAMPLE_RATE, TARGET_SAMPLE_COUNT)
        val_ds = UrbanSoundDataset(val_df, AUDIO_DIR, audio_transform, SAMPLE_RATE, TARGET_SAMPLE_COUNT)

        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

        # Fresh model and optimizer per fold so no state carries over.
        model = AudioClassifier(NUM_CLASSES).to(device)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        # Weighted Cross Entropy
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        best_val_acc = 0.0
        best_val_f1 = 0.0

        for epoch in range(EPOCHS):
            train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
            val_loss, val_acc, val_f1 = validate_epoch(model, val_loader, criterion, device)

            print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} F1: {val_f1:.4f}")

            # NOTE(review): the best epoch is picked using the held-out fold
            # itself, so the averages reported below are optimistic estimates.
            # The F1 recorded is the one from the best-accuracy epoch.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_val_f1 = val_f1

        fold_accuracies.append(best_val_acc)
        fold_f1s.append(best_val_f1)
        print(f"--> Best Validation Accuracy for Fold {fold}: {best_val_acc:.4f}")

    print("\n" + "*"*30)
    # Banner follows FOLDS instead of a hard-coded fold count.
    print(f"FINAL RESULTS ACROSS {FOLDS} FOLDS")
    print("*"*30)
    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f}")
    print(f"Average F1 Score: {np.mean(fold_f1s):.4f}")