In [9]:
# Cell 1 — Imports & Constants
import os, glob, numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm.notebook import tqdm
import pandas as pd
import subprocess
import mlflow
import mlflow.pytorch
import psutil
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)
import librosa
from sklearn.preprocessing import StandardScaler

BASE_DIR        = '/home/jovyan/Features'
EMB_DIR         = os.path.join(BASE_DIR, 'embeddings')
MEL_DIR         = os.path.join(BASE_DIR, 'mel')
TRAIN_MANIF       = os.path.join(BASE_DIR, 'manifest_train.csv')
TEST_MANIF      = os.path.join(BASE_DIR, 'manifest_test.csv')

TAXONOMY_CSV       = '/home/jovyan/Features/taxonomy.csv'

# Spectrogram params (must match your preprocessing)
PANNS_SR     = 32000
N_FFT        = 2048
HOP_LENGTH   = 512
N_MELS       = 128
ALPHA        = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


tax_df      = pd.read_csv(TAXONOMY_CSV)
labels_all  = sorted(tax_df['primary_label'].unique())
label2idx   = {lab: i for i, lab in enumerate(labels_all)}
num_classes = len(labels_all)
# MLFlow setup
mlflow.set_experiment("ResNet-50")
try: 
    mlflow.end_run() # end pre-existing run, if there was one
except:
    pass
finally:
    mlflow.start_run(log_system_metrics=True) # Start MLFlow run
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout for cmd in ["nvidia-smi", "rocm-smi"] 
     if subprocess.run(f"command -v {cmd}", shell=True, capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")


2025/05/04 17:42:53 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/04 17:42:53 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2025/05/04 17:42:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Using device: cuda
🏃 View run vaunted-gnat-451 at: http://192.5.87.49:8000/#/experiments/2/runs/23e44d9b033d41b8962b1d3bd65e4cef
🧪 View experiment at: http://192.5.87.49:8000/#/experiments/2


In [10]:
# Cell 2 — Helpers: MEL & Augmentation
def create_augmented_mel(log_mel, emb, alpha=ALPHA):
    """
    log_mel: np.array shape (n_mels, T)
    emb:     np.array shape (embed_dim,)
    """
    n_mels, T    = log_mel.shape
    embed_dim    = emb.shape[0]

    if embed_dim == n_mels:
        proj = emb
    elif embed_dim > n_mels:
        factor = embed_dim // n_mels
        if embed_dim % n_mels == 0:
            proj = emb.reshape(n_mels, factor).mean(axis=1)
        else:
            proj = emb[:n_mels]
    else:
        proj = np.pad(emb, (0, n_mels - embed_dim))

    tiled = np.tile(proj[:, None], (1, T))
    normed = StandardScaler().fit_transform(tiled.T).T
    return log_mel + alpha * normed

In [11]:
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

def log_system_metrics_mlflow(step=None):
    # CPU Utilization (%)
    cpu_util = psutil.cpu_percent()
    mlflow.log_metric("system.cpu.utilization", cpu_util, step=step)

    # Memory Usage (bytes)
    mem = psutil.virtual_memory()
    mlflow.log_metric("system.memory.used", mem.used, step=step)
    mlflow.log_metric("system.memory.percent", mem.percent, step=step)

    # GPU Utilization (%)
    gpu_util = nvmlDeviceGetUtilizationRates(gpu_handle).gpu
    mlflow.log_metric("system.gpu.0.utilization", gpu_util, step=step)

    # GPU Memory (bytes)
    gpu_mem = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("system.gpu.0.memory.used", gpu_mem.used, step=step)
    mlflow.log_metric("system.gpu.0.memory.percent", (gpu_mem.used / gpu_mem.total) * 100, step=step)

    # GPU Temperature (°C)
    gpu_temp = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("system.gpu.0.temperature", gpu_temp, step=step)

In [12]:
class AugmentedMelImageDataset(Dataset):
    def __init__(self, manifest_fp, mel_dir, emb_dir, label2idx, transform=None):
        self.df          = pd.read_csv(manifest_fp)
        self.mel_dir     = mel_dir
        self.emb_dir     = emb_dir
        self.label2idx   = label2idx
        self.transform   = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row    = self.df.iloc[idx]
        lbl    = self.label2idx[row['primary_label']]

        # load precomputed mel
        mel_rel  = row['mel_path'].lstrip('/')
        mel_full = os.path.join(self.mel_dir, mel_rel)
        log_mel  = np.load(mel_full)['mel']

        # load corresponding embedding
        emb_rel  = row['emb_path'].lstrip('/')
        emb_full = os.path.join(self.emb_dir, emb_rel)
        emb      = np.load(emb_full)['embedding']

        # compute augmented mel
        aug      = create_augmented_mel(log_mel, emb, alpha=ALPHA)

        # to “image” tensor: (1, H, W)
        img      = aug.astype(np.float32)[None, ...]
        img_t    = torch.from_numpy(img)

        if self.transform:
            img_t = self.transform(img_t)

        return img_t, lbl

In [13]:
train_tf = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.Lambda(lambda x: x.repeat(3,1,1)),
    transforms.Normalize([0.5]*3, [0.5]*3)
])
val_tf = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.Lambda(lambda x: x.repeat(3,1,1)),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# datasets
train_ds = AugmentedMelImageDataset(
    TRAIN_MANIF, 
    MEL_DIR, EMB_DIR, 
    label2idx, 
    transform=train_tf
)
val_ds = AugmentedMelImageDataset(
    TEST_MANIF, 
    MEL_DIR, EMB_DIR, 
    label2idx, 
    transform=val_tf
)

# dataloaders
train_loader = DataLoader(
    train_ds, 
    batch_size=64, 
    shuffle=True,
    num_workers=16, 
    pin_memory=True
)
val_loader = DataLoader(
    val_ds, 
    batch_size=64, 
    shuffle=False,
    num_workers=16, 
    pin_memory=True
)

print(f"Train Dataset: {len(train_ds)}, Val batches: {len(val_ds)}")

Train Dataset: 69676, Val batches: 11474


In [14]:
# Cell 5 — Model, Loss, Optimizer
# ResNet-50 from scratch
model = models.resnet50(weights=None, num_classes=num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# count params
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable params:", total_params)


Total trainable params: 23930126


In [15]:
mlflow.log_params({
    "model": "ResNet50",
    "pretrained": False,
    "num_classes": num_classes,
    "batch_size": 32,
    "lr": 1e-3,
    "optimizer": "Adam",
    "loss": "CrossEntropy",
    "train_transforms": "Resize+Flip+Repeat+Norm",
    "val_transforms": "Resize+Repeat+Norm"
})

mlflow.log_param("total_params", total_params)

23930126

In [16]:
# Cell 6 — Train & Validate Loop (1 epochs example)
num_epochs = 20
best_acc = 0.0
for epoch in range(1, num_epochs+1):
    # Train
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for imgs, labels in tqdm(train_loader, desc=f"Epoch {epoch} Train"):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()*imgs.size(0)
        preds = outputs.argmax(1)
        correct += (preds==labels).sum().item()
        total += imgs.size(0)

    train_loss = running_loss/total
    train_acc  = correct/total

    # Validate
    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for imgs, labels in tqdm(val_loader, desc=f"Epoch {epoch} Val"):
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()*imgs.size(0)
            preds = outputs.argmax(1)
            val_correct += (preds==labels).sum().item()
            val_total   += imgs.size(0)

    val_loss /= val_total
    val_acc   = val_correct/val_total

    print(f"Epoch {epoch}: Train loss {train_loss:.4f}, acc {train_acc:.4f} | "
          f"Val loss {val_loss:.4f}, acc {val_acc:.4f}")

    mlflow.log_metrics({
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "val_loss": val_loss,
        "val_accuracy": val_acc,
    }, step=epoch)
    
    log_system_metrics_mlflow(step=epoch)

    # Save best
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), 'best_resnet50.pth')
        print(f"➡️  New best saved (acc {best_acc:.4f})")
        mlflow.log_artifact('best_resnet50.pth', artifact_path="model")

print("Finished. Best Val Acc:", best_acc)


Epoch 1 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 1 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 1: Train loss 3.6962, acc 0.2092 | Val loss 3.1951, acc 0.3072
➡️  New best saved (acc 0.3072)


Epoch 2 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 2 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 2: Train loss 2.6251, acc 0.4086 | Val loss 2.8503, acc 0.3967
➡️  New best saved (acc 0.3967)


Epoch 3 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 3 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 3: Train loss 2.0989, acc 0.5203 | Val loss 2.5245, acc 0.4626
➡️  New best saved (acc 0.4626)


Epoch 4 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 4 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 4: Train loss 1.7560, acc 0.5915 | Val loss 2.4451, acc 0.4811
➡️  New best saved (acc 0.4811)


Epoch 5 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 5 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 5: Train loss 1.4913, acc 0.6474 | Val loss 2.2481, acc 0.5329
➡️  New best saved (acc 0.5329)


Epoch 6 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 6 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 6: Train loss 1.2892, acc 0.6901 | Val loss 2.2232, acc 0.5641
➡️  New best saved (acc 0.5641)


Epoch 7 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 7 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 7: Train loss 1.1148, acc 0.7301 | Val loss 2.4525, acc 0.5340


Epoch 8 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 8 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 8: Train loss 0.9594, acc 0.7644 | Val loss 2.2982, acc 0.5588


Epoch 9 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 9 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 9: Train loss 0.8244, acc 0.7920 | Val loss 2.3251, acc 0.5773
➡️  New best saved (acc 0.5773)


Epoch 10 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 10 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 10: Train loss 0.7054, acc 0.8206 | Val loss 2.3700, acc 0.5902
➡️  New best saved (acc 0.5902)


Epoch 11 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 11 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 11: Train loss 0.6055, acc 0.8454 | Val loss 2.4583, acc 0.5815


Epoch 12 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 12 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 12: Train loss 0.5129, acc 0.8668 | Val loss 2.5247, acc 0.5885


Epoch 13 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 13 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 13: Train loss 0.4387, acc 0.8838 | Val loss 3.0385, acc 0.5548


Epoch 14 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 14 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 14: Train loss 0.3809, acc 0.8994 | Val loss 2.9900, acc 0.5745


Epoch 15 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 15 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 15: Train loss 0.3349, acc 0.9102 | Val loss 2.9075, acc 0.5878


Epoch 16 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 16 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 16: Train loss 0.2929, acc 0.9220 | Val loss 2.9725, acc 0.5769


Epoch 17 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 17 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 17: Train loss 0.2597, acc 0.9301 | Val loss 3.1168, acc 0.5907
➡️  New best saved (acc 0.5907)


Epoch 18 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 18 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 18: Train loss 0.2292, acc 0.9371 | Val loss 3.3096, acc 0.5816


Epoch 19 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 19 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 19: Train loss 0.2019, acc 0.9445 | Val loss 3.1926, acc 0.5926
➡️  New best saved (acc 0.5926)


Epoch 20 Train:   0%|          | 0/1089 [00:00<?, ?it/s]

Epoch 20 Val:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 20: Train loss 0.1796, acc 0.9505 | Val loss 3.4543, acc 0.5675
Finished. Best Val Acc: 0.5925570855848005
