In [3]:
# Cell 1 — Imports & Constants
import os, glob, numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm.notebook import tqdm
import pandas as pd
import subprocess
import mlflow
import mlflow.pytorch
import psutil
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)

# Paths
DEN_DIR = '/mnt/birdclef/processed/denoised'
EMB_DIR = '/mnt/birdclef/processed/embeddings'

# Spectrogram params (must match your preprocessing)
PANNS_SR     = 32000
N_FFT        = 2048
HOP_LENGTH   = 512
N_MELS       = 128
ALPHA        = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


meta = pd.read_csv('/mnt/birdclef/raw/train.csv')
label2idx = {lab:i for i, lab in enumerate(sorted(meta['primary_label'].unique()))}
num_classes = len(label2idx)

# MLFlow setup
mlflow.set_experiment("ResNet-50")
try: 
    mlflow.end_run() # end pre-existing run, if there was one
except:
    pass
finally:
    mlflow.start_run(log_system_metrics=True) # Start MLFlow run
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout for cmd in ["nvidia-smi", "rocm-smi"] 
     if subprocess.run(f"command -v {cmd}", shell=True, capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")


Using device: cuda


2025/05/01 00:46:59 INFO mlflow.tracking.fluent: Experiment with name 'ResNet-50' does not exist. Creating a new experiment.
2025/05/01 00:46:59 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/01 00:46:59 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2025/05/01 00:46:59 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


🏃 View run delicate-donkey-438 at: http://192.5.86.175:8000/#/experiments/1/runs/95ca2e7f8d6f45a4aee5ff49b8b084b5
🧪 View experiment at: http://192.5.86.175:8000/#/experiments/1


In [4]:
# Cell 2 — Helpers: MEL & Augmentation
import librosa
from sklearn.preprocessing import StandardScaler

def calculate_mel_spectrogram(wave_np, sr=PANNS_SR,
                              n_fft=N_FFT, hop_length=HOP_LENGTH,
                              n_mels=N_MELS):
    S = librosa.feature.melspectrogram(
        y=wave_np, sr=sr,
        n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels
    )
    return librosa.power_to_db(S, ref=np.max)

def create_augmented_mel(log_mel, emb, n_mels=N_MELS, alpha=ALPHA):
    dim, T = emb.shape[0], log_mel.shape[1]
    if dim == n_mels:
        proj = emb
    elif dim > n_mels:
        factor = dim // n_mels
        proj = emb[:n_mels*factor].reshape(n_mels, factor).mean(axis=1) \
               if dim % n_mels == 0 else emb[:n_mels]
    else:
        proj = np.pad(emb, (0, n_mels - dim))
    tiled = np.tile(proj[:, None], (1, T))
    normed = StandardScaler().fit_transform(tiled.T).T
    return log_mel + alpha * normed


In [5]:
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

def log_system_metrics_mlflow(step=None):
    # CPU Utilization (%)
    cpu_util = psutil.cpu_percent()
    mlflow.log_metric("system.cpu.utilization", cpu_util, step=step)

    # Memory Usage (bytes)
    mem = psutil.virtual_memory()
    mlflow.log_metric("system.memory.used", mem.used, step=step)
    mlflow.log_metric("system.memory.percent", mem.percent, step=step)

    # GPU Utilization (%)
    gpu_util = nvmlDeviceGetUtilizationRates(gpu_handle).gpu
    mlflow.log_metric("system.gpu.0.utilization", gpu_util, step=step)

    # GPU Memory (bytes)
    gpu_mem = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("system.gpu.0.memory.used", gpu_mem.used, step=step)
    mlflow.log_metric("system.gpu.0.memory.percent", (gpu_mem.used / gpu_mem.total) * 100, step=step)

    # GPU Temperature (°C)
    gpu_temp = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("system.gpu.0.temperature", gpu_temp, step=step)

In [6]:
# Cell 3 — Dataset for Augmented MEL as Image
class AugmentedMelImageDataset(Dataset):
    def __init__(self, den_dir, emb_dir, transform=None):
        self.den_paths = sorted(glob.glob(f"{den_dir}/**/*.npz", recursive=True))
        self.emb_dir   = emb_dir
        self.transform = transform

    def __len__(self):
        return len(self.den_paths)

    def __getitem__(self, idx):
        dpath = self.den_paths[idx]
        d = np.load(dpath)
        wave = d['waveform']           # [CHUNK_SAMPLES]
        lbl  = int(d['label'])

        # load embedding
        rel   = os.path.relpath(dpath, DEN_DIR)
        emb_p = os.path.join(self.emb_dir, rel.replace('.npz','_emb.npz'))
        emb   = np.load(emb_p)['embedding']

        # compute spectrograms
        mel = calculate_mel_spectrogram(wave)
        aug = create_augmented_mel(mel, emb)

        # to “image”
        img = aug.astype(np.float32)   # H x W
        img = np.expand_dims(img, 0)   # 1 x H x W

        if self.transform:
            img = self.transform(torch.from_numpy(img))

        return img, lbl


In [7]:
# Cell 4 — Transforms & DataLoaders
# Resize to 224×224 and duplicate channel→3
train_tf = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.Lambda(lambda x: x.repeat(3,1,1)),      # 1→3 channels
    transforms.Normalize([0.5]*3, [0.5]*3)
])
val_tf = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.Lambda(lambda x: x.repeat(3,1,1)),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

dataset = AugmentedMelImageDataset(DEN_DIR, EMB_DIR, transform=None)
# simple 80/20 split
n = len(dataset)
n_train = int(0.8*n)
train_ds, val_ds = torch.utils.data.random_split(dataset, [n_train, n-n_train])

train_ds.dataset.transform = train_tf
val_ds.dataset.transform   = val_tf

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,
                          num_workers=4, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False,
                          num_workers=4, pin_memory=True)
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")


Train batches: 281, Val batches: 71


In [8]:
# Cell 5 — Model, Loss, Optimizer
# ResNet-50 from scratch
model = models.resnet50(weights=None, num_classes=num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# count params
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable params:", total_params)


Total trainable params: 23930126


In [9]:
mlflow.log_params({
    "model": "ResNet50",
    "pretrained": False,
    "num_classes": num_classes,
    "batch_size": 32,
    "lr": 1e-3,
    "optimizer": "Adam",
    "loss": "CrossEntropy",
    "train_transforms": "Resize+Flip+Repeat+Norm",
    "val_transforms": "Resize+Repeat+Norm"
})

mlflow.log_param("total_params", total_params)

23930126

In [10]:
# Cell 6 — Train & Validate Loop (1 epochs example)
num_epochs = 1
best_acc = 0.0
for epoch in range(1, num_epochs+1):
    # Train
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for imgs, labels in tqdm(train_loader, desc=f"Epoch {epoch} Train"):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()*imgs.size(0)
        preds = outputs.argmax(1)
        correct += (preds==labels).sum().item()
        total += imgs.size(0)

    train_loss = running_loss/total
    train_acc  = correct/total

    # Validate
    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for imgs, labels in tqdm(val_loader, desc=f"Epoch {epoch} Val"):
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()*imgs.size(0)
            preds = outputs.argmax(1)
            val_correct += (preds==labels).sum().item()
            val_total   += imgs.size(0)

    val_loss /= val_total
    val_acc   = val_correct/val_total

    print(f"Epoch {epoch}: Train loss {train_loss:.4f}, acc {train_acc:.4f} | "
          f"Val loss {val_loss:.4f}, acc {val_acc:.4f}")

    mlflow.log_metrics({
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "val_loss": val_loss,
        "val_accuracy": val_acc,
    }, step=epoch)
    
    log_system_metrics_mlflow(step=epoch)

    # Save best
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), 'best_resnet50.pth')
        print(f"➡️  New best saved (acc {best_acc:.4f})")
        mlflow.log_artifact('best_resnet50.pth', artifact_path="model")

print("Finished. Best Val Acc:", best_acc)


Epoch 1 Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 1 Val:   0%|          | 0/71 [00:00<?, ?it/s]

Epoch 1: Train loss 4.7883, acc 0.0516 | Val loss 8.0711, acc 0.0151
➡️  New best saved (acc 0.0151)
Finished. Best Val Acc: 0.015124555160142349
