In [1]:
import os
import subprocess
import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
from tqdm import tqdm
import mlflow
import mlflow.pytorch
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)
from sklearn.metrics import f1_score, average_precision_score, precision_recall_curve
from torch.amp import autocast, GradScaler


In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", DEVICE)

mlflow.set_experiment("ResNet50_MelAug")
if mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(log_system_metrics=True)

# log GPU/CPU info once
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout
        for cmd in ["nvidia-smi","rocm-smi"]
        if subprocess.run(f"command -v {cmd}", shell=True, capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")

nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)
def log_sys(step=None):
    mlflow.log_metric("system.cpu.utilization", psutil.cpu_percent(), step=step)
    m = psutil.virtual_memory()
    mlflow.log_metric("system.memory.used", m.used, step=step)
    mlflow.log_metric("system.memory.percent", m.percent, step=step)
    u = nvmlDeviceGetUtilizationRates(gpu_handle)
    mlflow.log_metric("system.gpu.utilization", u.gpu, step=step)
    gm = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("system.gpu.mem.used", gm.used, step=step)
    mlflow.log_metric("system.gpu.mem.percent", (gm.used/gm.total)*100, step=step)
    t = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("system.gpu.temperature", t, step=step)

2025/05/07 16:26:22 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Using device: cuda


In [3]:
BATCH_SIZE     = 64
LR             = 1e-4
WEIGHT_DECAY   = 1e-4
EPOCHS         = 20
SAVE_EPOCH_CK  = False
BEST_CKPT      = "best_resnet50.pt"

TAXONOMY_CSV   = "/home/jovyan/Data/birdclef-2025/taxonomy.csv"
TRAIN_MANIFEST = "/home/jovyan/Features/manifest_train.csv"
TEST_MANIFEST  = "/home/jovyan/Features/manifest_test.csv"
TRAIN_CSV      = "/home/jovyan/Data/birdclef-2025/train.csv"
FEATURE_BASE   = "/home/jovyan/Features"

tax_df     = pd.read_csv(TAXONOMY_CSV)
CLASSES    = sorted(tax_df["primary_label"].astype(str).tolist())
NUM_CLASSES= len(CLASSES)

mlflow.log_params({
    "model":         "resnet50_scratch",
    "input":         "mel_aug",
    "num_classes":   NUM_CLASSES,
    "batch_size":    BATCH_SIZE,
    "lr":            LR,
    "weight_decay":  WEIGHT_DECAY,
    "epochs":        EPOCHS,
    "save_epoch_ck": SAVE_EPOCH_CK
})

In [4]:
class MelAugDataset(Dataset):
    def __init__(self, manifest_csv, meta_csv, feature_base, classes, key="mel"):
        m_df = pd.read_csv(manifest_csv)
        m_df["mel_path"] = (
            m_df["mel_aug_path"].astype(str)
                .str.lstrip(os.sep)
                .apply(lambda p: os.path.join(feature_base, "mel_aug", p))
        )
        meta = pd.read_csv(meta_csv, usecols=["filename","secondary_labels"])
        meta["rid"]     = meta.filename.str.replace(r"\.ogg$","",regex=True)
        meta["sec_list"]= meta.secondary_labels.fillna("").str.split()
        sec_map = dict(zip(meta.rid, meta.sec_list))

        self.rows = []
        self.idx_map   = {c:i for i,c in enumerate(classes)}
        self.num_cls   = len(classes)
        self.key       = key

        for _, row in tqdm(m_df.iterrows(), total=len(m_df), desc="Building dataset"):
            rid  = row.chunk_id.split("_chk")[0]
            labs = [row.primary_label] + sec_map.get(rid, [])
            labs = [l for l in labs if l in self.idx_map]
            prim_idx = self.idx_map[row.primary_label]
            self.rows.append((row.mel_path, labs, prim_idx))

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        path, labs, prim_idx = self.rows[i]
        npz  = np.load(path)
        arr  = npz[self.key]                   # [n_mels, n_frames]
        x    = torch.from_numpy(arr).unsqueeze(0).float()  # [1,n_mels,n_frames]
        y    = torch.zeros(self.num_cls, dtype=torch.float32)
        for l in labs:
            y[self.idx_map[l]] = 1.0
        return x, y, prim_idx

In [5]:
train_ds = MelAugDataset(TRAIN_MANIFEST, TRAIN_CSV, FEATURE_BASE, CLASSES)
test_ds  = MelAugDataset(TEST_MANIFEST,  TRAIN_CSV, FEATURE_BASE, CLASSES)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE,
                          shuffle=True,  num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=4, pin_memory=True)

Building dataset: 100%|██████████| 108451/108451 [00:05<00:00, 21636.70it/s]
Building dataset: 100%|██████████| 11022/11022 [00:00<00:00, 22590.69it/s]


In [6]:
def get_resnet50_multilabel(num_classes):
    m = resnet50(weights=None)
    # adapt 1‐channel
    m.conv1 = nn.Conv2d(1, m.conv1.out_channels,
                        kernel_size=m.conv1.kernel_size,
                        stride=m.conv1.stride,
                        padding=m.conv1.padding,
                        bias=False)
    m.fc    = nn.Linear(m.fc.in_features, num_classes)
    return m

In [7]:
model     = get_resnet50_multilabel(NUM_CLASSES).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LR,
    steps_per_epoch=len(train_loader),
    epochs=EPOCHS,
    pct_start=0.1,
    div_factor=10
)
scaler = GradScaler()

In [8]:
best_f1, best_ap, best_acc = 0.0, 0.0, 0.0
thresholds = np.full(NUM_CLASSES, 0.5, dtype=np.float32)

for epoch in range(1, EPOCHS+1):
    # — Train —
    model.train()
    run_loss, total = 0.0, 0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS} Train", unit="batch")
    for xb, yb, _ in train_bar:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        with autocast(device_type="cuda"):
            logits = model(xb)
            loss   = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        bs = xb.size(0)
        run_loss += loss.item()*bs
        total    += bs
        train_bar.set_postfix({"loss": f"{run_loss/total:.4f}"})
    train_loss = run_loss/total

    # — Eval —
    model.eval()
    all_scores, all_tgts, all_prims = [], [], []
    val_loss, total = 0.0, 0
    with torch.no_grad():
        for xb, yb, prim_idx in tqdm(test_loader, desc=f"Epoch {epoch}/{EPOCHS} Eval", unit="batch"):
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            with autocast(device_type="cuda"):
                logits = model(xb)
                val_loss += criterion(logits, yb).item()*xb.size(0)
                scores   = torch.sigmoid(logits).cpu().numpy()
            all_scores.append(scores)
            all_tgts.append(yb.cpu().numpy())
            all_prims.extend(prim_idx.tolist())
            total += xb.size(0)

    val_loss /= total
    scores = np.vstack(all_scores)
    tgts   = np.vstack(all_tgts)
    prims  = np.array(all_prims, dtype=int)

    # fast threshold calibration
    for i in range(NUM_CLASSES):
        y_true = tgts[:, i]
        if 0 < y_true.sum() < len(y_true):
            prec, rec, th = precision_recall_curve(y_true, scores[:, i])
            f1_vals = 2*prec*rec/(prec+rec+1e-8)
            best    = np.nanargmax(f1_vals[:-1])
            thresholds[i] = th[best]

    preds      = (scores >= thresholds).astype(int)
    micro_f1   = f1_score(tgts, preds, average="micro", zero_division=0)
    micro_ap   = average_precision_score(tgts, scores, average="micro")
    top1       = scores.argmax(axis=1)
    primary_acc= (top1 == prims).mean()

    # checkpoint best
    if micro_f1 > best_f1:
        best_f1, best_ap, best_acc = micro_f1, micro_ap, primary_acc
        torch.save(model.state_dict(), BEST_CKPT)
        mlflow.log_artifact(BEST_CKPT, artifact_path="model")

    # log metrics
    mlflow.log_metrics({
        "train_loss":     train_loss,
        "val_loss":       val_loss,
        "micro_f1":       micro_f1,
        "micro_ap":       micro_ap,
        "primary_acc":    primary_acc
    }, step=epoch)
    log_sys(step=epoch)

    print(f"→ Epoch {epoch}/{EPOCHS}  "
          f"F1={micro_f1:.4f}  AP={micro_ap:.4f}  PrimAcc={primary_acc:.4f}")


Epoch 1/20 Train: 100%|██████████| 1695/1695 [01:30<00:00, 18.82batch/s, loss=0.0504]
Epoch 1/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.91batch/s]


→ Epoch 1/20  F1=0.0535  AP=0.1047  PrimAcc=0.1511


Epoch 2/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.65batch/s, loss=0.0184]
Epoch 2/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.69batch/s]


→ Epoch 2/20  F1=0.2603  AP=0.3726  PrimAcc=0.3760


Epoch 3/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.69batch/s, loss=0.0128]
Epoch 3/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.44batch/s]


→ Epoch 3/20  F1=0.3069  AP=0.4549  PrimAcc=0.4494


Epoch 4/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.56batch/s, loss=0.0100]
Epoch 4/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.33batch/s]


→ Epoch 4/20  F1=0.4174  AP=0.5306  PrimAcc=0.5170


Epoch 5/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.54batch/s, loss=0.0081]
Epoch 5/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.67batch/s]


→ Epoch 5/20  F1=0.3718  AP=0.5717  PrimAcc=0.5425


Epoch 6/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.67batch/s, loss=0.0066]
Epoch 6/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.34batch/s]


→ Epoch 6/20  F1=0.4272  AP=0.5839  PrimAcc=0.5644


Epoch 7/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.66batch/s, loss=0.0053]
Epoch 7/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.89batch/s]


→ Epoch 7/20  F1=0.3207  AP=0.5996  PrimAcc=0.5727


Epoch 8/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.50batch/s, loss=0.0041]
Epoch 8/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.27batch/s]


→ Epoch 8/20  F1=0.4756  AP=0.5994  PrimAcc=0.5806


Epoch 9/20 Train: 100%|██████████| 1695/1695 [01:32<00:00, 18.24batch/s, loss=0.0030]
Epoch 9/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.17batch/s]


→ Epoch 9/20  F1=0.4330  AP=0.6027  PrimAcc=0.5808


Epoch 10/20 Train: 100%|██████████| 1695/1695 [01:28<00:00, 19.07batch/s, loss=0.0021]
Epoch 10/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.11batch/s]


→ Epoch 10/20  F1=0.3611  AP=0.5995  PrimAcc=0.5837


Epoch 11/20 Train: 100%|██████████| 1695/1695 [01:27<00:00, 19.48batch/s, loss=0.0015]
Epoch 11/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.44batch/s]


→ Epoch 11/20  F1=0.3504  AP=0.6054  PrimAcc=0.5905


Epoch 12/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.61batch/s, loss=0.0011]
Epoch 12/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.13batch/s]


→ Epoch 12/20  F1=0.3531  AP=0.5991  PrimAcc=0.5867


Epoch 13/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.61batch/s, loss=0.0009]
Epoch 13/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.91batch/s]


→ Epoch 13/20  F1=0.3708  AP=0.6114  PrimAcc=0.5957


Epoch 14/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.57batch/s, loss=0.0007]
Epoch 14/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.59batch/s]


→ Epoch 14/20  F1=0.3766  AP=0.6080  PrimAcc=0.5940


Epoch 15/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.57batch/s, loss=0.0006]
Epoch 15/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.30batch/s]


→ Epoch 15/20  F1=0.3972  AP=0.6192  PrimAcc=0.6027


Epoch 16/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.63batch/s, loss=0.0006]
Epoch 16/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.47batch/s]


→ Epoch 16/20  F1=0.3958  AP=0.6162  PrimAcc=0.5998


Epoch 17/20 Train: 100%|██████████| 1695/1695 [01:27<00:00, 19.48batch/s, loss=0.0005]
Epoch 17/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.35batch/s]


→ Epoch 17/20  F1=0.4002  AP=0.6222  PrimAcc=0.6064


Epoch 18/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.49batch/s, loss=0.0005]
Epoch 18/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.41batch/s]


→ Epoch 18/20  F1=0.4061  AP=0.6252  PrimAcc=0.6049


Epoch 19/20 Train: 100%|██████████| 1695/1695 [01:26<00:00, 19.62batch/s, loss=0.0005]
Epoch 19/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 59.85batch/s]


→ Epoch 19/20  F1=0.3985  AP=0.6254  PrimAcc=0.6049


Epoch 20/20 Train: 100%|██████████| 1695/1695 [01:27<00:00, 19.46batch/s, loss=0.0005]
Epoch 20/20 Eval: 100%|██████████| 173/173 [00:02<00:00, 60.07batch/s]


→ Epoch 20/20  F1=0.3996  AP=0.6262  PrimAcc=0.6049


In [9]:
mlflow.log_metric("best_micro_f1", best_f1)
mlflow.log_metric("best_micro_ap", best_ap)
mlflow.log_metric("best_primary_acc", best_acc)
mlflow.end_run()

2025/05/07 16:57:13 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/07 16:57:13 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run carefree-dove-950 at: http://192.5.87.49:8000/#/experiments/6/runs/a0716a5a62cb4a3f8285f7875126e57e
🧪 View experiment at: http://192.5.87.49:8000/#/experiments/6
