In [None]:
import os
import shutil
import subprocess

import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50            # ResNet50 backbone
from tqdm import tqdm
import mlflow
import mlflow.pytorch
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU,
    NVMLError
)

In [None]:
# Use the CUDA device when PyTorch can see one, otherwise run on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

mlflow.set_experiment("ResNet50_MelAug")
# Close a run left over from a previous execution of this cell so that
# start_run() below does not collide with it. This stays best-effort, but
# only ordinary errors are tolerated (not KeyboardInterrupt/SystemExit) and
# the reason is reported instead of being silently dropped.
try:
    mlflow.end_run()
except Exception as exc:
    print(f"Could not end previous MLflow run: {exc}")
mlflow.start_run(log_system_metrics=True)

# Log GPU info: store the output of the first vendor tool found on PATH.
smi_tool = next(
    (tool for tool in ("nvidia-smi", "rocm-smi") if shutil.which(tool)),
    None,
)
if smi_tool is None:
    gpu_info = "No GPU found."
else:
    gpu_info = subprocess.run([smi_tool], capture_output=True, text=True).stdout
mlflow.log_text(gpu_info, "gpu-info.txt")

# NVML is only usable with an NVIDIA driver. The cell already supports CPU
# and ROCm hosts above, so a missing NVML must not abort the notebook:
# gpu_handle is None in that case and GPU metrics are skipped.
try:
    nvmlInit()
    gpu_handle = nvmlDeviceGetHandleByIndex(0)
except NVMLError as exc:
    gpu_handle = None
    print(f"NVML unavailable ({exc}); GPU system metrics will not be logged.")


def log_system_metrics_mlflow(step=None):
    """Log a snapshot of host and GPU-0 resource usage to the active MLflow run.

    CPU utilisation and RAM usage are always logged. GPU utilisation, memory
    and temperature are logged only when an NVML handle for device 0 was
    obtained during setup (``gpu_handle is not None``).

    Args:
        step: Optional MLflow step for the metrics (the training loop passes
            the epoch number).
    """
    mem = psutil.virtual_memory()
    metrics = {
        # cpu_percent() is called without an interval, i.e. non-blocking.
        "system.cpu.utilization": psutil.cpu_percent(),
        "system.memory.used": mem.used,
        "system.memory.percent": mem.percent,
    }

    if gpu_handle is not None:
        gpu_mem = nvmlDeviceGetMemoryInfo(gpu_handle)
        metrics.update({
            "system.gpu.0.utilization": nvmlDeviceGetUtilizationRates(gpu_handle).gpu,
            "system.gpu.0.memory.used": gpu_mem.used,
            "system.gpu.0.memory.percent": (gpu_mem.used / gpu_mem.total) * 100,
            "system.gpu.0.temperature": nvmlDeviceGetTemperature(
                gpu_handle, NVML_TEMPERATURE_GPU),
        })

    # One batched call instead of one request per metric.
    mlflow.log_metrics(metrics, step=step)

In [None]:
class MelAugDataset(Dataset):
    """Multi-label dataset of mel spectrograms stored in ``.npz`` files.

    Each item is ``(x, y)`` where ``x`` is the array stored under ``mel_key``
    with a leading channel axis added (float32) and ``y`` is a multi-hot
    float32 vector over ``self.classes``.

    Attributes set by the constructor:
        rows: list of ``{"mel_path": ..., "labels": [...]}`` dicts, one per
            manifest row.
        classes: ordered list of label names (the target vocabulary).
        label2idx: mapping from label name to its index in ``classes``.
        num_classes: ``len(classes)``.
        unknown_labels: sorted labels found in the manifest/metadata but
            absent from a caller-supplied ``classes`` (empty otherwise).
    """

    # One label token: a run of characters that is not whitespace, a bracket,
    # a quote or a comma. This reads whitespace-separated labels ("a b") as
    # well as list-literal strings ("['a', 'b']", "['']", "[]").
    _LABEL_TOKEN = r"""[^\s\[\]',"]+"""

    def __init__(self, manifest_csv: str, metadata_csv: str, mel_key="mel",
                 classes=None):
        """
        manifest_csv: path to manifest_train.csv or manifest_test.csv
        metadata_csv: path to full train.csv (for secondary_labels)
        mel_key:      name of the array inside each .npz file
        classes:      optional fixed label vocabulary. Pass the training
                      dataset's ``classes`` when building an evaluation
                      dataset so both produce targets with the same width
                      and column order. Labels outside it are dropped from
                      the rows and recorded in ``unknown_labels``.
                      When None, the vocabulary is built from this manifest.
        """
        # Read label columns as text: pandas would otherwise infer an integer
        # dtype for all-digit labels, which cannot be sorted or matched
        # against the string secondary labels.
        m_df = pd.read_csv(manifest_csv,
                           dtype={"chunk_id": str, "primary_label": str})
        meta = pd.read_csv(metadata_csv,
                           usecols=["filename", "secondary_labels"],
                           dtype=str)
        recording_ids = meta["filename"].str.replace(r"\.ogg$", "", regex=True)
        # NOTE(review): assumes secondary_labels is either whitespace-separated
        # or a list-literal string; plain str.split() would turn the latter
        # into bogus labels such as "['a'," — confirm against the CSV.
        secondary = meta["secondary_labels"].fillna("").str.findall(self._LABEL_TOKEN)
        sec_map = dict(zip(recording_ids, secondary))

        self.rows = []
        all_labels = set()
        records = zip(m_df["chunk_id"], m_df["mel_aug_path"], m_df["primary_label"])
        for chunk_id, mel_path, prim in tqdm(
                records,
                total=len(m_df),
                desc=f"Building MelAugDataset ({os.path.basename(manifest_csv)})"):
            # The recording id is the part of chunk_id before "_chk".
            rid = chunk_id.split("_chk")[0]
            # Primary label first, then secondaries, without duplicates.
            labels = list(dict.fromkeys([prim, *sec_map.get(rid, [])]))
            all_labels.update(labels)
            self.rows.append({
                "mel_path": mel_path,
                "labels": labels
            })

        if classes is None:
            self.classes = sorted(all_labels)
            self.unknown_labels = []
        else:
            self.classes = list(classes)
            known = set(self.classes)
            self.unknown_labels = sorted(all_labels - known)
            if self.unknown_labels:
                for entry in self.rows:
                    entry["labels"] = [lab for lab in entry["labels"] if lab in known]

        self.label2idx = {lab: i for i, lab in enumerate(self.classes)}
        self.num_classes = len(self.classes)
        self.mel_key = mel_key

    def __len__(self):
        """Number of manifest rows."""
        return len(self.rows)

    def __getitem__(self, idx):
        """Load one spectrogram and build its multi-hot target."""
        r = self.rows[idx]
        # Close the archive right away; np.load keeps the .npz file open
        # until the returned object is closed.
        with np.load(r["mel_path"]) as npz:
            arr = npz[self.mel_key]            # (n_mels, n_frames) per the original author
        x = torch.from_numpy(arr).unsqueeze(0).float()  # add channel axis
        y = torch.zeros(self.num_classes, dtype=torch.float32)
        for lab in r["labels"]:
            y[self.label2idx[lab]] = 1.0
        return x, y


In [None]:
def get_resnet50_multilabel(num_classes: int):
    """Build a randomly initialised ResNet50 for single-channel multi-label input.

    The stem convolution is replaced by one that accepts 1 input channel
    (same output channels, kernel, stride and padding as the stock layer) and
    the final fully connected layer is replaced by one with ``num_classes``
    outputs. The model returns raw logits; no sigmoid is applied here.

    Args:
        num_classes: number of output logits.

    Returns:
        The modified torchvision ResNet50 module.
    """
    # No arguments means no pretrained weights. This avoids the deprecated
    # ``pretrained=`` keyword while staying valid on older torchvision too.
    model = resnet50()

    # adapt first conv to 1-channel
    stem = model.conv1
    model.conv1 = nn.Conv2d(
        in_channels=1,
        out_channels=stem.out_channels,
        kernel_size=stem.kernel_size,
        stride=stem.stride,
        padding=stem.padding,
        bias=False
    )

    # replace final layer
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

In [None]:
# Input locations.
TRAIN_MANIFEST = "/home/jovyan/Features/manifest_train.csv"
TEST_MANIFEST  = "/home/jovyan/Features/manifest_test.csv"
TRAIN_CSV      = "/home/jovyan/Data/birdclef-2025/train.csv"

# Hyper-parameters.
BATCH_SIZE = 16    # lower batch size for large images
LR         = 1e-4
EPOCHS     = 30

train_ds = MelAugDataset(TRAIN_MANIFEST, TRAIN_CSV)
test_ds  = MelAugDataset(TEST_MANIFEST,  TRAIN_CSV)

# Each MelAugDataset derives its class list from its own manifest, so the
# test targets can differ from the train targets in width and column order.
# The model is sized from train_ds, so make the test set use exactly the
# training label space. Labels that never occur in training cannot be
# predicted and are dropped from the test rows.
unseen_labels = sorted(set(test_ds.classes) - set(train_ds.classes))
if unseen_labels:
    print(f"Warning: dropping {len(unseen_labels)} test label(s) "
          f"absent from the training set: {unseen_labels}")
    for row in test_ds.rows:
        row["labels"] = [lab for lab in row["labels"]
                         if lab in train_ds.label2idx]
test_ds.classes     = train_ds.classes
test_ds.label2idx   = train_ds.label2idx
test_ds.num_classes = train_ds.num_classes

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE,
                          shuffle=True,  num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=4, pin_memory=True)

model = get_resnet50_multilabel(train_ds.num_classes).to(device)
# Multi-label objective: independent sigmoid + BCE per class on raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

mlflow.log_params({
    "model": "resnet50_scratch",
    "input": "aug_mel",
    "num_classes": train_ds.num_classes,
    "batch_size": BATCH_SIZE,
    "lr": LR,
    "epochs": EPOCHS
})

In [None]:
def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, exact_match_acc)``.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch; without one the model is put in eval mode and gradients are
    disabled. A sample counts as correct only when every class prediction
    (sigmoid > 0.5) matches its target.

    Raises:
        ValueError: if the loader yields no samples.
    """
    is_train = optimizer is not None
    model.train(is_train)
    bar = tqdm(loader, desc=desc, unit="batch")
    loss_sum, n_exact, n_seen = 0.0, 0, 0

    with torch.set_grad_enabled(is_train):
        for xb, yb in bar:
            xb, yb = xb.to(device), yb.to(device)
            if is_train:
                optimizer.zero_grad()
            logits = model(xb)          # [B, num_classes]
            loss   = criterion(logits, yb)
            if is_train:
                loss.backward()
                optimizer.step()

            batch_size = xb.size(0)
            # criterion returns the batch mean; weight it by batch size so
            # the epoch average is per-sample even with a short last batch.
            loss_sum += loss.item() * batch_size
            preds = (torch.sigmoid(logits) > 0.5).float()
            n_exact += (preds == yb).all(dim=1).sum().item()
            n_seen  += batch_size

            bar.set_postfix({
                "loss": f"{loss_sum/n_seen:.4f}",
                "acc":  f"{n_exact/n_seen:.4f}"
            })

    if n_seen == 0:
        raise ValueError(f"{desc.strip()}: data loader produced no samples")
    return loss_sum / n_seen, n_exact / n_seen


best_test_acc = 0.0
best_ckpt     = None

for epoch in range(1, EPOCHS+1):
    # — Train —
    train_loss, train_acc = _run_epoch(
        model, train_loader, criterion, device,
        f"Epoch {epoch} Train", optimizer)

    # — Test —
    test_loss, test_acc = _run_epoch(
        model, test_loader, criterion, device,
        f"Epoch {epoch} Test ")

    # checkpoint
    ckpt = f"resnet50_epoch_{epoch}.pt"
    torch.save({
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optim_state": optimizer.state_dict(),
        "train_loss": train_loss,
        "test_loss": test_loss
    }, ckpt)

    # MLflow logging
    mlflow.log_metrics({
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "test_loss": test_loss,
        "test_accuracy": test_acc
    }, step=epoch)
    log_system_metrics_mlflow(step=epoch)
    mlflow.log_artifact(ckpt, artifact_path="checkpoints")

    # Always record the first epoch: exact-match accuracy can stay at 0.0,
    # and best_ckpt must not remain None for the final logging cell.
    if best_ckpt is None or test_acc > best_test_acc:
        best_test_acc = test_acc
        best_ckpt     = ckpt

    print(f"→ Epoch {epoch}/{EPOCHS}  "
          f"Train loss={train_loss:.4f}, acc={train_acc:.4f} │ "
          f"Test loss={test_loss:.4f}, acc={test_acc:.4f}")

In [None]:
# Final logging. The run is always closed, and marked FAILED if any of the
# logging calls raises, so a half-finished run is not left active.
try:
    mlflow.log_metric("best_test_accuracy", best_test_acc)
    # best_ckpt is None when the training loop never selected a checkpoint.
    if best_ckpt is not None:
        mlflow.log_artifact(best_ckpt, artifact_path="model")
    # NOTE(review): `model` holds the weights of the last epoch, which is not
    # necessarily the epoch saved in best_ckpt — confirm that is intended.
    mlflow.pytorch.log_model(model, "resnet50_melaug_model")
except Exception:
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()