In [1]:
import os
import subprocess
import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm                         # ← add tqdm import
import mlflow
import mlflow.pytorch
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

mlflow.set_experiment("PannsMLP")
try:
    mlflow.end_run()
except:
    pass
mlflow.start_run(log_system_metrics=True)

# log GPU info
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout
        for cmd in ["nvidia-smi", "rocm-smi"]
        if subprocess.run(f"command -v {cmd}", shell=True,
                          capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")

nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

def log_system_metrics_mlflow(step=None):
    mlflow.log_metric("system.cpu.utilization", psutil.cpu_percent(), step=step)
    mem = psutil.virtual_memory()
    mlflow.log_metric("system.memory.used", mem.used, step=step)
    mlflow.log_metric("system.memory.percent", mem.percent, step=step)
    gpu_util = nvmlDeviceGetUtilizationRates(gpu_handle).gpu
    mlflow.log_metric("system.gpu.0.utilization", gpu_util, step=step)
    gpu_mem = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("system.gpu.0.memory.used", gpu_mem.used, step=step)
    mlflow.log_metric("system.gpu.0.memory.percent",
                      (gpu_mem.used / gpu_mem.total) * 100, step=step)
    gpu_temp = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("system.gpu.0.temperature", gpu_temp, step=step)

Using device: cuda


2025/05/07 00:25:56 INFO mlflow.tracking.fluent: Experiment with name 'PannsMLP' does not exist. Creating a new experiment.
2025/05/07 00:25:57 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


In [14]:
class EmbeddingDataset(Dataset):
    def __init__(self,
                 manifest_csv: str,
                 metadata_csv: str,
                 feature_base: str,
                 classes: list,
                 embeddings_key: str = "embedding"):
        """
        manifest_csv: path to manifest_train.csv or manifest_test.csv
        metadata_csv: path to train.csv (for secondary_labels)
        feature_base: e.g. "/home/jovyan/Features"
        classes: global sorted list of all primary_label codes
        """
        m_df = pd.read_csv(manifest_csv)
        # build full path under Features/embeddings/
        m_df["emb_path"] = (
            m_df["emb_path"].astype(str)
                       .str.lstrip(os.sep)
                       .apply(lambda p: os.path.join(feature_base, "embeddings", p))
        )

        meta = pd.read_csv(metadata_csv, usecols=["filename","secondary_labels"])
        meta["recording_id"]   = meta.filename.str.replace(r"\.ogg$", "", regex=True)
        meta["secondary_list"] = meta.secondary_labels.fillna("").str.split()
        sec_map = dict(zip(meta.recording_id, meta.secondary_list))

        self.rows = []
        for _, row in tqdm(m_df.iterrows(),
                          total=len(m_df),
                          desc=f"Building {os.path.basename(manifest_csv)}"):
            rid  = row.chunk_id.split("_chk")[0]
            labs = [row.primary_label] + sec_map.get(rid, [])
            self.rows.append({
                "emb_path": row.emb_path,
                "labels":   labs
            })

        self.classes       = classes
        self.label2idx     = {lab:i for i, lab in enumerate(self.classes)}
        self.num_classes   = len(self.classes)
        self.embeddings_key = embeddings_key

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        r   = self.rows[idx]
        arr = np.load(r["emb_path"])[self.embeddings_key]   # (n_windows, emb_dim)
        x   = arr.mean(axis=0).astype(np.float32)           # (emb_dim,)
        y   = np.zeros(self.num_classes, dtype=np.float32)
        for lab in r["labels"]:
            # ignore any labels not in taxonomy
            if lab in self.label2idx:
                y[self.label2idx[lab]] = 1.0
        return x, y

In [4]:
class EmbeddingClassifier(nn.Module):
    def __init__(self, emb_dim, num_classes, hidden_dims=[512,256], dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(emb_dim, hidden_dims[0])
        self.bn1 = nn.BatchNorm1d(hidden_dims[0])
        self.drop1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.bn2 = nn.BatchNorm1d(hidden_dims[1])
        self.drop2 = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dims[1], num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.drop1(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.drop2(x)
        return self.out(x)

In [15]:
TRAIN_MANIFEST = "/home/jovyan/Features/manifest_train.csv"
TEST_MANIFEST  = "/home/jovyan/Features/manifest_test.csv"
TRAIN_CSV      = "/home/jovyan/Data/birdclef-2025/train.csv"
FEATURE_BASE   = "/home/jovyan/Features"

BATCH_SIZE  = 32
LR          = 1e-3
EPOCHS      = 20
HIDDEN_DIMS = [512, 256]
DROPOUT     = 0.3

TAXONOMY_CSV = "/home/jovyan/Data/birdclef-2025/taxonomy.csv"
tax_df = pd.read_csv(TAXONOMY_CSV)
classes = sorted(tax_df['primary_label'].astype(str).tolist())

train_ds = EmbeddingDataset(
    manifest_csv = TRAIN_MANIFEST,
    metadata_csv = TRAIN_CSV,
    feature_base = FEATURE_BASE,
    classes      = classes
)
test_ds = EmbeddingDataset(
    manifest_csv = TEST_MANIFEST,
    metadata_csv = TRAIN_CSV,
    feature_base = FEATURE_BASE,
    classes      = classes
)

assert train_ds.num_classes == test_ds.num_classes == len(classes)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=4)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

sample_x, _ = train_ds[0]
emb_dim = sample_x.shape[0]

model   = EmbeddingClassifier(emb_dim, train_ds.num_classes, HIDDEN_DIMS, DROPOUT).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

mlflow.log_params({
    "input_dim": emb_dim,
    "hidden_dims": HIDDEN_DIMS,
    "dropout": DROPOUT,
    "batch_size": BATCH_SIZE,
    "lr": LR,
    "epochs": EPOCHS
})

Building manifest_train.csv: 100%|██████████| 108451/108451 [00:05<00:00, 21554.23it/s]
Building manifest_test.csv: 100%|██████████| 11022/11022 [00:00<00:00, 22569.60it/s]


In [16]:
best_test_acc = 0.0
best_ckpt     = None

for epoch in range(1, EPOCHS + 1):
    # — Train —
    model.train()
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch} Train", unit="batch")
    running_loss, correct, total = 0.0, 0, 0

    for xb, yb in train_bar:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * xb.size(0)
        preds = (torch.sigmoid(logits) > 0.5).float()
        correct += (preds == yb).all(dim=1).sum().item()
        total   += xb.size(0)

        train_bar.set_postfix({
            "batch_loss": f"{loss.item():.4f}",
            "acc":         f"{correct/total:.4f}"
        })

    train_loss = running_loss / total
    train_acc  = correct / total

    # — Test —
    model.eval()
    test_bar = tqdm(test_loader, desc=f"Epoch {epoch} Test ", unit="batch")
    test_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for xb, yb in test_bar:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)

            test_loss += loss.item() * xb.size(0)
            preds = (torch.sigmoid(logits) > 0.5).float()
            correct += (preds == yb).all(dim=1).sum().item()
            total   += xb.size(0)

            test_bar.set_postfix({
                "batch_loss": f"{loss.item():.4f}",
                "acc":         f"{correct/total:.4f}"
            })

    test_loss /= total
    test_acc   = correct / total

    # checkpoint
    ckpt = f"ckpt_epoch_{epoch}.pt"
    torch.save({
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optim_state": optimizer.state_dict(),
        "train_loss": train_loss,
        "test_loss": test_loss
    }, ckpt)

    # MLflow logging
    mlflow.log_metrics({
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "test_loss": test_loss,
        "test_accuracy": test_acc
    }, step=epoch)
    log_system_metrics_mlflow(step=epoch)
    mlflow.log_artifact(ckpt, artifact_path="checkpoints")

    if test_acc > best_test_acc:
        best_test_acc = test_acc
        best_ckpt     = ckpt

    print(f"→ Epoch {epoch}/{EPOCHS}  "
          f"Train loss={train_loss:.4f}, acc={train_acc:.4f} │ "
          f"Test loss={test_loss:.4f}, acc={test_acc:.4f}")

Epoch 1 Train: 100%|██████████| 3390/3390 [00:30<00:00, 110.71batch/s, batch_loss=0.0231, acc=0.0992]
Epoch 1 Test : 100%|██████████| 345/345 [00:02<00:00, 163.97batch/s, batch_loss=0.0231, acc=0.1381]


→ Epoch 1/20  Train loss=0.0243, acc=0.0992 │ Test loss=0.0196, acc=0.1381


Epoch 2 Train: 100%|██████████| 3390/3390 [00:32<00:00, 104.77batch/s, batch_loss=0.0168, acc=0.1990]
Epoch 2 Test : 100%|██████████| 345/345 [00:02<00:00, 166.55batch/s, batch_loss=0.0208, acc=0.1764]


→ Epoch 2/20  Train loss=0.0175, acc=0.1990 │ Test loss=0.0184, acc=0.1764


Epoch 3 Train: 100%|██████████| 3390/3390 [00:30<00:00, 112.31batch/s, batch_loss=0.0354, acc=0.2394]
Epoch 3 Test : 100%|██████████| 345/345 [00:02<00:00, 161.09batch/s, batch_loss=0.0245, acc=0.2050]


→ Epoch 3/20  Train loss=0.0163, acc=0.2394 │ Test loss=0.0178, acc=0.2050


Epoch 4 Train: 100%|██████████| 3390/3390 [00:31<00:00, 106.30batch/s, batch_loss=0.0328, acc=0.2657]
Epoch 4 Test : 100%|██████████| 345/345 [00:02<00:00, 159.93batch/s, batch_loss=0.0231, acc=0.2102]


→ Epoch 4/20  Train loss=0.0155, acc=0.2657 │ Test loss=0.0175, acc=0.2102


Epoch 5 Train: 100%|██████████| 3390/3390 [00:31<00:00, 106.74batch/s, batch_loss=0.0257, acc=0.2858]
Epoch 5 Test : 100%|██████████| 345/345 [00:02<00:00, 167.31batch/s, batch_loss=0.0221, acc=0.2261]


→ Epoch 5/20  Train loss=0.0150, acc=0.2858 │ Test loss=0.0173, acc=0.2261


Epoch 6 Train: 100%|██████████| 3390/3390 [00:28<00:00, 117.39batch/s, batch_loss=0.0138, acc=0.2994]
Epoch 6 Test : 100%|██████████| 345/345 [00:02<00:00, 169.59batch/s, batch_loss=0.0215, acc=0.2373]


→ Epoch 6/20  Train loss=0.0147, acc=0.2994 │ Test loss=0.0172, acc=0.2373


Epoch 7 Train: 100%|██████████| 3390/3390 [00:29<00:00, 113.77batch/s, batch_loss=0.0354, acc=0.3119]
Epoch 7 Test : 100%|██████████| 345/345 [00:02<00:00, 158.32batch/s, batch_loss=0.0221, acc=0.2477]


→ Epoch 7/20  Train loss=0.0143, acc=0.3119 │ Test loss=0.0169, acc=0.2477


Epoch 8 Train: 100%|██████████| 3390/3390 [00:29<00:00, 115.18batch/s, batch_loss=0.0226, acc=0.3212]
Epoch 8 Test : 100%|██████████| 345/345 [00:02<00:00, 158.71batch/s, batch_loss=0.0218, acc=0.2530]


→ Epoch 8/20  Train loss=0.0141, acc=0.3212 │ Test loss=0.0169, acc=0.2530


Epoch 9 Train: 100%|██████████| 3390/3390 [00:29<00:00, 116.88batch/s, batch_loss=0.0277, acc=0.3323]
Epoch 9 Test : 100%|██████████| 345/345 [00:02<00:00, 163.74batch/s, batch_loss=0.0215, acc=0.2657]


→ Epoch 9/20  Train loss=0.0138, acc=0.3323 │ Test loss=0.0168, acc=0.2657


Epoch 10 Train: 100%|██████████| 3390/3390 [00:29<00:00, 116.10batch/s, batch_loss=0.0279, acc=0.3382]
Epoch 10 Test : 100%|██████████| 345/345 [00:02<00:00, 159.15batch/s, batch_loss=0.0210, acc=0.2641]


→ Epoch 10/20  Train loss=0.0136, acc=0.3382 │ Test loss=0.0167, acc=0.2641


Epoch 11 Train: 100%|██████████| 3390/3390 [00:29<00:00, 115.45batch/s, batch_loss=0.0206, acc=0.3452]
Epoch 11 Test : 100%|██████████| 345/345 [00:02<00:00, 144.60batch/s, batch_loss=0.0203, acc=0.2770]


→ Epoch 11/20  Train loss=0.0135, acc=0.3452 │ Test loss=0.0165, acc=0.2770


Epoch 12 Train: 100%|██████████| 3390/3390 [00:29<00:00, 115.11batch/s, batch_loss=0.0424, acc=0.3531]
Epoch 12 Test : 100%|██████████| 345/345 [00:02<00:00, 138.61batch/s, batch_loss=0.0199, acc=0.2775]


→ Epoch 12/20  Train loss=0.0133, acc=0.3531 │ Test loss=0.0164, acc=0.2775


Epoch 13 Train: 100%|██████████| 3390/3390 [00:29<00:00, 116.76batch/s, batch_loss=0.0236, acc=0.3580]
Epoch 13 Test : 100%|██████████| 345/345 [00:02<00:00, 120.97batch/s, batch_loss=0.0200, acc=0.2811]


→ Epoch 13/20  Train loss=0.0132, acc=0.3580 │ Test loss=0.0165, acc=0.2811


Epoch 14 Train: 100%|██████████| 3390/3390 [00:29<00:00, 116.51batch/s, batch_loss=0.0370, acc=0.3637]
Epoch 14 Test : 100%|██████████| 345/345 [00:02<00:00, 141.31batch/s, batch_loss=0.0205, acc=0.2766]


→ Epoch 14/20  Train loss=0.0130, acc=0.3637 │ Test loss=0.0166, acc=0.2766


Epoch 15 Train: 100%|██████████| 3390/3390 [00:31<00:00, 108.61batch/s, batch_loss=0.0588, acc=0.3686]
Epoch 15 Test : 100%|██████████| 345/345 [00:02<00:00, 136.35batch/s, batch_loss=0.0215, acc=0.2823]


→ Epoch 15/20  Train loss=0.0129, acc=0.3686 │ Test loss=0.0164, acc=0.2823


Epoch 16 Train: 100%|██████████| 3390/3390 [00:29<00:00, 116.84batch/s, batch_loss=0.0308, acc=0.3732]
Epoch 16 Test : 100%|██████████| 345/345 [00:02<00:00, 141.55batch/s, batch_loss=0.0192, acc=0.2824]


→ Epoch 16/20  Train loss=0.0128, acc=0.3732 │ Test loss=0.0164, acc=0.2824


Epoch 17 Train: 100%|██████████| 3390/3390 [00:29<00:00, 115.58batch/s, batch_loss=0.0292, acc=0.3779]
Epoch 17 Test : 100%|██████████| 345/345 [00:02<00:00, 144.16batch/s, batch_loss=0.0219, acc=0.2873]


→ Epoch 17/20  Train loss=0.0126, acc=0.3779 │ Test loss=0.0164, acc=0.2873


Epoch 18 Train: 100%|██████████| 3390/3390 [00:28<00:00, 119.66batch/s, batch_loss=0.0418, acc=0.3828]
Epoch 18 Test : 100%|██████████| 345/345 [00:02<00:00, 169.07batch/s, batch_loss=0.0190, acc=0.2917]


→ Epoch 18/20  Train loss=0.0126, acc=0.3828 │ Test loss=0.0163, acc=0.2917


Epoch 19 Train: 100%|██████████| 3390/3390 [00:28<00:00, 119.82batch/s, batch_loss=0.0264, acc=0.3865]
Epoch 19 Test : 100%|██████████| 345/345 [00:02<00:00, 167.83batch/s, batch_loss=0.0187, acc=0.2946]


→ Epoch 19/20  Train loss=0.0125, acc=0.3865 │ Test loss=0.0162, acc=0.2946


Epoch 20 Train: 100%|██████████| 3390/3390 [00:29<00:00, 114.60batch/s, batch_loss=0.0147, acc=0.3891]
Epoch 20 Test : 100%|██████████| 345/345 [00:02<00:00, 167.96batch/s, batch_loss=0.0192, acc=0.2866]


→ Epoch 20/20  Train loss=0.0124, acc=0.3891 │ Test loss=0.0163, acc=0.2866


In [18]:
mlflow.log_metric("best_test_accuracy", best_test_acc)
mlflow.log_artifact(best_ckpt, artifact_path="model")
mlflow.end_run()

2025/05/07 00:57:21 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/07 00:57:21 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run rare-stoat-788 at: http://192.5.87.49:8000/#/experiments/1/runs/89cdd90cdd88426eb2fb80fcff474108
🧪 View experiment at: http://192.5.87.49:8000/#/experiments/1
