In [1]:
import os
import subprocess
import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import mlflow
import mlflow.pytorch
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast, GradScaler
from sklearn.metrics import f1_score, average_precision_score, precision_recall_curve
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", DEVICE)

mlflow.set_experiment("RawAudioCNN")
try: mlflow.end_run()
except: pass
mlflow.start_run(log_system_metrics=True)

# log GPU/CPU info
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout 
     for cmd in ["nvidia-smi","rocm-smi"]
     if subprocess.run(f"command -v {cmd}", shell=True,
                       capture_output=True).returncode==0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)
def log_sys(step=None):
    mlflow.log_metric("cpu_pct", psutil.cpu_percent(), step=step)
    mem = psutil.virtual_memory()
    mlflow.log_metric("mem_used", mem.used, step=step)
    mlflow.log_metric("mem_pct", mem.percent, step=step)
    g = nvmlDeviceGetUtilizationRates(gpu_handle)
    mlflow.log_metric("gpu_util", g.gpu, step=step)
    gm = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("gpu_mem_used", gm.used, step=step)
    mlflow.log_metric("gpu_mem_pct", (gm.used/gm.total)*100, step=step)
    t = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("gpu_temp", t, step=step)

2025/05/07 22:25:38 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Using device: cuda


In [3]:
BATCH_SIZE   = 64
LR           = 3e-3
WEIGHT_DECAY = 1e-2
EPOCHS       = 20
SAVE_CKPT    = False
BEST_CKPT    = "best_rawcnn.pt"

TAXONOMY_CSV = "/home/jovyan/Data/birdclef-2025/taxonomy.csv"
TRAIN_MAN    = "/home/jovyan/Features/manifest_train.csv"
TEST_MAN     = "/home/jovyan/Features/manifest_test.csv"
TRAIN_META   = "/home/jovyan/Data/birdclef-2025/train.csv"
FEATURE_BASE = "/home/jovyan/Features"

tax_df  = pd.read_csv(TAXONOMY_CSV)
CLASSES = sorted(tax_df["primary_label"].astype(str).tolist())
NUM_CLS = len(CLASSES)

mlflow.log_params({
    "model":        "RawAudioCNN",
    "batch_size":   BATCH_SIZE,
    "lr":           LR,
    "weight_decay": WEIGHT_DECAY,
    "epochs":       EPOCHS,
    "input":        "raw_waveform",
})

In [4]:
class RawAudioDataset(Dataset):
    def __init__(self, manifest_csv, meta_csv, base, classes,
                 sr=32000, dur=10.0):
        m = pd.read_csv(manifest_csv)
        m["path"] = (
            m["audio_path"].astype(str)
             .str.lstrip(os.sep)
             .apply(lambda p: os.path.join(base, "denoised", p))
        )
        meta = pd.read_csv(meta_csv, usecols=["filename","secondary_labels"])
        meta["rid"]  = meta.filename.str.replace(r"\.ogg$","",regex=True)
        meta["secs"] = meta.secondary_labels.fillna("").str.split()
        sec_map      = dict(zip(meta.rid, meta.secs))

        self.rows    = []
        self.idx_map = {c:i for i,c in enumerate(classes)}
        self.wav_len = int(sr * dur)
        for _, r in m.iterrows():
            rid  = r.chunk_id.split("_chk")[0]
            labs = [r.primary_label] + sec_map.get(rid, [])
            labs = [l for l in labs if l in self.idx_map]
            prim = self.idx_map[r.primary_label]
            self.rows.append((r.path, labs, prim))

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        path, labs, prim = self.rows[i]
        wav, _ = torchaudio.load(path)    # (1, samples)
        wav     = wav.squeeze(0)          # (samples,)
        if wav.size(0) < self.wav_len:
            wav = F.pad(wav, (0, self.wav_len-wav.size(0)))
        else:
            wav = wav[:self.wav_len]
        # normalize per‐sample
        wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
        # label vector
        y = torch.zeros(NUM_CLS, dtype=torch.float32)
        for l in labs:
            y[self.idx_map[l]] = 1.0
        return wav, y, prim

In [5]:
train_ds = RawAudioDataset(TRAIN_MAN, TRAIN_META, FEATURE_BASE, CLASSES)
test_ds  = RawAudioDataset(TEST_MAN,  TRAIN_META, FEATURE_BASE, CLASSES)

train_loader = DataLoader(train_ds,
    batch_size=BATCH_SIZE, shuffle=True,  num_workers=16, pin_memory=True)
test_loader  = DataLoader(test_ds,
    batch_size=BATCH_SIZE, shuffle=False, num_workers=16, pin_memory=True)

In [6]:
class RawAudioCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # initial downsample
        self.conv1 = nn.Conv1d(1, 16, kernel_size=15, stride=4, padding=7)
        self.bn1   = nn.BatchNorm1d(16)
        self.pool  = nn.MaxPool1d(4)
        # deeper layers
        self.conv2 = nn.Conv1d(16, 32, kernel_size=15, stride=2, padding=7)
        self.bn2   = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=15, stride=2, padding=7)
        self.bn3   = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64,128,kernel_size=15, stride=2,padding=7)
        self.bn4   = nn.BatchNorm1d(128)
        # global pooling & head
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc          = nn.Linear(128, num_classes)

    def forward(self, x):
        # x: [B, T] → [B,1,T]
        x = x.unsqueeze(1)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.global_pool(x).squeeze(-1)  # [B,128]
        return self.fc(x)

In [7]:

model     = RawAudioCNN(NUM_CLS).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY
)
scheduler = OneCycleLR(
    optimizer, max_lr=LR,
    steps_per_epoch=len(train_loader),
    epochs=EPOCHS, pct_start=0.1,
    div_factor=10, final_div_factor=100
)
scaler    = GradScaler()

In [8]:
best_f1 = best_ap = best_acc = 0.0
thresholds = np.full(NUM_CLS, 0.5, dtype=np.float32)

for epoch in range(1, EPOCHS+1):
    # — Train —
    model.train()
    run_loss = total = 0
    train_bar = tqdm(train_loader, desc=f"[{epoch}/{EPOCHS}] Train", unit="batch")
    for wav, yb, prim in train_bar:
        wav, yb = wav.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        with autocast(device_type="cuda"):
            logits = model(wav)
            loss   = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        bs = wav.size(0)
        run_loss += loss.item()*bs
        total    += bs
        train_bar.set_postfix({"loss": f"{run_loss/total:.4f}"})

    train_loss = run_loss/total

    # — Eval —
    model.eval()
    val_loss = total = 0
    all_scores, all_tgts, all_prims = [], [], []
    eval_bar = tqdm(test_loader, desc=f"[{epoch}/{EPOCHS}] Test ", unit="batch")
    with torch.no_grad():
        for wav, yb, prim in eval_bar:
            wav, yb = wav.to(DEVICE), yb.to(DEVICE)
            with autocast(device_type="cuda"):
                logits = model(wav)
                loss   = criterion(logits, yb)
            bs = wav.size(0)
            val_loss += loss.item()*bs
            total    += bs
            scores = torch.sigmoid(logits).cpu().numpy()
            all_scores.append(scores)
            all_tgts.append(yb.cpu().numpy())
            all_prims.append(prim.numpy())
            eval_bar.set_postfix({"loss": f"{val_loss/total:.4f}"})

    val_loss = val_loss/total
    scores   = np.vstack(all_scores)
    tgts     = np.vstack(all_tgts)
    prims    = np.concatenate(all_prims, axis=0)

    # threshold calibration
    for i in range(NUM_CLS):
        y_true, y_score = tgts[:,i], scores[:,i]
        if 0<y_true.sum()<len(y_true):
            prec, rec, th = precision_recall_curve(y_true, y_score)
            f1s = 2*prec[:-1]*rec[:-1]/(prec[:-1]+rec[:-1]+1e-8)
            if f1s.size>0:
                thresholds[i] = th[np.nanargmax(f1s)]

    preds    = (scores>=thresholds).astype(int)
    micro_f1 = f1_score(tgts, preds, average="micro", zero_division=0)
    micro_ap = average_precision_score(tgts, scores, average="micro")
    prim_acc = (scores.argmax(axis=1)==prims).mean()

    # checkpoint best
    if micro_f1>best_f1:
        best_f1, best_ap, best_acc = micro_f1, micro_ap, prim_acc
        torch.save(model.state_dict(), BEST_CKPT)
        mlflow.log_artifact(BEST_CKPT, artifact_path="model")

    # MLflow logging
    mlflow.log_metrics({
        "train_loss": train_loss,
        "val_loss":   val_loss,
        "micro_f1":   micro_f1,
        "micro_ap":   micro_ap,
        "prim_acc":   prim_acc
    }, step=epoch)
    log_sys(step=epoch)

    print(f"→ Epoch {epoch}/{EPOCHS}  "
          f"F1={micro_f1:.4f}  AP={micro_ap:.4f}  PrimAcc={prim_acc:.4f}")

[1/20] Train: 100%|██████████| 1695/1695 [02:06<00:00, 13.45batch/s, loss=0.0560]
[1/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.11batch/s, loss=0.0273]


→ Epoch 1/20  F1=0.0237  AP=0.0492  PrimAcc=0.1066


[2/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.71batch/s, loss=0.0240]
[2/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.31batch/s, loss=0.0229]


→ Epoch 2/20  F1=0.0837  AP=0.1585  PrimAcc=0.2202


[3/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.66batch/s, loss=0.0202]
[3/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.05batch/s, loss=0.0213]


→ Epoch 3/20  F1=0.1712  AP=0.2017  PrimAcc=0.2692


[4/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.66batch/s, loss=0.0182]
[4/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.69batch/s, loss=0.0211]


→ Epoch 4/20  F1=0.2314  AP=0.2140  PrimAcc=0.2928


[5/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.69batch/s, loss=0.0170]
[5/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.26batch/s, loss=0.0202]


→ Epoch 5/20  F1=0.2722  AP=0.2607  PrimAcc=0.3272


[6/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.68batch/s, loss=0.0161]
[6/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.33batch/s, loss=0.0190]


→ Epoch 6/20  F1=0.2856  AP=0.3200  PrimAcc=0.3779


[7/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.68batch/s, loss=0.0155]
[7/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.81batch/s, loss=0.0195]


→ Epoch 7/20  F1=0.3096  AP=0.3219  PrimAcc=0.3867


[8/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.62batch/s, loss=0.0149]
[8/20] Test : 100%|██████████| 173/173 [00:09<00:00, 17.92batch/s, loss=0.0191]


→ Epoch 8/20  F1=0.3078  AP=0.3154  PrimAcc=0.3825


[9/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.64batch/s, loss=0.0145]
[9/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.07batch/s, loss=0.0188]


→ Epoch 9/20  F1=0.3099  AP=0.3466  PrimAcc=0.4026


[10/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.67batch/s, loss=0.0141]
[10/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.37batch/s, loss=0.0182]


→ Epoch 10/20  F1=0.3412  AP=0.3618  PrimAcc=0.4217


[11/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.64batch/s, loss=0.0138]
[11/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.66batch/s, loss=0.0178]


→ Epoch 11/20  F1=0.3388  AP=0.3793  PrimAcc=0.4341


[12/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.67batch/s, loss=0.0135]
[12/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.33batch/s, loss=0.0175]


→ Epoch 12/20  F1=0.3229  AP=0.3951  PrimAcc=0.4467


[13/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.68batch/s, loss=0.0132]
[13/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.62batch/s, loss=0.0174]


→ Epoch 13/20  F1=0.3467  AP=0.4075  PrimAcc=0.4692


[14/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.67batch/s, loss=0.0130]
[14/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.52batch/s, loss=0.0172]


→ Epoch 14/20  F1=0.3550  AP=0.4113  PrimAcc=0.4670


[15/20] Train: 100%|██████████| 1695/1695 [02:03<00:00, 13.67batch/s, loss=0.0128]
[15/20] Test : 100%|██████████| 173/173 [00:09<00:00, 17.75batch/s, loss=0.0170]


→ Epoch 15/20  F1=0.3411  AP=0.4181  PrimAcc=0.4700


[16/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.64batch/s, loss=0.0126]
[16/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.92batch/s, loss=0.0168]


→ Epoch 16/20  F1=0.3505  AP=0.4293  PrimAcc=0.4774


[17/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.65batch/s, loss=0.0125]
[17/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.33batch/s, loss=0.0167]


→ Epoch 17/20  F1=0.3502  AP=0.4321  PrimAcc=0.4874


[18/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.62batch/s, loss=0.0124]
[18/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.23batch/s, loss=0.0166]


→ Epoch 18/20  F1=0.3386  AP=0.4361  PrimAcc=0.4874


[19/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.65batch/s, loss=0.0123]
[19/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.33batch/s, loss=0.0166]


→ Epoch 19/20  F1=0.3319  AP=0.4362  PrimAcc=0.4903


[20/20] Train: 100%|██████████| 1695/1695 [02:04<00:00, 13.66batch/s, loss=0.0123]
[20/20] Test : 100%|██████████| 173/173 [00:09<00:00, 18.38batch/s, loss=0.0166]


→ Epoch 20/20  F1=0.3392  AP=0.4372  PrimAcc=0.4904


In [9]:
mlflow.log_metric("best_micro_f1", best_f1)
mlflow.log_metric("best_micro_ap", best_ap)
mlflow.log_metric("best_prim_acc", best_acc)
mlflow.end_run()

2025/05/07 23:10:51 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/07 23:10:51 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run useful-fish-586 at: http://192.5.87.49:8000/#/experiments/9/runs/de5306d0af1e4f77859e41cc45550152
🧪 View experiment at: http://192.5.87.49:8000/#/experiments/9
