In [13]:
import os
import subprocess
import psutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import timm
from peft import get_peft_model, LoraConfig, TaskType
import mlflow
import mlflow.pytorch
from types import SimpleNamespace
from pynvml import (
    nvmlInit, nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetUtilizationRates, nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU
)

In [14]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", DEVICE)

mlflow.set_experiment("EfficientNetB3_LoRA")
try: mlflow.end_run()
except: pass
mlflow.start_run(log_system_metrics=True)

gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout
     for cmd in ["nvidia-smi", "rocm-smi"]
     if subprocess.run(f"command -v {cmd}", shell=True, capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")

nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

def log_system_metrics_mlflow(step=None):
    mlflow.log_metric("system.cpu.utilization", psutil.cpu_percent(), step=step)
    mem = psutil.virtual_memory()
    mlflow.log_metric("system.memory.used", mem.used, step=step)
    mlflow.log_metric("system.memory.percent", mem.percent, step=step)
    gpu_util = nvmlDeviceGetUtilizationRates(gpu_handle).gpu
    mlflow.log_metric("system.gpu.0.utilization", gpu_util, step=step)
    gpu_mem = nvmlDeviceGetMemoryInfo(gpu_handle)
    mlflow.log_metric("system.gpu.0.memory.used", gpu_mem.used, step=step)
    mlflow.log_metric("system.gpu.0.memory.percent",
                      (gpu_mem.used / gpu_mem.total) * 100, step=step)
    gpu_temp = nvmlDeviceGetTemperature(gpu_handle, NVML_TEMPERATURE_GPU)
    mlflow.log_metric("system.gpu.0.temperature", gpu_temp, step=step)

2025/05/07 03:38:41 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Using device: cuda


In [15]:
tax_df      = pd.read_csv("/home/jovyan/Data/birdclef-2025/taxonomy.csv")
classes     = sorted(tax_df['primary_label'].astype(str).tolist())
num_classes = len(classes)

In [16]:
class MelDataset(Dataset):
    def __init__(self, manifest_csv, metadata_csv, feature_base, classes, mel_key="mel"):
        m_df = pd.read_csv(manifest_csv)
        # build full mel npz paths
        m_df["mel_path"] = (
            m_df["mel_path"].astype(str)
                .str.lstrip(os.sep)
                .apply(lambda p: os.path.join(feature_base, "mel", p))
        )
        # load secondary labels
        meta = pd.read_csv(metadata_csv, usecols=["filename","secondary_labels"])
        meta["recording_id"]   = meta.filename.str.replace(r"\.ogg$","",regex=True)
        meta["secondary_list"] = meta.secondary_labels.fillna("").str.split()
        sec_map = dict(zip(meta.recording_id, meta.secondary_list))

        self.rows       = []
        self.label2idx  = {lab:i for i, lab in enumerate(classes)}
        self.num_classes= len(classes)
        self.mel_key    = mel_key

        for _, row in tqdm(m_df.iterrows(), total=len(m_df),
                          desc=f"Building {os.path.basename(manifest_csv)}"):
            rid  = row.chunk_id.split("_chk")[0]
            labs = [row.primary_label] + sec_map.get(rid, [])
            # filter to only known classes
            labs = [l for l in labs if l in self.label2idx]
            self.rows.append({"mel_path": row.mel_path, "labels": labs})

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        r   = self.rows[idx]
        arr = np.load(r["mel_path"])[self.mel_key]          # (n_mels, n_frames)
        x   = torch.from_numpy(arr).unsqueeze(0).float()     # (1, n_mels, n_frames)
        y   = torch.zeros(self.num_classes, dtype=torch.float32)
        for lab in r["labels"]:
            y[self.label2idx[lab]] = 1.0
        return x, y


In [17]:
def build_efficientnetb3_lora(num_classes):
    base = timm.create_model("efficientnet_b3", pretrained=True)

    # ─── Monkey‑patch forward to swallow input_ids / kwargs ─────────────────
    orig_forward = base.forward
    def forward_patch(*args, input_ids=None, **kwargs):
        # PEFT wrapper will call this with input_ids=xb (our tensor)
        x = input_ids if input_ids is not None else args[0]
        return orig_forward(x)
    base.forward = forward_patch

    # ─── Adapt to 1‑channel ───────────────────────────────────────────────────
    stem = base.conv_stem
    base.conv_stem = nn.Conv2d(
        in_channels=1,
        out_channels=stem.out_channels,
        kernel_size=stem.kernel_size,
        stride=stem.stride,
        padding=stem.padding,
        bias=False
    )
    # ─── Replace the classifier head ────────────────────────────────────────
    in_feat = base.classifier.in_features
    base.classifier = nn.Linear(in_feat, num_classes)

    lora_config = LoraConfig(
        r=12,
        lora_alpha=24,
        target_modules=TARGET_MODULES,
        lora_dropout=0.1,
        bias="none",
        modules_to_save=MODULES_TO_SAVE,
        task_type="FEATURE_EXTRACTION",
        inference_mode=False
    )
    model = get_peft_model(base, lora_config)
    model.print_trainable_parameters()
    return model

In [18]:
FEATURE_BASE   = "/home/jovyan/Features"
TRAIN_MANIFEST = os.path.join(FEATURE_BASE, "manifest_train.csv")
TEST_MANIFEST  = os.path.join(FEATURE_BASE, "manifest_test.csv")
TRAIN_CSV      = "/home/jovyan/Data/birdclef-2025/train.csv"

TARGET_MODULES  = ["conv_pw", "conv_dw", "conv_pwl", "conv_head"]
MODULES_TO_SAVE = ["classifier"]


BATCH_SIZE = 64
LR         = 1e-4
EPOCHS     = 1

train_ds = MelDataset(TRAIN_MANIFEST, TRAIN_CSV, FEATURE_BASE, classes)
test_ds  = MelDataset(TEST_MANIFEST,  TRAIN_CSV, FEATURE_BASE, classes)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE,
                          shuffle=False, num_workers=4, pin_memory=True)

model     = build_efficientnetb3_lora(num_classes).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

mlflow.log_params({
    "model":           "efficientnet_b3_lora",
    "input":           "mel",
    "num_classes":     num_classes,
    "batch_size":      BATCH_SIZE,
    "lr":              LR,
    "epochs":          EPOCHS,
    "lora_r":          12,
    "lora_alpha":      24,
    "lora_dropout":    0.1,
    "target_modules":  TARGET_MODULES
})

Building manifest_train.csv: 100%|██████████| 108451/108451 [00:04<00:00, 22855.52it/s]
Building manifest_test.csv: 100%|██████████| 11022/11022 [00:00<00:00, 23751.14it/s]


trainable params: 5,572,334 || all params: 16,584,468 || trainable%: 33.5997


In [19]:
best_test_acc, best_ckpt = 0.0, None

for epoch in range(1, EPOCHS+1):
    # — Train —
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch} Train", unit="batch")
    run_loss, correct, total = 0.0, 0, 0
    for xb, yb in pbar:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss   = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        run_loss += loss.item() * xb.size(0)
        preds    = (torch.sigmoid(logits)>0.5).float()
        correct  += (preds==yb).all(dim=1).sum().item()
        total    += xb.size(0)
        pbar.set_postfix({"loss":f"{run_loss/total:.4f}",
                          "acc": f"{correct/total:.4f}"})

    train_loss, train_acc = run_loss/total, correct/total

    # — Test —
    model.eval()
    pbar = tqdm(test_loader, desc=f"Epoch {epoch} Test ", unit="batch")
    test_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for xb, yb in pbar:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss   = criterion(logits, yb)

            test_loss += loss.item() * xb.size(0)
            preds      = (torch.sigmoid(logits)>0.5).float()
            correct   += (preds==yb).all(dim=1).sum().item()
            total     += xb.size(0)
            pbar.set_postfix({"loss":f"{test_loss/total:.4f}",
                              "acc": f"{correct/total:.4f}"})

    test_loss, test_acc = test_loss/total, correct/total

    # — Checkpoint —
    ckpt = f"effb3_lora_epoch_{epoch}.pt"
    torch.save({
        "epoch":       epoch,
        "model_state": model.state_dict(),
        "optim_state": optimizer.state_dict(),
        "train_loss":  train_loss,
        "test_loss":   test_loss
    }, ckpt)

    mlflow.log_metrics({
        "train_loss":     train_loss,
        "train_accuracy": train_acc,
        "test_loss":      test_loss,
        "test_accuracy":  test_acc
    }, step=epoch)
    log_system_metrics_mlflow(step=epoch)
    mlflow.log_artifact(ckpt, artifact_path="checkpoints")

    if test_acc > best_test_acc:
        best_test_acc, best_ckpt = test_acc, ckpt

    print(f"→ Epoch {epoch}/{EPOCHS}  "
          f"Train loss={train_loss:.4f}, acc={train_acc:.4f} │ "
          f"Test loss={test_loss:.4f}, acc={test_acc:.4f}")

Epoch 1 Train: 100%|██████████| 1695/1695 [05:36<00:00,  5.04batch/s, loss=0.0363, acc=0.0158]
Epoch 1 Test : 100%|██████████| 173/173 [00:12<00:00, 14.19batch/s, loss=0.0241, acc=0.0351]


→ Epoch 1/1  Train loss=0.0363, acc=0.0158 │ Test loss=0.0241, acc=0.0351


In [20]:
mlflow.log_metric("best_test_accuracy", best_test_acc)

LOCAL_MODEL_DIR = "effb3_lora_local"
mlflow.pytorch.save_model(model, LOCAL_MODEL_DIR)
mlflow.log_artifacts(LOCAL_MODEL_DIR, artifact_path="effb3_lora_model")

mlflow.end_run()

2025/05/07 03:45:08 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/07 03:45:08 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


🏃 View run funny-colt-568 at: http://192.5.87.49:8000/#/experiments/3/runs/7e2c3350dc09484ca4221d28b614e35c
🧪 View experiment at: http://192.5.87.49:8000/#/experiments/3
