In [4]:
import os, glob, numpy as np
import subprocess
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pandas as pd
import mlflow
import mlflow.pytorch

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# MLFlow setup
mlflow.set_tracking_uri("http://192.5.86.161:8000/")
mlflow.set_experiment("PannsMLP")
try: 
    mlflow.end_run() # end pre-existing run, if there was one
except:
    pass
finally:
    mlflow.start_run(log_system_metrics=True) # Start MLFlow run
gpu_info = next(
    (subprocess.run(cmd, capture_output=True, text=True).stdout for cmd in ["nvidia-smi", "rocm-smi"] 
     if subprocess.run(f"command -v {cmd}", shell=True, capture_output=True).returncode == 0),
    "No GPU found."
)
mlflow.log_text(gpu_info, "gpu-info.txt")

from mlflow.system_metrics.system_metrics_logger import SystemMetricsLogger

logger = SystemMetricsLogger()
print("System metrics logger initialized:", logger._should_log())

2025/04/30 19:00:16 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/04/30 19:00:16 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2025/04/30 19:00:16 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Using device: cuda
üèÉ View run spiffy-sow-547 at: http://192.5.86.161:8000/#/experiments/1/runs/50f9a3318bf44705ae22e29e2143f9db
üß™ View experiment at: http://192.5.86.161:8000/#/experiments/1


ModuleNotFoundError: No module named 'mlflow.system_metrics.system_metrics_logger'

In [2]:
EMB_DIR = '/home/jovyan/Features/embeddings'

meta = pd.read_csv('/home/jovyan/Data/birdclef-2025/train.csv')
label2idx = {lab:i for i, lab in enumerate(sorted(meta['primary_label'].unique()))}
num_classes = len(label2idx)

# 1) Gather embedding files
all_paths  = sorted(glob.glob(os.path.join(EMB_DIR, '**', '*_emb.npz'), recursive=True))
all_labels = [int(np.load(p)['label']) for p in all_paths]

# 2) Split into train/test, try stratify then fallback
try:
    train_paths, test_paths, train_labels, test_labels = train_test_split(
        all_paths, all_labels,
        test_size=0.2,
        random_state=42
    )
except ValueError:
    print("Warning: stratify failed (too few samples in some classes), splitting without stratify.")
    train_paths, test_paths, train_labels, test_labels = train_test_split(
        all_paths, all_labels,
        test_size=0.2,
        shuffle=True,
        random_state=42
    )

# 3) Dataset definition
class EmbeddingDataset(Dataset):
    def __init__(self, paths):
        self.paths = paths
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        data = np.load(self.paths[idx])
        emb  = data['embedding'].astype(np.float32)
        lbl  = int(data['label'])
        return torch.from_numpy(emb), torch.tensor(lbl)

# 4) Instantiate & wrap in DataLoaders
train_ds = EmbeddingDataset(train_paths)
test_ds  = EmbeddingDataset(test_paths)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, num_workers=0, pin_memory=True)

print(f"Train samples: {len(train_ds)}, Test samples: {len(test_ds)}")

Train samples: 8988, Test samples: 2248


In [3]:
# Cell 3 ‚Äî Define MLP, criterion, optimizer & count params
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims, num_classes, dropout=0.5):
        super().__init__()
        layers = []
        dims = [input_dim] + hidden_dims
        for i in range(len(hidden_dims)):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout)
            ]
        layers.append(nn.Linear(dims[-1], num_classes))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)

sample_emb, sample_lbl = next(iter(train_loader))
input_dim   = sample_emb.shape[1]

model     = MLPClassifier(input_dim, [1024, 512], num_classes, dropout=0.5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params:,}")


Total trainable parameters: 2,728,654


In [4]:
# Log hyperparameters
mlflow.log_params({
    "input_dim": input_dim,
    "hidden_dims": [1024, 512],
    "dropout": 0.5,
    "batch_size": 32,
    "lr": 1e-3,
    "num_epochs": 20
})

In [5]:
# Training loop with MLFlow logging
num_epochs = 20
best_acc   = 0.0
checkpoint_path = 'best_panns_mlp_checkpoint.pt'

for epoch in range(1, num_epochs+1):
    model.train()
    train_loss = 0.0
    train_correct = 0
    total = 0
    for emb, lbl in tqdm(train_loader, desc=f"Epoch {epoch} ‚ñ∂ Train"):
        emb, lbl = emb.to(device), lbl.to(device)
        optimizer.zero_grad()
        logits = model(emb)
        loss   = criterion(logits, lbl)
        loss.backward()
        optimizer.step()

        train_loss    += loss.item() * emb.size(0)
        train_correct += (logits.argmax(dim=1) == lbl).sum().item()
        total         += emb.size(0)

    train_loss /= total
    train_acc   = train_correct / total

    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for emb, lbl in tqdm(test_loader, desc=f"Epoch {epoch} ‚úÖ Val"):
            emb, lbl = emb.to(device), lbl.to(device)
            logits = model(emb)
            loss   = criterion(logits, lbl)

            val_loss    += loss.item() * emb.size(0)
            val_correct += (logits.argmax(dim=1) == lbl).sum().item()
            val_total   += emb.size(0)

    val_loss /= val_total
    val_acc   = val_correct / val_total

    mlflow.log_metrics({
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "val_loss": val_loss,
        "val_accuracy": val_acc
    }, step=epoch)

    print(f"\nEpoch {epoch:02d} | Train: loss={train_loss:.4f}, acc={train_acc:.4f} | Val: loss={val_loss:.4f}, acc={val_acc:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({
            'epoch': epoch,
            'model_state_dict':       model.state_dict(),
            'optimizer_state_dict':   optimizer.state_dict(),
            'best_validation_acc':    best_acc,
            'input_dim':              input_dim,
            'hidden_dims':            [1024, 512],
            'num_classes':            num_classes
        }, checkpoint_path)
        print(f"‚úîÔ∏è  New best model saved (epoch {epoch}, val_acc={val_acc:.4f})")
        mlflow.pytorch.log_model(model, "best_panns_mlp")

mlflow.log_metric("best_val_accuracy", best_acc)
mlflow.end_run()

print(f"\nüéâ Best validation accuracy: {best_acc:.4f}")

Epoch 1 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 1 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]


Epoch 01 | Train: loss=4.5005, acc=0.0765 | Val: loss=4.0391, acc=0.1361
‚úîÔ∏è  New best model saved (epoch 1, val_acc=0.1361)




Epoch 2 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 2 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 02 | Train: loss=3.9142, acc=0.1500 | Val: loss=3.6218, acc=0.2069
‚úîÔ∏è  New best model saved (epoch 2, val_acc=0.2069)




Epoch 3 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 3 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 03 | Train: loss=3.5981, acc=0.1941 | Val: loss=3.3143, acc=0.2487
‚úîÔ∏è  New best model saved (epoch 3, val_acc=0.2487)




Epoch 4 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 4 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 04 | Train: loss=3.3664, acc=0.2293 | Val: loss=3.1672, acc=0.2727
‚úîÔ∏è  New best model saved (epoch 4, val_acc=0.2727)




Epoch 5 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 5 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 05 | Train: loss=3.1976, acc=0.2618 | Val: loss=3.0009, acc=0.3154
‚úîÔ∏è  New best model saved (epoch 5, val_acc=0.3154)




Epoch 6 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 6 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 06 | Train: loss=3.0741, acc=0.2787 | Val: loss=2.8812, acc=0.3461
‚úîÔ∏è  New best model saved (epoch 6, val_acc=0.3461)




Epoch 7 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 7 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]


Epoch 07 | Train: loss=2.9361, acc=0.3020 | Val: loss=2.8243, acc=0.3381


Epoch 8 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 8 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 08 | Train: loss=2.8365, acc=0.3192 | Val: loss=2.7599, acc=0.3701
‚úîÔ∏è  New best model saved (epoch 8, val_acc=0.3701)




Epoch 9 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 9 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 09 | Train: loss=2.7315, acc=0.3451 | Val: loss=2.6081, acc=0.4057
‚úîÔ∏è  New best model saved (epoch 9, val_acc=0.4057)




Epoch 10 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 10 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 10 | Train: loss=2.6546, acc=0.3593 | Val: loss=2.5869, acc=0.4101
‚úîÔ∏è  New best model saved (epoch 10, val_acc=0.4101)




Epoch 11 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 11 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 11 | Train: loss=2.5888, acc=0.3695 | Val: loss=2.5355, acc=0.4186
‚úîÔ∏è  New best model saved (epoch 11, val_acc=0.4186)




Epoch 12 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 12 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 12 | Train: loss=2.5320, acc=0.3814 | Val: loss=2.4915, acc=0.4288
‚úîÔ∏è  New best model saved (epoch 12, val_acc=0.4288)




Epoch 13 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 13 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 13 | Train: loss=2.4548, acc=0.3910 | Val: loss=2.4596, acc=0.4417
‚úîÔ∏è  New best model saved (epoch 13, val_acc=0.4417)




Epoch 14 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 14 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 14 | Train: loss=2.3895, acc=0.4085 | Val: loss=2.3842, acc=0.4506
‚úîÔ∏è  New best model saved (epoch 14, val_acc=0.4506)




Epoch 15 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 15 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 15 | Train: loss=2.3308, acc=0.4192 | Val: loss=2.3465, acc=0.4649
‚úîÔ∏è  New best model saved (epoch 15, val_acc=0.4649)




Epoch 16 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 16 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 16 | Train: loss=2.2853, acc=0.4365 | Val: loss=2.3332, acc=0.4684
‚úîÔ∏è  New best model saved (epoch 16, val_acc=0.4684)




Epoch 17 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 17 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 17 | Train: loss=2.2358, acc=0.4405 | Val: loss=2.3063, acc=0.4715
‚úîÔ∏è  New best model saved (epoch 17, val_acc=0.4715)




Epoch 18 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 18 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]


Epoch 18 | Train: loss=2.1960, acc=0.4489 | Val: loss=2.2727, acc=0.4689


Epoch 19 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 19 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]




Epoch 19 | Train: loss=2.1525, acc=0.4603 | Val: loss=2.2657, acc=0.4795
‚úîÔ∏è  New best model saved (epoch 19, val_acc=0.4795)




Epoch 20 ‚ñ∂ Train:   0%|          | 0/281 [00:00<?, ?it/s]

Epoch 20 ‚úÖ Val:   0%|          | 0/71 [00:00<?, ?it/s]

2025/04/30 18:52:44 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/04/30 18:52:44 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!



Epoch 20 | Train: loss=2.1322, acc=0.4557 | Val: loss=2.2550, acc=0.4795
üèÉ View run masked-toad-913 at: http://192.5.86.161:8000/#/experiments/1/runs/3a16f8b54e0c4b58b5eca4db2e13708f
üß™ View experiment at: http://192.5.86.161:8000/#/experiments/1

üéâ Best validation accuracy: 0.4795


In [32]:
# Cell 5 ‚Äî (Optional) Load best checkpoint for inference or continued training
ckpt = torch.load('best_panns_mlp_checkpoint.pt', map_location=device)
model.load_state_dict(ckpt['model_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
print(f"Loaded checkpoint from epoch {ckpt['epoch']} with val_acc={ckpt['best_validation_acc']:.4f}")


Loaded checkpoint from epoch 13 with val_acc=0.5565


  ckpt = torch.load('best_panns_mlp_checkpoint.pt', map_location=device)
