In [1]:
#CELL 1
import os
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import timm
from random import sample as rand_sample
from sklearn.metrics import accuracy_score
from peft import get_peft_model, LoraConfig
import onnx
import onnxruntime as ort

In [2]:
#CELL 2
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
FEATURE_BASE  = "/mnt/BirdCLEF/features_sampled"
TEST_MANIFEST = os.path.join(FEATURE_BASE, "manifest_test.csv")
TAXONOMY_CSV  = "/mnt/BirdCLEF/taxonomy.csv"
TRAIN_META    = "/mnt/BirdCLEF/train.csv"
DROPOUT       = 0.3

CKPT_EMB    = "/mnt/BirdCLEF/Models/best_emb_mlp.pt"
CKPT_RES    = "/mnt/BirdCLEF/Models/best_resnet50.pt"
CKPT_EFF    = "/mnt/BirdCLEF/Models/best_effb3_lora.pt"
CKPT_RAW    = "/mnt/BirdCLEF/Models/best_rawcnn.pt"
CKPT_META   = "/mnt/BirdCLEF/Models/best_meta_mlp.pt"

HIDDEN_DIMS    = [1024, 512]
THRESHOLD   = 0.5


In [3]:
#CELL 3
tax = pd.read_csv(TAXONOMY_CSV)
CLASSES = sorted(tax["primary_label"].astype(str).tolist())
NUM_CLASSES = len(CLASSES)

In [4]:
#CELL 4
class MetaMLP(nn.Module):
    def __init__(self, in_dim, hidden_dims, dropout):
        super().__init__()
        layers, dims = [], [in_dim]+hidden_dims
        for i in range(len(hidden_dims)):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.BatchNorm1d(dims[i+1]),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
        layers.append(nn.Linear(dims[-1], NUM_CLASSES))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
    

In [5]:
#CELL 5
class EmbeddingClassifier(nn.Module):
    def __init__(self, emb_dim, num_cls):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 2048), nn.BatchNorm1d(2048), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(2048, 1024),    nn.BatchNorm1d(1024), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(1024, 512),     nn.BatchNorm1d(512),  nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(512, num_cls)
        )
    def forward(self, x): return self.net(x)

def get_resnet50_multilabel(num_classes):
    m = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', pretrained=False)
    m.conv1 = nn.Conv2d(1, m.conv1.out_channels,
                        kernel_size=m.conv1.kernel_size,
                        stride=m.conv1.stride,
                        padding=m.conv1.padding,
                        bias=False)
    m.fc    = nn.Linear(m.fc.in_features, num_classes)
    return m

TARGET_MODULES  = ["conv_pw","conv_dw","conv_pwl","conv_head"]
MODULES_TO_SAVE = ["classifier"]
def build_efficientnetb3_lora(num_classes):
    base = timm.create_model("efficientnet_b3", pretrained=True)
    # patch forward
    orig_fwd = base.forward
    def forward_patch(*args, input_ids=None, **kwargs):
        x = input_ids if input_ids is not None else args[0]
        return orig_fwd(x)
    base.forward = forward_patch
    # adapt stem & head
    stem = base.conv_stem
    base.conv_stem = nn.Conv2d(1, stem.out_channels,
                               kernel_size=stem.kernel_size,
                               stride=stem.stride,
                               padding=stem.padding,
                               bias=False)
    base.classifier = nn.Linear(base.classifier.in_features, num_classes)
    # LoRA
    lora_cfg = LoraConfig(
        r=12, lora_alpha=24,
        target_modules=TARGET_MODULES,
        lora_dropout=0.1, bias="none",
        modules_to_save=MODULES_TO_SAVE,
        task_type="FEATURE_EXTRACTION",
        inference_mode=False
    )
    return get_peft_model(base, lora_cfg)

class RawAudioCNN(nn.Module):
    def __init__(self, num_cls):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16,  kernel_size=15, stride=4, padding=7)
        self.bn1   = nn.BatchNorm1d(16)
        self.pool  = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(16,32,  kernel_size=15, stride=2, padding=7)
        self.bn2   = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32,64,  kernel_size=15, stride=2, padding=7)
        self.bn3   = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64,128, kernel_size=15, stride=2, padding=7)
        self.bn4   = nn.BatchNorm1d(128)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc          = nn.Linear(128, num_cls)
    def forward(self, x):
        x = x.unsqueeze(1)  # [B,T]→[B,1,T]
        x = F.relu(self.bn1(self.conv1(x))); x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.global_pool(x).squeeze(-1)
        return self.fc(x)


In [17]:
# === Step 1: Get emb_dim ===
class EmbeddingDatasetForDim:
    def __init__(self, manifest, base, key="embedding"):
        df = pd.read_csv(manifest)
        df["emb_path"] = df["emb_path"].astype(str).apply(
            lambda p: os.path.join(base, "embeddings", p.lstrip(os.sep))
        )
        first_sample_path = df.iloc[0].emb_path
        arr = np.load(first_sample_path)[key]  # shape: (n_windows, emb_dim)
        self.emb_dim = arr.shape[1]

_emb_ds = EmbeddingDatasetForDim(TEST_MANIFEST, FEATURE_BASE)
emb_dim = _emb_ds.emb_dim

# === Step 2: Instantiate all models ===
emb_model  = EmbeddingClassifier(emb_dim=emb_dim, num_cls=NUM_CLASSES).to(DEVICE)
res_model  = get_resnet50_multilabel(NUM_CLASSES).to(DEVICE)
eff_model  = build_efficientnetb3_lora(NUM_CLASSES).to(DEVICE)
raw_model  = RawAudioCNN(NUM_CLASSES).to(DEVICE)
meta_model = MetaMLP(NUM_CLASSES * 4, HIDDEN_DIMS, DROPOUT).to(DEVICE)

# === Step 3: Load weights & freeze base models ===
for model, ckpt in [
    (emb_model, CKPT_EMB),
    (res_model, CKPT_RES),
    (eff_model, CKPT_EFF),
    (raw_model, CKPT_RAW)
]:
    model.load_state_dict(torch.load(ckpt, map_location="cpu"))
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

# === Step 4: Load meta model weights (not frozen) ===
meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE))
meta_model.eval()

# === Step 5: Compile all models (PyTorch 2.x) ===
emb_model  = torch.compile(emb_model)
res_model  = torch.compile(res_model)
eff_model  = torch.compile(eff_model)
raw_model  = torch.compile(raw_model)
meta_model = torch.compile(meta_model)


Using cache found in /home/jovyan/.cache/torch/hub/pytorch_vision_v0.14.0
  model.load_state_dict(torch.load(ckpt, map_location="cpu"))
  meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE))


In [18]:
#Cell 7
def preprocess_sample(sample):
    
    emb_path = os.path.join(FEATURE_BASE, "embeddings", sample.emb_path.lstrip(os.sep))
    emb_arr = np.load(emb_path)["embedding"].mean(axis=0).astype(np.float32)
    emb = torch.from_numpy(emb_arr).unsqueeze(0).to(DEVICE)

    ma_path = os.path.join(FEATURE_BASE, "mel_aug", sample.mel_aug_path.lstrip(os.sep))
    ma_arr = np.load(ma_path)["mel"].astype(np.float32)
    ma = torch.from_numpy(ma_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    m_path = os.path.join(FEATURE_BASE, "mel", sample.mel_path.lstrip(os.sep))
    m_arr = np.load(m_path)["mel"].astype(np.float32)
    m = torch.from_numpy(m_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    wav_path = os.path.join(FEATURE_BASE, "denoised", sample.audio_path.lstrip(os.sep))
    wav, sr = torchaudio.load(wav_path)
    wav = wav.float().squeeze(0)
    T = sr * 10
    if wav.size(0) < T:
        wav = F.pad(wav, (0, T - wav.size(0)))
    else:
        wav = wav[:T]
    wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
    wav = wav.unsqueeze(0).to(DEVICE)

    return emb, ma, m, wav


In [19]:
# === Step 0: Load sample for inference ===
test_manifest = pd.read_csv(TEST_MANIFEST)
sample = test_manifest.iloc[10]  # Or any other row for testing

emb, ma, m, wav = preprocess_sample(sample)


In [20]:
#CELL 9
with torch.no_grad():
    p1 = torch.sigmoid(emb_model(emb))     # [1,NUM_CLASSES]
    p2 = torch.sigmoid(res_model(ma))      # [1,NUM_CLASSES]
    p3 = torch.sigmoid(eff_model(m))       # [1,NUM_CLASSES]
    p4 = torch.sigmoid(raw_model(wav))     # [1,NUM_CLASSES]

    feat   = torch.cat([p1,p2,p3,p4], dim=1)
    logits = meta_model(feat)
    probs  = torch.sigmoid(logits)[0].cpu().numpy()
    

In [21]:
#CELL 10
def get_model_size(path):
    size_mb = os.path.getsize(path) / (1024 ** 2)
    return f"{size_mb:.2f} MB"

print("\nModel Sizes on Disk:")
print(f"  Embedding MLP:    {get_model_size(CKPT_EMB)}")
print(f"  ResNet50:         {get_model_size(CKPT_RES)}")
print(f"  EfficientNetB3:   {get_model_size(CKPT_EFF)}")
print(f"  RawAudioCNN:      {get_model_size(CKPT_RAW)}")
print(f"  Meta MLP:         {get_model_size(CKPT_META)}")



Model Sizes on Disk:
  Embedding MLP:    26.48 MB
  ResNet50:         91.57 MB
  EfficientNetB3:   63.92 MB
  RawAudioCNN:      0.73 MB
  Meta MLP:         5.66 MB


In [22]:
#CELL 11
ml_preds = [(CLASSES[i], float(probs[i]))
            for i in range(NUM_CLASSES) if probs[i] >= THRESHOLD]

print(f"\nMulti-label Predictions (Threshold ≥ {THRESHOLD}):")
if ml_preds:
    for label, score in ml_preds:
        print(f"{label:<15}|Confidence: {score:.3f}")
else:
    print(" No predictions met the threshold.")



Multi-label Predictions (Threshold ≥ 0.5):
126247         |Confidence: 0.635


In [23]:
#CELL 12
primary_idx   = int(probs.argmax())
primary_label = CLASSES[primary_idx]
primary_score = float(probs[primary_idx])

print(f"\nPrimary‑label (top‑1) prediction:")
print(f"  → {primary_label}: {primary_score:.3f}")



Primary‑label (top‑1) prediction:
  → 126247: 0.635


In [24]:
#Cell 13
all_preds = []
all_labels = []

for _, sample in test_manifest.iterrows():
    
    emb, ma, m, wav = preprocess_sample(sample)

    # Inference
    with torch.no_grad():
        p1 = torch.sigmoid(emb_model(emb))
        p2 = torch.sigmoid(res_model(ma))
        p3 = torch.sigmoid(eff_model(m))
        p4 = torch.sigmoid(raw_model(wav))

        feat = torch.cat([p1, p2, p3, p4], dim=1)
        logits = meta_model(feat)
        probs = torch.sigmoid(logits)[0].cpu().numpy()

    # Thresholded prediction
    pred_labels = (probs >= THRESHOLD).astype(int)

    # Ground truth (one-hot)
    gt = np.zeros(NUM_CLASSES, dtype=int)
    labels = sample.primary_label if isinstance(sample.primary_label, list) else [sample.primary_label]
    for lbl in labels:
        if lbl in CLASSES:
            gt[CLASSES.index(lbl)] = 1

    all_preds.append(pred_labels)
    all_labels.append(gt)

# Compute accuracy
y_pred = np.stack(all_preds)
y_true = np.stack(all_labels)
acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {acc:.4f}")


Accuracy: 0.5961


In [25]:
#Cell 14
N_TRIALS = 100
sample_indices = rand_sample(range(len(test_manifest)), N_TRIALS)
latencies = []

with torch.no_grad():
    for idx in sample_indices:
        sample = test_manifest.iloc[idx]
        emb, ma, m, wav = preprocess_sample(sample)

        start = time.time()

        p1 = torch.sigmoid(emb_model(emb))
        p2 = torch.sigmoid(res_model(ma))
        p3 = torch.sigmoid(eff_model(m))
        p4 = torch.sigmoid(raw_model(wav))

        feat = torch.cat([p1, p2, p3, p4], dim=1)
        logits = meta_model(feat)
        _ = torch.sigmoid(logits)[0].cpu().numpy()

        end = time.time()
        latencies.append(end - start)

latencies = np.array(latencies)

print(f"\n=== Inference Latency Stats over {N_TRIALS} Random Samples ===")
print(f"Median Latency:      {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"95th Percentile:     {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"99th Percentile:     {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Average Latency:     {np.mean(latencies) * 1000:.2f} ms")
print(f"Throughput:          {N_TRIALS / np.sum(latencies):.2f} FPS")



=== Inference Latency Stats over 100 Random Samples ===
Median Latency:      20.91 ms
95th Percentile:     26.80 ms
99th Percentile:     28.82 ms
Average Latency:     22.20 ms
Throughput:          45.05 FPS


In [40]:

import time

num_batches = 50
batch_size = 256

# Cap total trials to available samples
num_trials = min(num_batches * batch_size, len(test_manifest))
sample_indices = rand_sample(range(len(test_manifest)), num_trials)

# Group into batches
batches = [sample_indices[i:i + batch_size] for i in range(0, len(sample_indices), batch_size)]

# Warm-up batch
warmup_batch = batches[0]
embs, mas, ms, wavs = [], [], [], []
for idx in warmup_batch:
    sample = test_manifest.iloc[idx]
    emb, ma, m, wav = preprocess_sample(sample)
    embs.append(emb)
    mas.append(ma)
    ms.append(m)
    wavs.append(wav)

with torch.no_grad():
    _ = meta_model(torch.cat([
        torch.sigmoid(emb_model(torch.cat(embs))),
        torch.sigmoid(res_model(torch.cat(mas))),
        torch.sigmoid(eff_model(torch.cat(ms))),
        torch.sigmoid(raw_model(torch.cat(wavs)))
    ], dim=1))

# Timed batches
batch_times = []
for batch in batches[1:]:
    embs, mas, ms, wavs = [], [], [], []
    for idx in batch:
        sample = test_manifest.iloc[idx]
        emb, ma, m, wav = preprocess_sample(sample)
        embs.append(emb)
        mas.append(ma)
        ms.append(m)
        wavs.append(wav)

    with torch.no_grad():
        start_time = time.time()

        p1 = torch.sigmoid(emb_model(torch.cat(embs)))
        p2 = torch.sigmoid(res_model(torch.cat(mas)))
        p3 = torch.sigmoid(eff_model(torch.cat(ms)))
        p4 = torch.sigmoid(raw_model(torch.cat(wavs)))

        feat = torch.cat([p1, p2, p3, p4], dim=1)
        _ = torch.sigmoid(meta_model(feat))

        batch_times.append(time.time() - start_time)

# Report
batch_times = np.array(batch_times)
print(f"\n=== Batch Inference Timing ({batch_size} samples × {len(batch_times)} batches) ===")
print(f"Median Latency:      {np.percentile(batch_times, 50) * 1000:.2f} ms")
print(f"95th Percentile:     {np.percentile(batch_times, 95) * 1000:.2f} ms")
print(f"99th Percentile:     {np.percentile(batch_times, 99) * 1000:.2f} ms")
print(f"Average Latency:     {np.mean(batch_times) * 1000:.2f} ms")
print(f"Throughput:          {(len(batch_times) * batch_size) / np.sum(batch_times):.2f} FPS")



=== Batch Inference Timing (256 samples × 43 batches) ===
Median Latency:      19.35 ms
95th Percentile:     21.26 ms
99th Percentile:     63.34 ms
Average Latency:     21.27 ms
Throughput:          12036.27 FPS


In [16]:
#CELL 16.a
print("\n=== Final Performance Summary: BirdCLEF Fusion Model (EAGER) ===")

# Accuracy summary
total = len(test_manifest)
correct = int((y_pred == y_true).all(axis=1).sum())  # Subset accuracy match
accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

# Model size summary
model_ckpts = {
    "Embedding MLP": CKPT_EMB,
    "ResNet50": CKPT_RES,
    "EffNetB3 + LoRA": CKPT_EFF,
    "RawAudioCNN": CKPT_RAW,
    "Meta MLP": CKPT_META
}
total_model_size = sum(os.path.getsize(p) for p in model_ckpts.values())
print(f"Model Size on Disk (total): {total_model_size / 1e6:.2f} MB")

# Latency summary (single sample)
num_trials = len(latencies)
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials / np.sum(latencies):.2f} FPS")

# Batch throughput (from Cell 15)
batch_fps = (len(batch_times) * batch_size) / np.sum(batch_times)
print(f"Batch Throughput: {batch_fps:.2f} FPS")



=== Final Performance Summary: BirdCLEF Fusion Model (EAGER) ===
Accuracy: 59.61% (6570/11022 correct)
Model Size on Disk (total): 197.51 MB
Inference Latency (single sample, median): 26.70 ms
Inference Latency (single sample, 95th percentile): 29.41 ms
Inference Latency (single sample, 99th percentile): 38.15 ms
Inference Throughput (single sample): 36.71 FPS
Batch Throughput: 1184.37 FPS


In [41]:
#CELL 16.b
print("\n=== Final Performance Summary: BirdCLEF Fusion Model (Compile) ===")

# Accuracy summary
total = len(test_manifest)
correct = int((y_pred == y_true).all(axis=1).sum())  # Subset accuracy match
accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

# Model size summary
model_ckpts = {
    "Embedding MLP": CKPT_EMB,
    "ResNet50": CKPT_RES,
    "EffNetB3 + LoRA": CKPT_EFF,
    "RawAudioCNN": CKPT_RAW,
    "Meta MLP": CKPT_META
}
total_model_size = sum(os.path.getsize(p) for p in model_ckpts.values())
print(f"Model Size on Disk (total): {total_model_size / 1e6:.2f} MB")

# Latency summary (single sample)
num_trials = len(latencies)
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials / np.sum(latencies):.2f} FPS")

# Batch throughput (from Cell 15)
batch_fps = (len(batch_times) * batch_size) / np.sum(batch_times)
print(f"Batch Throughput: {batch_fps:.2f} FPS")



=== Final Performance Summary: BirdCLEF Fusion Model (Compile) ===
Accuracy: 59.61% (6570/11022 correct)
Model Size on Disk (total): 197.51 MB
Inference Latency (single sample, median): 20.91 ms
Inference Latency (single sample, 95th percentile): 26.80 ms
Inference Latency (single sample, 99th percentile): 28.82 ms
Inference Throughput (single sample): 45.05 FPS
Batch Throughput: 12036.27 FPS
