In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
from peft import get_peft_model, LoraConfig
import onnx

# ====== Constants ======
NUM_CLASSES = 206
EMB_DIM = 2048
MEL_SHAPE = (1, 64, 313)
WAV_LEN = 320000
FUSION_DIM = NUM_CLASSES * 4
DROPOUT = 0.3
EXPORT_OPSET = 20
onnx_dir = "onnx_exports"
os.makedirs(onnx_dir, exist_ok=True)

# ====== Model Definitions ======

class EmbeddingClassifier(nn.Module):
    def __init__(self, emb_dim, num_cls):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 2048), nn.BatchNorm1d(2048), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(2048, 1024),    nn.BatchNorm1d(1024), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(1024, 512),     nn.BatchNorm1d(512),  nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(512, num_cls)
        )
    def forward(self, x): return self.net(x)

def get_resnet50_multilabel(num_classes):
    m = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', pretrained=False)
    m.conv1 = nn.Conv2d(1, m.conv1.out_channels,
                        kernel_size=m.conv1.kernel_size,
                        stride=m.conv1.stride,
                        padding=m.conv1.padding,
                        bias=False)
    m.fc = nn.Linear(m.fc.in_features, num_classes)
    return m

TARGET_MODULES  = ["conv_pw", "conv_dw", "conv_pwl", "conv_head"]
MODULES_TO_SAVE = ["classifier"]

def build_efficientnetb3_lora(num_classes):
    base = timm.create_model("efficientnet_b3", pretrained=False)
    orig_fwd = base.forward
    def forward_patch(*args, input_ids=None, **kwargs):
        x = input_ids if input_ids is not None else args[0]
        return orig_fwd(x)
    base.forward = forward_patch
    base.conv_stem = nn.Conv2d(1, base.conv_stem.out_channels,
                               kernel_size=base.conv_stem.kernel_size,
                               stride=base.conv_stem.stride,
                               padding=base.conv_stem.padding,
                               bias=False)
    base.classifier = nn.Linear(base.classifier.in_features, num_classes)
    lora_cfg = LoraConfig(
        r=12, lora_alpha=24,
        target_modules=TARGET_MODULES,
        lora_dropout=0.1, bias="none",
        modules_to_save=MODULES_TO_SAVE,
        task_type="FEATURE_EXTRACTION",
        inference_mode=False
    )
    return get_peft_model(base, lora_cfg)

class RawAudioCNN(nn.Module):
    def __init__(self, num_cls):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=15, stride=4, padding=7)
        self.bn1   = nn.BatchNorm1d(16)
        self.pool  = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=15, stride=2, padding=7)
        self.bn2   = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=15, stride=2, padding=7)
        self.bn3   = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64, 128, kernel_size=15, stride=2, padding=7)
        self.bn4   = nn.BatchNorm1d(128)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(128, num_cls)
    def forward(self, x):
        x = x.unsqueeze(1)  # [B, T] -> [B, 1, T]
        x = F.relu(self.bn1(self.conv1(x))); x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.global_pool(x).squeeze(-1)
        return self.fc(x)

class MetaMLP(nn.Module):
    def __init__(self, in_dim, hidden_dims, dropout):
        super().__init__()
        layers, dims = [], [in_dim] + hidden_dims
        for i in range(len(hidden_dims)):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.BatchNorm1d(dims[i+1]),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
        layers.append(nn.Linear(dims[-1], NUM_CLASSES))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)

# ====== Instantiate Models ======
emb_model = EmbeddingClassifier(EMB_DIM, NUM_CLASSES)
res_model = get_resnet50_multilabel(NUM_CLASSES)
eff_model = build_efficientnetb3_lora(NUM_CLASSES)
raw_model = RawAudioCNN(NUM_CLASSES)
meta_model = MetaMLP(NUM_CLASSES * 4, [1024, 512], DROPOUT)

# Ensure eval mode
for m in [emb_model, res_model, eff_model, raw_model, meta_model]:
    m.eval()

# ====== Export to ONNX ======

def export_model(model, dummy_input, filename, input_name, output_name):
    torch.onnx.export(
        model, dummy_input, os.path.join(onnx_dir, filename),
        input_names=[input_name], output_names=[output_name],
        dynamic_axes={input_name: {0: "batch_size"}, output_name: {0: "batch_size"}},
        opset_version=EXPORT_OPSET, export_params=True, do_constant_folding=True
    )
    print(f"Exported: {filename}")

# Embedding MLP
export_model(emb_model, torch.randn(1, EMB_DIM), "embedding_classifier.onnx", "embedding_input", "embedding_output")

# ResNet50 (mel_aug)
export_model(res_model, torch.randn(1, *MEL_SHAPE), "resnet50_multilabel.onnx", "mel_aug_input", "resnet_output")

# EfficientNetB3 + LoRA (mel)
export_model(eff_model, torch.randn(1, *MEL_SHAPE), "efficientnet_b3_lora.onnx", "mel_input", "effnet_output")

# RawAudioCNN (wav)
export_model(raw_model, torch.randn(1, WAV_LEN), "raw_audio_cnn.onnx", "wav_input", "raw_output")

# Meta MLP
export_model(meta_model, torch.randn(1, FUSION_DIM), "meta_mlp.onnx", "fusion_input", "meta_output")


Using cache found in /home/jovyan/.cache/torch/hub/pytorch_vision_v0.14.0


Exported: embedding_classifier.onnx
Exported: resnet50_multilabel.onnx
Exported: efficientnet_b3_lora.onnx
Exported: raw_audio_cnn.onnx
Exported: meta_mlp.onnx


In [2]:
import os
import time
import torch
import torchaudio
import numpy as np
import pandas as pd
import onnxruntime as ort
from scipy.special import expit as sigmoid

# File paths
FEATURE_BASE = "/mnt/BirdCLEF/birdclef_dataset/features_sampled"
TEST_MANIFEST = os.path.join(FEATURE_BASE, "manifest_test.csv")
TAXONOMY_CSV = os.path.join(FEATURE_BASE, "taxonomy.csv")

# Constants
THRESHOLD = 0.5
DEVICE = torch.device("cpu")

# Load taxonomy and test manifest
tax = pd.read_csv(TAXONOMY_CSV)
CLASSES = sorted(tax["primary_label"].astype(str).tolist())
NUM_CLASSES = len(CLASSES)
test_manifest = pd.read_csv(TEST_MANIFEST)


In [3]:
def preprocess_sample(sample):
    emb_path = os.path.join(FEATURE_BASE, "embeddings", sample.emb_path.lstrip(os.sep))
    emb_arr = np.load(emb_path)["embedding"].mean(axis=0).astype(np.float32)
    emb = torch.from_numpy(emb_arr).unsqueeze(0).to(DEVICE)

    ma_path = os.path.join(FEATURE_BASE, "mel_aug", sample.mel_aug_path.lstrip(os.sep))
    ma_arr = np.load(ma_path)["mel"].astype(np.float32)
    ma = torch.from_numpy(ma_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    m_path = os.path.join(FEATURE_BASE, "mel", sample.mel_path.lstrip(os.sep))
    m_arr = np.load(m_path)["mel"].astype(np.float32)
    m = torch.from_numpy(m_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    wav_path = os.path.join(FEATURE_BASE, "denoised", sample.audio_path.lstrip(os.sep))
    wav, sr = torchaudio.load(wav_path)
    wav = wav.float().squeeze(0)
    T = sr * 10  # 20 seconds
    if wav.size(0) < T:
        wav = F.pad(wav, (0, T - wav.size(0)))
    else:
        wav = wav[:T]
    wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
    wav = wav.unsqueeze(0).to(DEVICE)

    return emb, ma, m, wav


In [4]:
onnx_dir = "onnx_exports"
model_paths = {
    "embedding": os.path.join(onnx_dir, "embedding_classifier.onnx"),
    "resnet":    os.path.join(onnx_dir, "resnet50_multilabel.onnx"),
    "effnet":    os.path.join(onnx_dir, "efficientnet_b3_lora.onnx"),
    "rawaudio":  os.path.join(onnx_dir, "raw_audio_cnn.onnx"),
    "meta":      os.path.join(onnx_dir, "meta_mlp.onnx")
}

sessions = {k: ort.InferenceSession(v, providers=["CPUExecutionProvider"]) for k, v in model_paths.items()}


In [7]:
# === Model Size on Disk ===
print("\nModel Sizes on Disk:")
total_size = 0
for name, path in model_paths.items():
    size = os.path.getsize(path)
    total_size += size
    print(f"{name:<12}: {size / 1e6:.2f} MB")
print(f"Total ONNX Model Size: {total_size / 1e6:.2f} MB")



Model Sizes on Disk:
embedding   : 27.73 MB
resnet      : 95.52 MB
effnet      : 63.34 MB
rawaudio    : 0.76 MB
meta        : 5.92 MB
Total ONNX Model Size: 193.27 MB


In [5]:
sample = test_manifest.iloc[0]
emb, ma, m, wav = preprocess_sample(sample)

# Convert to NumPy for ONNX
emb_np = emb.cpu().numpy()
ma_np = ma.cpu().numpy()
m_np = m.cpu().numpy()
wav_np = wav.cpu().numpy()

# Run through all submodels
p1 = sessions["embedding"].run(None, {"embedding_input": emb_np})[0]
p2 = sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0]
p3 = sessions["effnet"].run(None, {"mel_input": m_np})[0]
p4 = sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]

fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
meta_out = sessions["meta"].run(None, {"fusion_input": fusion_input})[0]
probs = sigmoid(meta_out[0])


In [6]:
top_idx = int(np.argmax(probs))
print(f"Top-1 Prediction: {CLASSES[top_idx]} ({probs[top_idx]:.3f})")

print("\nMulti-label Predictions (≥ 0.5):")
for i, score in enumerate(probs):
    if score >= THRESHOLD:
        print(f"{CLASSES[i]:<15} | Confidence: {score:.3f}")


Top-1 Prediction: 1139490 (1.000)

Multi-label Predictions (≥ 0.5):
1139490         | Confidence: 1.000
1192948         | Confidence: 1.000
126247          | Confidence: 0.972
134933          | Confidence: 1.000
1462711         | Confidence: 0.999
1462737         | Confidence: 1.000
21038           | Confidence: 0.999
21116           | Confidence: 1.000
22333           | Confidence: 0.919
22973           | Confidence: 1.000
22976           | Confidence: 1.000
24272           | Confidence: 1.000
24322           | Confidence: 0.899
41663           | Confidence: 1.000
41970           | Confidence: 1.000
42087           | Confidence: 1.000
42113           | Confidence: 1.000
46010           | Confidence: 0.997
47067           | Confidence: 1.000
476538          | Confidence: 1.000
50186           | Confidence: 1.000
517119          | Confidence: 1.000
523060          | Confidence: 1.000
528041          | Confidence: 0.969
52884           | Confidence: 1.000
548639          | Confidence: 0.

In [12]:
from sklearn.metrics import accuracy_score

correct = 0
total = 0
all_preds = []
all_labels = []

sample_indices = np.random.choice(len(test_manifest), size=100, replace=False)

for idx in sample_indices:
    sample = test_manifest.iloc[idx]
    emb, ma, m, wav = preprocess_sample(sample)

    emb_np = emb.cpu().numpy()
    ma_np  = ma.cpu().numpy()
    m_np   = m.cpu().numpy()
    wav_np = wav.cpu().numpy()

    # Run inference
    p1 = sessions["embedding"].run(None, {"embedding_input": emb_np})[0]
    p2 = sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0]
    p3 = sessions["effnet"].run(None, {"mel_input": m_np})[0]
    p4 = sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]
    fused = np.concatenate([p1, p2, p3, p4], axis=1)
    meta_out = sessions["meta"].run(None, {"fusion_input": fused})[0]

    probs = sigmoid(meta_out[0])
    pred = (probs >= THRESHOLD).astype(int)

    gt = np.zeros(NUM_CLASSES, dtype=int)
    labels = sample.primary_label if isinstance(sample.primary_label, list) else [sample.primary_label]
    for lbl in labels:
        if lbl in CLASSES:
            gt[CLASSES.index(lbl)] = 1

    all_preds.append(pred)
    all_labels.append(gt)

# Subset accuracy
y_pred = np.stack(all_preds)
y_true = np.stack(all_labels)
correct = (y_pred == y_true).all(axis=1).sum()
total = len(y_pred)
accuracy = (correct / total) * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")


Accuracy: 0.00% (0/100 correct)


In [13]:
total_size = 0
for name, path in model_paths.items():
    size = os.path.getsize(path)
    total_size += size
    print(f"{name:<12}: {size / 1e6:.2f} MB")

print(f"Total ONNX Model Size: {total_size / 1e6:.2f} MB")


embedding   : 27.73 MB
resnet      : 95.52 MB
effnet      : 63.34 MB
rawaudio    : 0.76 MB
meta        : 5.92 MB
Total ONNX Model Size: 193.27 MB


In [30]:
# Use one sample for latency test
sample = test_manifest.iloc[0]
emb, ma, m, wav = preprocess_sample(sample)
emb_np = emb.cpu().numpy()
ma_np  = ma.cpu().numpy()
m_np   = m.cpu().numpy()
wav_np = wav.cpu().numpy()

# Warm-up
p1 = sessions["embedding"].run(None, {"embedding_input": emb_np})[0]
p2 = sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0]
p3 = sessions["effnet"].run(None, {"mel_input": m_np})[0]
p4 = sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]
fused = np.concatenate([p1, p2, p3, p4], axis=1)
_ = sessions["meta"].run(None, {"fusion_input": fused})[0]

# Time it
num_trials = 100
latencies = []
for _ in range(num_trials):
    start_time = time.time()

    p1 = sessions["embedding"].run(None, {"embedding_input": emb_np})[0]
    p2 = sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0]
    p3 = sessions["effnet"].run(None, {"mel_input": m_np})[0]
    p4 = sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]
    fused = np.concatenate([p1, p2, p3, p4], axis=1)
    _ = sessions["meta"].run(None, {"fusion_input": fused})[0]

    latencies.append(time.time() - start_time)

latencies = np.array(latencies)
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")


Inference Latency (single sample, median): 115.08 ms
Inference Latency (single sample, 95th percentile): 143.71 ms
Inference Latency (single sample, 99th percentile): 155.30 ms
Inference Throughput (single sample): 8.62 FPS


In [31]:
print("\n=== Final BirdCLEF Inference Summary ===")
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")
print(f"Total ONNX Model Size: {total_size / 1e6:.2f} MB")
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials/np.sum(latencies):.2f} FPS")



=== Final BirdCLEF Inference Summary ===
Accuracy: 0.00% (0/100 correct)
Total ONNX Model Size: 193.27 MB
Inference Latency (single sample, median): 115.08 ms
Inference Latency (single sample, 95th percentile): 143.71 ms
Inference Latency (single sample, 99th percentile): 155.30 ms
Inference Throughput (single sample): 8.62 FPS


In [32]:
from tqdm import tqdm

batch_size = 20
num_batches = 25 # Total: 8 × 25 = 200 samples
sample_indices = np.random.choice(len(test_manifest), size=batch_size * num_batches, replace=True)
batches = [sample_indices[i:i + batch_size] for i in range(0, len(sample_indices), batch_size)]

batch_times = []

# ONNX Runtime returns [output_array], so fix shape safely
def safe_squeeze(p_list):
    p = p_list[0]
    return np.squeeze(p, axis=1) if p.ndim == 3 else p

# === Warm-up Batch ===
warmup_batch = batches[0]
embs, mas, ms, wavs = [], [], [], []

for idx in warmup_batch:
    sample = test_manifest.iloc[idx]
    emb, ma, m, wav = preprocess_sample(sample)
    embs.append(emb.cpu().numpy())
    mas.append(ma.cpu().numpy())
    ms.append(m.cpu().numpy())
    wavs.append(wav.cpu().numpy())

# Run warm-up inference
p1 = safe_squeeze(sessions["embedding"].run(None, {"embedding_input": np.concatenate(embs)}))
p2 = safe_squeeze(sessions["resnet"].run(None, {"mel_aug_input": np.concatenate(mas)}))
p3 = safe_squeeze(sessions["effnet"].run(None, {"mel_input": np.concatenate(ms)}))
p4 = safe_squeeze(sessions["rawaudio"].run(None, {"wav_input": np.concatenate(wavs)}))
fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
_ = sessions["meta"].run(None, {"fusion_input": fusion_input})

# === Timed Batches ===
for batch in tqdm(batches[1:], desc="Benchmarking batch throughput"):
    embs, mas, ms, wavs = [], [], [], []
    for idx in batch:
        sample = test_manifest.iloc[idx]
        emb, ma, m, wav = preprocess_sample(sample)
        embs.append(emb.cpu().numpy())
        mas.append(ma.cpu().numpy())
        ms.append(m.cpu().numpy())
        wavs.append(wav.cpu().numpy())

    start_time = time.time()

    p1 = safe_squeeze(sessions["embedding"].run(None, {"embedding_input": np.concatenate(embs)}))
    p2 = safe_squeeze(sessions["resnet"].run(None, {"mel_aug_input": np.concatenate(mas)}))
    p3 = safe_squeeze(sessions["effnet"].run(None, {"mel_input": np.concatenate(ms)}))
    p4 = safe_squeeze(sessions["rawaudio"].run(None, {"wav_input": np.concatenate(wavs)}))
    fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
    _ = sessions["meta"].run(None, {"fusion_input": fusion_input})

    batch_times.append(time.time() - start_time)

# === Compute and print throughput ===
total_samples = batch_size * (len(batches) - 1)
batch_fps = total_samples / np.sum(batch_times)
print(f"\nBatch Throughput ({batch_size}x): {batch_fps:.2f} FPS")


Benchmarking batch throughput: 100%|██████████| 24/24 [00:13<00:00,  1.77it/s]


Batch Throughput (20x): 62.43 FPS





In [33]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Accuracy metrics
subset_acc = (y_pred == y_true).all(axis=1).mean() * 100
per_label_acc = (y_pred == y_true).mean() * 100

# F1, Precision, Recall
f1 = f1_score(y_true, y_pred, average="micro")
precision = precision_score(y_true, y_pred, average="micro")
recall = recall_score(y_true, y_pred, average="micro")

# Model size
total_model_size = sum(os.path.getsize(path) for path in model_paths.values()) / 1e6

# Print all metrics
print("\n=== Final BirdCLEF ONNX Inference Summary ===")
print(f"Subset Accuracy:              {subset_acc:.2f}%")
print(f"Per-label Accuracy:           {per_label_acc:.2f}%")
print(f"F1 Score (micro):             {f1:.4f}")
print(f"Precision (micro):            {precision:.4f}")
print(f"Recall (micro):               {recall:.4f}")
print(f"Total ONNX Model Size:        {total_model_size:.2f} MB")
print(f"Latency (single sample):")
print(f"  Median:                   {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"  95th percentile:          {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"  99th percentile:          {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"  Throughput (1x):          {len(latencies) / np.sum(latencies):.2f} FPS")
print(f" Batch Throughput ({batch_size}x):    {batch_fps:.2f} FPS")



=== Final BirdCLEF ONNX Inference Summary ===
Subset Accuracy:              0.00%
Per-label Accuracy:           47.34%
F1 Score (micro):             0.0093
Precision (micro):            0.0047
Recall (micro):               0.5100
Total ONNX Model Size:        193.27 MB
Latency (single sample):
  Median:                   115.08 ms
  95th percentile:          143.71 ms
  99th percentile:          155.30 ms
  Throughput (1x):          8.62 FPS
 Batch Throughput (20x):    62.43 FPS


In [45]:
def benchmark_birdclef_inference_speed(sessions, test_manifest, num_trials=100, batch_size=8, num_batches=25):
    print(f"Execution providers: {[s.get_providers()[0] for s in sessions.values()]}")
    
    # === Single Sample Latency Benchmark ===
    sample = test_manifest.iloc[0]
    emb, ma, m, wav = preprocess_sample(sample)
    emb_np = emb.cpu().numpy()
    ma_np = ma.cpu().numpy()
    m_np = m.cpu().numpy()
    wav_np = wav.cpu().numpy()

    # Warm-up
    _ = sessions["embedding"].run(None, {"embedding_input": emb_np})
    _ = sessions["resnet"].run(None, {"mel_aug_input": ma_np})
    _ = sessions["effnet"].run(None, {"mel_input": m_np})
    _ = sessions["rawaudio"].run(None, {"wav_input": wav_np})
    fusion_input = np.concatenate([
        sessions["embedding"].run(None, {"embedding_input": emb_np})[0],
        sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0],
        sessions["effnet"].run(None, {"mel_input": m_np})[0],
        sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]
    ], axis=1)
    _ = sessions["meta"].run(None, {"fusion_input": fusion_input})

    latencies = []
    for _ in range(num_trials):
        start = time.time()
        p1 = sessions["embedding"].run(None, {"embedding_input": emb_np})[0]
        p2 = sessions["resnet"].run(None, {"mel_aug_input": ma_np})[0]
        p3 = sessions["effnet"].run(None, {"mel_input": m_np})[0]
        p4 = sessions["rawaudio"].run(None, {"wav_input": wav_np})[0]
        fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
        _ = sessions["meta"].run(None, {"fusion_input": fusion_input})
        latencies.append(time.time() - start)

    latencies = np.array(latencies)
    print(f"\nSingle Sample Latency:")
    print(f"  • Median:           {np.percentile(latencies, 50) * 1000:.2f} ms")
    print(f"  • 95th percentile:  {np.percentile(latencies, 95) * 1000:.2f} ms")
    print(f"  • 99th percentile:  {np.percentile(latencies, 99) * 1000:.2f} ms")
    print(f"  • Throughput:       {len(latencies) / np.sum(latencies):.2f} FPS")


In [47]:
from tqdm import tqdm

def run_batch_throughput(sessions, test_manifest, batch_size=8, num_batches=25):
    sample_indices = np.random.choice(len(test_manifest), size=batch_size * num_batches, replace=True)
    batches = [sample_indices[i:i + batch_size] for i in range(0, len(sample_indices), batch_size)]

    def safe_squeeze(p_list):
        p = p_list[0]
        return np.squeeze(p, axis=1) if p.ndim == 3 else p

    # Warm-up batch
    warmup_batch = batches[0]
    embs, mas, ms, wavs = [], [], [], []
    for idx in warmup_batch:
        sample = test_manifest.iloc[idx]
        emb, ma, m, wav = preprocess_sample(sample)
        embs.append(emb.cpu().numpy())
        mas.append(ma.cpu().numpy())
        ms.append(m.cpu().numpy())
        wavs.append(wav.cpu().numpy())

    p1 = safe_squeeze(sessions["embedding"].run(None, {"embedding_input": np.concatenate(embs)}))
    p2 = safe_squeeze(sessions["resnet"].run(None, {"mel_aug_input": np.concatenate(mas)}))
    p3 = safe_squeeze(sessions["effnet"].run(None, {"mel_input": np.concatenate(ms)}))
    p4 = safe_squeeze(sessions["rawaudio"].run(None, {"wav_input": np.concatenate(wavs)}))
    fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
    _ = sessions["meta"].run(None, {"fusion_input": fusion_input})

    # Timed batches
    batch_times = []
    for batch in tqdm(batches[1:], desc="Measuring batch throughput"):
        embs, mas, ms, wavs = [], [], [], []
        for idx in batch:
            sample = test_manifest.iloc[idx]
            emb, ma, m, wav = preprocess_sample(sample)
            embs.append(emb.cpu().numpy())
            mas.append(ma.cpu().numpy())
            ms.append(m.cpu().numpy())
            wavs.append(wav.cpu().numpy())

        start_time = time.time()
        p1 = safe_squeeze(sessions["embedding"].run(None, {"embedding_input": np.concatenate(embs)}))
        p2 = safe_squeeze(sessions["resnet"].run(None, {"mel_aug_input": np.concatenate(mas)}))
        p3 = safe_squeeze(sessions["effnet"].run(None, {"mel_input": np.concatenate(ms)}))
        p4 = safe_squeeze(sessions["rawaudio"].run(None, {"wav_input": np.concatenate(wavs)}))
        fusion_input = np.concatenate([p1, p2, p3, p4], axis=1)
        _ = sessions["meta"].run(None, {"fusion_input": fusion_input})
        batch_times.append(time.time() - start_time)

    total_samples = batch_size * (len(batches) - 1)
    batch_fps = total_samples / np.sum(batch_times)
    print(f"\n Batch Throughput ({batch_size}x): {batch_fps:.2f} FPS")


In [49]:
benchmark_birdclef_inference_speed(sessions, test_manifest)
run_batch_throughput(sessions, test_manifest, batch_size=8, num_batches=25)

Execution providers: ['CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider']

Single Sample Latency:
  • Median:           109.84 ms
  • 95th percentile:  135.36 ms
  • 99th percentile:  146.56 ms
  • Throughput:       8.91 FPS


Measuring batch throughput: 100%|██████████| 24/24 [00:07<00:00,  3.12it/s]


 Batch Throughput (8x): 42.84 FPS





In [51]:
import onnxruntime as ort
import os

# Directory setup
original_dir = "onnx_exports"
optimized_dir = "onnx_optimized"
os.makedirs(optimized_dir, exist_ok=True)

# Model names and paths
model_names = {
    "embedding":  "embedding_classifier.onnx",
    "resnet":     "resnet50_multilabel.onnx",
    "effnet":     "efficientnet_b3_lora.onnx",
    "rawaudio":   "raw_audio_cnn.onnx",
    "meta":       "meta_mlp.onnx"
}

# Apply graph optimizations and save optimized models
for name, filename in model_names.items():
    input_path = os.path.join(original_dir, filename)
    output_path = os.path.join(optimized_dir, f"{name}_optimized.onnx")

    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    session_options.optimized_model_filepath = output_path

    _ = ort.InferenceSession(input_path, sess_options=session_options, providers=["CPUExecutionProvider"])
    print(f"Optimized {name}: saved to {output_path}")


Optimized embedding: saved to onnx_optimized/embedding_optimized.onnx
Optimized resnet: saved to onnx_optimized/resnet_optimized.onnx
Optimized effnet: saved to onnx_optimized/effnet_optimized.onnx
Optimized rawaudio: saved to onnx_optimized/rawaudio_optimized.onnx
Optimized meta: saved to onnx_optimized/meta_optimized.onnx


In [52]:
# Load optimized ONNX sessions for benchmarking
sessions_optimized = {}
for name in model_names:
    path = os.path.join(optimized_dir, f"{name}_optimized.onnx")
    sessions_optimized[name] = ort.InferenceSession(path, providers=["CPUExecutionProvider"])

# Benchmark single-sample latency
benchmark_birdclef_inference_speed(sessions_optimized, test_manifest)

# Benchmark batch throughput
run_batch_throughput(sessions_optimized, test_manifest, batch_size=8, num_batches=25)


Execution providers: ['CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider']

Single Sample Latency:
  • Median:           113.44 ms
  • 95th percentile:  141.97 ms
  • 99th percentile:  146.43 ms
  • Throughput:       8.77 FPS


Measuring batch throughput: 100%|██████████| 24/24 [00:07<00:00,  3.08it/s]


 Batch Throughput (8x): 42.13 FPS





In [53]:
import os
from neural_compressor import quantization
from neural_compressor.model.onnx_model import ONNXModel
from neural_compressor.config import PostTrainingQuantConfig

# Directories
original_dir = "onnx_exports"
quantized_dir = "onnx_quantized"
os.makedirs(quantized_dir, exist_ok=True)

# Model filenames
model_names = {
    "embedding":  "embedding_classifier.onnx",
    "resnet":     "resnet50_multilabel.onnx",
    "effnet":     "efficientnet_b3_lora.onnx",
    "rawaudio":   "raw_audio_cnn.onnx",
    "meta":       "meta_mlp.onnx"
}

# Dynamic Quantization config (no calibration dataset required)
config_ptq = PostTrainingQuantConfig(approach="dynamic")

# Quantize each model
for name, filename in model_names.items():
    input_path = os.path.join(original_dir, filename)
    output_path = os.path.join(quantized_dir, f"{name}_quantized.onnx")

    print(f"Quantizing {name}...")
    fp32_model = ONNXModel(input_path)
    q_model = quantization.fit(model=fp32_model, conf=config_ptq)
    q_model.save_model_to_file(output_path)
    print(f"Saved: {output_path}")


2025-05-11 02:13:42 [INFO] Start auto tuning.
2025-05-11 02:13:42 [INFO] Quantize model without tuning!
2025-05-11 02:13:42 [INFO] Quantize the model with default configuration without evaluating the model.                To perform the tuning process, please either provide an eval_func or provide an                    eval_dataloader an eval_metric.
2025-05-11 02:13:42 [INFO] Adaptor has 5 recipes.
2025-05-11 02:13:42 [INFO] 0 recipes specified by user.
2025-05-11 02:13:42 [INFO] 3 recipes require future tuning.
2025-05-11 02:13:43 [INFO] *** Initialize auto tuning
2025-05-11 02:13:43 [INFO] {
2025-05-11 02:13:43 [INFO]     'PostTrainingQuantConfig': {
2025-05-11 02:13:43 [INFO]         'AccuracyCriterion': {
2025-05-11 02:13:43 [INFO]             'criterion': 'relative',
2025-05-11 02:13:43 [INFO]             'higher_is_better': True,
2025-05-11 02:13:43 [INFO]             'tolerable_loss': 0.01,
2025-05-11 02:13:43 [INFO]             'absolute': None,
2025-05-11 02:13:43 [INFO]     

Quantizing embedding...


2025-05-11 02:13:43 [INFO] Do not evaluate the baseline and quantize the model with default configuration.
2025-05-11 02:13:43 [INFO] Quantize the model with default config.
2025-05-11 02:13:44 [INFO] |******Mixed Precision Statistics******|
2025-05-11 02:13:44 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:44 [INFO] |        Op Type        | Total | INT8 |
2025-05-11 02:13:44 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:44 [INFO] |         MatMul        |   4   |  4   |
2025-05-11 02:13:44 [INFO] | DynamicQuantizeLinear |   4   |  4   |
2025-05-11 02:13:44 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:44 [INFO] Pass quantize model elapsed time: 603.21 ms
2025-05-11 02:13:44 [INFO] Save tuning history to /home/jovyan/work/nc_workspace/2025-05-11_02-13-40/./history.snapshot.
2025-05-11 02:13:44 [INFO] [Strategy] Found the model meets accuracy requirements, ending the tuning process.
2025-05-11 02:13:44 [INFO] Specified timeout or 

Saved: onnx_quantized/embedding_quantized.onnx
Quantizing resnet...


2025-05-11 02:13:44 [INFO]             'weight_correction': False,
2025-05-11 02:13:44 [INFO]             'gemm_to_matmul': True,
2025-05-11 02:13:44 [INFO]             'graph_optimization_level': None,
2025-05-11 02:13:44 [INFO]             'first_conv_or_matmul_quantization': True,
2025-05-11 02:13:44 [INFO]             'last_conv_or_matmul_quantization': True,
2025-05-11 02:13:44 [INFO]             'pre_post_process_quantization': True,
2025-05-11 02:13:44 [INFO]             'add_qdq_pair_to_weight': False,
2025-05-11 02:13:44 [INFO]             'optypes_to_exclude_output_quant': [
2025-05-11 02:13:44 [INFO]             ],
2025-05-11 02:13:44 [INFO]             'dedicated_qdq_pair': False,
2025-05-11 02:13:44 [INFO]             'rtn_args': {
2025-05-11 02:13:44 [INFO]             },
2025-05-11 02:13:44 [INFO]             'awq_args': {
2025-05-11 02:13:44 [INFO]             },
2025-05-11 02:13:44 [INFO]             'gptq_args': {
2025-05-11 02:13:44 [INFO]             },
2025-05-11 0

Saved: onnx_quantized/resnet_quantized.onnx
Quantizing effnet...


2025-05-11 02:13:49 [INFO] Do not evaluate the baseline and quantize the model with default configuration.
2025-05-11 02:13:49 [INFO] Quantize the model with default config.
2025-05-11 02:13:52 [INFO] |******Mixed Precision Statistics******|
2025-05-11 02:13:52 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:52 [INFO] |        Op Type        | Total | INT8 |
2025-05-11 02:13:52 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:52 [INFO] |          Conv         |  284  | 284  |
2025-05-11 02:13:52 [INFO] |         MatMul        |   1   |  1   |
2025-05-11 02:13:52 [INFO] | DynamicQuantizeLinear |  208  | 208  |
2025-05-11 02:13:52 [INFO] +-----------------------+-------+------+
2025-05-11 02:13:52 [INFO] Pass quantize model elapsed time: 3292.43 ms
2025-05-11 02:13:52 [INFO] Save tuning history to /home/jovyan/work/nc_workspace/2025-05-11_02-13-40/./history.snapshot.
2025-05-11 02:13:52 [INFO] [Strategy] Found the model meets accuracy requirements, ending

Saved: onnx_quantized/effnet_quantized.onnx
Quantizing rawaudio...
Saved: onnx_quantized/rawaudio_quantized.onnx
Quantizing meta...


2025-05-11 02:13:53 [INFO]         ],
2025-05-11 02:13:53 [INFO]         'framework': 'onnxruntime',
2025-05-11 02:13:53 [INFO]         'inputs': [
2025-05-11 02:13:53 [INFO]         ],
2025-05-11 02:13:53 [INFO]         'model_name': '',
2025-05-11 02:13:53 [INFO]         'op_name_dict': None,
2025-05-11 02:13:53 [INFO]         'op_type_dict': None,
2025-05-11 02:13:53 [INFO]         'outputs': [
2025-05-11 02:13:53 [INFO]         ],
2025-05-11 02:13:53 [INFO]         'quant_format': 'default',
2025-05-11 02:13:53 [INFO]         'quant_level': 'auto',
2025-05-11 02:13:53 [INFO]         'recipes': {
2025-05-11 02:13:53 [INFO]             'smooth_quant': False,
2025-05-11 02:13:53 [INFO]             'smooth_quant_args': {
2025-05-11 02:13:53 [INFO]             },
2025-05-11 02:13:53 [INFO]             'layer_wise_quant': False,
2025-05-11 02:13:53 [INFO]             'layer_wise_quant_args': {
2025-05-11 02:13:53 [INFO]             },
2025-05-11 02:13:53 [INFO]             'fast_bias_cor

Saved: onnx_quantized/meta_quantized.onnx


In [54]:
import onnxruntime as ort

# Load quantized models into sessions
sessions_quantized = {}
for name in model_names:
    path = os.path.join(quantized_dir, f"{name}_quantized.onnx")
    sessions_quantized[name] = ort.InferenceSession(path, providers=["CPUExecutionProvider"])

# Run benchmarks
benchmark_birdclef_inference_speed(sessions_quantized, test_manifest)
run_batch_throughput(sessions_quantized, test_manifest, batch_size=8, num_batches=25)


Execution providers: ['CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider']

Single Sample Latency:
  • Median:           199.78 ms
  • 95th percentile:  254.92 ms
  • 99th percentile:  296.44 ms
  • Throughput:       4.86 FPS


Measuring batch throughput: 100%|██████████| 24/24 [00:28<00:00,  1.19s/it]


 Batch Throughput (8x): 7.61 FPS





In [55]:
import os

quantized_dir = "onnx_quantized"

print(" Quantized Model Sizes on Disk:")
for name in ["embedding", "resnet", "effnet", "rawaudio", "meta"]:
    onnx_model_path = os.path.join(quantized_dir, f"{name}_quantized.onnx")
    model_size = os.path.getsize(onnx_model_path) / 1e6  # in MB
    print(f"  {name.capitalize():<10}: {model_size:.2f} MB")


 Quantized Model Sizes on Disk:
  Embedding : 6.99 MB
  Resnet    : 24.13 MB
  Effnet    : 17.14 MB
  Rawaudio  : 0.20 MB
  Meta      : 1.51 MB


In [58]:
import onnxruntime as ort

# Load quantized models into sessions
sessions_quantized = {}
for name in ["embedding", "resnet", "effnet", "rawaudio", "meta"]:
    path = os.path.join(quantized_dir, f"{name}_quantized.onnx")
    sessions_quantized[name] = ort.InferenceSession(path, providers=["CPUExecutionProvider"])

print("\nBenchmarking Quantized BirdCLEF Models...")
benchmark_birdclef_inference_speed(sessions_quantized, test_manifest)
run_batch_throughput(sessions_quantized, test_manifest, batch_size=8, num_batches=25)



Benchmarking Quantized BirdCLEF Models...
Execution providers: ['CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider']

Single Sample Latency:
  • Median:           201.51 ms
  • 95th percentile:  239.01 ms
  • 99th percentile:  287.63 ms
  • Throughput:       4.88 FPS


Measuring batch throughput: 100%|██████████| 24/24 [00:28<00:00,  1.19s/it]


 Batch Throughput (8x): 7.65 FPS





In [65]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Quadro RTX 6000


In [63]:
import os
import onnxruntime as ort

# Mapping of expected logical keys to actual ONNX filenames
model_files = {
    "embedding":  "embedding_classifier.onnx",
    "resnet":     "resnet50_multilabel.onnx",
    "effnet":     "efficientnet_b3_lora.onnx",
    "rawaudio":   "raw_audio_cnn.onnx",
    "meta":       "meta_mlp.onnx"
}

# Directory containing the ONNX models
onnx_dir = "onnx_exports"  # Change to "onnx_optimized" or "onnx_quantized" if needed

# Load ONNX models using CUDAExecutionProvider
sessions_cuda = {}
for key, filename in model_files.items():
    model_path = os.path.join(onnx_dir, filename)
    sessions_cuda[key] = ort.InferenceSession(model_path, providers=["CUDAExecutionProvider"])

# Confirm active execution providers
print(f" Execution device: {ort.get_device()}")
print(" Providers used:")
for name, session in sessions_cuda.items():
    print(f"  {name:<9}: {session.get_providers()[0]}")

# Run benchmarks
print("\nBenchmarking on CUDAExecutionProvider...")
benchmark_birdclef_inference_speed(sessions_cuda, test_manifest)
run_batch_throughput(sessions_cuda, test_manifest, batch_size=8, num_batches=25)


 Execution device: CPU
 Providers used:
  embedding: CPUExecutionProvider
  resnet   : CPUExecutionProvider
  effnet   : CPUExecutionProvider
  rawaudio : CPUExecutionProvider
  meta     : CPUExecutionProvider

Benchmarking on CUDAExecutionProvider...
Execution providers: ['CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider', 'CPUExecutionProvider']

Single Sample Latency:
  • Median:           116.02 ms
  • 95th percentile:  144.12 ms
  • 99th percentile:  150.50 ms
  • Throughput:       8.57 FPS


Measuring batch throughput: 100%|██████████| 24/24 [00:07<00:00,  3.07it/s]


 Batch Throughput (8x): 42.81 FPS



