In [39]:
!pip install peft

Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers (from peft)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting regex!=2019.12.17 (from transformers->peft)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers->peft)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading peft-0.15.2-py3-none-any.whl (411 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_

In [32]:
!pip install grad-cam pytest

Collecting SHAP
  Downloading shap-0.47.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from SHAP)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.47.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, SHAP
Successfully installed SHAP-0.47.2 slicer-0.0.8


In [40]:
import os
import numpy as np
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from random import sample as rand_sample
from sklearn.metrics import accuracy_score
from peft import get_peft_model, LoraConfig
import timm

5

In [44]:
TEST_MANIFEST = "/mnt/BirdCLEF-2025/birdclef_dataset/features_sampled/manifest_test.csv"
TAXONOMY_CSV = "/mnt/BirdCLEF-2025/birdclef_dataset/features_sampled/taxonomy.csv"
FEATURE_BASE = "/mnt/BirdCLEF-2025/birdclef_dataset/features_sampled"
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DROPOUT = 0.3
THRESHOLD   = 0.5

NUM_CLASSES = 206
WAV_LEN = 320000
MEL_SHAPE = (1, 64, 313)
HIDDEN_DIMS = [1024, 512]
THRESHOLD   = 0.

In [45]:
CLASSES = sorted(tax["primary_label"].astype(str).tolist())
NUM_CLASSES = len(CLASSES)
#CELL 4
class MetaMLP(nn.Module):
    def __init__(self, in_dim, hidden_dims, dropout):
        super().__init__()
        layers, dims = [], [in_dim]+hidden_dims
        for i in range(len(hidden_dims)):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.BatchNorm1d(dims[i+1]),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
        layers.append(nn.Linear(dims[-1], NUM_CLASSES))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
    
#CELL 5
class EmbeddingClassifier(nn.Module):
    def __init__(self, emb_dim, num_cls):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 2048), nn.BatchNorm1d(2048), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(2048, 1024),    nn.BatchNorm1d(1024), nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(1024, 512),     nn.BatchNorm1d(512),  nn.ReLU(), nn.Dropout(DROPOUT),
            nn.Linear(512, num_cls)
        )
    def forward(self, x): return self.net(x)

def get_resnet50_multilabel(num_classes):
    m = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', pretrained=False)
    m.conv1 = nn.Conv2d(1, m.conv1.out_channels,
                        kernel_size=m.conv1.kernel_size,
                        stride=m.conv1.stride,
                        padding=m.conv1.padding,
                        bias=False)
    m.fc    = nn.Linear(m.fc.in_features, num_classes)
    return m

TARGET_MODULES  = ["conv_pw","conv_dw","conv_pwl","conv_head"]
MODULES_TO_SAVE = ["classifier"]
def build_efficientnetb3_lora(num_classes):
    base = timm.create_model("efficientnet_b3", pretrained=True)
    # patch forward
    orig_fwd = base.forward
    def forward_patch(*args, input_ids=None, **kwargs):
        x = input_ids if input_ids is not None else args[0]
        return orig_fwd(x)
    base.forward = forward_patch
    # adapt stem & head
    stem = base.conv_stem
    base.conv_stem = nn.Conv2d(1, stem.out_channels,
                               kernel_size=stem.kernel_size,
                               stride=stem.stride,
                               padding=stem.padding,
                               bias=False)
    base.classifier = nn.Linear(base.classifier.in_features, num_classes)
    # LoRA
    lora_cfg = LoraConfig(
        r=12, lora_alpha=24,
        target_modules=TARGET_MODULES,
        lora_dropout=0.1, bias="none",
        modules_to_save=MODULES_TO_SAVE,
        task_type="FEATURE_EXTRACTION",
        inference_mode=False
    )
    return get_peft_model(base, lora_cfg)

class RawAudioCNN(nn.Module):
    def __init__(self, num_cls):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16,  kernel_size=15, stride=4, padding=7)
        self.bn1   = nn.BatchNorm1d(16)
        self.pool  = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(16,32,  kernel_size=15, stride=2, padding=7)
        self.bn2   = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32,64,  kernel_size=15, stride=2, padding=7)
        self.bn3   = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64,128, kernel_size=15, stride=2, padding=7)
        self.bn4   = nn.BatchNorm1d(128)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc          = nn.Linear(128, num_cls)
    def forward(self, x):
        x = x.unsqueeze(1)  # [B,T]→[B,1,T]
        x = F.relu(self.bn1(self.conv1(x))); x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.global_pool(x).squeeze(-1)
        return self.fc(x)
# === Step 1: Get emb_dim ===
class EmbeddingDatasetForDim:
    def __init__(self, manifest, base, key="embedding"):
        df = pd.read_csv(manifest)
        df["emb_path"] = df["emb_path"].astype(str).apply(
            lambda p: os.path.join(base, "embeddings", p.lstrip(os.sep))
        )
        first_sample_path = df.iloc[0].emb_path
        arr = np.load(first_sample_path)[key]  # shape: (n_windows, emb_dim)
        self.emb_dim = arr.shape[1]

_emb_ds = EmbeddingDatasetForDim(TEST_MANIFEST, FEATURE_BASE)
emb_dim = _emb_ds.emb_dim

# === Step 2: Instantiate all models ===
emb_model  = EmbeddingClassifier(emb_dim=emb_dim, num_cls=NUM_CLASSES).to(DEVICE)
res_model  = get_resnet50_multilabel(NUM_CLASSES).to(DEVICE)
eff_model  = build_efficientnetb3_lora(NUM_CLASSES).to(DEVICE)
raw_model  = RawAudioCNN(NUM_CLASSES).to(DEVICE)
meta_model = MetaMLP(NUM_CLASSES * 4, HIDDEN_DIMS, DROPOUT).to(DEVICE)

# === Step 3: Load weights & freeze base models ===
for model, ckpt in [
    (emb_model, CKPT_EMB),
    (res_model, CKPT_RES),
    (eff_model, CKPT_EFF),
    (raw_model, CKPT_RAW)
]:
    model.load_state_dict(torch.load(ckpt, map_location="cpu"))
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

# === Step 4: Load meta model weights (not frozen) ===
meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE))
meta_model.eval()


Using cache found in /home/jovyan/.cache/torch/hub/pytorch_vision_v0.14.0
  model.load_state_dict(torch.load(ckpt, map_location="cpu"))
  meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE))


MetaMLP(
  (net): Sequential(
    (0): Linear(in_features=824, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=512, out_features=206, bias=True)
  )
)

In [46]:
tax = pd.read_csv(TAXONOMY_CSV)
CLASSES = sorted(tax["primary_label"].astype(str).tolist())

In [47]:
test_manifest = pd.read_csv(TEST_MANIFEST)

In [48]:
import os
import torch
import torchaudio
import torch.nn.functional as F
import numpy as np

# === Update checkpoint paths ===


# === Inference sample preprocessor ===
def preprocess_sample(sample):
    emb_path = os.path.join(FEATURE_BASE, "embeddings", sample.emb_path.lstrip("/"))
    emb_arr = np.load(emb_path)["embedding"].mean(axis=0).astype(np.float32)
    emb = torch.from_numpy(emb_arr).unsqueeze(0).to(DEVICE)

    ma_path = os.path.join(FEATURE_BASE, "mel_aug", sample.mel_aug_path.lstrip("/"))
    ma_arr = np.load(ma_path)["mel"].astype(np.float32)
    ma = torch.from_numpy(ma_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    m_path = os.path.join(FEATURE_BASE, "mel", sample.mel_path.lstrip("/"))
    m_arr = np.load(m_path)["mel"].astype(np.float32)
    m = torch.from_numpy(m_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)

    wav_path = os.path.join(FEATURE_BASE, "denoised", sample.audio_path.lstrip("/"))
    if not os.path.exists(wav_path):
        raise FileNotFoundError(f"Audio file not found: {wav_path}")
    wav, sr = torchaudio.load(wav_path)
    wav = wav.mean(dim=0) if wav.dim() > 1 else wav
    T = sr * 10
    wav = F.pad(wav, (0, max(0, T - wav.size(0))))[:T]
    wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
    wav = wav.unsqueeze(0).to(DEVICE)

    return emb, ma, m, wav

# === Inference function ===
def run_full_inference(sample):
    emb, ma, m, wav = preprocess_sample(sample)

    with torch.no_grad():
        p1 = torch.sigmoid(emb_model(emb))   # Embedding MLP
        p2 = torch.sigmoid(res_model(ma))    # ResNet (mel_aug)
        p3 = torch.sigmoid(eff_model(m))     # EfficientNet (mel)
        p4 = torch.sigmoid(raw_model(wav))   # Raw waveform CNN

        feat = torch.cat([p1, p2, p3, p4], dim=1)
        logits = meta_model(feat)
        probs = torch.sigmoid(logits)[0].cpu().numpy()

    return probs  # shape: (NUM_CLASSES,)


In [49]:
from collections import defaultdict
import numpy as np

class_names = sorted(test_manifest["primary_label"].unique())
label2id = {label: idx for idx, label in enumerate(class_names)}

total_correct = 0
total_samples = 0
class_hits = defaultdict(int)
class_counts = defaultdict(int)

for idx, row in test_manifest.head(200).iterrows():
    try:
        true_label = label2id[row["primary_label"]]
        probs = run_full_inference(row)
        pred_label = int(np.argmax(probs))

        class_counts[true_label] += 1
        if pred_label == true_label:
            total_correct += 1
            class_hits[true_label] += 1
        total_samples += 1

    except Exception as e:
        print(f"Error at index {idx}: {e}")


In [50]:
overall_acc = total_correct / total_samples
print(f"\n Overall Accuracy: {overall_acc:.4f} ({total_correct}/{total_samples})")

per_class_acc = {
    CLASSES[i]: class_hits[i] / class_counts[i] if class_counts[i] > 0 else 0.0
    for i in range(NUM_CLASSES)
}

least_accurate = sorted(per_class_acc.items(), key=lambda x: x[1])[:]

print("\n Top 20 Least Accurate Classes:")
for cls, acc in least_accurate:
    print(f"{cls:<30} Acc: {acc:.4f}")


 Overall Accuracy: 0.0000 (0/200)

 Top 20 Least Accurate Classes:
1139490                        Acc: 0.0000
1192948                        Acc: 0.0000
1194042                        Acc: 0.0000
126247                         Acc: 0.0000
1346504                        Acc: 0.0000
134933                         Acc: 0.0000
135045                         Acc: 0.0000
1462711                        Acc: 0.0000
1462737                        Acc: 0.0000
1564122                        Acc: 0.0000
21038                          Acc: 0.0000
21116                          Acc: 0.0000
21211                          Acc: 0.0000
22333                          Acc: 0.0000
22973                          Acc: 0.0000
22976                          Acc: 0.0000
24272                          Acc: 0.0000
24292                          Acc: 0.0000
24322                          Acc: 0.0000
41663                          Acc: 0.0000
41778                          Acc: 0.0000
41970                        

In [None]:
import shap
import torch
import numpy as np
import os
import pandas as pd

# Ensure model is in eval mode
emb_model.eval()
emb_model.to(DEVICE)

# === Wrapper for emb_model ===
def model_wrapper(emb_batch):
    if isinstance(emb_batch, list):
        emb_batch = np.stack(emb_batch)
    tensor = torch.tensor(emb_batch, dtype=torch.float32).to(DEVICE)
    with torch.no_grad():
        logits = emb_model(tensor)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
    return probs

# === Load embedding vector from row ===
def get_embedding_vector(row):
    emb_path = os.path.join(FEATURE_BASE, "embeddings", row.emb_path.lstrip("/"))
    emb = np.load(emb_path)["embedding"].mean(axis=0).astype(np.float32)
    return emb

# === Load manifest if needed ===
if 'test_manifest' not in globals():
    TEST_MANIFEST = os.path.join(FEATURE_BASE, "manifest_test.csv")
    test_manifest = pd.read_csv(TEST_MANIFEST)

# === SHAP Input Data ===
background_samples = np.array([get_embedding_vector(test_manifest.iloc[i]) for i in range(5)])
test_sample = get_embedding_vector(test_manifest.iloc[42])

# === SHAP Explainer ===
masker = shap.maskers.Partition(background_samples)
explainer = shap.Explainer(model_wrapper, masker, algorithm="partition")

# === Visualize SHAP ===
shap_values = explainer([test_sample])
shap.plots.waterfall(shap_values[0])


In [None]:

CUSTOM_TEST_ROOT = "/mnt/BirdCLEF/custom_templates"
def evaluate_with_ground_truth(folder_name):
    folder_path = os.path.join(CUSTOM_TEST_ROOT, folder_name)
    y_true, y_pred = [], []

    for fname in os.listdir(folder_path):
        if not fname.endswith(".wav"): continue
        path = os.path.join(folder_path, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred_label = int(torch.argmax(logits))
            y_pred.append(pred_label)
            y_true.append(CLASSES.index(folder_name))  # use folder name as ground truth
        except Exception as e:
            print(f"Error on {fname}: {e}")
    
    return y_true, y_pred

print("\nAccuracy per template folder:\n")
for group in ["insects", "mammalia", "amphibia"]:
    true, pred = evaluate_with_ground_truth(group)
    if true:
        acc = accuracy_score(true, pred)
        print(f"{group.capitalize():<10} → Accuracy: {acc:.4f} ({len(true)} samples)")
        print(classification_report(true, pred, target_names=[group]))
    else:
        print(f"{group.capitalize():<10} → No valid samples")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# === Cell 9: Confusion Matrix for One Group ===
def plot_confusion_for_group(group):
    folder = os.path.join(CUSTOM_TEST_ROOT, group)
    y_true, y_pred = [], []

    for fname in os.listdir(folder):
        if not fname.endswith(".wav"): continue
        path = os.path.join(folder, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred = int(torch.argmax(logits))
            y_pred.append(pred)
            y_true.append(CLASSES.index(group))  # assumes folder name is the true label
        except:
            continue



In [None]:
# === Cell 10: Evaluate on Folder with Amphibia-like Sounds ===
def evaluate_confused_folder(folder_name, expected_class):
    folder = os.path.join(CUSTOM_TEST_ROOT, folder_name)
    y_true, y_pred = [], []

    for fname in os.listdir(folder):
        if not fname.endswith(".wav"): continue
        path = os.path.join(folder, fname)
        try:
            wav = preprocess_sample(path)
            logits = run_full_inference(wav)
            pred = int(torch.argmax(logits))
            y_pred.append(pred)
            y_true.append(CLASSES.index(expected_class))
        except Exception as e:
            print(f"Error on {fname}: {e}")

    if y_true:
        acc = accuracy_score(y_true, y_pred)
        print(f"\n Accuracy on '{folder_name}' (expected='{expected_class}'): {acc:.4f} ({len(y_true)} samples)")

        # Show confusion distribution
        pred_labels = [CLASSES[p] for p in y_pred]
        error_counts = pd.Series(pred_labels).value_counts().head(10)
        print("\n Top 10 Predicted Classes (Confusion Candidates):")
        for lbl, count in error_counts.items():
            print(f"{lbl:<30} {count} predictions")

        # Optional barplot
        plt.figure(figsize=(10, 4))
        sns.barplot(x=error_counts.values, y=error_counts.index)
        plt.title(f"Most Common Misclassifications for '{folder_name}'")
        plt.xlabel("Prediction Count")
        plt.ylabel("Predicted Label")
        plt.tight_layout()
        plt.show()
    else:
        print(f"No valid audio samples in '{folder_name}'")

# Run on a confusion-prone folder
evaluate_confused_folder("sound_similar_to_amphibia", expected_class="amphibia")


In [None]:
!pytest --verbose --tb=no tests/

In [None]:
!pytest --verbose --lf --tb=no tests/