In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import pandas as pd
import timm
from peft import get_peft_model, LoraConfig

In [2]:
# --- Runtime configuration ---------------------------------------------------
# Inference device: CUDA when a GPU is visible, otherwise the CPU.
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Root folder holding features, raw data and checkpoints. The default is the
# original location; set the MLOPS_PROJECT_ROOT environment variable to run
# the notebook on another machine without editing any path below.
PROJECT_ROOT  = os.environ.get("MLOPS_PROJECT_ROOT", "/Users/harish/Desktop/MLOps Project")

# Pre-computed features and the manifest listing the test clips.
FEATURE_BASE  = os.path.join(PROJECT_ROOT, "features_sampled")
TEST_MANIFEST = os.path.join(FEATURE_BASE, "manifest_test.csv")

# Competition metadata.
TAXONOMY_CSV  = os.path.join(PROJECT_ROOT, "Data", "birdclef-2025", "taxonomy.csv")
TRAIN_META    = os.path.join(PROJECT_ROOT, "Data", "birdclef-2025", "train.csv")

# Checkpoints: four base models plus the meta model that combines them.
CKPT_EMB    = os.path.join(PROJECT_ROOT, "Models", "best_emb_mlp.pt")
CKPT_RES    = os.path.join(PROJECT_ROOT, "Models", "best_resnet50.pt")
CKPT_EFF    = os.path.join(PROJECT_ROOT, "Models", "best_effb3_lora.pt")
CKPT_RAW    = os.path.join(PROJECT_ROOT, "Models", "best_rawcnn.pt")
CKPT_META   = os.path.join(PROJECT_ROOT, "Models", "best_meta_mlp.pt")

# Architecture hyper-parameters; they must match the values used when the
# checkpoints were trained, otherwise load_state_dict will reject them.
DROPOUT        = 0.3
HIDDEN_DIMS    = [1024, 512]

# Minimum probability for a class to be reported as a multi-label prediction.
THRESHOLD   = 0.5

In [3]:
# Read the taxonomy table and derive the class vocabulary from it.
tax = pd.read_csv(TAXONOMY_CSV)

# Labels are compared as strings and kept in ascending order; a label's
# position in CLASSES is the class index used when reading model outputs.
CLASSES = tax["primary_label"].astype(str).tolist()
CLASSES.sort()
NUM_CLASSES = len(CLASSES)


In [4]:
class MetaMLP(nn.Module):
    """Fully connected head that maps a feature vector to per-class logits.

    Every hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout; the final
    Linear produces raw logits (no activation is applied here).

    Args:
        in_dim: width of the input feature vector.
        hidden_dims: hidden layer widths, in order (any sequence of ints).
        dropout: dropout probability used after every hidden activation.
        num_cls: number of output logits. When omitted, the module-level
            NUM_CLASSES is used, which is the original behaviour.
    """

    def __init__(self, in_dim, hidden_dims, dropout, num_cls=None):
        super().__init__()
        if num_cls is None:
            num_cls = NUM_CLASSES
        # Unpacking accepts tuples as well as lists for hidden_dims.
        dims = [in_dim, *hidden_dims]
        layers = []
        for d_in, d_out in zip(dims, dims[1:]):
            layers += [
                nn.Linear(d_in, d_out),
                nn.BatchNorm1d(d_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        layers.append(nn.Linear(dims[-1], num_cls))
        # Layer order fixes the state_dict keys (net.0, net.1, ...), so it
        # must stay as-is for existing checkpoints to load.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape [batch, num_cls] for x of shape [batch, in_dim]."""
        return self.net(x)

In [None]:

class EmbeddingClassifier(nn.Module):
    """MLP head over a fixed-size embedding: emb_dim -> 2048 -> 1024 -> 512 -> num_cls.

    Each hidden Linear is followed by BatchNorm1d, ReLU and Dropout, with the
    dropout rate taken from the module-level DROPOUT constant. No activation
    is applied after the last Linear, so the output is raw logits.
    """

    def __init__(self, emb_dim, num_cls):
        super().__init__()
        widths = (emb_dim, 2048, 1024, 512)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(DROPOUT),
            ))
        stack.append(nn.Linear(widths[-1], num_cls))
        # Same module order as before, hence the same state_dict keys
        # (net.0 ... net.12).
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to x and return the logits."""
        return self.net(x)

def get_resnet50_multilabel(num_classes):
    """Build a randomly initialised ResNet-50 for single-channel, multi-label input.

    The architecture is fetched through torch.hub from the pinned
    torchvision v0.14.0 tag. The stem convolution is replaced by a 1-channel
    version with the same geometry, and the final fully connected layer is
    replaced by one that emits `num_classes` logits.

    Args:
        num_classes: number of output logits.

    Returns:
        The modified ResNet-50 module, with no pretrained weights loaded.
    """
    # `weights=None` is the supported spelling of the deprecated
    # `pretrained=False` (torchvision >= 0.13) and avoids the deprecation warning.
    model = torch.hub.load('pytorch/vision:v0.14.0', 'resnet50', weights=None)

    # Same kernel/stride/padding as the stock stem, but one input channel.
    stem = model.conv1
    model.conv1 = nn.Conv2d(1, stem.out_channels,
                            kernel_size=stem.kernel_size,
                            stride=stem.stride,
                            padding=stem.padding,
                            bias=False)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

# Names of the sub-modules that receive LoRA adapters, and of the modules that
# are kept fully trainable and saved with the adapter.
TARGET_MODULES  = ["conv_pw","conv_dw","conv_pwl","conv_head"]
MODULES_TO_SAVE = ["classifier"]


def build_efficientnetb3_lora(num_classes):
    """Create a single-channel EfficientNet-B3 wrapped with LoRA adapters.

    Steps, in order:
      1. create the timm `efficientnet_b3` backbone with pretrained weights;
      2. replace its `forward` with a shim that accepts the input either
         positionally or as `input_ids=` and ignores every other argument;
      3. swap the stem for a 1-input-channel convolution of the same geometry
         and the classifier for a `num_classes`-way Linear;
      4. wrap the result with PEFT using the LoRA settings below.

    Args:
        num_classes: number of output logits of the new classifier.

    Returns:
        The PEFT-wrapped model.
    """
    backbone = timm.create_model("efficientnet_b3", pretrained=True)

    # Keep a handle on the bound forward before it is shadowed on the instance.
    plain_forward = backbone.forward

    def _tensor_only_forward(*args, input_ids=None, **kwargs):
        # Presumably the PEFT wrapper passes the input as `input_ids` for this
        # task type -- TODO confirm. Extra args/kwargs are dropped on purpose.
        if input_ids is not None:
            return plain_forward(input_ids)
        return plain_forward(args[0])

    backbone.forward = _tensor_only_forward

    # Stem: same kernel/stride/padding/out-channels, but one input channel.
    old_stem = backbone.conv_stem
    backbone.conv_stem = nn.Conv2d(
        1,
        old_stem.out_channels,
        kernel_size=old_stem.kernel_size,
        stride=old_stem.stride,
        padding=old_stem.padding,
        bias=False,
    )

    # Head: keep the feature width, change the number of outputs.
    head_in = backbone.classifier.in_features
    backbone.classifier = nn.Linear(head_in, num_classes)

    lora_settings = dict(
        r=12,
        lora_alpha=24,
        target_modules=TARGET_MODULES,
        lora_dropout=0.1,
        bias="none",
        modules_to_save=MODULES_TO_SAVE,
        task_type="FEATURE_EXTRACTION",
        inference_mode=False,
    )
    return get_peft_model(backbone, LoraConfig(**lora_settings))

class RawAudioCNN(nn.Module):
    """1-D CNN that classifies a raw waveform.

    Four Conv1d/BatchNorm1d/ReLU stages (kernel 15, padding 7) with a
    max-pool after the first stage, followed by global average pooling over
    time and a Linear layer producing `num_cls` logits.
    """

    def __init__(self, num_cls):
        super().__init__()
        conv_kw = dict(kernel_size=15, padding=7)

        # Stage 1 is the only one followed by max-pooling.
        self.conv1 = nn.Conv1d(1, 16, stride=4, **conv_kw)
        self.bn1 = nn.BatchNorm1d(16)
        self.pool = nn.MaxPool1d(4)

        # Stages 2-4 double the channel count and halve the length each time.
        self.conv2 = nn.Conv1d(16, 32, stride=2, **conv_kw)
        self.bn2 = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32, 64, stride=2, **conv_kw)
        self.bn3 = nn.BatchNorm1d(64)
        self.conv4 = nn.Conv1d(64, 128, stride=2, **conv_kw)
        self.bn4 = nn.BatchNorm1d(128)

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(128, num_cls)

    def forward(self, x):
        """Map a batch of waveforms [B, T] to logits [B, num_cls]."""
        h = x.unsqueeze(1)  # add the channel axis: [B, T] -> [B, 1, T]
        h = self.pool(F.relu(self.bn1(self.conv1(h))))
        later_stages = (
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, norm in later_stages:
            h = F.relu(norm(conv(h)))
        pooled = self.global_pool(h).squeeze(-1)  # [B, 128]
        return self.fc(pooled)

In [None]:
class EmbeddingDatasetForDim:
    """Probe that opens one embedding file to discover the embedding width.

    Only the first manifest row is inspected; nothing else is kept.

    Args:
        manifest: path to a CSV file with an `emb_path` column.
        meta_csv: unused; kept so existing call sites keep working.
        base: feature root; embedding files are resolved under
            `<base>/embeddings/<emb_path>` after stripping leading separators.
        classes: unused; kept so existing call sites keep working.
        key: name of the array inside the `.npz` archive.

    Attributes:
        emb_dim: size of axis 1 of the array stored under `key`.

    Raises:
        ValueError: if the manifest has no rows, or the stored array has
            fewer than two dimensions.
    """

    def __init__(self, manifest, meta_csv, base, classes, key="embedding"):
        df = pd.read_csv(manifest)
        if df.empty:
            raise ValueError(f"manifest {manifest!r} contains no rows")

        # Only the first row is needed, so resolve just that one path.
        rel_path = str(df["emb_path"].iloc[0]).lstrip(os.sep)
        emb_file = os.path.join(base, "embeddings", rel_path)

        # The context manager closes the archive's file handle once read.
        with np.load(emb_file) as archive:
            arr = archive[key]      # expected layout: (n_windows, emb_dim)

        if arr.ndim < 2:
            raise ValueError(
                f"array {key!r} in {emb_file!r} has shape {arr.shape}; "
                "expected at least 2 dimensions"
            )
        self.emb_dim = arr.shape[1]

# Discover the embedding width from the first test-manifest entry.
_emb_ds = EmbeddingDatasetForDim(TEST_MANIFEST, TRAIN_META, FEATURE_BASE, CLASSES)
emb_dim = _emb_ds.emb_dim

# Build the embedding classifier; its checkpoint is loaded in the loop below
# together with the other base models (loading it twice was redundant).
emb_model = EmbeddingClassifier(emb_dim=emb_dim, num_cls=NUM_CLASSES).to(DEVICE)

# res_model, eff_model and raw_model are constructed in a later cell of this
# notebook, so that cell has to be executed before this one. Fail with an
# actionable message instead of a bare "name is not defined".
_missing = [name for name in ("res_model", "eff_model", "raw_model")
            if name not in globals()]
if _missing:
    raise NameError(
        f"{', '.join(_missing)} not defined: run the cell that constructs "
        "the ResNet50 / EfficientNet / RawCNN models before loading their checkpoints"
    )

# load weights & freeze
for m, ckpt in [(emb_model, CKPT_EMB), (res_model, CKPT_RES),
                (eff_model, CKPT_EFF), (raw_model, CKPT_RAW)]:
    # weights_only=True restricts unpickling to tensors/containers, which is
    # all a state_dict needs, and silences torch.load's FutureWarning.
    state = torch.load(ckpt, map_location=torch.device('cpu'), weights_only=True)
    m.load_state_dict(state)
    m.eval()                  # inference behaviour for BatchNorm / Dropout
    m.requires_grad_(False)   # freeze every parameter

  emb_model.load_state_dict(torch.load(CKPT_EMB,map_location=torch.device('cpu')))
Using cache found in /Users/harish/.cache/torch/hub/pytorch_vision_v0.14.0
  m.load_state_dict(torch.load(ckpt, map_location=torch.device('cpu')))


In [None]:
# Use the first row of the test manifest as the clip to classify.
test_manifest = pd.read_csv(TEST_MANIFEST)
sample = test_manifest.iloc[0]

# 1) Embedding MLP input: average the per-window embeddings into one vector.
emb_path = os.path.join(FEATURE_BASE, "embeddings", sample.emb_path.lstrip(os.sep))
with np.load(emb_path) as _npz:   # close the archive once the array is read
    emb_arr = _npz["embedding"].mean(axis=0).astype(np.float32)

# Base models 2-4 are only constructed here; their checkpoints are loaded by
# the "load weights & freeze" cell.
# 2) ResNet50
res_model = get_resnet50_multilabel(NUM_CLASSES).to(DEVICE)
# 3) EffNet
eff_model = build_efficientnetb3_lora(NUM_CLASSES).to(DEVICE)
# 4) RawCNN
raw_model = RawAudioCNN(NUM_CLASSES).to(DEVICE)

# 5) Meta supervisor: its input is the concatenation of the per-class
#    probabilities produced by each of the base models.
NUM_BASE_MODELS = 4
meta_model = MetaMLP(NUM_CLASSES * NUM_BASE_MODELS, HIDDEN_DIMS, DROPOUT).to(DEVICE)
# weights_only=True restricts unpickling to tensors/containers and silences
# torch.load's FutureWarning.
meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE, weights_only=True))
meta_model.eval()

  meta_model.load_state_dict(torch.load(CKPT_META, map_location=DEVICE))


MetaMLP(
  (net): Sequential(
    (0): Linear(in_features=824, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=512, out_features=206, bias=True)
  )
)

In [9]:
# embedding
emb = torch.from_numpy(emb_arr).unsqueeze(0).to(DEVICE)  # [1,emb_dim]

# mel-aug (ResNet50)
ma_path = os.path.join(FEATURE_BASE, "mel_aug", sample.mel_aug_path.lstrip(os.sep))
with np.load(ma_path) as _npz:   # close the archive once the array is read
    ma_arr = _npz["mel"].astype(np.float32)
ma = torch.from_numpy(ma_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)  # [1,1,n_mels,n_frames]

# mel (EffNetB3)
m_path = os.path.join(FEATURE_BASE, "mel", sample.mel_path.lstrip(os.sep))
with np.load(m_path) as _npz:
    m_arr = _npz["mel"].astype(np.float32)
m = torch.from_numpy(m_arr).unsqueeze(0).unsqueeze(0).to(DEVICE)       # [1,1,n_mels,n_frames]

# raw audio
wav_path = os.path.join(FEATURE_BASE, "denoised", sample.audio_path.lstrip(os.sep))
wav, sr   = torchaudio.load(wav_path)   # [channels,T]
# Down-mix to mono. For a single-channel file this is identical to
# squeeze(0); for multi-channel audio squeeze(0) would be a no-op and the
# length logic below would operate on the channel axis.
wav       = wav.mean(dim=0)             # [T]

# Fix the clip length: zero-pad short clips on the right, crop long ones.
CLIP_SECONDS = 10
T         = sr * CLIP_SECONDS
if wav.size(0) < T:
    wav = F.pad(wav, (0, T - wav.size(0)))
else:
    wav = wav[:T]

# Per-clip standardisation; the clamp avoids dividing by ~0 on silent clips.
wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
wav = wav.unsqueeze(0).to(DEVICE)       # [1,T]

In [10]:
# Run each base model on its own input, then let the meta model combine the
# four probability vectors into the final per-class probabilities.
with torch.no_grad():
    p1, p2, p3, p4 = (
        torch.sigmoid(net(inputs))          # each: [1,NUM_CLASSES]
        for net, inputs in (
            (emb_model, emb),
            (res_model, ma),
            (eff_model, m),
            (raw_model, wav),
        )
    )

    feat = torch.cat((p1, p2, p3, p4), dim=1)
    logits = meta_model(feat)
    # Drop the batch axis and move the result to a NumPy array on the host.
    probs = logits.sigmoid()[0].cpu().numpy()

In [11]:
# Keep every (label, probability) pair whose probability reaches THRESHOLD.
ml_preds = [
    (label, float(probs[idx]))
    for idx, label in enumerate(CLASSES)
    if probs[idx] >= THRESHOLD
]

print(f"\nMulti-label predictions (prob ≥ {THRESHOLD}):")
# One bullet per retained class, or a single placeholder bullet when empty.
report_lines = [f"  • {lab}: {sc:.3f}" for lab, sc in ml_preds] or ["  • <none>"]
print("\n".join(report_lines))



Multi-label predictions (prob ≥ 0.5):
  • 1564122: 0.766


In [12]:
# Top-1 prediction: the class with the highest meta-model probability.
primary_idx = int(np.argmax(probs))
primary_label, primary_score = CLASSES[primary_idx], float(probs[primary_idx])

print("\nPrimary‑label (top‑1) prediction:")
print(f"  → {primary_label}: {primary_score:.3f}")


Primary‑label (top‑1) prediction:
  → 1564122: 0.766
