In [1]:
import os
import torch
import numpy as np
import pandas as pd
import torchaudio
import torch.nn.functional as F

In [2]:
# --- Notebook configuration -------------------------------------------------
# Compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
DEVICE        = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Root directory of the pre-computed features and the test manifest inside it.
FEATURE_BASE  = "/home/jovyan/Features"
TEST_MAN      = os.path.join(FEATURE_BASE, "manifest_test.csv")

# Taxonomy table; its "primary_label" column supplies the class names.
TAXONOMY_CSV  = "/home/jovyan/Data/birdclef-2025/taxonomy.csv"

# Saved model weights, resolved relative to the current working directory.
CHECKPOINT    = "best_rawcnn.pt"

# Minimum sigmoid probability for a class to be reported as present
# (a single scalar applied to every class).
THRESHOLD     = 0.5

In [3]:
class RawAudioCNN(torch.nn.Module):
    """1-D convolutional classifier that operates directly on waveforms.

    The network is four Conv1d -> BatchNorm1d -> ReLU stages (a max-pool
    follows the first stage only), a global average pool over time, and a
    linear layer producing one raw logit per class. No sigmoid/softmax is
    applied inside the model.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        nn = torch.nn
        # Attribute names and creation order are deliberately unchanged:
        # state_dict keys are derived from these names.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=15, stride=4, padding=7)
        self.bn1   = nn.BatchNorm1d(num_features=16)
        self.pool  = nn.MaxPool1d(kernel_size=4)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=15, stride=2, padding=7)
        self.bn2   = nn.BatchNorm1d(num_features=32)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=15, stride=2, padding=7)
        self.bn3   = nn.BatchNorm1d(num_features=64)
        self.conv4 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=15, stride=2, padding=7)
        self.bn4   = nn.BatchNorm1d(num_features=128)
        self.gpool = nn.AdaptiveAvgPool1d(output_size=1)
        self.fc    = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Map a batch of waveforms ``[B, T]`` to class logits ``[B, num_classes]``."""
        # Insert the single input channel expected by Conv1d: [B, T] -> [B, 1, T].
        h = x.unsqueeze(1)

        # Stage 1 is the only stage followed by max-pooling.
        h = self.pool(F.relu(self.bn1(self.conv1(h))))

        # Stages 2-4 share the same conv -> batch-norm -> ReLU pattern.
        later_stages = (
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in later_stages:
            h = F.relu(bn(conv(h)))

        # Average over the time axis and drop it: [B, 128, T'] -> [B, 128].
        pooled = self.gpool(h).squeeze(-1)
        return self.fc(pooled)

In [4]:
# Read the taxonomy table and derive the class vocabulary from it.
tax_df      = pd.read_csv(TAXONOMY_CSV)

# Labels are cast to str and sorted; a label's position in this list is the
# index used to look up its score in the model output.
# NOTE(review): this ordering must match the one used when the checkpoint was
# trained — confirm against the training code.
classes     = tax_df["primary_label"].astype(str).tolist()
classes.sort()
num_classes = len(classes)

In [5]:
# Load the test manifest and pick one chunk to run through the model.
test_df = pd.read_csv(TEST_MAN)
if test_df.empty:
    # Fail with a clear message instead of the opaque error pandas raises
    # when asked to sample from an empty frame.
    raise ValueError(f"Test manifest has no rows: {TEST_MAN}")

# Fixed seed so Restart & Run All always selects the same chunk and therefore
# reproduces the outputs below. Set to None to draw a different chunk each run.
SAMPLE_SEED = 0
sample  = test_df.sample(n=1, random_state=SAMPLE_SEED).iloc[0]
print("Running inference on chunk:", sample.chunk_id)

Running inference on chunk: XC166627_chk0


In [6]:
# Locate the denoised audio for the selected chunk. Leading path separators
# are stripped from the manifest path so os.path.join cannot discard the
# FEATURE_BASE prefix.
wav_path = os.path.join(FEATURE_BASE, "denoised", sample.audio_path.lstrip(os.sep))
wav, sr  = torchaudio.load(wav_path)  # [channels, T]

# Down-mix to mono. For a single-channel file this yields the same samples as
# squeeze(0); for multi-channel files squeeze(0) would leave a 2-D tensor and
# the length check below would compare the channel count with target_len.
wav = wav.mean(dim=0)                 # [T]

# The fixed window below is expressed in samples at 32 kHz, so bring any file
# recorded at another rate to 32 kHz first (no-op when the rates already match).
# NOTE(review): 32 kHz is taken from the original `32000 * 10` window size —
# confirm it is the rate the checkpoint was trained on.
TARGET_SR = 32000
if sr != TARGET_SR:
    wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    sr = TARGET_SR                    # keep (wav, sr) consistent after resampling

# Force an exact 10-second window: zero-pad short clips at the end, truncate
# long ones.
target_len = TARGET_SR * 10
if wav.size(0) < target_len:
    wav = F.pad(wav, (0, target_len - wav.size(0)))
else:
    wav = wav[:target_len]

# Standardise to zero mean / unit variance; the clamp avoids division by ~0
# for silent clips. Statistics are computed after padding, as before.
wav = (wav - wav.mean()) / wav.std().clamp_min(1e-6)
wav = wav.to(DEVICE).unsqueeze(0)     # [1, T]

In [7]:
# Read the saved weights, remapping tensors onto the active device.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
state = torch.load(CHECKPOINT, map_location=DEVICE)

# Build the network with one output per class and place it on the same device.
model = RawAudioCNN(num_classes)
model = model.to(DEVICE)

# Copy the saved parameters/buffers in, then switch to inference mode so the
# batch-norm layers use their stored running statistics.
model.load_state_dict(state)
model.eval()

RawAudioCNN(
  (conv1): Conv1d(1, 16, kernel_size=(15,), stride=(4,), padding=(7,))
  (bn1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(16, 32, kernel_size=(15,), stride=(2,), padding=(7,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv1d(32, 64, kernel_size=(15,), stride=(2,), padding=(7,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(64, 128, kernel_size=(15,), stride=(2,), padding=(7,))
  (bn4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gpool): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=128, out_features=206, bias=True)
)

In [8]:
# Forward pass with autograd disabled; no gradients are needed for inference.
with torch.no_grad():
    logits = model(wav)                        # [1, C]
    # Take the only batch item, squash each logit independently with a
    # sigmoid, and hand the result to NumPy on the CPU.
    probs  = logits[0].sigmoid().cpu().numpy() # [C]

In [9]:
# Indices of every class whose probability reaches the threshold, in
# ascending class order, paired with the class name and its score.
ml_preds = [
    (classes[i], float(probs[i]))
    for i in np.flatnonzero(probs >= THRESHOLD)
]

print(f"\nMulti‑label predictions (prob ≥ {THRESHOLD}):")
if not ml_preds:
    # Nothing cleared the threshold.
    print("  • <none>")
for label, score in ml_preds:
    print(f"  • {label}: {score:.3f}")


Multi‑label predictions (prob ≥ 0.5):
  • chbant1: 0.990


In [10]:
# Single best guess: the class with the highest probability, reported
# regardless of THRESHOLD.
primary_idx   = int(np.argmax(probs))
primary_pred, primary_score = classes[primary_idx], float(probs[primary_idx])

print("\nPrimary‑label (top‑1) prediction:")
print(f"  → {primary_pred}: {primary_score:.3f}")


Primary‑label (top‑1) prediction:
  → chbant1: 0.990
