* Training notebook: [here](https://www.kaggle.com/code/myso1987/birdclef2025-2-train-baseline-5s)
* Dataset creation notebook: [here](https://www.kaggle.com/code/myso1987/birdclef2025-1-crop-audio-5s)

In [1]:
import concurrent.futures
import gc
import os
import threading
import time
from contextlib import contextmanager

import pandas as pd
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as AT
from torchvision import models

In [2]:
def _list_soundscape_ids(audio_dir):
    """Return the sorted file stems (name without '.ogg') of every .ogg file in `audio_dir`."""
    # os.path.splitext strips only the final extension, so a stem that itself
    # contains a dot is kept intact and `stem + '.ogg'` still names the file.
    return [
        os.path.splitext(name)[0]
        for name in sorted(os.listdir(audio_dir))
        if name.endswith('.ogg')
    ]


test_audio_dir = '../input/birdclef-2025/test_soundscapes/'
file_list = _list_soundscape_ids(test_audio_dir)

# When the test folder holds no .ogg files, fall back to a small slice of the
# train soundscapes so the rest of the notebook can still be exercised.
debug = len(file_list) == 0
if debug:
    debug_st_num = 5   # index of the first train soundscape to use
    debug_num = 8      # number of train soundscapes to use
    test_audio_dir = '../input/birdclef-2025/train_soundscapes/'
    file_list = _list_soundscape_ids(test_audio_dir)
    file_list = file_list[debug_st_num:debug_st_num + debug_num]

print('Debug mode:', debug)
print('Number of test soundscapes:', len(file_list))

Debug mode: True
Number of test soundscapes: 8


In [3]:
# --- Segment geometry -------------------------------------------------------
wav_sec = 5                            # length of one prediction window, in seconds
sample_rate = 32000                    # sampling rate the mel transform is configured for
min_segment = sample_rate * wav_sec    # samples in one window

# Output columns: the sorted entries of the train_audio directory.
class_labels = sorted(os.listdir('../input/birdclef-2025/train_audio/'))

# --- Mel-spectrogram parameters ---------------------------------------------
n_fft = 1024
win_length = 1024
hop_length = 512
f_min = 20
f_max = 15000
n_mels = 128

# Power (power=2.0) mel spectrogram built from the parameters above.
mel_spectrogram = AT.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    f_min=f_min,
    f_max=f_max,
    n_mels=n_mels,
    power=2.0,
    center=True,
    pad_mode="reflect",
    norm='slaney',
    mel_scale="htk",
)

def normalize_std(spec, eps=1e-23):
    """Standardise `spec` with its global mean and standard deviation.

    Args:
        spec: tensor to normalise; statistics are taken over all elements.
        eps: small constant added to the standard deviation in the divisor.

    Returns:
        A tensor shaped like `spec`: `(spec - mean) / (std + eps)`, or just the
        mean-centred tensor when the standard deviation is exactly zero.
    """
    centered = spec - torch.mean(spec)
    sigma = torch.std(spec)
    scaled = centered / (sigma + eps)
    # A zero-variance input is only centred, never rescaled.
    return torch.where(sigma == 0, centered, scaled)

def audio_to_mel(filepath=None):
    """Load one soundscape and convert it into 12 normalised log-mel segments.

    The first channel of the file is resampled to the module-level
    `sample_rate` if needed, zero-padded to 12 windows of `wav_sec` seconds,
    peak-normalised, dithered, and cut into 12 consecutive windows. Each
    window goes through `mel_spectrogram`, `torch.log` and `normalize_std`.

    Args:
        filepath: path of the audio file, read with the "soundfile" backend.

    Returns:
        Tensor of shape (12, 1, n_mels, frames): one single-channel
        spectrogram per window, stacked along the first dimension.
    """
    n_segments = 12
    segment_len = sample_rate * wav_sec

    # Keep the file's own rate in a separate name so that the configured
    # `sample_rate` (the one the mel transform was built with) is not shadowed.
    waveform, file_sr = torchaudio.load(filepath, backend="soundfile")
    waveform = waveform[:1, :]  # stereo->mono (first channel), mono->mono

    if file_sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, file_sr, sample_rate)

    # Recordings shorter than 12 windows would give short or empty slices;
    # pad with silence so every window has exactly `segment_len` samples.
    missing = n_segments * segment_len - waveform.shape[1]
    if missing > 0:
        waveform = torch.nn.functional.pad(waveform, (0, missing))

    # Peak-normalise, but leave an all-zero signal alone (0 / 0 would be NaN).
    peak = torch.max(torch.abs(waveform))
    if peak > 0:
        waveform = waveform / peak

    # Low-level uniform dither (1.5849e-05 == 10 ** (-96 / 20)); it keeps the
    # spectrogram away from exact zeros before the logarithm.
    waveform = waveform + 1.5849e-05 * (torch.rand(1, waveform.shape[1]) - 0.5)

    segments = []
    for i in range(n_segments):
        start = i * segment_len
        chunk = waveform[:, start:start + segment_len]
        melspec = normalize_std(torch.log(mel_spectrogram(chunk)))
        segments.append(torch.unsqueeze(melspec, dim=0))
    return torch.vstack(segments)

In [4]:
class Model_resnet34(nn.Module):
    """torchvision ResNet-34 whose final layer outputs one logit per entry of `class_labels`.

    Args:
        pretrained: if True, start from torchvision's ImageNet weights
            (IMAGENET1K_V1); if False, the backbone is randomly initialised.
    """

    def __init__(self, pretrained=False):
        super().__init__()

        # torchvision backbone. `weights=` replaces the deprecated `pretrained=`
        # keyword; IMAGENET1K_V1 is what `pretrained=True` used to select.
        weights = models.ResNet34_Weights.IMAGENET1K_V1 if pretrained else None
        model = models.resnet34(weights=weights)

        # Replace the classification head with one sized for our label set.
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, len(class_labels))
        self.model = model

    def forward(self, x):
        """Return raw logits for `x`, which is tiled three times along dim 1 first."""
        # The backbone expects 3 input channels; replicate the input along dim 1.
        x = torch.cat((x, x, x), 1)
        x = self.model(x)
        return x

# Read the checkpoint onto the CPU first, then build the network and load it.
_baseline_weights = torch.load(
    '/kaggle/input/birdclef-2025-models/baseline.pth',
    map_location=torch.device('cpu'),
    weights_only=True,
)
model = Model_resnet34(pretrained=False)
model.load_state_dict(_baseline_weights)
model.eval();  # inference mode for dropout/batch-norm; ';' hides the module repr



In [5]:
# Guards the shared `pred` dict. `prediction` runs in several worker threads and
# writes one value into every column per row; without the lock two threads can
# interleave those writes and leave `row_id` misaligned with the score columns.
_pred_lock = threading.Lock()


def prediction(afile):
    """Score one soundscape and append its rows to the module-level `pred` dict.

    Args:
        afile: file stem (no extension) of an .ogg file inside `test_audio_dir`.

    Side effects:
        For every segment returned by `audio_to_mel`, appends a row id
        `<afile>_<end second>` to `pred['row_id']` and the sigmoid score of
        each label to `pred[label]`. All rows of one file are written while
        holding `_pred_lock`, so columns stay aligned across threads.
    """
    path = os.path.join(test_audio_dir, afile + '.ogg')
    with torch.inference_mode():
        sig = audio_to_mel(path)
        outputs = torch.sigmoid(model(sig)).cpu().numpy()

    # Row ids are keyed by the end time of each window, in seconds.
    row_ids = [afile + '_' + str((i + 1) * wav_sec) for i in range(outputs.shape[0])]

    with _pred_lock:
        pred['row_id'].extend(row_ids)
        for bird_no, bird in enumerate(class_labels):
            pred[bird].extend(outputs[:, bird_no])
    gc.collect()

In [6]:
# One empty column per output field: the row id plus every class label.
pred = {column: [] for column in ['row_id'] + class_labels}

# Score all soundscapes with a small thread pool; list() drains the iterator
# so every task finishes (and any worker exception is raised) here.
start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    _ = list(executor.map(prediction, file_list))
end_t = time.time()

if debug:
    # Measured time per file, scaled to 700 files and expressed in minutes.
    elapsed = end_t - start
    print(700 * elapsed / 60 / debug_num)









9.387339353561401


In [7]:
# Fix the column order explicitly: row id first, then the labels in sorted order.
submission_columns = ['row_id'] + class_labels
results = pd.DataFrame(pred, columns=submission_columns)

results.to_csv("submission.csv", index=False)

if debug:
    display(results.head())

Unnamed: 0,row_id,1139490,1192948,1194042,126247,1346504,134933,135045,1462711,1462737,...,yebfly1,yebsee1,yecspi2,yectyr1,yehbla2,yehcar1,yelori1,yeofly1,yercac1,ywcpar
0,H02_20230421_233500_5,3.105756e-11,1.088897e-11,1.176964e-06,2.283467e-07,1.767216e-07,2.362253e-06,6.065724e-08,5.171996e-12,1.204082e-09,...,0.000335,0.000105,3.9e-05,8.7e-05,8.525438e-07,3.3e-05,2.28397e-07,2.2e-05,0.010014,1.8e-05
1,H02_20230421_233500_10,4.852676e-12,1.722202e-12,4.737399e-07,3.593193e-07,9.971394e-08,2.68353e-06,5.338451e-09,3.396952e-13,1.021689e-09,...,0.000128,6.2e-05,2.2e-05,9e-05,1.062993e-07,1.5e-05,5.069804e-08,1.5e-05,0.003232,9e-06
2,H02_20230421_233500_15,4.227579e-11,4.231629e-11,7.181205e-06,6.200702e-06,6.75704e-07,5.610537e-06,2.173163e-08,6.148789e-12,4.49943e-10,...,0.000212,7e-05,1.2e-05,5.5e-05,2.547455e-07,2.6e-05,6.51398e-08,2.1e-05,0.008833,2.7e-05
3,H02_20230421_233500_20,1.424047e-10,4.724696e-11,7.394086e-07,1.289133e-06,1.974123e-07,4.327341e-07,1.87836e-09,4.789522e-12,9.196865e-11,...,0.000672,3.1e-05,9e-06,0.000158,6.490615e-08,1.3e-05,1.869558e-07,1.7e-05,0.033741,1e-05
4,H02_20230421_233500_25,2.135889e-10,8.097572e-11,2.627633e-06,5.422377e-07,1.189174e-07,3.383848e-06,2.291648e-08,1.182682e-10,1.674741e-09,...,0.000465,9.8e-05,4.8e-05,0.00029,6.099415e-07,0.000125,4.062196e-07,5e-05,0.011089,6.5e-05
