In [1]:
!pip install timm

Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.5/431.5 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: timm
Successfully installed timm-0.5.4
[0m

In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import json

import librosa

import torch
import torchaudio as ta
import timm

from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
# Root folder of the competition data; every path in this notebook is built from it.
data_root = '/kaggle/input/birdclef-2022'


def _read_competition_csv(csv_name):
    """Read one CSV file that sits directly under ``data_root``."""
    return pd.read_csv(os.path.join(data_root, csv_name))


train_meta = _read_competition_csv('train_metadata.csv')
ebird_taxonomy = _read_competition_csv('eBird_Taxonomy_v2021.csv')

In [14]:
import ast


def _parse_label_list(raw):
    """Turn the textual list stored in the CSV into a real Python list.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it cannot
    execute arbitrary code that might be embedded in the file. Values that are not
    strings (i.e. already parsed) are returned unchanged, which makes this cell safe
    to re-run.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


train_meta.loc[:, 'secondary_labels'] = train_meta.secondary_labels.apply(_parse_label_list)
# All labels of a recording: the secondary labels followed by the primary label.
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [15]:
# Collect every label that occurs in any recording. A set comprehension is linear,
# whereas Series.sum() on lists concatenates them one by one (quadratic).
all_species = sorted({label for labels in train_meta.target_raw for label in labels})
species2id = {s: i for i, s in enumerate(all_species)}
id2species = {i: s for i, s in enumerate(all_species)}


def _multi_hot(species):
    """Return a 0/1 list aligned with ``all_species`` marking the given labels."""
    present = set(species)  # O(1) membership instead of scanning the list per class
    return [int(s in present) for s in all_species]


train_meta['target'] = train_meta.target_raw.apply(_multi_hot)

In [16]:
def load_wav(fname, offset, duration):
    """Load (part of) a training recording at its native sample rate.

    Args:
        fname: path of the audio file relative to ``<data_root>/train_audio``.
        offset: start position in seconds; ``None`` is treated as 0.
        duration: maximum number of seconds to read, or ``None`` for the whole file.

    Returns:
        ``(wav, sr)`` exactly as returned by ``librosa.load`` with ``sr=None``.

    Raises:
        AssertionError: if the native sample rate is above 32000.
    """
    fpath = os.path.join(data_root, 'train_audio', fname)
    # `offset` used to be accepted but silently ignored; forward it to librosa.
    wav, sr = librosa.load(fpath, sr=None, offset=offset or 0.0, duration=duration)
    assert sr <= 32000, sr
    return wav, sr

In [33]:
%%time
duration = 30
sample_rate = 32000

wav, sr = load_wav('afrsil1/XC125458.ogg', 5, duration)
to_pad = duration * sample_rate - wav.shape[0]

if to_pad > 0:
    wav = np.pad(wav, (0, to_pad))



CPU times: user 16.5 ms, sys: 1.83 ms, total: 18.3 ms
Wall time: 21.4 ms


In [35]:
# Sanity check: after padding, the clip has duration * sample_rate = 30 * 32000 = 960000 samples.
wav.shape

(960000,)

### Torch Dataset

In [18]:
from torch.utils.data import Dataset, DataLoader

class BirdDataset(Dataset):
    """Training dataset yielding a fixed-length waveform and its multi-hot target.

    Args:
        df: DataFrame with a ``filename`` column (path relative to the training
            audio folder, as expected by ``load_wav``) and a ``target`` column
            holding a 0/1 list per row.
        duration: clip length in seconds read from the start of each file.
        sample_rate: sample rate used to compute the padded length
            (``duration * sample_rate`` samples). The audio is not resampled.
    """

    def __init__(self, df, duration=30, sample_rate=32000):
        super().__init__()
        self.df = df
        self.duration = duration
        self.sample_rate = sample_rate

    def __getitem__(self, idx):
        """Return ``{'wav': 1-D tensor, 'target': float32 tensor}`` for row ``idx``."""
        row = self.df.iloc[idx]  # look the row up once instead of once per column

        # TODO: add random offset
        wav, sr = load_wav(row['filename'], 0, self.duration)
        # Zero-pad short recordings on the right so every item has the same length.
        to_pad = self.duration * self.sample_rate - wav.shape[0]
        if to_pad > 0:
            wav = np.pad(wav, (0, to_pad))

        # TODO: add weighting

        return {
            'wav': torch.tensor(wav),
            # float32 rather than Python `float` (float64): the target is compared
            # against the model's float32 logits in the loss.
            'target': torch.tensor(row['target'], dtype=torch.float32),
        }

    def __len__(self):
        return len(self.df)

### Model

In [89]:
class Net(torch.nn.Module):
    """Waveform classifier: mel spectrogram -> timm CNN backbone -> linear head.

    Args:
        num_classes: size of the output layer. ``None`` (default) uses
            ``len(all_species)`` as before.
        pretrained: forwarded to ``timm.create_model``; defaults to ``True``.
    """

    def __init__(self, num_classes=None, pretrained=True):
        super().__init__()
        self.audio2image = self._init_audio2image()
        self.backbone = self._init_backbone(pretrained=pretrained)
        self.head = self._init_head(
            self.backbone.feature_info[-1]['num_chs'],
            num_classes=num_classes,
        )
        self.loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, wav_tensor, y=None):
        """Run the model on a batch of waveforms.

        Args:
            wav_tensor: batch of waveforms fed to the spectrogram front end.
            y: optional multi-hot targets with the same shape as the head output.

        Returns:
            dict with ``'loss'`` (BCE-with-logits loss, or ``None`` when ``y`` is not
            given) and ``'logits'``. Note that despite its name, ``'logits'`` holds
            the sigmoid of the head output, i.e. values in (0, 1); the key is kept
            for compatibility with existing callers.
        """
        spectrogram = self.audio2image(wav_tensor)
        spectrogram = spectrogram.permute(0, 2, 1)  # swap the last two axes
        spectrogram = spectrogram[:, None, :, :]    # add the single input channel
        x = self.backbone(spectrogram)
        logits = self.head(x)

        # `if y:` raises "Boolean value of Tensor ... is ambiguous" for any target
        # with more than one element, so test for presence explicitly.
        if y is not None:
            # Match the logits' dtype so float64 targets do not break the loss.
            loss = self.loss(logits, y.to(logits.dtype))
        else:
            loss = None

        return {'loss': loss, 'logits': logits.sigmoid()}

    @staticmethod
    def _init_audio2image():
        """Build the waveform -> dB-scaled mel spectrogram transform."""
        mel = ta.transforms.MelSpectrogram(
            sample_rate=32000,
            n_fft=2048,
            win_length=2048,
            hop_length=512,
            f_min=16,
            # NOTE(review): 16386 is above sample_rate / 2 = 16000; confirm whether
            # 16000 was intended.
            f_max=16386,
            pad=0,
            n_mels=256,
            power=2,
            normalized=False,
        )
        db_scale = ta.transforms.AmplitudeToDB(top_db=80.0)
        return torch.nn.Sequential(mel, db_scale)

    @staticmethod
    def _init_backbone(pretrained=True):
        """Create a single-channel resnet18 from timm without pooling or classifier."""
        return timm.create_model(
            "resnet18",
            pretrained=pretrained,
            num_classes=0,
            global_pool="",
            in_chans=1,
        )

    @staticmethod
    def _init_head(num_chs, num_classes=None):
        """Global average pooling followed by a linear layer over ``num_chs`` features."""
        if num_classes is None:
            num_classes = len(all_species)
        return torch.nn.Sequential(
            torch.nn.AdaptiveAvgPool2d(output_size=1),
            torch.nn.Flatten(),
            torch.nn.Linear(num_chs, num_classes),
        )
        

### Train loop

In [90]:
# Pick the device first and move the model onto it right away. Previously the model
# was only moved inside the (disabled) training cell, so later cells that send
# batches to `device` would hit a device mismatch on a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)

train_dataset = BirdDataset(train_meta)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=False,
    drop_last=True,
)
optimizer = torch.optim.Adam(model.parameters())

In [91]:
def train_epoch(model, optimizer, dataloader, device):
    """Run one optimisation pass over ``dataloader``.

    Each batch must be a dict with ``'wav'`` and ``'target'`` tensors; both are moved
    to ``device`` and passed to ``model``, whose output dict must contain ``'loss'``.

    Returns:
        list of per-batch loss values as Python floats, in iteration order.
    """
    losses = []
    progress = tqdm(dataloader)
    for batch in progress:
        wav = batch['wav'].to(device)
        target = batch['target'].to(device)
        output = model(wav, target)
        loss = output['loss']

        # Standard step: clear old gradients, backpropagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    return losses
    

In [22]:
# Training is disabled by default; set RUN_TRAINING = True to train the model.
RUN_TRAINING = False

if RUN_TRAINING:
    epochs = 2
    model.train()
    model.to(device)
    for e in range(epochs):
        epoch_loss = train_epoch(model, optimizer, train_dataloader, device)
        # train_epoch returns a list of per-batch losses; formatting the list itself
        # with ':.3f' raises TypeError, so report the epoch mean.
        print(f'{e} train loss:', f'{np.mean(epoch_loss):.3f}', sep='\t')

### Submit

In [92]:
# Species codes read from scored_birds.json; used below to build submission rows.
scored_birds_path = os.path.join(data_root, 'scored_birds.json')
with open(scored_birds_path) as fin:
    test_birds = json.load(fin)

In [52]:
# Build the skeleton of the submission: one row per (soundscape, 5-second window, bird).
test_fnames = [f for f in os.listdir(f'{data_root}/test_soundscapes') if f.endswith('.ogg')]
test_rows = []
for fname in test_fnames:
    fpath = os.path.join(data_root, 'test_soundscapes', fname)
    wav, sr = librosa.load(fpath, sr=None)
    prefix = fname.split('.')[0]
    window_size = 5 * sr
    # One window per started block of `window_size` samples (the last may be partial).
    n_windows = len(range(0, len(wav), window_size))
    for end_time in range(5, 5 * n_windows + 1, 5):
        for b in test_birds:
            test_rows.append({
                'row_id': f'{prefix}_{b}_{end_time}',
                'file_id': fname,
                'bird': b,
                'end_time': end_time
            })

test_pd = pd.DataFrame(test_rows)
test_pd

In [55]:
# Inspect the generated rows for a single soundscape file (raises KeyError if the file is absent).
test_pd.groupby('file_id').get_group('soundscape_453028782.ogg')

Unnamed: 0,row_id,file_id,bird,end_time
0,soundscape_453028782_akiapo_5,soundscape_453028782.ogg,akiapo,5
1,soundscape_453028782_aniani_5,soundscape_453028782.ogg,aniani,5
2,soundscape_453028782_apapan_5,soundscape_453028782.ogg,apapan,5
3,soundscape_453028782_barpet_5,soundscape_453028782.ogg,barpet,5
4,soundscape_453028782_crehon_5,soundscape_453028782.ogg,crehon,5
...,...,...,...,...
247,soundscape_453028782_omao_60,soundscape_453028782.ogg,omao,60
248,soundscape_453028782_puaioh_60,soundscape_453028782.ogg,puaioh,60
249,soundscape_453028782_skylar_60,soundscape_453028782.ogg,skylar,60
250,soundscape_453028782_warwhe1_60,soundscape_453028782.ogg,warwhe1,60


In [93]:
class TestDataset(Dataset):
    """Soundscape dataset: each item is one file cut into fixed-length windows.

    Args:
        test_folder: directory containing the ``.ogg`` soundscapes.
        n_chunks: number of windows per file (default 12).
        chunk_seconds: window length in seconds (default 5).
    """

    def __init__(self, test_folder, n_chunks=12, chunk_seconds=5):
        super().__init__()
        self.test_folder = test_folder
        self.n_chunks = n_chunks
        self.chunk_seconds = chunk_seconds
        # Sorted so the file order does not depend on os.listdir's arbitrary order.
        self.fnames = sorted(f for f in os.listdir(test_folder) if f.endswith('.ogg'))

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        """Return a ``(n_chunks, chunk_seconds * sr)`` tensor for file ``idx``.

        Raises:
            AssertionError: if the sample rate is above 32000 or the file is at
                least one full window longer than ``n_chunks`` windows.
        """
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        # Load directly: going through load_wav only worked because os.path.join
        # drops the train_audio prefix when given an absolute path.
        wav, sr = librosa.load(fpath, sr=None)
        assert sr <= 32000, sr

        chunk_len = self.chunk_seconds * sr
        total_len = self.n_chunks * chunk_len
        # Same bound as the original `(13 * 5 * sr) > len(wav)` for the defaults.
        assert len(wav) < total_len + chunk_len

        wav = torch.tensor(wav)
        # Pad or truncate to exactly n_chunks windows so every window really spans
        # chunk_seconds and all items share one shape (needed for batching).
        if len(wav) < total_len:
            wav = torch.nn.functional.pad(wav, (0, total_len - len(wav)))
        return wav[:total_len].reshape(self.n_chunks, chunk_len)

In [94]:
# Inference loader: keep file order (no shuffling) and keep the final partial batch.
test_dataset = TestDataset(os.path.join(data_root, 'test_soundscapes'))
test_loader_kwargs = dict(
    batch_size=64,
    shuffle=False,
    num_workers=2,
    drop_last=False,
)
test_dataloader = DataLoader(test_dataset, **test_loader_kwargs)

In [99]:
# Predict every window of every soundscape and build the submission rows.
pred_list = []
threshold = 0.5
treshold = threshold  # old (misspelled) name kept in case other cells reference it

model.to(device)  # the batches below are sent to `device`, so the model must be there too
model.eval()

files_seen = 0  # number of files handled in previous batches
with torch.no_grad():
    for batch in test_dataloader:
        n_files, part_count, part_size = batch.shape
        # Flatten (file, window) into one batch axis for the model.
        chunks = batch.reshape(n_files * part_count, part_size)
        # Net returns sigmoid outputs under the 'logits' key.
        pred = model(chunks.to(device))['logits']
        pred = pred.cpu().numpy() > threshold

        for j, chunk_pred in enumerate(pred):
            inbatch_number, chunk_index = divmod(j, part_count)
            # Use a running file offset: `i * batch_size` with the *current* batch's
            # size points at the wrong file whenever the last batch is smaller.
            fname = test_dataset.fnames[files_seen + inbatch_number]
            prefix = fname.split('.')[0]
            sufix = f'{5 * (chunk_index + 1)}'  # window end time in seconds

            pred_list.extend([{
                'row_id': '_'.join([prefix, b, sufix]),
                'target': chunk_pred[species2id[b]]
            } for b in test_birds])

        files_seen += n_files

pred_pd = pd.DataFrame(pred_list)

In [100]:
# Show the predictions: one row per row_id with its boolean target.
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,True
4,soundscape_453028782_crehon_5,True
...,...,...
247,soundscape_453028782_omao_60,True
248,soundscape_453028782_puaioh_60,True
249,soundscape_453028782_skylar_60,True
250,soundscape_453028782_warwhe1_60,False
