In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import ast
import json
import random
import sys

# Make the offline copy of timm importable; this must run before `import timm`.
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

import librosa

import torch
import torchaudio as ta
import timm

from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Root folder of the competition data; audio folders and JSON/CSV side files
# are resolved relative to it further down.
data_root = '/kaggle/input/birdclef-2022'

# Training metadata comes from an auxiliary dataset (not from data_root).
# BirdDataset later reads its 'filename', 'duration' and 'target' columns.
train_meta = pd.read_csv(
    filepath_or_buffer='../input/birdclef-data-with-wav-durations/train_metadata_extended.csv',
)

# eBird taxonomy table shipped with the competition data. It is loaded here
# but not referenced by any other cell visible in this notebook.
ebird_taxonomy = pd.read_csv(
    filepath_or_buffer=os.path.join(data_root, 'eBird_Taxonomy_v2021.csv'),
)

In [3]:
# `secondary_labels` arrives from the CSV as text holding a Python list literal.
# SECURITY: this used to be parsed with eval(), which would execute arbitrary
# code embedded in the file; ast.literal_eval only accepts plain literals.
# Values that are already parsed are passed through unchanged, so re-running
# this cell on the same frame no longer raises.
train_meta['secondary_labels'] = train_meta.secondary_labels.apply(
    lambda labels: ast.literal_eval(labels) if isinstance(labels, str) else labels
)

# Per-recording label list: all secondary labels followed by the primary label
# (element-wise list concatenation between the two object columns).
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [4]:
# Vocabulary: every species that appears in any recording's label list, sorted
# so that the species <-> index mapping is deterministic.
# A set union is used instead of `target_raw.sum()`: summing a column of lists
# concatenates them pairwise (quadratic in the number of labels) and returns 0
# rather than a list when the frame is empty.
all_species = sorted(set().union(*train_meta.target_raw))
species2id = {s: i for i, s in enumerate(all_species)}
id2species = dict(enumerate(all_species))

# Multi-hot target: one 0/1 entry per vocabulary species, in `all_species` order.
train_meta['target'] = train_meta.target_raw.apply(
    lambda species: [int(s in species) for s in all_species]
)

In [5]:
def load_wav(fname, offset, duration):
    """Load a window of an audio file at the file's native sample rate.

    Args:
        fname: Path of the recording relative to ``<data_root>/train_audio``,
            e.g. ``'afrsil1/XC125458.ogg'``. An absolute path also works,
            because ``os.path.join`` discards the preceding components when a
            later component is absolute.
        offset: Start of the window, in seconds from the beginning of the file.
        duration: Length of the window in seconds; ``None`` reads to the end
            of the file.

    Returns:
        The ``(wav, sr)`` pair produced by ``librosa.load``: the samples and
        the sample rate they were decoded at (no resampling, ``sr=None``).

    Raises:
        AssertionError: If the file's sample rate is above 32000 Hz.
    """
    fpath = os.path.join(data_root, 'train_audio', fname)
    # `offset` was previously accepted but never forwarded, so every call
    # returned the first `duration` seconds regardless of the requested start.
    wav, sr = librosa.load(fpath, sr=None, offset=offset, duration=duration)
    assert sr <= 32000, sr
    return wav, sr

### Torch Dataset

In [6]:
# Divisor applied to CONFIG['crop_len'] in Net.batch_crop / Net.batch_uncrop:
# a training crop is split into crop_len // TEST_SIZE pieces.
TEST_SIZE = 5

# crop_len    - length in seconds of the audio window requested from load_wav.
# sample_rate - samples per second used to compute the padded crop length.
CONFIG = dict(
    crop_len=30,
    sample_rate=32000,
)


In [7]:
from torch.utils.data import Dataset, DataLoader

class BirdDataset(Dataset):
    """Training dataset: one random fixed-length waveform crop per recording.

    Args:
        df: DataFrame with one row per recording. ``__getitem__`` reads the
            columns ``filename`` (passed to ``load_wav``), ``duration``
            (clip length in seconds) and ``target`` (multi-hot label list).
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __getitem__(self, idx):
        """Return ``{'wav': 1-D tensor, 'target': float64 tensor}`` for row ``idx``.

        The waveform is zero-padded at the end up to
        ``CONFIG['crop_len'] * CONFIG['sample_rate']`` samples when the loaded
        window is shorter than that.
        """
        duration = CONFIG['crop_len']
        sample_rate = CONFIG['sample_rate']

        row = self.df.iloc[idx]
        fname = row['filename']
        # Use the length of *this* recording. The previous code read
        # train_meta.iloc[0], i.e. always the first row of the global frame.
        wav_len = row['duration']

        # random.randint needs integer bounds, while the clip length may be a
        # float; truncate to whole seconds. Clips shorter than the crop start at 0.
        max_offset = int(max(0, wav_len - duration))
        random_offset = random.randint(0, max_offset)

        wav, sr = load_wav(fname, random_offset, duration)
        to_pad = duration * sample_rate - wav.shape[0]
        if to_pad > 0:
            wav = np.pad(wav, (0, to_pad))

        # TODO: add weighting

        wav = torch.tensor(wav)
        # dtype=float is Python float, i.e. a float64 tensor.
        target = torch.tensor(row['target'], dtype=float)
        return {
            'wav': wav,
            'target': target,
        }

    def __len__(self):
        """Number of recordings (rows of ``df``)."""
        return len(self.df)

### Model

In [8]:
from torch.distributions import Beta


class Mixup(torch.nn.Module):
    """Mixup augmentation: blend every sample with a randomly chosen partner.

    Each sample ``i`` gets its own coefficient ``c_i ~ Beta(mix_beta, mix_beta)``
    and is replaced by ``c_i * sample_i + (1 - c_i) * sample_perm[i]``, where
    ``perm`` is a random permutation of the batch. The same coefficients and
    permutation are applied to the inputs, the targets and the optional weights.

    Args:
        mix_beta: Both concentration parameters of the Beta distribution.
    """

    def __init__(self, mix_beta=1):

        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)

    def forward(self, X, Y, weight=None):
        """Mix a batch.

        Args:
            X: Inputs with the batch on dim 0 and any number of trailing dims.
            Y: Targets indexed by batch on dim 0 and broadcastable against a
                ``(batch, 1)`` coefficient column.
            weight: Optional per-sample weights of shape ``(batch,)``.

        Returns:
            ``(X, Y)`` when ``weight`` is None, otherwise ``(X, Y, weight)``,
            each mixed with the same coefficients and permutation.
        """
        bs = X.shape[0]
        perm = torch.randperm(bs)
        coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

        # One coefficient per sample, broadcast over every trailing dim of X.
        # Previously only ranks 2, 3 and 4 were shaped correctly; other ranks
        # fell into the 4-D branch and mis-broadcast or raised.
        x_coeffs = coeffs.view(-1, *([1] * (X.dim() - 1)))
        X = x_coeffs * X + (1 - x_coeffs) * X[perm]

        y_coeffs = coeffs.view(-1, 1)
        Y = y_coeffs * Y + (1 - y_coeffs) * Y[perm]

        if weight is None:
            return X, Y

        weight = coeffs * weight + (1 - coeffs) * weight[perm]
        return X, Y, weight

In [9]:
class Net(torch.nn.Module):
    """Waveform classifier: mel spectrogram -> timm backbone -> linear head.

    In training mode each input waveform is split along time into
    ``CONFIG['crop_len'] // TEST_SIZE`` pieces that are pushed through the
    backbone as separate batch items and stitched back together before the
    head. In eval mode the input is used as-is, one prediction per row.

    Args:
        backbone_path: Optional path to a backbone state dict that is loaded
            into the backbone (see ``load_backbone``).
    """

    def __init__(self, backbone_path=None):
        super().__init__()
        self.audio2image = self._init_audio2image()
        self.backbone = self._init_backbone()
        self.load_backbone(backbone_path)
        self.head = self._init_head(self.backbone.feature_info[-1]['num_chs'])
        self.loss = torch.nn.BCEWithLogitsLoss()
        self.mixup = Mixup()

    def forward(self, wav_tensor, y=None):
        """Run the model on a batch of waveforms.

        Args:
            wav_tensor: Waveforms with the batch on dim 0 and time on dim 1.
            y: Optional targets. When given, the loss is computed and, in
                training mode, mixup is applied to the spectrograms and to ``y``.

        Returns:
            Dict with ``'loss'`` (``None`` when ``y`` is None) and ``'logits'``.
            Note that despite its name the ``'logits'`` entry holds the
            sigmoid of the head output, i.e. values in [0, 1].
        """
        if self.training:
            wav_tensor = self.batch_crop(wav_tensor)

        spectrogram = self.audio2image(wav_tensor)
        # Swap the last two dims and insert a singleton channel dim at index 1.
        spectrogram = spectrogram.permute(0, 2, 1)
        spectrogram = spectrogram[:, None, :, :]

        # Mixup needs targets. Calling the model in training mode with the
        # default y=None used to crash inside mixup, so it is skipped then.
        if self.training and y is not None:
            # Bring dim 2 to dim 1 so batch_uncrop can merge the pieces of
            # each original waveform back into one long item before mixing.
            spectrogram = spectrogram.permute(0, 2, 1, 3)
            spectrogram = self.batch_uncrop(spectrogram)

            spectrogram, y = self.mixup(spectrogram, y)

            spectrogram = self.batch_crop(spectrogram)
            spectrogram = spectrogram.permute(0, 2, 1, 3)

        x = self.backbone(spectrogram)
        if self.training:
            # Stitch the per-piece feature maps back into one map per waveform.
            x = x.permute(0, 2, 1, 3)
            x = self.batch_uncrop(x)
            x = x.permute(0, 2, 1, 3)

        logits = self.head(x)

        loss = self.loss(logits, y) if y is not None else None

        return {'loss': loss, 'logits': logits.sigmoid()}

    @staticmethod
    def _crop_factor():
        """Number of pieces a training crop is split into."""
        return int(CONFIG['crop_len'] // TEST_SIZE)

    def batch_crop(self, tensor):
        """Reshape ``(b, t, ...)`` to ``(b * factor, t // factor, ...)``."""
        factor = self._crop_factor()
        b, t = tensor.shape[:2]
        return tensor.reshape(b * factor, t // factor, *tensor.shape[2:])

    def batch_uncrop(self, tensor):
        """Inverse of ``batch_crop``: ``(b, t, ...)`` to ``(b // factor, t * factor, ...)``."""
        factor = self._crop_factor()
        b, t = tensor.shape[:2]
        return tensor.reshape(b // factor, t * factor, *tensor.shape[2:])

    @staticmethod
    def _init_audio2image():
        """Build the waveform -> dB-scaled mel spectrogram transform."""
        mel = ta.transforms.MelSpectrogram(
            # Same value as the previously hard-coded 32000, read from CONFIG
            # so the transform and the dataset padding cannot drift apart.
            sample_rate=CONFIG['sample_rate'],
            n_fft=2048,
            win_length=2048,
            hop_length=512,
            f_min=16,
            # NOTE(review): above sample_rate / 2 (16000 Hz at 32 kHz);
            # left unchanged so the extracted features stay the same.
            f_max=16386,
            pad=0,
            n_mels=256,
            power=2,
            normalized=False,
        )
        db_scale = ta.transforms.AmplitudeToDB(top_db=80.0)
        return torch.nn.Sequential(mel, db_scale)

    @staticmethod
    def _init_backbone():
        """Create a headless, unpooled single-channel resnet18 via timm."""
        return timm.create_model(
            "resnet18",
            pretrained=False,
            num_classes=0,
            global_pool="",
            in_chans=1,
        )

    @staticmethod
    def _init_head(num_chs):
        """Global average pool + linear layer with one output per species."""
        return torch.nn.Sequential(
            torch.nn.AdaptiveAvgPool2d(output_size=1),
            torch.nn.Flatten(),
            torch.nn.Linear(num_chs, len(all_species))
        )

    def load_backbone(self, weights_path=None):
        """Load a backbone state dict from ``weights_path``; no-op if falsy.

        The first convolution's weights are summed over the input-channel dim
        to fit the single-channel backbone, and the classifier entries
        (``fc.*``) are dropped because the backbone is created without one.
        """
        if not weights_path:
            return
        # map_location keeps this working on CPU-only machines even when the
        # weights were saved from a GPU; load_state_dict copies the values in.
        state_dict = torch.load(weights_path, map_location='cpu')
        state_dict['conv1.weight'] = state_dict['conv1.weight'].sum(dim=1, keepdim=True)
        # Tolerate state dicts whose classifier has already been stripped.
        state_dict.pop('fc.bias', None)
        state_dict.pop('fc.weight', None)
        self.backbone.load_state_dict(state_dict)
        

### Model load

In [10]:
# The checkpoint is used below as a ready-to-call module (model.eval(),
# model(batch)), i.e. the file stores a whole pickled model object rather than
# a state_dict. Unpickling needs the classes it was built from to be defined
# already (presumably Net and Mixup from the cells above -- confirm).
# NOTE: torch.load unpickles arbitrary objects; only load trusted checkpoints.
model_path = '../input/birdclefsubmit/7_model.pt'

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# map_location places the restored tensors on the selected device.
model = torch.load(model_path, map_location=device)

### Submit

In [11]:
# Species codes listed in scored_birds.json; only these get submission rows.
with open(os.path.join(data_root, 'scored_birds.json')) as fin:
    test_birds = json.loads(fin.read())

In [12]:
class TestDataset(Dataset):
    """Soundscape files of a folder, each split into fixed-length chunks.

    Every item is a tensor of shape ``(N_CHUNKS, CHUNK_SECONDS * sr)``, where
    ``sr`` is the file's native sample rate: chunk ``k`` holds the samples of
    seconds ``[k * CHUNK_SECONDS, (k + 1) * CHUNK_SECONDS)``.

    Args:
        test_folder: Directory that is scanned (non-recursively) for ``.ogg`` files.
    """

    N_CHUNKS = 12       # chunks returned per file
    CHUNK_SECONDS = 5   # length of one chunk, in seconds

    def __init__(self, test_folder):
        super().__init__()
        self.test_folder = test_folder
        # os.listdir order is arbitrary; sort so item order is reproducible.
        self.fnames = sorted(f for f in os.listdir(test_folder) if f.endswith('.ogg'))

    def __len__(self):
        """Number of ``.ogg`` files found in the folder."""
        return len(self.fnames)

    def __getitem__(self, idx):
        """Load file ``idx`` in full and return it as ``(N_CHUNKS, chunk_len)``.

        Raises:
            AssertionError: If the file holds 65 seconds of audio or more.
        """
        # fpath already includes test_folder; when that folder is absolute,
        # load_wav's own os.path.join leaves the path unchanged.
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        wav, sr = load_wav(fpath, 0, None)
        wav = torch.tensor(wav)
        assert (13 * 5 * sr) > len(wav)

        # Chunk by time, not by dividing the file into N equal parts: equal
        # parts are only CHUNK_SECONDS long when the file lasts exactly
        # N_CHUNKS * CHUNK_SECONDS seconds. Shorter files are zero-padded at
        # the end, longer ones are truncated.
        chunk_len = self.CHUNK_SECONDS * int(sr)
        total_len = self.N_CHUNKS * chunk_len
        wav = wav[:total_len]
        missing = total_len - len(wav)
        if missing > 0:
            wav = torch.nn.functional.pad(wav, (0, missing))
        return wav.reshape((self.N_CHUNKS, chunk_len))

In [13]:
# All soundscape files found under <data_root>/test_soundscapes.
test_dataset = TestDataset(os.path.join(data_root, 'test_soundscapes'))

# One file per batch, in dataset order (no shuffling, nothing dropped), so a
# batch index can be mapped back to test_dataset.fnames.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    num_workers=2,
    shuffle=False,
    drop_last=False,
)

In [14]:
# Build the submission rows: one row per (file, scored bird, chunk) with a
# boolean `target` saying whether that bird's score exceeds the threshold.
pred_list = []
treshold = 0.1
model.eval()

# Running count of files already processed. The file index used to be derived
# as `i * batch_size`, with `batch_size` taken from the *current* batch; that
# is wrong for a smaller final batch whenever the loader's batch size is > 1.
files_done = 0
with torch.no_grad():
    # Wrapping the loader itself (not enumerate(...)) lets tqdm show a total.
    for batch in tqdm(test_dataloader):
        batch_size, part_count, part_size = batch.shape
        # Flatten (file, chunk) into one batch dim so each chunk is scored alone.
        flat_batch = batch.reshape(batch_size * part_count, part_size)
        # The model's 'logits' entry already holds sigmoid outputs.
        pred = model(flat_batch.to(device))['logits']
        pred = pred.cpu().numpy()
        pred = pred > treshold

        for j, chunk_pred in enumerate(pred):
            inbatch_number, chunk_index = divmod(j, part_count)
            fname = test_dataset.fnames[files_done + inbatch_number]
            # Strip only the extension, so names containing extra dots survive.
            prefix = os.path.splitext(fname)[0]
            # Row ids carry the chunk's end time in seconds: 5, 10, ...
            sufix = f'{5 * (chunk_index + 1)}'

            pred_list.extend([{
                'row_id': '_'.join([prefix, b, sufix]),
                'target': chunk_pred[species2id[b]]
            } for b in test_birds])

        files_done += batch_size
pred_pd = pd.DataFrame(pred_list)

1it [00:06,  6.12s/it]


In [15]:
# Write the submission file (without the index column), then display the frame.
pred_pd.to_csv(path_or_buf="submission.csv", index=False)
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
...,...,...
247,soundscape_453028782_omao_60,False
248,soundscape_453028782_puaioh_60,False
249,soundscape_453028782_skylar_60,False
250,soundscape_453028782_warwhe1_60,False
