In [1]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/birdclefmodels/')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa

from matplotlib import pyplot as plt
import json
import random
from tqdm import tqdm
import os
import yaml

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
from torch import nn
import torchaudio as ta
import timm
from torch.utils.data import Dataset, DataLoader

import neural_network

In [2]:
data_root = '/kaggle/input/birdclef-2022'
train_meta = pd.read_csv('../input/birdclef-data-with-wav-durations/train_metadata_extended.csv')
ebird_taxonomy = pd.read_csv(os.path.join(data_root, 'eBird_Taxonomy_v2021.csv'))

In [3]:
train_meta.loc[:, 'secondary_labels'] = train_meta.secondary_labels.apply(eval)
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [4]:
all_species = sorted(set(train_meta.target_raw.sum()))
species2id = {s: i for i, s in enumerate(all_species)}
id2species = {i: s for i, s in enumerate(all_species)}

train_meta['target'] = train_meta.target_raw.apply(lambda species: [int(s in species) for s in all_species])

In [5]:
def load_wav(fname, offset, duration):
#     fname = 'afrsil1/XC125458.ogg'
    fpath = os.path.join(data_root, 'train_audio', fname)
    wav, sr = librosa.load(fpath, sr=None, duration=duration)
    assert sr <= 32000, sr
    return wav, sr

### No call

In [6]:
class MelSpecComputer:
    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        kwargs["n_fft"] = kwargs.get("n_fft", self.sr//10)
        kwargs["hop_length"] = kwargs.get("hop_length", self.sr//(10*4))
        self.kwargs = kwargs

    def __call__(self, y):

        melspec = librosa.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, **self.kwargs,
        )

        melspec = librosa.power_to_db(melspec).astype(np.float32)
        return melspec
    
    
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)
    
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length):
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length - len(y))])        
    elif len(y) > length:
        y = y[:length]
    return y


class NoCallDataset(Dataset):
    def __init__(self, test_folder):
        super().__init__()
        self.test_folder = test_folder
        self.fnames = [f for f in os.listdir(test_folder) if f.endswith('.ogg')]
        self.mel_spectrogram = MelSpecComputer(
            sr=32000,
            n_mels=128,
            fmin=0,
            fmax=16000
        )
        self.augmentations = A.Compose([
            A.Resize(128, 281),
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
            ToTensorV2(),
        ])
        
    def audio2image(self, wav):
        melspec = self.mel_spectrogram(wav) 
        image = mono_to_color(melspec)
        image = np.stack((image,)*3, -1)
        image = self.augmentations(image=image)['image']
        return image
        
    def __len__(self):
        return len(self.fnames)
    
    def __getitem__(self, idx):
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        wav, sr = load_wav(fpath, 0, None)
        wav = wav[:sr * 60]
        spectrograms = []
        crop_size = 5 * sr
        model_crop_size = 10 * sr
        for left in range(0, 60, 5):
            left = left * sr
            right = left + crop_size
            crop = wav[left:right]
            crop = crop_or_pad(crop, model_crop_size)
            spectrograms.append(self.audio2image(crop))
        spectrograms = torch.stack(spectrograms) 
        return spectrograms
    
    
class CustomResNext(nn.Module):
    def __init__(self, model_name='resnext50_32x4d'):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=False)
        n_features = self.model.fc.in_features
        self.model.fc = nn.Linear(n_features, 2)

    def forward(self, x):
        x = self.model(x)
        return x
    


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

nocall_model = CustomResNext()
nocall_model.to(device)

state_dict = torch.load('../input/nocall-classifier/resnext50_32x4d_best.pth', map_location=device)
nocall_model.load_state_dict(state_dict['model'])

nocall_dataset = NoCallDataset(os.path.join(data_root, 'test_soundscapes'))
nocall_dataloader = DataLoader(
    nocall_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    drop_last=False,   
)

In [8]:
nocall_list = []
part_count = 12

with torch.no_grad():
    for i, batch in tqdm(enumerate(nocall_dataloader)):
        batch = batch[0]
        pred = nocall_model(batch.to(device))
        pred = torch.softmax(pred, dim=1)
        pred = pred.cpu().numpy()[:, 0] < 0.9
        
        for j, chunk_pred in enumerate(pred):
            fname = nocall_dataset.fnames[i]
            prefix = fname.split('.')[0]
            
            nocall_list.append({
                'fname': fname,
                'r_sec': 5 * (j + 1),
                'iscall': chunk_pred
            })
nocall_pd = pd.DataFrame(nocall_list)

1it [00:07,  7.39s/it]


### Model load

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def init_model(config_fname, model_fname, model_root):
    config_path = os.path.join(model_root, config_fname)
    model_path = os.path.join(model_root, model_fname)
    
    with open(config_path) as fin:
        config = yaml.safe_load(fin)

    model_config = config['model']
    if 'backbone_config' in model_config['params']:
        model_config['params']['backbone_config']['pretrained'] = False
    else:
        model_config['params']['backbone_config'] = {'pretrained': False}
    if 'model_name' in model_config['params']['backbone_config']:
        model_config['params']['backbone_config'].pop('model_name')
        
    data_config = config['data']
    model_class = NN_CATALOG[model_config['name']]

    model = model_class(len(all_species), int(data_config['crop_len'] // data_config['test_wav_len']),
                        **model_config['params'])
    model.to(device)

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    return model


In [10]:
from neural_network import NN_CATALOG


ckpt_root = '../input/birdclefsubmit'
eff_path = ('eff/baseline_config.yaml', 'eff/final-model.pt')
baseline_path = ('baseline/baseline_config.yaml', 'baseline/final-model.pt')
eff_bt_path = [('eff_crop/baseline_config.yaml', f'eff_crop/main_stage-{i}-model.pt') for i in range(9, 11)]
path_list = [
    eff_path,
    baseline_path,
    *eff_bt_path,
]

models = [init_model(*p, ckpt_root) for p in path_list]
weights = None


In [11]:
class Blending(nn.Module):
    def __init__(self, models, weights):
        super().__init__()
        for m in models:
            m.eval()
        self.models = models
        self.weights = weights or [1 / len(models) for _ in models]
        
    def forward(self, wav_tensor):
        pred = [m(wav_tensor)['logits'] for m in self.models]
        pred = sum(p * w for p, w in zip(pred, self.weights))
        return pred

In [12]:
model = Blending(models, weights)

### Torch Dataset

In [13]:
with open(os.path.join(data_root, 'scored_birds.json')) as fin:
    test_birds = json.load(fin)

In [14]:
class Compose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray, sr):
        for trns in self.transforms:
            y = trns(y, sr)
        return y
    
    
class AudioTransform:
    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray, sr):
        if self.always_apply:
            return self.apply(y, sr=sr)
        else:
            if np.random.rand() < self.p:
                return self.apply(y, sr=sr)
            else:
                return y

    def apply(self, y: np.ndarray, **params):
        raise NotImplementedError
        
        
class Normalize(AudioTransform):
    def __init__(self, always_apply=False, p=1):
        super().__init__(always_apply, p)

    def apply(self, y: np.ndarray, **params):
        max_vol = np.abs(y).max()
        y_vol = y / max_vol
        assert not np.isnan(y_vol).any(), f'{max_vol}'
        return y_vol

In [15]:
class TestDataset(Dataset):
    def __init__(self, test_folder, augmentations=None):
        super().__init__()
        self.test_folder = test_folder
        self.fnames = [f for f in os.listdir(test_folder) if f.endswith('.ogg')]
        self.augmentations = augmentations
        
    def __len__(self):
        return len(self.fnames)
    
    def __getitem__(self, idx):
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        wav, sr = load_wav(fpath, 0, None)
        if self.augmentations:
            wav = self.augmentations(wav, None)
        wav = torch.tensor(wav)
        assert (13 * 5 * sr) > len(wav) 
        wav = wav[:len(wav) // 12 * 12].reshape((12, len(wav) // 12))
        return wav

In [16]:
test_dataset = TestDataset(
    os.path.join(data_root, 'test_soundscapes'),
    Compose([Normalize(p=1)])
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    drop_last=False,   
)

In [17]:
def find_topk(pred_proba, max_birds=5):
    pred_proba = pred_proba[:, [species2id[b] for b in test_birds]]
    mean_proba = pred_proba.mean(axis=0)
    topk_birds = [i for i, _ in sorted(enumerate(mean_proba),
                                       key=lambda x: x[1],
                                       reverse=True)][:max_birds]
    return topk_birds

In [18]:
treshold_dict = {'akiapo': 0.05,
 'aniani': 0.05,
 'apapan': 0.05,
 'barpet': 0.05,
 'crehon': 0.05,
 'elepai': 0.05,
 'ercfra': 0.05,
 'hawama': 0.05,
 'hawcre': 0.05,
 'hawgoo': 0.15,
 'hawhaw': 0.05,
 'hawpet1': 0.05,
 'houfin': 0.15,
 'iiwi': 0.15,
 'jabwar': 0.05,
 'maupar': 0.05,
 'omao': 0.05,
 'puaioh': 0.05,
 'skylar': 0.1,
 'warwhe1': 0.05,
 'yefcan': 0.15}

In [19]:
nocall_pd

Unnamed: 0,fname,r_sec,iscall
0,soundscape_453028782.ogg,5,False
1,soundscape_453028782.ogg,10,False
2,soundscape_453028782.ogg,15,False
3,soundscape_453028782.ogg,20,False
4,soundscape_453028782.ogg,25,False
5,soundscape_453028782.ogg,30,True
6,soundscape_453028782.ogg,35,True
7,soundscape_453028782.ogg,40,False
8,soundscape_453028782.ogg,45,True
9,soundscape_453028782.ogg,50,False


In [20]:
pred_list = []
treshold = 0.1
with torch.no_grad():
    for i, batch in tqdm(enumerate(test_dataloader)):
        batch_size, part_count, part_size = batch.shape
        batch = batch.reshape(batch_size * part_count, part_size)
        pred = model(batch.to(device))
        pred = pred.cpu().numpy()
#         topk_birds = find_topk(pred_proba, max_birds=10)
        
        for j, chunk_pred in enumerate(pred):
            inbatch_number = j // part_count
            chunk_number = j % part_count + 1
            f_idx = i * batch_size + inbatch_number
            fname = test_dataset.fnames[f_idx]
            prefix = fname.split('.')[0]
            sufix = f'{5 * chunk_number}'
            nocall = nocall_pd[(nocall_pd.fname == fname) & (nocall_pd.r_sec == 5 * chunk_number)]['iscall'].values[0]
            
            pred_list.extend([{
                'row_id': '_'.join([prefix, b, sufix]),
                'target': (chunk_pred[species2id[b]] > treshold_dict[b]) and nocall # if species2id[b] in topk_birds else False
            } for b in test_birds])
pred_pd = pd.DataFrame(pred_list)

1it [00:00,  1.41it/s]


In [21]:
pred_pd.to_csv("submission.csv", index=False)
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
...,...,...
247,soundscape_453028782_omao_60,False
248,soundscape_453028782_puaioh_60,False
249,soundscape_453028782_skylar_60,False
250,soundscape_453028782_warwhe1_60,False
