In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import json
import random
# import colorednoise as cn

import librosa

import torch
import torchaudio as ta
import timm

from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
data_root = '/kaggle/input/birdclef-2022'
train_meta = pd.read_csv('../input/birdclef-data-with-wav-durations/train_metadata_extended.csv')
ebird_taxonomy = pd.read_csv(os.path.join(data_root, 'eBird_Taxonomy_v2021.csv'))

In [3]:
train_meta.loc[:, 'secondary_labels'] = train_meta.secondary_labels.apply(eval)
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [4]:
all_species = sorted(set(train_meta.target_raw.sum()))
species2id = {s: i for i, s in enumerate(all_species)}
id2species = {i: s for i, s in enumerate(all_species)}

train_meta['target'] = train_meta.target_raw.apply(lambda species: [int(s in species) for s in all_species])

In [5]:
def load_wav(fname, offset, duration):
#     fname = 'afrsil1/XC125458.ogg'
    fpath = os.path.join(data_root, 'train_audio', fname)
    wav, sr = librosa.load(fpath, sr=None, duration=duration, offset=offset)
    assert sr <= 32000, sr
    return wav, sr

In [6]:
# %%time
# duration = 30
# sample_rate = 32000

# wav, sr = load_wav('afrsil1/XC125458.ogg', 5, duration)
# to_pad = duration * sample_rate - wav.shape[0]

# if to_pad > 0:
#     wav = np.pad(wav, (0, to_pad))



### Augmentations

In [7]:
class Compose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray, sr):
        for trns in self.transforms:
            y = trns(y, sr)
        return y


class AudioTransform:
    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray, sr):
        if self.always_apply:
            return self.apply(y, sr=sr)
        else:
            if np.random.rand() < self.p:
                return self.apply(y, sr=sr)
            else:
                return y

    def apply(self, y: np.ndarray, **params):
        raise NotImplementedError


class OneOf(Compose):
    # https://github.com/albumentations-team/albumentations/blob/master/albumentations/core/composition.py
    def __init__(self, transforms, p=0.5):
        super().__init__(transforms)
        self.p = p
        transforms_ps = [t.p for t in transforms]
        s = sum(transforms_ps)
        self.transforms_ps = [t / s for t in transforms_ps]

    def __call__(self, y: np.ndarray, sr):
        data = y
        if self.transforms_ps and (random.random() < self.p):
            random_state = np.random.RandomState(random.randint(0, 2 ** 32 - 1))
            t = random_state.choice(self.transforms, p=self.transforms_ps)
            data = t(y, sr)
        return data


class Normalize(AudioTransform):
    def __init__(self, always_apply=False, p=1):
        super().__init__(always_apply, p)

    def apply(self, y: np.ndarray, **params):
        max_vol = np.abs(y).max()
        y_vol = y / max_vol
        assert not np.isnan(y_vol).any(), f'{max_vol}'
        return y_vol


class NewNormalize(AudioTransform):
    def __init__(self, always_apply=False, p=1):
        super().__init__(always_apply, p)

    def apply(self, y: np.ndarray, **params):
        y_mm = y - y.mean()
        return y_mm / y_mm.abs().max()


class NoiseInjection(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5):
        super().__init__(always_apply, p)

        self.noise_level = (0.0, max_noise_level)

    def apply(self, y: np.ndarray, **params):
        noise_level = np.random.uniform(*self.noise_level)
        noise = np.random.randn(len(y))
        augmented = (y + noise * noise_level).astype(y.dtype)
        
        assert not np.isnan(augmented).any(), f'{noise_level}'
        max_vol = np.abs(augmented).max()
        assert max_vol != 0.
        return augmented


class GaussianNoise(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y ** 2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        white_noise = np.random.randn(len(y))
        a_white = np.sqrt(white_noise ** 2).max()
        augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
        
        assert not np.isnan(augmented).any(), f'{a_noise}, {snr}'
        max_vol = np.abs(augmented).max()
        assert max_vol != 0.
        return augmented


# class PinkNoise(AudioTransform):
#     def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20):
#         super().__init__(always_apply, p)

#         self.min_snr = min_snr
#         self.max_snr = max_snr

#     def apply(self, y: np.ndarray, **params):
#         snr = np.random.uniform(self.min_snr, self.max_snr)
#         a_signal = np.sqrt(y ** 2).max()
#         a_noise = a_signal / (10 ** (snr / 20))

#         pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
#         a_pink = np.sqrt(pink_noise ** 2).max()
#         augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
#         return augmented


class PitchShift(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_range=5):
        super().__init__(always_apply, p)
        self.max_range = max_range

    def apply(self, y: np.ndarray, sr, **params):
        n_steps = np.random.randint(-self.max_range, self.max_range)
        augmented = librosa.effects.pitch_shift(y, sr, n_steps)
        return augmented


class TimeStretch(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_rate=1):
        super().__init__(always_apply, p)
        self.max_rate = max_rate

    def apply(self, y: np.ndarray, **params):
        rate = np.random.uniform(0, self.max_rate)
        augmented = librosa.effects.time_stretch(y, rate)
        return augmented


def _db2float(db: float, amplitude=True):
    if amplitude:
        return 10 ** (db / 20)
    else:
        return 10 ** (db / 10)


def volume_down(y: np.ndarray, db: float):
    """
    Low level API for decreasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to decrease
    Returns
    -------
    applied: numpy.ndarray
        audio with decreased volume
    """
    applied = y * _db2float(-db)
    return applied


def volume_up(y: np.ndarray, db: float):
    """
    Low level API for increasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to increase
    Returns
    -------
    applied: numpy.ndarray
        audio with increased volume
    """
    applied = y * _db2float(db)
    return applied


class RandomVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        if db >= 0:
            augmented = volume_up(y, db) 
        else:
            augmented = volume_down(y, db)
        assert not np.isnan(augmented).any(), f'{db}'
        max_vol = np.abs(augmented).max()
        assert max_vol != 0.
        return augmented


class CosineVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
        dbs = _db2float(cosine * db)
        return y * dbs


### Torch Dataset

In [8]:
TEST_SIZE = 5 
CONFIG = {
    'crop_len': 30,
    'sample_rate': 32000,    
}


In [9]:
from torch.utils.data import Dataset, DataLoader

class BirdDataset(Dataset):
    def __init__(self, df, augmentations):
        super().__init__()
        self.df = df
        self.augmentations = augmentations
        
    def __getitem__(self, idx):
        duration = CONFIG['crop_len']
        sample_rate = CONFIG['sample_rate']
        
        fname = self.df.iloc[idx]['filename']
        wav_len = self.df.iloc[idx]['duration']
        
        max_offset = max(0, wav_len - duration)
        random_offset = random.randint(0, max_offset)
                
        wav, sr = load_wav(fname, random_offset, duration)
        to_pad = duration * sample_rate - wav.shape[0]
        if to_pad > 0:
            wav = np.pad(wav, (0, to_pad))
            
        if self.augmentations:
            try:
                wav = self.augmentations(wav, None)
            except ValueError as e:
                print(random_offset)
                raise e
        target = self.df.iloc[idx]['target']
        
        # TODO: add weighting
            
        wav = torch.tensor(wav)
        target = torch.tensor(target, dtype=float)
        return {
            'wav': wav,
            'target': target,
        }

    def __len__(self):
        return len(self.df)

### Model

In [10]:
from torch.distributions import Beta


class Mixup(torch.nn.Module):
    def __init__(self, mix_beta=1):

        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)

    def forward(self, X, Y, weight=None):

        bs = X.shape[0]
        n_dims = len(X.shape)
        perm = torch.randperm(bs)
        coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

        if n_dims == 2:
            X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm]
        elif n_dims == 3:
            X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm]
        else:
            X = coeffs.view(-1, 1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm]

        Y = coeffs.view(-1, 1) * Y + (1 - coeffs.view(-1, 1)) * Y[perm]

        if weight is None:
            return X, Y
        else:
            weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm]
            return X, Y, weight
        
        
class Attention(torch.nn.Module):
    def __init__(self, in_channels, out_channels, activation='linear'):
        super().__init__()
        self.activation = activation
        self.attn = torch.nn.Conv1d(in_channels, out_channels, kernel_size=1)
        self.cla = torch.nn.Conv1d(in_channels, out_channels, kernel_size=1)
        
    def forward(self, x):
        # x: b, c, t
        attn = torch.softmax(torch.tanh(self.attn(x)), dim=-1) # b, c, t
        x = self.cla(x) # b, c, t
        x = torch.sum(x * attn, dim=-1) #b, c
        return x


In [11]:
class Net(torch.nn.Module):
    def __init__(self, backbone_path=None):
        super().__init__()
        self.audio2image = self._init_audio2image()
        self.backbone = self._init_backbone()
        self.load_backbone(backbone_path)
        self.head = self._init_head(self.backbone.feature_info[-1]['num_chs'])      
        self.loss = torch.nn.BCEWithLogitsLoss()
        self.mixup = Mixup()
        
    def forward(self, wav_tensor, y=None):
        # wav_tensor: b, t
        if self.training:
            wav_tensor = self.batch_crop(wav_tensor) # b, t
            
        spectrogram = self.audio2image(wav_tensor) # b, m, t
        spectrogram = spectrogram.permute(0, 2, 1) # b, t, m
        spectrogram = spectrogram[:, None, :, :] # b, c, t, m
        
        if self.training:
            spectrogram = spectrogram.permute(0, 2, 1, 3) # b, t, c, m
            spectrogram = self.batch_uncrop(spectrogram)
            
            spectrogram, y = self.mixup(spectrogram, y)
            
            spectrogram = self.batch_crop(spectrogram)
            spectrogram = spectrogram.permute(0, 2, 1, 3) # b, c, t, m
                
        x = self.backbone(spectrogram) # b, c, t, m
        if self.training:
            x = x.permute(0, 2, 1, 3) # b, t, c, m
            x = self.batch_uncrop(x)
            x = x.permute(0, 2, 1, 3) # b, c, t, m
        
        # average mel axis
        x = torch.mean(x, axis=-1)
                
        logits = self.head(x) # b, n_out
        
        if y is not None:
            loss = self.loss(logits, y)
        else:
            loss = None

        return {'loss': loss, 'logits': logits.sigmoid()}
    
    def batch_crop(self, tensor):
        factor = int(CONFIG['crop_len'] // TEST_SIZE)
        b, t = tensor.shape[:2]
        tensor = tensor.reshape(b * factor, t // factor, *tensor.shape[2:])
        return tensor
    
    def batch_uncrop(self, tensor):
        factor = int(CONFIG['crop_len'] // TEST_SIZE)
        b, t = tensor.shape[:2]
        tensor = tensor.reshape(b // factor, t * factor, *tensor.shape[2:])
        return tensor
    
    @staticmethod
    def _init_audio2image():
        mel = ta.transforms.MelSpectrogram(
            sample_rate=32000,
            n_fft=2048,
            win_length=2048,
            hop_length=512,
            f_min=16,
            f_max=16386,
            pad=0,
            n_mels=256,
            power=2,
            normalized=False,
        )
        db_scale = ta.transforms.AmplitudeToDB(top_db=80.0)
        audio2image = torch.nn.Sequential(mel, db_scale)
        return audio2image
    
    @staticmethod
    def _init_backbone():
        backbone = "resnet18"
        pretrained = False
        pretrained_weights = None
        train = True
        val = False
        in_chans = 1

        backbone = timm.create_model(
            backbone,
            pretrained=pretrained,
            num_classes=0,
            global_pool="",
            in_chans=in_chans,
        )
        return backbone
    
    @staticmethod
    def _init_head(num_chs):
        head = Attention(num_chs, len(all_species), activation='linear')
        return head
    
    def load_backbone(self, weights_path=None):
        if weights_path:
            state_dict=torch.load(weights_path)
            conv1_weight = state_dict['conv1.weight']
            state_dict['conv1.weight'] = conv1_weight.sum(dim=1, keepdim=True)
            state_dict.pop('fc.bias')
            state_dict.pop('fc.weight')
            self.backbone.load_state_dict(state_dict)
        

### Train loop

In [12]:
with open('../input/timm-pretrained-resnet/index.json') as fin:
    timm_index = json.load(fin)
resnet_path = os.path.join('../input/timm-pretrained-resnet/resnet', timm_index['resnet']['resnet18'])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score

In [14]:
train_meta, val_meta = train_test_split(train_meta, test_size=0.2)

In [15]:
model = Net(resnet_path)
train_dataset = BirdDataset(
    train_meta,
    Compose(
                [
                    OneOf(
                        [
                            NoiseInjection(p=1, max_noise_level=0.04),
                            GaussianNoise(p=1, min_snr=5, max_snr=20),
#                             PinkNoise(p=1, min_snr=5, max_snr=20),
                        ],
                        p=0.2,
                    ),
                    RandomVolume(p=0.2, limit=4),
                    Normalize(p=1),
                ]
            )
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=False,
    drop_last=True,
)

val_dataset = BirdDataset(
    val_meta,
    Compose([Normalize(p=1)])
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
    pin_memory=False,
    drop_last=False,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters())

In [16]:
# model_path = '../input/birdclefsubmit/9_model.pt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = torch.load(model_path, map_location=device)
# model.load_state_dict(state_dict)
# model.to(device)

In [17]:
def train_epoch(model, optimizer, dataloader, device):
    tqdm_dataloader = tqdm(dataloader)
    loss_list = []
    model.train()
    for batch in tqdm_dataloader:
        loss = model(batch['wav'].to(device), batch['target'].to(device))['loss']
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
    return loss_list
    

def val_epoch(model, dataloader, device):
    tqdm_dataloader = tqdm(dataloader)
    loss_list = []
    model.eval()
    y_true = None
    y_pred = None
    
    for batch in tqdm_dataloader:
        logits = model(batch['wav'].to(device))['logits']
        batch_target = batch['target'].cpu().numpy()
        batch_pred = logits.cpu().numpy()
        
        if y_true is None:
            y_true = batch_target
            y_pred = batch_pred
        else:
            y_true = np.vstack((y_true, batch_target))
            y_pred = np.vstack((y_pred, batch_pred))
        
    return y_true, y_pred
    

In [18]:
def score_pred(y_true, y_pred, trsh, score_conf):
    score_dict = {}
    for score_f, score_kwargs, score_prefix in score_conf:
        score_dict.update({
            f'{score_prefix}-{t}': score_f(y_true, y_pred > t, **score_kwargs)
            for t in trsh
        })
    return score_dict


import sklearn.metrics

def comp_metric(y_true, y_pred, epsilon=1e-9):
    """ Function to calculate competition metric in an sklearn like fashion

    Args:
        y_true{array-like, sparse matrix} of shape (n_samples, n_outputs)
            - Ground truth (correct) target values.
        y_pred{array-like, sparse matrix} of shape (n_samples, n_outputs)
            - Estimated targets as returned by a classifier.
    Returns:
        The single calculated score representative of this competitions evaluation
    """

    # Get representative confusion matrices for each label
    mlbl_cms = sklearn.metrics.multilabel_confusion_matrix(y_true, y_pred)

    # Get two scores (TP and TN SCORES)
    tp_scores = np.array([
        mlbl_cm[1, 1]/(epsilon+mlbl_cm[:, 1].sum()) \
        for mlbl_cm in mlbl_cms
        ])
    tn_scores = np.array([
        mlbl_cm[0, 0]/(epsilon+mlbl_cm[:, 0].sum()) \
        for mlbl_cm in mlbl_cms
        ])

    # Get average
    tp_mean = tp_scores.mean()
    tn_mean = tn_scores.mean()

    return round((tp_mean+tn_mean)/2, 8)


def balanced_accuracy(pred, target, eps=1e-6):
    tp = (pred * target).sum(axis=-1)
    fn = ((1 - pred) * target).sum(axis=-1)
    fp = (pred * (1 - target)).sum(axis=-1)
    tn = ((1 - pred) * (1 - target)).sum(axis=-1)
    tpr = tp / (tp + fn + eps)
    tnr = tn / (tn + fp + eps)
    return (0.5 * (tpr + tnr)).mean()

In [19]:
score_conf = [
    [f1_score, {'average': 'macro'}, 'f1'],
    [comp_metric, {}, 'comp_metric'],
    [balanced_accuracy, {}, 'balanced_accuracy']
]

In [20]:
import warnings
warnings.filterwarnings('ignore')

epochs = 10
model.to(device)
for e in range(epochs):
    epoch_loss = train_epoch(model, optimizer, train_dataloader, device)
    print(f'{e} train loss:', f'{np.mean(epoch_loss):.3f}', sep='\t')
    with torch.no_grad():
        y_true, y_pred = val_epoch(model, val_dataloader, device)
    score_dict = score_pred(
        y_true, y_pred,
        score_conf=score_conf,
        trsh={0.1, 0.2, 0.3, 0.4}
    )
    torch.save(model, f'{e}_model.pt')
    print(f'{e} val scores:')
    print(*[
        f'\t{case}: {case_score}' 
        for case, case_score in score_dict.items()
    ], sep='\n')


100%|██████████| 185/185 [11:53<00:00,  3.86s/it]


0 train loss:	0.050


100%|██████████| 47/47 [02:34<00:00,  3.29s/it]


0 val scores:
	f1-0.1: 0.005904889839201814
	f1-0.4: 0.0017804412020149384
	f1-0.3: 0.001776466337468385
	f1-0.2: 0.003249090787903309
	comp_metric-0.1: 0.49992994
	comp_metric-0.4: 0.49711071
	comp_metric-0.3: 0.49699692
	comp_metric-0.2: 0.50027031
	balanced_accuracy-0.1: 0.5107905110834818
	balanced_accuracy-0.4: 0.5032661244521772
	balanced_accuracy-0.3: 0.5062301021627575
	balanced_accuracy-0.2: 0.5085963884695998


100%|██████████| 185/185 [11:19<00:00,  3.67s/it]


1 train loss:	0.037


100%|██████████| 47/47 [02:22<00:00,  3.02s/it]


1 val scores:
	f1-0.1: 0.027033849122260326
	f1-0.4: 0.018142708333145227
	f1-0.3: 0.018785164083193882
	f1-0.2: 0.02127612074688056
	comp_metric-0.1: 0.51960961
	comp_metric-0.4: 0.51333256
	comp_metric-0.3: 0.51167081
	comp_metric-0.2: 0.51407679
	balanced_accuracy-0.1: 0.5583332673519819
	balanced_accuracy-0.4: 0.547072081622852
	balanced_accuracy-0.3: 0.5503982153673509
	balanced_accuracy-0.2: 0.5559929552864539


100%|██████████| 185/185 [11:22<00:00,  3.69s/it]


2 train loss:	0.034


100%|██████████| 47/47 [02:23<00:00,  3.06s/it]


2 val scores:
	f1-0.1: 0.1121819042128423
	f1-0.4: 0.08285671873387351
	f1-0.3: 0.09285454776759769
	f1-0.2: 0.10431258612121645
	comp_metric-0.1: 0.56295326
	comp_metric-0.4: 0.577455
	comp_metric-0.3: 0.57290245
	comp_metric-0.2: 0.57305623
	balanced_accuracy-0.1: 0.6493013198954362
	balanced_accuracy-0.4: 0.6111653594542074
	balanced_accuracy-0.3: 0.6271443706163543
	balanced_accuracy-0.2: 0.6439986889530447


100%|██████████| 185/185 [11:24<00:00,  3.70s/it]


3 train loss:	0.030


100%|██████████| 47/47 [02:26<00:00,  3.11s/it]


3 val scores:
	f1-0.1: 0.1805224451807214
	f1-0.4: 0.13784254354636327
	f1-0.3: 0.15094806474850322
	f1-0.2: 0.16990522830381574
	comp_metric-0.1: 0.59674168
	comp_metric-0.4: 0.61582874
	comp_metric-0.3: 0.60891723
	comp_metric-0.2: 0.6071952
	balanced_accuracy-0.1: 0.7056628896968791
	balanced_accuracy-0.4: 0.6800911653901802
	balanced_accuracy-0.3: 0.696776290684886
	balanced_accuracy-0.2: 0.7111233986416694


100%|██████████| 185/185 [11:18<00:00,  3.67s/it]


4 train loss:	0.028


100%|██████████| 47/47 [02:26<00:00,  3.12s/it]


4 val scores:
	f1-0.1: 0.21796864406938593
	f1-0.4: 0.17121325017970193
	f1-0.3: 0.19152267854548644
	f1-0.2: 0.20835214615415493
	comp_metric-0.1: 0.61504775
	comp_metric-0.4: 0.63582148
	comp_metric-0.3: 0.64806704
	comp_metric-0.2: 0.63856206
	balanced_accuracy-0.1: 0.7167532414179364
	balanced_accuracy-0.4: 0.6955858494654591
	balanced_accuracy-0.3: 0.712571551604692
	balanced_accuracy-0.2: 0.7234849518396546


100%|██████████| 185/185 [11:20<00:00,  3.68s/it]


5 train loss:	0.027


100%|██████████| 47/47 [02:23<00:00,  3.06s/it]


5 val scores:
	f1-0.1: 0.2512657401563303
	f1-0.4: 0.2295542598515118
	f1-0.3: 0.24077940008037244
	f1-0.2: 0.2524350526209989
	comp_metric-0.1: 0.62837303
	comp_metric-0.4: 0.66180966
	comp_metric-0.3: 0.65216606
	comp_metric-0.2: 0.6468649
	balanced_accuracy-0.1: 0.7431702782594599
	balanced_accuracy-0.4: 0.7484570849052739
	balanced_accuracy-0.3: 0.7586157569023497
	balanced_accuracy-0.2: 0.7613733541225814


100%|██████████| 185/185 [11:14<00:00,  3.64s/it]


6 train loss:	0.026


100%|██████████| 47/47 [02:26<00:00,  3.12s/it]


6 val scores:
	f1-0.1: 0.24064872478643645
	f1-0.4: 0.20252232090717004
	f1-0.3: 0.21019003437237127
	f1-0.2: 0.2305656530894011
	comp_metric-0.1: 0.64080417
	comp_metric-0.4: 0.66294881
	comp_metric-0.3: 0.66479199
	comp_metric-0.2: 0.65238864
	balanced_accuracy-0.1: 0.7053768718732224
	balanced_accuracy-0.4: 0.6991037675687346
	balanced_accuracy-0.3: 0.7092935871433989
	balanced_accuracy-0.2: 0.717830680492405


100%|██████████| 185/185 [11:21<00:00,  3.68s/it]


7 train loss:	0.025


100%|██████████| 47/47 [02:25<00:00,  3.11s/it]


7 val scores:
	f1-0.1: 0.30592284734445646
	f1-0.4: 0.28266418076260075
	f1-0.3: 0.29363336829398834
	f1-0.2: 0.3023420984490839
	comp_metric-0.1: 0.65673965
	comp_metric-0.4: 0.70757947
	comp_metric-0.3: 0.69443038
	comp_metric-0.2: 0.68013074
	balanced_accuracy-0.1: 0.7622138394960898
	balanced_accuracy-0.4: 0.7674638866863177
	balanced_accuracy-0.3: 0.7785961626105657
	balanced_accuracy-0.2: 0.7840385362357434


100%|██████████| 185/185 [11:22<00:00,  3.69s/it]


8 train loss:	0.024


100%|██████████| 47/47 [02:23<00:00,  3.06s/it]


8 val scores:
	f1-0.1: 0.33017603764459863
	f1-0.4: 0.3078971937409136
	f1-0.3: 0.31711207128270624
	f1-0.2: 0.3375207065504641
	comp_metric-0.1: 0.67315883
	comp_metric-0.4: 0.71240481
	comp_metric-0.3: 0.70630272
	comp_metric-0.2: 0.70820667
	balanced_accuracy-0.1: 0.7685383282485523
	balanced_accuracy-0.4: 0.7784249252305927
	balanced_accuracy-0.3: 0.7870844212988631
	balanced_accuracy-0.2: 0.7918554554072144


100%|██████████| 185/185 [11:13<00:00,  3.64s/it]


9 train loss:	0.023


100%|██████████| 47/47 [02:24<00:00,  3.07s/it]


9 val scores:
	f1-0.1: 0.33363933206155616
	f1-0.4: 0.3235419526388814
	f1-0.3: 0.32722256951817913
	f1-0.2: 0.3384155114477694
	comp_metric-0.1: 0.6753217
	comp_metric-0.4: 0.7328664
	comp_metric-0.3: 0.71912892
	comp_metric-0.2: 0.70110392
	balanced_accuracy-0.1: 0.7568669119773269
	balanced_accuracy-0.4: 0.7752561282011735
	balanced_accuracy-0.3: 0.7769569969592962
	balanced_accuracy-0.2: 0.779585657915643


In [21]:
score_pred(
        y_true, y_pred,
        score_conf=score_conf,
        trsh={0.1, 0.15, 0.2, 0.25, 0.3}
    )

{'f1-0.1': 0.33363933206155616,
 'f1-0.25': 0.33730968713714216,
 'f1-0.3': 0.32722256951817913,
 'f1-0.2': 0.3384155114477694,
 'f1-0.15': 0.33879490542361324,
 'comp_metric-0.1': 0.6753217,
 'comp_metric-0.25': 0.71384632,
 'comp_metric-0.3': 0.71912892,
 'comp_metric-0.2': 0.70110392,
 'comp_metric-0.15': 0.68914757,
 'balanced_accuracy-0.1': 0.7568669119773269,
 'balanced_accuracy-0.25': 0.7799686526861095,
 'balanced_accuracy-0.3': 0.7769569969592962,
 'balanced_accuracy-0.2': 0.779585657915643,
 'balanced_accuracy-0.15': 0.7731807205998932}

### Submit

In [22]:
with open(os.path.join(data_root, 'scored_birds.json')) as fin:
    test_birds = json.load(fin)

In [23]:
class TestDataset(Dataset):
    def __init__(self, test_folder):
        super().__init__()
        self.test_folder = test_folder
        self.fnames = [f for f in os.listdir(test_folder) if f.endswith('.ogg')]
        
    def __len__(self):
        return len(self.fnames)
    
    def __getitem__(self, idx):
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        wav, sr = load_wav(fpath, 0, None)
        wav = torch.tensor(wav)
        assert (13 * 5 * sr) > len(wav) 
        wav = wav[:len(wav) // 12 * 12].reshape((12, len(wav) // 12))
        return wav

In [24]:
test_dataset = TestDataset(os.path.join(data_root, 'test_soundscapes'))
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    drop_last=False,   
)

In [25]:
pred_list = []
treshold = 0.1
model.eval()
with torch.no_grad():
    for i, batch in tqdm(enumerate(test_dataloader)):
        batch_size, part_count, part_size = batch.shape
        batch = batch.reshape(batch_size * part_count, part_size)
        pred = model(batch.to(device))['logits']
        pred = pred.cpu().numpy()
        pred = pred > treshold
        
        for j, chunk_pred in enumerate(pred):
            inbatch_number = j // part_count
            chunk_number = j % part_count + 1
            f_idx = i * batch_size + inbatch_number
            fname = test_dataset.fnames[f_idx]
            prefix = fname.split('.')[0]
            sufix = f'{5 * chunk_number}'
            
            pred_list.extend([{
                'row_id': '_'.join([prefix, b, sufix]),
                'target': chunk_pred[species2id[b]]
            } for b in test_birds])
pred_pd = pd.DataFrame(pred_list)

1it [00:00,  6.18it/s]


In [26]:
pred_pd.to_csv("submission.csv", index=False)
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
...,...,...
247,soundscape_453028782_omao_60,False
248,soundscape_453028782_puaioh_60,False
249,soundscape_453028782_skylar_60,False
250,soundscape_453028782_warwhe1_60,False


In [27]:
# def chunk_wav(wav, sr, window_size):
#     chunks_count = len(wav) // (window_size * sr)
#     chunk_size = window_size * sr
#     chunks = []
#     for chunk_idx in range(chunks_count):
#         left = chunk_idx * sr
#         right = min(left + chunk_size, len(wav))
#         chunks.append(wav[left:right])
#     chunk_tensor = torch.tensor(chunks)
#     return chunk_tensor

# file_list = os.listdir(os.path.join(data_root, 'test_soundscapes'))

# # This is where we will store our results
# treshold = 0.5
# pred = {'row_id': [], 'target': []}
# model.eval()
# with torch.no_grad():
#     # Process audio files and make predictions
#     for fname in file_list:
#         prefix = fname.split('.')[0]
#         # Complete file path
#         fpath = os.path.join(data_root, 'test_soundscapes', fname)
#         wav, sr = load_wav(fpath, 0, None)
#         chunk_tensor = chunk_wav(wav, sr, window_size=5)

#         # Open file with librosa and split signal into 5-second chunks
#         # sig, rate = librosa.load(path)
#         # ...

#         # Let's assume we have a list of 12 audio chunks (1min / 5s == 12 segments)
#         chunk_score = model(chunk_tensor.to(device))['logits'].cpu().numpy()

#         # Make prediction for each chunk
#         # Each scored bird gets a random value in our case
#         # since we don't actually have a model
#         for i, all_score in enumerate(chunk_score):        
#             chunk_end_time = (i + 1) * 5
#             for bird in test_birds:

#                 # This is our random prediction score for this bird
#                 bird_score = all_score[species2id[bird]]

#                 # Assemble the row_id which we need to do for each scored bird
#                 row_id = prefix + '_' + bird + '_' + str(chunk_end_time)

#                 # Put the result into our prediction dict and
#                 # apply a "confidence" threshold of 0.5
#                 pred['row_id'].append(row_id)
#                 pred['target'].append(True if bird_score > treshold else False)


# # Make a new data frame and look at some results        
# results = pd.DataFrame(pred, columns = ['row_id', 'target'])

# # Quick sanity check
# print(results.head()) 
    
# # Convert our results to csv
# results.to_csv("submission.csv", index=False)    

In [28]:
# test_fnames = [f for f in os.listdir(f'{data_root}/test_soundscapes') if f.endswith('.ogg')]
# test_pd = []
# for fname in test_fnames:
#     fpath = os.path.join(data_root, 'test_soundscapes', fname)
#     wav, sr = librosa.load(fpath, sr=None)
#     prefix = fname.split('.')[0]
#     window_size = 5 * sr
#     for i, chunk in enumerate(wav[::window_size]):
#         end_time = (i + 1) * 5
#         samples = [{
#             'row_id': f'{prefix}_{b}_{end_time}',
#             'file_id': fname,
#             'bird': b,
#             'end_time': end_time
#         } for b in test_birds]
#         test_pd.extend(samples)
        
# test_pd = pd.DataFrame(test_pd)
# test_pd