In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import json
import random

import librosa

import torch
import torchaudio as ta
import timm

from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
data_root = '/kaggle/input/birdclef-2022'
train_meta = pd.read_csv('../input/birdclef-data-with-wav-durations/train_metadata_extended.csv')
ebird_taxonomy = pd.read_csv(os.path.join(data_root, 'eBird_Taxonomy_v2021.csv'))

In [3]:
train_meta.loc[:, 'secondary_labels'] = train_meta.secondary_labels.apply(eval)
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [4]:
all_species = sorted(set(train_meta.target_raw.sum()))
species2id = {s: i for i, s in enumerate(all_species)}
id2species = {i: s for i, s in enumerate(all_species)}

train_meta['target'] = train_meta.target_raw.apply(lambda species: [int(s in species) for s in all_species])

In [5]:
def load_wav(fname, offset, duration):
#     fname = 'afrsil1/XC125458.ogg'
    fpath = os.path.join(data_root, 'train_audio', fname)
    wav, sr = librosa.load(fpath, sr=None, duration=duration, offset=offset)
    assert sr <= 32000, sr
    return wav, sr

In [6]:
# %%time
# duration = 30
# sample_rate = 32000

# wav, sr = load_wav('afrsil1/XC125458.ogg', 5, duration)
# to_pad = duration * sample_rate - wav.shape[0]

# if to_pad > 0:
#     wav = np.pad(wav, (0, to_pad))



### Torch Dataset

In [7]:
TEST_SIZE = 5 
CONFIG = {
    'crop_len': 30,
    'sample_rate': 32000,    
}


In [8]:
from torch.utils.data import Dataset, DataLoader

class BirdDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
        
    def __getitem__(self, idx):
        duration = CONFIG['crop_len']
        sample_rate = CONFIG['sample_rate']
        
        fname = self.df.iloc[idx]['filename']
        wav_len = train_meta.iloc[0]['duration']
        
        max_offset = max(0, wav_len - duration)
        random_offset = random.randint(0, max_offset)
                
        wav, sr = load_wav(fname, random_offset, duration)
        to_pad = duration * sample_rate - wav.shape[0]
        if to_pad > 0:
            wav = np.pad(wav, (0, to_pad))
            
        target = self.df.iloc[idx]['target']
        
        # TODO: add weighting
            
        wav = torch.tensor(wav)
        target = torch.tensor(target, dtype=float)
        return {
            'wav': wav,
            'target': target,
        }

    def __len__(self):
        return len(self.df)

### Model

In [9]:
from torch.distributions import Beta


class Mixup(torch.nn.Module):
    def __init__(self, mix_beta=1):

        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)

    def forward(self, X, Y, weight=None):

        bs = X.shape[0]
        n_dims = len(X.shape)
        perm = torch.randperm(bs)
        coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

        if n_dims == 2:
            X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm]
        elif n_dims == 3:
            X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm]
        else:
            X = coeffs.view(-1, 1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm]

        Y = coeffs.view(-1, 1) * Y + (1 - coeffs.view(-1, 1)) * Y[perm]

        if weight is None:
            return X, Y
        else:
            weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm]
            return X, Y, weight
        
        
class Attention(torch.nn.Module):
    def __init__(self, in_channels, out_channels, activation='linear'):
        super().__init__()
        self.activation = activation
        self.attn = torch.nn.Conv1d(in_channels, out_channels, kernel_size=1)
        self.cla = torch.nn.Conv1d(in_channels, out_channels, kernel_size=1)
        
    def forward(self, x):
        # x: b, c, t
        attn = torch.softmax(torch.tanh(self.attn(x)), dim=-1) # b, c, t
        x = self.cla(x) # b, c, t
        x = torch.sum(x * attn, dim=-1) #b, c
        return x


In [10]:
class Net(torch.nn.Module):
    def __init__(self, backbone_path=None):
        super().__init__()
        self.audio2image = self._init_audio2image()
        self.backbone = self._init_backbone()
        self.load_backbone(backbone_path)
        self.head = self._init_head(self.backbone.feature_info[-1]['num_chs'])      
        self.loss = torch.nn.BCEWithLogitsLoss()
        self.mixup = Mixup()
        
    def forward(self, wav_tensor, y=None):
        # wav_tensor: b, t
        if self.training:
            wav_tensor = self.batch_crop(wav_tensor) # b, t
            
        spectrogram = self.audio2image(wav_tensor) # b, m, t
        spectrogram = spectrogram.permute(0, 2, 1) # b, t, m
        spectrogram = spectrogram[:, None, :, :] # b, c, t, m
        
        if self.training:
            spectrogram = spectrogram.permute(0, 2, 1, 3) # b, t, c, m
            spectrogram = self.batch_uncrop(spectrogram)
            
            spectrogram, y = self.mixup(spectrogram, y)
            
            spectrogram = self.batch_crop(spectrogram)
            spectrogram = spectrogram.permute(0, 2, 1, 3) # b, c, t, m
                
        x = self.backbone(spectrogram) # b, c, t, m
        if self.training:
            x = x.permute(0, 2, 1, 3) # b, t, c, m
            x = self.batch_uncrop(x)
            x = x.permute(0, 2, 1, 3) # b, c, t, m
        
        # average mel axis
        x = torch.mean(x, axis=-1)
                
        logits = self.head(x) # b, n_out
        
        if y is not None:
            loss = self.loss(logits, y)
        else:
            loss = None

        return {'loss': loss, 'logits': logits.sigmoid()}
    
    def batch_crop(self, tensor):
        factor = int(CONFIG['crop_len'] // TEST_SIZE)
        b, t = tensor.shape[:2]
        tensor = tensor.reshape(b * factor, t // factor, *tensor.shape[2:])
        return tensor
    
    def batch_uncrop(self, tensor):
        factor = int(CONFIG['crop_len'] // TEST_SIZE)
        b, t = tensor.shape[:2]
        tensor = tensor.reshape(b // factor, t * factor, *tensor.shape[2:])
        return tensor
    
    @staticmethod
    def _init_audio2image():
        mel = ta.transforms.MelSpectrogram(
            sample_rate=32000,
            n_fft=2048,
            win_length=2048,
            hop_length=512,
            f_min=16,
            f_max=16386,
            pad=0,
            n_mels=256,
            power=2,
            normalized=False,
        )
        db_scale = ta.transforms.AmplitudeToDB(top_db=80.0)
        audio2image = torch.nn.Sequential(mel, db_scale)
        return audio2image
    
    @staticmethod
    def _init_backbone():
        backbone = "resnet18"
        pretrained = False
        pretrained_weights = None
        train = True
        val = False
        in_chans = 1

        backbone = timm.create_model(
            backbone,
            pretrained=pretrained,
            num_classes=0,
            global_pool="",
            in_chans=in_chans,
        )
        return backbone
    
    @staticmethod
    def _init_head(num_chs):
        head = Attention(num_chs, len(all_species), activation='linear')
        return head
    
    def load_backbone(self, weights_path=None):
        if weights_path:
            state_dict=torch.load(weights_path)
            conv1_weight = state_dict['conv1.weight']
            state_dict['conv1.weight'] = conv1_weight.sum(dim=1, keepdim=True)
            state_dict.pop('fc.bias')
            state_dict.pop('fc.weight')
            self.backbone.load_state_dict(state_dict)
        

### Train loop

In [11]:
with open('../input/timm-pretrained-resnet/index.json') as fin:
    timm_index = json.load(fin)
resnet_path = os.path.join('../input/timm-pretrained-resnet/resnet', timm_index['resnet']['resnet18'])

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score

In [13]:
train_meta, val_meta = train_test_split(train_meta, test_size=0.2)

In [14]:
model = Net(resnet_path)
train_dataset = BirdDataset(train_meta)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=False,
    drop_last=True,
)

val_dataset = BirdDataset(val_meta)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
    pin_memory=False,
    drop_last=False,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters())

In [15]:
def train_epoch(model, optimizer, dataloader, device):
    tqdm_dataloader = tqdm(dataloader)
    loss_list = []
    model.train()
    for batch in tqdm_dataloader:
        loss = model(batch['wav'].to(device), batch['target'].to(device))['loss']
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
    return loss_list
    

def val_epoch(model, dataloader, device):
    tqdm_dataloader = tqdm(dataloader)
    loss_list = []
    model.eval()
    y_true = None
    y_pred = None
    
    for batch in tqdm_dataloader:
        logits = model(batch['wav'].to(device))['logits']
        batch_target = batch['target'].cpu().numpy()
        batch_pred = logits.cpu().numpy()
        
        if y_true is None:
            y_true = batch_target
            y_pred = batch_pred
        else:
            y_true = np.vstack((y_true, batch_target))
            y_pred = np.vstack((y_pred, batch_pred))
        
    return y_true, y_pred
    

In [16]:
def score_pred(y_true, y_pred, trsh, score_conf):
    score_dict = {}
    for score_f, score_kwargs, score_prefix in score_conf:
        score_dict.update({
            f'{score_prefix}-{t}': score_f(y_true, y_pred > t, **score_kwargs)
            for t in trsh
        })
    return score_dict


import sklearn.metrics

def comp_metric(y_true, y_pred, epsilon=1e-9):
    """ Function to calculate competition metric in an sklearn like fashion

    Args:
        y_true{array-like, sparse matrix} of shape (n_samples, n_outputs)
            - Ground truth (correct) target values.
        y_pred{array-like, sparse matrix} of shape (n_samples, n_outputs)
            - Estimated targets as returned by a classifier.
    Returns:
        The single calculated score representative of this competitions evaluation
    """

    # Get representative confusion matrices for each label
    mlbl_cms = sklearn.metrics.multilabel_confusion_matrix(y_true, y_pred)

    # Get two scores (TP and TN SCORES)
    tp_scores = np.array([
        mlbl_cm[1, 1]/(epsilon+mlbl_cm[:, 1].sum()) \
        for mlbl_cm in mlbl_cms
        ])
    tn_scores = np.array([
        mlbl_cm[0, 0]/(epsilon+mlbl_cm[:, 0].sum()) \
        for mlbl_cm in mlbl_cms
        ])

    # Get average
    tp_mean = tp_scores.mean()
    tn_mean = tn_scores.mean()

    return round((tp_mean+tn_mean)/2, 8)


def balanced_accuracy(pred, target, eps=1e-6):
    tp = (pred * target).sum(axis=-1)
    fn = ((1 - pred) * target).sum(axis=-1)
    fp = (pred * (1 - target)).sum(axis=-1)
    tn = ((1 - pred) * (1 - target)).sum(axis=-1)
    tpr = tp / (tp + fn + eps)
    tnr = tn / (tn + fp + eps)
    return (0.5 * (tpr + tnr)).mean()

In [17]:
score_conf = [
    [f1_score, {'average': 'macro'}, 'f1'],
    [comp_metric, {}, 'comp_metric'],
    [balanced_accuracy, {}, 'balanced_accuracy']
]

In [18]:
import warnings
warnings.filterwarnings('ignore')

epochs = 10
model.to(device)
for e in range(epochs):
    epoch_loss = train_epoch(model, optimizer, train_dataloader, device)
    print(f'{e} train loss:', f'{np.mean(epoch_loss):.3f}', sep='\t')
    with torch.no_grad():
        y_true, y_pred = val_epoch(model, val_dataloader, device)
    score_dict = score_pred(
        y_true, y_pred,
        score_conf=score_conf,
        trsh={0.1, 0.2, 0.3, 0.4}
    )
    torch.save(model, f'{e}_model.pt')
    print(f'{e} val scores:')
    print(*[
        f'\t{case}: {case_score}' 
        for case, case_score in score_dict.items()
    ], sep='\n')


100%|██████████| 185/185 [08:06<00:00,  2.63s/it]


0 train loss:	0.048


100%|██████████| 47/47 [01:53<00:00,  2.41s/it]


0 val scores:
	f1-0.1: 0.02785076176624267
	f1-0.4: 0.01838242431988643
	f1-0.3: 0.01993751517692378
	f1-0.2: 0.022816580528668206
	comp_metric-0.1: 0.51116656
	comp_metric-0.4: 0.50820996
	comp_metric-0.3: 0.50809563
	comp_metric-0.2: 0.51283226
	balanced_accuracy-0.1: 0.550834179757332
	balanced_accuracy-0.4: 0.5304477856908878
	balanced_accuracy-0.3: 0.5350658075987573
	balanced_accuracy-0.2: 0.5400039616246725


100%|██████████| 185/185 [07:29<00:00,  2.43s/it]


1 train loss:	0.034


100%|██████████| 47/47 [01:43<00:00,  2.21s/it]


1 val scores:
	f1-0.1: 0.10440764928801999
	f1-0.4: 0.06976993519299571
	f1-0.3: 0.079297963298281
	f1-0.2: 0.0906366698490779
	comp_metric-0.1: 0.55764078
	comp_metric-0.4: 0.56546624
	comp_metric-0.3: 0.56333231
	comp_metric-0.2: 0.55709439
	balanced_accuracy-0.1: 0.630207631422816
	balanced_accuracy-0.4: 0.5926188176105743
	balanced_accuracy-0.3: 0.606493126383713
	balanced_accuracy-0.2: 0.6237727162394577


100%|██████████| 185/185 [07:42<00:00,  2.50s/it]


2 train loss:	0.030


100%|██████████| 47/47 [01:45<00:00,  2.24s/it]


2 val scores:
	f1-0.1: 0.16215474379038622
	f1-0.4: 0.11175814314628367
	f1-0.3: 0.1287866320022398
	f1-0.2: 0.14559821332869446
	comp_metric-0.1: 0.59595873
	comp_metric-0.4: 0.60747019
	comp_metric-0.3: 0.61088935
	comp_metric-0.2: 0.6055229
	balanced_accuracy-0.1: 0.6877930571529352
	balanced_accuracy-0.4: 0.635354002883715
	balanced_accuracy-0.3: 0.654663264612551
	balanced_accuracy-0.2: 0.6764531738902231


100%|██████████| 185/185 [07:36<00:00,  2.47s/it]


3 train loss:	0.027


100%|██████████| 47/47 [01:45<00:00,  2.25s/it]


3 val scores:
	f1-0.1: 0.1505942996120673
	f1-0.4: 0.11114010675216307
	f1-0.3: 0.11901302426813193
	f1-0.2: 0.13486983236157427
	comp_metric-0.1: 0.60675943
	comp_metric-0.4: 0.61569632
	comp_metric-0.3: 0.60689447
	comp_metric-0.2: 0.61350158
	balanced_accuracy-0.1: 0.6570813153081547
	balanced_accuracy-0.4: 0.6385276689197753
	balanced_accuracy-0.3: 0.6489609194616872
	balanced_accuracy-0.2: 0.6601684061527413


100%|██████████| 185/185 [07:34<00:00,  2.46s/it]


4 train loss:	0.026


100%|██████████| 47/47 [01:44<00:00,  2.23s/it]


4 val scores:
	f1-0.1: 0.19421171922612343
	f1-0.4: 0.13964265924665184
	f1-0.3: 0.15134406695915809
	f1-0.2: 0.17297539655965255
	comp_metric-0.1: 0.6363768
	comp_metric-0.4: 0.6307933
	comp_metric-0.3: 0.6366906
	comp_metric-0.2: 0.64223401
	balanced_accuracy-0.1: 0.689635029550626
	balanced_accuracy-0.4: 0.6467264428669777
	balanced_accuracy-0.3: 0.661541033765906
	balanced_accuracy-0.2: 0.6812770472191835


100%|██████████| 185/185 [07:31<00:00,  2.44s/it]


5 train loss:	0.024


100%|██████████| 47/47 [01:42<00:00,  2.19s/it]


5 val scores:
	f1-0.1: 0.28690224130786024
	f1-0.4: 0.26983714885972776
	f1-0.3: 0.2753571856871604
	f1-0.2: 0.2854550760936979
	comp_metric-0.1: 0.64793041
	comp_metric-0.4: 0.69731961
	comp_metric-0.3: 0.68487579
	comp_metric-0.2: 0.67275955
	balanced_accuracy-0.1: 0.7455407214736353
	balanced_accuracy-0.4: 0.7423640102401375
	balanced_accuracy-0.3: 0.7526250261065794
	balanced_accuracy-0.2: 0.758434351411963


100%|██████████| 185/185 [07:39<00:00,  2.48s/it]


6 train loss:	0.023


100%|██████████| 47/47 [01:42<00:00,  2.19s/it]


6 val scores:
	f1-0.1: 0.2946613933519787
	f1-0.4: 0.26439997843091456
	f1-0.3: 0.27758825810667725
	f1-0.2: 0.28540951613682464
	comp_metric-0.1: 0.66281063
	comp_metric-0.4: 0.69507413
	comp_metric-0.3: 0.69017093
	comp_metric-0.2: 0.68296256
	balanced_accuracy-0.1: 0.7444899795846703
	balanced_accuracy-0.4: 0.7523785430898147
	balanced_accuracy-0.3: 0.7610632381275686
	balanced_accuracy-0.2: 0.7630784890068999


100%|██████████| 185/185 [07:32<00:00,  2.44s/it]


7 train loss:	0.022


100%|██████████| 47/47 [01:44<00:00,  2.22s/it]


7 val scores:
	f1-0.1: 0.31372541445602914
	f1-0.4: 0.2751714586723098
	f1-0.3: 0.29751279139388076
	f1-0.2: 0.3122069845481974
	comp_metric-0.1: 0.68440083
	comp_metric-0.4: 0.72020475
	comp_metric-0.3: 0.71787852
	comp_metric-0.2: 0.71132752
	balanced_accuracy-0.1: 0.7601718744481615
	balanced_accuracy-0.4: 0.7479663814610451
	balanced_accuracy-0.3: 0.7605264684764387
	balanced_accuracy-0.2: 0.770465079044059


100%|██████████| 185/185 [07:28<00:00,  2.42s/it]


8 train loss:	0.021


100%|██████████| 47/47 [01:44<00:00,  2.22s/it]


8 val scores:
	f1-0.1: 0.3612682957556494
	f1-0.4: 0.3277640506806172
	f1-0.3: 0.338721925154208
	f1-0.2: 0.35055938994529184
	comp_metric-0.1: 0.69045586
	comp_metric-0.4: 0.73429276
	comp_metric-0.3: 0.72148431
	comp_metric-0.2: 0.7113206
	balanced_accuracy-0.1: 0.7851551260899097
	balanced_accuracy-0.4: 0.7712625351214399
	balanced_accuracy-0.3: 0.7839360691994245
	balanced_accuracy-0.2: 0.7954382314162467


100%|██████████| 185/185 [07:33<00:00,  2.45s/it]


9 train loss:	0.020


100%|██████████| 47/47 [01:45<00:00,  2.24s/it]


9 val scores:
	f1-0.1: 0.37000017576687083
	f1-0.4: 0.3598725197120665
	f1-0.3: 0.3739288758865253
	f1-0.2: 0.38388376033431454
	comp_metric-0.1: 0.6833308
	comp_metric-0.4: 0.74028545
	comp_metric-0.3: 0.73314187
	comp_metric-0.2: 0.71913055
	balanced_accuracy-0.1: 0.7632549465146488
	balanced_accuracy-0.4: 0.7805323504592091
	balanced_accuracy-0.3: 0.7886386633785687
	balanced_accuracy-0.2: 0.7886708153171288


In [19]:
score_pred(
        y_true, y_pred,
        score_conf=score_conf,
        trsh={0.1, 0.15, 0.2, 0.25, 0.3}
    )

{'f1-0.1': 0.37000017576687083,
 'f1-0.25': 0.3777747174069073,
 'f1-0.3': 0.3739288758865253,
 'f1-0.2': 0.38388376033431454,
 'f1-0.15': 0.3776436788848753,
 'comp_metric-0.1': 0.6833308,
 'comp_metric-0.25': 0.72730619,
 'comp_metric-0.3': 0.73314187,
 'comp_metric-0.2': 0.71913055,
 'comp_metric-0.15': 0.70495069,
 'balanced_accuracy-0.1': 0.7632549465146488,
 'balanced_accuracy-0.25': 0.790655280497284,
 'balanced_accuracy-0.3': 0.7886386633785687,
 'balanced_accuracy-0.2': 0.7886708153171288,
 'balanced_accuracy-0.15': 0.7810565321406316}

### Submit

In [20]:
with open(os.path.join(data_root, 'scored_birds.json')) as fin:
    test_birds = json.load(fin)

In [21]:
class TestDataset(Dataset):
    def __init__(self, test_folder):
        super().__init__()
        self.test_folder = test_folder
        self.fnames = [f for f in os.listdir(test_folder) if f.endswith('.ogg')]
        
    def __len__(self):
        return len(self.fnames)
    
    def __getitem__(self, idx):
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        wav, sr = load_wav(fpath, 0, None)
        wav = torch.tensor(wav)
        assert (13 * 5 * sr) > len(wav) 
        wav = wav[:len(wav) // 12 * 12].reshape((12, len(wav) // 12))
        return wav

In [22]:
test_dataset = TestDataset(os.path.join(data_root, 'test_soundscapes'))
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    drop_last=False,   
)

In [23]:
pred_list = []
treshold = 0.1
model.eval()
with torch.no_grad():
    for i, batch in tqdm(enumerate(test_dataloader)):
        batch_size, part_count, part_size = batch.shape
        batch = batch.reshape(batch_size * part_count, part_size)
        pred = model(batch.to(device))['logits']
        pred = pred.cpu().numpy()
        pred = pred > treshold
        
        for j, chunk_pred in enumerate(pred):
            inbatch_number = j // part_count
            chunk_number = j % part_count + 1
            f_idx = i * batch_size + inbatch_number
            fname = test_dataset.fnames[f_idx]
            prefix = fname.split('.')[0]
            sufix = f'{5 * chunk_number}'
            
            pred_list.extend([{
                'row_id': '_'.join([prefix, b, sufix]),
                'target': chunk_pred[species2id[b]]
            } for b in test_birds])
pred_pd = pd.DataFrame(pred_list)

1it [00:00,  6.23it/s]


In [24]:
pred_pd.to_csv("submission.csv", index=False)
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
...,...,...
247,soundscape_453028782_omao_60,False
248,soundscape_453028782_puaioh_60,False
249,soundscape_453028782_skylar_60,False
250,soundscape_453028782_warwhe1_60,False


In [25]:
# def chunk_wav(wav, sr, window_size):
#     chunks_count = len(wav) // (window_size * sr)
#     chunk_size = window_size * sr
#     chunks = []
#     for chunk_idx in range(chunks_count):
#         left = chunk_idx * sr
#         right = min(left + chunk_size, len(wav))
#         chunks.append(wav[left:right])
#     chunk_tensor = torch.tensor(chunks)
#     return chunk_tensor

# file_list = os.listdir(os.path.join(data_root, 'test_soundscapes'))

# # This is where we will store our results
# treshold = 0.5
# pred = {'row_id': [], 'target': []}
# model.eval()
# with torch.no_grad():
#     # Process audio files and make predictions
#     for fname in file_list:
#         prefix = fname.split('.')[0]
#         # Complete file path
#         fpath = os.path.join(data_root, 'test_soundscapes', fname)
#         wav, sr = load_wav(fpath, 0, None)
#         chunk_tensor = chunk_wav(wav, sr, window_size=5)

#         # Open file with librosa and split signal into 5-second chunks
#         # sig, rate = librosa.load(path)
#         # ...

#         # Let's assume we have a list of 12 audio chunks (1min / 5s == 12 segments)
#         chunk_score = model(chunk_tensor.to(device))['logits'].cpu().numpy()

#         # Make prediction for each chunk
#         # Each scored bird gets a random value in our case
#         # since we don't actually have a model
#         for i, all_score in enumerate(chunk_score):        
#             chunk_end_time = (i + 1) * 5
#             for bird in test_birds:

#                 # This is our random prediction score for this bird
#                 bird_score = all_score[species2id[bird]]

#                 # Assemble the row_id which we need to do for each scored bird
#                 row_id = prefix + '_' + bird + '_' + str(chunk_end_time)

#                 # Put the result into our prediction dict and
#                 # apply a "confidence" threshold of 0.5
#                 pred['row_id'].append(row_id)
#                 pred['target'].append(True if bird_score > treshold else False)


# # Make a new data frame and look at some results        
# results = pd.DataFrame(pred, columns = ['row_id', 'target'])

# # Quick sanity check
# print(results.head()) 
    
# # Convert our results to csv
# results.to_csv("submission.csv", index=False)    

In [26]:
# test_fnames = [f for f in os.listdir(f'{data_root}/test_soundscapes') if f.endswith('.ogg')]
# test_pd = []
# for fname in test_fnames:
#     fpath = os.path.join(data_root, 'test_soundscapes', fname)
#     wav, sr = librosa.load(fpath, sr=None)
#     prefix = fname.split('.')[0]
#     window_size = 5 * sr
#     for i, chunk in enumerate(wav[::window_size]):
#         end_time = (i + 1) * 5
#         samples = [{
#             'row_id': f'{prefix}_{b}_{end_time}',
#             'file_id': fname,
#             'bird': b,
#             'end_time': end_time
#         } for b in test_birds]
#         test_pd.extend(samples)
        
# test_pd = pd.DataFrame(test_pd)
# test_pd