In [1]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/birdclefmodels/')

import ast
import json
import os
import random

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa
import yaml
from matplotlib import pyplot as plt
from tqdm import tqdm

import torch
import torchaudio as ta
import timm
from torch.utils.data import Dataset, DataLoader

import neural_network

In [2]:
# Root folder of the competition data; the training metadata is read from a
# separate input dataset (its path suggests it also carries wav durations).
data_root = '/kaggle/input/birdclef-2022'

train_meta = pd.read_csv(
    '../input/birdclef-data-with-wav-durations/train_metadata_extended.csv'
)
ebird_taxonomy = pd.read_csv(
    os.path.join(data_root, 'eBird_Taxonomy_v2021.csv')
)

In [3]:
def _parse_label_list(value):
    """Turn the CSV's stringified list of labels into a real list.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute code embedded in the CSV. Values that are not strings
    (e.g. lists produced by a previous run of this cell) are returned as-is,
    which makes the cell safe to re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


train_meta.loc[:, 'secondary_labels'] = train_meta.secondary_labels.apply(_parse_label_list)
# Every label attached to a recording: its secondary labels followed by the primary one.
train_meta['target_raw'] = train_meta.secondary_labels + train_meta.primary_label.apply(lambda x: [x])

In [4]:
# Sorted vocabulary of every label that occurs in `target_raw`.
# A set comprehension collects the labels in linear time; summing a Series of
# lists concatenates them pairwise, which is quadratic in the number of rows.
all_species = sorted({label for labels in train_meta.target_raw for label in labels})
species2id = {species: idx for idx, species in enumerate(all_species)}
id2species = dict(enumerate(all_species))

# Multi-hot target: one 0/1 entry per species, in `all_species` order.
train_meta['target'] = train_meta.target_raw.apply(
    lambda labels: [int(species in labels) for species in all_species]
)

In [5]:
def load_wav(fname, offset, duration):
    """Load an audio clip at its native sample rate.

    Args:
        fname: Path relative to ``<data_root>/train_audio``, e.g.
            ``'afrsil1/XC125458.ogg'``. An absolute path is used as-is,
            because ``os.path.join`` drops the preceding components when a
            later component is absolute.
        offset: Start position in seconds. ``None`` is treated as 0.
        duration: Number of seconds to read, or ``None`` to read to the end.

    Returns:
        The ``(wav, sr)`` pair returned by ``librosa.load``.

    Raises:
        AssertionError: If the file's sample rate is above 32000.
    """
    fpath = os.path.join(data_root, 'train_audio', fname)
    # sr=None asks librosa to keep the file's own sample rate.
    # `offset` used to be accepted but silently ignored; forward it now.
    wav, sr = librosa.load(fpath, sr=None, offset=offset or 0.0, duration=duration)
    assert sr <= 32000, sr
    return wav, sr

### Torch Dataset

In [6]:
# Presumably the length in seconds of one scored test segment (the row ids
# built for the submission step in multiples of 5) - TODO confirm.
TEST_SIZE = 5

# Audio settings; only referenced by the commented-out reference model in this notebook.
CONFIG = dict(crop_len=30, sample_rate=32000)


### Model

In [7]:
# from torch.distributions import Beta
# from torch import nn


# class Net(torch.nn.Module):
#     def __init__(self, backbone_path=None):
#         super().__init__()
#         self.audio2image = self._init_audio2image()
#         self.backbone = self._init_backbone()
#         self.load_backbone(backbone_path)
#         self.head = self._init_head(self.backbone.feature_info[-1]['num_chs'])      
#         self.loss = torch.nn.BCEWithLogitsLoss()
#         self.mixup = Mixup()
        
#     def forward(self, wav_tensor, y=None):
#         # wav_tensor: b, t
#         if self.training:
#             wav_tensor = self.batch_crop(wav_tensor) # b, t
            
#         spectrogram = self.audio2image(wav_tensor) # b, m, t
#         spectrogram = spectrogram.permute(0, 2, 1) # b, t, m
#         spectrogram = spectrogram[:, None, :, :] # b, c, t, m
        
#         if self.training:
#             spectrogram = spectrogram.permute(0, 2, 1, 3) # b, t, c, m
#             spectrogram = self.batch_uncrop(spectrogram)
            
#             spectrogram, y = self.mixup(spectrogram, y)
            
#             spectrogram = self.batch_crop(spectrogram)
#             spectrogram = spectrogram.permute(0, 2, 1, 3) # b, c, t, m
                
#         x = self.backbone(spectrogram) # b, c, t, m
#         if self.training:
#             x = x.permute(0, 2, 1, 3) # b, t, c, m
#             x = self.batch_uncrop(x)
#             x = x.permute(0, 2, 1, 3) # b, c, t, m
        
#         # average mel axis
#         x = torch.mean(x, axis=-1)
                
#         logits = self.head(x) # b, n_out
        
#         if y is not None:
#             loss = self.loss(logits, y)
#         else:
#             loss = None

#         return {'loss': loss, 'logits': logits.sigmoid()}
    
#     def batch_crop(self, tensor):
#         factor = int(CONFIG['crop_len'] // TEST_SIZE)
#         b, t = tensor.shape[:2]
#         tensor = tensor.reshape(b * factor, t // factor, *tensor.shape[2:])
#         return tensor
    
#     def batch_uncrop(self, tensor):
#         factor = int(CONFIG['crop_len'] // TEST_SIZE)
#         b, t = tensor.shape[:2]
#         tensor = tensor.reshape(b // factor, t * factor, *tensor.shape[2:])
#         return tensor
    
#     @staticmethod
#     def _init_audio2image():
#         mel = ta.transforms.MelSpectrogram(
#             sample_rate=32000,
#             n_fft=2048,
#             win_length=2048,
#             hop_length=512,
#             f_min=16,
#             f_max=16386,
#             pad=0,
#             n_mels=256,
#             power=2,
#             normalized=False,
#         )
#         db_scale = ta.transforms.AmplitudeToDB(top_db=80.0)
#         audio2image = torch.nn.Sequential(mel, db_scale)
#         return audio2image
    
#     @staticmethod
#     def _init_backbone():
#         backbone = "resnet18"
#         pretrained = False
#         pretrained_weights = None
#         train = True
#         val = False
#         in_chans = 1

#         backbone = timm.create_model(
#             backbone,
#             pretrained=pretrained,
#             num_classes=0,
#             global_pool="",
#             in_chans=in_chans,
#         )
#         return backbone
    
#     @staticmethod
#     def _init_head(num_chs):
#         head = Attention(num_chs, len(all_species), activation='linear')
#         return head
    
#     def load_backbone(self, weights_path=None):
#         if weights_path:
#             state_dict=torch.load(weights_path)
#             conv1_weight = state_dict['conv1.weight']
#             state_dict['conv1.weight'] = conv1_weight.sum(dim=1, keepdim=True)
#             state_dict.pop('fc.bias')
#             state_dict.pop('fc.weight')
#             self.backbone.load_state_dict(state_dict)


# class Mixup(nn.Module):
#     def __init__(self, mix_beta=1):
#         super(Mixup, self).__init__()
#         self.beta_distribution = Beta(mix_beta, mix_beta)

#     def forward(self, X, Y, sample_weight=None):

#         bs = X.shape[0]
#         n_dims = len(X.shape)
#         perm = torch.randperm(bs)
#         mixup_weight = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

#         if n_dims == 2:
#             X = mixup_weight.view(-1, 1) * X + (1 - mixup_weight.view(-1, 1)) * X[perm]
#         elif n_dims == 3:
#             X = mixup_weight.view(-1, 1, 1) * X + (1 - mixup_weight.view(-1, 1, 1)) * X[perm]
#         else:
#             X = mixup_weight.view(-1, 1, 1, 1) * X + (1 - mixup_weight.view(-1, 1, 1, 1)) * X[perm]

#         Y = mixup_weight.view(-1, 1) * Y + (1 - mixup_weight.view(-1, 1)) * Y[perm]

#         if sample_weight is None:
#             return X, Y
#         else:
#             sample_weight = mixup_weight.view(-1) * sample_weight + (1 - mixup_weight.view(-1)) * sample_weight[perm]
#             return X, Y, sample_weight

        
# class Attention(nn.Module):
#     def __init__(self, in_channels, out_channels, activation='linear'):
#         super().__init__()
#         self.activation = activation
#         self.attn = nn.Conv1d(in_channels, out_channels, kernel_size=1)
#         self.cla = nn.Conv1d(in_channels, out_channels, kernel_size=1)
        
#     def forward(self, x):
#         # x: b, c, t
#         attn = torch.softmax(torch.tanh(self.attn(x)), dim=-1) # b, c, t
#         x = self.cla(x) # b, c, t
#         x = torch.sum(x * attn, dim=-1) #b, c
#         return x

### Model load

In [8]:
from neural_network import NN_CATALOG

config_path = '../input/birdclefsubmit/baseline_config.yaml'
model_path = '../input/birdclefsubmit/final-model.pt'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

with open(config_path) as fin:
    config = yaml.safe_load(fin)

model_config, data_config = config['model'], config['data']
# The weights are restored from `model_path` below, so the backbone is built
# without its own pretrained weights (presumably to avoid a download - TODO confirm).
model_config['params']['backbone_config']['pretrained'] = False

# The model class is looked up by the name given in the config.
model_class = NN_CATALOG[model_config['name']]
model = model_class(
    len(all_species),
    int(data_config['crop_len'] // data_config['test_wav_len']),
    **model_config['params'],
)
model.to(device)

state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [9]:



# state_dict = torch.load(model_path, map_location=device)
# model = Net(len(all_species), int(CONFIG['crop_len'] // TEST_SIZE),
#             backbone_config=dict(model_name='resnet18', pretrained=False))
# model.load_state_dict(state_dict)
# model.to(device)

### Submit

In [10]:
# Species listed in the competition's scored_birds.json; the submission cell
# emits one row per (file, species, chunk) for exactly these birds.
scored_birds_path = os.path.join(data_root, 'scored_birds.json')
with open(scored_birds_path) as fin:
    test_birds = json.load(fin)

In [11]:
class Compose:
    """Chain several audio transforms into a single callable.

    Every transform is invoked as ``transform(y, sr)`` and its return value is
    fed to the next one, in list order.
    """

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray, sr):
        result = y
        for transform in self.transforms:
            result = transform(result, sr)
        return result
    
    
class AudioTransform:
    """Base class for waveform transforms that fire with probability ``p``.

    Subclasses implement ``apply``. Calling the instance either returns
    ``apply(y, sr=sr)`` or the untouched input.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray, sr):
        # `or` short-circuits, so no random number is drawn when always_apply is set.
        should_apply = self.always_apply or np.random.rand() < self.p
        return self.apply(y, sr=sr) if should_apply else y

    def apply(self, y: np.ndarray, **params):
        raise NotImplementedError
        
        
class Normalize(AudioTransform):
    """Peak-normalise a waveform: divide it by its largest absolute sample.

    Empty and all-zero inputs are returned unchanged, since they have no peak
    to scale by.
    """

    def __init__(self, always_apply=False, p=1):
        super().__init__(always_apply, p)

    def apply(self, y: np.ndarray, **params):
        """Return ``y / max(|y|)``; extra keyword params (e.g. ``sr``) are ignored.

        Raises:
            AssertionError: If the result contains NaN (e.g. NaN in the input).
        """
        # `.max()` raises on an empty array.
        if np.size(y) == 0:
            return y
        max_vol = np.abs(y).max()
        # Digital silence: 0 / 0 would yield NaNs and trip the assertion below.
        if max_vol == 0:
            return y
        y_vol = y / max_vol
        assert not np.isnan(y_vol).any(), f'{max_vol}'
        return y_vol

In [12]:
class TestDataset(Dataset):
    """One item per ``.ogg`` file in ``test_folder``, split into equal chunks.

    ``dataset[idx]`` is a tensor of shape ``(N_CHUNKS, len(wav) // N_CHUNKS)``;
    trailing samples that do not fill the last chunk are dropped.

    Args:
        test_folder: Directory that holds the ``.ogg`` recordings.
        augmentations: Optional callable ``f(wav, sr) -> wav`` applied to the
            loaded waveform before chunking.
    """

    N_CHUNKS = 12      # number of equal parts each recording is split into
    CHUNK_SECONDS = 5  # nominal chunk duration, used only for the length check

    def __init__(self, test_folder, augmentations=None):
        super().__init__()
        self.test_folder = test_folder
        # os.listdir returns entries in arbitrary order; sort so that the file
        # order (and therefore the submission row order) is reproducible.
        self.fnames = sorted(f for f in os.listdir(test_folder) if f.endswith('.ogg'))
        self.augmentations = augmentations

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        fpath = os.path.join(self.test_folder, self.fnames[idx])
        # load_wav joins its argument onto the train_audio folder; an absolute
        # path makes os.path.join ignore that prefix even if test_folder is relative.
        wav, sr = load_wav(os.path.abspath(fpath), 0, None)
        if self.augmentations:
            # Pass the real sample rate (previously None) to the transforms.
            wav = self.augmentations(wav, sr)
        wav = torch.tensor(wav)
        # The recording must be shorter than N_CHUNKS + 1 nominal chunks.
        max_len = (self.N_CHUNKS + 1) * self.CHUNK_SECONDS * sr
        assert max_len > len(wav), f'{self.fnames[idx]}: {len(wav)} samples at {sr} Hz'
        chunk_len = len(wav) // self.N_CHUNKS
        wav = wav[:chunk_len * self.N_CHUNKS].reshape((self.N_CHUNKS, chunk_len))
        return wav

In [13]:
test_folder = os.path.join(data_root, 'test_soundscapes')
test_transforms = Compose([Normalize(p=1)])
test_dataset = TestDataset(test_folder, test_transforms)

# One soundscape per batch, in dataset order, so that batch index i maps
# directly onto test_dataset.fnames[i] in the prediction loop.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    drop_last=False,
)

In [14]:
# A species is reported as present when the model output exceeds this value.
threshold = 0.05
pred_list = []
# Files consumed by earlier batches. The old `i * batch_size` used the current
# batch's size, which is wrong whenever the final batch is smaller than the rest.
files_done = 0

model.eval()
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm show the total.
    for batch in tqdm(test_dataloader):
        batch_size, part_count, part_size = batch.shape
        # Fold the per-file chunk axis into the batch axis: (files * chunks, samples).
        flat_batch = batch.reshape(batch_size * part_count, part_size)
        # The model's 'logits' entry is thresholded directly (presumably already
        # passed through a sigmoid inside the model - TODO confirm).
        scores = model(flat_batch.to(device))['logits'].cpu().numpy()
        is_present = scores > threshold

        for j, chunk_pred in enumerate(is_present):
            fname = test_dataset.fnames[files_done + j // part_count]
            prefix = fname.split('.')[0]
            # Row ids carry the chunk number times 5: 5, 10, ...
            suffix = f'{5 * (j % part_count + 1)}'

            pred_list.extend(
                {
                    'row_id': '_'.join([prefix, bird, suffix]),
                    'target': chunk_pred[species2id[bird]],
                }
                for bird in test_birds
            )

        files_done += batch_size

pred_pd = pd.DataFrame(pred_list)

1it [00:06,  6.71s/it]


In [15]:
# Write the predictions to submission.csv without the index column, then
# display the frame as the cell output.
pred_pd.to_csv("submission.csv", index=False)
pred_pd

Unnamed: 0,row_id,target
0,soundscape_453028782_akiapo_5,False
1,soundscape_453028782_aniani_5,False
2,soundscape_453028782_apapan_5,False
3,soundscape_453028782_barpet_5,False
4,soundscape_453028782_crehon_5,False
...,...,...
247,soundscape_453028782_omao_60,False
248,soundscape_453028782_puaioh_60,False
249,soundscape_453028782_skylar_60,False
250,soundscape_453028782_warwhe1_60,False
