In [2]:
import json
import os
import random

from pydantic import BaseModel as ConfigBaseModel
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torchvision.models.resnet import ResNet, BasicBlock

In [31]:
# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Prepare paths
root_path = "/content/drive/MyDrive/birds/"
input_path = root_path + '/train_audio/'   
    
# Bird labels
train_meta = pd.read_csv(root_path + 'train_metadata.csv')
with open(root_path + '/scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)
bird_label = np.asarray(scored_birds)
print("Labels:")
print(bird_label)

# Preprocessing data
sample_rate = 32000
n_fft = 2048
win_length = None
hop_length = 1024
n_mels = 128
min_sec_proc = sample_rate*5

# Conver to spectogramm
mel_spectrogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
    norm='slaney',
    onesided=True,
    n_mels=n_mels,
    mel_scale="htk",
)

Using cuda device
Labels:
['akiapo' 'aniani' 'apapan' 'barpet' 'crehon' 'elepai' 'ercfra' 'hawama'
 'hawcre' 'hawgoo' 'hawhaw' 'hawpet1' 'houfin' 'iiwi' 'jabwar' 'maupar'
 'omao' 'puaioh' 'skylar' 'warwhe1' 'yefcan']




In [5]:
# Set pseudo randomize
def torch_fix_seed(seed=42):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True
torch_fix_seed()

In [6]:
# Create spectrogramm for audio files
def audio_to_mel_label(filepath,
                       min_sec_proc,
                       mode='train',
                       data_index=0,
                       label_list=[],
                       bird_label=[],
                       label_file=[],
                       mel_list=[]):

    waveform, sample_rate_file = torchaudio.load(filepath=filepath)
    len_wav = waveform.shape[1]
    waveform = waveform[0, :].reshape(1, len_wav)  # stereo->mono mono->mono
    if not len_wav < min_sec_proc * 12:
        waveform = torch.cat((waveform, waveform[:, 0:len_wav]), 1)
        len_wav = min_sec_proc * 12
        waveform = waveform[:, 0:len_wav]

    for index in range(int(len_wav / min_sec_proc)):
        log_melspec = torch.log10(
            mel_spectrogram(waveform[0, index * min_sec_proc:index * min_sec_proc + min_sec_proc]).reshape(1, 128,157) + 1e-10)
        log_melspec = (log_melspec - torch.mean(log_melspec)) / torch.std(log_melspec)
        mel_list.append(log_melspec)
    return mel_list

# model
class ResNetBird(ResNet):
    def __init__(self):
        super().__init__(BasicBlock, [4, 8, 6, 4], num_classes=21)
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=1, padding=3, bias=False)


net = ResNetBird().to(device)



In [7]:
print(net)

ResNetBird(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True

In [8]:
print(torch.load('/content/drive/MyDrive/birds/model.pt'))

OrderedDict([('conv1.weight', tensor([[[[-1.7985e-01, -2.2334e-01,  6.7939e-03,  ...,  9.7836e-03,
            8.8091e-02,  1.0088e-01],
          [-2.5246e-04, -1.4284e-01,  2.5209e-01,  ..., -3.7774e-01,
           -1.1645e-01,  1.0901e-01],
          [-2.3688e-01, -1.9402e-01,  3.3950e-01,  ..., -3.7551e-01,
           -1.2012e-01,  1.5947e-01],
          ...,
          [-8.8587e-02,  1.6473e-01,  1.4931e-01,  ..., -2.4894e-01,
            2.9161e-02,  3.1622e-01],
          [-1.3138e-03,  4.3094e-01, -6.0429e-02,  ..., -1.5185e-01,
           -7.2346e-02,  2.0710e-01],
          [ 2.1082e-01,  4.0162e-01, -6.3082e-02,  ..., -1.5802e-01,
           -5.3744e-02,  3.1854e-02]]],


        [[[-2.6476e-01, -9.0947e-02,  4.0502e-02,  ..., -3.3398e-03,
           -7.4061e-02, -3.0883e-01],
          [ 1.5864e-01,  1.4845e-01,  1.7138e-02,  ...,  1.7215e-01,
            2.6415e-02,  1.3707e-01],
          [-5.4156e-02, -1.8043e-01,  1.7462e-02,  ..., -5.3574e-03,
            7.3158e-02, -2

In [32]:
# Download weights
net.load_state_dict(torch.load('/content/drive/MyDrive/birds/model.pt'), strict=False)
out_sigmoid = nn.Sigmoid()

# Path to test
test_audio_dir = root_path + 'test/'
file_list = [f.split('.')[0] for f in sorted(os.listdir(test_audio_dir))]

# Test
pred = {'row_id': [], 'target': []}
binary_th = 5e-8
net.eval()
max = 0.0
for afile in file_list:
  sings_list = [f.split('.')[0] for f in sorted(os.listdir(test_audio_dir))]
  path = test_audio_dir + afile + '.ogg'
  chunks = [[] for i in range(12)]
  mel_list_test = []
  mel_list_test = audio_to_mel_label(path, min_sec_proc, 'test', mel_list=mel_list_test)
  mel_list_test = torch.stack(mel_list_test).to(device)
  outputs = net(mel_list_test)
  outputs_test = out_sigmoid(outputs)
  for idx, i in enumerate(range(len(chunks))):
        chunk_end_time = (i + 1) * 5
        for bird in scored_birds:
            try:
                score = outputs_test[idx][np.where(bird_label == bird)]
            except IndexError:
                score = 0
            row_id = afile + '_' + bird + '_' + str(chunk_end_time)
            pred['row_id'].append(row_id)
            pred['target'].append(0 if score == 0 else score.item())
              
# Convert to DataFrame              
results = pd.DataFrame(pred, columns=['row_id', 'target'])

# Print results
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(results)

# Save results to csv file
results.to_csv("./submission.csv", index=False)

              row_id        target
0      test_akiapo_5  2.370773e-16
1      test_aniani_5  1.699042e-21
2      test_apapan_5  1.463552e-13
3      test_barpet_5  9.135622e-22
4      test_crehon_5  2.035578e-25
5      test_elepai_5  9.092754e-22
6      test_ercfra_5  4.905993e-16
7      test_hawama_5  3.547839e-22
8      test_hawcre_5  1.771184e-14
9      test_hawgoo_5  2.423774e-16
10     test_hawhaw_5  1.997216e-13
11    test_hawpet1_5  6.723368e-24
12     test_houfin_5  2.546145e-14
13       test_iiwi_5  1.372403e-14
14     test_jabwar_5  1.028015e-16
15     test_maupar_5  1.430172e-24
16       test_omao_5  6.161033e-16
17     test_puaioh_5  8.729851e-22
18     test_skylar_5  1.311794e-13
19    test_warwhe1_5  1.437117e-14
20     test_yefcan_5  1.803243e-18
21    test_akiapo_10  2.995290e-17
22    test_aniani_10  2.536659e-21
23    test_apapan_10  6.803688e-12
24    test_barpet_10  7.455852e-22
25    test_crehon_10  4.842859e-26
26    test_elepai_10  8.256996e-23
27    test_ercfra_10