# 0 Imports

In [1]:
import math, random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import init
import torchaudio
from torchaudio import transforms

from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer
from ignite.metrics import Accuracy, Loss, RunningAverage
from ignite.handlers.param_scheduler import LRScheduler
from ignite.contrib.handlers.tqdm_logger import ProgressBar
# from ignite.contrib.metrics import ROC_AUC

plt.rcParams['figure.facecolor'] = 'white'

***
# 1 Data loading

## 1.1 Train data

In [2]:
data_train = pd.read_csv("data/train_metadata.csv")

In [3]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14852 entries, 0 to 14851
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     14852 non-null  object 
 1   secondary_labels  14852 non-null  object 
 2   type              14852 non-null  object 
 3   latitude          14852 non-null  float64
 4   longitude         14852 non-null  float64
 5   scientific_name   14852 non-null  object 
 6   common_name       14852 non-null  object 
 7   author            14852 non-null  object 
 8   license           14852 non-null  object 
 9   rating            14852 non-null  float64
 10  time              14852 non-null  object 
 11  url               14852 non-null  object 
 12  filename          14852 non-null  object 
dtypes: float64(3), object(10)
memory usage: 1.5+ MB


In [4]:
data_train.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,time,url,filename
0,afrsil1,[],"['call', 'flight call']",12.391,-1.493,Euodice cantans,African Silverbill,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,08:00,https://www.xeno-canto.org/125458,afrsil1/XC125458.ogg
1,afrsil1,"['houspa', 'redava', 'zebdov']",['call'],19.8801,-155.7254,Euodice cantans,African Silverbill,Dan Lane,Creative Commons Attribution-NonCommercial-Sha...,3.5,08:30,https://www.xeno-canto.org/175522,afrsil1/XC175522.ogg
2,afrsil1,[],"['call', 'song']",16.2901,-16.0321,Euodice cantans,African Silverbill,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,4.0,11:30,https://www.xeno-canto.org/177993,afrsil1/XC177993.ogg
3,afrsil1,[],"['alarm call', 'call']",17.0922,54.2958,Euodice cantans,African Silverbill,Oscar Campbell,Creative Commons Attribution-NonCommercial-Sha...,4.0,11:00,https://www.xeno-canto.org/205893,afrsil1/XC205893.ogg
4,afrsil1,[],['flight call'],21.4581,-157.7252,Euodice cantans,African Silverbill,Ross Gallardy,Creative Commons Attribution-NonCommercial-Sha...,3.0,16:30,https://www.xeno-canto.org/207431,afrsil1/XC207431.ogg


In [5]:
data_train.drop(columns=["type", "latitude", "longitude", "scientific_name", "common_name", "license", "time", "url"], inplace=True)

In [6]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14852 entries, 0 to 14851
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     14852 non-null  object 
 1   secondary_labels  14852 non-null  object 
 2   author            14852 non-null  object 
 3   rating            14852 non-null  float64
 4   filename          14852 non-null  object 
dtypes: float64(1), object(4)
memory usage: 580.3+ KB


In [7]:
data_train.head()

Unnamed: 0,primary_label,secondary_labels,author,rating,filename
0,afrsil1,[],Bram Piot,2.5,afrsil1/XC125458.ogg
1,afrsil1,"['houspa', 'redava', 'zebdov']",Dan Lane,3.5,afrsil1/XC175522.ogg
2,afrsil1,[],Bram Piot,4.0,afrsil1/XC177993.ogg
3,afrsil1,[],Oscar Campbell,4.0,afrsil1/XC205893.ogg
4,afrsil1,[],Ross Gallardy,3.0,afrsil1/XC207431.ogg


***
## 1.2 Test data

In [8]:
data_test = pd.read_csv("data/test.csv")

In [9]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   row_id    3 non-null      object
 1   file_id   3 non-null      object
 2   bird      3 non-null      object
 3   end_time  3 non-null      int64 
dtypes: int64(1), object(3)
memory usage: 224.0+ bytes


In [10]:
data_test.head()

Unnamed: 0,row_id,file_id,bird,end_time
0,soundscape_1000170626_akiapo_5,soundscape_1000170626,akiapo,5
1,soundscape_1000170626_akiapo_10,soundscape_1000170626,akiapo,10
2,soundscape_1000170626_akiapo_15,soundscape_1000170626,akiapo,15


***
## 1.3 Scored birds data

In [11]:
scored_birds = pd.read_json("data/scored_birds.json")

In [12]:
scored_birds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       21 non-null     object
dtypes: object(1)
memory usage: 296.0+ bytes


In [13]:
scored_birds.head()

Unnamed: 0,0
0,akiapo
1,aniani
2,apapan
3,barpet
4,crehon


***
# 2 Data exploration

In [14]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14852 entries, 0 to 14851
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     14852 non-null  object 
 1   secondary_labels  14852 non-null  object 
 2   author            14852 non-null  object 
 3   rating            14852 non-null  float64
 4   filename          14852 non-null  object 
dtypes: float64(1), object(4)
memory usage: 580.3+ KB


In [15]:
data_train.describe()

Unnamed: 0,rating
count,14852.0
mean,3.719129
std,1.181014
min,0.0
25%,3.0
50%,4.0
75%,4.5
max,5.0


In [16]:
data_train["rating"].value_counts()

4.0    3974
5.0    3318
3.0    1957
3.5    1765
4.5    1632
2.5     724
2.0     575
0.0     570
1.0     155
1.5     152
0.5      30
Name: rating, dtype: int64

In [17]:
data_train = data_train[(data_train["rating"] >= 1.0) & (data_train["rating"] <= 5.0)]

In [18]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14252 entries, 0 to 14851
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     14252 non-null  object 
 1   secondary_labels  14252 non-null  object 
 2   author            14252 non-null  object 
 3   rating            14252 non-null  float64
 4   filename          14252 non-null  object 
dtypes: float64(1), object(4)
memory usage: 668.1+ KB


In [19]:
data_train["rating"].value_counts()

4.0    3974
5.0    3318
3.0    1957
3.5    1765
4.5    1632
2.5     724
2.0     575
1.0     155
1.5     152
Name: rating, dtype: int64

In [20]:
data_train = data_train[(data_train["primary_label"].isin(scored_birds.iloc[:,0].values)) | (data_train["secondary_labels"].isin(scored_birds.iloc[:,0].values))]

In [21]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1240 entries, 32 to 14762
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     1240 non-null   object 
 1   secondary_labels  1240 non-null   object 
 2   author            1240 non-null   object 
 3   rating            1240 non-null   float64
 4   filename          1240 non-null   object 
dtypes: float64(1), object(4)
memory usage: 58.1+ KB


***
# 3 Utilities setup

In [22]:
class AudioUtils():
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(audio, new_channel):
        sig, sr = audio
        if sig.shape[0] == new_channel:
            return audio
        if new_channel == 1:
            resig = sig[:1, :]
        else:
            resig = torch.cat([sig, sig])
        return resig, sr

    @staticmethod
    def resample(audio, newsr):
        sig, sr = audio
        if sr == newsr:
            return audio
        num_channels = sig.shape[0]
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1, :])
        if num_channels > 1:
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:, :])
            resig = torch.cat([resig, retwo])
        return resig, newsr

    @staticmethod
    def pad_trunc(audio, max_ms):
        sig, sr = audio
        num_rows, sig_len = sig.shape
        max_len = sr//1000 * max_ms
        if sig_len > max_len:
            sig = sig[:, :max_len]
        elif sig_len < max_len:
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len  =max_len - sig_len - pad_begin_len
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return sig, sr

    @staticmethod
    def time_shift(audio, shift_limit):
        sig, sr = audio
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)

    @staticmethod
    def spectrogram(audio, n_mels=64, n_fft=1024, hop_len=None):
        sig, sr = audio
        top_db = 80
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    def spectrogram_augment(spectrogram, max_mask_pct=0.1, n_freq_maks=1, n_time_masks=1):
        _, n_mels, n_steps = spectrogram.shape
        mask_value = spectrogram.mean()
        aug_spec = spectrogram
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_maks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        return aug_spec

***
# 4 Dataset and Dataloader

In [23]:
class TrainSoundDataset(Dataset):
    def __init__(self, df, data_path, label_encoder):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000 # ?
        self.sr = 32000
        self.channel = 2 # ?
        self.shift_pct = 0.4
        self.label_encoder = label_encoder
        self.df["label"] = self.label_encoder.transform(df[["primary_label"]])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        audio_file = self.data_path + self.df["filename"].iloc[index]
        label = torch.tensor(self.df["label"].iloc[index], dtype=torch.long)

        audio = AudioUtils.open(audio_file)
        re_aud = AudioUtils.resample(audio, self.sr)
        re_chan = AudioUtils.rechannel(re_aud, self.channel)
        dur_aud = AudioUtils.pad_trunc(re_chan, self.duration)
        shift_aud = AudioUtils.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtils.spectrogram(shift_aud)
        aug_sgram = AudioUtils.spectrogram_augment(sgram)
        return aug_sgram, label

In [24]:
class TestSoundDataset(Dataset):
    def __init__(self, df, data_path, label_encoder):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000 # ?
        self.sr = 32000
        self.channel = 2 # ?
        self.label_encoder = label_encoder
        self.df["label"] = self.label_encoder.transform(df[["bird"]])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        audio_file = self.data_path + self.df["file_id"].iloc[index] + ".ogg"
        label = torch.tensor(self.df["label"].iloc[index], dtype=torch.long)

        audio = AudioUtils.open(audio_file)
        re_aud = AudioUtils.resample(audio, self.sr)
        re_chan = AudioUtils.rechannel(re_aud, self.channel)
        dur_aud = AudioUtils.pad_trunc(re_chan, self.duration)
        sgram = AudioUtils.spectrogram(dur_aud)
        return sgram, label

In [25]:
label_encoder = OrdinalEncoder()
label_encoder.fit(data_train[["primary_label"]])

OrdinalEncoder()

In [26]:
BATCH_SIZE = 16
LABELS_COUNT = scored_birds.value_counts().count()
TRAIN_PATH = "data/train_audio/"
TEST_PATH = "data/test_soundscapes/"

In [27]:
train_ds = TrainSoundDataset(data_train, TRAIN_PATH, label_encoder)
test_ds = TestSoundDataset(data_test, TEST_PATH, label_encoder)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)

In [28]:
train_ds[0]

  label = torch.tensor(self.df["label"].iloc[index], dtype=torch.long)


(tensor([[[-10.8259,  -6.6460, -11.7051,  ...,  -4.5460, -14.9803,  -3.3526],
          [-20.3229, -10.5576, -14.1383,  ...,  -9.3691, -13.6688, -18.8991],
          [-18.7182, -16.2491, -20.4181,  ..., -20.2193, -15.4452, -23.3583],
          ...,
          [-22.1894, -22.1894, -22.1894,  ..., -22.1894, -22.1894, -22.1894],
          [-40.0528, -55.3034, -55.3146,  ..., -27.3471, -34.4192, -31.2256],
          [-43.0875, -55.3146, -55.3146,  ..., -32.2454, -40.1447, -31.9132]],
 
         [[ -2.1663,  -1.7601, -12.4692,  ...,  -1.4733,  -6.7810, -14.2131],
          [-24.6872,  -6.3274, -13.4037,  ...,  -7.0139, -11.5496, -21.6858],
          [-22.4923, -12.7291, -19.3943,  ..., -17.3735, -16.0152, -15.2321],
          ...,
          [-22.1894, -22.1894, -22.1894,  ..., -22.1894, -22.1894, -22.1894],
          [-44.1821, -53.0161, -55.3146,  ..., -30.4371, -41.1404, -35.3810],
          [-46.2206, -55.3146, -55.3146,  ..., -36.6155, -47.1668, -36.4704]]]),
 tensor(0))

In [29]:
data_test.head()

Unnamed: 0,row_id,file_id,bird,end_time,label
0,soundscape_1000170626_akiapo_5,soundscape_1000170626,akiapo,5,0.0
1,soundscape_1000170626_akiapo_10,soundscape_1000170626,akiapo,10,0.0
2,soundscape_1000170626_akiapo_15,soundscape_1000170626,akiapo,15,0.0


In [30]:
# test_ds[0]

***
# 5 Neural Network

In [31]:
class AudioClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        conv_layers = []

        # CONV BLOCK 1
        self.conv_1 = torch.nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu_1 = torch.nn.ReLU()
        self.batchnorm_1 = torch.nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv_1.weight, a=0.1)
        self.conv_1.bias.data.zero_()
        conv_layers += [self.conv_1, self.relu_1, self.batchnorm_1]

        # CONV BLOCK 2
        self.conv_2 = torch.nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu_2 = torch.nn.ReLU()
        self.batchnorm_2 = torch.nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv_2.weight, a=0.1)
        self.conv_2.bias.data.zero_()
        conv_layers += [self.conv_2, self.relu_2, self.batchnorm_2]

        # CONV BLOCK 3
        self.conv_3 = torch.nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu_3 = torch.nn.ReLU()
        self.batchnorm_3 = torch.nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv_3.weight, a=0.1)
        self.conv_3.bias.data.zero_()
        conv_layers += [self.conv_3, self.relu_3, self.batchnorm_3]

        # CONV BLOCK 4
        self.conv_4 = torch.nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu_4 = torch.nn.ReLU()
        self.batchnorm_4 = torch.nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv_4.weight, a=0.1)
        self.conv_4.bias.data.zero_()
        conv_layers += [self.conv_4, self.relu_4, self.batchnorm_4]

        self.ap = torch.nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = torch.nn.Linear(in_features=64, out_features=LABELS_COUNT)

        self.conv = torch.nn.Sequential(*conv_layers)

    def forward(self, x):
        x = self.conv(x)
        x = self.ap(x)
        x = x.view(x.shape[0], -1)
        x = self.lin(x)
        return x

In [32]:
model = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
next(model.parameters()).device

device(type='cuda', index=0)

***
# 6 Training

In [33]:
NUM_EPOCHS = 5

In [34]:
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.NLLLoss()
torch_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                            steps_per_epoch=int(len(train_dl)),
                                            epochs=NUM_EPOCHS,
                                            anneal_strategy='linear')
scheduler = LRScheduler(torch_scheduler)

val_metrics = {
    "accuracy": Accuracy(),
    "nll": Loss(loss_fn)
}

trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)


trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
ProgressBar(persist=True).attach(trainer, ["loss"])

# @trainer.on(Events.ITERATION_COMPLETED(every=1))
# def log_training_loss(trainer):
#     print(f"Epoch[{trainer.state.epoch}] Loss: {trainer.state.output:.2f}")

# @trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(train_dl)
    metrics = evaluator.state.metrics
    print(f"Training Resulsts - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Av loss: {metrics['nll']:.2f}")

# @trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    evaluator.run(test_dl)
    metrics = evaluator.state.metrics
    print(f"Validation Resulsts - Epoch: {trainer.state.epoch} Avg accuracy: {metrics['accuracy']:.2f} Av loss: {metrics['nll']:.2f}")

In [None]:
trainer.run(train_dl, max_epochs=NUM_EPOCHS)

In [None]:
trainer.state.output