# Training Notebook for BirdCLEF2023

## Import all Dependencies (1)

In [1]:
import torch
import torchvision
import torchaudio
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import WeightedRandomSampler
import torch.nn.functional as F

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.metrics
from tqdm import tqdm

import os
import sys
import random
import time
import copy
import logging
import bisect
import json
import shutil

  from .autonotebook import tqdm as notebook_tqdm


## Define a CONFIG class containing all relevant hyperparameters

In [2]:
class LabelSmoothingBCEWithLogitsLoss(nn.Module):
    def __init__(self, smooth_eps=0.0025, weight=None, reduction="mean"):
        super(LabelSmoothingBCEWithLogitsLoss, self).__init__()
        self.smooth_eps = smooth_eps
        self.weight = weight
        self.reduction = reduction
        self.bce_with_logits_loss = nn.BCEWithLogitsLoss(weight=self.weight, reduction=self.reduction)

    def forward(self, input, target):
        target_smooth = torch.clamp(target.float(), self.smooth_eps, 1.0 - self.smooth_eps)
        target_smooth = target_smooth + (self.smooth_eps / target.size(1))
        return self.bce_with_logits_loss(input, target_smooth)

In [3]:
### DO NOT CHANGE UNLESS RE-GENERATING DATASET IMAGES ###
class Config_Mel():
    def __init__(self) -> None:
        
        # Device
        self.device = 'cpu'
        
        # Dataset Path
        self.birdclef2023 = 'birdclef-2023'

        # Out path
        self.outpath_images = self.birdclef2023 + '_MelSpectrograms'

        self.melSpecTransform = torchaudio.transforms.AmplitudeToDB()

        # Audio Features
        self.sample_rate = 32000
        self.n_fft=2048
        self.f_min=40
        self.f_max=15000
        self.hop_length=512
        self.n_mels=128
        self.mel_args = {'sample_rate': self.sample_rate,
                         'n_fft': self.n_fft,
                         'f_min': self.f_min,
                         'f_max': self.f_max,
                         'hop_length': self.hop_length,
                         'n_mels': self.n_mels}
### DO NOT CHANGE UNLESS RE-GENERATING DATASET IMAGES ###

class Config():
    def __init__(self) -> None:

        self.run_environment = 'local' # 'kaggle' 'local' 'colab'
        self.load_pretrained_weights = False
        self.rerun_split = False
        self.use_5_second_dataset = True
        self.uniform_sampler = True
        self.training_data_per_epoch = 0.25
        self.soft_second_label = 0.3
        self.class_weighting = True
        self.softmax_prob = True
        self.use_mixup = True
        self.criterion = 'LabelSmoothingBCEWithLogitsLoss' # 'LabelSmoothingBCEWithLogitsLoss' # 'CrossEntropyLoss' # 'BCEWithLogitsLoss'
        self.epochs = 25
        self.masking = True
        self.masking_prob = 0.5
        self.frac_nocall = 0.01
        self.use_nocall = False
        self.addbackground_prob = 0.5

        # Device
        if (self.run_environment == 'kaggle') or (self.run_environment == 'colab'):
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        
        # Dataset Path
        if self.run_environment == 'kaggle':
            self.birdclef2023_melspectrograms = '/kaggle/input/birdclef-2023-melspectrograms/birdclef-2023_MelSpectrograms'
            self.birdclef2023_melspectrograms_5_seconds = '/kaggle/input/birdclef-2023-melspectrograms-5-seconds/birdclef-2023_MelSpectrograms_5_seconds'
            self.birdclef2023 = '/kaggle/input/birdclef-2023'
            self.birdclef2021_background_noise = '/kaggle/input/birdclef2021-background-noise/ff1010bird_nocall'
        elif self.run_environment == 'local':
            self.birdclef2023_melspectrograms = '/home/colin/elec5305/ele5305_research_project/birdclef-2023_MelSpectrograms'
            self.birdclef2023_melspectrograms_5_seconds = '/home/colin/elec5305/ele5305_research_project/birdclef-2023_MelSpectrograms_5_seconds'
            self.birdclef2023 = '/home/colin/elec5305/ele5305_research_project/birdclef-2023'
            self.birdclef2021_background_noise = '/home/colin/elec5305/ele5305_research_project/birdclef2021_background_noise/ff1010bird_nocall'
        elif self.run_environment == 'colab':
            self.birdclef2023_melspectrograms = ''

        # Out path
        if self.run_environment == 'kaggle':
            self.outpath = '/kaggle/working/results'
        elif self.run_environment == 'local':
            self.outpath = 'results'
        elif self.run_environment == 'colab':
            self.outpath = ''

        # Train/Validation Split 
        self.val_frac = 0.1

        # Dataloader options
        self.num_workers = 2
        self.train_batch_size = 64
        self.valid_batch_size = 32

        # Model name
        self.model_name = 'tf_efficientnet_b0.ns_jft_in1k'
        # Pretrained
        if self.run_environment == 'kaggle':
            self.pretrained_weights = '/kaggle/input/weights12/model_weights.pth'
        elif self.run_environment == 'local':
            self.pretrained_weights = '/home/colin/elec5305/ele5305_research_project/weights/model_weights.pth'


        # Optimizer Settings
        self.lr=5e-4
        self.weight_decay = 1e-3
        self.momentum=0.9
        self.optimizer = 'adam' # 'adam', 'sgd'

        self.scheduler = 'cosineannealing'
        self.eta_min = 1e-6
        self.T_mult = 1
        self.last_epoch = -1

        self.mixup_alpha = 0.5

        # Training Settings
        self.print_every_n_batches = 25
        self.patience = 5
        self.fix_features = False

        # Image Transforms
        self.train_transforms = torchvision.transforms.Compose([
                    torchaudio.transforms.AmplitudeToDB(),
                    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                    torchvision.transforms.RandomResizedCrop(size=(128, 312), scale = (0.75, 1.0), antialias=True), 
                    ])
        
        self.val_transforms = torchvision.transforms.Compose([
                    torchaudio.transforms.AmplitudeToDB(),
                    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                    torchvision.transforms.Resize(size=[128,312], antialias=True),
                    # torchvision.transforms.RandomResizedCrop(size=(128, 312), scale = (0.75, 1.0), antialias=True), 
                    ])
        
        self.test_transforms = torchvision.transforms.Compose([
                    transforms.ToTensor(),
                    # torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                    # torchvision.transforms.Resize(size=(224, 224), antialias=True),  # Or Resize(antialias=True)
                    ])

        # Audio Transforms
        self.train_transforms_audio = None
        
        self.val_transforms_audio = None
        
        self.test_transforms_audio = None


        # Audio Features
        self.sample_rate = 32000
        self.period = 5

        # Mel Spectrogram Parameters
        self.n_fft=2048
        self.f_min=40
        self.f_max=15000
        self.hop_length=512
        self.n_mels=128
        self.mel_args = {'n_fft': self.n_fft,
                         'f_min': self.f_min,
                         'f_max': self.f_max,
                         'hop_length': self.hop_length,
                         'n_mels': self.n_mels}
        
        if self.run_environment == 'colab':
            from google.colab import drive
            drive.mount('/content/drive')

            notebook_path = 'My Drive/elec5305'

            env_path = f'/content/drive/{notebook_path}'
            # Add the handout folder to python paths
            if env_path not in sys.path:
                sys.path.append(env_path)

            # zip_path = os.path.join(env_path, 'birdclef-2023_MelSpectrograms.zip')
            zip_path = '/content/drive/MyDrive/elec5305/birdclef-2023_MelSpectrograms.zip'
            shutil.unpack_archive(zip_path, "content/")
            print(zip_path)
            # !unzip zip_path -d "/content"

            # Dataset path
            self.birdclef2023_melspectrograms = '/content/content/birdclef-2023_MelSpectrograms'

            # Output path
            self.outpath = os.path.join(env_path, 'results')
            os.makedirs(self.outpath, exist_ok=True)# !pip install --force-reinstall numpy==1.22.1
            
            %pip install -q torchtoolbox timm

            %pip install timm torchtoolbox

        if self.run_environment != 'local':
            # !pip install --force-reinstall numpy==1.22.1
            %pip install -q torchtoolbox timm
        
CONFIG = Config()

## Import all dependencies (2)

In [4]:
from torchtoolbox.tools import mixup_data, mixup_criterion
from torch.nn.functional import cross_entropy
import timm

## Define Network

In [5]:
class MelSpectrogramLayer(nn.Module):
    def __init__(self, sample_rate, n_fft, hop_length, n_mels, transform):
        super(MelSpectrogramLayer, self).__init__()
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        self.transform = transform

    def forward(self, waveform):
        mel_spectrogram = self.mel_transform(waveform)

        batched = True
        if mel_spectrogram.dim() == 2:
            batched = False
        elif mel_spectrogram.dim() == 3:
            batched = True

        if batched == True:
            if self.training and torch.rand(1) >= CONFIG.masking_prob and CONFIG.masking == True:
                mel_spectrogram = torchaudio.transforms.FrequencyMasking(
                    freq_mask_param=mel_spectrogram.shape[1] // 5
                )(mel_spectrogram)
                mel_spectrogram = torchaudio.transforms.TimeMasking(
                    time_mask_param=mel_spectrogram.shape[2] // 5
                )(mel_spectrogram)
        else:
            if self.training and torch.rand(1) >= CONFIG.masking_prob and CONFIG.masking == True:
                mel_spectrogram = torchaudio.transforms.FrequencyMasking(
                    freq_mask_param=mel_spectrogram.shape[0] // 5
                )(mel_spectrogram)
                mel_spectrogram = torchaudio.transforms.TimeMasking(
                    time_mask_param=mel_spectrogram.shape[1] // 5
                )(mel_spectrogram)


        if batched == True:
            mel_spectrogram = mel_spectrogram.unsqueeze(1)
            mel_spectrogram = mel_spectrogram.expand(-1, 3, -1, -1)
        else:
            mel_spectrogram = mel_spectrogram.unsqueeze(0)
            mel_spectrogram = mel_spectrogram.expand(3, -1, -1)

        mel_spectrogram = self.transform(mel_spectrogram)

        mel_spectrogram = torch.nan_to_num(mel_spectrogram)
        
        return mel_spectrogram

class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + \
                '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
                ', ' + 'eps=' + str(self.eps) + ')'
    
    
class Mel_Classifier(nn.Module):
    def __init__(self, mel_generator: MelSpectrogramLayer, model_name="tf_efficientnet_b4_ns", embedding_size=768, pretrained=True, num_classes = 264):
        super(Mel_Classifier, self).__init__()
        self.mel_generator = mel_generator
        self.model = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.embedding = nn.Linear(in_features, embedding_size)
        self.fc = nn.Linear(embedding_size, num_classes)

    def forward(self, images):
        images = self.mel_generator(images)
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features)
        output = self.fc(embedding)
        return output

# class Mel_Classifier(torch.nn.Module):
#     def __init__(self, model_name, mel_generator: MelSpectrogramLayer, num_classes = 264, pretrained = True):
#         super().__init__()
#         self.num_classes = num_classes

#         self.mel_generator = mel_generator

#         self.backbone = timm.create_model(model_name, pretrained=pretrained)

#         if 'res' in model_name:
#             self.in_features = self.backbone.fc.in_features
#             self.backbone.fc = nn.Linear(self.in_features, num_classes)
#         elif 'dense' in model_name:
#             self.in_features = self.backbone.classifier.in_features
#             self.backbone.classifier = nn.Linear(self.in_features, num_classes)
#         elif 'efficientnet' in model_name:
#             self.in_features = self.backbone.classifier.in_features
#             self.backbone.classifier = nn.Sequential(
#                 nn.Linear(self.in_features, num_classes)
#             )

#     def forward(self,x):
#         x = self.mel_generator(x)
#         x = self.backbone(x)
#         return x




#####################################3
    

# https://www.kaggle.com/code/leonshangguan/faster-eb0-sed-model-inference

# def init_layer(layer):
#     nn.init.xavier_uniform_(layer.weight)

#     if hasattr(layer, "bias"):
#         if layer.bias is not None:
#             layer.bias.data.fill_(0.)


# def init_bn(bn):
#     bn.bias.data.fill_(0.)
#     bn.weight.data.fill_(1.0)


# def init_weights(model):
#     classname = model.__class__.__name__
#     if classname.find("Conv2d") != -1:
#         nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
#         model.bias.data.fill_(0)
#     elif classname.find("BatchNorm") != -1:
#         model.weight.data.normal_(1.0, 0.02)
#         model.bias.data.fill_(0)
#     elif classname.find("GRU") != -1:
#         for weight in model.parameters():
#             if len(weight.size()) > 1:
#                 nn.init.orghogonal_(weight.data)
#     elif classname.find("Linear") != -1:
#         model.weight.data.normal_(0, 0.01)
#         model.bias.data.zero_()


# def interpolate(x: torch.Tensor, ratio: int):
#     """Interpolate data in time domain. This is used to compensate the
#     resolution reduction in downsampling of a CNN.
#     Args:
#       x: (batch_size, time_steps, classes_num)
#       ratio: int, ratio to interpolate
#     Returns:
#       upsampled: (batch_size, time_steps * ratio, classes_num)
#     """
#     (batch_size, time_steps, classes_num) = x.shape
#     upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
#     upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
#     return upsampled


# def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int):
#     """Pad framewise_output to the same length as input frames. The pad value
#     is the same as the value of the last frame.
#     Args:
#       framewise_output: (batch_size, frames_num, classes_num)
#       frames_num: int, number of frames to pad
#     Outputs:
#       output: (batch_size, frames_num, classes_num)
#     """
#     output = torch.nn.functional.interpolate(
#         framewise_output.unsqueeze(1),
#         size=(frames_num, framewise_output.size(2)),
#         align_corners=True,
#         mode="bilinear").squeeze(1)

#     return output

# class AttBlockV2(nn.Module):
#     def __init__(self,
#                  in_features: int,
#                  out_features: int,
#                  activation="linear"):
#         super().__init__()

#         self.activation = activation
#         self.att = nn.Conv1d(
#             in_channels=in_features,
#             out_channels=out_features,
#             kernel_size=1,
#             stride=1,
#             padding=0,
#             bias=True)
#         self.cla = nn.Conv1d(
#             in_channels=in_features,
#             out_channels=out_features,
#             kernel_size=1,
#             stride=1,
#             padding=0,
#             bias=True)

#         self.init_weights()

#     def init_weights(self):
#         init_layer(self.att)
#         init_layer(self.cla)

#     def forward(self, x):
#         # x: (n_samples, n_in, n_time)
#         norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
#         cla = self.nonlinear_transform(self.cla(x))
#         x = torch.sum(norm_att * cla, dim=2)
#         return x, norm_att, cla

#     def nonlinear_transform(self, x):
#         if self.activation == 'linear':
#             return x
#         elif self.activation == 'sigmoid':
#             return torch.sigmoid(x)

# class Mel_Classifier(torch.nn.Module):
#     def __init__(
#         self, 
#         model_name: str,
#         mel_generator: MelSpectrogramLayer,
#         # config=None,
#         pretrained=True, 
#         num_classes=264, 
#         in_channels=3
#     ):
#         super().__init__()
        
#         # self.config = config

#         self.mel_generator = mel_generator

#         # self.bn0 = nn.BatchNorm2d(self.config.n_mels)

#         base_model = timm.create_model(
#             model_name, 
#             pretrained=pretrained, 
#             num_classes=0,
#             global_pool="",
#             in_chans=in_channels,
#         )
        
#         layers = list(base_model.children())[:-2]
#         self.backbone = nn.Sequential(*layers)

#         in_features = base_model.num_features

#         self.fc1 = nn.Linear(in_features, in_features, bias=True)
#         self.att_block = AttBlockV2(
#             in_features, num_classes, activation="linear")

#         self.init_weight()

#     def init_weight(self):
#         # init_bn(self.bn0)
#         init_layer(self.fc1)
        
#     def forward(self, input_data):
#         input_data = self.mel_generator(input_data)

#         # if self.config.in_channels == 3:
#         x = input_data
#         # else:
#         #     x = input_data[:, [0], :, :] # (batch_size, 1, time_steps, mel_bins)

#         # frames_num = x.shape[2]

#         # x = x.transpose(1, 3)
#         # x = self.bn0(x)
#         # x = x.transpose(1, 3)


#         # x = x.transpose(2, 3)

#         x = self.backbone(x)
        
#         # Aggregate in frequency axis
#         x = torch.mean(x, dim=2)

#         x1 = torch.nn.functional.max_pool1d(x, kernel_size=3, stride=1, padding=1)
#         x2 = torch.nn.functional.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
#         x = x1 + x2

#         x = x.transpose(1, 2)
#         x = torch.nn.functional.relu_(self.fc1(x))
#         x = x.transpose(1, 2)

#         (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

#         output_dict = {
#             "clipwise_output": clipwise_output,
#         }

#         # return output_dict
#         return clipwise_output

## Create a Dataloader Class

In [6]:
class BirdCLEF2023(torch.nn.Module):
    def __init__(self, datapath: list, metadata_df, audio_transforms, sample_rate, period, soft_second_label, inherited_species_list = None, backgroundData = None, *args, **kwargs) -> None:
        super().__init__()

        # Default values
        self.sample_rate = sample_rate
        self.audio_transforms = audio_transforms
        self.period = period

        self.backgroundData = backgroundData

        self.df = metadata_df
        self.datapath = datapath

        self.soft_second_label = soft_second_label

        if len(self.datapath) == 2:
            self.audio_paths = [os.path.join(self.datapath[0],'train_audio'), os.path.join(self.datapath[1],'nocall')]
        else:
            self.audio_paths = [os.path.join(self.datapath[0],'train_audio')]

        # Get species list
        if inherited_species_list is None:
            self.species = list(set(self.df['primary_label']))
        else:
            self.species = inherited_species_list

        
        return

    def __len__(self):
        # length = self.df['cumulative_images'][-1]
        length = len(list(self.df['primary_label']))
        return length


    def __getitem__(self, idx):

        # Get row in df
        dict_idx = dict(self.df.iloc[idx])

        # Get labels as torch tensors
        primary_label = torch.tensor([1 if dict_idx['primary_label'] == label else 0 for label in self.species],dtype=float)
        secondary_label = torch.tensor([1 if label in dict_idx['secondary_labels'] else 0 for label in self.species], dtype=float)
        combined_label = self._prepare_target(main_tgt=primary_label, sec_tgt=secondary_label)
        dict_idx['combined_label_tensor'] = combined_label
        dict_idx['primary_label_tensor'] = primary_label
        dict_idx['secondary_label_tensor'] = secondary_label

        # Load audio
        if dict_idx['primary_label'] == 'nocall':
            idx_dataset = 1
        else:
            idx_dataset = 0
        ogg_file = os.path.join(self.audio_paths[idx_dataset],dict(self.df.iloc[idx])['filename'])
        waveform, sample_rate = torchaudio.load(ogg_file)
        waveform = waveform.ravel()

        # Get clip of length self.period
        target_audio_length = sample_rate * self.period
        current_audio_length = len(waveform)
        if current_audio_length >= target_audio_length:
            start = random.randint(0,current_audio_length - target_audio_length - 1)
            waveform_seg = waveform[start:start+target_audio_length]
        else:
            padding_length = target_audio_length - current_audio_length
            waveform_seg = torch.nn.functional.pad(waveform, (0, padding_length), 'constant', 0)

        resampler = torchaudio.transforms.Resample(orig_freq=self.sample_rate, new_freq=self.sample_rate)
        waveform_seg = resampler(waveform_seg)

        if random.uniform(0,1) > CONFIG.addbackground_prob and self.backgroundData is not None:
            idx = random.randint(0,len(self.backgroundData)-1)
            backgroundNoise = self.backgroundData[idx][0]
            waveform_seg += backgroundNoise

        return waveform_seg, combined_label

    # https://github.com/VSydorskyy/BirdCLEF_2023_1st_place/blob/main/code_base/datasets/wave_dataset.py, changed
    def _prepare_target(self, main_tgt, sec_tgt, all_labels=None):
        all_tgt = main_tgt + sec_tgt * self.soft_second_label
        all_tgt = torch.clamp(all_tgt, 0.0, 1.0)
        return all_tgt

In [7]:
class NoCallDataset(torch.nn.Module):
    def __init__(self, datapath, metadata_df, audio_transforms, sample_rate, period, inherited_species_list=None, *args, **kwargs) -> None:

        # Default values
        self.sample_rate = sample_rate
        self.audio_transforms = audio_transforms
        self.period = period

        self.df = metadata_df
        self.datapath = datapath

        self.soft_second_label = 0

        self.audio_paths = [0,os.path.join(self.datapath[1],'nocall')]


        # Get species list
        if inherited_species_list is None:
            self.species = list(set(self.df['primary_label']))
        else:
            self.species = inherited_species_list

        return
    
    def __len__(self):
        # length = self.df['cumulative_images'][-1]
        length = len(list(self.df['primary_label']))
        return length


    def __getitem__(self, idx):

        # Get row in df
        dict_idx = dict(self.df.iloc[idx])

        # Get labels as torch tensors
        primary_label = torch.tensor([1 if dict_idx['primary_label'] == label else 0 for label in self.species],dtype=float)
        secondary_label = torch.tensor([1 if label in dict_idx['secondary_labels'] else 0 for label in self.species], dtype=float)
        combined_label = self._prepare_target(main_tgt=primary_label, sec_tgt=secondary_label)
        dict_idx['combined_label_tensor'] = combined_label
        dict_idx['primary_label_tensor'] = primary_label
        dict_idx['secondary_label_tensor'] = secondary_label

        # Load audio
        if dict_idx['primary_label'] == 'nocall':
            idx_dataset = 1
        else:
            idx_dataset = 0
        ogg_file = os.path.join(self.audio_paths[idx_dataset],dict(self.df.iloc[idx])['filename'])
        waveform, sample_rate = torchaudio.load(ogg_file)
        waveform = waveform.ravel()

        # Get clip of length self.period
        target_audio_length = sample_rate * self.period
        current_audio_length = len(waveform)
        if current_audio_length >= target_audio_length:
            start = random.randint(0,current_audio_length - target_audio_length - 1)
            waveform_seg = waveform[start:start+target_audio_length]
        else:
            padding_length = target_audio_length - current_audio_length
            waveform_seg = torch.nn.functional.pad(waveform, (0, padding_length), 'constant', 0)

        resampler = torchaudio.transforms.Resample(orig_freq=self.sample_rate, new_freq=self.sample_rate)
        waveform_seg = resampler(waveform_seg)

        return waveform_seg, combined_label

    # https://github.com/VSydorskyy/BirdCLEF_2023_1st_place/blob/main/code_base/datasets/wave_dataset.py, changed
    def _prepare_target(self, main_tgt, sec_tgt, all_labels=None):
        all_tgt = main_tgt + sec_tgt * self.soft_second_label
        all_tgt = torch.clamp(all_tgt, 0.0, 1.0)
        return all_tgt

In [8]:
# https://www.kaggle.com/code/nischaydnk/split-creating-melspecs-stage-1
def birds_stratified_split(df, target_col, test_size=0.2):
    class_counts = df[target_col].value_counts()
    low_count_classes = class_counts[class_counts < 2].index.tolist() ### Birds with single counts

    df['train'] = df[target_col].isin(low_count_classes)

    train_df, val_df = train_test_split(df[~df['train']], test_size=test_size, stratify=df[~df['train']][target_col], random_state=42)

    train_df = pd.concat([train_df, df[df['train']]], axis=0).reset_index(drop=True)

    # Remove the 'valid' column
    train_df.drop('train', axis=1, inplace=True)
    val_df.drop('train', axis=1, inplace=True)

    return train_df, val_df

## Uniform Sampler

In [9]:
def make_uniformSampler(dataset):
    n_data = len(dataset)
    classes_lsit = dataset.species

    count_int = [0] * len(classes_lsit)
    nocall_count = 0

    # Get class counts
    for i in range(len(dataset)):
        if dataset.df.iloc[i]['primary_label'] == 'nocall':
            nocall_count += 1
        else:
            species_index = classes_lsit.index(dataset.df.iloc[i]['primary_label'])
            count_int[species_index] += 1

    # Calculate class weights
    n_call = sum(count_int)
    n_nocall = nocall_count
    if (CONFIG.use_nocall == True) and n_nocall != 0:
        class_weights = np.array(count_int) / n_call * (1 - CONFIG.frac_nocall)
        nocall_weights = nocall_count / n_nocall * CONFIG.frac_nocall
    else:
        class_weights = np.array(count_int) / n_call * (1 - CONFIG.frac_nocall)

    sample_weights = [0] * n_data

    # Assign class weights to samples
    for i in range(n_data):
        if dataset.df.iloc[i]['primary_label'] == 'nocall':
            sample_weights[i] = nocall_weights **  -1
        else:
            species_index = classes_lsit.index(dataset.df.iloc[i]['primary_label'])
            sample_weights[i] = class_weights[species_index] ** -1

    # Normalize
    sample_weights = sample_weights / sum(sample_weights)

    # Create sampler
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=n_data)

    return sampler

In [10]:
def make_class_weights(dataset):
    n_data = len(dataset)
    classes_lsit = dataset.species

    count_int = [0] * len(classes_lsit)
    nocall_count = 0

    for i in range(len(dataset)):
        if dataset.df.iloc[i]['primary_label'] == 'nocall':
            nocall_count += 1
        else:
            species_index = classes_lsit.index(dataset.df.iloc[i]['primary_label'])
            count_int[species_index] += 1

    n_call = sum(count_int)
    class_weights = (np.array(count_int) / n_call) ** -0.5   * len(classes_lsit)

    return class_weights

## Make Datasets and Dataloaders

In [11]:
# Load dataframe
df = pd.read_csv(os.path.join(CONFIG.birdclef2023, 'train_metadata.csv'))

# Get Split
train_df, val_df = birds_stratified_split(df=df, target_col='primary_label', test_size=CONFIG.val_frac)

# Get global species list
species_list = list(set(df['primary_label']))

# Initialize Datasets
train_class_kwargs = {  'sample_rate': CONFIG.sample_rate,
                        'n_fft': CONFIG.n_fft,
                        'f_min': CONFIG.f_min,
                        'f_max': CONFIG.f_max,
                        'hop_length': CONFIG.hop_length,
                        'n_mels': CONFIG.n_mels,
                        'period': CONFIG.period,
                        'device': CONFIG.device,
                        'transform': CONFIG.train_transforms,
                        'soft_second_label': CONFIG.soft_second_label
                     }

valid_class_kwargs = {   'sample_rate': CONFIG.sample_rate,
                        'n_fft': CONFIG.n_fft,
                        'f_min': CONFIG.f_min,
                        'f_max': CONFIG.f_max,
                        'hop_length': CONFIG.hop_length,
                        'n_mels': CONFIG.n_mels,
                        'period': CONFIG.period,
                        'device': CONFIG.device,
                        'transform': CONFIG.val_transforms
                    }

#Make No Call Dataframes
df_nocall = pd.read_csv(os.path.join(CONFIG.birdclef2021_background_noise, 'ff1010bird_metadata_v1.csv'))
df_train_nocall, df_valid_nocall = train_test_split(df_nocall, test_size=CONFIG.val_frac)

# Concatenate dataframes
df_train_full = pd.concat([train_df, df_train_nocall], axis = 0)
df_valid_full = pd.concat([val_df, df_valid_nocall], axis = 0)

# Reset the index to create a new index for the concatenated DataFrame
df_train_full.reset_index(drop=True, inplace=True)
df_valid_full.reset_index(drop=True, inplace=True)

# Make Datasets
train_dataset_nocall = NoCallDataset(datapath=[0,CONFIG.birdclef2021_background_noise], metadata_df=df_train_nocall, audio_transforms=CONFIG.train_transforms_audio, sample_rate=CONFIG.sample_rate, period=CONFIG.period, inherited_species_list=species_list)
valid_dataset_nocall = NoCallDataset(datapath=[0,CONFIG.birdclef2021_background_noise], metadata_df=df_valid_nocall, audio_transforms=CONFIG.val_transforms_audio, sample_rate=CONFIG.sample_rate, period=CONFIG.period, inherited_species_list=species_list)

# Make dataset
if CONFIG.use_nocall == True:
    train_dataset = BirdCLEF2023(datapath=[CONFIG.birdclef2023, CONFIG.birdclef2021_background_noise], metadata_df=df_train_full, audio_transforms=CONFIG.train_transforms_audio ,sample_rate=CONFIG.sample_rate, soft_second_label=CONFIG.soft_second_label, period=CONFIG.period, inherited_species_list=species_list)
    valid_dataset = BirdCLEF2023(datapath=[CONFIG.birdclef2023, CONFIG.birdclef2021_background_noise], metadata_df=df_valid_full, audio_transforms=CONFIG.val_transforms_audio, sample_rate=CONFIG.sample_rate, soft_second_label=CONFIG.soft_second_label, period=CONFIG.period, inherited_species_list=species_list)
else:
    train_dataset = BirdCLEF2023(datapath=[CONFIG.birdclef2023], metadata_df=train_df, audio_transforms=CONFIG.train_transforms_audio ,sample_rate=CONFIG.sample_rate, soft_second_label=CONFIG.soft_second_label, period=CONFIG.period, inherited_species_list=species_list, backgroundData=train_dataset_nocall)
    valid_dataset = BirdCLEF2023(datapath=[CONFIG.birdclef2023], metadata_df=val_df, audio_transforms=CONFIG.val_transforms_audio, sample_rate=CONFIG.sample_rate, soft_second_label=CONFIG.soft_second_label, period=CONFIG.period, inherited_species_list=species_list, backgroundData=valid_dataset_nocall)

if CONFIG.uniform_sampler == False:
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,num_workers=CONFIG.num_workers, batch_size=CONFIG.train_batch_size, shuffle = True, pin_memory = True)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,num_workers=CONFIG.num_workers, batch_size=CONFIG.valid_batch_size, shuffle = True, pin_memory = True)
else:
    train_sampler = make_uniformSampler(dataset=train_dataset)
    valid_sampler = make_uniformSampler(dataset=valid_dataset)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,num_workers=CONFIG.num_workers, batch_size=CONFIG.train_batch_size, pin_memory = True, sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,num_workers=CONFIG.num_workers, batch_size=CONFIG.valid_batch_size, pin_memory = True, sampler=valid_sampler)

In [12]:
def test_sampler(dataloader, epochs = 1):

    species_list = dataloader.dataset.species
    species_count = torch.tensor([0] * len(species_list))
    total_samples = 0

    for epoch in range(epochs):
        for i, data in enumerate(tqdm(dataloader, desc="Processing")):
            labels = data[1]

            batch_size = labels.shape[0]

            label_idx = list(labels.argmax(1))
            species_count[label_idx] += 1
            total_samples += batch_size

    return species_count.numpy(), total_samples


# count_int_train, n_train = test_sampler(dataloader=train_loader)
# count_int_val, n_val = test_sampler(dataloader=valid_loader)

# penguin_means = {
#     'Training Set': count_int_train / n_train,
#     'Validation Set': count_int_val / n_val
# }



# x = np.arange(len(species_list))  # the label locations
# width = 0.35  # the width of the bars
# multiplier = 0

# fig, ax = plt.subplots(layout='constrained', figsize=(24,12))
# colors = ['crimson','midnightblue']
# for i,(attribute, measurement) in enumerate(penguin_means.items()):
#     offset = width * i
#     rects = ax.bar(x + offset, measurement, width, label=attribute, color=colors[i], alpha=0.7)
#     multiplier += 1

# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Fraction of Dataset [-]')
# ax.set_title('Distribution of Classes in Validation and Training Data')

# # ax.set_xticks(x + width / 2, classes_lsit)
# # ax.set_xticks(x + width / 2)

# ax.legend(loc='upper left', ncols=2)
# ax.set_ylim(0, 0.1)
# plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.5)

# plt.show()



## Define Metric Function as on Kaggle

In [13]:
def padded_cmap(solution, submission, padding_factor=5):
    solution = solution.drop(['row_id'], axis=1, errors='ignore')
    submission = submission.drop(['row_id'], axis=1, errors='ignore')
    new_rows = []
    for i in range(padding_factor):
        new_rows.append([1 for i in range(len(solution.columns))])
    new_rows = pd.DataFrame(new_rows)
    new_rows.columns = solution.columns
    padded_solution = pd.concat([solution, new_rows]).reset_index(drop=True).copy()
    padded_submission = pd.concat([submission, new_rows]).reset_index(drop=True).copy()
    score = sklearn.metrics.average_precision_score(
        padded_solution.values,
        padded_submission.values,
        average='macro',
    )
    return score

## Define Training Function

In [14]:
def train_with_mixup(X, y, y_pred, criterion):
    X, y_a, y_b, lam = mixup_data(X, y, alpha=CONFIG.mixup_alpha)
    loss_mixup = mixup_criterion(criterion, y_pred, y_a, y_b, lam) #cross_entropy
    return loss_mixup

def train_net(net, trainloader, valloader, criterion, optimizer, scheduler, epochs=1, patience = 3, savePth = 'project2_weights.pth', print_every_samples = 20, device = 'cpu'):

    print('Using device: {}'.format(device))
    net.to(device)
    criterion.to(device)

    if CONFIG.softmax_prob == True:
        toProb = torch.nn.Softmax(dim=1)
    else:
        toProb = torch.nn.Identity()

    validation_loss_list = [0] * epochs
    training_loss_list = [0] * epochs
    validation_accuracy_list = [0] * epochs
    training_accuracy_list = [0] * epochs
    cmap_5_list = [0] * epochs

    best_state_dictionary = None
    best_validation_cmap = 0.0
    inertia = 0
    for epoch in range(epochs):

        training_loss = 0.0
        training_accuracy = 0.0
        running_loss = 0.0
        # Set model to training mode
        net.mel_generator.transform = CONFIG.train_transforms
        net = net.train()

        # Calculate the number of batches to loop over
        num_batches_to_loop = int(CONFIG.training_data_per_epoch * len(trainloader))
        with tqdm(enumerate(trainloader, 0), total=num_batches_to_loop, desc="Training Batches Epoch {} / {}".format(epoch + 1, epochs)) as train_pbar:
            for i, data in train_pbar:
        
                # get the inputs
                if device == 'cuda':
                    inputs, labels = data[0].to(device), data[1].to(device)
                else:
                    inputs, labels = data

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = net(inputs)

                if CONFIG.use_mixup:
                    loss_value = train_with_mixup(inputs, labels, outputs, criterion=criterion)
                else:
                    loss_value = criterion(outputs,labels)

                loss_value.backward()
                optimizer.step()

                # print statistics and write to log
                running_loss += loss_value.item()
                training_loss += loss_value.item()

                train_pbar.set_postfix(loss=(running_loss / ((i + 1) * trainloader.batch_size)))
                training_accuracy += (outputs.argmax(1) == labels.argmax(1)).sum().item()

                if type(scheduler).__name__ != 'NoneType':
                    scheduler.step(epoch + i / len(trainloader))

                if i >= num_batches_to_loop:
                    break

        training_loss = training_loss / (len(trainloader.dataset) * CONFIG.training_data_per_epoch)
        training_loss_list[epoch] = training_loss
        training_accuracy = 100 * training_accuracy / (len(trainloader.dataset) * CONFIG.training_data_per_epoch)
        training_accuracy_list[epoch] = training_accuracy

        print('Batch {:5d} / {:5d}: Training Loss = {:.3f}, Training Accuracy = {:.3f}'.format(epoch + 1, epochs, training_loss, training_accuracy))

        running_loss = 0.0
        val_loss = 0.0
        correct = 0
        predictions_array = np.zeros((len(valloader.dataset), len(valloader.dataset.species)), dtype=float)
        solutions_array = np.zeros((len(valloader.dataset), len(valloader.dataset.species)), dtype=float)
        # Set model to validation mode
        net.mel_generator.transform = CONFIG.val_transforms
        net = net.eval()
        with tqdm(enumerate(valloader, 0), total=len(valloader), desc="Validation Batches Epoch {} / {}".format(epoch + 1, epochs)) as val_pbar:
            for i, data in val_pbar:
                # get the inputs
                if device == 'cuda':
                    inputs, labels = data[0].to(device), data[1].to(device)
                else:
                    inputs, labels = data
                    
                # forward + backward + optimize
                outputs = net(inputs)
                loss_value = criterion(outputs, labels)

                # print statistics and write to log
                running_loss += loss_value.item()
                val_loss += loss_value.item()

                # Get model output and label to array
                curr_predictions_array = toProb(outputs).detach().cpu().numpy()
                predictions_array[i*valloader.batch_size:(i+1)*valloader.batch_size,:] = curr_predictions_array
                hardlabels = labels.detach().cpu().numpy()
                hardlabels[hardlabels < 0.99] = 0
                curr_solutions_array = hardlabels
                solutions_array[i*valloader.batch_size:(i+1)*valloader.batch_size,:] = curr_solutions_array

                # Update progress bar
                val_pbar.set_postfix(loss=(running_loss / ((i + 1) * trainloader.batch_size)))
                correct += (outputs.argmax(1) == labels.argmax(1)).sum().item()
        
        # Get cMAP
        cmap_5 = padded_cmap(solution=pd.DataFrame(solutions_array), submission=pd.DataFrame(predictions_array), padding_factor=5)

        # Get Metrics
        val_loss = val_loss / len(valloader.dataset)
        validation_loss_list[epoch] = val_loss
        val_accuracy = 100 * correct / len(valloader.dataset)
        validation_accuracy_list[epoch] = val_accuracy
        cmap_5_list[epoch] = cmap_5

        print('Batch {:5d} / {:5d}: Validation Loss = {:.3f}, Validation Accuracy = {:.3f}, cmap score = {:.3f}'.format(epoch + 1, epochs, val_loss, val_accuracy, cmap_5))

        save_weights = os.path.join(savePth,'model_weights.pth')
        if cmap_5 > best_validation_cmap:
            best_validation_cmap = cmap_5
            best_state_dictionary = copy.deepcopy(net.state_dict())
            # save network
            torch.save(best_state_dictionary, save_weights)
            inertia = 0
            print('Epoch {:5d} / {:5d} saved: New Best Epoch!'.format(epoch + 1, epochs))
        else:
            inertia += 1
            if inertia == patience:
                if best_state_dictionary is None:
                    raise Exception("State dictionary should have been updated at least once")
                break
        # print(f"Validation accuracy: {val_accuracy}")

    print('Finished Training')

    output = {'validation_loss': validation_loss_list,
              'validation_accuracy': validation_accuracy_list,
              'training_loss': training_loss_list,
              'training_accuracy': training_accuracy_list,
              'cmap_5_scores': cmap_5_list}
    
    return output

In [15]:
def get_model(config):
    melspec_layer = MelSpectrogramLayer(sample_rate=config.sample_rate,
                                    n_fft=config.n_fft,
                                    hop_length=config.hop_length,
                                    n_mels=config.n_mels,
                                    transform=config.train_transforms)
    network = Mel_Classifier(model_name=config.model_name,
                            mel_generator=melspec_layer)
    
    if config.load_pretrained_weights == True:
        print('Load PreTrained Weigths')
        network.load_state_dict(torch.load(config.pretrained_weights, map_location=config.device))

    if config.fix_features == True:
        for param in network.backbone.parameters():
            param.requires_grad = False
    
    return network

def get_criterion(config):
    
    if config.criterion == 'LabelSmoothingBCEWithLogitsLoss':
        if config.class_weighting == True:
            class_weights = torch.tensor(make_class_weights(train_dataset))
            criterion = LabelSmoothingBCEWithLogitsLoss(weight=class_weights)
        else:
            criterion = LabelSmoothingBCEWithLogitsLoss()

    if config.criterion == 'CrossEntropyLoss':
        if config.class_weighting == True:
            class_weights = torch.tensor(make_class_weights(train_dataset))
            criterion = nn.CrossEntropyLoss(weight=class_weights)
        else:
            criterion = nn.CrossEntropyLoss()

    if config.criterion == 'BCEWithLogitsLoss':
        if config.class_weighting == True:
            class_weights = torch.tensor(make_class_weights(train_dataset))
            criterion = nn.BCEWithLogitsLoss(weight=class_weights)
        else:
            criterion = nn.BCEWithLogitsLoss()

    return criterion

def get_optimizer(config, params):

    if config.optimizer == 'adam':
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, params), lr=config.lr, weight_decay=config.weight_decay)
    elif config.optimizer == 'sgd':
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, params), lr=config.lr, momentum=config.momentum)

    return optimizer

def get_scheduler(config, optimizer):

    if config.scheduler == 'cosineannealing':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                            optimizer, 
                            T_0=config.epochs, 
                            T_mult=config.T_mult, 
                            eta_min=config.eta_min, 
                            last_epoch=config.last_epoch
                        )
    elif config.scheduler == None:
        config.scheduler = None

    return scheduler
    

## Run Training Procedure

In [16]:
def main_train():

    # Change Output path
    folder_name = time.strftime('%Y-%m-%d_%H-%M-%S')
    outpath = os.path.join(CONFIG.outpath, folder_name)
    CONFIG.outpath = outpath

    # Create Output directory
    os.makedirs(CONFIG.outpath, exist_ok=True)

    # Get all variables to json file
    config_dict = {attr: value for attr, value in vars(CONFIG).items()}
    outputName = 'hyperparameters.json'
    jsonpath = os.path.join(CONFIG.outpath, outputName)
    with open(jsonpath, 'w') as json_file:
        json.dump(str(config_dict), json_file)

    # Get model
    network = get_model(config = CONFIG)
    
    # Get loss function
    criterion = get_criterion(config = CONFIG)
    
    # Get optimizer
    optimizer = get_optimizer(config = CONFIG, params = network.parameters())

    # Get scheduler
    scheduler = get_scheduler(config = CONFIG, optimizer = optimizer)

    # Train Net
    output = train_net( net=network,
                        trainloader=train_loader,
                        valloader=valid_loader,
                        criterion=criterion,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        epochs=CONFIG.epochs,
                        device=CONFIG.device,
                        print_every_samples=CONFIG.print_every_n_batches,
                        savePth=CONFIG.outpath,
                        patience=CONFIG.patience
                        )
    
    # Save Output
    outputName = 'training_prog.json'
    jsonpath = os.path.join(CONFIG.outpath, outputName)
    with open(jsonpath, 'w') as json_file:
        json.dump(output, json_file)

In [17]:
main_train()

Using device: cpu


Training Batches Epoch 1 / 25: 100%|██████████| 59/59 [10:30<00:00, 10.69s/it, loss=0.00114]

Batch     1 /    25: Training Loss = 0.001, Training Accuracy = 0.341



Validation Batches Epoch 1 / 25:   0%|          | 0/53 [00:00<?, ?it/s]

## Kaggle: Zip Directory

In [None]:
import zipfile
import os
from IPython.display import FileLink

# https://www.kaggle.com/code/hari31416/downloading-file-and-directory-from-kaggle
def zip_dir(directory = os.curdir, file_name = 'directory.zip'):
    """
    zip all the files in a directory
    
    Parameters
    _____
    directory: str
        directory needs to be zipped, defualt is current working directory
        
    file_name: str
        the name of the zipped file (including .zip), default is 'directory.zip'
        
    Returns
    _____
    Creates a hyperlink, which can be used to download the zip file)
    """
    os.chdir(directory)
    zip_ref = zipfile.ZipFile(file_name, mode='w')
    for folder, _, files in os.walk(directory):
        for file in files:
            if file_name in file:
                pass
            else:
                zip_ref.write(os.path.join(folder, file))

    return FileLink(file_name)

if CONFIG.run_environment == 'kaggle':
    zip_dir()

In [None]:
# TODO: Fix issue that not all labels are in validation set
# TODO: Fix progress bar
# TODO: adam?
# TODO: cMAP
# TODO: Train with mixup    
# TODO: Dataset metrics
# TODO: Uniform sampler accross classes
# TODO: class weights
# TODO: use only sub-dataset per epoch?
# TODO: consider second label
# TODO: Inference Script

# TODO: add no-call samples
# TODO: augmentations
# TODO: include rating

Try class weights by 1st rank:
sample_weights = (
    all_primary_labels.value_counts() / 
    all_primary_labels.value_counts().sum()
)  ** (-0.5)

Also by 1st rank:

Small inference tricks

    Using temperature mean: pred = (pred**2).mean(axis=0) ** 0.5
    Using Attention SED probs * 0.75 + Max Timewise probs * 0.25

All these gave marginal improvements but it is was a matter of first 3 places :) 