In [None]:
import os
from google.colab import files, drive
# Mount Google Drive so the project folder is reachable from the Colab VM
drive.mount('/content/drive')
# Start from the filesystem root so the relative DRIVE_PATH below resolves under /content
os.chdir('/')
DRIVE_PATH = 'content/drive/MyDrive/Colab Notebooks/kaggle_rainforest'
# Collect the names of the directories directly inside the current working directory
f = []
for (dirpath, dirnames, filenames) in os.walk('.'):
    f.extend(dirnames)
    break  # only the first (top-level) entry yielded by os.walk is needed
    
# Only change the directory if the "models" folder is not accessible 
if 'models' not in f:
    os.chdir(f'{DRIVE_PATH}')

In [None]:
!nvidia-smi

In [None]:
!pip install transformers resnest efficientnet_pytorch &> /dev/null

In [None]:
import sys
import random
import math
import datetime
import time
import h5py
import multiprocessing

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split

import torch
import torchvision.transforms as transforms

import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup

import resnest.torch as resnest_torch
from efficientnet_pytorch import EfficientNet

In [None]:
def seed_everything(seed):
    """
    Seeds every random number generator used in the notebook so that runs are repeatable.
    Arguments:
        seed {int} - Seed applied to Python's random module, NumPy and PyTorch (CPU and all GPUs)
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU (silently ignored when CUDA is unavailable)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different convolution algorithm on each run,
    # which undoes the determinism requested above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False

In [None]:
# Number of DataLoader worker processes (one per CPU core reported by the OS)
NUM_WORKERS = multiprocessing.cpu_count()
# Base name of the HDF5 file under mel/ that extract_dataset reads
MEL_FILE = 'mel_32_128_2'
# Number of species classes: size of the one-hot targets and of the model output
NUM_CLASSES = 24
# Length in seconds of the window cut_train places around a labelled call before random cropping
CUT = 6
# Length in seconds of the crop that cut_train / cut_test produce
SAMPLE_TIME = 4
# Size the transforms resize every crop to
HEIGHT, WIDTH = 300, 300
# Duration in seconds of one recording; used to convert seconds into spectrogram columns
AUDIO_LENGTH = 60

In [None]:
def extract_dataset(label, h5file):
    """
    Extracts data from the h5 file and stores into a dictionary
    Arguments:
        label {String} - train or test set
        h5file {String} - filename (without extension) of the hdf5 file inside the mel/ folder
    Returns:
        dictionary {recording_id: [h, w]}
    """
    # The context manager releases the HDF5 handle once everything has been read.
    # Iterating an h5py dataset yields in-memory arrays, so the values stay usable after the file is closed.
    with h5py.File(f'mel/{h5file}.hdf5', 'r') as dataset:
        recording_ids = [i.decode('utf-8') for i in dataset[f'{label}_labels']]
        data = {k: v for k, v in tqdm(zip(recording_ids, dataset[f'{label}_files']), total=len(recording_ids))}
    return data

# Keep both spectrogram sets in memory so training never has to touch the HDF5 file again
train_data, test_data = (extract_dataset(split, MEL_FILE) for split in ('train', 'test'))

# Label table for training and the submission template listing the test recordings
df_tp = pd.read_csv('train_tp.csv')
df_test = pd.read_csv('sample_submission.csv')

In [None]:
# Identity matrix used as a lookup table: row i is the one-hot target vector for species i
ONE_HOT = np.eye(NUM_CLASSES)

In [None]:
def create_test_df(dfx, stride=1, sample_time=None, audio_length=None):
    """
    Takes the test dataframe and creates a new dataframe with multiple time ranges for each recording id.
    This is in order for the test dataframe to mimic the train dataframe.
    Arguments:
        dfx {DataFrame} - Submission file with each row being a unique recording_id
        stride {int} - Seconds between the starts of two consecutive windows (default: {1})
        sample_time {int} - Window length in seconds (default: {None} -> SAMPLE_TIME)
        audio_length {int} - Recording length in seconds (default: {None} -> AUDIO_LENGTH)
    Returns
        DataFrame - One row per (recording_id, window) with columns recording_id, t_min and t_max,
                    ordered by recording_id and then by t_min
    """
    # Module constants are resolved at call time so the defaults track the notebook configuration
    sample_time = SAMPLE_TIME if sample_time is None else sample_time
    audio_length = AUDIO_LENGTH if audio_length is None else audio_length

    starts = np.arange(0, audio_length - stride, stride)

    # The ids are looked up by column name (not by position) and sorted, which is the order a
    # groupby on recording_id would produce; missing ids are dropped, as groupby does.
    ids = np.array(sorted(dfx['recording_id'].dropna().unique()), dtype=object)

    return pd.DataFrame({
        'recording_id': np.repeat(ids, len(starts)),
        't_min': np.tile(starts, len(ids)),
        't_max': np.tile(starts + sample_time, len(ids)),
    })


def smooth_projections(dfx):
    """
    Takes the dataframe and creates a running average of each score through time and then
    takes the max value at every point. The dataframe is then pivoted to create the submission
    file where every row is a single recording and the columns are all the class scores.
    Arguments:
        dfx {DataFrame} - Input dataframe where each row is a single strong label projection.
                          Must contain recording_id, fold, t_min and t_max columns next to the
                          score columns. NOTE: it is sorted in place by t_min.
    Returns
        DataFrame - Submission formatted dataframe with columns fold, recording_id, s0, s1, ...
    """
    
    def get_average(row):
        # `row` is the sub-frame of one (recording_id, fold) group.
        # Smooth out with a running average over 3 consecutive rows and then take the max value across the whole group
        row.drop(['t_min', 't_max', 'fold'], axis=1, inplace=True)
        row = row.rolling(3).mean()
        row = row.max().T
        return row
    
    # Rows must be in time order for the rolling window; this mutates the caller's frame
    dfx.sort_values(['t_min'], inplace=True)
    # Produces one row per (recording_id, fold, column name) with the smoothed maximum as value
    # NOTE(review): the fixed four-column rename below presumably depends on how the installed
    # pandas version shapes the groupby.apply result - confirm when upgrading pandas
    dfx = dfx.groupby(['recording_id', 'fold']).apply(get_average).reset_index()
    dfx.columns = ['recording_id', 'fold', 'column', 'score']

    # Back to wide format: one column per original column name
    dfx = pd.pivot_table(dfx, index=['recording_id', 'fold'], columns=['column'], values=['score']).reset_index()
    # NOTE(review): assumes recording_id survived get_average as one of the pivoted columns - confirm
    dfx.drop([('score', 'recording_id')], axis=1, inplace=True)
    # Flatten the two-level column index: keep the class name for score columns, the key name otherwise
    dfx.columns = [i if i != 'score' else j for i, j in dfx.columns]
    # Every column except fold and recording_id is expected to be a class score s0..s{n-1}
    dfx = dfx[['fold', 'recording_id'] + [f's{i}' for i in range(len(dfx.columns) - 2)]]
    return dfx

In [None]:
def count_parameters(model, all=False):
    """
    Adds up the number of scalar weights held by a model
    Arguments:
        model {torch module} - Model whose parameters are counted
        all {bool} - Count frozen parameters too instead of only trainable ones (default: {False})
    Returns:
        int - Number of parameters
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters only contribute when the caller asked for everything
        if all or param.requires_grad:
            total += param.numel()
    return total

In [None]:
"""
Image augmentation classes. SpecAugment and GaussianNoise are not used in the final models;
MonoToColor is part of the transform pipelines defined in Config.
"""
class SpecAugment(object):
    """
    Randomly zeroes frequency bands and time spans of a (channel, freq, time) spectrogram.
    With probability `prob` the transform applies `num_mask` rounds; each round blanks one
    frequency band and one time span whose widths are drawn uniformly between 0 and the
    respective maximum fraction of the axis. The input is modified in place and returned.
    """
    def __init__(self, prob=0.5, num_mask=2, freq_masking_max_percentage=0.05, time_masking_max_percentage=0.15):
        self.prob = prob
        self.num_mask = num_mask
        self.freq_masking_max_percentage = freq_masking_max_percentage
        self.time_masking_max_percentage = time_masking_max_percentage

    @staticmethod
    def _random_band(size, max_fraction):
        # Width comes from Python's RNG, the start offset from NumPy's RNG
        width = int(random.uniform(0.0, max_fraction) * size)
        start = int(np.random.uniform(low=0.0, high=size - width))
        return start, start + width

    def __call__(self, spec):
        if not bool(torch.rand(1) < self.prob):
            return spec

        for _ in range(self.num_mask):
            _, n_freqs, n_frames = spec.shape

            lo, hi = self._random_band(n_freqs, self.freq_masking_max_percentage)
            spec[:, lo:hi, :] = 0

            lo, hi = self._random_band(n_frames, self.time_masking_max_percentage)
            spec[:, :, lo:hi] = 0

        return spec

    def __repr__(self):
        return self.__class__.__name__


class MonoToColor(object):
    """
    Converts a single-channel spectrogram array into a 3-channel uint8 image.
    The array is standardized, rescaled to [0, 255] and copied into three identical channels
    stacked on a new last axis.
    Arguments:
        eps {float} - Added to the std before dividing; also the minimum value range below
                      which the output is all zeros (default: {1e-6})
        mean {float} - Fixed mean for standardization, None to use the input's mean (default: {None})
        std {float} - Fixed std for standardization, None to use the input's std (default: {None})
    """
    def __init__(self, eps=1e-6, mean=None, std=None):
        self.mean = mean
        self.std = std
        self.eps = eps

    def __call__(self, X):
        # Standardize. Explicit None checks are required: `self.mean or X.mean()` would
        # silently discard a configured value of 0.
        mean = X.mean() if self.mean is None else self.mean
        std = X.std() if self.std is None else self.std
        X = (X - mean) / (std + self.eps)

        # Normalize to [0, 255]
        _min, _max = X.min(), X.max()

        if (_max - _min) > self.eps:
            V = (255 * (X - _min) / (_max - _min)).astype(np.uint8)
        else:
            # (Almost) constant input: nothing to scale, return a black image
            V = np.zeros_like(X, dtype=np.uint8)

        return np.stack([V, V, V], axis=-1)

    def __repr__(self):
        return self.__class__.__name__


class GaussianNoise(object):
    """
    Adds scaled Gaussian noise to a tensor with probability `prob`.
    The noise is drawn as randn * std + mean and then multiplied by sigma.
    The input tensor is modified in place and returned.
    """
    def __init__(self, mean=0., std=0.5, sigma=0.3, prob=0.5):
        self.mean, self.std = mean, std
        self.sigma, self.prob = sigma, prob

    def __call__(self, tensor):
        if not bool(torch.rand(1) < self.prob):
            return tensor

        noise = (torch.randn(tensor.size()) * self.std + self.mean) * self.sigma
        tensor += noise
        return tensor

    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [None]:
def get_model(name, num_classes=1):
    """
    Loads a pretrained backbone and stacks a small classification head on top of it.
    Supports ResNest, ResNext-wsl, EfficientNet, ResNext, ResNet and DenseNet.
    The backbone's own final linear layer is replaced by a BatchNorm1d over its input features;
    the head is Dropout -> Linear(64) -> ReLU -> BatchNorm1d -> Dropout -> Linear(num_classes).
    Arguments:
        name {str} - Name of the model to load
        num_classes {int} - Number of classes to use (default: {1})
    Returns:
        torch model - nn.Sequential of the pretrained backbone followed by the head
    Raises:
        NotImplementedError - If the name matches none of the supported families
    """

    if "resnest" in name:
        model = getattr(resnest_torch, name)(pretrained=True)
    elif "wsl" in name:
        model = torch.hub.load("facebookresearch/WSL-Images", name)
    elif "resnext" in name or "resnet" in name or 'densenet' in name:
        model = torch.hub.load("pytorch/vision:v0.6.0", name, pretrained=True)
    elif "efficientnet" in name:
        model = EfficientNet.from_pretrained(name)
    else:
        raise NotImplementedError(f"Unsupported model name: {name}")

    # Name of the attribute holding the backbone's final linear layer. Chosen by explicit
    # family checks; a substring test on "se" would only route DenseNet correctly by accident,
    # because "densenet" happens to contain "se".
    if "efficientnet" in name:
        head_attr = "_fc"
    elif "densenet" in name:
        head_attr = "classifier"
    else:
        head_attr = "fc"

    nb_ft = getattr(model, head_attr).in_features
    delattr(model, head_attr)
    setattr(model, head_attr, nn.BatchNorm1d(nb_ft))

    layers = 64
    dropout = 0.3

    modules = [
        model,
        nn.Dropout(dropout),
        nn.Linear(nb_ft, layers),
        nn.ReLU(),
        nn.BatchNorm1d(layers),
        nn.Dropout(dropout),
        nn.Linear(layers, num_classes),
    ]

    return nn.Sequential(*modules)

In [None]:
def cut_train(x):
    """
    Crops a training sample to a random SAMPLE_TIME-second slice around its labelled call.
    A CUT-second window is centred on the label (and shifted back inside 0..AUDIO_LENGTH if it
    would overflow); a random sub-window SAMPLE_TIME seconds long is then taken from it, so the
    crop only includes part of the audio around the label.
    Arguments:
        x {dict} - Sample with 'audio_spec' (time on axis 1), 't_min' and 't_max' in seconds
    Returns:
        dict - The same dict with 'audio_spec' replaced by the cropped spectrogram
    """
    spec = x['audio_spec']
    frames_per_second = spec.shape[1] / AUDIO_LENGTH

    midpoint = (x['t_max'] + x['t_min']) / 2
    window_end = midpoint + CUT / 2
    window_start = midpoint - CUT / 2

    # Amount by which the window sticks out past either end of the recording
    overshoot = max(0, window_end - AUDIO_LENGTH)
    undershoot = -min(0, window_start)
    first = max(0, window_start) - overshoot
    last = min(AUDIO_LENGTH, window_end) + undershoot

    # Shrink the window to SAMPLE_TIME seconds at a random offset inside it
    margin = (CUT - SAMPLE_TIME) / 2
    jitter = np.random.uniform(-margin, margin)
    first = first + (jitter + margin)
    last = last + (jitter - margin)

    lo = int(first * frames_per_second)
    hi = int(last * frames_per_second)

    x['audio_spec'] = spec[:, lo:hi]
    return x


def cut_test(x):
    """
    Crops a sample to the SAMPLE_TIME-second window centred between its t_min and t_max.
    A window that would extend outside 0..AUDIO_LENGTH is shifted back inside the recording.
    Test rows have no real labels; their t_min / t_max ranges are generated by create_test_df.
    Arguments:
        x {dict} - Sample with 'audio_spec' (time on axis 1), 't_min' and 't_max' in seconds
    Returns:
        dict - The same dict with 'audio_spec' replaced by the cropped spectrogram
    """
    spec = x['audio_spec']
    frames_per_second = spec.shape[1] / AUDIO_LENGTH

    half = SAMPLE_TIME / 2
    midpoint = (x['t_max'] + x['t_min']) / 2

    # Amount by which the window sticks out past either end of the recording
    overshoot = max(0, midpoint + half - AUDIO_LENGTH)
    undershoot = -min(0, midpoint - half)

    first = max(0, midpoint - half) - overshoot
    last = min(AUDIO_LENGTH, midpoint + half) + undershoot

    lo = int(first * frames_per_second)
    hi = int(last * frames_per_second)

    x['audio_spec'] = spec[:, lo:hi]
    return x


class AudioDataset(Dataset):
    """
    Dataset over labelled rows (recording_id, species_id, t_min, t_max).
    Each item is a dict holding the cropped (and optionally transformed) spectrogram together
    with the row's metadata and its one-hot target. Training mode uses the random crop of
    cut_train, otherwise the centred crop of cut_test is used.
    """
    def __init__(self, df, transform=None, data=train_data, train=True):
        self.df = df
        self.data = data
        self.train = train
        self.transform = transform
        self.files = df['recording_id'].values
        # One-hot targets, precomputed once in row order
        self.y = [ONE_HOT[int(species)] for species in df['species_id'].values]

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx, :]
        recording_id = self.files[idx]

        sample = {
            'audio_spec': self.data[recording_id],
            'recording_id': recording_id,
            'species_id': row['species_id'],
            't_min': row['t_min'],
            't_max': row['t_max'],
            'target': self.y[idx],
        }

        crop = cut_train if self.train else cut_test
        sample = crop(sample)

        if self.transform is not None:
            sample['audio_spec'] = self.transform(sample['audio_spec']).numpy()

        return sample


class TestDataset(Dataset):
    """
    Dataset over unlabelled rows (recording_id, t_min, t_max).
    Each item is a dict with the spectrogram cropped by cut_test (and transformed when a
    transform is given) under 'audio_spec', plus the 'recording_id'.
    """
    def __init__(self, df, transform=None, data=test_data):
        self.data = data
        self.files = df['recording_id'].values
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx: int):
        recording_id = self.files[idx]
        X = self.data[recording_id]
        data = self.df.iloc[idx, :]

        output = {
            'audio_spec': X,
            't_min': data['t_min'],
            't_max': data['t_max']
        }

        output = cut_test(output)

        # Fall back to the raw crop so that a dataset built without a transform still works
        # (the result variable would otherwise be unbound)
        image = output['audio_spec']
        if self.transform is not None:
            image = self.transform(image).numpy()

        return {
            'audio_spec': image,
            'recording_id': recording_id
        }

In [None]:
"""
Competition metric functions from https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418
"""
def _one_sample_positive_class_precisions(scores, truth):
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)

    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)

    retrieved_classes = np.argsort(scores)[::-1]

    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)

    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True

    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)

    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(np.float)))
    return pos_class_indices, precision_at_hits


def lwlrap(truth, scores):
    """
    Label-weighted label-ranking average precision, returned per class.
    Arguments:
        truth {numpy array} - (samples, classes) array, values > 0 mark the positive labels
        scores {numpy array} - (samples, classes) array of predicted scores
    Returns:
        numpy array - Per-class lwlrap
        numpy array - Per-class weight (that class's share of all positive labels)
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape

    precisions = np.zeros((num_samples, num_classes))
    for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
        hit_classes, hit_precisions = _one_sample_positive_class_precisions(sample_scores, sample_truth)
        precisions[row, hit_classes] = hit_precisions

    labels_per_class = (truth > 0).sum(axis=0)
    weight_per_class = labels_per_class / float(np.sum(labels_per_class))

    # Classes without any positive label divide by 1 instead of 0
    per_class_lwlrap = precisions.sum(axis=0) / np.maximum(1, labels_per_class)
    return per_class_lwlrap, weight_per_class


"""
AverageMeter and MetricMeter objects from https://www.kaggle.com/gopidurgaprasad/rfcs-audio-detection-pytorch-stater
"""
class AverageMeter(object):
    """Keeps the latest value and the running weighted average of a metric"""

    def __init__(self):
        self.reset()

    def reset(self):
        # Forget everything seen so far
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # `val` is the mean over `n` items, so it enters the total with weight n
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


class MetricMeter(object):
    """
    Accumulates targets and predictions and scores them with lwlrap.
    Predictions are passed through a sigmoid when they are added.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        # Start with empty target / prediction lists
        self.y_true, self.y_pred = [], []

    def update(self, y_true, y_pred):
        targets = y_true.detach().cpu().numpy().tolist()
        probabilities = torch.sigmoid(y_pred).detach().cpu().numpy().tolist()
        self.y_true.extend(targets)
        self.y_pred.extend(probabilities)

    @property
    def avg(self):
        # Weighted sum of the per-class scores over everything accumulated so far
        per_class, weights = lwlrap(np.array(self.y_true), np.array(self.y_pred))
        self.score = (per_class * weights).sum()

        return {"lwlrap" : self.score}

"""
The below score and loss objects are used to created a History object (similar to that in tensorflow),
which stores the full results from each run for easy visualization and logging.
"""
class Score(object):
    """
    Records a series of values and tracks the best one.
    `target` is 'max' when larger is better; any other value selects min as the comparison.
    """
    def __init__(self, target='max'):
        self.target = target
        self.target_func = min if target != 'max' else max
        self.reset()

    def reset(self):
        self.values = []
        self.best_index = None
        # Start from the worst possible value so that the first update always wins
        self.best = -float('inf') if self.target != 'min' else float('inf')

    def update(self, value):
        position = len(self.values)
        self.values.append(value)
        self.best = self.target_func(self.best, value)
        # A tie with the current best also moves the index to the newest value
        if value == self.best:
            self.best_index = position

    @property
    def improved(self):
        # True when the most recent value is the best one
        return len(self.values) - 1 == self.best_index

    
class ScoreLoss(object):
    """
    Pairs a Score tracker for the metric with a Score tracker for the loss.
    Arguments:
        score_target {str} - 'max' or 'min', direction in which the metric improves (default: {'max'})
        loss_target {str} - 'max' or 'min', direction in which the loss improves (default: {'min'})
    """
    def __init__(self, score_target='max', loss_target='min'):
        # Store the caller's choice rather than hard-coded literals
        self.score_target = score_target
        self.loss_target = loss_target
        self.reset()

    def reset(self):
        self.score = Score(target=self.score_target)
        self.loss = Score(target=self.loss_target)

    def update(self, loss, score):
        self.score.update(score)
        self.loss.update(loss)

    @property
    def score_improved(self):
        return self.score.improved

    @property
    def loss_improved(self):
        return self.loss.improved


class History(object):
    """
    Per-epoch record of train / validation loss and score for one training run.
    It also keeps the validation predictions and a snapshot of the model weights from the
    epoch with the best validation score.
    Arguments:
        score_target {str} - 'max' or 'min', direction in which the score improves (default: {'max'})
        loss_target {str} - 'max' or 'min', direction in which the loss improves (default: {'min'})
    """
    def __init__(self, score_target='max', loss_target='min'):
        # Store the caller's choice rather than hard-coded literals
        self.score_target = score_target
        self.loss_target = loss_target
        self.reset()

    def reset(self):
        self.val = ScoreLoss(score_target=self.score_target, loss_target=self.loss_target)
        self.train = ScoreLoss(score_target=self.score_target, loss_target=self.loss_target)
        self.pred = None
        self.model_weights = None

    def update(self, loss, score, val_loss, val_score):
        self.val.update(val_loss, val_score)
        self.train.update(loss, score)

    def update_pred(self, pred, model):
        """Stores predictions and weights if the last recorded validation score is the best so far"""
        if self.score_improved:
            self.pred = pred
            # state_dict() hands out references to the live parameter tensors, which later
            # optimizer steps overwrite in place. Each tensor is copied (to CPU, to spare GPU
            # memory) so that the snapshot really keeps the best epoch's weights.
            snapshot = model.state_dict()
            for key, value in list(snapshot.items()):
                snapshot[key] = value.detach().cpu().clone()
            self.model_weights = snapshot

    @property
    def history(self):
        return {
            'loss': self.train.loss.values,
            'score': self.train.score.values,
            'val_loss': self.val.loss.values,
            'val_score': self.val.score.values
        }

    @property
    def best_result(self):
        # Validation loss is reported at the epoch of the best validation score
        return {
            'score': self.val.score.best,
            'loss': self.val.loss.values[self.val.score.best_index]
        }

    @property
    def score_improved(self):
        return self.val.score_improved

    @property
    def loss_improved(self):
        return self.val.loss_improved

In [None]:
def plot_history(history, filename):
    """
    Prints the best validation epoch and plots the score and loss curves of a run.
    Arguments:
        history {History class} - history object either from tensorflow or from the object outlined above
        filename {String} - path where a png copy of the graph is saved
    """
    validation = history.history["val_loss"]
    score = history.history["val_score"]
    best_score = max(score)
    index_score = score.index(best_score)
    print(f'epoch: {index_score + 1}, loss: {validation[index_score]:.4f}, score: {best_score:.4}')

    plt.style.use('dark_background')
    plt.rcParams.update({'font.size': 15, 'axes.xmargin': 0})

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20,8))
    ax1.plot(history.history['score'])
    ax1.plot(history.history['val_score'])
    ax1.set_title('score')
    ax1.set_ylabel('score')
    ax1.set_xlabel('epoch')
    ax1.legend(['train', 'val'], loc='upper left')

    ax2.plot(history.history['loss'])
    ax2.plot(history.history['val_loss'])
    ax2.set_title('loss')
    ax2.set_ylabel('loss')
    ax2.set_xlabel('epoch')
    ax2.legend(['train', 'val'], loc='upper right')

    # Save before showing: once show() has rendered the figure, a later pyplot-level
    # savefig can write an empty canvas. Saving is best effort and must not abort training.
    try:
        fig.savefig(filename)
    except Exception as err:
        print(f'Failed image save: {err}')
    plt.show()

In [None]:
def load_model_weights(model, filename, verbose=1, cp_folder="", strict=True):
    """
    Loads the weights of a PyTorch model. The checkpoint is always mapped to the CPU first,
    so checkpoints saved on a GPU also load on CPU-only machines.
    Arguments:
        model {torch module} - Model to load the weights to
        filename {str} - Name of the checkpoint
        verbose {int} - Whether to display infos (default: {1})
        cp_folder {str} - Folder to load from (default: {''})
        strict {bool} - Whether the checkpoint keys must match the model exactly (default: {True})
    Returns:
        torch module - Model with loaded weights
    """
    path = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # load_state_dict expects the deserialized dict, not the path to it
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=strict)
    return model


def save_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Writes a model's state dict to disk
    Arguments:
        model {torch module} - Model whose weights are saved
        filename {str} - Name of the checkpoint file
        verbose {int} - Whether to print the destination (default: {1})
        cp_folder {str} - Folder to write into (default: {''})
    """
    destination = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Saving weights to {destination}\n")
    torch.save(model.state_dict(), destination)

In [None]:
def mixup_data(x, y, alpha=0.4):
    """
    Applies mixup to a batch: every sample is blended with another sample of the same batch
    Arguments:
        x {torch tensor} - Input batch
        y {torch tensor} - Labels
        alpha {float} - Parameter of the beta distribution, values <= 0 disable mixing (default: {0.4})
    Returns:
        torch tensor - Mixed input
        torch tensor - Labels of the original batch
        torch tensor - Labels of the shuffled batch
        float - Mixing weight sampled from the beta distribution (1 when alpha <= 0)
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # The permutation is drawn on the CPU and then moved next to the data, so the function
    # works on CPU tensors as well as CUDA tensors
    index = torch.randperm(x.size(0))

    mixed_x = lam * x + (1 - lam) * x[index.to(x.device), :]
    y_a, y_b = y, y[index.to(y.device)]

    return mixed_x, y_a, y_b, lam

def fit(
    model,
    train_dataset,
    val_dataset,
    epochs=30,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    alpha=0.4,
    mixup_proba=0.0,
    accum_gradient=None
):
    """
    Usual torch fit function
    Arguments:
        model {torch model} - Model to train
        train_dataset {torch dataset} - Dataset to train with
        val_dataset {torch dataset} - Dataset to validate with
        epochs {int} - Number of epochs (default: {30})
        batch_size {int} - Training batch size (default: {32})
        val_bs {int} - Validation batch size (default: {32})
        warmup_prop {float} - Warmup proportion (default: {0.1})
        lr {float} - Start (or maximum) learning rate (default: {1e-3})
        alpha {float} - alpha value for mixup (default: {0.4})
        mixup_proba {float} - Probability to apply mixup (default: {0.})
        accum_gradient {int} - Number of samples to accumulate gradients over before each
                               optimizer step; None steps after every batch (default: {None})
    Returns:
        history {History} - Full history of every epoch (score, loss and dict of best model weights)
    """
    losses = AverageMeter()
    scores = MetricMeter()
    history = History()

    optimizer = Adam(model.parameters(), lr=lr)
    loss_fct = nn.BCEWithLogitsLoss(reduction="mean").cuda()

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        pin_memory=True,
        num_workers=NUM_WORKERS,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        shuffle=False,
        pin_memory=True,
        num_workers=NUM_WORKERS
    )

    # For large models gpu restrictions require very small batch sizes, but we still want to
    # update the weights at larger effective batch sizes. With accum_gradient set, gradients
    # are accumulated over accum_gradient // batch_size batches before every optimizer step
    # (e.g. accum_gradient=32 and batch_size=8 steps after every 4th batch).
    # max(1, ...) guards against accum_gradient < batch_size, which would otherwise give 0.
    if accum_gradient is None:
        accum_steps = 1
    else:
        accum_steps = max(1, accum_gradient // batch_size)

    # The scheduler advances once per optimizer step, so its length is counted in optimizer
    # steps rather than in batches (identical when accum_steps == 1)
    batches_per_epoch = len(train_loader)
    updates_per_epoch = math.ceil(batches_per_epoch / accum_steps)

    num_warmup_steps = int(warmup_prop * epochs * updates_per_epoch)
    num_training_steps = int(epochs * updates_per_epoch)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        optimizer.zero_grad()

        scores.reset()
        losses.reset()

        for step, sample in enumerate(train_loader):
            X, y = sample['audio_spec'], sample['target']

            if np.random.rand() < mixup_proba:
                X, y_a, y_b, _ = mixup_data(X.cuda(), y.cuda(), alpha=alpha)
                # A mixed sample is labelled with the union of both label sets
                y = torch.clamp(y_a + y_b, 0, 1)

            y_pred = model(X.cuda())
            loss = loss_fct(y_pred, y.cuda().float())

            scores.update(y, y_pred)
            losses.update(loss.item(), len(X))

            # Every batch contributes its gradient; scaling by 1 / accum_steps keeps the
            # accumulated gradient an average over the accumulated batches.
            (loss / accum_steps).backward()

            # Step once enough batches are accumulated, and flush what is left at the end
            # of the epoch
            if (step + 1) % accum_steps == 0 or (step + 1) == batches_per_epoch:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        losses_val = AverageMeter()
        scores_val = MetricMeter()

        model.eval()
        total_loss = 0

        with torch.no_grad():
            preds = np.empty((0, NUM_CLASSES))

            for step, sample in enumerate(val_loader):
                X, y = sample['audio_spec'], sample['target']

                y_pred = model(X.cuda()).detach()
                loss = loss_fct(y_pred, y.cuda().float())

                # Weight each batch loss by its size so the epoch loss is a per-sample mean
                total_loss += loss.item() * len(y_pred)
                preds = np.concatenate([preds, torch.sigmoid(y_pred).cpu().numpy()])

        # np.array() first: building a tensor from a list of arrays is slow
        y, y_pred = torch.tensor(np.array(val_dataset.y)), torch.tensor(preds)

        total_loss /= len(preds)
        # NOTE: MetricMeter applies a sigmoid again to these probabilities; the transform is
        # monotonic, so the ranking-based score is unaffected
        scores_val.update(y, y_pred)
        losses_val.update(total_loss, 1)

        elapsed_time = time.time() - start_time
        current_lr = scheduler.get_last_lr()[0]

        print(f"Epoch {epoch + 1}/{epochs} \t lr={current_lr:.1e} \t t={elapsed_time:.0f}s \t loss={losses.avg:.4f} \t score={scores.avg['lwlrap']:.4f} \t val_loss={losses_val.avg:.4f} \t val_score={scores_val.avg['lwlrap']:.4f}")

        history.update(loss=losses.avg,
                        score=scores.avg['lwlrap'],
                        val_loss=losses_val.avg,
                        val_score=scores_val.avg['lwlrap'])

        # Keeps predictions and weights only when this epoch has the best validation score
        history.update_pred(pred=preds, model=model)

    torch.cuda.empty_cache()

    return history


def predict(model, dataset, batch_size=64, total=1):
    """
    Runs the model over a dataset and returns sigmoid probabilities
    Arguments:
        model {torch model} - Model to predict with
        dataset {torch dataset} - Dataset to predict on
        batch_size {int} - Batch size (default: {64})
        total {int} - Number of samples, only used to size the progress bar (default: {1})
    Returns:
        numpy array - Predictions, one row per sample and one column per class
    """
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=NUM_WORKERS,
    )

    model.eval()
    n_batches = math.ceil(total / batch_size)

    # The empty float64 head fixes the output dtype and the shape of an empty result
    chunks = [np.empty((0, NUM_CLASSES))]

    with torch.no_grad():
        for _, sample in tqdm(enumerate(loader), total=n_batches):
            logits = model(sample['audio_spec'].cuda())
            chunks.append(torch.sigmoid(logits).cpu().detach().numpy())

    return np.concatenate(chunks)

In [None]:
def train(config, df_train, df_val, fold):
    """
    Trains one fold: builds the model and datasets, fits, restores the best weights, then
    saves the training curves and the checkpoint under models/.
    Arguments:
        config {Config} - Run configuration (model name, transforms, hyper-parameters)
        df_train {DataFrame} - Training rows of this fold
        df_val {DataFrame} - Validation rows of this fold
        fold {int} - Fold index, used in the output file names
    Returns:
        history {History} - History returned by fit
    """
    print(f"    -> {len(df_train)} training samples")
    print(f"    -> {len(df_val)} validation samples")

    seed_everything(config.seed)
    model = get_model(config.selected_model, num_classes=NUM_CLASSES)
    if config.gpu:
        model = model.cuda()
    model.zero_grad()

    # Both datasets read from the training spectrograms; only the crop mode and transform differ
    train_dataset = AudioDataset(df_train, transform=config.train_transform, data=train_data)
    val_dataset = AudioDataset(df_val, transform=config.test_transform, data=train_data, train=False)

    print(f"    -> {count_parameters(model)} trainable parameters\n")

    fit_options = dict(
        epochs=config.epochs,
        batch_size=config.batch_size,
        val_bs=config.val_bs,
        lr=config.lr,
        warmup_prop=config.warmup_prop,
        alpha=config.alpha,
        mixup_proba=config.mixup_proba,
        accum_gradient=config.accum_gradient,
    )
    history = fit(model, train_dataset, val_dataset, **fit_options)

    # Put the weights stored in the history back before saving
    model.load_state_dict(history.model_weights)

    model_name = f'{config.selected_model}_{config.name}_{fold}'
    plot_history(history, f'models/{model_name}.png')
    save_model_weights(model, f"{model_name}.pt", cp_folder='models')

    return history

def test(config, dfx):
    """
    Predicts the test set with the saved model of every fold and combines the results.
    Arguments:
        config {Config} - Run configuration (model name, number of folds, transform, batch size)
        dfx {DataFrame} - Submission template with one row per recording_id
    Returns:
        DataFrame - Class scores per recording, combined over all folds
    """
    global test_data
    # Expand every recording into sliding time windows so it can be cropped like training data
    dfx = create_test_df(dfx)
    test_dataset = TestDataset(dfx, transform=config.test_transform, data=test_data)

    s_names = [f's{i}' for i in range(NUM_CLASSES)]
    df_results = []

    for fold in range(config.k):
        model = get_model(
            config.selected_model, num_classes=NUM_CLASSES
        )

        # Checkpoint name matches the one written by train() for this fold
        model = load_model_weights(
            model,
            f"{config.selected_model}_{config.name}_{fold}.pt",
            cp_folder=f'models',
        )

        if config.gpu:
            model = model.cuda()

        pred = predict(model=model, dataset=test_dataset, batch_size=config.batch_size, total=dfx.shape[0])

        # Store the per-window predictions of this fold together with their window metadata
        df_result = pd.DataFrame(data=pred, columns=s_names)
        df_result['recording_id'] = test_dataset.df['recording_id']
        df_result['t_min'] = test_dataset.df['t_min']
        df_result['t_max'] = test_dataset.df['t_max']
        df_result['fold'] = fold
        df_results.append(df_result)

    # Keep a copy of the raw per-window, per-fold predictions
    df_results = pd.concat(df_results, axis=0)
    df_results.to_csv('submissions/test_oof.csv', index=False)

    # Reduce the windows of every (recording, fold) to one score per class
    df_results = smooth_projections(df_results)
    df_results.drop(['fold'], axis=1, inplace=True)
    
    # Combine the folds. NOTE(review): this SUMS the fold scores rather than averaging them;
    # presumably acceptable because only the ranking of classes matters - confirm
    df_results = df_results.groupby(['recording_id'], as_index=False).agg(['sum'])
    # agg with a list yields two-level column labels; keep only the first level
    df_results.columns = [i for i, _ in df_results.columns]

    return df_results


def k_fold(config, dfx):
    """
    Trains the selected cross-validation folds and collects the out-of-fold predictions.
    Folds are grouped by recording_id so that the same recording never appears in two folds
    (possible leakage), as some samples come from one recording at different time ranges.
    Arguments:
        config {Config} - Run configuration (k, selected_folds and the training settings)
        dfx {DataFrame} - Labelled rows with species_id and recording_id columns
    Returns:
        DataFrame - Validation rows of all trained folds with their predictions in s0..s{n-1}
        float - Mean over the trained folds of each fold's best validation score
    """
    splitter = GroupKFold(n_splits=config.k)
    splits = list(splitter.split(dfx, dfx['species_id'], dfx['recording_id']))

    fold_frames, fold_histories, oof_predictions = [], [], []
    started = time.time()

    for fold, (train_idx, val_idx) in enumerate(splits):
        if fold not in config.selected_folds:
            continue

        print(f"\n-------------   Fold {fold + 1} / {config.k}  -------------\n")

        df_train = dfx.iloc[train_idx].copy()
        df_val = dfx.iloc[val_idx].copy()
        fold_frames.append(df_val)

        history = train(config, df_train, df_val, fold)
        fold_histories.append(history)
        oof_predictions.extend(history.pred.tolist())

    # Print out summary of the total time and the best scores averaged over the folds
    minutes, seconds = divmod(time.time() - started, 60)
    n_folds = len(fold_histories)
    avg_score = sum(h.best_result['score'] for h in fold_histories) / n_folds
    avg_loss = sum(h.best_result['loss'] for h in fold_histories) / n_folds
    print(f"\n-------------   AVG over folds -------------\n")
    print(f't={minutes}min val_loss={avg_loss:.4f} val_score={avg_score:.4f}')

    # Validation rows and predictions were appended in the same order, so they line up
    df_result = pd.concat(fold_frames, axis=0, ignore_index=True)
    oof_matrix = np.array(oof_predictions)
    for class_idx in range(NUM_CLASSES):
        df_result[f's{class_idx}'] = oof_matrix[:, class_idx]

    # Save a copy of the out-of-fold results
    df_result.to_csv('submissions/oof.csv', index=False)

    return df_result, avg_score

In [None]:
# ImageNet normalization
norm = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

class Config:
    # Run configuration; the class itself (not an instance) is passed to k_fold / test
    seed = 2020  # passed to seed_everything before each fold
    gpu = True  # move the model to CUDA

    k = 5  # number of GroupKFold splits
    selected_folds = [0,1,2,3,4]  # folds that k_fold actually trains

    name = "final"  # tag used in checkpoint and submission file names
    selected_model = 'efficientnet-b3'  # backbone name handed to get_model
    # selected_model = "resnest50_fast_1s1x64d"
    
    batch_size = 32  # training batch size (also used for test prediction)
    val_bs = 32  # validation batch size

    epochs = 200
    lr = 1e-3
    warmup_prop = 0.05  # share of the schedule spent on linear warmup
    accum_gradient = 32  # see fit(); equal to batch_size here

    mixup_proba = 0.  # probability of applying mixup to a batch (0 disables it)
    alpha = 0.1  # beta distribution parameter for mixup
    # Spectrogram -> 3-channel uint8 image -> PIL -> resize -> tensor -> ImageNet normalization
    train_transform = transforms.Compose([
                                MonoToColor(),
                                transforms.ToPILImage(),
                                transforms.Resize((HEIGHT,WIDTH)),
                                transforms.ToTensor(),
                                norm,
                                ])
    # Same pipeline as train_transform (no augmentation is applied in either)
    test_transform = transforms.Compose([
                                MonoToColor(),
                                transforms.ToPILImage(),
                                transforms.Resize((HEIGHT,WIDTH)),
                                transforms.ToTensor(),
                                norm
                                ])

In [None]:
# Train the selected folds; returns the out-of-fold predictions and the mean best validation score
val_result, score = k_fold(config=Config, dfx=df_tp)

In [None]:
# Predict every recording of the submission template; restricting the frame to its first
# rows (e.g. `.head(10)` while debugging) would produce an incomplete submission file
result = test(config=Config, dfx=df_test)

In [None]:
# File name encodes the date, the model, the run tag and the cross-validation score
TODAY = str(datetime.date.today())
file_name = f"{TODAY}_{Config.selected_model}_{Config.name}_{score:.4f}"
# index=True writes the frame's index as the first column
# NOTE(review): presumably the index holds recording_id after the groupby in test() - confirm
result.to_csv(f'submissions/{file_name}.csv', index=True)