In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import IPython.display as ipd
from datetime import datetime
import time

from torch.utils.data import DataLoader
from torchmetrics.classification import MulticlassAccuracy, MultilabelAccuracy
import audiomentations
from torch.utils.data import default_collate
from torchvision.transforms import v2
import timm

from src.audio_utils import play_audio, plot_specgram, plot_waveform
from src.data import AudioDataset, FrequencyMaskingAug, TimeMaskingAug
from src.data_utils import get_metadata, get_fold, get_metadata_from_csv
from src.train_utils import FocalLoss, BCEFocal2WayLoss, get_cosine_schedule_with_warmup, wandb_init
from src.models import BasicClassifier, GeMClassifier, SEDClassifier
from src.utils import score_np, roc_auc

import ast
import wandb
import yaml

  from .autonotebook import tqdm as notebook_tqdm


### Config

In [2]:
class Config:
    """Experiment configuration: audio front-end, model, loss, schedule and logging settings.

    The rest of the notebook reads these values as ``Config.<name>``. The training
    loop also dumps every attribute whose name does not contain ``__`` to
    ``logs.txt``, in definition order.
    """

    # --- Audio clip and spectrogram settings (forwarded to AudioDataset) ---
    duration = 10           # clip length; presumably seconds, given audio_len below — TODO confirm
    sample_rate = 32000
    target_length = 384     # used to derive hop_length; presumably the number of spectrogram frames — TODO confirm
    n_mels = 128
    # NOTE(review): 2028 is not a power of two; 2048 may have been intended for
    # n_fft/window — confirm before changing, since it alters the model input.
    n_fft = 2028
    window = 2028
    audio_len = duration*sample_rate                # clip length in samples
    hop_length = audio_len // (target_length-1)     # hop chosen so that (target_length-1) hops span the clip
    fmin = 20
    fmax = 16000
    top_db = 80

    # --- Model and data split ---
    n_classes = 182
    batch_size = 24
    Model = SEDClassifier           # class instantiated as Config.Model(n_classes, model_name, n_mels=...)
    model_name = 'eca_nfnet_l0'     # forwarded to the model constructor
    n_folds = 5
    upsample_thr = 50               # passed to get_fold as `up_thr`
    use_class_weights = False   # experimental ("Test"): when True, get_fold's class_weights are given to the loss functions

    # --- Input standardization (forwarded to AudioDataset) ---
    standardize = False
    dataset_mean = [-16.8828]
    dataset_std = [12.4019]

    # --- Augmentation and loss (entries marked "Test" are knobs the author is experimenting with) ---
    data_aug = True     # Test: enables the waveform and spectrogram augmentations for the training set
    cutmix_mixup = True     # Test: enables the CutMix/MixUp collate function for the training loader
    loss = 'bce'    # Test ('crossentropy', 'bce'): selects the criterion and the accuracy metric
    secondary_labels_weight = 0.3   # Test (0): forwarded to AudioDataset
    use_focal = False    # Test (only with bce): adds focal_lambda * FocalLoss; checked before use_2wayfocal
    use_2wayfocal = True    # adds focal_lambda * BCEFocal2WayLoss, computed from the model's dict output
    focal_gamma = 2
    focal_lambda = 1
    label_smoothing = 0.05  # Only with crossentropy

    # --- Optimisation ---
    num_epochs = 10
    warmup_epochs = 0.5     # multiplied by the number of batches per epoch to get warm-up steps
    lr = 1e-3
    start_lr = 0.01 # relative to lr
    final_lr = 0.01         # presumably also relative to lr — TODO confirm in get_cosine_schedule_with_warmup
    weight_decay = 0.0001
    max_grad_norm = 10      # max_norm for gradient clipping in the training loop

    # --- Logging / bookkeeping ---
    wandb = True            # enables wandb_init / wandb.log / run.finish in the training cell
    competition   = 'birdclef-2024'
    _wandb_kernel = 'cvincent13'
    # Evaluated once, when this class body executes; re-running the cell yields a new run_name.
    date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # NOTE(review): `{0}` is the literal 0, not the notebook's `fold` variable, so the
    # run name always says "fold-0" — confirm whether it should follow the active fold.
    run_name = f"{date}_fold-{0}_dim-{n_mels}x{target_length}_model-{model_name}"
    wandb_group = 'FirstTests'

    base_dir = ''           # prefix prepended to "checkpoints/<run_name>" when saving

# Disabled alternative: build the metadata through get_metadata with the configured fold count.
#metadata = get_metadata(Config.n_folds)
# Load the metadata through get_metadata_from_csv. The arguments are presumably the CSV
# file and the data directory — TODO confirm against src.data_utils.
metadata = get_metadata_from_csv('metadata.csv', 'data')

### Dataset

In [3]:
# Fold used for this run; also passed to wandb_init in the training cell.
fold = 0
# Produce the train/validation frames and the class weights for this fold.
# `up_thr` presumably is an upsampling threshold for rare classes — TODO confirm in src.data_utils.get_fold.
train_df, valid_df, class_weights = get_fold(metadata, fold, up_thr=Config.upsample_thr)

Num Train: 22045, 182 classes | Num Valid: 4892, 182 classes


In [4]:
# Data transforms and augmentations
#
# Waveform-level stages, listed in the order they are handed to audiomentations.Compose.
waveform_stages = [
    audiomentations.Shift(min_shift=-0.5, max_shift=0.5, p=0.5),
    audiomentations.SevenBandParametricEQ(min_gain_db=-12., max_gain_db=12., p=0.5),
    audiomentations.AirAbsorption(
        min_temperature=10, max_temperature=20,
        min_humidity=30, max_humidity=90,
        min_distance=10, max_distance=100,
        p=1.,
    ),

    # One gain perturbation per sample.
    # Open question from the author: should waveforms leaving [-1, 1] be handled? (no issue observed so far)
    audiomentations.OneOf(
        [
            audiomentations.Gain(min_gain_db=-6., max_gain_db=6., p=1),
            audiomentations.GainTransition(min_gain_db=-12., max_gain_db=3., p=1),
        ],
        p=1.,
    ),

    # One additive-noise flavour per sample.
    audiomentations.OneOf(
        [
            audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.),
            audiomentations.AddGaussianSNR(min_snr_db=5., max_snr_db=40., p=1.),
            audiomentations.AddColorNoise(min_snr_db=5., max_snr_db=40., min_f_decay=-3.01, max_f_decay=-3.01, p=1.),
        ],
        p=1.,
    ),

    # Disabled: noise mixing from an `unlabeled_dir` folder (that name is not defined in this notebook).
    #audiomentations.AddShortNoises(sounds_path=unlabeled_dir, min_snr_db=3., max_snr_db=30.,
    #                           noise_rms='relative_to_whole_input',
    #                           min_time_between_sounds=2., max_time_between_sounds=8.,
    #                           noise_transform=audiomentations.PolarityInversion(), p=0.5),
    #audiomentations.AddBackgroundNoise(sounds_path=unlabeled_dir, min_snr_db=3., max_snr_db=30.,
    #                               noise_transform=audiomentations.PolarityInversion(), p=0.5),

    audiomentations.LowPassFilter(min_cutoff_freq=750., max_cutoff_freq=7500., min_rolloff=12, max_rolloff=24, p=0.8),
    audiomentations.PitchShift(min_semitones=-2.5, max_semitones=2.5, p=0.3),
]
waveform_transforms = audiomentations.Compose(waveform_stages)

# Spectrogram-level masking along the mel axis, then along the time axis.
# The two leading positional numbers are project-specific (see src.data) — TODO document their meaning.
spec_stages = (
    FrequencyMaskingAug(0.3, 0.1, Config.n_mels, n_masks=3, mask_mode='mean'),
    TimeMaskingAug(0.3, 0.1, Config.target_length, n_masks=3, mask_mode='mean'),
)
spec_transforms = nn.Sequential(*spec_stages)


# With augmentation switched off, both pipelines are replaced by None.
if not Config.data_aug:
    waveform_transforms = None
    spec_transforms = None


# Keyword arguments that are identical for the training and the validation dataset.
shared_dataset_args = dict(
    n_classes=Config.n_classes,
    duration=Config.duration,
    sample_rate=Config.sample_rate,
    target_length=Config.target_length,
    n_mels=Config.n_mels,
    n_fft=Config.n_fft,
    window=Config.window,
    hop_length=Config.hop_length,
    fmin=Config.fmin,
    fmax=Config.fmax,
    top_db=Config.top_db,
    standardize=Config.standardize,
    mean=Config.dataset_mean,
    std=Config.dataset_std,
    loss=Config.loss,
    secondary_labels_weight=Config.secondary_labels_weight,
)

# Training split: receives the (possibly disabled) augmentation pipelines.
train_dataset = AudioDataset(
    train_df,
    waveform_transforms=waveform_transforms,
    spec_transforms=spec_transforms,
    **shared_dataset_args,
)

# Validation split: never augmented.
val_dataset = AudioDataset(
    valid_df,
    waveform_transforms=None,
    spec_transforms=None,
    **shared_dataset_args,
)

### Training

In [5]:
# Batch-level mixing: applied to a batch with p=0.7; when applied, CutMix and MixUp
# are weighted 0.65 / 0.35. `one_hot_labels` is switched on only for the BCE setup.
labels_are_one_hot = Config.loss == 'bce'
mix_choice = v2.RandomChoice(
    [
        v2.CutMix(num_classes=Config.n_classes, alpha=0.5, one_hot_labels=labels_are_one_hot),
        v2.MixUp(num_classes=Config.n_classes, alpha=0.5, one_hot_labels=labels_are_one_hot),
    ],
    p=[0.65, 0.35],
)
cutmix_or_mixup = v2.RandomApply([mix_choice], p=0.7)


def mix_collate_fn(batch):
    """Collate ``batch`` with ``default_collate`` and pass the result through ``cutmix_or_mixup``.

    The collated elements are unpacked as positional arguments of the mixing transform,
    and whatever the transform returns is handed back to the DataLoader.
    """
    collated = default_collate(batch)
    return cutmix_or_mixup(*collated)

# Only the training loader may mix batches; None makes DataLoader use its default collation.
if Config.cutmix_mixup:
    collate_fn = mix_collate_fn
else:
    collate_fn = None

train_loader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=Config.batch_size,
    shuffle=False,
    num_workers=4,
)

In [6]:
# Prefer the GPU, but fall back to the CPU so the cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Config.Model(Config.n_classes, Config.model_name, n_mels=Config.n_mels).to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=Config.weight_decay, lr=Config.lr)

# The training loop steps the scheduler once per batch, so warm-up and total
# lengths are expressed in batches: epochs * steps-per-epoch.
spe = len(train_loader)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=spe*Config.warmup_epochs, num_training_steps=spe*Config.num_epochs,
                                            start_lr=Config.start_lr, final_lr=Config.final_lr)

# Optional per-class weights from get_fold, shared by every loss below.
pos_weight = torch.tensor(class_weights).to(device) if Config.use_class_weights else None
if Config.loss == 'crossentropy':
    criterion = nn.CrossEntropyLoss(label_smoothing=Config.label_smoothing, weight=pos_weight)
    accuracy = MulticlassAccuracy(num_classes=Config.n_classes).to(device)
elif Config.loss == 'bce':
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, weight=None)
    accuracy = MultilabelAccuracy(num_labels=Config.n_classes).to(device)
else:
    # Fail here rather than with a NameError on `criterion` deep inside the training loop.
    raise ValueError(f"Unsupported Config.loss {Config.loss!r}: expected 'crossentropy' or 'bce'")

# Auxiliary focal terms; the training loop adds one of them when
# Config.use_focal / Config.use_2wayfocal is set.
focal_criterion = FocalLoss(gamma=Config.focal_gamma, pos_weight=pos_weight)
focal2way_criterion = BCEFocal2WayLoss(gamma=Config.focal_gamma, pos_weight=pos_weight)

### Training loop

In [7]:
start_time = time.time()
if Config.wandb:
    run = wandb_init(fold, Config)

save_dir = f"{Config.base_dir}checkpoints/{Config.run_name}"
train_losses = []
val_losses = []
train_metrics = {'AUC': [], 'Accuracy': [], 'Score': []}
val_metrics = {'AUC': [], 'Accuracy': [], 'Score': []}


def _forward_loss(batch, labels):
    """Run the model on one batch and return ``(loss, logits)`` for the configured loss mix.

    ``Config.use_focal`` takes precedence over ``Config.use_2wayfocal``. The dict
    output of the model is requested only when the 2-way focal branch will actually
    consume it, so enabling ``use_focal`` while ``use_2wayfocal`` is still True no
    longer feeds a dict to ``criterion``.
    """
    want_dict = Config.use_2wayfocal and not Config.use_focal
    out = model(batch, return_dict=want_dict)
    if Config.use_focal:
        loss = criterion(out, labels) + Config.focal_lambda * focal_criterion(out, labels)
        return loss, out
    if want_dict:
        logits = out["logit"]
        loss = criterion(logits, labels) + Config.focal_lambda * focal2way_criterion(out, labels)
        return loss, logits
    return criterion(out, labels), out


def _targets_and_probs(logits, labels):
    """Return numpy ``(binary targets, class probabilities)`` of one batch for AUC/score computation."""
    if Config.loss == 'bce':
        # Labels may be fractional (secondary-label weights, CutMix/MixUp); any positive value counts as present.
        targets = (labels.detach().cpu().numpy() > 0).astype(int)
        probs = logits.sigmoid().detach().cpu().numpy()
    elif Config.loss == 'crossentropy':
        targets = nn.functional.one_hot(labels.detach().cpu(), num_classes=Config.n_classes).numpy()
        probs = nn.functional.softmax(logits, dim=1).detach().cpu().numpy()
    else:
        raise ValueError(f"Unsupported Config.loss {Config.loss!r}: expected 'crossentropy' or 'bce'")
    return targets, probs


for epoch in range(Config.num_epochs):
    # ---------------- Training pass ----------------
    train_loss = 0
    train_accuracy = 0
    gt = []
    preds = []
    model.train()
    train_iter = tqdm(train_loader)
    for (batch, labels) in train_iter:
        optimizer.zero_grad()

        batch = batch.to(device)
        labels = labels.to(device)

        loss, out = _forward_loss(batch, labels)
        loss.backward()
        # The pre-clipping gradient norm is kept in `grad_norm` for interactive inspection.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=Config.max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not once per epoch

        train_iter.set_description(desc=f'train loss: {loss.item():.3f}')
        train_loss += loss.item()
        # .item() keeps the running sum (and everything stored later) a plain Python float
        # instead of a device tensor that would end up inside the checkpoint and W&B payload.
        train_accuracy += accuracy(out, (labels>0).int()).item()
        batch_gt, batch_preds = _targets_and_probs(out, labels)
        gt.append(batch_gt)
        preds.append(batch_preds)

    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)
    train_accuracy = train_accuracy / len(train_loader)
    train_metrics["Accuracy"].append(train_accuracy)
    gt = np.concatenate(gt)
    preds = np.concatenate(preds)
    train_auc = roc_auc(preds, gt)
    train_score = score_np(preds, gt)
    train_metrics["AUC"].append(train_auc)
    train_metrics["Score"].append(train_score)

    # ---------------- Validation pass ----------------
    val_loss = 0
    val_accuracy = 0
    gt = []
    preds = []
    model.eval()
    val_iter = tqdm(val_loader)
    for (batch, labels) in val_iter:
        batch = batch.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            loss, out = _forward_loss(batch, labels)

        val_iter.set_description(desc=f'val loss: {loss.item():.3f}')
        val_loss += loss.item()
        val_accuracy += accuracy(out, (labels>0).int()).item()
        batch_gt, batch_preds = _targets_and_probs(out, labels)
        gt.append(batch_gt)
        preds.append(batch_preds)

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)
    val_accuracy = val_accuracy / len(val_loader)
    val_metrics['Accuracy'].append(val_accuracy)
    gt = np.concatenate(gt)
    preds = np.concatenate(preds)
    val_auc = roc_auc(preds, gt)
    val_score = score_np(preds, gt)
    val_metrics['AUC'].append(val_auc)
    val_metrics['Score'].append(val_score)

    # ---------------- Checkpoint + text log ----------------
    save_dict = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "epoch": epoch+1,
        "train_losses": train_losses,
        "train_metrics": train_metrics,
        "val_losses": val_losses,
        "val_metrics": val_metrics
    }

    # makedirs also creates a missing "checkpoints/" parent and tolerates an existing directory.
    os.makedirs(save_dir, exist_ok=True)
    torch.save(save_dict, save_dir + "/checkpoint.pth")
    # NOTE: the file is rewritten every epoch, so it always holds the latest epoch line plus the config dump.
    with open(save_dir + "/logs.txt", "w") as f:
        f.write(f"Epoch {epoch+1}: Train Loss = {train_loss:.3f} | Val Loss = {val_loss:.3f}")
        f.write("\n")
        f.write("CONFIG:")
        for k,v in dict(vars(Config)).items():
            if '__' not in k:
                f.write("\n")
                f.write(f"{k}: {v}")

    if Config.wandb:
        wandb.log({
            "train_loss": train_loss,
            # NOTE(review): this key uses a space while every other key uses an underscore;
            # left unchanged so existing W&B charts keep their history.
            "train accuracy": train_accuracy,
            "train_auc": train_auc,
            "train_score": train_score,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "val_auc": val_auc,
            "val_score": val_score,
            # get_last_lr() returns one value per parameter group; log the scalar so W&B can chart it.
            "lr": scheduler.get_last_lr()[0]
        })

    print(
        f'Epoch {epoch+1}: Train Loss = {train_loss:.3f}, Train Accuracy = {train_accuracy:.3f}, '
        f'Train ROCAUC = {train_auc:.3f}, Train score = {train_score:.3f} | '
        f'Val Loss = {val_loss:.3f}, Val Accuracy = {val_accuracy:.3f}, '
        f'Val ROCAUC = {val_auc:.3f}, Val score = {val_score:.3f}'
    )


def format_duration(seconds):
    """Render a duration given in seconds as ``"HHh MMmin SSs"``.

    Each component is truncated to an integer and zero-padded to at least two digits;
    the hour count is not capped, so durations over 99 hours simply widen the field.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    h, m, s = (int(part) for part in (whole_hours, whole_minutes, leftover_seconds))
    return f"{h:02}h {m:02}min {s:02}s"

# Report the wall-clock time elapsed since `start_time` was taken at the top of the cell.
elapsed_seconds = time.time() - start_time
print(f'Done in {format_duration(elapsed_seconds)}')

# Close the W&B run; the extra end-of-run report below is left disabled.
if Config.wandb:
    #print('# WandB')
    #log_wandb(valid_df)
    wandb.run.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcvincent13[0m ([33m667[0m). Use [1m`wandb login --relogin`[0m to force relogin


train loss: 0.044:  29%|██▊       | 263/919 [04:51<08:04,  1.35it/s]wandb: Network error (ConnectionError), entering retry loop.
train loss: 0.042: 100%|██████████| 919/919 [16:42<00:00,  1.09s/it]
val loss: 0.015: 100%|██████████| 204/204 [01:27<00:00,  2.34it/s]


Epoch 1: Train Loss = 0.064, Train Accuracy = 0.986, Train ROCAUC = 0.583,Train score = 0.583 | Val Loss = 0.036, Val Accuracy = 0.994, Val ROCAUC = 0.846, Val score = 0.846


train loss: 0.039: 100%|██████████| 919/919 [16:39<00:00,  1.09s/it]
val loss: 0.015: 100%|██████████| 204/204 [01:25<00:00,  2.38it/s]


Epoch 2: Train Loss = 0.037, Train Accuracy = 0.990, Train ROCAUC = 0.665,Train score = 0.665 | Val Loss = 0.033, Val Accuracy = 0.994, Val ROCAUC = 0.863, Val score = 0.863


train loss: 0.033: 100%|██████████| 919/919 [16:35<00:00,  1.08s/it]
val loss: 0.023: 100%|██████████| 204/204 [01:25<00:00,  2.37it/s]


Epoch 3: Train Loss = 0.037, Train Accuracy = 0.990, Train ROCAUC = 0.679,Train score = 0.679 | Val Loss = 0.035, Val Accuracy = 0.994, Val ROCAUC = 0.859, Val score = 0.859


train loss: 0.033: 100%|██████████| 919/919 [16:37<00:00,  1.09s/it]
val loss: 0.017: 100%|██████████| 204/204 [01:26<00:00,  2.36it/s]


Epoch 4: Train Loss = 0.036, Train Accuracy = 0.990, Train ROCAUC = 0.690,Train score = 0.690 | Val Loss = 0.032, Val Accuracy = 0.994, Val ROCAUC = 0.891, Val score = 0.891


train loss: 0.035: 100%|██████████| 919/919 [16:27<00:00,  1.07s/it]
val loss: 0.011: 100%|██████████| 204/204 [01:26<00:00,  2.37it/s]


Epoch 5: Train Loss = 0.036, Train Accuracy = 0.990, Train ROCAUC = 0.694,Train score = 0.694 | Val Loss = 0.032, Val Accuracy = 0.994, Val ROCAUC = 0.880, Val score = 0.880


train loss: 0.037: 100%|██████████| 919/919 [16:41<00:00,  1.09s/it]
val loss: 0.012: 100%|██████████| 204/204 [01:27<00:00,  2.32it/s]


Epoch 6: Train Loss = 0.035, Train Accuracy = 0.990, Train ROCAUC = 0.705,Train score = 0.705 | Val Loss = 0.030, Val Accuracy = 0.994, Val ROCAUC = 0.893, Val score = 0.893


train loss: 0.038: 100%|██████████| 919/919 [16:26<00:00,  1.07s/it]
val loss: 0.015: 100%|██████████| 204/204 [01:25<00:00,  2.39it/s]


Epoch 7: Train Loss = 0.034, Train Accuracy = 0.990, Train ROCAUC = 0.720,Train score = 0.720 | Val Loss = 0.029, Val Accuracy = 0.994, Val ROCAUC = 0.906, Val score = 0.906


train loss: 0.036:  94%|█████████▍| 867/919 [15:37<01:13,  1.42s/it]wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
train loss: 0.035: 100%|██████████| 919/919 [16:33<00:00,  1.08s/it]
val loss: 0.010: 100%|██████████| 204/204 [01:25<00:00,  2.39it/s]


Epoch 8: Train Loss = 0.033, Train Accuracy = 0.990, Train ROCAUC = 0.737,Train score = 0.737 | Val Loss = 0.028, Val Accuracy = 0.994, Val ROCAUC = 0.910, Val score = 0.910


train loss: 0.035: 100%|██████████| 919/919 [16:33<00:00,  1.08s/it]
val loss: 0.011: 100%|██████████| 204/204 [01:25<00:00,  2.38it/s]


Epoch 9: Train Loss = 0.033, Train Accuracy = 0.990, Train ROCAUC = 0.744,Train score = 0.744 | Val Loss = 0.027, Val Accuracy = 0.994, Val ROCAUC = 0.916, Val score = 0.916


train loss: 0.037: 100%|██████████| 919/919 [16:37<00:00,  1.09s/it]
val loss: 0.008: 100%|██████████| 204/204 [01:25<00:00,  2.38it/s]


Epoch 10: Train Loss = 0.032, Train Accuracy = 0.990, Train ROCAUC = 0.759,Train score = 0.759 | Val Loss = 0.026, Val Accuracy = 0.994, Val ROCAUC = 0.921, Val score = 0.921
Done in 03h 01min 27s


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


0,1
train accuracy,▁█████████
train_auc,▁▄▅▅▅▆▆▇▇█
train_loss,█▂▂▂▂▂▂▁▁▁
train_score,▁▄▅▅▅▆▆▇▇█
val_accuracy,▁▂▁▂▃▄▃▆▅█
val_auc,▁▃▂▅▄▅▇▇██
val_loss,█▆▇▅▅▄▃▂▂▁
val_score,▁▃▂▅▄▅▇▇██

0,1
train accuracy,0.98993
train_auc,0.75945
train_loss,0.03177
train_score,0.75945
val_accuracy,0.99444
val_auc,0.92144
val_loss,0.02592
val_score,0.92144


In [None]:
# Manual cleanup for interrupted runs. `wandb.run` is None when no run is active
# (for instance after the training cell already finished it), so guard the call
# instead of raising AttributeError.
if wandb.run is not None:
    wandb.run.finish()

# Optimize for inference

In [3]:
import openvino as ov
import nncf

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


In [5]:
# Load checkpoint
device = torch.device('cpu')

# Build the network with the same n_mels as the training cell so both constructions agree;
# pretrained=False because the weights come from the checkpoint loaded below.
model = Config.Model(Config.n_classes, pretrained=False, model_name=Config.model_name, n_mels=Config.n_mels).to(device)

# The run directory is pinned to a specific finished run; the commented line
# rebuilds it from the current Config instead.
#save_dir = f"{Config.base_dir}checkpoints/{Config.run_name}"
save_dir = 'checkpoints/2024-05-14_08-49-58_fold-0_dim-128x384_model-eca_nfnet_l0'
checkpoint_name = f'{save_dir}/checkpoint.pth'
checkpoint_ov = f'{save_dir}/checkpoint.xml'

# NOTE: torch.load unpickles the file — only load checkpoints produced by a trusted run.
checkpoint = torch.load(checkpoint_name, map_location='cpu')
model.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [6]:
# Convert and save model for openvino
# Switch to inference mode explicitly before export so that train-time behaviour
# (dropout, batch-norm statistic updates) cannot leak into the converted graph.
model.eval()
# Example input used for conversion: batch of 1, 3 channels, n_mels x target_length.
# NOTE(review): the 3-channel layout is assumed to match what AudioDataset yields — confirm.
input_data = torch.rand(1, 3, Config.n_mels, Config.target_length)
ov_model = ov.convert_model(model, example_input=input_data)
ov.save_model(ov_model, save_dir + '/checkpoint.xml')

In [10]:
# Read and compile model with openvino
core = ov.Core()
ir_path = save_dir + "/checkpoint.xml"   # IR file written by the conversion cell
ov_model = core.read_model(ir_path)
# Compiled through the module-level helper with its default device selection.
compiled_model = ov.compile_model(ov_model)

In [12]:
# Quantize model to 8 bits openvino
# TODO: quantization is not implemented yet — this cell only displays the validation
# DataLoader (presumably the intended calibration data source for nncf; confirm).

val_loader

<torch.utils.data.dataloader.DataLoader at 0x770f57433e50>