In [1]:
!pip install /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl --no-index --find-links /kaggle/input/pip-wheels-birds

Looking in links: /kaggle/input/pip-wheels-birds
Processing /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl
Processing /kaggle/input/pip-wheels-birds/openvino_telemetry-2024.1.0-py3-none-any.whl (from openvino==2024.1.0)
Installing collected packages: openvino-telemetry, openvino
Successfully installed openvino-2024.1.0 openvino-telemetry-2024.1.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
import os
import gc
from tqdm import tqdm
from glob import glob
import time
import scipy
from functools import partial

import torch.jit as jit
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2
from torchvision.models import get_model
import timm

import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
from concurrent import futures

In [3]:
class Config:
    """Static settings for inference: audio framing, mel-spectrogram geometry,
    checkpoint locations and ensemble options.

    The per-model ensemble lists (losses, weights, normstats) are longer than
    ``ensemble_checkpoints``; the inference cell indexes them by model position,
    so only the leading entries are read.
    """

    # ---- audio framing ----
    sample_rate = 32000
    duration = 5                                     # seconds per frame
    audio_len = duration * sample_rate               # samples per frame

    # ---- mel spectrogram ----
    n_mels = 128
    target_length = 384                              # time bins per frame
    n_fft = 1024
    window = 800
    hop_length = audio_len // (target_length - 1)
    fmin = 50                                        # NOTE: flagged with "!!!" in the original notebook
    fmax = 16000
    top_db = 80

    # ---- model input / output ----
    n_classes = 182
    n_channels = 1                                   # NOTE: flagged with "!!!" in the original notebook

    # ---- which inference cell runs ----
    use_openvino = False
    multithreading = True

    # ---- single-model path ----
    checkpoint_dir = '/kaggle/input/mn-30-full/other/06-08_11-25_128x384_mn30_as_exp-gemsed_pretrained_mixup/1'
    loss = 'crossentropy'

    # ---- ensemble path ----
    ensemble_checkpoints = [
        '/kaggle/input/mn-30-full/other/06-05_03-55_128x384_mn30_as_exp-gemsed_mixup_big/1',
        '/kaggle/input/mn-20/other/2024-06-04_15-18-43_gemsed_mixup/1',
        '/kaggle/input/mn-20/other/2024-06-03_04-43-52_mheadgemsed/1',
        # Currently disabled candidates:
        #   '/kaggle/input/mn-20/other/2024-06-04_13-50-39_mheadgemsed_mixup/1'
        #   '/kaggle/input/mn-20/other/2024-06-03_09-14-08_msmheadgemsed/1'
        #   '/kaggle/input/mn-20/other/2024-06-03_03-22-49_msgemsed/1'
        #   '/kaggle/input/mn-20/other/2024-05-29_10-17-54_att-128-384/1'
    ]
    ensemble_losses = ['crossentropy'] * 10
    ensemble_weights = [1] * 5
    ensemble_normstats = ['old'] * 5 + ['new']

    # ---- dataset-level standardisation ----
    standardize = False
    dataset_mean = [-16.8828]                        # alternative value: [-22.9137]
    dataset_std = [12.4019]                          # alternative value: [11.8739]

In [4]:
def create_frames(waveform, duration=5, sr=32000):
    """Cut a waveform into consecutive, non-overlapping fixed-length frames.

    Args:
        waveform: Tensor whose last dimension is time (samples), e.g.
            ``(channels, samples)``.
        duration: Frame length in seconds.
        sr: Sample rate in Hz used to convert ``duration`` into samples.

    Returns:
        Tensor of shape ``(n_frames, 1, frame_size)``. Trailing samples that do
        not fill a whole frame are dropped. An input shorter than one frame
        yields a tensor with zero frames instead of raising.

    Raises:
        ValueError: If ``duration * sr`` does not give a positive frame size.
    """
    frame_size = int(duration * sr)
    if frame_size <= 0:
        raise ValueError(f"duration * sr must be positive, got frame_size={frame_size}")

    n_samples = waveform.size(-1)
    usable = n_samples - n_samples % frame_size
    trimmed = waveform[..., :usable]

    # reshape, not view: slicing the time axis of a multi-channel tensor leaves
    # it non-contiguous, which view() rejects.
    # The frame count is given explicitly because -1 cannot be inferred when the
    # trimmed tensor has zero elements.
    n_frames = trimmed.numel() // frame_size
    return trimmed.reshape(n_frames, 1, frame_size)

class AudioDatasetInference(Dataset):
    """Dataset whose items are whole audio files converted to dB-scaled mel spectrograms.

    Each item loads one file, cuts it into fixed-length frames with
    ``create_frames`` and applies ``MelSpectrogram`` followed by
    ``AmplitudeToDB`` to all frames at once, so a single item already holds
    one spectrogram per frame of the file.

    Args:
        files: Indexable collection of audio file paths.
        targets: Optional indexable collection of labels aligned with ``files``.
        n_classes: Number of classes (stored only).
        duration: Frame length in seconds.
        sample_rate: Sample rate in Hz expected by the spectrogram transform.
        target_length: Desired number of time bins; used to derive
            ``hop_length`` when none is given.
        n_mels, n_fft, window, fmin, fmax: Passed to ``MelSpectrogram``
            (``window`` is its ``win_length``).
            NOTE(review): the 2028 defaults look like they may be meant as
            2048 — confirm; the notebook always passes explicit values.
        hop_length: Hop size in samples; derived from ``target_length`` when falsy.
        top_db: Passed to ``AmplitudeToDB``.
        standardize, mean, std: Stored for callers; no normalisation is applied
            inside this dataset.
    """

    def __init__(
            self,
            files,
            targets = None,
            n_classes = 182,
            duration = 5,
            sample_rate = 32000,
            target_length = 384,
            n_mels = 128,
            n_fft = 2028,
            window = 2028,
            hop_length = None,
            fmin = 20,
            fmax = 16000,
            top_db = 80,
            standardize=True,
            mean=None,
            std=None
            ):
        super(AudioDatasetInference, self).__init__()
        self.files = files
        self.targets = targets
        self.n_classes = n_classes
        self.duration = duration
        self.sample_rate = sample_rate
        # int(): keeps the frame length integral even for fractional durations,
        # matching how create_frames computes its frame size.
        self.audio_len = int(duration * sample_rate)
        self.target_length = target_length
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.window = window
        self.hop_length = self.audio_len // (target_length-1) if not hop_length else hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.top_db = top_db
        self.standardize = standardize
        # Kept so callers can read them back; previously these arguments were dropped.
        self.mean = mean
        self.std = std

        # Attribute name (including its spelling) is kept for compatibility.
        self.to_mel_spectrogramn = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(self.sample_rate, n_fft=self.n_fft, win_length=self.window,
                                                 hop_length=self.hop_length, n_mels=self.n_mels,
                                                 f_min=self.fmin, f_max=self.fmax),
            torchaudio.transforms.AmplitudeToDB(top_db=self.top_db)
        )

    def __len__(self):
        """Number of audio files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(spec, label)`` or, without targets, ``(spec, file)``.

        ``spec`` holds one spectrogram per frame of the file, with frames along
        the first dimension.
        """
        file = self.files[idx]
        waveform, sr = torchaudio.load(file)

        # The spectrogram transform and the framing both assume self.sample_rate;
        # bring files recorded at another rate in line instead of mis-framing them.
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=self.sample_rate)

        # Use this dataset's own framing settings rather than create_frames' defaults.
        frames = create_frames(waveform, duration=self.duration, sr=self.sample_rate)
        spec = self.to_mel_spectrogramn(frames)

        if self.targets is not None:
            return spec, torch.tensor(self.targets[idx])
        return spec, file

In [5]:
# Locations of the competition data.
base_dir = '/kaggle/input/birdclef-2024'
train_dir = f'{base_dir}/train_audio/'
test_dir = f'{base_dir}/test_soundscapes/'
unlabeled_dir = f'{base_dir}/unlabeled_soundscapes/'

# Class vocabulary: the sorted entry names of the training directory, with
# integer labels assigned by position in that sorted list.
class_names = sorted(os.listdir(train_dir))
n_classes = len(class_names)
class_labels = list(range(n_classes))
label2name = dict(enumerate(class_names))
name2label = {name: label for label, name in enumerate(class_names)}

In [6]:
# Collect the soundscapes to score. glob() returns paths in arbitrary order, so
# sort them: the row order and the 10-file fallback sample become reproducible.
test_paths = sorted(glob(base_dir + '/test_soundscapes/*.ogg'))
if not test_paths:
    # No test soundscapes available: fall back to a small sample of the
    # unlabeled soundscapes so the rest of the notebook can still run.
    test_paths = sorted(glob(base_dir + '/unlabeled_soundscapes/*.ogg'))[:10]
test_df = pd.DataFrame(test_paths, columns=['filepath'])
test_df.head()

Unnamed: 0,filepath
0,/kaggle/input/birdclef-2024/unlabeled_soundsca...
1,/kaggle/input/birdclef-2024/unlabeled_soundsca...
2,/kaggle/input/birdclef-2024/unlabeled_soundsca...
3,/kaggle/input/birdclef-2024/unlabeled_soundsca...
4,/kaggle/input/birdclef-2024/unlabeled_soundsca...


In [7]:
# Inference dataset: every spectrogram setting comes from Config, including the
# frame duration (previously a hard-coded 5 that could drift from Config.duration).
test_dataset = AudioDatasetInference(
    test_df['filepath'].values,
    targets=None,
    n_classes=Config.n_classes,
    duration=Config.duration,
    sample_rate=Config.sample_rate,
    target_length=Config.target_length,
    n_mels=Config.n_mels,
    n_fft=Config.n_fft,
    window=Config.window,
    hop_length=Config.hop_length,
    fmin=Config.fmin,
    fmax=Config.fmax,
    top_db=Config.top_db,
    standardize=Config.standardize,
    mean=Config.dataset_mean,
    std=Config.dataset_std
    )

In [8]:
# Disabled earlier variant of the ensemble inference: one thread per model, each
# thread looping over the whole dataset. It is wrapped in a string literal, so
# nothing in it executes; as the cell's only expression, the notebook simply
# echoes the string back as the cell output.
"""if Config.multithreading:
    def predict(dataset, model, loss):
        ids = []
        preds = np.empty(shape=(0, n_classes), dtype='float32')
        output_layer = model.output(0)
        if loss == 'crossentropy':
            final_activation = partial(scipy.special.softmax, axis=1)
        elif loss == 'bce':
            final_activation = scipy.special.expit

        for i in range(len(dataset)):
            specs, file = dataset[i]
            filename = file.split('/')[-1][:-4]

            outs = model([specs])[output_layer]
            outs = final_activation(outs)

            frame_ids = [f'{filename}_{(frame_id+1)*5}' for frame_id in range(len(specs))]
            ids += frame_ids

            preds = np.concatenate([preds, outs], axis=0)

        return preds, ids

    def run_prediction(data_loader, model_id):
        core = ov.Core()
        checkpoint_ov = Config.ensemble_checkpoints[model_id] + '/checkpoint.xml'
        loss = Config.ensemble_losses[model_id]
        config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
        model = core.compile_model(checkpoint_ov, "CPU", config)
        
        preds, ids = predict(data_loader, model, loss)
        del core, model, loss
        gc.collect()
        
        print(f"Done model {model_id}")
        return preds, ids

    def helper(inputs):
        return run_prediction(inputs[0], inputs[1])


    start=time.time()
    
    audios = [(test_dataset, model_id) for model_id in range(len(Config.ensemble_checkpoints))]
    ensemble_preds = []
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        for preds, ids in executor.map(helper, audios):
            ensemble_preds.append(preds)
    ensemble_preds = np.array(ensemble_preds)
    #ensemble_preds = ensemble_preds.mean(axis=0)
    ensemble_preds = (ensemble_preds**2).mean(axis=0) ** 0.5
    preds = ensemble_preds

    print(time.time()-start)"""

'if Config.multithreading:\n    def predict(dataset, model, loss):\n        ids = []\n        preds = np.empty(shape=(0, n_classes), dtype=\'float32\')\n        output_layer = model.output(0)\n        if loss == \'crossentropy\':\n            final_activation = partial(scipy.special.softmax, axis=1)\n        elif loss == \'bce\':\n            final_activation = scipy.special.expit\n\n        for i in range(len(dataset)):\n            specs, file = dataset[i]\n            filename = file.split(\'/\')[-1][:-4]\n\n            outs = model([specs])[output_layer]\n            outs = final_activation(outs)\n\n            frame_ids = [f\'{filename}_{(frame_id+1)*5}\' for frame_id in range(len(specs))]\n            ids += frame_ids\n\n            preds = np.concatenate([preds, outs], axis=0)\n\n        return preds, ids\n\n    def run_prediction(data_loader, model_id):\n        core = ov.Core()\n        checkpoint_ov = Config.ensemble_checkpoints[model_id] + \'/checkpoint.xml\'\n        loss =

In [9]:
if Config.multithreading:
    # Input standardisation statistics, keyed by the tags used in
    # Config.ensemble_normstats: tag -> (mean, std).
    _NORM_STATS = {
        'old': ([-16.8828], [12.4019]),
        'new': ([-22.9137], [11.8739]),
    }

    def predict(specs, infer_request, final_activation, weight, norm_stat):
        """Run one ensemble member on the spectrogram frames of a single file.

        Args:
            specs: Spectrogram frames of one file, as produced by the dataset.
            infer_request: OpenVINO infer request of the model to run.
            final_activation: Callable mapping raw model outputs to scores.
            weight: Ensemble weight, passed through unchanged to the caller.
            norm_stat: ``'old'`` selects the old statistics; any other value
                selects the new ones.

        Returns:
            ``(sample_preds, model_time, weight)``, where ``model_time`` is the
            wall-clock time in seconds spent on inference plus activation.
        """
        mean, std = _NORM_STATS['old' if norm_stat == 'old' else 'new']
        specs = v2.Normalize(mean=mean, std=std)(specs)

        start_time = time.time()
        outs = infer_request.infer([specs])[0]
        outs = final_activation(outs)
        model_time = time.time()-start_time

        # Concatenating onto an empty (0, n_classes) array leaves the values
        # unchanged but fails loudly if the model's class dimension is wrong.
        sample_preds = np.concatenate([np.empty(shape=(0, n_classes), dtype='float32'), outs], axis=0)
        return sample_preds, model_time, weight

    def helper(inputs):
        """Unpack one argument tuple for ``predict`` (``executor.map`` passes a single item)."""
        return predict(*inputs)

    def get_model(model_id):
        """Compile ensemble checkpoint ``model_id`` for CPU and return an infer request for it.

        NOTE: this name shadows ``get_model`` imported from torchvision.models
        at the top of the notebook.
        """
        core = ov.Core()
        checkpoint_ov = Config.ensemble_checkpoints[model_id] + '/checkpoint.xml'
        config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
        model = core.compile_model(checkpoint_ov, "CPU", config)
        return model.create_infer_request()

    def get_final_activation(model_id):
        """Return the output activation matching the loss configured for model ``model_id``.

        Raises:
            ValueError: If the configured loss is neither 'crossentropy' nor 'bce'.
        """
        loss = Config.ensemble_losses[model_id]
        if loss == 'crossentropy':
            return partial(scipy.special.softmax, axis=1)
        if loss == 'bce':
            return scipy.special.expit
        raise ValueError(f"Unsupported loss {loss!r} for ensemble model {model_id}")


    start=time.time()

    n_models = len(Config.ensemble_checkpoints)
    if n_models == 0:
        raise ValueError("Config.ensemble_checkpoints is empty; nothing to ensemble")
    # The per-model lists are indexed by model position below; fail early with a
    # clear message instead of an IndexError in the middle of inference.
    for list_name in ('ensemble_losses', 'ensemble_weights', 'ensemble_normstats'):
        if len(getattr(Config, list_name)) < n_models:
            raise ValueError(f"Config.{list_name} has fewer entries than Config.ensemble_checkpoints")

    models = [get_model(model_id) for model_id in range(n_models)]
    f_activations = [get_final_activation(model_id) for model_id in range(n_models)]

    ids = []
    pred_chunks = []   # one (n_frames, n_classes) array per file, joined once at the end
    # One pool for the whole run instead of creating a new one for every file.
    with futures.ThreadPoolExecutor(max_workers=n_models) as executor:
        for i in range(len(test_dataset)):
            specs, file = test_dataset[i]
            if len(specs) == 0:
                # No full frame in this file: nothing to predict and no row ids to emit.
                continue

            filename = os.path.splitext(os.path.basename(file))[0]
            # Row id = file stem + end time of the frame in seconds.
            ids += [f'{filename}_{(frame_id+1)*Config.duration}' for frame_id in range(len(specs))]

            list_inputs = [(specs, models[k], f_activations[k], Config.ensemble_weights[k], Config.ensemble_normstats[k])
                           for k in range(n_models)]
            # executor.map yields results in input order, i.e. in model order.
            ensemble_preds = np.array([sample_preds*weight
                                       for sample_preds, _model_time, weight in executor.map(helper, list_inputs)])

            # (models, frames, classes) -> (frames, models, classes)
            ensemble_preds = ensemble_preds.transpose(1,0,2)
            # Attention-style blending: per frame, each model's prediction is
            # replaced by a softmax-weighted mix of all models' predictions
            # (weights from scaled dot products between the prediction vectors),
            # and the mixes are then averaged over the model axis.
            mix_weights = scipy.special.softmax(
                ensemble_preds @ ensemble_preds.transpose(0,-1,-2) / (np.sqrt(ensemble_preds.shape[-1])), axis=-1)
            ensemble_preds = (mix_weights @ ensemble_preds).mean(1)
            pred_chunks.append(ensemble_preds)

    # Single concatenation; the leading empty array keeps a (0, n_classes)
    # float32 result when there were no frames at all.
    preds = np.concatenate([np.empty(shape=(0, n_classes), dtype='float32'), *pred_chunks], axis=0)

    print(time.time()-start)

mbind failed: Operation not permitted
mbind failed: Operation not permitted
mbind failed: Operation not permitted


52.54720735549927


In [10]:
if Config.use_openvino:
    # Single-model inference path: compile Config.checkpoint_dir with OpenVINO and
    # score every file of test_dataset sequentially.
    # NOTE(review): unlike the ensemble cell, no v2.Normalize is applied to the
    # spectrograms here — confirm this checkpoint expects unnormalised input.
    start=time.time()

    checkpoint_ov = Config.checkpoint_dir + '/checkpoint.xml'
    config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
    core = ov.Core()
    model = core.compile_model(checkpoint_ov, "AUTO", config)
    output_layer = model.output(0)

    if Config.loss == 'crossentropy':
        final_activation = partial(scipy.special.softmax, axis=1)
    elif Config.loss == 'bce':
        final_activation = scipy.special.expit
    else:
        # Previously an unknown loss surfaced later as an unbound-variable error.
        raise ValueError(f"Unsupported loss {Config.loss!r}")

    ids = []
    pred_chunks = []   # one array per file, joined once after the loop

    for i in tqdm(range(len(test_dataset))):
        specs, file = test_dataset[i]
        filename = os.path.splitext(os.path.basename(file))[0]

        outs = model([specs])[output_layer]
        outs = final_activation(outs)

        # Row id = file stem + end time of the frame in seconds.
        ids += [f'{filename}_{(frame_id+1)*Config.duration}' for frame_id in range(len(specs))]
        pred_chunks.append(outs)

    # Single concatenation; the leading empty array keeps a (0, n_classes)
    # float32 result for an empty dataset and checks the class dimension.
    preds = np.concatenate([np.empty(shape=(0, n_classes), dtype='float32'), *pred_chunks], axis=0)

    print(time.time()-start)

In [11]:
# Submit prediction
pred_df = pd.DataFrame(ids, columns=['row_id'])
pred_df.loc[:, class_names] = preds
pred_df.to_csv('submission.csv',index=False)