In [1]:
!pip install /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl --no-index --find-links /kaggle/input/pip-wheels-birds

Looking in links: /kaggle/input/pip-wheels-birds
Processing /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl
Processing /kaggle/input/pip-wheels-birds/openvino_telemetry-2024.1.0-py3-none-any.whl (from openvino==2024.1.0)
Installing collected packages: openvino-telemetry, openvino
Successfully installed openvino-2024.1.0 openvino-telemetry-2024.1.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
import os
import gc
from tqdm import tqdm
from glob import glob
import time
import scipy
from functools import partial
from scipy import signal
from scipy.ndimage import gaussian_filter1d

import torch.jit as jit
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2
from torchvision.models import get_model
import timm

import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
from concurrent import futures

In [3]:
class Config:
    """Static settings for the BirdCLEF inference notebook.

    Groups the peak-cropping options, the mel-spectrogram front-end
    parameters and the model checkpoints / activations used for prediction.
    """

    # ---- Peak-based cropping of each frame ----
    use_1_peak = False         # crop one window around the single strongest peak
    use_peaks = True           # crop `n_peaks` windows per frame (used when use_1_peak is False)
    n_peaks = 3
    peak_filter = 'gaussian'   # smoothing applied to the energy curve before the peak search

    # ---- Audio framing ----
    duration = 5               # seconds per frame
    sample_rate = 32000
    audio_len = duration * sample_rate

    # ---- Mel spectrogram front-end ----
    n_fft = 1024
    window = 160               # passed as win_length to the mel transform
    hop_length = 64
    n_mels = 128
    fmin = 50
    fmax = 16000
    top_db = 80
    target_length = 500        # time steps kept per crop

    # ---- Model input / output ----
    n_classes = 182
    n_channels = 1

    # ---- Runtime switches ----
    use_openvino = True
    multithreading = False

    # ---- Single-model inference ----
    checkpoint_dir = '/kaggle/input/mn-20-peak-full/other/06-07_13-17_128x500_mn20_as_exp-gemsed_peaks_gaussian_mixup/1'
    loss = 'crossentropy'

    # ---- Ensemble inference (multithreaded path) ----
    ensemble_checkpoints = [
        '/kaggle/input/mn-20-peak-full/other/06-07_13-17_128x500_mn20_as_exp-gemsed_peaks_gaussian_mixup/1',
        '/kaggle/input/mn-20-peak-full/other/06-07_17-36_128x500_mn20_as_exp-gemsed_peaks_gaussian_mixup_nocw/1',
        '/kaggle/input/mn-20-peak-full/other/06-06_14-28_128x500_mn20_as_exp-peakgemsed_gaussian_mixup/1',
    ]
    # Indexed by model id; the list is longer than `ensemble_checkpoints`,
    # so only its leading entries are ever looked up.
    ensemble_losses = ['crossentropy'] * 3 + ['bce'] + ['crossentropy'] * 6

    # ---- Normalisation ----
    standardize = False        # per-sample standardisation of the dB spectrogram
    dataset_mean = [-22.9137]  # alternative value kept from the original: [-16.8828]
    dataset_std = [11.8739]    # alternative value kept from the original: [12.4019]

In [4]:
def create_frames(waveform, duration=5, sr=32000):
    """Cut a waveform into consecutive, non-overlapping frames of fixed length.

    Args:
        waveform: 2-D tensor indexed as (channel, sample).
        duration: frame length in seconds.
        sr: sample rate in Hz.

    Returns:
        Tensor of shape (n_frames, 1, int(duration * sr)). Samples that do not
        fill a complete frame at the end are dropped; a waveform shorter than
        one frame yields a tensor with zero frames.
    """
    frame_size = int(duration * sr)
    n_samples = waveform.size(-1)
    usable = n_samples - n_samples % frame_size
    if usable == 0:
        # Nothing fills a whole frame; `-1` cannot be inferred for an empty tensor.
        return waveform.new_empty((0, 1, frame_size))
    # `reshape` instead of `view`: the trimmed slice of a multi-channel
    # waveform is not contiguous, which `view` rejects.
    return waveform[:, :usable].reshape(-1, 1, frame_size)

def find_peak_max(x, filter='savgol'):
    """Locate the maximum along the last axis of `x` after optional smoothing.

    Args:
        x: array whose last axis is searched.
        filter: 'savgol' for a Savitzky-Golay filter, 'gaussian' for a Gaussian
            filter; any other value searches the unsmoothed input.

    Returns:
        Index (or array of indices) of the maximum along the last axis.
    """
    if filter == 'gaussian':
        return gaussian_filter1d(x, sigma=25).argmax(axis=-1)
    if filter == 'savgol':
        return signal.savgol_filter(x, window_length=100, polyorder=2).argmax(axis=-1)
    # No recognised filter: search the raw values.
    return x.argmax(axis=-1)

def window_around_peak(len_x, peak, window_size):
    """Return the [start, end) bounds of a window of `window_size` around `peak`.

    The window starts `window_size // 2` positions before the peak and is
    shifted inwards when it would cross either border, so it always spans
    exactly `window_size` positions when `len_x >= window_size` (including odd
    sizes). When the signal is shorter than the window, the whole signal
    (0, len_x) is returned.

    Args:
        len_x: length of the signal.
        peak: index of the peak.
        window_size: desired window length.

    Returns:
        Tuple (start_index, end_index) of plain ints.
    """
    half_window = window_size // 2
    # Latest start that still leaves room for a full window.
    latest_start = max(0, len_x - window_size)
    start_index = min(max(0, int(peak) - half_window), latest_start)
    end_index = min(len_x, start_index + window_size)
    return start_index, end_index

def find_peaks_max_inference(x, filter, window, n_peaks):
    """Find the `n_peaks` strongest, window-separated peaks of every row of `x`.

    After each search the window around the found peak is zeroed so the next
    search lands elsewhere. The caller's array is never modified.

    Args:
        x: array of shape (rows, steps); a 1-D array is treated as one row.
        filter: 'savgol', 'gaussian', or anything else for no smoothing.
        window: width of the region suppressed around each found peak.
        n_peaks: number of peaks to extract per row.

    Returns:
        List of `n_peaks` integer arrays of shape (rows,), strongest peak first.
    """
    if filter == 'savgol':
        smooth_x = signal.savgol_filter(x, window_length=100, polyorder=2)
    elif filter == 'gaussian':
        smooth_x = gaussian_filter1d(x, sigma=25)
    else:
        # Work on a copy: the suppression below writes into the array.
        smooth_x = np.array(x, copy=True)
    # Keep a row axis so single-row input is handled like a batch of one.
    smooth_x = np.atleast_2d(smooth_x)

    n_steps = smooth_x.shape[-1]
    peaks = []
    for _ in range(n_peaks):
        peak = smooth_x.argmax(axis=-1)
        for row, row_peak in enumerate(peak):
            s1, s2 = window_around_peak(n_steps, row_peak, window)
            # Zero acts as the floor here, which assumes non-negative values.
            smooth_x[row, s1:s2] = 0
        peaks.append(peak)
    return peaks

class AudioDatasetInference(Dataset):
    """Inference dataset producing peak-centred mel-spectrogram crops per audio file.

    Each file is loaded, cut into fixed-length frames, converted to a mel
    spectrogram, cropped to `target_length` time steps around the strongest
    energy peak(s) of every frame, converted to dB and normalised.

    Args:
        files: indexable collection of audio file paths.
        cfg: configuration object providing the attributes read in `__init__`.
        targets: optional per-file labels, indexed like `files`.

    Each item is `(spec, label)` when targets are given, else `(spec, file)`.
    In multi-peak mode `spec` holds `n_peaks` crops per frame, stored
    contiguously per frame with the strongest peak first; in single-peak mode
    it holds one crop per frame.
    """

    def __init__(self, files, cfg, targets=None):
        super(AudioDatasetInference, self).__init__()
        self.files = files
        self.targets = targets
        self.n_classes = cfg.n_classes
        self.duration = cfg.duration
        self.sample_rate = cfg.sample_rate
        self.audio_len = self.duration*self.sample_rate
        self.target_length = cfg.target_length
        self.n_mels = cfg.n_mels
        self.n_fft = cfg.n_fft
        self.window = cfg.window
        self.hop_length = cfg.hop_length
        self.fmin = cfg.fmin
        self.fmax = cfg.fmax
        self.top_db = cfg.top_db
        self.standardize = cfg.standardize
        self.mean = cfg.dataset_mean
        self.std = cfg.dataset_std
        self.n_channels = cfg.n_channels
        self.use_1_peak = cfg.use_1_peak
        self.use_peaks = cfg.use_peaks
        self.peak_filter = cfg.peak_filter
        self.n_peaks = cfg.n_peaks

        self.to_mel_spectrogramn = torchaudio.transforms.MelSpectrogram(
            self.sample_rate, n_fft=self.n_fft, win_length=self.window,
            hop_length=self.hop_length, n_mels=self.n_mels,
            f_min=self.fmin, f_max=self.fmax)

        self.mel_to_db = nn.Sequential(torchaudio.transforms.AmplitudeToDB(top_db=self.top_db))

        # Dataset-level normalisation is appended whenever statistics are configured.
        if self.mean is not None and self.std is not None:
            self.mel_to_db.append(v2.Normalize(mean=self.mean, std=self.std))

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Without a peak mode there is no rule to build the fixed-size crops.
        if not (self.use_1_peak or self.use_peaks):
            raise ValueError("AudioDatasetInference requires cfg.use_1_peak or cfg.use_peaks to be enabled")

        if self.targets is not None:
            label = torch.tensor(self.targets[idx])

        file = self.files[idx]
        waveform, sr = torchaudio.load(file)
        # Downmix so that framing never interleaves channels as extra frames.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # The mel transform is built for `self.sample_rate`; align the audio to it.
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)
        frames = create_frames(waveform, duration=self.duration, sr=self.sample_rate)
        spec = self.to_mel_spectrogramn(frames)

        # Energy per time step: sum over the mel axis, then drop only the
        # channel axis so a single-frame file keeps its frame axis.
        per_frame_energy = spec.sum(dim=-2).squeeze(1).numpy()
        n_steps = per_frame_energy.shape[-1]

        if self.use_1_peak:
            peaks = find_peak_max(per_frame_energy, filter=self.peak_filter)
            new_spec = torch.empty((spec.size(0), self.n_channels, self.n_mels, self.target_length))
            for p in range(len(peaks)):
                start_index, end_index = window_around_peak(n_steps, peaks[p], self.target_length)
                new_spec[p] = spec[p, :, :, start_index:end_index]
        else:
            peaks = find_peaks_max_inference(per_frame_energy, filter=self.peak_filter,
                                             window=self.target_length, n_peaks=self.n_peaks)
            new_spec = torch.empty((spec.size(0)*self.n_peaks, self.n_channels, self.n_mels, self.target_length))
            for k, frame_peaks in enumerate(peaks):
                for p, peak in enumerate(frame_peaks):
                    start_index, end_index = window_around_peak(n_steps, peak, self.target_length)
                    # Crops of frame p occupy rows [n_peaks*p, n_peaks*(p+1)).
                    new_spec[self.n_peaks*p + k] = spec[p, :, :, start_index:end_index]

        spec = self.mel_to_db(new_spec)

        # Standardize
        if self.standardize:
            spec = (spec - spec.mean()) / spec.std()

        # The crop buffer is always 4-D (crop, channel, mel, time), so the
        # channel expansion uses the 4-D form in both peak modes.
        if self.n_channels > 1:
            spec = spec.expand(-1, self.n_channels, -1, -1)

        if self.targets is not None:
            return spec, label
        else:
            return spec, file

In [5]:
# Locations of the competition data.
base_dir = '/kaggle/input/birdclef-2024'
train_dir = f'{base_dir}/train_audio/'
test_dir = f'{base_dir}/test_soundscapes/'
unlabeled_dir = f'{base_dir}/unlabeled_soundscapes/'

# Every entry of the training directory is treated as a class name; the
# sorted listing fixes the label <-> name mapping.
class_names = sorted(os.listdir(train_dir))
n_classes = len(class_names)
class_labels = list(range(n_classes))
label2name = dict(enumerate(class_names))
name2label = {name: label for label, name in label2name.items()}

In [6]:
# glob returns paths in arbitrary order; sorting keeps the file order (and the
# 10-file fallback subset below) reproducible across runs.
test_paths = sorted(glob(base_dir + '/test_soundscapes/*ogg'))
if len(test_paths)==0:
    # No test soundscapes available: fall back to a few unlabeled ones.
    test_paths = sorted(glob(base_dir + '/unlabeled_soundscapes/*ogg'))[:10]
test_df = pd.DataFrame(test_paths, columns=['filepath'])
test_df.head()

Unnamed: 0,filepath
0,/kaggle/input/birdclef-2024/unlabeled_soundsca...
1,/kaggle/input/birdclef-2024/unlabeled_soundsca...
2,/kaggle/input/birdclef-2024/unlabeled_soundsca...
3,/kaggle/input/birdclef-2024/unlabeled_soundsca...
4,/kaggle/input/birdclef-2024/unlabeled_soundsca...


In [7]:
# Dataset over the discovered soundscape files; no targets, so each item
# carries its file path instead of a label.
test_dataset = AudioDatasetInference(
    files=test_df['filepath'].values,
    cfg=Config,
    targets=None,
)

In [8]:
"""if Config.multithreading:
    def predict(dataset, model, loss):
        ids = []
        preds = np.empty(shape=(0, n_classes), dtype='float32')
        output_layer = model.output(0)
        if loss == 'crossentropy':
            final_activation = partial(scipy.special.softmax, axis=1)
        elif loss == 'bce':
            final_activation = scipy.special.expit

        for i in range(len(dataset)):
            specs, file = dataset[i]
            filename = file.split('/')[-1][:-4]

            outs = model([specs])[output_layer]
            outs = final_activation(outs)

            frame_ids = [f'{filename}_{(frame_id+1)*5}' for frame_id in range(len(specs))]
            ids += frame_ids

            preds = np.concatenate([preds, outs], axis=0)

        return preds, ids

    def run_prediction(data_loader, model_id):
        core = ov.Core()
        checkpoint_ov = Config.ensemble_checkpoints[model_id] + '/checkpoint.xml'
        loss = Config.ensemble_losses[model_id]
        config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
        model = core.compile_model(checkpoint_ov, "CPU", config)
        
        preds, ids = predict(data_loader, model, loss)
        del core, model, loss
        gc.collect()
        
        print(f"Done model {model_id}")
        return preds, ids

    def helper(inputs):
        return run_prediction(inputs[0], inputs[1])


    start=time.time()
    
    audios = [(test_dataset, model_id) for model_id in range(len(Config.ensemble_checkpoints))]
    ensemble_preds = []
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        for preds, ids in executor.map(helper, audios):
            ensemble_preds.append(preds)
    ensemble_preds = np.array(ensemble_preds)
    ensemble_preds = ensemble_preds.mean(axis=0)
    #ensemble_preds = (ensemble_preds**2).mean(axis=0) ** 0.5
    preds = ensemble_preds

    print(time.time()-start)"""

'if Config.multithreading:\n    def predict(dataset, model, loss):\n        ids = []\n        preds = np.empty(shape=(0, n_classes), dtype=\'float32\')\n        output_layer = model.output(0)\n        if loss == \'crossentropy\':\n            final_activation = partial(scipy.special.softmax, axis=1)\n        elif loss == \'bce\':\n            final_activation = scipy.special.expit\n\n        for i in range(len(dataset)):\n            specs, file = dataset[i]\n            filename = file.split(\'/\')[-1][:-4]\n\n            outs = model([specs])[output_layer]\n            outs = final_activation(outs)\n\n            frame_ids = [f\'{filename}_{(frame_id+1)*5}\' for frame_id in range(len(specs))]\n            ids += frame_ids\n\n            preds = np.concatenate([preds, outs], axis=0)\n\n        return preds, ids\n\n    def run_prediction(data_loader, model_id):\n        core = ov.Core()\n        checkpoint_ov = Config.ensemble_checkpoints[model_id] + \'/checkpoint.xml\'\n        loss =

In [9]:
if Config.multithreading:
    def predict(specs, infer_request, final_activation):
        """Run one model on one file's batch of crops.

        Returns the activated outputs and the elapsed wall-clock time in seconds.
        """
        start_time = time.time()
        outs = infer_request.infer([specs])[0]
        outs = final_activation(outs)
        model_time = time.time()-start_time
        return outs, model_time

    def helper(inputs):
        """Unpack a (specs, infer_request, final_activation) tuple for `executor.map`."""
        return predict(inputs[0], inputs[1], inputs[2])

    # NOTE(review): this name shadows `get_model` imported from
    # torchvision.models at the top of the notebook.
    def get_model(model_id):
        """Compile ensemble checkpoint `model_id` for CPU and return an infer request."""
        core = ov.Core()
        checkpoint_ov = Config.ensemble_checkpoints[model_id] + '/checkpoint.xml'
        config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
        model = core.compile_model(checkpoint_ov, "CPU", config)
        return model.create_infer_request()

    def get_final_activation(model_id):
        """Return the output activation matching the loss of model `model_id`.

        Raises:
            ValueError: if the configured loss is neither 'crossentropy' nor 'bce'.
        """
        loss = Config.ensemble_losses[model_id]
        if loss == 'crossentropy':
            return partial(scipy.special.softmax, axis=1)
        if loss == 'bce':
            return scipy.special.expit
        raise ValueError(f"Unsupported loss {loss!r} for ensemble model {model_id}")


    start=time.time()

    n_models = len(Config.ensemble_checkpoints)
    models = [get_model(model_id) for model_id in range(n_models)]
    f_activations = [get_final_activation(model_id) for model_id in range(n_models)]

    # The dataset emits n_peaks crops per frame only in multi-peak mode
    # (single-peak mode takes precedence there), so ids and aggregation must
    # follow the same rule.
    n_peaks = Config.n_peaks if (Config.use_peaks and not Config.use_1_peak) else 1
    # Linearly decreasing weights, strongest peak first (same scheme as the
    # single-model cell).
    peaks_weights = np.array([1/n_peaks*k for k in range(n_peaks,0,-1)])[None,:,None]

    ids = []
    pred_chunks = []
    # One executor for the whole run instead of one per file.
    with futures.ThreadPoolExecutor(max_workers=max(1, n_models)) as executor:
        for i in range(len(test_dataset)):
            specs, file = test_dataset[i]
            filename = file.split('/')[-1][:-4]
            n_frames = len(specs)//n_peaks
            ids += [f'{filename}_{(frame_id+1)*Config.duration}' for frame_id in range(n_frames)]

            list_inputs = [(specs, models[k], f_activations[k]) for k in range(n_models)]
            ensemble_preds = []
            for sample_preds, model_time in executor.map(helper, list_inputs):
                if n_peaks > 1:
                    # Collapse the crops of each frame into one prediction row.
                    sample_preds = sample_preds.reshape((n_frames, n_peaks, -1))
                    sample_preds = (sample_preds*peaks_weights).mean(1)
                ensemble_preds.append(sample_preds)
            # Root-mean-square across models.
            ensemble_preds = np.array(ensemble_preds)
            ensemble_preds = (ensemble_preds**2).mean(axis=0) ** 0.5
            pred_chunks.append(ensemble_preds)

    # Single concatenation; the empty seed keeps the (0, n_classes) shape when
    # there are no files.
    preds = np.concatenate([np.empty(shape=(0, n_classes), dtype='float32')] + pred_chunks, axis=0)

    print(time.time()-start)

In [10]:
# NOTE(review): this cell assigns `ids` and `preds` too, so when both
# Config.multithreading and Config.use_openvino are enabled it overrides the
# results of the multithreaded ensemble cell.
if Config.use_openvino:
    start=time.time()

    checkpoint_ov = Config.checkpoint_dir + '/checkpoint.xml'
    config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
    core = ov.Core()
    model = core.compile_model(checkpoint_ov, "AUTO", config)
    output_layer = model.output(0)

    if Config.loss == 'crossentropy':
        final_activation = partial(scipy.special.softmax, axis=1)
    elif Config.loss == 'bce':
        final_activation = scipy.special.expit
    else:
        raise ValueError(f"Unsupported loss {Config.loss!r}")

    # The dataset emits n_peaks crops per frame only in multi-peak mode
    # (single-peak mode takes precedence there); otherwise one crop per frame.
    n_peaks = Config.n_peaks if (Config.use_peaks and not Config.use_1_peak) else 1
    # Linearly decreasing weights, strongest peak first.
    peaks_weights = np.array([1/n_peaks*k for k in range(n_peaks,0,-1)])[None,:,None]

    ids = []
    pred_chunks = []
    for i in tqdm(range(len(test_dataset))):
        specs, file = test_dataset[i]
        filename = file.split('/')[-1][:-4]

        outs = model([specs])[output_layer]
        outs = final_activation(outs)

        # One id per frame, named by the frame's end time in seconds.
        n_frames = len(specs)//n_peaks
        ids += [f'{filename}_{(frame_id+1)*Config.duration}' for frame_id in range(n_frames)]

        if n_peaks > 1:
            # Collapse the crops of each frame into one prediction row.
            outs = outs.reshape((n_frames, n_peaks, -1))
            outs = (outs*peaks_weights).mean(1)
        pred_chunks.append(outs)

    # Single concatenation; the empty seed keeps the (0, n_classes) shape when
    # there are no files.
    preds = np.concatenate([np.empty(shape=(0, n_classes), dtype='float32')] + pred_chunks, axis=0)

    print(time.time()-start)

  0%|          | 0/10 [00:00<?, ?it/s]mbind failed: Operation not permitted
 10%|█         | 1/10 [00:08<01:19,  8.87s/it]mbind failed: Operation not permitted
100%|██████████| 10/10 [01:11<00:00,  7.16s/it]

72.4986264705658





In [11]:
# Write the submission file: `row_id` first, then one score column per class.
pred_df = pd.DataFrame(preds, columns=class_names)
pred_df.insert(0, 'row_id', ids)
pred_df.to_csv('submission.csv', index=False)