In [1]:
!pip install /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl --no-index --find-links /kaggle/input/pip-wheels-birds

Looking in links: /kaggle/input/pip-wheels-birds
Processing /kaggle/input/pip-wheels-birds/openvino-2024.1.0-15008-cp310-cp310-manylinux2014_x86_64.whl
Processing /kaggle/input/pip-wheels-birds/openvino_telemetry-2024.1.0-py3-none-any.whl (from openvino==2024.1.0)
Installing collected packages: openvino-telemetry, openvino
Successfully installed openvino-2024.1.0 openvino-telemetry-2024.1.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
import os
import gc
from tqdm import tqdm
from glob import glob
import time
import scipy
from functools import partial
from scipy import signal
from scipy.ndimage import gaussian_filter1d

import torch.jit as jit
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2
from torchvision.models import get_model
import timm

import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
from concurrent import futures

In [3]:
class Config:
    """Ensemble-level settings for the inference run.

    ``ensemble_checkpoints`` lists the model directories to run. ``ensemble_losses`` and
    ``ensemble_n_peaks`` are looked up by the same position (model id); both lists are longer
    than the checkpoint list, so extra trailing entries are simply never read.
    """
    # Run the ensemble members through a thread pool.
    multithreading = True

    # Commented-out entries are checkpoints that are currently excluded from the ensemble.
    ensemble_checkpoints = [
        '/kaggle/input/mn-20-peak-full/other/06-07_13-17_128x500_mn20_as_exp-gemsed_peaks_gaussian_mixup/1',
        #'/kaggle/input/mn-20-peak-full/other/06-07_17-36_128x500_mn20_as_exp-gemsed_peaks_gaussian_mixup_nocw/1',
        '/kaggle/input/mn-20-peak-full/other/06-10_09-09_128x500_mn20_as_exp-mhgemsed_peaksgau_pretrained_mixupnocw/1',
        #'/kaggle/input/mn-20-peak/other/2024-06-04_11-19-53_gemsed_1peaksavgolmixup/1',
        #'/kaggle/input/mn-20-peak-full/other/06-10_18-06_96x300_mn20_as_exp-lessclasses_peaks_pretrained/1',
        #'/kaggle/input/mn-30-full/other/06-05_03-55_128x384_mn30_as_exp-gemsed_mixup_big/1',
        '/kaggle/input/mn-20/other/2024-06-04_15-18-43_gemsed_mixup/1',
        '/kaggle/input/mn-20/other/2024-06-03_04-43-52_mheadgemsed/1',
    ]

    # Loss each member was trained with ('crossentropy' or 'bce'); selects the output activation.
    ensemble_losses = ['crossentropy'] * 10

    # Number of peak windows per 5-second frame produced by each member's dataset.
    ensemble_n_peaks = [1] * 5

In [4]:
def create_frames(waveform, duration=5, sr=32000):
    """Cut a waveform into consecutive, non-overlapping frames of ``duration`` seconds.

    Trailing samples that do not fill a whole frame are dropped.

    Args:
        waveform: tensor whose last dimension is time (e.g. ``(channels, samples)``).
        duration: frame length in seconds.
        sr: sample rate used to convert ``duration`` to samples.

    Returns:
        Tensor of shape ``(n_frames, 1, duration * sr)``. When the waveform is shorter than
        one frame the result has ``n_frames == 0`` instead of raising.
    """
    frame_size = int(duration * sr)
    n_whole_frames = waveform.size(-1) // frame_size
    trimmed = waveform[..., :n_whole_frames * frame_size]
    # `reshape` (not `view`): the trimmed slice of a multi-channel waveform is not contiguous.
    # The frame count is given explicitly because -1 cannot be inferred for an empty tensor.
    return trimmed.reshape(trimmed.numel() // frame_size, 1, frame_size)

def find_peak_max(x, filter='savgol'):
    """Return the index of the largest value along the last axis after optional smoothing.

    Args:
        x: numpy array; the search runs over its last axis.
        filter: 'savgol' (Savitzky-Golay, window 100, order 2), 'gaussian' (sigma 25);
            any other value disables smoothing.

    Returns:
        ``argmax`` over the last axis of the (smoothed) input.
    """
    if filter == 'gaussian':
        return gaussian_filter1d(x, sigma=25).argmax(axis=-1)
    if filter == 'savgol':
        return signal.savgol_filter(x, window_length=100, polyorder=2).argmax(axis=-1)
    # Unknown / disabled filter: search the raw signal.
    return x.argmax(axis=-1)

def window_around_peak(len_x, peak, window_size):
    """Compute ``[start, end)`` bounds of a window of ``window_size`` around ``peak``.

    The window starts ``window_size // 2`` before the peak and is shifted inwards when it
    would cross either end of the signal, so it always has exactly ``window_size`` elements
    unless the signal itself is shorter (then the whole signal ``(0, len_x)`` is returned).

    Args:
        len_x: length of the signal being windowed.
        peak: index the window should be placed around.
        window_size: desired number of elements in the window.

    Returns:
        Tuple ``(start_index, end_index)`` with ``0 <= start_index <= end_index <= len_x``.
    """
    half_window = window_size // 2
    start_index = max(0, peak - half_window)
    # Derive the end from the start (instead of peak + half_window) so an odd window_size
    # is not silently shortened by one element.
    end_index = min(len_x, start_index + window_size)
    # If the right border clipped the window, pull the start back to keep the full size.
    start_index = max(0, end_index - window_size)
    return start_index, end_index

def find_peaks_max_inference(x, filter, window, n_peaks):
    """Find the ``n_peaks`` strongest, mutually separated maxima in every row of ``x``.

    After each maximum is found, a window of ``window`` elements around it is set to zero in
    the working copy so the next pass picks a different region.

    Args:
        x: numpy array of per-row signals, shape ``(n_rows, n_steps)``. A 1-D array is
            treated as a single row.
        filter: 'savgol', 'gaussian', or anything else for no smoothing.
        window: width (in elements of ``x``) of the region suppressed around each found peak.
        n_peaks: number of peaks to extract per row.

    Returns:
        List of ``n_peaks`` integer arrays, each of shape ``(n_rows,)``; entry ``p`` holds the
        index of the p-th strongest peak of every row. ``x`` itself is never modified.
    """
    # A single-row input may arrive squeezed to 1-D; restore the row axis so that
    # argmax below always yields one index per row.
    x = np.atleast_2d(x)

    if filter == 'savgol':
        smooth_x = signal.savgol_filter(x, window_length=100, polyorder=2)
    elif filter == 'gaussian':
        smooth_x = gaussian_filter1d(x, sigma=25)
    else:
        # Work on a copy: the suppression below writes into the array and must not
        # leak into the caller's data.
        smooth_x = np.array(x, copy=True)

    peaks = []
    for _ in range(n_peaks):
        peak = smooth_x.argmax(axis=-1)
        for row, row_peak in enumerate(peak):
            s1, s2 = window_around_peak(smooth_x.shape[-1], row_peak, window)
            # NOTE(review): suppressing with 0 only hides the region while the remaining
            # values are positive — presumably true for energy sums; confirm for 'savgol',
            # whose output can dip below zero.
            smooth_x[row, s1:s2] = 0
        peaks.append(peak)
    return peaks

class AudioDatasetInference(Dataset):
    """Dataset yielding, per audio file, a batch of dB mel spectrograms for inference.

    Each file is loaded, cut into fixed-length frames and converted to mel spectrograms.
    Depending on the config, the spectrogram of a frame is computed either over the whole
    frame or over one / several windows centred on energy peaks inside the frame.

    Args:
        files: indexable collection of audio file paths.
        cfg: config object (class or instance) providing the attributes read in ``__init__``.
        targets: optional indexable collection of labels aligned with ``files``.

    Items are ``(spec, label)`` when ``targets`` is given, otherwise ``(spec, file)``.
    ``spec`` is 4-D: ``(n_frames * k, n_channels, n_mels, time)`` where ``k`` is ``n_peaks``
    in multi-peak mode and 1 otherwise.
    """

    def __init__(
            self,
            files,
            cfg,
            targets = None
            ):
        super().__init__()
        self.files = files
        self.targets = targets
        self.n_classes = cfg.n_classes
        self.duration = cfg.duration
        self.sample_rate = cfg.sample_rate
        self.audio_len = self.duration*self.sample_rate
        self.target_length = cfg.target_length
        self.n_mels = cfg.n_mels
        self.n_fft = cfg.n_fft
        self.window = cfg.window
        self.hop_length = cfg.hop_length
        self.fmin = cfg.fmin
        self.fmax = cfg.fmax
        self.top_db = cfg.top_db
        self.standardize = cfg.standardize
        self.mean = cfg.dataset_mean
        self.std = cfg.dataset_std
        self.n_channels = cfg.n_channels
        self.use_1_peak = cfg.use_1_peak
        self.use_peaks = cfg.use_peaks
        self.peak_filter = cfg.peak_filter
        self.n_peaks = cfg.n_peaks
        self.base_hop_length = cfg.base_hop_length
        self.wave_window = cfg.wave_window

        # Spectrogram fed to the model.
        self.to_mel_spectrogramn = torchaudio.transforms.MelSpectrogram(self.sample_rate, n_fft=self.n_fft, win_length=self.window,
                                                 hop_length=self.hop_length, n_mels=self.n_mels,
                                                 f_min=self.fmin, f_max=self.fmax)

        self.mel_to_db = nn.Sequential(torchaudio.transforms.AmplitudeToDB(top_db=self.top_db))

        if self.mean is not None and self.std is not None:
            self.mel_to_db.append(v2.Normalize(mean=self.mean, std=self.std))

        # Coarse spectrogram used only to locate energy peaks. Built once here rather than
        # on every __getitem__ call; only needed (and only valid, since it requires a
        # positive base_hop_length) in the peak modes.
        self.peak_spectrogram = None
        if self.use_1_peak or self.use_peaks:
            self.peak_spectrogram = torchaudio.transforms.MelSpectrogram(32000, n_fft=1024, win_length=800, hop_length=self.base_hop_length, n_mels=128, f_min=50, f_max=16000)

    def __len__(self):
        return len(self.files)

    def _frame_energy(self, frames):
        """Return per-frame energy over time as a numpy array of shape (n_frames, steps)."""
        coarse_spec = self.peak_spectrogram(frames)
        # squeeze(1) drops only the channel axis; a bare squeeze() would also drop the
        # frame axis when the file contains a single frame.
        return coarse_spec.sum(dim=-2).squeeze(1).numpy()

    def _peak_window_samples(self):
        """Length, in waveform samples, of the window cut around each peak."""
        return (self.wave_window//self.base_hop_length)*(self.base_hop_length-1)

    def __getitem__(self, idx):
        if self.targets is not None:
            label = torch.tensor(self.targets[idx])

        file = self.files[idx]
        # NOTE(review): the file's own sample rate `sr` is not compared with
        # self.sample_rate — presumably all inputs are already at that rate; confirm.
        waveform, sr = torchaudio.load(file)
        frames = create_frames(waveform, duration=self.duration, sr=self.sample_rate)

        if self.use_1_peak:
            # One window per frame, centred on the strongest energy peak.
            per_frame_energy = self._frame_energy(frames)
            peaks = find_peak_max(per_frame_energy, filter=self.peak_filter)
            window_samples = self._peak_window_samples()
            new_spec = torch.empty((frames.size(0), self.n_channels, self.n_mels, self.target_length))
            for p, peak in enumerate(peaks):
                # Peak indices are in coarse-spectrogram steps; convert to samples.
                start_index, end_index = window_around_peak(frames.shape[-1], peak*self.base_hop_length, window_samples)
                new_spec[p] = self.to_mel_spectrogramn(frames[p,:,start_index:end_index])

        elif self.use_peaks:
            # n_peaks windows per frame, stored contiguously: frame p occupies rows
            # [n_peaks*p, n_peaks*(p+1)).
            per_frame_energy = self._frame_energy(frames)
            peaks = find_peaks_max_inference(per_frame_energy, filter=self.peak_filter,
                                   window=(self.wave_window//self.base_hop_length), n_peaks=self.n_peaks)
            window_samples = self._peak_window_samples()
            new_spec = torch.empty((frames.size(0)*self.n_peaks, self.n_channels, self.n_mels, self.target_length))
            for k in range(self.n_peaks):
                for p in range(len(peaks[k])):
                    start_index, end_index = window_around_peak(frames.shape[-1], peaks[k][p]*self.base_hop_length, window_samples)
                    new_spec[self.n_peaks*p + k] = self.to_mel_spectrogramn(frames[p,:,start_index:end_index])

        else:
            # No peak selection: spectrogram of every whole frame.
            new_spec = self.to_mel_spectrogramn(frames)

        spec = self.mel_to_db(new_spec)

        # Standardize
        if self.standardize:
            spec = (spec - spec.mean()) / spec.std()

        # expand to 3 channels for imagenet trained models
        if self.n_channels > 1:
            # spec is 4-D (batch, channel, mel, time) in every branch above, so the channel
            # axis is always dim 1.
            spec = spec.expand(-1, self.n_channels, -1, -1)

        if self.targets is not None:
            return spec, label
        else:
            return spec, file

In [5]:
# Competition data locations (directory strings keep their trailing slash).
base_dir = '/kaggle/input/birdclef-2024'
train_dir = f'{base_dir}/train_audio/'
test_dir = f'{base_dir}/test_soundscapes/'
unlabeled_dir = f'{base_dir}/unlabeled_soundscapes/'

# One sub-directory of train_audio per class; sorted order defines the integer label.
class_names = sorted(os.listdir(train_dir))
n_classes = len(class_names)
class_labels = list(range(n_classes))

# Label <-> name lookup tables.
label2name = dict(enumerate(class_names))
name2label = {name: label for label, name in label2name.items()}

In [6]:
# Use the test soundscapes when present; otherwise fall back to the first ten unlabeled
# soundscapes so the notebook can still be run end to end.
test_paths = glob(test_dir + '*ogg') or glob(unlabeled_dir + '*ogg')[:10]
test_df = pd.DataFrame(test_paths, columns=['filepath'])
test_df.head()

Unnamed: 0,filepath
0,/kaggle/input/birdclef-2024/unlabeled_soundsca...
1,/kaggle/input/birdclef-2024/unlabeled_soundsca...
2,/kaggle/input/birdclef-2024/unlabeled_soundsca...
3,/kaggle/input/birdclef-2024/unlabeled_soundsca...
4,/kaggle/input/birdclef-2024/unlabeled_soundsca...


In [7]:
class _PeakConfigDefaults:
    """Settings shared by the dataset configs below; subclasses override what differs.

    Defaults describe the single-peak, gaussian-filtered, 128x500 setup.
    """
    # Peak selection
    use_1_peak = True            # one window per frame around the strongest peak
    peak_filter = 'gaussian'     # smoothing applied before searching for peaks
    use_peaks = False            # several windows per frame (n_peaks of them)
    n_peaks = 3
    base_hop_length = 500        # hop of the coarse spectrogram used for peak search
    wave_window = 32000

    # Audio framing
    duration = 5
    sample_rate = 32000
    audio_len = duration*sample_rate

    # Model spectrogram
    target_length = 500
    n_mels = 128
    n_fft = 1024
    window = 160
    hop_length = 64
    fmin = 50
    fmax = 16000
    top_db = 80

    n_classes = 182
    n_channels = 1

    # Normalisation
    standardize = False
    dataset_mean = [-22.9137]
    dataset_std = [11.8739]


class Config_1peaksavgol_old(_PeakConfigDefaults):
    """Single peak, Savitzky-Golay filter, older normalisation statistics."""
    peak_filter = 'savgol'
    dataset_mean =[-16.8828]
    dataset_std = [12.4019]


class Config_1peakgaussian_new(_PeakConfigDefaults):
    """Single peak, gaussian filter, 128x500 (identical to the shared defaults)."""


class Config_3peaksgaussian_new(_PeakConfigDefaults):
    """Three peaks per frame, gaussian filter, 128x500."""
    use_1_peak = False
    use_peaks = True


class Config_3peaksgaussiansmall_new(_PeakConfigDefaults):
    """Three peaks per frame, gaussian filter, smaller 96x300 spectrograms."""
    use_1_peak = False
    use_peaks = True
    base_hop_length = 320
    wave_window = 19200
    target_length = 300
    n_mels = 96


class Config_1peakgaussiansmall_new(_PeakConfigDefaults):
    """Single peak, gaussian filter, smaller 96x300 spectrograms."""
    n_peaks = 1
    base_hop_length = 320
    wave_window = 19200
    target_length = 300
    n_mels = 96


class Config_base(_PeakConfigDefaults):
    """No peak selection: one 128x384 spectrogram over each whole frame."""
    use_1_peak = False
    peak_filter = 'none'
    use_peaks = False
    n_peaks = 1
    base_hop_length = 0
    wave_window = 0

    target_length = 384
    window = 800
    # Hop chosen so that a full frame spans target_length spectrogram steps.
    hop_length = _PeakConfigDefaults.audio_len // (target_length-1)

    dataset_mean =[-16.8828]
    dataset_std = [12.4019]

# One dataset per ensemble member, in the same order as Config.ensemble_checkpoints.
# Other available configs (currently unused): Config_3peaksgaussian_new,
# Config_1peaksavgol_old, Config_1peakgaussiansmall_new.
test_datasets = [
    AudioDatasetInference(test_df['filepath'].values, targets=None, cfg=member_cfg)
    for member_cfg in (
        Config_1peakgaussian_new,
        Config_1peakgaussian_new,
        Config_base,
        Config_base,
    )
]

In [8]:
if Config.multithreading:
    def predict(dataset, model, loss, n_peaks):
        """Run one model over every file of ``dataset`` and collect per-frame probabilities.

        Args:
            dataset: indexable, sized collection whose items are ``(specs, file)``; ``specs``
                is the batch of spectrograms for one file and ``file`` its '/'-separated path.
            model: compiled model; called as ``model([specs])`` and indexed by ``model.output(0)``.
            loss: 'crossentropy' (softmax over classes) or 'bce' (element-wise sigmoid).
            n_peaks: number of consecutive rows of ``specs`` belonging to the same frame;
                their outputs are merged into a single prediction when ``n_peaks > 1``.

        Returns:
            ``(preds, ids)`` where ``preds`` has one row per frame and ``ids`` holds the
            matching row ids ``'<file stem>_<5 * (frame index + 1)>'``.

        Raises:
            ValueError: if ``loss`` is not one of the supported names.
        """
        if loss == 'crossentropy':
            final_activation = partial(scipy.special.softmax, axis=1)
        elif loss == 'bce':
            final_activation = scipy.special.expit
        else:
            # Fail before any inference instead of hitting an unbound name inside the loop.
            raise ValueError(f"Unsupported loss {loss!r}; expected 'crossentropy' or 'bce'")

        output_layer = model.output(0)
        ids = []
        per_file_preds = []

        for i in range(len(dataset)):
            specs, file = dataset[i]
            # Drop the directory and the 4-character extension (e.g. '.ogg').
            filename = file.split('/')[-1][:-4]

            outs = final_activation(model([specs])[output_layer])

            n_frames = len(specs)//n_peaks
            ids += [f'{filename}_{(frame_id+1)*5}' for frame_id in range(n_frames)]

            if n_peaks>1:
                # Group the peak windows of each frame, re-weight them with a softmax over
                # their scaled pairwise dot products, then average within the frame.
                outs = outs.reshape((n_frames, n_peaks, -1))
                similarity = outs @ outs.transpose(0,-1,-2) / (np.sqrt(outs.shape[-1]))
                outs = scipy.special.softmax(similarity, axis=-1) @ outs
                outs = outs.mean(1)

            per_file_preds.append(outs)

        if not per_file_preds:
            return np.empty(shape=(0, n_classes), dtype='float32'), ids

        # Single concatenation: growing the array file by file copies it every iteration.
        preds = np.concatenate(per_file_preds, axis=0)
        return preds, ids

    def run_prediction(model_id):
        """Compile ensemble member ``model_id`` for CPU and predict its dataset.

        The dataset, checkpoint directory, loss name and peak count are all looked up by
        ``model_id``. Returns the ``(preds, ids)`` pair produced by ``predict``.
        """
        core = ov.Core()

        # Per-member settings, all indexed by position in the ensemble.
        member_dataset = test_datasets[model_id]
        member_n_peaks = Config.ensemble_n_peaks[model_id]
        xml_path = f"{Config.ensemble_checkpoints[model_id]}/checkpoint.xml"
        member_loss = Config.ensemble_losses[model_id]

        compile_options = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
        model = core.compile_model(xml_path, "CPU", compile_options)

        result = predict(member_dataset, model, member_loss, member_n_peaks)

        # Drop the runtime objects before returning to the pool.
        del core, model
        gc.collect()

        print(f"Done model {model_id}")
        preds, ids = result
        return preds, ids

    def helper(inputs):
        """Thread-pool entry point: forwards a model id to ``run_prediction``."""
        result = run_prediction(inputs)
        return result


    start = time.time()

    # One task per ensemble member; each task receives its model id.
    audios = list(range(len(Config.ensemble_checkpoints)))
    ensemble_preds = []
    ids = None
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in submission order, so ensemble_preds[i] is model i.
        for model_preds, model_ids in executor.map(helper, audios):
            # Keep the row ids explicitly (the submission cell needs them) and make sure
            # every member produced the same rows, otherwise stacking would misalign them.
            if ids is None:
                ids = model_ids
            elif model_ids != ids:
                raise RuntimeError("Ensemble members produced different row ids; predictions cannot be combined")
            ensemble_preds.append(model_preds)

    # (n_models, n_rows, n_classes) -> (n_rows, n_models, n_classes)
    ensemble_preds = np.array(ensemble_preds)
    ensemble_preds = ensemble_preds.transpose(1,0,2)

    # Combine members per row: weight them with a softmax over their scaled pairwise dot
    # products, then average. Simpler alternatives tried before: plain mean over models,
    # or root-mean-square ((p**2).mean(axis=0) ** 0.5).
    member_similarity = ensemble_preds @ ensemble_preds.transpose(0,-1,-2) / (np.sqrt(ensemble_preds.shape[-1]))
    ensemble_preds = scipy.special.softmax(member_similarity, axis=-1) @ ensemble_preds
    ensemble_preds = ensemble_preds.mean(1)
    preds = ensemble_preds

    print(time.time()-start)

mbind failed: Operation not permitted
mbind failed: Operation not permitted
mbind failed: Operation not permitted
mbind failed: Operation not permitted


Done model 3
Done model 2
Done model 0
Done model 1
64.88732600212097


In [9]:
# Submit prediction
# Build the submission table: one 'row_id' column followed by one probability column per
# class, then write it without the index.
pred_df = pd.DataFrame(ids, columns=['row_id'])
pred_df.loc[:, class_names] = preds
pred_df.to_csv('submission.csv', index=False)