In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
import os
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
from torchaudio import transforms

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def create_frames(waveform, duration=5, sr=32000):
    frame_size = int(duration * sr)
    waveform = nn.functional.pad(waveform, pad=(0, frame_size - len(waveform)%frame_size)) # pad the end
    waveform = waveform.squeeze()
    frames = waveform.view(-1, frame_size)
    return frames

class AudioDatasetInference(Dataset):
    def __init__(
            self, 
            files,
            targets = None, 
            n_classes = 182,
            duration = 5,
            sample_rate = 32000,
            target_length = 384,
            n_mels = 128,
            n_fft = 2028,
            window = 2028,
            hop_length = None,
            fmin = 20,
            fmax = 16000,
            top_db = 80
            ):
        super(AudioDatasetInference, self).__init__()
        self.files = files
        self.targets = targets
        self.n_classes = n_classes
        self.duration = duration
        self.sample_rate = sample_rate
        self.audio_len = duration*sample_rate
        self.target_length = target_length
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.window = window
        self.hop_length = self.audio_len // (target_length-1) if not hop_length else hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.top_db = top_db

    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        if self.targets is not None:
            label = torch.tensor(self.targets[idx])

        file = self.files[idx]
        waveform, sr = torchaudio.load(file)
        waveform = waveform.squeeze()
        assert len(waveform.shape) == 1, 'Signal with multiple channels detected'
        frames = create_frames(waveform)
        spec = transforms.MelSpectrogram(sr, n_fft=self.n_fft, win_length=self.window,  hop_length=self.hop_length, 
                                         n_mels=self.n_mels, f_min=self.fmin, f_max=self.fmax)(frames)
        spec = transforms.AmplitudeToDB(top_db=self.top_db)(spec)
        # Standardize
        spec = (spec - spec.mean()) / spec.std()

        # expand to 3 channels for imagenet trained models
        spec = spec.unsqueeze(1).expand(-1,3,-1,-1)

        if self.targets is not None:
            return spec, label
        else:
            return spec, file

In [None]:
from torchvision.models import get_model

class BasicClassifier(nn.Module):
    def __init__(self, n_classes, pretrained=True):
        super(BasicClassifier, self).__init__()
        weights = 'DEFAULT' if pretrained else None
        self.backbone = get_model('efficientnet_v2_s', weights=weights).features
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(0.2, inplace=True),
            nn.Linear(1280, n_classes)
            )
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.pool(x).squeeze(dim=(-1,-2))
        x = self.classifier(x)
        return x

In [2]:
base_dir = 'data'
train_dir = base_dir + '/train_audio/'
test_dir = base_dir + '/test_soundscapes/'
unlabeled_dir = base_dir + '/unlabeled_soundscapes/'

class_names = sorted(os.listdir(train_dir))
n_classes = len(class_names)
class_labels = list(range(n_classes))
label2name = dict(zip(class_labels, class_names))
name2label = {v:k for k,v in label2name.items()}

In [3]:
test_df = pd.read_csv('valid_df.csv')
files = test_df['filepath']
targets = test_df['target']

In [5]:
n_classes = 182
test_dataset = AudioDatasetInference(files, targets=None, n_classes=n_classes, augments=None, duration=5)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=8)

In [26]:
device = torch.device('cuda')

model = BasicClassifier(n_classes, pretrained=False).to(device)
checkpoint_name = "checkpoints/efficientnet_v2_s_imagenet_base_32.pth"
checkpoint = torch.load(checkpoint_name, map_location='cpu')
model.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [27]:
ids = []
preds = np.empty(shape=(0, n_classes), dtype='float32')

test_iter = tqdm(range(len(test_dataset)))
for i in test_iter:
    specs, file = test_dataset[i]
    filename = file.split('/')[-1][:-4]
    specs = specs.to(device)
    
    with torch.no_grad():
            outs = model(specs)
            outs = nn.functional.softmax(outs, dim=1).detach().cpu()

    frame_ids = [f'{filename}_{(frame_id+1)*5}' for frame_id in range(len(specs))]
    ids += frame_ids

    preds = np.concatenate([preds, outs], axis=0)

100%|██████████| 4892/4892 [10:18<00:00,  7.91it/s] 


In [18]:
# Submit prediction
pred_df = pd.DataFrame(ids, columns=['row_id'])
pred_df.loc[:, class_names] = preds
pred_df.to_csv('submission.csv',index=False)

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,XC756601_5,0.005847,0.011639,0.002512,0.00011,0.01907,1.5e-05,0.000609,0.000143,6.9e-05,...,0.000811,0.000281,0.001206,0.000926,0.005042,0.010253,5.2e-05,0.000306,3.2e-05,0.004853
1,XC756601_10,0.008243,0.007165,0.001763,9.2e-05,0.018658,1.1e-05,0.000512,0.000136,0.000104,...,0.000528,0.000333,0.002916,0.001131,0.00492,0.010568,6.1e-05,0.000224,2.8e-05,0.005552
2,XC756601_15,0.010177,0.009778,0.003066,0.000166,0.017798,2e-05,0.000657,0.000119,0.000143,...,0.000895,0.000277,0.001253,0.001104,0.007026,0.00824,7.4e-05,0.000242,3.9e-05,0.006269
3,XC756601_20,0.006208,0.006755,0.002717,0.00015,0.014288,2.4e-05,0.000561,0.000175,0.000171,...,0.000782,0.000253,0.001888,0.001559,0.007132,0.009607,5.6e-05,0.000403,2.3e-05,0.012148
4,XC756601_25,0.006516,0.006837,0.001713,0.000164,0.020643,1.6e-05,0.000494,0.000309,8.7e-05,...,0.000877,0.000263,0.001813,0.001479,0.005192,0.010191,6e-05,0.000307,1.8e-05,0.007364


In [30]:
pred_df

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1,target
0,XC756601_5,0.005847,0.011639,0.002512,0.000110,0.019070,0.000015,0.000609,0.000143,0.000069,...,0.000281,0.001206,0.000926,0.005042,0.010253,0.000052,0.000306,0.000032,0.004853,139
1,XC756601_10,0.008243,0.007165,0.001763,0.000092,0.018658,0.000011,0.000512,0.000136,0.000104,...,0.000333,0.002916,0.001131,0.004920,0.010568,0.000061,0.000224,0.000028,0.005552,139
2,XC756601_15,0.010177,0.009778,0.003066,0.000166,0.017798,0.000020,0.000657,0.000119,0.000143,...,0.000277,0.001253,0.001104,0.007026,0.008240,0.000074,0.000242,0.000039,0.006269,139
3,XC756601_20,0.006208,0.006755,0.002717,0.000150,0.014288,0.000024,0.000561,0.000175,0.000171,...,0.000253,0.001888,0.001559,0.007132,0.009607,0.000056,0.000403,0.000023,0.012148,139
4,XC756601_25,0.006516,0.006837,0.001713,0.000164,0.020643,0.000016,0.000494,0.000309,0.000087,...,0.000263,0.001813,0.001479,0.005192,0.010191,0.000060,0.000307,0.000018,0.007364,139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41772,XC493181_60,0.003695,0.009455,0.001734,0.000201,0.014964,0.000025,0.000362,0.000227,0.000054,...,0.000416,0.001849,0.001265,0.003974,0.010497,0.000083,0.000330,0.000036,0.007257,139
41773,XC493181_65,0.003407,0.006787,0.002758,0.000228,0.014819,0.000015,0.000402,0.000116,0.000044,...,0.000294,0.002501,0.001776,0.003441,0.009686,0.000043,0.000132,0.000019,0.012488,139
41774,XC493181_70,0.003704,0.005894,0.003200,0.000092,0.015836,0.000007,0.000449,0.000067,0.000030,...,0.000335,0.002337,0.000698,0.003722,0.010342,0.000032,0.000205,0.000014,0.006770,139
41775,XC750334_5,0.005809,0.006869,0.002889,0.000194,0.019206,0.000022,0.000594,0.000129,0.000088,...,0.000383,0.002072,0.002769,0.006532,0.007247,0.000106,0.000318,0.000024,0.008918,139


In [49]:
submission = pred_df[class_names]
solution = pd.DataFrame()
for class_name in class_names:
    # Create a new column where the value is 1 if the label matches the class index, and 0 otherwise
    solution[class_name] = (pred_df['target'] == name2label[class_name]).astype(int)

In [50]:
import sklearn.metrics

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    '''
    Version of macro-averaged ROC-AUC score that ignores all classes that have no true positive labels.
    '''

    solution_sums = solution.sum(axis=0)
    scored_columns = list(solution_sums[solution_sums > 0].index.values)
    assert len(scored_columns) > 0

    return sklearn.metrics.roc_auc_score(solution[scored_columns].values, submission[scored_columns].values, average='macro')

In [66]:
score(solution, submission, 'row_id')

0.8264495473368161