In [None]:
!git clone https://github.com/23lnlx/itmo-kaggle-inclass.git

Cloning into 'itmo-kaggle-inclass'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 21 (delta 10), reused 5 (delta 1), pack-reused 0[K
Unpacking objects: 100% (21/21), done.


In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c itmo-acoustic-event-detection-2022

Downloading itmo-acoustic-event-detection-2022.zip to /content
100% 7.31G/7.34G [01:06<00:00, 110MB/s]
100% 7.34G/7.34G [01:06<00:00, 119MB/s]


In [None]:
! mkdir kdata


In [None]:
! unzip itmo-acoustic-event-detection-2022.zip -d kdata

In [None]:
! pip install efficientnet-pytorch

In [None]:
!pip install pytorchtools

In [None]:
# path to your train/test/meta folders
DATA_PATH = '/content/kdata/'

# names of valuable files/folders
train_meta_fname = 'train.csv'
test_meta_fname = 'sample_submission.csv'
train_data_folder = 'audio_train/train'
test_data_folder = 'audio_test/test'

In [None]:
! pip install torchaudio torchvision

In [None]:
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import torchvision
from torchaudio import transforms
from efficientnet_pytorch import EfficientNet
from torch.optim.lr_scheduler import CyclicLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

In [None]:
# set seeds
import random

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True

In [None]:
df_train = pd.read_csv(os.path.join(DATA_PATH, train_meta_fname))
df_test = pd.read_csv(os.path.join(DATA_PATH, test_meta_fname))
df_train.head(2)

Unnamed: 0,fname,label
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping
1,00d77b917e241afa06f1.wav,Squeak


In [None]:
n_classes = df_train.label.nunique()
print(n_classes)
classes_dict = {cl:i for i,cl in enumerate(df_train.label.unique())}
df_train['label_encoded'] = df_train.label.map(classes_dict)
df_train.head()

41


Unnamed: 0,fname,label,label_encoded
0,8bcbcc394ba64fe85ed4.wav,Finger_snapping,0
1,00d77b917e241afa06f1.wav,Squeak,1
2,17bb93b73b8e79234cb3.wav,Electric_piano,2
3,7d5c7a40a936136da55e.wav,Harmonica,3
4,17e0ee7565a33d6c2326.wav,Snare_drum,4


In [None]:
# https://github.com/lukemelas/EfficientNet-PyTorch
class BaseLineModel(nn.Module):
    
    def __init__(self, sample_rate=16000, n_classes=41):
        super().__init__()
        # self.ms = torchaudio.transforms.MelSpectrogram(sample_rate)
#         self.bn1 = nn.BatchNorm2d(1)
        
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=3, padding=1)
        self.cnn3 = nn.Conv2d(in_channels=10, out_channels=3, kernel_size=3, padding=1)
        
        self.features = EfficientNet.from_pretrained('efficientnet-b0')
        # use it as features
#         for param in self.features.parameters():
#             param.requires_grad = False
            
        self.lin1 = nn.Linear(1000, 333)
        
        self.lin2 = nn.Linear(333, 111)
                
        self.lin3 = nn.Linear(111, n_classes)
        
    def forward(self, x):
        # x = self.ms(x)
#         x = self.bn1(x)
                
        x = F.relu(self.cnn1(x))
        x = F.relu(self.cnn3(x))
        
        x = self.features(x)

        x = x.view(x.shape[0], -1)
        x = F.relu(x)
        
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x
    
    def inference(self, x):
        x = self.forward(x)
        x = F.softmax(x)
        return x

In [None]:
def sample_or_pad(waveform, wav_len=32000):
    m, n = waveform.shape
    if n < wav_len:
        padded_wav = torch.zeros(1, wav_len)
        padded_wav[:, :n] = waveform
        return padded_wav
    elif n > wav_len:
        offset = np.random.randint(0, n - wav_len)
        sampled_wav = waveform[:, offset:offset+wav_len]
        return sampled_wav
    else:
        return waveform

# class EventDetectionDataset(Dataset):
#     def __init__(self, data_path, x, y=None):
#         self.x = x
#         self.y = y
#         self.data_path = data_path
#         self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=16000)
    
#     def __len__(self):
#         return len(self.x)

#     def __getitem__(self, idx):
#         path2wav = os.path.join(self.data_path, self.x[idx])
#         waveform, sample_rate = torchaudio.load(path2wav, normalize=True)
#         waveform = sample_or_pad(waveform)
#         spec = self.mel_spectrogram(waveform)
#         label = torch.zeros(41)
#         label[self.y[idx]] = 1.
#         if self.y is not None:
#             # return waveform, self.y[idx]
#             if self.train and idx > 0 and idx%5 == 0:

# #             # Choose another image/label randomly
#             mixup_idx = random.randint(0, len(self.x)-1)
#             path2wav = os.path.join(self.data_path, self.x[mixup_idx])
#             mixup_waveform, sample_rate = torchaudio.load(path2wav, normalize=True)
#             mixup_waveform = sample_or_pad(mixup_waveform)
#             return spec, label
#         return waveform
        
class EventDetectionDataset(Dataset):
    def __init__(self, data_path, x, y=None, sr=16000, train=False):
        self.x = x
        self.y = y
        self.data_path = data_path
        self.train = train
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr)
    
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        path2wav = os.path.join(self.data_path, self.x[idx])
        waveform, sample_rate = torchaudio.load(path2wav, normalize=True)
        waveform = sample_or_pad(waveform)
        
        spec = self.mel_spectrogram(waveform)
        # print(type(spec), type(label))
        if self.y is not None:
          label = torch.zeros(41)
          label[self.y[idx]] = 1.
          # print(idx)
          if self.train and idx > 0 and idx%5 == 0:

            # Choose another image/label randomly
            mixup_idx = random.randint(0, len(self.x)-1)
            path2wav = os.path.join(self.data_path, self.x[mixup_idx])
            mixup_waveform, sample_rate = torchaudio.load(path2wav, normalize=True)
            mixup_waveform = sample_or_pad(mixup_waveform)

            mixup_label = torch.zeros(41)
            mixup_label[self.y[mixup_idx]] = 1.
            mixup_spec = self.mel_spectrogram(mixup_waveform)
            # if self.transform:
                # mixup_image = transform(self.x[mixup_idx])

            # Select a random number from the given beta distribution
            # Mixup the images accordingly
            alpha = 0.2
            lam = np.random.beta(alpha, alpha)
            spec = lam * spec + (1 - lam) * mixup_spec
            label = lam * label + (1 - lam) * mixup_label
            # print(type(spec), type(label))
          return spec, label
        return spec

In [None]:
sr = 16000
X_train, X_val, y_train, y_val = train_test_split(df_train.fname.values, df_train.label_encoded.values, 
                                                  test_size=0.2, random_state=42)
train_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, train_data_folder), X_train, y_train, train=True),
                        batch_size=41
                )
val_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, train_data_folder), X_val, y_val),
                        batch_size=41
                )
test_loader = DataLoader(
                        EventDetectionDataset(os.path.join(DATA_PATH, test_data_folder), df_test.fname.values, None),
                        batch_size=41, shuffle=False
                )

In [None]:
def eval_model(model, eval_dataset):
    model.eval()
    forecast, true_labs = [], []
    with torch.no_grad():
        for wavs, labs in eval_dataset:
            wavs, labs = wavs.cuda(), labs.detach().numpy().argmax(axis=1)
            true_labs.append(labs)
            outputs = model.inference(wavs)
            
            outputs = outputs.detach().cpu().numpy().argmax(axis=1)
            forecast.append(outputs)
    forecast = [x for sublist in forecast for x in sublist]
    true_labs = [x for sublist in true_labs for x in sublist]
    return f1_score(forecast, true_labs, average='macro')

In [None]:
criterion = nn.CrossEntropyLoss()
model = BaseLineModel()
model = model.cuda()
lr = 1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Loaded pretrained weights for efficientnet-b0


In [2]:
n_epoch = 100
best_f1 = 0
for epoch in range(n_epoch):
    model.train()
    for wavs, labs in tqdm(train_loader):
        optimizer.zero_grad()
        wavs, labs = wavs.cuda(), labs.cuda()
        outputs = model(wavs)
        loss = criterion(outputs, labs)
        loss.backward()
        optimizer.step()
#     if epoch % 10 == 0:
    f1 = eval_model(model, val_loader)
    f1_train = eval_model(model, train_loader)
    print(f'epoch: {epoch}, f1_test: {f1}, f1_train: {f1_train}')
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), '../baseline_fulldiv.pt')
        
    lr = lr * 0.95
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

epoch: 0, f1_test: 0.10881514510486336, f1_train: 0.12491591320488005
epoch: 1, f1_test: 0.10223543943360824, f1_train: 0.10405710509574878
epoch: 2, f1_test: 0.18530804184525632, f1_train: 0.200145335580391
epoch: 3, f1_test: 0.32015738137241656, f1_train: 0.3580449360377917
epoch: 4, f1_test: 0.29811833633904533, f1_train: 0.3174551726724046
epoch: 5, f1_test: 0.417784315678757, f1_train: 0.49850246298394824
epoch: 6, f1_test: 0.3912374113499492, f1_train: 0.48354827077720325
epoch: 7, f1_test: 0.4318873063705564, f1_train: 0.5225243513684436
epoch: 8, f1_test: 0.4778676833176374, f1_train: 0.5660152969088971
epoch: 9, f1_test: 0.5132491049069522, f1_train: 0.6381335256051263
epoch: 10, f1_test: 0.5903166679035904, f1_train: 0.7063885724994422
epoch: 11, f1_test: 0.4310460495499517, f1_train: 0.5648748974008755
epoch: 12, f1_test: 0.5985294435126046, f1_train: 0.7444643120071793
epoch: 13, f1_test: 0.5668015887107496, f1_train: 0.7170246361989844
epoch: 14, f1_test: 0.581910997928518

In [None]:
# change of baseline model
class ModelV2(nn.Module):
    
    def __init__(self, sample_rate=16000, n_classes=41):
        super().__init__()
        
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, padding=2)
        self.cnn2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.cnn2extra = nn.Conv2d(in_channels=64, out_channels=20, kernel_size=1, padding=0)
        
        self.cnn3 = nn.Conv2d(in_channels=20, out_channels=3, kernel_size=3, padding=1)
        
        self.max_pool = nn.MaxPool2d(3, 2)
       
        self.features = EfficientNet.from_pretrained('efficientnet-b0')
            
        self.lin1 = nn.Linear(1000, 333)
        
        self.lin2 = nn.Linear(333, 111)
                
        self.lin3 = nn.Linear(111, n_classes)

        self.drop_layer = nn.Dropout(p=0.4) 

        self.cnn1_bn = nn.BatchNorm2d(32)
        self.cnn2_bn = nn.BatchNorm2d(64)
        self.cnn3_bn = nn.BatchNorm2d(3)
        self.lin1_bn = nn.BatchNorm1d(333)
        self.lin2_bn = nn.BatchNorm1d(111)
        
    def forward(self, x):
        x = self.ms(x)
                
        x = F.relu(self.cnn1_bn(self.cnn1(x)))
        x = F.relu(self.cnn2_bn(self.cnn2(x)))
        x = self.max_pool(x)
        x = F.relu(self.cnn2extra(x))
        x = F.relu(self.cnn3_bn(self.cnn3(x)))

        x = self.features(x)

        x = x.view(x.shape[0], -1)
        x = self.drop_layer(x)
        x = F.relu(self.lin1_bn(self.lin1(x)))
        x = F.relu(self.lin2_bn(self.lin2(x)))
        x = self.lin3(x)
        return x
    
    def inference(self, x):
        x = self.forward(x)
        x = F.softmax(x)
        return x

In [None]:
criterion2 = nn.CrossEntropyLoss()
model2 = ModelV2()
model2 = model2.cuda()
lr = 1e-3

optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr)


In [1]:
n_epoch = 100
best_f1 = 0
for epoch in range(n_epoch):
    model2.train()
    for wavs, labs in train_loader:
        optimizer2.zero_grad()
        wavs, labs = wavs.cuda(), labs.cuda()
        outputs = model2(wavs)
        loss = criterion2(outputs, labs)
        loss.backward()
        optimizer2.step()

        # scheduler.step()
#     if epoch % 10 == 0:
    f1 = eval_model(model2, val_loader)
    f1_train = eval_model(model2, train_loader)
    print(f'epoch: {epoch}, f1_test: {f1}, f1_train: {f1_train}')
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model2.state_dict(), '../baseline_fulldiv.pt')
        
    lr = lr * 0.97
    for param_group in optimizer2.param_groups:
        param_group['lr'] = lr


epoch: 0, f1_test: 0.07697615112313103, f1_train: 0.07826489608877954
epoch: 1, f1_test: 0.11683850869904917, f1_train: 0.11628960729452452
epoch: 2, f1_test: 0.2154387726609989, f1_train: 0.2274105794427145
epoch: 3, f1_test: 0.20669931918895773, f1_train: 0.24261859882051948
epoch: 4, f1_test: 0.22127056803982595, f1_train: 0.2350701833177046
epoch: 5, f1_test: 0.4143568490412995, f1_train: 0.4447747595370425
epoch: 6, f1_test: 0.279094088237962, f1_train: 0.3443336355847003
epoch: 7, f1_test: 0.3047462403599801, f1_train: 0.36571036476539914
epoch: 8, f1_test: 0.34372829126542, f1_train: 0.3946142402827801
epoch: 9, f1_test: 0.3901702044236138, f1_train: 0.43503587091207346
epoch: 10, f1_test: 0.4939430832610174, f1_train: 0.5389643448163665
epoch: 11, f1_test: 0.5177220053928165, f1_train: 0.6379532236724383
epoch: 12, f1_test: 0.3442908222030643, f1_train: 0.42088265782089196
epoch: 13, f1_test: 0.47609287753298274, f1_train: 0.5771338710963421
epoch: 14, f1_test: 0.4464822684969

In [None]:
# make a model
model_name = 'model2_fulldiv.pt'
model = ModelV2().cuda()
model.load_state_dict(torch.load(os.path.join('..', model_name)))
model.eval()
forecast = []
with torch.no_grad():
    for wavs in tqdm(test_loader):
        wavs = wavs.cuda()
        outputs = model.inference(wavs)
        outputs = outputs.detach().cpu().numpy().argmax(axis=1)
        forecast.append(outputs)
forecast = [x for sublist in forecast for x in sublist]
decoder = {classes_dict[cl]:cl for cl in classes_dict}
forecast = pd.Series(forecast).map(decoder)
df_test['label'] = forecast
df_test.to_csv(f'{model_name}.csv', index=None)

Loaded pretrained weights for efficientnet-b0


100%|██████████| 93/93 [00:29<00:00,  3.18it/s]
