In [74]:
import pandas as pd
import numpy as np
import librosa

import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [93]:
import warnings
warnings.filterwarnings('ignore')

In [82]:
def get_target_description(df):
    return df[["target", "category"]].groupby("target").first()


def get_target_distribution(df, title="distribution_of_targets"):
    ax = df.target.hist(bins=50, alpha=0.5, edgecolor="black")
    ax.set_xlabel("target")
    ax.set_ylabel("number_of_data_points")
    plt.title(title)
    plt.show()


def get_fold_distribution(dct):
    for i, fold in dct.items():
        get_target_distribution(fold, title=f"fold_{i}")
    
    
def get_folds(df):
    fold_dct = dict()
    for fold in df.fold.unique():
        fold_dct[fold] = df[df.fold == fold]
    return fold_dct


def train_test_split(df, test_fold=1):
    train_df = df[df.fold != test_fold]
    test_df = df[df.fold == test_fold]
    return train_df, test_df


def prepare_data(path, only_esc10=True):
    df = pd.read_csv(path)
    if only_esc10:
        df = df[df.esc10 == True]
    return df

In [76]:
csv_path = "../ESC-50-master/meta/esc50.csv"

df = prepare_data(csv_path, only_esc10=False)

df

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
...,...,...,...,...,...,...,...
1995,5-263831-B-6.wav,5,6,hen,False,263831,B
1996,5-263902-A-36.wav,5,36,vacuum_cleaner,False,263902,A
1997,5-51149-A-25.wav,5,25,footsteps,False,51149,A
1998,5-61635-A-8.wav,5,8,sheep,False,61635,A


In [83]:
def get_melspectrogram_db(file_path, 
                          sr=None, 
                          n_fft=2048, 
                          hop_length=512, 
                          n_mels=128, 
                          fmin=20, 
                          fmax=8300, 
                          top_db=80):
    
    wav, sr = librosa.load(file_path, sr=sr)
    if wav.shape[0] < 5*sr:
        wav=np.pad(wav, int(np.ceil((5*sr-wav.shape[0])/2)), mode='reflect')
    else:
        wav=wav[:5*sr]
        
    spec=librosa.feature.melspectrogram(wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length,n_mels=n_mels,fmin=fmin,fmax=fmax)
    
    spec_db=librosa.power_to_db(spec, top_db=top_db)
    
    return spec_db

In [84]:
def spec_to_image(spec, eps=1e-6):
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
    spec_scaled = spec_scaled.astype(np.uint8)
    return spec_scaled

In [95]:
train_df, test_df = train_test_split(df, test_fold=1)

In [90]:
import os

In [99]:
class ESC50Data(Dataset):
    
    def __init__(self, df, 
                 base_dir='../ESC-50-master/audio', 
                 in_col='filename', 
                 out_col='category'):
        
        self.df = df
        self.data = []
        self.labels = []
        self.c2i={}
        self.i2c={}
        self.categories = sorted(df[out_col].unique())
        
        for i, category in enumerate(self.categories):
            self.c2i[category] = i
            self.i2c[i] = category
            
        for ind in tqdm(range(len(df))):
            row = df.iloc[ind]
            file_path = os.path.join(base_dir, row[in_col])
            self.data.append(spec_to_image(get_melspectrogram_db(file_path))[np.newaxis,...])
            self.labels.append(self.c2i[row[out_col]])
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [96]:
train_data = ESC50Data(train_df)
test_data = ESC50Data(train_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:39<00:00, 40.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:35<00:00, 45.71it/s]


In [101]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=True)

In [102]:
train_loader 

<torch.utils.data.dataloader.DataLoader at 0x7fa91c7be170>

In [104]:
from torchvision.models import resnet34

In [105]:
import torch
import torch.nn as nn
import torch.optim as optim

In [113]:
def get_device():
    return torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [114]:
get_device()

device(type='cpu')

In [107]:
resnet_model = resnet34(pretrained=True)
resnet_model.fc = nn.Linear(512, 50)
resnet_model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet_model = resnet_model.to(device)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /home/galdmitry/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100.0%


In [108]:
resnet_model

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [109]:
learning_rate = 2e-4
optimizer = optim.Adam(resnet_model.parameters(), lr=learning_rate)
epochs = 50
loss_fn = nn.CrossEntropyLoss()
resnet_train_losses=[]
resnet_valid_losses=[]

In [116]:
def lr_decay(optimizer, epoch):
    if epoch%10==0:
        new_lr = learning_rate / (10**(epoch//10))
        optimizer = setlr(optimizer, new_lr) ######################### ?????
        print(f'Changed learning rate to {new_lr}')
    return optimizer


def train(model, 
          loss_fn, 
          train_loader, 
          valid_loader, 
          epochs, 
          optimizer, 
          train_losses, 
          valid_losses, 
          change_lr=None):
      
    for epoch in tqdm(range(1, epochs+1)):
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch) #### ???
            
        for i, data in enumerate(train_loader):
            x, y = data
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            
            y_hat = model(x)
            
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
            
        train_losses.append(batch_losses)
        
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')
        
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        
        for i, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())
            
        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')
    

In [112]:
train(resnet_model, 
      loss_fn, 
      train_loader, 
#       valid_loader,
      test_loader,
      epochs,
      optimizer, 
      resnet_train_losses, 
      resnet_valid_losses, 
      lr_decay)

  0%|                                                                                                                          | 0/50 [02:13<?, ?it/s]


KeyboardInterrupt: 