In [1]:
import numpy as np
import librosa
import os

In [2]:
def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300, top_db=80):
  wav,sr = librosa.load(file_path,sr=sr)
  if wav.shape[0]<5*sr:
    wav=np.pad(wav,int(np.ceil((5*sr-wav.shape[0])/2)),mode='reflect')
  else:
    wav=wav[:5*sr]
  spec=librosa.feature.melspectrogram(wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length,n_mels=n_mels,fmin=fmin,fmax=fmax)
  spec_db=librosa.power_to_db(spec,top_db=top_db)
  return spec_db

In [3]:
def spec_to_image(spec, eps=1e-6):
  mean = spec.mean()
  std = spec.std()
  spec_norm = (spec - mean) / (std + eps)
  spec_min, spec_max = spec_norm.min(), spec_norm.max()
  spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
  spec_scaled = spec_scaled.astype(np.uint8)
  return spec_scaled

In [4]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
class ESC50Data(Dataset):
  def __init__(self, base, df, in_col, out_col):
    self.df = df
    self.data = []
    self.labels = []
    self.c2i={}
    self.i2c={}
    self.categories = sorted(df[out_col].unique())
    for i, category in enumerate(self.categories):
      self.c2i[category]=i
      self.i2c[i]=category
    for ind in tqdm(range(len(df))):
      row = df.iloc[ind]
      file_path = os.path.join(base,row[in_col])
      self.data.append(spec_to_image(get_melspectrogram_db(file_path))[np.newaxis,...])
      self.labels.append(self.c2i[row['category']])
  def __len__(self):
    return len(self.data)
  def __getitem__(self, idx):
    return self.data[idx], self.labels[idx]

In [183]:
train = pd.read_csv('dataset/ESC-50-master/train.csv')
valid = pd.read_csv('dataset/ESC-50-master/test.csv')
train_data = ESC50Data('dataset/ESC-50-master/audio/', train, 'filename', 'category')
valid_data = ESC50Data('dataset/ESC-50-master/audio/', valid, 'filename', 'category')
train_loader = DataLoader(train_data, batch_size=10, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=1, shuffle=True)

100%|██████████| 1949/1949 [00:22<00:00, 88.44it/s]
100%|██████████| 50/50 [00:00<00:00, 93.76it/s]


In [170]:
data, label = iter(train_loader).next()
data.shape

torch.Size([2, 1, 128, 431])

In [411]:
import torch.nn as nn
import torch.nn.functional as F
#from base import BaseModel

class ESC50Model(nn.Module):
    def __init__(self, input_shape, batch_size=16, num_cats=50):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 80, kernel_size = (57, 6), stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(80)
        #self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=3, padding=1)
        self.conv2 = nn.Conv2d(80, 80, kernel_size = (1, 3), stride=1, padding=0)
        self.bn2 = nn.BatchNorm2d(80)  
        n1 = (input_shape[1]-57)+1        
        n3 = ((n1 -4) / 1) + 1
        
        n2 = (input_shape[2]-6)+1        
        n4 = int((n2 -3) / 3) + 1
        
        n5 = ((n3-1) / 1) + 1
        n7 = ((n5 - 1) / 1) + 1        
        
        n6 = ((n4 - 3) / 1) + 1
        n8 = int((n6 -3) / 3) + 1
        n = int(n8 * n7 * 80)
        self.dense1 = nn.Linear(n, 100)        
        self.dense2 = nn.Linear(100, num_cats)                

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = F.max_pool2d(x, kernel_size=(4, 3), stride= (1,3))
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = F.max_pool2d(x, kernel_size=(1, 3), stride= (1,3))  
        x = x.view(x.size(0),-1)
        x = self.dense1(x)
        x = self.dense2(x)        
        return x


In [412]:
from torchvision.models import resnet34
import torch
import torch.nn as nn
import torch.optim as optim
if torch.cuda.is_available():
  device=torch.device('cuda:0')
else:
  device=torch.device('cpu')


In [413]:
input_shape = (1, 128, 431)
model = ESC50Model(input_shape)
model = model.to(device)

In [414]:
x, y = iter(train_loader).next()
x.shape

torch.Size([10, 1, 128, 431])

In [415]:
x = torch.randn(2,128, 431)
x = torch.unsqueeze(x, dim=1)
n = x.shape[2]*x.shape[3] 
y_hat = model(x.float().to(device))

y_hat.shape

torch.Size([2, 50])

In [347]:
model = resnet34(pretrained=True)
model.fc = nn.Linear(512,50)
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
model = model.to(device)

In [318]:

def lr_decay(optimizer, epoch):
  if epoch%10==0:
    new_lr = learning_rate / (10**(epoch//10))
    optimizer = setlr(optimizer, new_lr)
    print(f'Changed learning rate to {new_lr}')
  return optimizer


In [319]:
def accuracy_fn(output, target):
    with torch.no_grad():
        pred = torch.argmax(torch.from_numpy(output), dim=1)
        target = torch.from_numpy(target)
        assert pred.shape[0] == len(target)
        correct = 0
        correct += torch.sum(pred == target).item()
    return correct / len(target)


def top_k_acc(output, target, k=3):
    with torch.no_grad():
        pred = torch.topk(output, k, dim=1)[1]
        assert pred.shape[0] == len(target)
        correct = 0
        for i in range(k):
            correct += torch.sum(pred[:, i] == target).item()
    return correct / len(target)


In [416]:
def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None):
    for epoch in tqdm(range(1,epochs+1)):
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for i, data in enumerate(train_loader):
            x, y = data
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()

        train_losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        for i, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        #accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        accuracy = accuracy_fn(trace_yhat, trace_y)
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')


In [419]:
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
epochs = 30
loss_fn = nn.CrossEntropyLoss()
train_losses=[]
valid_losses=[]


train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, lr_decay)

  3%|▎         | 1/30 [00:14<07:09, 14.82s/it]

Epoch - 1 Train-Loss : 11.314625670811019
Epoch - 1 Valid-Loss : 75.5243367877602 Valid-Accuracy : 0.06


  7%|▋         | 2/30 [00:29<06:55, 14.85s/it]

Epoch - 2 Train-Loss : 5.197297315200115
Epoch - 2 Valid-Loss : 70.33436861038207 Valid-Accuracy : 0.02


 10%|█         | 3/30 [00:44<06:43, 14.93s/it]

Epoch - 3 Train-Loss : 2.477169712022815
Epoch - 3 Valid-Loss : 47.91774243037366 Valid-Accuracy : 0.04


 13%|█▎        | 4/30 [00:59<06:29, 14.96s/it]

Epoch - 4 Train-Loss : 1.2231332085582745
Epoch - 4 Valid-Loss : 37.59196340560913 Valid-Accuracy : 0.0


 17%|█▋        | 5/30 [01:15<06:15, 15.00s/it]

Epoch - 5 Train-Loss : 0.9090656357631113
Epoch - 5 Valid-Loss : 38.070914583206175 Valid-Accuracy : 0.0


 20%|██        | 6/30 [01:30<06:01, 15.05s/it]

Epoch - 6 Train-Loss : 0.8304071721860931
Epoch - 6 Valid-Loss : 44.6129428678751 Valid-Accuracy : 0.06


 23%|██▎       | 7/30 [01:45<05:46, 15.09s/it]

Epoch - 7 Train-Loss : 0.8552062861162018
Epoch - 7 Valid-Loss : 32.33463901519775 Valid-Accuracy : 0.0


 27%|██▋       | 8/30 [02:00<05:32, 15.11s/it]

Epoch - 8 Train-Loss : 0.5476336242041054
Epoch - 8 Valid-Loss : 37.5331032371521 Valid-Accuracy : 0.0


 30%|███       | 9/30 [02:15<05:16, 15.07s/it]

Epoch - 9 Train-Loss : 0.34306283500098794
Epoch - 9 Valid-Loss : 31.077303858697416 Valid-Accuracy : 0.02





NameError: name 'setlr' is not defined