# CNN for Audio Classification

In [1]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import pandas as pd
import torch
import os
from torchsummary import summary

  from .autonotebook import tqdm as notebook_tqdm


Building the UrbanSoundDataset Dataset

In [2]:
class UrbanSoundDataset(Dataset):
    def __init__(self, annotations_path, audio_dir, transformation, target_sample_rate, sample_number):
        self.annotations = pd.read_csv(annotations_path)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.sample_number = sample_number

    def __len__(self):
        return len(self.annotations)        

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = self._resample(signal, sr)
        signal = self._mix_down(signal)
        signal = self._truncate_signal_size(signal, self.sample_number)
        signal = self.transformation(signal)
        return signal, label

    def _get_audio_sample_path(self, index):
        folder = f'fold{self.annotations.iloc[index, 5]}'
        filename = self.annotations.iloc[index, 0]
        path = os.path.join(self.audio_dir, folder, filename)
        return path

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]

    def _resample(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
        
    def _truncate_signal_size(self, signal, sample_number):
        if signal.shape[1] > sample_number:
            signal = signal[:, :sample_number]
        elif signal.shape[1] < sample_number:
            pad_size = sample_number - signal.shape[1]
            signal = torch.nn.functional.pad(signal, pad=(0, pad_size), value=0)
        return signal
        


Creating the CNN class

In [3]:
class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.Sequential(

            ## Convolution Block [1/4]
            torch.nn.Conv2d(in_channels=1, 
                            out_channels=16, 
                            kernel_size=3, 
                            padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            ## Convolution Block [2/4]
            torch.nn.Conv2d(in_channels=16, 
                            out_channels=32, 
                            kernel_size=3, 
                            padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            ## Convolution Block [3/4]
            torch.nn.Conv2d(in_channels=32, 
                            out_channels=64, 
                            kernel_size=3, 
                            padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            ## Convolution Block [4/4]
            torch.nn.Conv2d(in_channels=64, 
                            out_channels=128, 
                            kernel_size=3, 
                            padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            torch.nn.Flatten(),

            torch.nn.Linear(128 * 5 * 4, 10),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, input):
        return self.layers(input)

In [4]:
SAMPLE_RATE = 22050
SAMPLE_NUMBER = 22050

mel_spectogram = torchaudio.transforms.MelSpectrogram(  sample_rate=SAMPLE_RATE,
                                                        n_fft=1024,
                                                        hop_length=512,
                                                        n_mels=64)

usd = UrbanSoundDataset(annotations_path='../../../Downloads/archive(4)/UrbanSound8K.csv',
                        audio_dir='../../../Downloads/archive(4)',
                        transformation=mel_spectogram,
                        target_sample_rate=SAMPLE_RATE,
                        sample_number=SAMPLE_NUMBER)

In [5]:
model = CNN()
train_loader = DataLoader(usd, batch_size=128)

In [6]:
def train(model, loader, loss_fn, n_batches):
    iterations = len(loader)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for batch_i in range(n_batches):
        n_correct = 0
        n_trained = 0
        print(f'Batch: [{batch_i}/{n_batches}]')
        for i, (audio, target) in enumerate(loader):
            # Forward Pass
            pred_prob = model(audio)

            # Loss Calculation
            loss = loss_fn(pred_prob, target)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Get Accuracy
            with torch.no_grad():
                n_correct += (target == pred_prob.argmax(1)).sum().item()
                n_trained += len(audio)

            if i%10 == 0:
                print(f' [{i}/{iterations}] - samples_analyzed: {n_trained} | loss: {loss} | accuracy: {n_correct/n_trained}')


Training the model for 5 batches

In [8]:
loss_fn = torch.nn.CrossEntropyLoss()

train(model, train_loader, loss_fn, n_batches=10)

Batch: [0/10]
 [0/69] - samples_analyzed: 128 | loss: 2.245386838912964 | accuracy: 0.2265625
 [10/69] - samples_analyzed: 1408 | loss: 2.314112901687622 | accuracy: 0.11931818181818182
 [20/69] - samples_analyzed: 2688 | loss: 2.315645456314087 | accuracy: 0.12202380952380952
 [30/69] - samples_analyzed: 3968 | loss: 2.4611504077911377 | accuracy: 0.1305443548387097
 [40/69] - samples_analyzed: 5248 | loss: 2.3503973484039307 | accuracy: 0.1442454268292683
 [50/69] - samples_analyzed: 6528 | loss: 2.336120128631592 | accuracy: 0.13985906862745098
 [60/69] - samples_analyzed: 7808 | loss: 2.4608023166656494 | accuracy: 0.13857581967213115
Batch: [1/10]
 [0/69] - samples_analyzed: 128 | loss: 2.2798194885253906 | accuracy: 0.1796875
 [10/69] - samples_analyzed: 1408 | loss: 2.282097816467285 | accuracy: 0.10511363636363637
 [20/69] - samples_analyzed: 2688 | loss: 2.3230583667755127 | accuracy: 0.11644345238095238
 [30/69] - samples_analyzed: 3968 | loss: 2.4611434936523438 | accuracy: 