## **Complete Pipeline**

### **Loading Urban Sound Dataset**

In [None]:
import torch 
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torchaudio
from tqdm import tqdm
from pathlib import Path

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#!tar -xzvf "/content/drive/MyDrive/Classification/Audio_Classification/UrbanSound8K.tar.gz" -C "/content/drive/MyDrive/Classification/Audio_Classification/"  

In [None]:
class UrbanSoundDataset(Dataset):

  def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples):

    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.transformation = transformation
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples

  def __len__(self):
    
    return len(self.annotations)
  
  def __getitem__(self,index):

    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    signal, sr  = torchaudio.load(audio_sample_path)
    signal = self._resample_if_necessary(signal,sr)
    signal = self._mix_down_if_necessary(signal)
    signal = self._cut_if_necessary(signal)
    signal = self._right_pad_if_necessary(signal)
    signal = self.transformation(signal)

    return signal, label

  def _cut_if_necessary(self,signal):

    if signal.shape[1] > self.num_samples:
      signal = signal[:, :self.num_samples]

    return signal
  
  def _right_pad_if_necessary(self,signal):

    if signal.shape[1]<self.num_samples:
      num_missing_samples = self.num_samples - signal.shape[1]
      last_dim_padding = (0,num_missing_samples)
      signal = torch.nn.functional.pad(signal,last_dim_padding)
    
    return signal

  def _resample_if_necessary(self,signal,sr):

    if sr != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sr,self.target_sample_rate)
      signal = resampler(signal)
    return signal
  
  def _mix_down_if_necessary(self, signal):

    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    return signal


  def _get_audio_sample_path(self,index):
    fold = f"fold{self.annotations.iloc[index, 5]}"
    path = Path(self.audio_dir + fold + '/' + self.annotations.iloc[index, 0])
    return path

  def _get_audio_sample_label(self, index):
    return self.annotations.iloc[index, 6]

In [None]:
BATCH_SIZE = 128
EPOCHS = 30
LEARNING_RATE = 0.001
Sample_Rate = 22050
num_samples = 22050


### **Model**

In [None]:
from torch import nn
from torchsummary import summary

class CNNNetwork(nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = nn.Sequential(
        nn.Conv2d( in_channels = 1, out_channels = 32, kernel_size=3, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2),

    )
    self.conv2 = nn.Sequential(
        nn.Conv2d( in_channels = 32, out_channels = 64, kernel_size=3, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2),
        
    )
    self.conv3 = nn.Sequential(
        nn.Conv2d( in_channels = 64, out_channels = 128, kernel_size=3, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2),
        
    )
    self.conv4 = nn.Sequential(
        nn.Conv2d( in_channels = 128, out_channels = 256, kernel_size=3, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2),
        
    )
    self.flatten = nn.Flatten()
    self.linear = nn.Linear( 256*5*4, 10)
    self.softmax = nn.Softmax(dim=1)

  def forward(self,input_data):

    x = self.conv1(input_data)
    x = self.conv2(x)
    x = self.conv3(x)
    x = self.conv4(x)
    x = self.flatten(x)
    logits = self.linear(x)
    predictions = self.softmax(logits)

    return predictions

### **Model-Training**

In [None]:
def train_one_epoch(model, data_loader, loss_fn, optimiser, device):

  for inputs, targets in tqdm(data_loader):
    inputs,targets = inputs.to(device), targets.to(device)
    predictions = model(inputs)
    loss = loss_fn(predictions,targets)

    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
  
  print(f"Loss:{loss.item()}")

def train(model, data_loader, loss_fn, optimiser, device, epochs):

  for i in range(epochs):
    print(f"Epoch:{i+1}")
    train_one_epoch(model, data_loader, loss_fn, optimiser, device)

  print("Training Complete")

def create_data_loader(train_data, batch_size):

  train_dataloader = DataLoader(train_data, batch_size = batch_size)
  return train_dataloader

In [None]:
import torch 

if __name__ == '__main__':

  if torch.cuda.is_available():
    device = 'cuda'
  else:
    device = 'cpu'

  annotations_file = '/content/drive/MyDrive/Classification/Audio_Classification/UrbanSound8K/metadata/UrbanSound8K.csv'
  audio_dir = '/content/drive/MyDrive/Classification/Audio_Classification/UrbanSound8K/audio/'

  mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=Sample_Rate, n_fft=1024, hop_length=512, n_mels=64)
  usd = UrbanSoundDataset( annotations_file, audio_dir, mel_spectrogram, Sample_Rate, num_samples)
  train_data_loader = DataLoader( usd, batch_size = BATCH_SIZE)
  signal, label = usd[0]
  feed_forward_net = CNNNetwork().to(device)

  loss_fn = nn.CrossEntropyLoss()
  optimiser = torch.optim.Adam(feed_forward_net.parameters(),lr = 0.0001)

  train( feed_forward_net, train_data_loader, loss_fn, optimiser, device, EPOCHS)
  torch.save(feed_forward_net.state_dict(), "feedforwardnet.pth")
  print("Model trained and stored at feedforwardnet.pth")

Epoch:1


100%|██████████| 69/69 [02:29<00:00,  2.17s/it]


Loss:2.2449381351470947
Epoch:2


100%|██████████| 69/69 [02:35<00:00,  2.26s/it]


Loss:2.2027533054351807
Epoch:3


100%|██████████| 69/69 [02:29<00:00,  2.17s/it]


Loss:2.1946494579315186
Epoch:4


100%|██████████| 69/69 [02:29<00:00,  2.16s/it]


Loss:2.1176228523254395
Epoch:5


100%|██████████| 69/69 [02:28<00:00,  2.16s/it]


Loss:2.1613874435424805
Epoch:6


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:2.061757802963257
Epoch:7


100%|██████████| 69/69 [02:27<00:00,  2.14s/it]


Loss:2.133641242980957
Epoch:8


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9920647144317627
Epoch:9


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9796364307403564
Epoch:10


100%|██████████| 69/69 [02:29<00:00,  2.17s/it]


Loss:1.9715896844863892
Epoch:11


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9457685947418213
Epoch:12


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.930105209350586
Epoch:13


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9140983819961548
Epoch:14


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9059288501739502
Epoch:15


100%|██████████| 69/69 [02:27<00:00,  2.14s/it]


Loss:1.939225196838379
Epoch:16


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:2.0358150005340576
Epoch:17


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9305543899536133
Epoch:18


100%|██████████| 69/69 [02:29<00:00,  2.16s/it]


Loss:2.096281051635742
Epoch:19


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.929892897605896
Epoch:20


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9084863662719727
Epoch:21


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9299439191818237
Epoch:22


100%|██████████| 69/69 [02:29<00:00,  2.17s/it]


Loss:1.9227354526519775
Epoch:23


100%|██████████| 69/69 [02:28<00:00,  2.16s/it]


Loss:1.9392348527908325
Epoch:24


100%|██████████| 69/69 [02:28<00:00,  2.16s/it]


Loss:1.916399359703064
Epoch:25


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9377614259719849
Epoch:26


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.8215843439102173
Epoch:27


100%|██████████| 69/69 [02:28<00:00,  2.15s/it]


Loss:1.9191361665725708
Epoch:28


100%|██████████| 69/69 [02:28<00:00,  2.16s/it]


Loss:1.908983826637268
Epoch:29


100%|██████████| 69/69 [02:29<00:00,  2.16s/it]


Loss:1.9204531908035278
Epoch:30


100%|██████████| 69/69 [02:29<00:00,  2.16s/it]

Loss:1.9179407358169556
Training Complete
Model trained and stored at feedforwardnet.pth





### **Testing**



In [None]:
import torch

class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]


def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("/content/feedforwardnet.pth")
    cnn.load_state_dict(state_dict)

    # load urban sound dataset dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate = Sample_Rate,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(annotations_file, audio_dir, mel_spectrogram, Sample_Rate, num_samples)


    # get a sample from the urban sound dataset for inference
    input, target = usd[0][0], usd[0][1] # [batch size, num_channels, fr, time]
    input.unsqueeze_(0)

    # make an inference
    predicted, expected = predict(cnn, input, target,class_mapping)
    print(f"Predicted: '{predicted}', expected: '{expected}'")

Predicted: 'dog_bark', expected: 'dog_bark'
