In [None]:
# Mount Google Drive at /content/drive; the dataset archive is read from there in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Extracting dataset to a directory called ESC-50

In [None]:
import os
import zipfile

# Make sure the extraction folder exists (no error when it is already there),
# then unpack the archive stored on Drive into it.
os.makedirs("ESC-50", exist_ok=True)
with zipfile.ZipFile('/content/drive/MyDrive/ESC-50-master.zip', "r") as zip_ref:
    zip_ref.extractall("ESC-50")

print("Dataset extracted to directory:", "ESC-50")

In [None]:
# Names of every .wav clip found in the extracted audio folder.
wav_files = list(filter(lambda name: name.endswith(".wav"), os.listdir("/content/ESC-50/ESC-50-master/audio")))

Importing the audio libraries and defining two dictionaries: `encoder` maps each class name of the dataset to its numeric target, and `decoder` maps each target back to its class name.

In [None]:
import random

import librosa
import librosa.display
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
# Dictionary that encodes each category name into its numeric target.
encoder = {
    'dog': 0,
    'chirping_birds': 14,
    'vacuum_cleaner': 36,
    'thunderstorm': 19,
    'door_wood_knock': 30,
    'can_opening': 34,
    'crow': 9,
    'clapping': 22,
    'fireworks': 48,
    'chainsaw': 41,
    'airplane': 47,
    'mouse_click': 31,
    'pouring_water': 17,
    'train': 45,
    'sheep': 8,
    'water_drops': 15,
    'church_bells': 46,
    'clock_alarm': 37,
    'keyboard_typing': 32,
    'wind': 16,
    'footsteps': 25,
    'frog': 4,
    'cow': 3,
    'brushing_teeth': 27,
    'car_horn': 43,
    'crackling_fire': 12,
    'helicopter': 40,
    'drinking_sipping': 29,
    'rain': 10,
    'insects': 7,
    'laughing': 26,
    'hen': 6,
    'engine': 44,
    'breathing': 23,
    'crying_baby': 20,
    'hand_saw': 49,
    'coughing': 24,
    'glass_breaking': 39,
    'snoring': 28,
    'toilet_flush': 18,
    'pig': 2,
    'washing_machine': 35,
    'clock_tick': 38,
    'sneezing': 21,
    'rooster': 1,
    'sea_waves': 11,
    'siren': 42,
    'cat': 5,
    'door_wood_creaks': 33,
    'crickets': 13,
}
# Dictionary that decodes a numeric target back into its category name.
# It is the exact inverse of `encoder`, built in the same insertion order.
decoder = {target: category for category, target in encoder.items()}

A. Exploring the Audio Data
Let's first explore one audio sample, the dog bark `1-100032-A-0.wav`, by printing the values of:

Sample rate: the number of samples per second.<br>
Amplitude: the measure of how far the wave extends above or below the x-axis.

In [None]:
# Load a single clip, then report its raw samples, its array shape and its duration in seconds.
import numpy as np
y, sr = librosa.load("/content/ESC-50/ESC-50-master/audio/1-100032-A-0.wav")
print('amplitude y:', y)
print('y shape:', y.shape)
print('the duration of the audio:', len(y) / sr, 's')

In [None]:
# Listen to the audio file: the Audio object is the cell's last expression,
# so the notebook displays it as the cell output.
import IPython.display as ipd
ipd.Audio("/content/ESC-50/ESC-50-master/audio/1-100032-A-0.wav")

### B. Distribution of categories

In [None]:
# The dataset is described by the esc50.csv metadata file; load it into a DataFrame
# so the dataset can be explored in the following cells.
import pandas as pd
df = pd.read_csv('/content/ESC-50/ESC-50-master/meta/esc50.csv')

In [None]:
# Display the first 5 rows of the metadata table.
df.head()

In [None]:
import seaborn as sns

# Count how many rows (clips) each class has and draw the counts as a bar chart,
# using one colour per class.
category_group = df['category'].value_counts()
colors = sns.color_palette("husl", n_colors=len(category_group))
plot = category_group.plot.bar(
    title="The number of audios per class",
    figsize=(20, 10),
    color=colors,
)
plot.set(xlabel="Class", ylabel="Number of audios");

## Visualizing audio signals

In [None]:
%cd ESC-50/ESC-50-master/audio

In [None]:
# Pick 10 distinct clips at random and load them for the plots below.
# random.sample draws without replacement, so the same clip cannot be plotted twice.
# The clips are loaded by full path, so this cell does not depend on the current
# working directory.
plot_files = random.sample(wav_files, k=10)
plot_audios = [
    librosa.load(os.path.join("/content/ESC-50/ESC-50-master/audio", name))
    for name in plot_files
]

In [None]:
# Plot the waveform of six of the sampled clips (entries 1..6 of plot_audios),
# each titled with its class name.
import matplotlib.pyplot as plt
plt.figure(figsize=(14,10))
for i in range(1,7):
    plt.subplot(2,3,i)
    samples, rate = plot_audios[i]
    librosa.display.waveshow(samples, sr=rate)
    # File names end in "-<target>.wav": take the text after the last dash rather than
    # guessing how many digits the target has and relying on a bare `except`.
    target = int(os.path.splitext(plot_files[i])[0].rsplit('-', 1)[-1])
    plt.title("Sound of " + decoder[target])

## Fourier transform

In [None]:
# Plot the magnitude of the short-time Fourier transform of the same six clips
# (entries 1..6 of plot_audios), each titled with its class name.
plt.figure(figsize=(14,10))
for i in range(1,7):
    plt.subplot(2,3,i)
    X = np.abs(librosa.stft(plot_audios[i][0], n_fft=2048, hop_length=512))
    plt.plot(X)
    plt.xlabel("freq")
    plt.ylabel("Amplitude")
    # File names end in "-<target>.wav": take the text after the last dash rather than
    # guessing how many digits the target has and relying on a bare `except`.
    target = int(os.path.splitext(plot_files[i])[0].rsplit('-', 1)[-1])
    plt.title("Fourier Transform of " + decoder[target])

## III. Data Preparation <br>
A. Data Augmentation <br>
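Time stretching: This technique changes the duration of the audio signal by speeding it up or slowing it down. This can be useful for simulating variations in the tempo of the audio. (It is described here for completeness; the augmentation functions defined below implement noise, time shifting and volume scaling.)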

Noise injection: This technique adds random Gaussian noise to the audio signal to simulate different noise conditions.

In [None]:
def add_noise(path):
    """Return a noisy copy of an audio signal.

    Despite its name, ``path`` is the sequence of audio samples, not a file
    path. One value of zero-mean Gaussian noise with standard deviation 0.1
    is drawn per sample and added element-wise; the input is not modified.
    """
    return path + np.random.normal(loc=0, scale=0.1, size=len(path))

Time shifting: This technique changes the position of the audio signal in time by shifting it forwards or backwards.

In [None]:
def random_shift(path):
    """Shift an audio signal in time by a random amount, keeping its length.

    Despite its name, ``path`` is a 1-D array of audio samples. The shift is
    drawn uniformly in [-20%, +20%) of the signal length. A positive shift
    delays the signal (zeros are inserted at the start and the tail is cut);
    a negative shift advances it (the head is cut and zeros are appended).

    Returns a new array with the same length as ``path``.
    """
    timeshift_fac = 0.2 *2*(np.random.uniform()-0.5)  # up to 20% of length
    n_samples = path.shape[0]
    start = int(n_samples * timeshift_fac)
    if start > 0:
        # Delay: pad the front with zeros and keep the first n_samples values.
        return np.pad(path, (start, 0), mode='constant')[:n_samples]
    # Advance: pad the back with zeros and drop the first -start values.
    # Slicing from 0 here would hand back the input unshifted.
    return np.pad(path, (0, -start), mode='constant')[-start:]

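Volume scaling: This technique changes the volume of the audio signal by multiplying it by a random gain. This can be useful for simulating variations in the loudness of the audio. In the function below the gain is drawn between 1.5 and 2.5, so the clips are only made louder.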

In [None]:
def volume_scaling(path, low=1.5, high=2.5):
    """Scale the volume of an audio signal by a random gain.

    Despite its name, ``path`` is the array of audio samples. A single gain
    is drawn uniformly between ``low`` and ``high`` and multiplied into
    every sample.

    Args:
        path: audio samples to scale.
        low: lower bound of the gain (default 1.5).
        high: upper bound of the gain (default 2.5).

    Returns:
        The scaled samples; the input is not modified.
    """
    dyn_change = np.random.uniform(low=low, high=high)
    return path * dyn_change

In [None]:
def aug_audio(file, aug, sr=16000, directory='ESC-50-augmented-data/'):
    """Write one augmented clip to disk as a 24-bit PCM audio file.

    Args:
        file: file name (not a full path) of the clip to create.
        aug: the augmented audio samples.
        sr: sample rate written to the file header (default 16000).
        directory: output folder, created if it does not exist. The default
            is a relative path, so it is resolved against the current working
            directory at call time.
    """
    os.makedirs(directory, exist_ok=True)
    # Store the samples as a single-channel float32 column before writing.
    samples = np.array(aug, dtype='float32').reshape(-1, 1)
    sf.write(os.path.join(directory, file), samples, sr, 'PCM_24')

In [None]:
def augmentations(path_audio, n_files=2000):
    """Create noisy, time-shifted and volume-scaled copies of the training clips.

    Up to ``n_files`` ``.wav`` files are drawn at random (without
    replacement) from ``path_audio``. Files whose name starts with "5" are
    skipped; these match the ``5-*`` pattern that ``DataGenerator`` uses for
    the test split. Every remaining clip is loaded at 16 kHz and written
    three times through ``aug_audio``: variant 0 is the noisy copy, 1 the
    time-shifted copy and 2 the volume-scaled copy.

    Output names keep the first two characters of the original name, then
    ``generated-<variant>-<index>-``, then the rest of the original name.

    Args:
        path_audio: folder containing the source ``.wav`` files.
        n_files: maximum number of files to draw (default 2000).
    """
    # Only consider audio files, so sub-folders or stray files are never loaded.
    wav_names = [name for name in os.listdir(path_audio) if name.endswith(".wav")]
    if not wav_names:
        return
    # Cap the draw size: sampling without replacement fails when asked for
    # more items than are available.
    sample_size = min(n_files, len(wav_names))
    chosen = np.random.choice(wav_names, size=sample_size, replace=False)
    for k, files in enumerate(chosen):
        if files[0] == "5":
            continue
        data_, _ = librosa.load(os.path.join(path_audio, files), sr=16000)
        augmented = [add_noise(data_), random_shift(data_), volume_scaling(data_)]
        for m, clip in enumerate(augmented):
            filename = files[0:2] + 'generated' + '-' + str(m) + '-' + str(k) + '-' + files[2:]
            aug_audio(filename, clip)

In [None]:
# Performing data augmentation on the clips of the original ESC-50 audio folder
path="/content/ESC-50/ESC-50-master/audio"
augmentations(path)

## IV. Neural network approach

In [None]:
class DataGenerator(Dataset):
    """Dataset of raw-audio clips selected by the fold digit that starts each file name.

    ``kind='train'`` keeps files matching ``[1-4]-*`` and ``kind='test'``
    keeps files matching ``5-*``. The label of a clip is the text after the
    last ``-`` in its file name, with ``.wav`` removed.

    Args:
        path: folder that contains the audio files.
        transform: optional callable applied to the waveform of each clip
            before it is returned; ``None`` leaves the waveform untouched.
        kind: ``'train'`` or ``'test'``.

    Raises:
        ValueError: if ``kind`` is neither ``'train'`` nor ``'test'``.
    """

    # Glob pattern that selects the files of each split.
    _PATTERNS = {'train': '[1-4]-*', 'test': '5-*'}

    def __init__(self, path, transform = None, kind='train'):
        if kind not in self._PATTERNS:
            raise ValueError("kind must be 'train' or 'test', got {!r}".format(kind))
        self.transform = transform
        # Sort the matches so the item order does not depend on directory listing order.
        files = sorted(Path(path).glob(self._PATTERNS[kind]))
        self.item = [(str(file), file.name.split('-')[-1].replace('.wav', '')) for file in files]
        self.len = len(self.item)
        print(self.len)

    def __getitem__(self, index):
        """Return ``(waveform, label)`` for the clip at ``index``."""
        filename, label = self.item[index]
        # The sample rate returned by torchaudio is not used.
        data_tensor, rate = torchaudio.load(filename)
        # Keep the first channel and at most the first 80000 samples.
        # NOTE(review): a clip with fewer than 80000 samples yields a shorter tensor - confirm all clips are long enough.
        tmp = data_tensor[0,0:80000]
        if self.transform is not None:
            tmp = self.transform(tmp)
        return (tmp, int(label))

    def __len__(self):
        return self.len

In [None]:
# Path is used inside DataGenerator.__init__, so it must be imported before the datasets are built.
from pathlib import Path
batch= 64
# NOTE(review): this is the original audio folder; the clips written by aug_audio go to
# 'ESC-50-augmented-data/' and are not matched here - confirm whether they should be trained on.
path_audio= '/content/ESC-50/ESC-50-master/audio'
train_data = DataGenerator(path_audio, kind='train')
test_data = DataGenerator(path_audio, kind='test')
#Applying the data loader on the training and testing data
train_loader = DataLoader(train_data, batch_size=batch, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch, shuffle=True)

In [None]:
# Creating the cnn class
class Net(nn.Module):
    """Convolutional classifier for raw-audio clips of exactly 80000 samples.

    Two (1, 8) convolutions and a (1, 128) max-pool first turn the waveform
    into a 16 x 625 map, which is then processed as a single-channel image by
    three further pairs of 3 x 3 convolutions, each followed by a max-pool.
    The final 128 x 1 x 2 map is flattened to 256 features and fed to a
    linear layer that outputs one score (logit) per class.

    Args:
        num_classes: number of output classes (default 50).
    """

    #Constructor
    def __init__(self, num_classes=50):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(1,8), stride=(1,1), padding="same")
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(1,8), stride=(1,1), padding="same")
        self.bn2 = nn.BatchNorm2d(16)

        self.pool_1 = nn.MaxPool2d(kernel_size=(1,128), stride = (1,128), padding=0)

        self.conv3 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3,3), stride=(1,1), padding=1)
        self.bn3 = nn.BatchNorm2d(32)
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=(1,1), padding=1)
        self.bn4 = nn.BatchNorm2d(32)

        self.pool_2 = nn.MaxPool2d(kernel_size=4, padding=0)

        self.conv5 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3,3), stride=(2,2), padding=2)
        self.bn5 = nn.BatchNorm2d(64)
        self.conv6 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn6 = nn.BatchNorm2d(64)

        self.pool_3 = nn.MaxPool2d(kernel_size=2, padding=0)

        self.conv7 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn7 = nn.BatchNorm2d(128)
        self.conv8 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn8 = nn.BatchNorm2d(128)

        self.pool_4 = nn.MaxPool2d(kernel_size=(1,2), padding=0)

        self.dense = nn.Linear(in_features= 256, out_features=num_classes)
        # defining the dropout that is used mainly to avoid overfitting
        self.dropout = nn.Dropout(0.2)

    def forward(self, a):
        """Return the class logits for a batch of waveforms.

        Args:
            a: tensor holding 80000 samples per clip, e.g. shaped
               (batch, 80000) or (batch, 1, 80000).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.
        """
        # (B, 1, 1, 80000): one channel, height 1, the samples along the width.
        a= a.reshape(-1, 1, 1, 80000)
        a= F.relu(self.bn1(self.conv1(a)))          # (B, 16, 1, 80000)
        a= self.dropout(a)
        a= F.relu(self.bn2(self.conv2(a)))          # (B, 16, 1, 80000)
        a= self.pool_1(a)                           # (B, 16, 1, 625)
        # Reinterpret the 16 feature channels as the rows of a one-channel image.
        a= a.view((-1,1,16, 625))                   # (B, 1, 16, 625)
        a= F.relu(self.bn3(self.conv3(a)))          # (B, 32, 16, 625)
        a= self.dropout(a)
        a= F.relu(self.bn4(self.conv4(a)))          # (B, 32, 16, 625)
        a= self.pool_2(a)                           # (B, 32, 4, 156)
        a= F.relu(self.bn5(self.conv5(a)))          # (B, 64, 3, 79)
        a= self.dropout(a)
        a= F.relu(self.bn6(self.conv6(a)))          # (B, 64, 2, 40)
        a= self.pool_3(a)                           # (B, 64, 1, 20)
        a= F.relu(self.bn7(self.conv7(a)))          # (B, 128, 1, 10)
        a= self.dropout(a)
        a= F.relu(self.bn8(self.conv8(a)))          # (B, 128, 1, 5)
        a= self.pool_4(a)                           # (B, 128, 1, 2)
        a= a.view(a.size(0),-1)                     # (B, 256)
        # Regularise the features, not the logits: dropout placed after the
        # dense layer would randomly zero class scores during training.
        a= self.dropout(a)
        a= self.dense(a)
        return a

In [None]:
# Train on the GPU when CUDA is available (faster training), otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Instantiate the network on the selected device and pass it, together with a dummy batch
# of 64 clips of 80000 samples, to draw_graph.
# NOTE(review): draw_graph is not imported in any visible cell - presumably torchview's; confirm.
model = Net(num_classes=50).to(device)
input_data = torch.randn(64, 1, 80000)
model_graph = draw_graph(model,input_data, roll=True)
# Last expression of the cell, so the notebook displays it.
model_graph.visual_graph

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one epoch and return the mean batch loss.

    Args:
        model: network to train; it is switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer that updates the parameters of ``model``.
        epoch: epoch number, used only in the printed header.

    Returns:
        The cross-entropy loss averaged over the batches of this epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batch.
    """
    model.train()
    # The criterion holds no per-batch state, so build it once per epoch.
    loss_criteria = nn.CrossEntropyLoss()
    loss_training = 0.0
    num_batches = 0
    print("------------------------------- Epoch:", epoch,"-------------------------------")
    for data, target in train_loader:
        # Use the CPU or GPU as appropriate
        data, target = data.to(device), target.to(device)
        # Reset the gradients accumulated by the previous batch
        optimizer.zero_grad()
        # Push the data forward through the model layers
        output = model(data)
        loss = loss_criteria(output, target)
        # Keep a running total
        loss_training += loss.item()
        # Backpropagate. Each batch builds a fresh graph and backward runs once,
        # so the graph does not need to be retained.
        loss.backward()
        optimizer.step()
        num_batches += 1
    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")
    # return average loss for the epoch
    avg_loss = loss_training / num_batches
    print('Training set: Average loss: {:.6f}'.format(avg_loss))
    return avg_loss

In [None]:
def test(model, device, test_loader):
    model.eval()
    loss_testing = 0
    true= 0
    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)
            # adding the predicted classes for the actual batch
            output = model(data)
            # Calculate the loss for the actual batch
            loss_criteria=nn.CrossEntropyLoss()
            loss_testing += loss_criteria(output, target).item()
            # Calculate the accuracy for this batch
            _, predicted = torch.max(output.data, 1)
            true += torch.sum(target==predicted).item()
    # Calculate the average loss and total accuracy for this epoch
    avg_loss =loss_testing / batch_count
    print('Validation set: Average loss: {:.6f}, Accuracy: {}/{} ({:.0f}%)\n'.format(avg_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
    return avg_loss

In [None]:
def training_model(model, epochs=20, lr=3e-4):
    """Train and validate ``model`` for several epochs with the Adam optimizer.

    Uses the notebook-level ``device``, ``train_loader`` and ``test_loader``
    and calls ``train`` then ``test`` once per epoch.

    Args:
        model: network to train.
        epochs: number of epochs to run (default 20).
        lr: Adam learning rate (default 3e-4).

    Returns:
        A dict with the keys ``'epoch'``, ``'train_loss'`` and
        ``'valid_loss'``, each holding one value per epoch, so the loss
        curves can be inspected or plotted afterwards.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch_list = []
    train_loss_list = []
    valid_loss_list = []
    print('Training on', device)
    for epoch in tqdm(range(1, epochs + 1)):
        train_loss = train(model, device, train_loader, optimizer, epoch)
        test_loss = test(model, device, test_loader)
        epoch_list.append(epoch)
        train_loss_list.append(train_loss)
        valid_loss_list.append(test_loss)
    # Hand the collected curves back instead of discarding them.
    return {
        'epoch': epoch_list,
        'train_loss': train_loss_list,
        'valid_loss': valid_loss_list,
    }

In [None]:
# torchaudio is used by DataGenerator.__getitem__, which is first called when the
# loaders are iterated during training, so it has to be imported before this call.
import torchaudio
training_model(model)