## **Data Preprocessing**

In [1]:
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D,Flatten, Dropout,BatchNormalization
from keras.callbacks import Callback
from keras.models import load_model

# Sanity check: this line is only reached when every import above succeeded.
print("All Dependencies Installed")

All Dependencies Installed


In [2]:
dataset_dir = r"/content/drive/MyDrive/Urban8K_Dataset"
df = pd.read_csv(dataset_dir + r"/UrbanSound8K.csv")

# Work on a random subset only, to avoid running out of memory further down.
num_samples = 1500
# A fixed seed draws the same clips on every Restart & Run All, so downstream
# results (largest spectrogram shape, train/test split) are reproducible.
df = df.sample(n=num_samples, random_state=0)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 4808 to 7024
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   slice_file_name  1500 non-null   object 
 1   fsID             1500 non-null   int64  
 2   start            1500 non-null   float64
 3   end              1500 non-null   float64
 4   salience         1500 non-null   int64  
 5   fold             1500 non-null   int64  
 6   classID          1500 non-null   int64  
 7   class            1500 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 105.5+ KB
None


In [3]:
def convert_audio2MelSpec(audio_file):
    """Load an audio file and return its mel spectrogram in decibels.

    The file is read at its native sampling rate (``sr=None``), transformed
    with an STFT, reduced to magnitudes, projected onto the mel scale and
    finally converted to dB using the smallest value as the reference.

    Parameters
    ----------
    audio_file
        Name/path of the audio file, handed to ``librosa.load``.

    Returns
    -------
    The array produced by ``librosa.amplitude_to_db`` for the mel-scaled
    magnitudes.
    """
    waveform, native_rate = librosa.load(audio_file, sr=None)
    # Only the magnitude part of the STFT is needed; the phase is discarded.
    magnitude, _phase = librosa.magphase(librosa.stft(waveform))
    mel_magnitude = librosa.feature.melspectrogram(S=magnitude, sr=native_rate)
    return librosa.amplitude_to_db(mel_magnitude, ref=np.min)

def apply_padding(an_array, desired_shape):
    """Zero-pad a 2-D array on the bottom/right up to ``desired_shape``.

    Parameters
    ----------
    an_array
        2-D array-like whose values end up in the top-left corner.
    desired_shape
        Shape of the returned array; must be at least as large as
        ``an_array`` along both axes.

    Returns
    -------
    numpy.ndarray
        New array of ``desired_shape`` (created by ``np.zeros``) holding
        ``an_array`` in its top-left corner and zeros everywhere else.
    """
    dims = np.shape(an_array)
    canvas = np.zeros(desired_shape)
    # Copy the input into the top-left corner; the remainder stays zero.
    canvas[slice(None, dims[0]), slice(None, dims[1])] = an_array
    return canvas

In [4]:

# Find the largest mel-spectrogram shape among the sampled clips.
# Every clip is converted to a mel spectrogram, so the number of rows is the
# same for all of them (128); only the number of columns has to be compared.

def shape_generator():
    """Yield the mel-spectrogram shape of every clip listed in ``df``.

    Each yielded item is the spectrogram's ``shape`` converted to a list.
    """
    wanted = ["slice_file_name", "fold", "class"]
    for file_name, fold_id, _label in df[wanted].values:
        # Clips are stored per fold: <dataset_dir>/fold<N>/<file name>
        clip_path = "/".join([dataset_dir, "fold" + str(fold_id), file_name])
        yield list(convert_audio2MelSpec(clip_path).shape)


# Only the column count has to be tracked; the row count is fixed at 128.
max_cols = 0
i = 0
gen = shape_generator()
for i, melshape in enumerate(gen, start=1):
    max_cols = max(max_cols, melshape[1])

    # Report progress every 200 processed clips.
    if i % 200 == 0:
        percent = (i / num_samples) * 100
        print("{} % Task Completed...".format(round(percent, 2)))

largest_shape = (128, max_cols)
print("Largest Shape : ", largest_shape)

  "Empty filters detected in mel frequency basis. "


13.33 % Task Completed...
26.67 % Task Completed...
40.0 % Task Completed...
53.33 % Task Completed...
66.67 % Task Completed...
80.0 % Task Completed...
93.33 % Task Completed...
Largest Shape :  (128, 1501)


In [None]:

def data_generator():
    """Yield ``(mel_spec, label)`` for every clip listed in ``df``.

    ``mel_spec`` is the clip's mel spectrogram, zero-padded to
    ``largest_shape`` and converted to a nested Python list; ``label`` is the
    value of the clip's ``class`` column.
    """
    wanted = ["slice_file_name", "fold", "class"]
    for file_name, fold_id, class_name in df[wanted].values:
        # Clips are stored per fold: <dataset_dir>/fold<N>/<file name>
        clip_path = "/".join([dataset_dir, "fold" + str(fold_id), file_name])
        # Pad to the largest shape in the dataset so all samples line up.
        padded = apply_padding(convert_audio2MelSpec(clip_path), largest_shape)
        # Hand back a plain 2-D list rather than the ndarray itself.
        yield padded.tolist(), class_name


# Pre-allocate the feature array and write each spectrogram straight into it.
# Collecting every spectrogram as nested Python lists and converting them with
# np.array(...) at the end keeps two full copies alive at once (the list form
# being far larger than the array), which is what exhausts memory here.
# np.zeros defaults to float64, the dtype np.array produced before.
x = np.zeros((len(df), *largest_shape))
y = []

i = 0
gen = data_generator()
for i, (melspec, label) in enumerate(gen, start=1):
    # Row i-1 receives the i-th sample; the nested list is converted on assignment.
    x[i - 1] = melspec
    y.append(label)

    # Report progress every 200 processed clips.
    if i % 200 == 0:
        percent = (i / num_samples) * 100
        print("{} % Task Completed...".format(round(percent, 2)))

y = np.array(y)

print(x[:3])
print(y[:3])

  "Empty filters detected in mel frequency basis. "


13.33 % Task Completed...
26.67 % Task Completed...
40.0 % Task Completed...
53.33 % Task Completed...
66.67 % Task Completed...
80.0 % Task Completed...
93.33 % Task Completed...


In [None]:
# Kept so that any later cell reading `y_df` keeps working.
y_df = pd.DataFrame(data=y)

# One-hot encode the labels exactly once. Encoding the 1-D label array
# directly yields columns named after the classes themselves; encoding the
# single-column DataFrame above would prefix every name with the column
# name ("0_..."). The explicit dtype keeps the encoded array numeric
# regardless of the installed pandas version's default.
one_hot = pd.get_dummies(y, dtype="uint8")

# contains all string class labels, in the same order as the one-hot columns
labels = list(one_hot.columns)

# one-hot-encoded labels
y = one_hot.values

print(labels)
print(y[:3])

np.save("X_Urban8K",x)
np.save("Y_Urban8K",y)

In [None]:
"""
Conv2D expects 4-dimensional input because it also needs the channels dimension of an image.
A mel spectrogram, like a grayscale image, has a single channel, so that dimension is missing from our data and must have size 1.
Reshaping the data to add the channels dimension explicitly resolves the issue.
The input shape a CNN accepts should be in a specific format.
In Tensorflow,the format is (num_samples, height, width, channels)
"""
print("Before Reshaping : ",x.shape)
# Append a channels axis of size 1. The dimensions are taken from `x` itself,
# so this cell no longer rebinds the shared `largest_shape` (previously turned
# from a tuple into a list here) and can be re-run safely: reshaping an
# already 4-D `x` yields the same shape again.
x = x.reshape(x.shape[0], x.shape[1], x.shape[2], 1)
print("After Reshaping",x.shape)

# Splitting data into train & test (80/20, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

## **Model Training**

In [None]:
# Shape of a single sample: (height, width, channels).
input_shape = (x.shape[1],x.shape[2],x.shape[3])
# One output unit per one-hot column of the labels.
num_classes = len(y_train[0])

# Defining the model: three Conv2D + MaxPooling2D stages followed by a
# dense classifier head with dropout.
model = Sequential()
model.add(Conv2D(32,kernel_size=(2,2),padding="same",activation="relu",input_shape=input_shape))
model.add(MaxPooling2D())
model.add(Conv2D(64,kernel_size=(2,2),padding="same",activation="relu"))
model.add(MaxPooling2D())
model.add(Conv2D(128,kernel_size=(2,2),padding="same",activation="relu"))
model.add(MaxPooling2D())
model.add(Flatten())
model.add(Dense(150,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(75,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(num_classes,activation="softmax"))

print("Y_train length :",num_classes)

model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

In [None]:
# Custom Keras callback to stop training when certain accuracy is achieved.
class MyThresholdCallback(Callback):
    """Stop training once validation accuracy reaches a threshold.

    Parameters
    ----------
    threshold
        Value of ``val_accuracy`` at or above which training is stopped.
    """

    def __init__(self, threshold):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's ``val_accuracy`` meets the threshold.

        Parameters
        ----------
        epoch
            Index of the epoch that just finished (unused).
        logs
            Metrics reported for the epoch; may be ``None``.
        """
        import warnings

        # The signature allows logs=None, so never subscript it directly.
        logs = logs or {}
        val_acc = logs.get("val_accuracy")
        if val_acc is None:
            # Without validation data (or an accuracy metric) the value is
            # absent; warn instead of aborting training with a KeyError.
            warnings.warn(
                "MyThresholdCallback: 'val_accuracy' is not available in logs; "
                "skipping threshold check.",
                RuntimeWarning,
            )
            return
        if val_acc >= self.threshold:
            self.model.stop_training = True


# Train for at most 50 epochs; the callback ends the run early once the
# accuracy on the validation data (the held-out split) reaches 0.9.
model.fit(
    x_train,
    y_train,
    epochs=50,
    callbacks=[MyThresholdCallback(0.9)],
    validation_data=(x_test, y_test),
)

In [None]:
# Persist the trained model under the name "Audio_Classification_CNN".
model.save(filepath="Audio_Classification_CNN")