In [94]:
import os

import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from keras.callbacks import Callback
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.models import load_model

In [95]:
# Root folder of the UrbanSound8K download (holds the fold<N> sub-folders and
# the metadata CSV). Set the URBANSOUND8K_DIR environment variable to point at
# another location; the original local path is kept as the fallback.
dataset_dir = os.environ.get("URBANSOUND8K_DIR", r"C:/Users/dipesh/Desktop/Urban8K_Dataset")
# One row per audio clip; later cells read slice_file_name, fold and class.
df = pd.read_csv(dataset_dir + r"/UrbanSound8K.csv")

In [96]:
# Number of MFCC coefficients requested per frame. The recorded run of this
# notebook produced 128-dimensional feature vectors and the network further
# down is built for 128 inputs, so the constant states that size directly.
# NOTE(review): the previous value of 150 appears to have been capped at 128
# by librosa's default number of mel bands -- confirm for the librosa version in use.
n_mfcc = 128


def convert_audio2MFCC(audio_file):
    """Load an audio file and return one fixed-length MFCC feature vector.

    Parameters
    ----------
    audio_file : str or path-like
        File name/path handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        1-D array holding each MFCC coefficient averaged over all time
        frames, so clips of different duration give same-length vectors.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    samples, sample_rate = librosa.load(audio_file, sr=None)
    # The signal is passed by keyword: recent librosa releases no longer
    # accept the audio as a positional argument.
    mfcc = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=n_mfcc)
    # Average over the time axis (this is a mean, not a scaling step): the
    # 2-D (coefficients x frames) matrix collapses to one value per coefficient.
    mean_mfcc = np.mean(mfcc.T, axis=0)
    return mean_mfcc

## Dataset Preprocessing

In [97]:
# Build one [feature_vector, class_name] record for every clip listed in the
# metadata table; each clip lives at <dataset_dir>/fold<fold>/<slice_file_name>.
processed_data = [
    [convert_audio2MFCC(f"{dataset_dir}/fold{fold}/{audio_filename}"), label]
    for audio_filename, fold, label in df[["slice_file_name", "fold", "class"]].values
]

# NOTE(review): this rebinds `df` from the metadata table to the feature
# table, so the cell cannot be re-run without reloading the CSV first.
df = pd.DataFrame(processed_data, columns=["mfcc", "label"])
df



Unnamed: 0,mfcc,label
0,"[-275.1182, 119.94202, -98.53286, -66.60548, -...",dog_bark
1,"[-500.9422, 185.2791, -86.73012, 50.013317, 9....",children_playing
2,"[-531.3129, 187.20842, -70.5887, 40.57582, 9.0...",children_playing
3,"[-476.68835, 160.30865, -62.98635, 50.75118, -...",children_playing
4,"[-521.33887, 185.59746, -82.17255, 46.63329, 1...",children_playing
...,...,...
8727,"[-466.52383, 193.2972, -63.976425, 30.498638, ...",car_horn
8728,"[-396.62436, 134.84552, -64.95755, 42.356236, ...",car_horn
8729,"[-362.73123, 165.22363, -60.909637, 31.122898,...",car_horn
8730,"[-404.53458, 178.59471, -63.71061, 26.02565, -...",car_horn


In [98]:
# Stack the per-clip feature vectors into one array and pull the class names
# out as a parallel array.
x = np.array(list(df["mfcc"]))
y = np.array(list(df["label"]))

# Peek at the first five samples and their labels.
print(x[:5])
print(y[:5])

[[-2.75118195e+02  1.19942017e+02 -9.85328598e+01 -6.66054764e+01
  -4.24864197e+01  4.02130365e-01 -2.83528080e+01 -5.50975752e+00
   1.02289715e+01  4.99342728e+00  1.57092762e+01  2.17791080e-01
  -2.47162747e+00 -1.30902028e+00 -9.59634876e+00  6.02451824e-02
   3.03321600e+00  1.27535594e+00 -5.54501247e+00 -1.04474821e+01
  -6.12301683e+00 -1.19562089e+00  2.84950912e-01  4.16270590e+00
  -6.06368303e+00 -9.49120235e+00  2.13097596e+00  4.01335049e+00
  -6.14730930e+00 -2.46767020e+00 -1.21867812e+00 -3.46005964e+00
   1.99648750e+00  1.48226869e+00 -4.42133379e+00 -3.87177658e+00
   2.60200232e-01  6.49950361e+00  1.66028380e+00  1.56434786e+00
  -7.22784996e-01 -4.07190228e+00 -4.24067545e+00  2.29234171e+00
  -5.17653897e-02 -2.23162580e+00  7.56968439e-01  2.10232571e-01
   1.51369786e+00  2.06801009e+00 -1.00280285e+00 -2.60164309e+00
  -4.37110806e+00 -2.71826386e+00  2.98152775e-01 -3.36448145e+00
  -1.10502601e+00  4.84512001e-01 -2.89873433e+00  5.57059646e-01
  -5.20786

In [99]:
# One-hot encode the class names (one column per distinct label).
# Encoding from df["label"] instead of from `y` itself keeps this cell
# re-runnable: feeding the already-encoded 2-D `y` back into get_dummies fails.
# dtype="uint8" pins the 0/1 integer output regardless of the pandas default
# (newer pandas releases return booleans).
y = pd.get_dummies(df["label"], dtype="uint8").values

print(x[:5])
print(y[:5])

[[-2.75118195e+02  1.19942017e+02 -9.85328598e+01 -6.66054764e+01
  -4.24864197e+01  4.02130365e-01 -2.83528080e+01 -5.50975752e+00
   1.02289715e+01  4.99342728e+00  1.57092762e+01  2.17791080e-01
  -2.47162747e+00 -1.30902028e+00 -9.59634876e+00  6.02451824e-02
   3.03321600e+00  1.27535594e+00 -5.54501247e+00 -1.04474821e+01
  -6.12301683e+00 -1.19562089e+00  2.84950912e-01  4.16270590e+00
  -6.06368303e+00 -9.49120235e+00  2.13097596e+00  4.01335049e+00
  -6.14730930e+00 -2.46767020e+00 -1.21867812e+00 -3.46005964e+00
   1.99648750e+00  1.48226869e+00 -4.42133379e+00 -3.87177658e+00
   2.60200232e-01  6.49950361e+00  1.66028380e+00  1.56434786e+00
  -7.22784996e-01 -4.07190228e+00 -4.24067545e+00  2.29234171e+00
  -5.17653897e-02 -2.23162580e+00  7.56968439e-01  2.10232571e-01
   1.51369786e+00  2.06801009e+00 -1.00280285e+00 -2.60164309e+00
  -4.37110806e+00 -2.71826386e+00  2.98152775e-01 -3.36448145e+00
  -1.10502601e+00  4.84512001e-01 -2.89873433e+00  5.57059646e-01
  -5.20786

In [100]:
# Splitting train & test data: 20% of the samples are held out for testing,
# and random_state=0 fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [101]:
# Inspect the training feature matrix.
print(x_train)

[[-1.8372130e+02  1.5373251e+02 -3.6251728e+01 ... -7.4696875e-01
  -2.6941073e-01  3.8526110e-02]
 [-8.4183342e+01  1.6663361e+02 -6.1381046e+01 ...  1.1887540e+00
  -7.3997951e-01  5.5762202e-01]
 [-6.7854347e+01  2.6620264e+01 -4.8031250e+01 ...  1.2433501e-01
  -3.1558812e-01  4.1459572e-01]
 ...
 [-4.7492459e+02  9.3541924e+01  3.2320999e+01 ... -2.7098218e-02
  -8.3049417e-02  3.0473519e-02]
 [-2.0096819e+02  1.7301343e+02 -3.5087139e+01 ...  3.0276269e-01
   3.6080414e-01 -7.1936034e-02]
 [-4.8299670e+02  2.3092860e+02  2.8287045e+01 ... -2.8199388e-03
   4.3058264e-01 -2.1739511e-01]]


In [102]:
# Inspect the one-hot encoded training labels.
print(y_train)

[[0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]


In [103]:
# Length of a single feature vector; the model's input layer must match it.
x_train[0].shape

(128,)

In [104]:
# Length of a single one-hot label; the model's output layer must match it.
y_train[0].shape

(10,)

In [129]:
# Defining the model: a small fully connected classifier over the MFCC vectors.
# The sizes at both ends are read from the training arrays instead of being
# hard-coded, so the network stays in sync with the feature length and the
# number of classes.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(100, activation="relu", input_shape=(n_features,)))
# Dropout(0.2) randomly zeroes 20% of the activations during training.
model.add(Dropout(0.2))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(25, activation="relu"))
# Softmax output: one probability per class.
model.add(Dense(n_classes, activation="softmax"))

model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 100)               12900     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_13 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_32 (Dense)             (None, 10)                260       
Total params: 19,485
Trainable params: 19,485
Non-trainable params: 0
__________________________________________________

In [130]:
# Multi-class setup: cross-entropy loss on the one-hot targets, Adam
# optimiser, accuracy reported during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [131]:
# Custom Keras callback to stop training when certain accuracy is achieved.
class MyThresholdCallback(Callback):
    """Stop training once validation accuracy reaches ``threshold``.

    Parameters
    ----------
    threshold : float
        Validation accuracy at or above which training is halted.
    """

    def __init__(self, threshold):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's validation accuracy and request a stop if reached."""
        # `logs` defaults to None, so never index it directly.
        logs = logs or {}
        # Fall back to the "val_acc" key in case the metric is reported
        # under that name; if neither key exists there is nothing to compare.
        val_acc = logs.get("val_accuracy", logs.get("val_acc"))
        if val_acc is None:
            return
        if val_acc >= self.threshold:
            self.model.stop_training = True

# Train for at most 100 epochs in batches of 100; the callback ends training
# early once validation accuracy reaches 0.9.
# NOTE(review): the held-out test split doubles as validation data here, so
# the stopping point is selected on the test set.
model.fit(
    x_train,
    y_train,
    batch_size=100,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=[MyThresholdCallback(0.9)],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1b2ce004d30>

In [132]:
# Run the trained model on the test features; each row holds the softmax
# output for one sample.
predicted = model.predict(x_test)
predicted

array([[1.4914738e-07, 8.1345860e-08, 7.1623106e-07, ..., 6.4138618e-08,
        1.8625997e-08, 4.3110267e-06],
       [3.5391531e-10, 5.0472177e-04, 9.1268843e-01, ..., 1.2126768e-08,
        6.0810788e-07, 7.8506790e-02],
       [1.3674135e-10, 5.6741887e-09, 3.6900962e-09, ..., 7.9900055e-04,
        4.8105438e-13, 3.5319277e-03],
       ...,
       [1.0548442e-10, 1.0000000e+00, 2.9473609e-14, ..., 5.0913701e-10,
        2.4936043e-14, 1.8742694e-09],
       [5.0902498e-05, 1.1740776e-03, 9.4061661e-01, ..., 3.6718662e-05,
        3.1369932e-03, 1.8868426e-02],
       [3.9542846e-02, 1.4571701e-02, 5.4135340e-01, ..., 4.7914345e-05,
        2.0519791e-02, 4.7284678e-02]], dtype=float32)

In [133]:
# Takes audio filename/path and returns predicted label
def predict_audio_class(filename):
    """Classify a single audio file with the trained ``model``.

    The file is converted to its MFCC vector, passed through the model as a
    batch of one, and the name of the highest-scoring class is printed and
    returned.
    """
    # Class names indexed by output unit.
    class_names = (
        'air_conditioner',
        'car_horn',
        'children_playing',
        'dog_bark',
        'drilling',
        'engine_idling',
        'gun_shot',
        'jackhammer',
        'siren',
        'street_music',
    )

    # reshape(1, -1) turns the 1-D feature vector into a single-row batch.
    features = convert_audio2MFCC(filename).reshape(1, -1)
    scores = model.predict(features)
    label = class_names[np.argmax(scores)]
    print(label)
    return label


In [134]:
# Example: classify one local recording (path is specific to the author's machine).
audio_file = r"C:\Users\dipesh\Desktop\horn.wav"
predict_audio_class(audio_file)

car_horn


'car_horn'