# load the zip file and unzip and before check the GPU

In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
# import dependencies
import requests, zipfile, io
from glob import glob
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import numpy as np
from tqdm import tqdm
import cv2
import pandas as pd
seed = 7
import pandas as pd
np.random.seed(seed)
import os

In [0]:
zip_file_url = 'https://github.com/karoldvl/ESC-50/archive/master.zip' # link: ESC-50 Datset

In [0]:
if not os.path.exists('sound'):
    os.makedirs('sound')

In [0]:
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall('sound/')
z.close()

In [27]:
glob('sound/ESC-50-master/audio/*')

['sound/ESC-50-master/audio/3-197435-B-22.wav',
 'sound/ESC-50-master/audio/1-59324-A-21.wav',
 'sound/ESC-50-master/audio/1-60997-A-20.wav',
 'sound/ESC-50-master/audio/1-47714-A-16.wav',
 'sound/ESC-50-master/audio/2-95035-A-1.wav',
 'sound/ESC-50-master/audio/5-243783-A-44.wav',
 'sound/ESC-50-master/audio/5-250026-B-30.wav',
 'sound/ESC-50-master/audio/2-78651-A-44.wav',
 'sound/ESC-50-master/audio/4-185575-A-20.wav',
 'sound/ESC-50-master/audio/5-260011-A-34.wav',
 'sound/ESC-50-master/audio/3-124958-A-28.wav',
 'sound/ESC-50-master/audio/1-72229-B-6.wav',
 'sound/ESC-50-master/audio/5-201664-A-18.wav',
 'sound/ESC-50-master/audio/2-39443-A-19.wav',
 'sound/ESC-50-master/audio/2-104105-B-19.wav',
 'sound/ESC-50-master/audio/3-160993-A-3.wav',
 'sound/ESC-50-master/audio/1-32373-B-35.wav',
 'sound/ESC-50-master/audio/2-43806-A-42.wav',
 'sound/ESC-50-master/audio/2-99955-A-7.wav',
 'sound/ESC-50-master/audio/4-218199-H-35.wav',
 'sound/ESC-50-master/audio/1-91359-B-11.wav',
 'sound

# Define a function to covert the image based on calculate log scaled mel-spectrograms and their corresponding deltas from a sound clip.

Regarding fixed size input, we will divide each sound clip into segments of 60x41 (60 rows and 41 columns). The mel-spec and their deltas will become two channels, which we will be fed into CNN

In [8]:
!pip install librosa
import librosa

Collecting librosa
[?25l  Downloading https://files.pythonhosted.org/packages/09/b4/5b411f19de48f8fc1a0ff615555aa9124952e4156e94d4803377e50cfa4c/librosa-0.6.2.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 6.1MB/s 
[?25hCollecting audioread>=2.0.0 (from librosa)
  Downloading https://files.pythonhosted.org/packages/f0/41/8cd160c6b2046b997d571a744a7f398f39e954a62dd747b2aae1ad7f07d4/audioread-2.1.6.tar.gz
Collecting resampy>=0.2.0 (from librosa)
[?25l  Downloading https://files.pythonhosted.org/packages/14/b6/66a06d85474190b50aee1a6c09cdc95bb405ac47338b27e9b21409da1760/resampy-0.2.1.tar.gz (322kB)
[K    100% |████████████████████████████████| 327kB 9.3MB/s 
[?25hCollecting numba>=0.38.0 (from librosa)
[?25l  Downloading https://files.pythonhosted.org/packages/24/89/6f1755892d60ddd528090dc313349e7cc491170d6737f6b3a7a5b317ef81/numba-0.39.0-cp36-cp36m-manylinux1_x86_64.whl (1.9MB)
[K    100% |████████████████████████████████| 1.9MB 10.2MB/s 
[?25hCollecting llvm

In [0]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)

def extract_features(bands = 60, frames = 41):
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for fn in tqdm(glob('sound/ESC-50-master/audio/*')):
        sound_clip,s = librosa.load(fn) # 5sec
        sound_clip   = np.concatenate((sound_clip,sound_clip),axis=None) # make it 10s
        label = fn.split("/")[-1].split("-")[-1].split(".")[0]
        for (start,end) in windows(sound_clip,window_size):
            if(len(sound_clip[start:end]) == window_size):
                signal = sound_clip[start:end]
                melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
                logspec = librosa.core.amplitude_to_db(melspec)
                logspec = logspec.T.flatten()[:, np.newaxis].T
                log_specgrams.append(logspec)
                labels.append(label)
            
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return np.array(features), np.array(labels,dtype = np.int)

In [10]:
features,labels = extract_features()

100%|██████████| 2000/2000 [15:37<00:00,  2.13it/s]


In [11]:
# label category names
df = pd.read_csv(glob('sound/ESC-50-master/meta/esc50.csv')[0])
df = df[['target','category']]
df = df.drop_duplicates().reset_index(drop=True)
df = df.sort_values(by=['target']).reset_index(drop=True)
df.head()

Unnamed: 0,target,category
0,0,dog
1,1,rooster
2,2,pig
3,3,cow
4,4,frog


In [12]:
my_dict = {}
for i in range(len(df)):
  my_dict[df['target'][i]] = df['category'][i]
my_dict

{0: 'dog',
 1: 'rooster',
 2: 'pig',
 3: 'cow',
 4: 'frog',
 5: 'cat',
 6: 'hen',
 7: 'insects',
 8: 'sheep',
 9: 'crow',
 10: 'rain',
 11: 'sea_waves',
 12: 'crackling_fire',
 13: 'crickets',
 14: 'chirping_birds',
 15: 'water_drops',
 16: 'wind',
 17: 'pouring_water',
 18: 'toilet_flush',
 19: 'thunderstorm',
 20: 'crying_baby',
 21: 'sneezing',
 22: 'clapping',
 23: 'breathing',
 24: 'coughing',
 25: 'footsteps',
 26: 'laughing',
 27: 'brushing_teeth',
 28: 'snoring',
 29: 'drinking_sipping',
 30: 'door_wood_knock',
 31: 'mouse_click',
 32: 'keyboard_typing',
 33: 'door_wood_creaks',
 34: 'can_opening',
 35: 'washing_machine',
 36: 'vacuum_cleaner',
 37: 'clock_alarm',
 38: 'clock_tick',
 39: 'glass_breaking',
 40: 'helicopter',
 41: 'chainsaw',
 42: 'siren',
 43: 'car_horn',
 44: 'engine',
 45: 'train',
 46: 'church_bells',
 47: 'airplane',
 48: 'fireworks',
 49: 'hand_saw'}

In [13]:
seed = 4
rng = np.random.RandomState(seed)
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
onehot_labels = to_categorical(labels,num_classes=50)

In [0]:
# Create train test Dataset

rnd_indices = np.random.rand(len(labels)) < 0.70

X_train = features[rnd_indices]
y_train = onehot_labels[rnd_indices]
X_test  = features[~rnd_indices]
y_test  = onehot_labels[~rnd_indices]

In [16]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, 

((27998, 60, 41, 2), (27998, 50), (12002, 60, 41, 2), (12002, 50))

# CNN Model

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten,InputLayer
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.constraints import maxnorm
from keras.callbacks import ModelCheckpoint

In [0]:
def basemodel():
  model = Sequential()
  model.add(Conv2D(32, (3, 3), input_shape=(60,41,2), activation='relu', padding='same'))
  model.add(Dropout(0.2))
  model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
  model.add(MaxPooling2D(pool_size=(2, 2)))
  model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
  model.add(Dropout(0.2))
  model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
  model.add(MaxPooling2D(pool_size=(2, 2)))
  model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
  model.add(Dropout(0.2))
  model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
  model.add(MaxPooling2D(pool_size=(2, 2)))
  model.add(Flatten())
  model.add(Dropout(0.2))
  model.add(Dense(1024, activation='relu', kernel_constraint=maxnorm(3)))
  model.add(Dropout(0.2))
  model.add(Dense(512, activation='relu', kernel_constraint=maxnorm(3)))
  model.add(Dropout(0.2))
  model.add(Dense(50, activation='softmax'))
  # Compile model
  epochs = 25
  lrate = 0.01
  decay = lrate/epochs
  sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
  model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
  return model


In [0]:
if not os.path.exists('model'):
    os.makedirs('model')
    
filepath="model/weights_0.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [20]:
model = basemodel()
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 60, 41, 32)        608       
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 41, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 60, 41, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 20, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 30, 20, 64)        18496     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 20, 64)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 30, 20, 64)        36928     
__________

# Training with Data Augmentation

One of the major reasons for overfitting is that we don’t have enough data to train our network. Apart from regularization, another very effective way to counter Overfitting is Data Augmentation. It is the process of artificially creating more images from the images you already have by changing the size, orientation etc of the image. It can be a tedious task but fortunately, this can be done in Keras using the ImageDataGenerator instance.

In [0]:
from keras.preprocessing.image import ImageDataGenerator

In [0]:
datagen = ImageDataGenerator(
              width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
              height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
              horizontal_flip=True,  # randomly flip images
              vertical_flip=False  # randomly flip images
          )

In [0]:
# init the batch size and epochs

'''
Note: Due to Memory Error like Buffered data was truncated after reaching the output size limit. What i did is that Save the model in for example 60th epoch and close current program and run new program and restore saved model and train model from 61 epoch to 120 epoch and 
save that and close program and repeat this work for your interested epoch For this [100,50] three times repeat 

'''
batch_size = 50
epochs = 100

In [41]:
# fit the model
history = model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                              steps_per_epoch=int(np.ceil(X_train.shape[0] / float(batch_size))),
                              epochs=epochs,
                              validation_data=(X_test, y_test),
                              verbose=1,callbacks=callbacks_list)
 

  ' (' + str(self.x.shape[channels_axis]) + ' channels).')


Epoch 1/100

Epoch 00001: val_acc did not improve from 0.71438
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.71438
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.71438
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.71438
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.71438
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.71438
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.71438
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.71438
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.71438
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.71438
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.71438
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.71438
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.71438
Epoch 14/100

Epoch 00014: val_acc did not improve from 0.71438
Epoch 15/100

Epoch 00015: val_acc improved from 0.71438 to 0.71596, saving model to model/weights_0.best.hdf5
Ep

Note: Due to Memory Error like Buffered data was truncated after reaching the output size limit.
What i did is that Save the model in for example 60th epoch and close current program and run new program and restore saved model and train model from 61 epoch to 120 epoch and save that and close program and repeat this work for your interested epoch 
For this [100,50] three times repeat 


In [43]:
# evaluate model
model.evaluate(X_test, y_test)



[1.1082309953889495, 0.7230461589735044]

# Classification Report and Confusion Matrix

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
y_pred = model.predict_classes(X_test)

In [0]:
target_name = np.array(df['category'])

In [53]:
print(classification_report(np.argmax(y_test,axis=1),y_pred,target_names=target_name))

                  precision    recall  f1-score   support

             dog       0.80      0.77      0.79       231
         rooster       0.70      0.45      0.55       258
             pig       0.78      0.70      0.74       262
             cow       0.81      0.75      0.78       260
            frog       0.91      0.90      0.91       222
             cat       0.63      0.55      0.58       231
             hen       0.84      0.62      0.71       251
         insects       0.85      0.75      0.79       244
           sheep       0.91      0.63      0.75       270
            crow       0.83      0.78      0.80       251
            rain       0.66      0.99      0.79       233
       sea_waves       0.64      0.75      0.69       241
  crackling_fire       0.88      0.81      0.84       248
        crickets       0.91      0.88      0.90       219
  chirping_birds       0.91      0.60      0.73       266
     water_drops       0.80      0.51      0.62       266
            w

In [54]:
print(confusion_matrix(np.argmax(y_test,axis=1),y_pred))

[[178   3   1 ...   1   0   0]
 [  6 115   0 ...   0   2   0]
 [  2   1 184 ...   3   2   1]
 ...
 [  0   0   0 ... 172   0   0]
 [  0   1   0 ...   1 209   0]
 [  0   0   3 ...   0   0 216]]


# Test with Real time Data of 10s Sound Clip to the Model

In [0]:
class convertSound2image:
  
  def __init__(self,sourcePath):
    '''
    Insert the source path of sound 10s 
    '''
    self.sourcePath = sourcePath
  
  def windows(self,data, window_size):
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)

  def extract_features(self,bands = 60, frames = 41):
    window_size = 512 * (frames - 1)
    log_specgrams = []
    sound_clip,s = librosa.load(self.sourcePath) # 10sec
    for (start,end) in self.windows(sound_clip,window_size):
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
            logspec = librosa.core.amplitude_to_db(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            log_specgrams.append(logspec)

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    return np.array(features)

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import pandas as pd
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [62]:
listed = drive.ListFile({'q': "title contains 'soxrecording.wav' and '1tr6IWd1WvQDQLC3KtuH_qPEf_vnuIauN' in parents"}).GetList()
for file in listed:
  print('title {}, id {}'.format(file['title'], file['id']))

title soxrecording.wav, id 1sgE21hl92EvJTx6yGAo0oAdqYoGfqpRV


In [0]:
download_path = os.path.expanduser('~/sample')
try:
  os.makedirs(download_path)
except OSError:
  pass

In [0]:
sample = os.path.join(download_path, 'soxrecording.wav')

In [71]:
sample

'/root/sample/soxrecording.wav'

In [0]:
temp_file1 = drive.CreateFile({'id': '1sgE21hl92EvJTx6yGAo0oAdqYoGfqpRV'})
temp_file1.GetContentFile(sample)

In [0]:
sound_clip,s = librosa.load(sample)

# No Backends Error of librosa

In [76]:
!apt install libav-tools

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  dbus ffmpeg i965-va-driver libaacs0 libapparmor1 libasound2 libasound2-data
  libass9 libasyncns0 libavc1394-0 libavcodec57 libavdevice57 libavfilter6
  libavformat57 libavresample3 libavutil55 libbdplus0 libbluray2
  libboost-filesystem1.62.0 libboost-system1.62.0 libbs2b0 libcaca0 libcairo2
  libcapnp-0.5.3 libcdio-cdda1 libcdio-paranoia1 libcdio13 libchromaprint1
  libcrystalhd3 libdbus-1-3 libdc1394-22 libdrm-amdgpu1 libdrm-common
  libdrm-intel1 libdrm-nouveau2 libdrm-radeon1 libdrm2 libegl1-mesa libelf1
  libfftw3-double3 libflac8 libflite1 libfribidi0 libgbm1 libgl1-mesa-dri
  libgl1-mesa-glx libglapi-mesa libgme0 libgraphite2-3 libgsm1 libharfbuzz0b
  libiec61883-0 libjack-jackd2-0 libllvm5.0 libmirclient9 libmircommon7
  libmircore1 libmirprotobuf3 libmp3lame0 libmpg123-0 libnuma1 libogg0
  libopenal-data libopenal1 libopencv-co