# Classifying Google Voice Commands Using CNN

### Model Training and Evaluation 

### Load Preprocessed data 

In [29]:
# Retrieve the preprocessed data from the files you created in the last notebook 
# (Please complete the previous notebook first!!)
import os
import numpy as np
from random import shuffle

np.random.seed(1)
# SET THIS TO THE DIRECTORY WHERE YOU CREATED THE PREPROCESSED FILES IN THE LAST NOTEBOOK !
preprocessed_dir = '../Preproc'
dir_name = '../googleData'
train_path = os.path.join(preprocessed_dir, 'train')
test_path = os.path.join(preprocessed_dir, 'test')
class_names = sorted(os.listdir(train_path))

Now lets make a function for loading all our preprocessed data.

In [16]:
def load_data(data_dir):
    class_names = sorted(os.listdir(data_dir))
    nb_classes = len(class_names)
    print("class_names = ",class_names)

    for (dirpath, dirnames, filenames) in os.walk(os.path.join(data_dir, class_names[0])):
        with np.load(os.path.join(data_dir, class_names[0], filenames[0])) as sample_file:
            mel_dims = sample_file['melgram'].shape

    total_load = 0
    for classname in class_names:
        files = os.listdir(os.path.join(data_dir, classname))
        n_files = len(files)
        total_load += n_files

    X = np.zeros((total_load, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y = np.zeros((total_load, nb_classes))
    paths = []

    load_count = 0
    num_classes = len(class_names)
    label_smoothing = 0.005

    for idx, classname in enumerate(class_names):
        # Vector smoothing means that some allowance is made for classes where the the files 
        idx = class_names.index(classname)
        vec = np.zeros(num_classes)
        vec[idx] = 1
        vec = vec * (1 - label_smoothing) + label_smoothing / num_classes

        this_Y = np.array(vec)
        this_Y = this_Y[np.newaxis,:]
        file_list = os.listdir(os.path.join(data_dir, classname))
        shuffle(file_list)  # just to remove any special ordering

        for _, infilename in enumerate(file_list):   # Load files in a particular class
            audio_path = os.path.join(data_dir, classname, infilename)
            with np.load(audio_path) as data:
                melgram = data['melgram']
            if melgram.shape != mel_dims:
                raise Exception('Dimension mismatch {} vs {}'.format(melgram.shape, mel_dims))

            # usually it's the 2nd dimension of melgram.shape that is affected by audio file length
            X[load_count,:,:] = melgram[:,:,:]
            #X[load_count,:,:] = melgram
            Y[load_count,:] = this_Y
            paths.append(audio_path)
            load_count += 1
        print('Successfully processed {} files for class {}'
              .format(len(file_list), classname))



    assert (X.shape[0] == Y.shape[0] )
    # Shuffle the classes up 
    idx = np.array(range(Y.shape[0]))
    np.random.shuffle(idx)
    newX = np.copy(X)
    newY = np.copy(Y)
    for i in range(len(idx)):
        newX[i] = X[idx[i],:,:]
        newY[i] = Y[idx[i],:]

    return newX, newY

Next let's load both the training and test data.

In [17]:

X_train, Y_train = load_data(train_path)
X_test, Y_test = load_data(test_path)

class_names =  ['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']
Successfully processed 1456 files for class bed
Successfully processed 1471 files for class bird
Successfully processed 1473 files for class cat
Successfully processed 1484 files for class dog
Successfully processed 2005 files for class down
Successfully processed 1999 files for class eight
Successfully processed 2003 files for class five
Successfully processed 2016 files for class four
Successfully processed 2016 files for class go
Successfully processed 1480 files for class happy
Successfully processed 1487 files for class house
Successfully processed 2000 files for class left
Successfully processed 1484 files for class marvin
Successfully processed 2009 files for class nine
Successfully processed 2018 files for class no
Successfully proce

### Initial model architecture - CNN

#### Describe archtecture here !!

In [18]:
from keras import backend as K
from keras.models import Sequential,  load_model, save_model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Conv2D
from keras.layers.normalization import BatchNormalization

nb_layers=4
K.set_image_data_format('channels_last')                   
nb_filters = 32  # number of convolutional filters
kernel_size = (3, 3)  # convolution kernel size
pool_size = (2, 2)  # size of pooling area for max pooling
cl_dropout = 0.5    # conv. layer 
dl_dropout = 0.6    # dense layer 
X_shape = X_train.shape

print(" CNN: X_shape = ",X_shape,", channels = ",X_shape[3])
input_shape = (X_shape[1], X_shape[2], X_shape[3])
model = Sequential()
model.add(Conv2D(nb_filters, kernel_size, padding='same', input_shape=input_shape, name="Input"))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Activation('relu'))      
model.add(BatchNormalization(axis=-1)) 

for layer in range(nb_layers-1):   # add more layers than just the first
    model.add(Conv2D(nb_filters, kernel_size, padding='same'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Activation('elu'))
    model.add(Dropout(cl_dropout))

model.add(Flatten())
model.add(Dense(128))   # 128 is 'arbitrary' for now
model.add(Activation('elu'))
model.add(Dropout(dl_dropout))
model.add(Dense(len(class_names)))
model.add(Activation("softmax",name="Output"))



Using TensorFlow backend.


 CNN: X_shape =  (55001, 96, 87, 1) , channels =  1


### Compiling the model 

Next compile the model


In [19]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (Conv2D)               (None, 96, 87, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 43, 32)        0         
_________________________________________________________________
activation_1 (Activation)    (None, 48, 43, 32)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 48, 43, 32)        128       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 48, 43, 32)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 24, 21, 32)        0         
_________________________________________________________________
activation_2 (Activation)    (None, 24, 21, 32)       

#### Training the model
Here let's train the model. Since the train/test split was set by default to .85/.15; here setting the train/val split to .8/.2 results in a train/val/test split of roughly .7/.15/.15

In [20]:
from keras.callbacks import ModelCheckpoint 
from keras.models import load_model

# Display model architecture summary 
val_split = 0.2
epochs = 20 # 100 (but since I have already trained the weights file I don't need to retrain.)
# In order to retrain from scratch please delete the weights file and set the epochs to between 20 - 100
batch_size = 32

split_index = int(X_train.shape[0]*(1-val_split))
X_val_data, Y_val_data = X_train[split_index:], Y_train[split_index:]
X_train_data, Y_train_data = X_train[:split_index-1], Y_train[:split_index-1]
weights_file='weights.hdf5'

if os.path.isfile(weights_file):
    loaded_model = load_model(weights_file)   # strip any previous parallel part, to be added back in later
    model.set_weights( loaded_model.get_weights() )  
    print('Loading Weights from file {}'.format(weights_file))

checkpointer = ModelCheckpoint(filepath=weights_file, 
                               verbose=1, save_best_only=True)

model.fit(X_train_data, Y_train_data, batch_size=batch_size, epochs=epochs, shuffle=True,  callbacks=[checkpointer],
              verbose=1, validation_data=(X_val_data, Y_val_data))



Loading Weights from file weights.hdf5
Train on 43999 samples, validate on 11001 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.44947, saving model to weights.hdf5


<keras.callbacks.callbacks.History at 0x7f49d84b40f0>

### Test the model 

Here we will review the accuracy of the model on both the training and test data sets (here the training data is a compination of both the Train + Val sets). 

In [21]:
# Evaluating the model on the training and testing set
score = model.evaluate(X_train, Y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(X_test, Y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.8839839100837708
Testing Accuracy:  0.8819958567619324


Both training and test scores are quite high, suggesting the model has not overfit as there is little difference between the training and test models

### Predictions  

Lets look closer as to what the model predicts, first set the labels

In [22]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
y = np.array(class_names)

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y)) 

Make a function for creating mel spectrograms from wav files

In [23]:
import librosa 
import numpy as np 

def extract_melgram(file_name):
    signal, sr = librosa.load(file_name, mono=False, sr=44100)
    if len(signal.shape) == 1:
        signal = np.reshape(signal, (1, signal.shape[0]))
    melgram = librosa.amplitude_to_db(librosa.feature.melspectrogram(signal[0], sr=sr, n_mels=96))[np.newaxis,:,:,np.newaxis] 
    melgram = melgram.astype(np.float16)
    return  melgram


Make a function for predicting output from these wab files

In [24]:
def print_prediction(file_name):
    prediction_feature = extract_melgram(file_name) 

    predicted_vector = model.predict_classes(prediction_feature)
    predicted_class = le.inverse_transform(predicted_vector) 

    predicted_proba_vector = model.predict_proba(prediction_feature) 
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)): 
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f') )

### Data exploration 

Now lets use these functions to examine presictions from the test dataset more closely

In [32]:
filename = os.path.join(dir_name, 'no/692a88e6_nohash_3.wav')
print(filename)
print_prediction(filename) 

../googleData/no/692a88e6_nohash_3.wav
bed 		 :  0.00223922100849449634552001953125
bird 		 :  0.00001725627589621581137180328369
cat 		 :  0.01505586598068475723266601562500
dog 		 :  0.00310107436962425708770751953125
down 		 :  0.02027723938226699829101562500000
eight 		 :  0.00001779654667188879102468490601
five 		 :  0.00000670899407850811257958412170
four 		 :  0.00002002794280997477471828460693
go 		 :  0.07486108690500259399414062500000
happy 		 :  0.00003331612242618575692176818848
house 		 :  0.00132211972959339618682861328125
left 		 :  0.00196837517432868480682373046875
marvin 		 :  0.00008275819709524512290954589844
nine 		 :  0.00088627921650186181068420410156
no 		 :  0.72599422931671142578125000000000
off 		 :  0.00009011977090267464518547058105
on 		 :  0.00000164620576015295227989554405
one 		 :  0.00013527624832931905984878540039
right 		 :  0.00009317646618001163005828857422
seven 		 :  0.00000414102441936847753822803497
sheila 		 :  0.000243540678638964891433715820

In [34]:
filename = os.path.join(dir_name, 'right/3411cf4b_nohash_0.wav')
print_prediction(filename) 

bed 		 :  0.00000000005373932202012987602302
bird 		 :  0.00000000016150072945642079957906
cat 		 :  0.00000000000003426013474110665979
dog 		 :  0.00000000000286886313316525587425
down 		 :  0.00000000000001943108997001005006
eight 		 :  0.00000000119927068276126647106139
five 		 :  0.00000014667800485312909586355090
four 		 :  0.00000000000474365208572025132128
go 		 :  0.00000000000629958368456029482729
happy 		 :  0.00000000000057845476455695155060
house 		 :  0.00000000034614636157392908444308
left 		 :  0.00000006975890443072785274125636
marvin 		 :  0.00000000016940375491270032171087
nine 		 :  0.00000012274681182589119998738170
no 		 :  0.00000000002718415076374824934646
off 		 :  0.00000000000013491657335600037992
on 		 :  0.00000000000101450738001357398943
one 		 :  0.00000028834296017521410249173641
right 		 :  0.99999737739562988281250000000000
seven 		 :  0.00000000000025691894358498279516
sheila 		 :  0.00000000000005280628365134099056
six 		 :  0.000000008250673033671773

In [35]:
filename = os.path.join(dir_name, 'three/38d78313_nohash_2.wav')
print_prediction(filename) 

bed 		 :  0.00000000091373908439607021136908
bird 		 :  0.00000015808863906840997515246272
cat 		 :  0.00000000003889152194735956413751
dog 		 :  0.00000000011766207452801324961911
down 		 :  0.00000000045101211654241524229292
eight 		 :  0.00014037001528777182102203369141
five 		 :  0.00000026052680368593428283929825
four 		 :  0.00000009437886916430215933360159
go 		 :  0.00000000162623381427806634746958
happy 		 :  0.00000023225538825499825179576874
house 		 :  0.00000001037094410349936879356392
left 		 :  0.00000000011953595058233901227140
marvin 		 :  0.00000001342473687770961987553164
nine 		 :  0.00000135925870381470303982496262
no 		 :  0.00000000106144493194904043775750
off 		 :  0.00000000013119644259873552982754
on 		 :  0.00000017772620708456088323146105
one 		 :  0.00000031578147741129214409738779
right 		 :  0.00001458072165405610576272010803
seven 		 :  0.00000000266715938224137971701566
sheila 		 :  0.00000000899206220594805927248672
six 		 :  0.000000102860035156027151

In [36]:
 filename = os.path.join(dir_name, 'tree/07363607_nohash_0.wav')
print_prediction(filename) 

bed 		 :  0.00000319975993079424370080232620
bird 		 :  0.00001930228609126061201095581055
cat 		 :  0.00000363687104254495352506637573
dog 		 :  0.00000006687793785431495052762330
down 		 :  0.00000070542705543630290776491165
eight 		 :  0.02149184606969356536865234375000
five 		 :  0.00000228441126637335401028394699
four 		 :  0.00000434840012530912645161151886
go 		 :  0.00000290086109089315868914127350
happy 		 :  0.00022248228196986019611358642578
house 		 :  0.00000334329320139659103006124496
left 		 :  0.00000002116596853340979578206316
marvin 		 :  0.00000271320914180250838398933411
nine 		 :  0.00002837108877429272979497909546
no 		 :  0.00000049340195573677192442119122
off 		 :  0.00000002857737158024065138306469
on 		 :  0.00000057358249705430353060364723
one 		 :  0.00000194963467947673052549362183
right 		 :  0.00000688392583469976671040058136
seven 		 :  0.00000217399565372033976018428802
sheila 		 :  0.00001421337947249412536621093750
six 		 :  0.000137907045427709817886

## Finished 

Great job! If you made it this far you should have nearly 90% accuracy on both the training and validation data. Feel free to play around with individual and check the classification for files from the testset individually.