# 1D CNNs with 2 or 4 convolutional blocks for classifying the Google Speech Commands dataset

The training run in this notebook uses the 2-conv-block model only.

In [1]:
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

import argparse
import sys
from mlflow import pyfunc
import mlflow.tensorflow

import os
from scipy.io import wavfile #for audio processing
import warnings
warnings.filterwarnings("ignore")

from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.utils import to_categorical 
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

import tensorflow as tf

print(tf.__version__)

2.4.0


In [2]:
def load_data(train_audio_path, labels):
    """Load the clips of every requested label and downsample them to 8 kHz.

    Args:
        train_audio_path: Directory that contains one sub-directory per label.
        labels: Iterable of label names; each must be a sub-directory of
            ``train_audio_path``.

    Returns:
        Tuple ``(all_wave, all_label)``: a list of arrays of shape (8000, 1)
        and a parallel list holding the label of each array. Clips that do
        not yield exactly 8000 samples after resampling are skipped.

    Raises:
        OSError: If a label directory cannot be listed.
    """
    source_rate = 16000  # rate the files are decoded at
    target_rate = 8000   # rate (and, for 1 s clips, length) the models expect

    all_wave = []
    all_label = []
    for label in labels:
        label_dir = os.path.join(train_audio_path, label)
        # Sorted so the sample order does not depend on the file system's
        # listing order (keeps seeded splits downstream reproducible).
        wav_names = sorted(f for f in os.listdir(label_dir) if f.endswith('.wav'))
        for wav_name in wav_names:
            samples, sample_rate = librosa.load(os.path.join(label_dir, wav_name), sr=source_rate)
            # Sample rates are passed by keyword: newer librosa releases make
            # them keyword-only, and older releases accept the same names.
            samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_rate)
            if len(samples) == target_rate:
                # Add the trailing channel axis expected by Conv1D: (8000,) -> (8000, 1)
                all_wave.append(np.expand_dims(samples, axis=-1))
                all_label.append(label)

    return all_wave, all_label

In [3]:
def get_train_test_data(all_wave, all_label):
    """One-hot encode the labels and make a stratified 80/20 split.

    Args:
        all_wave: Sequence of waveforms, one per sample.
        all_label: Sequence of labels, same length as ``all_wave``.

    Returns:
        ``((x_tr, y_tr), (x_val, y_val))`` where the ``y`` arrays are one-hot
        encoded. The split uses a fixed ``random_state`` of 777.
    """
    assert (len(all_wave) == len(all_label))

    encoder = LabelEncoder()
    encoded = encoder.fit_transform(all_label)
    n_classes = len(list(encoder.classes_))

    print('number of classes: {}'.format(n_classes))

    y = to_categorical(encoded, num_classes=n_classes)

    features = np.array(all_wave)
    targets = np.array(y)
    # Stratify on the one-hot rows so both parts keep the class proportions.
    x_tr, x_val, y_tr, y_val = train_test_split(
        features,
        targets,
        stratify=y,
        test_size=0.2,
        random_state=777,
        shuffle=True,
    )

    train_part = (x_tr, y_tr)
    val_part = (x_val, y_val)
    return train_part, val_part

In [4]:
def get_train_val_test_data(all_wave, all_label, random_state=None):
    """Shuffle the samples and split them 70/20/10 into train/validation/test.

    Args:
        all_wave: Sequence of waveforms, one per sample.
        all_label: Sequence of labels, same length as ``all_wave``.
        random_state: Optional seed for the shuffle. When ``None`` the global
            numpy random state is used.

    Returns:
        ``(training_set, validation_set, test_set)``; each is an ``(x, y)``
        tuple of numpy arrays where ``y`` is one-hot encoded.
    """
    assert (len(all_wave) == len(all_label))

    le = LabelEncoder()
    y = le.fit_transform(all_label)
    classes = list(le.classes_)

    # The one-hot width is the number of classes, not the number of samples.
    y = np.asarray(to_categorical(y, num_classes=len(classes)))
    # Arrays (unlike Python lists) can be indexed with an index array.
    x = np.asarray(all_wave)

    n_samples = len(x)
    if random_state is None:
        random_seq = np.random.permutation(n_samples)
    else:
        random_seq = np.random.RandomState(random_state).permutation(n_samples)

    # Integer boundaries: slice bounds must be ints, not floats.
    train_end = (7 * n_samples) // 10
    val_end = (9 * n_samples) // 10

    # first 70% for training set
    train_idx = random_seq[:train_end]
    # next 20% for validation set
    val_idx = random_seq[train_end:val_end]
    # rest everything for test set
    test_idx = random_seq[val_end:]

    training_set = x[train_idx], y[train_idx]
    validation_set = x[val_idx], y[val_idx]
    test_set = x[test_idx], y[test_idx]

    return training_set, validation_set, test_set

In [5]:
def get_2cnn_blocks_model(all_label):
    """Build a functional-API 1D CNN with two convolutional blocks.

    Args:
        all_label: Labels the model must distinguish. Either the list of
            class names or the per-sample label list may be passed; the
            output layer is sized by the number of distinct values.

    Returns:
        An uncompiled Keras ``Model`` taking inputs of shape (8000, 1) and
        producing a softmax over the classes. The summary is printed.

    Raises:
        ValueError: If ``all_label`` is empty.
    """
    K.clear_session()

    # Size the softmax by distinct classes; len(all_label) would be the
    # sample count when the per-sample label list is passed.
    num_classes = len(np.unique(np.asarray(all_label)))
    if num_classes == 0:
        raise ValueError('all_label must contain at least one label')

    inputs = Input(shape=(8000,1))

    # Conv1D blocks: convolution -> max-pool by 3 -> dropout
    conv = inputs
    for filters, kernel_size in ((8, 13), (16, 11)):
        conv = Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(conv)
        conv = MaxPooling1D(3)(conv)
        conv = Dropout(0.2)(conv)

    #Flatten layer
    conv = Flatten()(conv)

    #Dense layers 1 and 2
    for units in (256, 128):
        conv = Dense(units, activation='relu')(conv)
        conv = Dropout(0.2)(conv)

    outputs = Dense(num_classes, activation='softmax')(conv)

    model = Model(inputs, outputs)
    model.summary()
    return model

In [6]:
def get_sequential_2cnn_blocks_model(labels):
    """Build the two-conv-block 1D CNN as a ``tf.keras.Sequential`` model.

    Args:
        labels: Sized collection of class names; its length is the width of
            the final softmax layer.

    Returns:
        An uncompiled Sequential model for inputs of shape (8000, 1). The
        model summary is printed before returning.
    """
    K.clear_session()

    model = tf.keras.Sequential()

    layer_stack = [Input(shape=(8000,1))]

    # Two Conv1D blocks: convolution, max-pool by 3, dropout
    for n_filters, kernel_width in ((8, 13), (16, 11)):
        layer_stack.append(
            Conv1D(n_filters, kernel_width, padding='valid', activation='relu', strides=1)
        )
        layer_stack.append(MaxPooling1D(3))
        layer_stack.append(Dropout(0.2))

    # Flatten before the fully connected part
    layer_stack.append(Flatten())

    # Two dense layers, each followed by dropout
    for n_units in (256, 128):
        layer_stack.append(Dense(n_units, activation='relu'))
        layer_stack.append(Dropout(0.2))

    # Softmax over the classes
    layer_stack.append(Dense(len(labels), activation='softmax'))

    for layer in layer_stack:
        model.add(layer)

    model.summary()
    return model

In [7]:
def get_4cnn_blocks_model(all_label):
    """Build a functional-API 1D CNN with four convolutional blocks.

    Args:
        all_label: Labels the model must distinguish. Either the list of
            class names or the per-sample label list may be passed; the
            output layer is sized by the number of distinct values.

    Returns:
        An uncompiled Keras ``Model`` taking inputs of shape (8000, 1) and
        producing a softmax over the classes. The summary is printed.

    Raises:
        ValueError: If ``all_label`` is empty.
    """
    K.clear_session()

    # Use the function's own argument (the previous code read an undefined
    # global named `labels`) and count distinct classes rather than samples.
    num_classes = len(np.unique(np.asarray(all_label)))
    if num_classes == 0:
        raise ValueError('all_label must contain at least one label')

    inputs = Input(shape=(8000,1))

    # Conv1D blocks: convolution -> max-pool by 3 -> dropout
    conv = inputs
    for filters, kernel_size in ((8, 13), (16, 11), (32, 9), (64, 7)):
        conv = Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(conv)
        conv = MaxPooling1D(3)(conv)
        conv = Dropout(0.2)(conv)

    #Flatten layer
    conv = Flatten()(conv)

    #Dense layers 1 and 2
    for units in (256, 128):
        conv = Dense(units, activation='relu')(conv)
        conv = Dropout(0.2)(conv)

    outputs = Dense(num_classes, activation='softmax')(conv)

    model = Model(inputs, outputs)

    model.summary()
    return model

In [8]:
def train_model(model, tr_data, val_data, epochs=25, batch_size=32,
                checkpoint_path='best_model.hdf5',
                model_path='sequential_2cnn_blocks_model.h5',
                checkpoint_monitor='val_accuracy'):
    """Fit ``model`` with early stopping and best-model checkpointing.

    Args:
        model: Compiled Keras model.
        tr_data: ``(x, y)`` tuple used for training.
        val_data: ``(x, y)`` tuple used for validation.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size passed to ``fit``.
        checkpoint_path: File the best model (by ``checkpoint_monitor``) is
            written to during training.
        model_path: File the final model is saved to after training.
        checkpoint_monitor: Logged quantity the checkpoint maximises. The
            default matches a model compiled with ``metrics=['accuracy']``,
            whose validation metric is logged as ``val_accuracy``; pass
            ``'val_acc'`` for a model compiled with ``metrics=['acc']``.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    x_tr, y_tr = tr_data
    x_val, y_val = val_data

    # Stop when validation loss has not improved by 1e-4 for 10 epochs.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
    # Monitoring a key that is never logged makes the checkpoint skip every
    # save, so the monitored name must match the compiled metric name.
    mc = ModelCheckpoint(checkpoint_path, monitor=checkpoint_monitor, verbose=1, save_best_only=True, mode='max')

    history = model.fit(
        x_tr,
        y_tr,
        epochs=epochs,
        callbacks=[es, mc],
        batch_size=batch_size,
        validation_data=(x_val, y_val),
    )
    model.save(model_path)
    return history

In [9]:
def predict(audio, model=None, classes=None):
    """Return the predicted class name for a single 8000-sample clip.

    Args:
        audio: Array-like with 8000 values in total (e.g. shape (8000,) or
            (8000, 1)).
        model: Object with a Keras-style ``predict`` method. When omitted, a
            module-level variable named ``model`` is used.
        classes: Sequence mapping class index to class name. When omitted, a
            module-level variable named ``classes`` is used.

    Returns:
        The entry of ``classes`` with the highest predicted probability.

    Raises:
        NameError: If ``model`` or ``classes`` is neither passed nor defined
            at module level.
    """
    # Fall back to module-level names so existing predict(audio) calls keep
    # working when the notebook defines `model` and `classes` globally.
    if model is None:
        model = globals().get('model')
    if classes is None:
        classes = globals().get('classes')
    if model is None or classes is None:
        raise NameError(
            "predict() needs 'model' and 'classes': pass them as arguments "
            "or define them at module level"
        )

    # The model expects a batch axis and a channel axis: (1, 8000, 1).
    prob = model.predict(np.asarray(audio).reshape(1, 8000, 1))
    index = int(np.argmax(prob[0]))
    return classes[index]

In [10]:
# Enable auto-logging to MLflow to capture TensorBoard metrics.
# This runs once at module level, before any run is started in main().
mlflow.tensorflow.autolog()

# Command-line options for batch size and number of training steps.
# NOTE(review): the parse_args call inside main() is commented out, so these
# options are declared here but never parsed in the code shown.
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=100, type=int, help="batch size")
parser.add_argument("--train_steps", default=1000, type=int, help="number of training steps")

def main(train_audio_path=None, labels=None):
    """Load the data, build the sequential 2-block CNN and train it in an MLflow run.

    Args:
        train_audio_path: Directory with one sub-directory of ``.wav`` files
            per label. Defaults to the path this notebook was written with.
        labels: Class names (sub-directory names) to train on. Defaults to
            the ten command words used in this notebook.
    """
    if train_audio_path is None:
        train_audio_path = '/home/hem/work/machine-learning/ignore/speech_commands_v0.02'
    if labels is None:
        labels = ['yes', 'no','up','down','left','right','on','off','stop','go']

    with mlflow.start_run():
        all_wave, all_label = load_data(train_audio_path, labels)
        # Stack into numpy arrays: the split helper converts to numpy anyway,
        # so a round trip through TensorFlow tensors only adds copies.
        all_wave = np.asarray(all_wave)
        all_label = np.asarray(all_label)

        print(all_wave.shape)
        print(all_label.shape)

        tr_data, val_data = get_train_test_data(all_wave, all_label)

        print('tr data x shape: {}'.format(tr_data[0].shape))
        print('tr data y shape: {}'.format(tr_data[1].shape))

        print('val data x shape: {}'.format(val_data[0].shape))
        print('val data y shape: {}'.format(val_data[1].shape))

        model = get_sequential_2cnn_blocks_model(labels)
        model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

        train_model(model, tr_data, val_data)

        # TODO: evaluate the model

        # TODO: generate predictions from the model

In [11]:
# Run the full pipeline: load the data, build the model and train it.
main()

(34975, 8000, 1)
(34975,)
number of classes: 10
tr data x shape: (27980, 8000, 1)
tr data y shape: (27980, 10)
val data x shape: (6995, 8000, 1)
val data y shape: (6995, 10)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 7988, 8)           112       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2662, 8)           0         
_________________________________________________________________
dropout (Dropout)            (None, 2662, 8)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2652, 16)          1424      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 884, 16)           0         
_________________________________________________________________
dropout_1 (Dro