In [None]:
#Download a dataset

#!wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz

--2021-07-14 13:52:38--  http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 64.233.189.128, 2404:6800:4008:c07::80
Connecting to download.tensorflow.org (download.tensorflow.org)|64.233.189.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1489096277 (1.4G) [application/gzip]
Saving to: ‘speech_commands_v0.01.tar.gz’


2021-07-14 13:53:17 (37.6 MB/s) - ‘speech_commands_v0.01.tar.gz’ saved [1489096277/1489096277]



In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

#unzip dataset

#shutil.unpack_archive("/content/speech_commands_v0.01.tar.gz", "/content/drive/MyDrive/speech")

In [16]:
import librosa
import os
import json


DATASET_PATH = '/content/drive/MyDrive/speech/'
JSON_PATH = DATASET_PATH + 'data.json'
SAMPLES_TO_CONSIDER = 22050 # 1 sec worth of sound, Librosa recommendation

In [3]:
def prepare_dataset(dataset_path, json_path, n_mfcc=13, hop_length=512, n_fft=2048):

    # data dictionary
    data = {
        'mappings': [],
        'labels': [],
        'MFCCs': [],
        'files': []
    }

    # loop through all the sub-dirs
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        # we need to ensure that we're not at root level
        if dirpath is not dataset_path:
              
              # update mappings
              category = dirpath.split('/')[-1] # dataset/down -> [dataset, down]
              data['mappings'].append(category)
              print(f'processing {category}')

              # loop through all the filenames and etract MFCCs
              for f in filenames:

                  # get file path
                  file_path = os.path.join(dirpath, f)
                  # load audio file
                  signal, sr = librosa.load(file_path)
                  # ensure the audio file is at least 1 sec (to get same shape)
                  if len(signal) >= SAMPLES_TO_CONSIDER:

                      # enfoce 1 sec. long signal
                      signal = signal[:SAMPLES_TO_CONSIDER]

                      # extract the MFCCs
                      MFCCs = librosa.feature.mfcc(signal, n_mfcc=n_mfcc, hop_length=hop_length,
                                                   n_fft=n_fft)

                      # store data
                      data['labels'].append(i-1)
                      data['MFCCs'].append(MFCCs.T.tolist()) # ndarray to list
                      data['files'].append(file_path)
                      #print(f"{file_path}: {i-1}")

    #store in jsonfile
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
prepare_dataset(DATASET_PATH, JSON_PATH)

processing eight
processing sheila
processing nine
processing yes
processing one
processing no
processing left
processing tree
processing bed
processing bird
processing go
processing wow
processing seven
processing marvin
processing dog
processing three
processing two
processing house
processing down
processing six
processing five
processing off
processing right
processing cat
processing zero
processing four
processing stop


In [14]:
import json
import numpy as np
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split


DATASET_PATH = "/content/drive/MyDrive/speech/data.json"
SAVED_MODEL_PATH = "/content/drive/MyDrive/speech/model.h5"

LEARNING_RATE = 0.0001
EPOCHS = 40
BATCH_SIZE = 32

NUM_KEYWORDS = 30

def load_dataset(data_path):

    with open(data_path, "r") as fp:
        data = json.load(fp)
    
    # extract inputs and targets
    X = np.array(data["MFCCs"])
    y = np.array(data["labels"])

    return X, y

def get_data_splits(data_path, test_size=0.1, test_validation=0.1):

    # load dataset
    X, y = load_dataset(data_path)

    # create train/validation/test splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train,
                                                                    test_size=test_validation)
    # convert inpits from 2d to 3d arrays
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


def build_model(input_shape, learning_rate, error="sparse_categorical_crossentropy"):

    #build network
    model = keras.Sequential()

    # conv layer 1
    model.add(keras.layers.Conv2D(64, (3,3), activation="relu",
                                  input_shape=input_shape, # initial layer
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPool2D((3, 3),strides=(2,2), padding="same"))
    
    # conv layer 2
    model.add(keras.layers.Conv2D(32, (3,3), activation="relu",
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPool2D((3, 3),strides=(2,2), padding="same"))
    
    # conv layer 3
    model.add(keras.layers.Conv2D(32, (2,2), activation="relu",
                                  kernel_regularizer=keras.regularizers.l2(0.001)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPool2D((2, 2),strides=(2,2), padding="same"))
    
    # flatten the output feed it into a dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation="relu"))
    model.add(keras.layers.Dropout(0.3))
    
    # softmax classifier
    model.add(keras.layers.Dense(NUM_KEYWORDS, activation="softmax")) # []

    #compile the model
    optimiser = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimiser, loss=error, metrics=["accuracy"])
    
    #print model overview
    model.summary()

    return model

def main():

    # load train/validation/test data splits
    X_train, X_validation, X_test, y_train, y_validation, y_test = get_data_splits(DATASET_PATH)

    # build the CNN model
    input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3]) # (# segments(MFCCs), # coefficients 13, 1)
    model = build_model(input_shape, LEARNING_RATE)

    # train the model
    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
              validation_data=(X_validation, y_validation))
    
    # evaluate the model
    test_error, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test error: {test_error}, test accuracy: {test_accuracy}")

    # save the model
    model.save(SAVED_MODEL_PATH)

In [13]:
main()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 42, 11, 64)        640       
_________________________________________________________________
batch_normalization_10 (Batc (None, 42, 11, 64)        256       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 21, 6, 64)         0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 19, 4, 32)         18464     
_________________________________________________________________
batch_normalization_11 (Batc (None, 19, 4, 32)         128       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 10, 2, 32)         0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 9, 1, 32)         

In [None]:
#ellipsis

#[...]

In [23]:
with open('/content/drive/MyDrive/speech/data.json', "r") as fp:
    data = json.load(fp)

In [24]:
data['mappings']

['eight',
 'sheila',
 'nine',
 'yes',
 'one',
 'no',
 'left',
 'tree',
 'bed',
 'bird',
 'go',
 'wow',
 'seven',
 'marvin',
 'dog',
 'three',
 'two',
 'house',
 'down',
 'six',
 'five',
 'off',
 'right',
 'cat',
 'zero',
 'four',
 'stop',
 'up',
 'on',
 'happy',
 '.ipynb_checkpoints']