In [1]:
from pathlib import Path
import time

from scipy.io import wavfile
import numpy as np
import pandas as pd
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [2]:
import keras

from keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Dense, Input, Dropout, Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [None]:
# Relative path of the directory with the training audio.
# NOTE(review): presumably the `path` argument passed to get_data(); the call
# site is not visible in this part of the notebook -- confirm.
PATH_DATA = 'data/train/audio'

In [48]:
def get_data(path):
    '''Index every .wav file below `path` together with its label.

    The label of a clip is the name of the directory that directly contains
    it, e.g. ``<path>/yes/clip.wav`` is labelled ``'yes'``.

    Parameters
    ----------
    path : str or pathlib.Path
        Root directory that is searched recursively for ``*.wav`` files.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns 'path' (file path as ``str``) and
        'word' (name of the containing directory). Rows are sorted by path,
        so the result does not depend on the order in which the file system
        lists entries. The frame is empty (but keeps both columns) when
        nothing matches.
    '''
    datadir = Path(path)

    # glob() yields entries in an arbitrary, file-system dependent order;
    # sorting keeps later steps (e.g. a seeded train/test split) reproducible.
    wav_paths = sorted(datadir.glob('**/*.wav'))

    # parent.name is the containing folder; unlike parts[-2] it cannot raise
    # IndexError for a path that consists of a bare file name.
    files = [(str(f), f.parent.name) for f in wav_paths]
    df = pd.DataFrame(files, columns=['path', 'word'])

    return df


In [47]:
def prepare_data(df):
    '''Collapse every label that is not a target command into 'unknown'.

    The ten target words keep their label. Every other value of the 'word'
    column -- other spoken words as well as the '_background_noise_' folder,
    which holds only 6 files -- is replaced by 'unknown'.

    Note that `df` is modified in place; the same object is returned for
    convenience.
    '''
    train_words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

    # One mask covers both the silence folder and all non-target words.
    is_other = ~df.word.isin(train_words)
    df.loc[is_other, 'word'] = 'unknown'

    return df

In [37]:
def get_specgrams(paths, nsamples=16000):
    '''
    Given list of paths, return specgrams.

    Every clip is brought to exactly `nsamples` samples before its
    spectrogram is computed: shorter clips are zero-padded at the start,
    longer clips keep only their first `nsamples` samples.

    Parameters
    ----------
    paths : iterable of str
        Paths of the wav files to read.
        NOTE(review): the padding logic assumes 1-D (mono) sample arrays --
        confirm for the data set in use.
    nsamples : int, optional
        Number of samples each clip is padded / truncated to.

    Returns
    -------
    list of numpy.ndarray
        One array per path: the spectrogram with a trailing axis of length 1
        appended. With the default nsamples=16000 each array has shape
        (129, 124, 1).
    '''
    specgrams = []
    for path in paths:
        # wavfile.read returns (sample_rate, samples); only the samples are used.
        wav = wavfile.read(path)[1]

        # Compare against nsamples rather than a fixed 16000 so padding and
        # truncation stay consistent for any requested length.
        if wav.size < nsamples:
            wav = np.pad(wav, (nsamples - wav.size, 0), mode='constant')
        else:
            wav = wav[0:nsamples]

        # Index 2 of the result is the spectrogram itself (0 and 1 are the
        # frequency and time axes).
        sxx = signal.spectrogram(wav, nperseg=256, noverlap=128)[2]

        # Append the channel axis without hard-coding the spectrogram size,
        # which depends on nsamples.
        specgrams.append(sxx[..., np.newaxis])

    return specgrams

In [44]:
def get_model(shape, n_classes=11):
    '''Create a keras model.

    Architecture: batch normalisation -> 3x3 convolution (16 filters, ELU)
    -> dropout -> 2x2 max pooling -> flatten -> dense (32 units, ELU)
    -> dropout -> softmax output.

    Parameters
    ----------
    shape : tuple of int
        Shape of a single input sample without the batch axis,
        e.g. (129, 124, 1).
    n_classes : int, optional
        Width of the softmax output layer. The default of 11 corresponds to
        the ten command words plus 'unknown' (background noise is folded
        into 'unknown' rather than being a class of its own).

    Returns
    -------
    keras.models.Model
        The model, not compiled; the caller chooses optimizer and loss.
    '''
    inputs = Input(shape=shape)

    # Normalising first means the raw spectrogram values need no separate
    # scaling step.
    x = BatchNormalization()(inputs)
    x = Conv2D(16, (3, 3), activation='elu')(x)
    x = Dropout(0.25)(x)
    x = MaxPooling2D((2, 2))(x)

    x = Flatten()(x)
    x = Dense(32, activation='elu')(x)
    x = Dropout(0.25)(x)

    outputs = Dense(n_classes, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)



In [45]:
# Input shape of the network: one spectrogram as returned by get_specgrams
# for 16000-sample clips, i.e. (129, 124, 1).
shape = (129, 124, 1)
# The model is built here only to print its layer summary; it is not kept.
get_model(shape).summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 129, 124, 1)       0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 129, 124, 1)       4         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 127, 122, 16)      160       
_________________________________________________________________
dropout_7 (Dropout)          (None, 127, 122, 16)      0         
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 63, 61, 16)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 61488)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                1967648   
__________

unknown    41045
stop        2380
yes         2377
up          2375
no          2375
go          2372
on          2367
right       2367
down        2359
off         2357
left        2353
Name: word, dtype: int64