- https://www.kaggle.com/alphasis/light-weight-cnn-lb-0-74

In [3]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

In [2]:
# Target clip length in samples; pad_audio pads shorter clips up to this length.
L = 16000

# The twelve class names the model predicts.
legal_labels = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

# Input / output folders, relative to the notebook's working directory.
TRAIN_PATH = './input/train/audio/'
TEST_PATH = './input/test/audio/'
OUTPUT_PATH = './output/'

In [10]:
## custom_fft and log_specgram functions written by DavidS.

def custom_fft(y, fs):
    """Return the one-sided amplitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : array whose first axis holds the samples.
    fs : sampling rate of ``y``.

    Returns
    -------
    (xf, vals)
        ``xf`` holds ``N // 2`` evenly spaced values from 0 to ``fs / 2``;
        ``vals`` holds ``2 / N * |FFT(y)|`` for the first ``N // 2`` bins,
        where ``N = y.shape[0]``.
    """
    n_samples = y.shape[0]
    half = n_samples // 2
    period = 1.0 / fs

    spectrum = fft(y)
    # Only the first half of the bins is kept; np.abs turns the complex
    # FFT output into magnitudes.
    magnitudes = np.abs(spectrum[0:half])
    vals = 2.0 / n_samples * magnitudes

    xf = np.linspace(0.0, 1.0 / (2.0 * period), half)
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram of ``audio``.

    Parameters
    ----------
    audio : 1-D array of samples.
    sample_rate : sampling rate of ``audio``; also used to convert
        ``window_size`` and ``step_size`` to sample counts.
    window_size : length of each analysis window, in milliseconds.
    step_size : hop between the starts of consecutive windows, in
        milliseconds.
    eps : small constant added before the logarithm so zero power does not
        produce ``-inf``.

    Returns
    -------
    (freqs, times, log_spec)
        ``freqs`` and ``times`` as returned by ``scipy.signal.spectrogram``;
        ``log_spec`` is the float32 log of the transposed spectrogram, so
        its first axis follows ``times`` and its second axis ``freqs``.

    Raises
    ------
    ValueError
        If the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be positive and no larger than window_size '
            '(got step of %d samples for a window of %d samples)' % (step, nperseg))
    # scipy expects the overlap between windows, not the hop. The two only
    # coincide when the hop is half the window, which is the default case.
    noverlap = nperseg - step
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [11]:
## utility function to grab all wav files inside train data folder.

def list_wavs_fname(dirpath, ext='wav'):
    """List the audio files stored one directory level below ``dirpath``.

    Parameters
    ----------
    dirpath : folder whose immediate sub-folders each hold audio files.
    ext : file extension to look for, without the leading dot.

    Returns
    -------
    (labels, fnames)
        Two lists of equal length: ``labels[i]`` is the name of the
        sub-folder containing the file and ``fnames[i]`` is the file's base
        name. Entries are ordered by sorted path.
    """
    print(dirpath)
    # Sort so the order does not depend on the file system's listing order.
    fpaths = sorted(glob(os.path.join(dirpath, '*', '*.' + ext)))
    labels = []
    fnames = []
    # Derive label and file name from the same path in one pass so the two
    # lists can never fall out of step with each other.
    for fpath in fpaths:
        parent, fname = os.path.split(fpath)
        labels.append(os.path.basename(parent))
        fnames.append(fname)
    return labels, fnames

In [12]:
def pad_audio(samples, L=16000):
    '''
    Pad audio shorter than ``L`` samples (16000 = 1 second) with leading zeros
    so that every clip has at least ``L`` samples.

    samples: 1-D array of audio samples.
    L: minimum length of the returned array; taken as a parameter, like
       chop_audio, instead of being read from a module-level variable.

    Returns ``samples`` itself when it already has ``L`` or more values,
    otherwise a new array with zeros added at the front.
    '''
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # pad_width=(missing, 0): all padding goes before the signal, none after.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=0)

def chop_audio(samples, L=16000, num=20):
    '''
    Cut audio longer than ``L`` samples (e.g. the wav files in the background
    noise folder) into ``num`` chunks of exactly ``L`` samples each.

    samples: 1-D array of audio samples with at least ``L`` values.
    L: length of every yielded chunk.
    num: number of chunks to yield.

    Yields slices of ``samples`` whose start positions are drawn uniformly
    with np.random.randint, so chunks may overlap or repeat.

    Raises ValueError on first iteration if ``samples`` is shorter than ``L``.
    '''
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError(
            'samples has %d values, fewer than the chunk length L=%d' % (len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive; +1 makes the final window
        # reachable and lets an input of exactly L samples work.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    '''
    Transform labels into dummy (one-hot) values. It's used in combination
    with softmax to predict the label.

    labels: iterable of folder names. '_background_noise_' is mapped to
            'silence'; any name not in legal_labels is mapped to 'unknown'.

    Returns a DataFrame with one row per input label and one column for every
    entry of legal_labels, in sorted order. Columns are always present, even
    for classes that do not occur in ``labels``, so the width of the encoding
    does not depend on the data passed in.
    '''
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    # Sorted order matches the column order get_dummies produces on plain
    # strings when every class is present.
    classes = sorted(legal_labels)
    dummies = pd.get_dummies(pd.Series(pd.Categorical(nlabels, categories=classes)))
    # Replace the categorical column index with plain string labels.
    dummies.columns = classes
    return dummies

In [23]:
# Accumulators for the features and labels built by the loading loop.
x_train, y_train = [], []
# Rate the clips are resampled to before the spectrogram is computed.
new_sample_rate = 8000
# Folder name (label) and file name of every training wav file.
labels, fnames = list_wavs_fname(TRAIN_PATH)

./input/train/audio/


In [24]:
# Preview the first ten folder labels returned by list_wavs_fname.
labels[:10]

['_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 'bed',
 'bed',
 'bed',
 'bed']

In [25]:
# Preview the first ten file names returned by list_wavs_fname.
fnames[:10]

['doing_the_dishes.wav',
 'dude_miaowing.wav',
 'exercise_bike.wav',
 'pink_noise.wav',
 'running_tap.wav',
 'white_noise.wav',
 '00176480_nohash_0.wav',
 '004ae714_nohash_0.wav',
 '004ae714_nohash_1.wav',
 '00f0204f_nohash_0.wav']

In [26]:
%%time
# Start from empty accumulators so re-running this cell does not append a
# second copy of the dataset to the existing lists.
x_train = []
y_train = []
for label, fname in zip(labels, fnames):
    sample_rate, samples = wavfile.read(os.path.join(TRAIN_PATH, label, fname))
    samples = pad_audio(samples)
    # Recordings longer than L samples are cut into several random L-sample
    # chunks; everything else is used as a single clip.
    if len(samples) > L:
        n_samples = chop_audio(samples, L=L)
    else:
        n_samples = [samples]
    for chunk in n_samples:
        # Resample to new_sample_rate, keeping the clip's duration.
        n_resampled = int(new_sample_rate / sample_rate * chunk.shape[0])
        resampled = signal.resample(chunk, n_resampled)
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y_train.append(label)
        x_train.append(specgram)



CPU times: user 1min 2s, sys: 14.5 s, total: 1min 16s
Wall time: 1min 29s


In [30]:
# Inspect the first (label, log-spectrogram) pair appended by the loading loop.
y_train[0], x_train[0]

('_background_noise_',
 array([[-4.97933817,  0.10387147,  3.65877104, ...,  3.82734489,
          4.19081783,  3.84605742],
        [-6.64055014,  3.8370142 ,  3.85589814, ...,  2.11754704,
          3.49663281,  2.92539573],
        [-0.48447311,  4.59023714,  5.16882467, ...,  3.03794026,
          2.4426055 , -0.25301307],
        ..., 
        [ 1.46701586,  4.34551239,  4.98449135, ...,  3.0860846 ,
          2.33823991,  0.50432402],
        [ 0.70941228,  3.97903872,  3.49804473, ...,  3.3809762 ,
         -0.15695821,  0.80415905],
        [ 0.14811924,  2.82283783,  3.69774365, ...,  2.12182188,
          1.26291156, -0.07007227]], dtype=float32))

In [35]:
# Check the shape of a single spectrogram feature array.
x_train[0].shape

(99, 81)

In [36]:
# Stack the spectrograms into one array and append a trailing axis of size 1.
x_train = np.array(x_train)[..., np.newaxis]

# One-hot encode the labels; keep the column names so a predicted column
# index can be mapped back to its label.
y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = np.array(y_train.values)

# The file lists are no longer needed; drop them and reclaim memory.
del labels, fnames
gc.collect()

538

In [40]:
# Inspect the first sample again, now with a one-hot label and the extra trailing axis.
y_train[0], x_train[0]

(array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=uint8),
 array([[[-4.97933817],
         [ 0.10387147],
         [ 3.65877104],
         ..., 
         [ 3.82734489],
         [ 4.19081783],
         [ 3.84605742]],
 
        [[-6.64055014],
         [ 3.8370142 ],
         [ 3.85589814],
         ..., 
         [ 2.11754704],
         [ 3.49663281],
         [ 2.92539573]],
 
        [[-0.48447311],
         [ 4.59023714],
         [ 5.16882467],
         ..., 
         [ 3.03794026],
         [ 2.4426055 ],
         [-0.25301307]],
 
        ..., 
        [[ 1.46701586],
         [ 4.34551239],
         [ 4.98449135],
         ..., 
         [ 3.0860846 ],
         [ 2.33823991],
         [ 0.50432402]],
 
        [[ 0.70941228],
         [ 3.97903872],
         [ 3.49804473],
         ..., 
         [ 3.3809762 ],
         [-0.15695821],
         [ 0.80415905]],
 
        [[ 0.14811924],
         [ 2.82283783],
         [ 3.69774365],
         ..., 
         [ 2.12182188],
      

In [41]:
# Number of output classes (one per entry of legal_labels).
nclass = 12
# Each sample is a 99 x 81 array; the trailing axis of size 1 is what lets it
# feed a Conv2D layer.
input_shape = (99, 81) + (1,)

In [44]:
# Hold out 10% of the samples for validation; random_state fixes the shuffle.
# NOTE: x_train / y_train are overwritten with the training part, so running
# this cell a second time splits the already-reduced training set again.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)

Train on 58356 samples, validate on 6485 samples
Epoch 1/3


KeyboardInterrupt: 

In [48]:
## Modeling
# Small CNN: batch-normalised input, three convolutional stages each followed
# by max pooling and dropout, then two dense layers and a softmax over nclass.
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)

# Stage 1: two 2x2 convolutions with 8 filters.
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)

# Stage 2: two 3x3 convolutions with 16 filters.
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)

# Stage 3: one 3x3 convolution with 32 filters.
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

# Classifier head.
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

# The output is a softmax over mutually exclusive classes with one-hot
# targets, so the matching loss is categorical cross-entropy.
# binary_crossentropy would score the 12 outputs as independent yes/no
# problems instead.
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 99, 81, 1)         0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 99, 81, 1)         4         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 98, 80, 8)         40        
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 97, 79, 8)         264       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 48, 39, 8)         0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 48, 39, 8)         0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 46, 37, 16)        1168      
__________

In [49]:
# Create the output folder before training so a missing or unwritable
# directory fails here rather than after the model has been fitted.
os.makedirs(OUTPUT_PATH, exist_ok=True)
model.fit(x_train, y_train, batch_size=1024, validation_data=(x_valid, y_valid),
          epochs=3, shuffle=True, verbose=1)
model.save(os.path.join(OUTPUT_PATH, 'cnn.model'))

Train on 58356 samples, validate on 6485 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


NameError: name 'model_path' is not defined