- https://www.kaggle.com/alphasis/light-weight-cnn-lb-0-74

In [37]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.utils.vis_utils import plot_model
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

In [38]:
L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

TRAIN_PATH = './input/train/audio/'
TEST_PATH = './input/test/audio/'
OUTPUT_PATH = './output/'

# train_data_path = os.path.join(root_path, 'input', 'train', 'audio')
# test_data_path = os.path.join(root_path, 'input', 'test', 'audio')

In [39]:
## custom_fft and log_specgram functions written by DavidS.
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT 는 대칭(simmetrical)이므로 반쪽만 얻음.
    # FFT 는 복소수이므로 실수값만 취하기 위해 abs()
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [40]:
## utility function to grab all wav files inside train data folder.
def list_wavs_fname(dirpath, ext='wav'):
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

In [41]:
def pad_audio(samples):
    '''
    pad audios that are less than 16000(1 second) with 0s to make them all have the same length.
    '''
    if len(samples) >= L: 
        return samples
    else: 
        return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0)) 
        # sample 앞뒤로 constant_values[0]과 constant_values[1]을 각각 pad_width 갯수 만큼 패딩
        # 총길이는 len(samples) + 2*pad_width

def chop_audio(samples, L=16000, num=20):
    '''
    
    chop audios that are larger than 16000(eg. wav files in background noises folder) to 16000 in length.
    create several chunks out of one large wav files given the parameter 'num'.
    '''
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    '''
    레이블 정규화 및 one-hot벡터화 (더미화)
    '''
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

In [42]:
labels, fnames = list_wavs_fname(TRAIN_PATH)
new_sample_rate = 8000

./input/train/audio/


In [43]:
labels[:10]

['_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 '_background_noise_',
 'bed',
 'bed',
 'bed',
 'bed']

In [44]:
fnames[:10]

['doing_the_dishes.wav',
 'dude_miaowing.wav',
 'exercise_bike.wav',
 'pink_noise.wav',
 'running_tap.wav',
 'white_noise.wav',
 '00176480_nohash_0.wav',
 '004ae714_nohash_0.wav',
 '004ae714_nohash_1.wav',
 '00f0204f_nohash_0.wav']

In [45]:
## 음성 파일 읽음
idx = 1
sample_rate, samples = wavfile.read(os.path.join(TRAIN_PATH, labels[idx], fnames[idx]))
sample_rate, len(samples)



(16000, 988891)

In [46]:
samples

array([  0,   0,   0, ...,  95, 111, -11], dtype=int16)

In [47]:
## 음성 파일 길이 맞춤
if len(samples) > 16000:
    n_samples = chop_audio(samples)
else: 
    n_samples = [samples]
# sample_rate, len(n_samples), len(n_samples[0])

In [48]:
list(n_samples)

[array([ 49, -93, -45, ..., -91, -35, -28], dtype=int16),
 array([  2, -34, -24, ..., -44, -11,  34], dtype=int16),
 array([ 22, -53, -72, ..., -41, -39, -49], dtype=int16),
 array([  71,   55,   14, ..., -308,  282, -185], dtype=int16),
 array([-16, -27,  42, ...,  22,  38,  60], dtype=int16),
 array([ 26, -11,   0, ...,  22,  14,  83], dtype=int16),
 array([ 231, -144, -193, ...,   56,    9,   83], dtype=int16),
 array([-22, -37, -49, ..., -15,   0, -11], dtype=int16),
 array([-48, 104, -28, ..., -43, -46, -22], dtype=int16),
 array([  9, -37,  48, ...,  33,  61, -10], dtype=int16),
 array([  2, -24, -10, ..., -12, -23,  12], dtype=int16),
 array([-101, -108,  -65, ...,   -2, -120, -132], dtype=int16),
 array([ 26,  76,  11, ...,   6, -21, -12], dtype=int16),
 array([  4,   3, -21, ...,  -1,  31,  -4], dtype=int16),
 array([-84, -19,  -2, ..., -16, -40,  25], dtype=int16),
 array([ -95,  -40,  -50, ...,   86,  117, -152], dtype=int16),
 array([-32,  -2, -28, ..., -21, -28,  23], dtyp

In [49]:
for samples in n_samples:
    resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0])) 
    # resample 원리..??
    # https://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.signal.resample.html
    _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)

In [50]:
# resample 후 피쳐 shape
samples.shape, resampled.shape

((988891,), (8000,))

In [51]:
samples[:10], resampled[:10]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int16),
 array([-30.93313632, -37.04165099, -38.89466125, -28.68076367,
        -45.56387538, -31.92286919, -27.28724748, -34.78265278,
        -24.10656264, -11.10267127]))

In [52]:
# log_specgram으로 변환 후 피쳐 shape
resampled.shape, specgram.shape

((8000,), (99, 81))

In [53]:
specgram

array([[-10.21001625,  -0.4732331 ,   0.4780494 , ...,  -5.80697393,
         -4.49407959,  -3.90996122],
       [ -5.04077196,   0.02861843,   3.64834547, ...,  -6.21814728,
         -4.04334545,  -3.53456378],
       [ -3.38002753,   3.05539846,   4.02847576, ...,  -5.15383196,
         -4.25699377,  -3.47039342],
       ..., 
       [ -4.36659908,  -1.74920118,   0.54107004, ...,  -6.09838772,
         -4.73975325,  -4.24840927],
       [ -4.86254311,  -2.85832739,  -0.30666471, ...,  -5.47484684,
         -5.55965662,  -4.44182587],
       [ -9.72101021,  -0.45367008,   0.96115571, ...,  -6.81187916,
         -3.91157794,  -3.43369842]], dtype=float32)

# 학습데이터 전처리

In [54]:
%%time
y = []
X = []

for label, fname in zip(labels, fnames):
    sample_rate, samples = wavfile.read(os.path.join(TRAIN_PATH, label, fname))
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: 
        n_samples = [samples]
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y.append(label)
        X.append(specgram)



CPU times: user 1min 5s, sys: 29.4 s, total: 1min 35s
Wall time: 1min 50s


In [55]:
y[0], X[0]

('_background_noise_',
 array([[-0.58695507,  2.73837495,  4.05053663, ...,  3.00274253,
          1.75911653,  0.57116693],
        [ 1.26469135,  3.01443863,  0.80951291, ...,  1.46810663,
          1.34394538,  1.24525583],
        [ 1.06067181,  3.2354908 ,  2.59009552, ...,  3.16489244,
          2.15972066,  2.89746737],
        ..., 
        [-0.77927196,  4.34345245,  4.36018896, ...,  1.96323991,
          1.11490548,  1.17736089],
        [ 2.54292154,  3.39738393,  2.40628457, ...,  4.06853819,
          4.3390522 ,  2.8615582 ],
        [ 0.96355903,  3.70682192,  4.32738352, ...,  3.46363997,
          3.47542334,  2.45203137]], dtype=float32))

In [56]:
X[0].shape

(99, 81)

In [57]:
label_index

array(['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence', 'stop',
       'unknown', 'up', 'yes'], dtype=object)

In [58]:
X = np.array(X)
X = X.reshape(tuple(list(X.shape) + [1])) # (64841, 99, 81, 1) 로 reshape
y = label_transform(y)
label_index = y.columns.values
y = y.values
# del labels, fnames
# gc.collect()

In [59]:
# sample data
y[0], X[0]

(array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=uint8),
 array([[[-0.58695507],
         [ 2.73837495],
         [ 4.05053663],
         ..., 
         [ 3.00274253],
         [ 1.75911653],
         [ 0.57116693]],
 
        [[ 1.26469135],
         [ 3.01443863],
         [ 0.80951291],
         ..., 
         [ 1.46810663],
         [ 1.34394538],
         [ 1.24525583]],
 
        [[ 1.06067181],
         [ 3.2354908 ],
         [ 2.59009552],
         ..., 
         [ 3.16489244],
         [ 2.15972066],
         [ 2.89746737]],
 
        ..., 
        [[-0.77927196],
         [ 4.34345245],
         [ 4.36018896],
         ..., 
         [ 1.96323991],
         [ 1.11490548],
         [ 1.17736089]],
 
        [[ 2.54292154],
         [ 3.39738393],
         [ 2.40628457],
         ..., 
         [ 4.06853819],
         [ 4.3390522 ],
         [ 2.8615582 ]],
 
        [[ 0.96355903],
         [ 3.70682192],
         [ 4.32738352],
         ..., 
         [ 3.46363997],
      

In [60]:
input_shape = (99, 81, 1) # in order to fit into Conv2D layer, we need to reshape it.
nclass = 12

In [61]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, random_state=1130) # 9:1로 train, valid 셋 나눔.

In [62]:
## Modeling
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

In [63]:
model.compile(optimizer=opt, loss=losses.binary_crossentropy, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 99, 81, 1)         0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 99, 81, 1)         4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 80, 8)         40        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 97, 79, 8)         264       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 39, 8)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 39, 8)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 46, 37, 16)        1168      
__________

In [64]:
# 네트워크 시각화
plot_model(model, to_file='output/model_plot.png', show_shapes=True, show_layer_names=True)

In [66]:
model.fit(X_train, y_train, batch_size=1024, validation_data=(X_valid, y_valid), epochs=10, shuffle=True, verbose=1,)
model.save(os.path.join(OUTPUT_PATH, 'cnn.model.h5'))

Train on 61598 samples, validate on 3243 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [68]:
model.save(os.path.join(OUTPUT_PATH, 'cnn_model.h5'))