In [1]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras.utils.np_utils import to_categorical
from keras import optimizers, losses, activations, models
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

Using TensorFlow backend.


In [2]:
# Sample rate (Hz) the audio is resampled to before the spectrogram is computed.
new_sample_rate = 8000

# Class names. The position of a name in this list is the class id used when
# predictions are decoded with LABELS[i] in the evaluation cells.
LABELS = [
    '_silence', '_unknown',
    'down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes',
]

# Input / output locations, relative to the notebook's working directory.
TRAIN_PATH = './input/train/audio/'
OUTPUT_PATH = './output/'

In [6]:
## custom_fft and log_specgram functions written by DavidS.
def custom_fft(y, fs):
    '''
    Compute a one-sided amplitude spectrum.

    y  : NumPy array holding the signal; N is taken from y.shape[0].
    fs : sampling rate in Hz.

    Returns (xf, vals): N//2 frequencies evenly spaced between 0 and fs/2,
    and the matching magnitudes 2/N * |FFT(y)| for the first N//2 bins.
    '''
    n_points = y.shape[0]
    sample_period = 1.0 / fs
    half = n_points // 2

    spectrum = fft(y)
    freq_axis = np.linspace(0.0, 1.0 / (2.0 * sample_period), half)
    # The spectrum is symmetric, so only the first half of the bins is kept.
    # FFT values are complex; abs() turns them into real magnitudes.
    magnitudes = 2.0 / n_points * np.abs(spectrum[0:half])
    return freq_axis, magnitudes

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    '''
    Log-power spectrogram of a 1-D signal.

    audio       : 1-D array of samples.
    sample_rate : sampling rate of `audio` in Hz.
    window_size : analysis window length in milliseconds.
    step_size   : hop between consecutive windows in milliseconds.
    eps         : small constant added before the log to avoid log(0).

    Returns (freqs, times, log_spec) where log_spec is float32 with shape
    (n_frames, n_freqs), i.e. the scipy spectrogram transposed.

    Raises ValueError if the hop is not in the range (0, window].
    '''
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be positive and no larger than window_size '
            '(got window=%d samples, step=%d samples)' % (nperseg, step))
    # scipy takes the number of samples shared by consecutive windows, not the
    # hop. Passing the hop directly is only right when window == 2 * step.
    noverlap = nperseg - step
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [7]:
## utility function to grab all wav files inside train data folder.
def list_wavs_fname(dirpath, ext='wav'):
    '''
    List the audio files stored one folder below `dirpath`.

    dirpath : root folder; files are expected at <dirpath>/<label>/<name>.<ext>.
    ext     : file extension without the leading dot.

    Returns (labels, fnames): two lists of equal length where labels[i] is the
    name of the folder containing fnames[i]. Entries are ordered by sorted
    path so repeated runs see the files in the same order.

    Also prints `dirpath`.
    '''
    print(dirpath)
    fpaths = sorted(glob(os.path.join(dirpath, '*', '*.' + ext)))
    labels = []
    fnames = []
    # Both lists are filled in the same pass so they can never get out of
    # step with each other; os.path handles the platform's path separator.
    for fpath in fpaths:
        parent, fname = os.path.split(fpath)
        labels.append(os.path.basename(parent))
        fnames.append(fname)
    return labels, fnames

In [8]:
def pad_audio(samples, L=16000):
    '''
    Left-pad a clip with zeros so that it is at least L samples long.

    Clips that already hold L or more samples are returned as-is (same
    object). Shorter clips get (L - len(samples)) zeros inserted in front of
    the audio and nothing appended after it, so the result has length L.
    '''
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # pad_width=(before, after): all padding goes before the signal.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    '''
    Yield `num` randomly positioned chunks of length L cut from `samples`.

    Intended for recordings longer than L samples (e.g. the wav files in the
    background-noise folder). Start offsets are drawn with np.random.randint
    from every valid position 0 .. len(samples) - L inclusive, so a clip of
    exactly L samples is yielded unchanged `num` times.

    Raises ValueError (on first iteration, since this is a generator) when
    `samples` is shorter than L.
    '''
    max_start = len(samples) - L
    if max_start < 0:
        raise ValueError(
            'cannot chop %d samples into chunks of length %d' % (len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive, hence the + 1.
        beg = np.random.randint(0, max_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels, classes=None):
    '''
    Normalise raw folder labels and encode them as integer class ids.

    labels  : iterable of raw label strings.
    classes : ordered list of class names; the id of a class is its position
              in this list. Defaults to the module-level LABELS. Must contain
              '_silence' and '_unknown' if those cases can occur.

    '_background_noise_' is mapped to '_silence'; any label not in `classes`
    is mapped to '_unknown'.

    Returns a 1-D int64 NumPy array of class ids. The ids do not depend on
    which classes happen to be present in `labels`, so they always agree with
    classes[i] lookups when predictions are decoded.

    Raises KeyError if '_silence' / '_unknown' is needed but missing from
    `classes`.
    '''
    if classes is None:
        classes = LABELS
    index_of = {name: idx for idx, name in enumerate(classes)}
    encoded = []
    for label in labels:
        if label == '_background_noise_':
            label = '_silence'
        elif label not in index_of:
            label = '_unknown'
        encoded.append(index_of[label])
    return np.array(encoded, dtype=np.int64)

# 1. Load Data & Preprocessing

### 1) Load labels, fnames

In [9]:
# Collect, for every training wav under TRAIN_PATH, the name of its folder
# (used as the label) and its file name.
labels, fnames = list_wavs_fname(TRAIN_PATH)

./input/train/audio/


### 2) Feature Extraction

In [10]:
%%time
# X collects one log-spectrogram per clip, y the matching raw folder label.
X, y = [], []

for label, fname in zip(labels, fnames):
    wav_path = os.path.join(TRAIN_PATH, label, fname)
    sample_rate, samples = wavfile.read(wav_path)
    # Short clips are zero-padded to 16000 samples; recordings longer than
    # that are cut into several random 16000-sample chunks instead.
    samples = pad_audio(samples)
    n_samples = chop_audio(samples) if len(samples) > 16000 else [samples]
    for samples in n_samples:
        # Resample to new_sample_rate before computing the spectrogram.
        target_len = int(new_sample_rate / sample_rate * samples.shape[0])
        resampled = signal.resample(samples, target_len)
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        X.append(specgram)
        y.append(label)



CPU times: user 1min 2s, sys: 16.7 s, total: 1min 19s
Wall time: 1min 33s


In [11]:
# Stack the spectrograms and append a channel axis,
# e.g. reshape to (64841, 99, 81, 1) for the full training set.
X = np.array(X)
X = X.reshape(tuple(list(X.shape) + [1]))
# Integer class ids -> one-hot rows. The width is pinned to len(LABELS) so it
# always matches the model's output layer, even if the highest class id is
# missing from the data (to_categorical otherwise infers it from max(id) + 1).
y = to_categorical(label_transform(y), num_classes=len(LABELS))

### 3) Train Validation Set Split

In [25]:
# Hold out 20% of the samples for validation (80:20 split, not 9:1);
# random_state is fixed so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1130)

# 2. Modeling

In [26]:
# Per-sample input shape for the Conv2D stack, taken from the data itself so
# it stays correct if the sample rate or spectrogram settings change
# ((99, 81, 1) with the settings used in this notebook).
input_shape = X_train.shape[1:]
# One output unit per entry in LABELS (12 here).
nclass = len(LABELS)

In [27]:
## Modeling
# CNN built with the Keras functional API on top of the spectrogram input.
inp = Input(shape=input_shape)
# Batch-normalise the input inside the network.
norm_inp = BatchNormalization()(inp)
# Block 1: two 2x2 convolutions with 8 filters, 2x2 max-pooling, dropout.
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
# Block 2: two 3x3 convolutions with 16 filters, 2x2 max-pooling, dropout.
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
# Block 3: one 3x3 convolution with 32 filters, 2x2 max-pooling, dropout.
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
# Flatten the feature maps for the dense classifier head.
img_1 = Flatten()(img_1)


# Classifier head: two Dense(128, relu) layers, each followed by batch
# normalisation, then a softmax over the nclass output classes.
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

# Alternative, wider dense head (disabled):
# dense_1 = BatchNormalization()(Dense(512, activation=activations.relu)(img_1))
# dense_1 = Dropout(rate=0.2)(dense_1)
# dense_1 = Dense(512, activation=activations.relu)(dense_1)
# dense_1 = Dropout(rate=0.2)(dense_1)
# dense_1 = Dense(128, activation=activations.relu)(dense_1)
# dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam(lr=0.001)

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# as the reported metric.
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 99, 81, 1)         0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 99, 81, 1)         4         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 98, 80, 8)         40        
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 97, 79, 8)         264       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 48, 39, 8)         0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 48, 39, 8)         0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 46, 37, 16)        1168      
__________

In [14]:
# 네트워크 시각화
# plot_model(model, to_file='output/model_plot.png', show_shapes=True, show_layer_names=True)

In [28]:
# Train for 20 epochs on the training split, reporting validation metrics
# after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=1024,
    validation_data=(X_valid, y_valid),
    epochs=20,
    shuffle=True,
    verbose=1,
)
# Disabled variant: train on all data for 100 epochs and save the weights.
# model.fit(X, y, batch_size=1024, epochs=100, shuffle=True, verbose=1,)
# model.save(os.path.join(OUTPUT_PATH, 'cnn_custom_epoch100.h5'))

Train on 51872 samples, validate on 12969 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x19e20df28>

# 3. Validation

In [29]:
# Class probabilities for every validation sample (one softmax row each).
preds_proba = model.predict(X_valid, batch_size=1024, verbose=1)



In [30]:
# Decode the arg-max class id of each prediction / one-hot target to its name.
pred_ids = np.argmax(preds_proba, axis=1)
true_ids = np.argmax(y_valid, axis=1)
preds = [LABELS[i] for i in pred_ids]
actuals = [LABELS[i] for i in true_ids]

# Overall accuracy on the validation split.
n_correct = np.sum(np.array(actuals) == np.array(preds))
accuracy = n_correct / float(len(actuals))
print('* 정확도 : %.5f' % accuracy)

# Categoricals with the full category list keep every class in the
# confusion matrix, including classes that never occur.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)

separator = '\n -------------------------- \n'
print(separator)
confusion = pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds'])
print(confusion)
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.93600

 -------------------------- 

preds     _silence  _unknown  down   go  left   no  off   on  right  stop  \
actuals                                                                     
_silence        16         6     0    0     0    0    0    0      0     3   
_unknown         2      8081    39   24    14   19   14   25     10    15   
down             0        34   411    4     0   11    0    0      0     3   
go               0        48    32  334     0   30    3    2      0     6   
left             0        44     0    0   412    0    1    0      1     1   
no               0        39    11    8     0  390    1    0      0     1   
off              0        32     0    0     0    0  438   14      0     4   
on               0        44     0    0     0    0    6  412      0     0   
right            0        62     1    1     7    0    0    1    369     0   
stop             0        37     2    0     0    0    4    0      0   439   
up               0        26 

In [179]:
# NOTE(review): this cell is a line-for-line copy of the evaluation cell
# above. Its execution count and saved output differ from that cell, so it
# appears to be left over from a different run - consider deleting it.
# Decode predicted / true class ids to label names.
preds = [LABELS[i] for i in np.argmax(preds_proba, axis=1)]
actuals = [LABELS[i] for i in np.argmax(y_valid, axis=1)]
# Overall accuracy.
print('* 정확도 : %.5f' % (np.sum(np.array(actuals) == np.array(preds)) / float(len(actuals))))
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
print('\n -------------------------- \n')
# Confusion matrix (rows: actual class, columns: predicted class).
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print('\n -------------------------- \n')
print(classification_report(actuals, preds))

* 정확도 : 0.90166

 -------------------------- 

preds     _silence  _unknown  down   go  left   no  off   on  right  stop  \
actuals                                                                     
_silence        12        23     0    0     0    0    0    0      0     0   
_unknown         0     11922    32   28    16   16   12   45     96    58   
down             1       120   512    7     0   23    0    1      0    17   
go               0       144    30  412     0   65    0    0      1    11   
left             0        92     0    0   547    0    1    0      8     0   
no               0        97    24   18     0  553    3    0      0     6   
off              0        53     0    1     0    0  547   11      0     2   
on               0       127     0    1     0    0   17  544      0     1   
right            0        67     0    0     4    0    0    2    615     0   
stop             0        46     1    0     0    0    1    0      0   602   
up               0        36 

In [20]:
# Decode the one-hot targets of the whole dataset (y, not only the
# validation split) back to label names. This overwrites `actuals`.
actuals = [LABELS[i] for i in np.argmax(y, axis=1)]

In [23]:
# Number of samples per class, most frequent first.
pd.Series(actuals).value_counts()

_unknown    41039
stop         2380
yes          2377
up           2375
no           2375
go           2372
on           2367
right        2367
down         2359
off          2357
left         2353
_silence      120
dtype: int64