In [49]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras import optimizers, losses, activations, models
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

In [50]:
# Target sample rate (Hz) that every clip is resampled to before the spectrogram is computed.
new_sample_rate = 8000

# Class names; the position of a name in this list is used as its class index
# when predictions are decoded with LABELS[i].
LABELS = [
    '_silence', '_unknown',
    'down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes',
]

# Data locations and the name used for saved artefacts.
TRAIN_PATH = './input/train/audio/'
TEST_PATH = './input/test/audio/'
OUTPUT_PATH = './output/'
MODEL_NAME = 'cnn_vad_300.h5'

# Per-class loss weights handed to model.fit, keyed by class index:
# index 0 gets 12.0, index 1 gets 1.0, and indices 2..11 all get 9.0.
class_weight = {0: 12.0, 1: 1.0}
class_weight.update({class_idx: 9.0 for class_idx in range(2, 12)})

In [3]:
# Number of samples per encoded class index (training + validation samples combined):
# {0: 1200,
#  1: 41039,
#  2: 2359,
#  3: 2372,
#  4: 2353,
#  5: 2375,
#  6: 2357,
#  7: 2367,
#  8: 2367,
#  9: 2380,
#  10: 2375,
#  11: 2377}

In [51]:
def create_class_weight(labels_dict, mu=0.15):
    """Derive log-smoothed class weights from per-class sample counts.

    Each class gets the raw score ``log1p(mu * total / count)``; all scores are
    then multiplied by the reciprocal of the smallest score, so the class with
    the smallest score ends up with a weight of (approximately) 1.

    Args:
        labels_dict: mapping of class key -> number of samples in that class.
        mu: scaling factor applied to ``total / count`` before ``log1p``.

    Returns:
        dict mapping every key of ``labels_dict`` to its rescaled weight.
    """
    total = np.sum(list(labels_dict.values()))

    raw_scores = {
        key: np.log1p(mu * total / float(count))
        for key, count in labels_dict.items()
    }

    # Rescale so that the smallest raw score becomes the unit weight.
    rate = 1 / min(raw_scores.values())
    return {key: score * rate for key, score in raw_scores.items()}

In [52]:
# class_weight = create_class_weight(labels_dict=t, mu=0.1)
# class_weight

In [53]:
## custom_fft and log_specgram functions written by DavidS.
def custom_fft(y, fs):
    """Return the one-sided amplitude spectrum of ``y``.

    Args:
        y: 1-D signal (its length is taken from ``y.shape[0]``).
        fs: sampling frequency of ``y``.

    Returns:
        (xf, vals): ``xf`` is ``N // 2`` evenly spaced frequencies from 0 to
        ``fs / 2``; ``vals`` is the magnitude of the first ``N // 2`` FFT
        coefficients scaled by ``2 / N``.
    """
    n_points = y.shape[0]
    half = n_points // 2
    sample_period = 1.0 / fs

    spectrum = fft(y)
    # The FFT of a real signal is symmetric, so only the first half is kept.
    xf = np.linspace(0.0, 1.0/(2.0*sample_period), half)
    # FFT coefficients are complex; abs() turns them into real magnitudes.
    vals = 2.0/n_points * np.abs(spectrum[0:half])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Compute a log-power spectrogram with a Hann window.

    Args:
        audio: 1-D signal.
        sample_rate: sampling rate of ``audio`` in Hz.
        window_size: analysis window length in milliseconds.
        step_size: hop between consecutive windows in milliseconds.
        eps: small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        (freqs, times, log_spec) where ``log_spec`` is the float32 spectrogram
        transposed to shape ``(n_frames, n_freqs)`` and passed through ``log``.

    Raises:
        ValueError: if the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            'step_size must be positive and no longer than window_size '
            '(got window_size=%r, step_size=%r)' % (window_size, step_size))
    # scipy expects the number of *overlapping* samples, i.e. window minus hop.
    # Passing the hop itself is only right when the hop is exactly half the
    # window (which is the case for the defaults, so they are unaffected).
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [54]:
## utility function to grab all wav files inside train data folder.
def list_wavs_fname(dirpath, ext='wav'):
    """List the audio files found one directory level below ``dirpath``.

    Only paths of the form ``<dirpath>/<label>/<name>.<ext>`` are kept, where
    both ``<label>`` and ``<name>`` consist solely of word characters
    (letters, digits, underscore).

    Args:
        dirpath: root directory whose sub-directories are the class folders.
        ext: file extension (without the dot) to look for.

    Returns:
        (labels, fnames): two lists of equal length; ``labels[i]`` is the
        parent folder name and ``fnames[i]`` the file name of the i-th match.
    """
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, '*', '*.' + ext))

    # One pattern captures label and file name together, so a path is either
    # contributed to both lists or to neither and the lists cannot drift apart.
    pat = re.compile(r'.+/(\w+)/(\w+\.' + re.escape(ext) + r')$')
    labels = []
    fnames = []
    for fpath in fpaths:
        # Normalise the separator so the pattern also works where os.sep != '/'.
        r = pat.match(fpath.replace(os.sep, '/'))
        if r:
            labels.append(r.group(1))
            fnames.append(r.group(2))
    return labels, fnames

In [55]:
def pad_audio(samples, L=16000):
    '''
    Left-pad a clip with zeros so that it is at least L samples long.

    Clips that already have L or more samples are returned unchanged (the same
    object). Shorter clips get (L - len(samples)) zeros inserted at the front
    and nothing appended at the end, so the result has exactly L samples.
    '''
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # pad_width=(before, after): all of the padding goes before the signal.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=200):
    '''
    Yield `num` randomly positioned chunks of length L taken from `samples`.

    Intended for recordings longer than L samples; every chunk is a contiguous
    slice samples[beg: beg + L] with `beg` drawn uniformly from all valid start
    offsets, 0 .. len(samples) - L inclusive.

    Raises ValueError (on first iteration) if `samples` is shorter than L.
    '''
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError('samples has %d elements, fewer than L=%d' % (len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive, hence the +1: this makes the final
        # offset reachable and lets a clip of exactly L samples be "chopped".
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    '''
    Normalise raw folder labels and encode them as integer class indices.

    '_background_noise_' is mapped to '_silence', any label that is not in
    LABELS is mapped to '_unknown', and every resulting name is replaced by its
    position in LABELS. (One-hot encoding is NOT done here.)

    The index is taken from LABELS directly instead of from an encoder fitted on
    the data: LABELS is already in sorted order, so the result is the same when
    every class is present, but the indices no longer shift when some class is
    missing from `labels` -- which matters because predictions are decoded
    elsewhere with LABELS[i].

    Returns a 1-D integer numpy array with one entry per input label.
    '''
    class_index = {name: idx for idx, name in enumerate(LABELS)}
    silence_idx = class_index['_silence']
    unknown_idx = class_index['_unknown']

    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append(silence_idx)
        else:
            nlabels.append(class_index.get(label, unknown_idx))
    return np.array(nlabels, dtype=np.int64)

# 1. Load Data & Preprocessing

### 1) Load labels, fnames

In [56]:
# Collect, for every training clip under TRAIN_PATH, its class-folder name and its file name
# (two parallel lists).
labels, fnames = list_wavs_fname(TRAIN_PATH)

./input/train/audio/


### 2) Feature Extraction

In [57]:
%%time
# Build the training set: one log-spectrogram (X) and one raw folder label (y) per clip.
y = []
X = []

for label, fname in zip(labels, fnames):
    wav_path = os.path.join(TRAIN_PATH, label, fname)
    sample_rate, samples = wavfile.read(wav_path)
    samples = pad_audio(samples)

    # Recordings longer than 16000 samples are cut into random 16000-sample
    # chunks; everything else is used as a single clip.
    n_samples = chop_audio(samples) if len(samples) > 16000 else [samples]

    for samples in n_samples:
        # Resample to new_sample_rate before computing the spectrogram.
        target_len = int(new_sample_rate / sample_rate * samples.shape[0])
        resampled = signal.resample(samples, target_len)
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y.append(label)
        X.append(specgram)

CPU times: user 1min 1s, sys: 1.13 s, total: 1min 2s
Wall time: 1min 1s




In [58]:
# Stack the spectrograms and append a trailing channel axis for the Conv2D layers,
# i.e. (n_clips, n_frames, n_freqs) -> (n_clips, n_frames, n_freqs, 1).
X = np.array(X)
X = X.reshape(tuple(list(X.shape) + [1]))
# One-hot encode the targets. The width is pinned to len(LABELS) so it always
# matches the network's output layer; without num_classes, to_categorical sizes
# the matrix from the largest label index that happens to be present.
y = to_categorical(label_transform(y), num_classes=len(LABELS))

### 3) Train Validation Set Split

In [59]:
# Hold out 5% of the samples for validation (95:5 split) with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.05,
    random_state=1130,
)
# The full arrays are no longer needed once the split exists; drop them and collect.
del X, y
gc.collect()

14

In [60]:
# model = load_model(os.path.join(OUTPUT_PATH, MODEL_NAME))

# 2. Modeling

In [64]:
# Shape of one network input: a single spectrogram with a trailing channel axis of size 1.
# NOTE(review): (99, 81) is presumably (time frames, frequency bins) for a 1-second clip
# at new_sample_rate -- confirm against the feature-extraction output.
input_shape = (99, 81, 1) # in order to fit into Conv2D layer, we need to reshape it.
# Number of output classes; matches the 12 entries of LABELS.
nclass = 12

In [63]:
## Modeling
# Convolutional front end. Each stage is (filters, kernel size, number of
# convolutions): that many same-padded ReLU convolutions, then 2x2 max-pooling
# and 20% dropout. The raw input is batch-normalised first.
inp = Input(shape=input_shape)
img_1 = BatchNormalization()(inp)
for n_filters, kernel, n_convs in [(8, 2, 2), (16, 2, 2), (16, 3, 2), (32, 3, 1)]:
    for conv_idx in range(n_convs):
        img_1 = Convolution2D(n_filters, kernel_size=kernel,
                              activation=activations.relu, padding='same')(img_1)
    img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
    img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

# Fully connected head: batch-norm, four ReLU layers each followed by 20%
# dropout, one more ReLU layer without dropout, then the softmax classifier.
dense_1 = BatchNormalization()(img_1)
for units in (512, 512, 512, 128):
    dense_1 = Dense(units, activation=activations.relu)(dense_1)
    dense_1 = Dropout(rate=0.2)(dense_1)
dense_1 = Dense(64, activation=activations.relu)(dense_1)
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam(lr=0.001)

model.compile(
    optimizer=opt,
    loss=losses.categorical_crossentropy,
    metrics=['accuracy'],
)
model.summary()

ValueError: The shape of the input to "Flatten" is not fully defined (got (None, None, 32). Make sure to pass a complete "input_shape" or "batch_input_shape" argument to the first layer in your model.

In [13]:
# Network visualization
# plot_model(model, to_file='output/model_plot.png', show_shapes=True, show_layer_names=True)

In [62]:
%%time
# Train for 30 epochs with the class weights defined in the configuration cell,
# evaluating on the held-out validation split after every epoch.
model.fit(
    X_train, y_train,
    batch_size=2048,
    validation_data=(X_valid, y_valid),
    class_weight=class_weight,
    epochs=30,
    shuffle=True,
    verbose=1,
)
# model.save(os.path.join(OUTPUT_PATH, MODEL_NAME))

Train on 62624 samples, validate on 3297 samples
Epoch 1/30
Epoch 2/30

KeyboardInterrupt: 

In [42]:
# Alternative class weights for this run: index 0 -> 14.0, index 1 -> 1.0,
# every other class index -> 12.0. This rebinds the notebook-level class_weight.
class_weight = {0: 14.0,
 1: 1.0,
 2: 12.0,
 3: 12.0,
 4: 12.0,
 5: 12.0,
 6: 12.0,
 7: 12.0,
 8: 12.0,
 9: 12.0,
 10: 12.0,
 11: 12.0}

# Create the output directory *before* the long training run so that the
# model.save() call at the end cannot fail on a missing folder and lose the result.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Keeps training the current `model` object (it is not rebuilt in this cell).
model.fit(X_train, y_train, batch_size=2048, validation_data=(X_valid, y_valid),
          class_weight=class_weight, epochs=100, shuffle=True, verbose=1)
# NOTE(review): the file name saved here differs from MODEL_NAME, which is what
# names the submission CSV -- confirm this is intended.
model.save(os.path.join(OUTPUT_PATH, 'cnn_custom4_400.h5'))

Train on 62624 samples, validate on 3297 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

# 3. Validation

In [43]:
# Softmax outputs of the model for every validation sample (one row per sample).
preds_proba = model.predict(X_valid, batch_size=1024, verbose=1)



In [44]:
# Decode the arg-max class index of every prediction / one-hot target into its label name.
pred_idx = np.argmax(preds_proba, axis=1)
true_idx = np.argmax(y_valid, axis=1)
preds = [LABELS[idx] for idx in pred_idx]
actuals = [LABELS[idx] for idx in true_idx]

# Overall accuracy (the printed Korean word means "accuracy").
n_correct = np.sum(np.array(actuals) == np.array(preds))
print('* 정확도 : %.5f' % (n_correct / float(len(actuals))))

# Confusion table and per-class report, with categories declared in LABELS order.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
separator = '\n -------------------------- \n'
print(separator)
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.95208

 -------------------------- 

preds     _silence  _unknown  down  go  left  no  off   on  right  stop   up  \
actuals                                                                        
_silence        54         0     0   0     0   0    0    0      0     0    0   
_unknown         4      1969     9  14     6   7    8   15      6     7    9   
down             0         6   127   1     0   1    0    0      0     0    0   
go               0         4     2  88     0   2    0    0      1     1    0   
left             1         2     0   1   125   0    0    1      1     0    1   
no               0         2     2   6     1  96    0    3      0     0    0   
off              0         3     0   1     0   0  102    1      1     2    1   
on               0         0     0   0     0   0    0  103      0     0    0   
right            0         2     0   0     0   0    0    1    123     0    1   
stop             0         2     0   1     0   0    1    1      0   119  

In [16]:
# Decode the arg-max class index of every prediction / one-hot target into its label name.
pred_idx = np.argmax(preds_proba, axis=1)
true_idx = np.argmax(y_valid, axis=1)
preds = [LABELS[idx] for idx in pred_idx]
actuals = [LABELS[idx] for idx in true_idx]

# Overall accuracy (the printed Korean word means "accuracy").
n_correct = np.sum(np.array(actuals) == np.array(preds))
print('* 정확도 : %.5f' % (n_correct / float(len(actuals))))

# Confusion table and per-class report, with categories declared in LABELS order.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
separator = '\n -------------------------- \n'
print(separator)
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.95511

 -------------------------- 

preds     _silence  _unknown  down  go  left   no  off   on  right  stop   up  \
actuals                                                                         
_silence        54         0     0   0     0    0    0    0      0     0    0   
_unknown         1      1955     9  12    12    7    7   13     14    16    8   
down             0         0   132   3     0    0    0    0      0     0    0   
go               0         1     0  95     1    1    0    0      0     0    0   
left             0         2     0   0   127    0    0    0      0     3    1   
no               0         1     1   1     0  103    0    0      0     3    1   
off              1         2     0   0     0    0  103    1      0     3    1   
on               0         1     0   0     0    0    0  102      0     0    0   
right            0         4     0   0     1    0    0    0    121     0    1   
stop             0         0     1   1     0    0    0    0   

In [26]:
# Decode the arg-max class index of every prediction / one-hot target into its label name.
pred_idx = np.argmax(preds_proba, axis=1)
true_idx = np.argmax(y_valid, axis=1)
preds = [LABELS[idx] for idx in pred_idx]
actuals = [LABELS[idx] for idx in true_idx]

# Overall accuracy (the printed Korean word means "accuracy").
n_correct = np.sum(np.array(actuals) == np.array(preds))
print('* 정확도 : %.5f' % (n_correct / float(len(actuals))))

# Confusion table and per-class report, with categories declared in LABELS order.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
separator = '\n -------------------------- \n'
print(separator)
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.97028

 -------------------------- 

preds     _silence  _unknown  down  go  left   no  off  on  right  stop   up  \
actuals                                                                        
_silence        54         0     0   0     0    0    0   0      0     0    0   
_unknown         4      2029     0   7     1    9    2   1      0     0    4   
down             0         4   128   1     0    1    0   0      0     0    0   
go               0         5     4  87     0    2    0   0      0     0    0   
left             0         6     0   0   126    0    0   0      0     0    2   
no               0         4     2   0     0  103    0   0      0     1    0   
off              1         6     1   0     1    0  100   0      0     1    1   
on               0         4     0   0     0    0    1  98      0     0    0   
right            0         5     0   0     1    0    0   0    121     0    0   
stop             0         4     1   0     0    1    0   0      0   118  

In [29]:
# Decode the arg-max class index of every prediction / one-hot target into its label name.
pred_idx = np.argmax(preds_proba, axis=1)
true_idx = np.argmax(y_valid, axis=1)
preds = [LABELS[idx] for idx in pred_idx]
actuals = [LABELS[idx] for idx in true_idx]

# Overall accuracy (the printed Korean word means "accuracy").
n_correct = np.sum(np.array(actuals) == np.array(preds))
print('* 정확도 : %.5f' % (n_correct / float(len(actuals))))

# Confusion table and per-class report, with categories declared in LABELS order.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
separator = '\n -------------------------- \n'
print(separator)
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.94295

 -------------------------- 

preds     _silence  _unknown  down  go  left   no  off   on  right  stop   up  \
actuals                                                                         
_silence         4         1     0   0     0    0    0    0      0     0    0   
_unknown         0      2018     6   8     1    5    7    7      6     8    9   
down             0        10   107   3     0    1    0    0      0     1    0   
go               0         8     2  84     0    2    0    0      0     1    0   
left             0         5     0   0   109    0    0    0      1     0    1   
no               0        13     2   2     0  101    0    0      0     1    1   
off              0         3     0   0     0    0  102    1      0     0    1   
on               0        13     0   1     0    0    6  102      0     0    3   
right            0         9     0   0     2    0    0    0    107     0    0   
stop             0         8     0   0     0    0    0    1   

In [179]:
# Decode the arg-max class index of every prediction / one-hot target into its label name.
pred_idx = np.argmax(preds_proba, axis=1)
true_idx = np.argmax(y_valid, axis=1)
preds = [LABELS[idx] for idx in pred_idx]
actuals = [LABELS[idx] for idx in true_idx]

# Overall accuracy (the printed Korean word means "accuracy").
n_correct = np.sum(np.array(actuals) == np.array(preds))
print('* 정확도 : %.5f' % (n_correct / float(len(actuals))))

# Confusion table and per-class report, with categories declared in LABELS order.
preds = pd.Categorical(preds, categories=LABELS)
actuals = pd.Categorical(actuals, categories=LABELS)
separator = '\n -------------------------- \n'
print(separator)
print(pd.crosstab(actuals, preds, rownames=['actuals'], colnames=['preds']))
print(separator)
print(classification_report(actuals, preds))

* 정확도 : 0.90166

 -------------------------- 

preds     _silence  _unknown  down   go  left   no  off   on  right  stop  \
actuals                                                                     
_silence        12        23     0    0     0    0    0    0      0     0   
_unknown         0     11922    32   28    16   16   12   45     96    58   
down             1       120   512    7     0   23    0    1      0    17   
go               0       144    30  412     0   65    0    0      1    11   
left             0        92     0    0   547    0    1    0      8     0   
no               0        97    24   18     0  553    3    0      0     6   
off              0        53     0    1     0    0  547   11      0     2   
on               0       127     0    1     0    0   17  544      0     1   
right            0        67     0    0     4    0    0    2    615     0   
stop             0        46     1    0     0    0    1    0      0   602   
up               0        36 

In [38]:
%%time
# Build the submission inputs: exactly one log-spectrogram per test file, in the
# (sorted) order of submission_fpaths so predictions line up with file names.
X = []
submission_fpaths = sorted(glob(os.path.join(TEST_PATH, r'*wav')))
for fpath in submission_fpaths:
    sample_rate, samples = wavfile.read(fpath)
    # Left-pad short clips and truncate long ones to 16000 samples. Randomly
    # chopping a long clip (as done for training) would emit many spectrograms
    # for one file, breaking the 1:1 match with submission_fpaths and making
    # the predictions non-deterministic.
    samples = pad_audio(samples)[:16000]
    resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
    _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
    X.append(specgram)

assert len(X) == len(submission_fpaths), 'expected one spectrogram per test file'

# Stack and append the trailing channel axis expected by the network.
X = np.array(X)
X = X.reshape(tuple(list(X.shape) + [1]))

CPU times: user 2min 27s, sys: 3.75 s, total: 2min 30s
Wall time: 2min 28s


In [45]:
# Class probabilities for every test spectrogram.
preds_proba = model.predict(X, batch_size=512, verbose=1)
# Label names as written to the submission file: LABELS with underscores removed
# (e.g. '_silence' -> 'silence'). Built once instead of once per prediction.
submission_labels = [name.replace('_', '') for name in LABELS]
preds = [submission_labels[i] for i in np.argmax(preds_proba, axis=1)]



In [46]:
# Assemble the submission table: bare file name + predicted label for each test clip.
df = pd.DataFrame({'fname': submission_fpaths, 'label': preds})
# os.path.basename strips the directory part with whatever separator the platform uses.
df['fname'] = df['fname'].apply(os.path.basename)
# Make sure the target folder exists, then write 'sub_<model name without extension>.csv'.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_csv(os.path.join(OUTPUT_PATH, 'sub_' + MODEL_NAME.split('.')[0] + '.csv'), index=False)

In [47]:
# Distribution of predicted labels in the submission (sanity check).
df['label'].value_counts()

unknown    80458
on         15762
silence     7437
off         6784
no          6350
left        6336
go          6198
stop        6151
right       5991
up          5991
yes         5779
down        5301
Name: label, dtype: int64

In [23]:
# Distribution of predicted labels in the submission (sanity check).
df['label'].value_counts()

unknown    90829
on          7469
no          7168
silence     7137
off         6684
go          6179
down        6035
yes         5821
up          5658
left        5269
stop        5199
right       5090
Name: label, dtype: int64