In [1]:
from glob import glob
import re
import os
import numpy as np
import pandas as pd
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
import IPython.display as ipd

from keras.models import load_model
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
# --- Locations ---------------------------------------------------------
# Directory that is globbed for the test clips.
TEST_PATH = './input/test/audio/'
# Directory the trained model is loaded from and the submission CSV is written to.
OUTPUT_PATH = './output/'
# File extension of the audio clips.
EXT = 'wav'

# --- Audio parameters --------------------------------------------------
# Target clip length in samples (pad_audio pads shorter clips up to this).
L = 16000
# Sample rate the clips are resampled to before the spectrogram is computed.
new_sample_rate = 8000

# --- Classes -----------------------------------------------------------
# Class names; position k is the name used for model output column k.
LABELS = [
    '_silence',
    '_unknown',
    'down',
    'go',
    'left',
    'no',
    'off',
    'on',
    'right',
    'stop',
    'up',
    'yes',
]

In [3]:
def pad_audio(samples, target_len=None):
    '''
    Left-pad a clip with zeros so that it is at least ``target_len`` samples long.

    Parameters
    ----------
    samples : 1-D array of audio samples.
    target_len : int, optional
        Minimum length of the result. When omitted, the module-level ``L``
        is used, so existing ``pad_audio(samples)`` calls behave as before.

    Returns
    -------
    ``samples`` itself when it already holds ``target_len`` or more samples
    (longer clips are NOT truncated); otherwise a new array of exactly
    ``target_len`` samples whose leading entries are zeros.
    '''
    if target_len is None:
        target_len = L
    missing = target_len - len(samples)
    if missing <= 0:
        return samples
    # pad_width=(missing, 0) puts every zero in front of the clip and none
    # after it, so the result is exactly target_len long.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=0)

def chop_audio(samples, L=16000, num=20):
    '''
    Yield ``num`` randomly positioned chunks of length ``L`` cut from ``samples``.

    Intended for recordings longer than ``L`` samples (e.g. the wav files in
    the background-noise folder), to turn one long file into several clips.

    Parameters
    ----------
    samples : 1-D array of audio samples, at least ``L`` long.
    L : int
        Length of every yielded chunk.
    num : int
        Number of chunks to yield.

    Raises
    ------
    ValueError
        If ``samples`` is shorter than ``L`` (raised when iteration starts).
    '''
    last_start = len(samples) - L
    if last_start < 0:
        raise ValueError(
            'samples has %d entries, need at least %d to cut a chunk' % (len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive, hence the +1: the chunk that
        # ends on the final sample must be reachable, and a clip of exactly
        # L samples must yield itself instead of failing.
        beg = np.random.randint(0, last_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    '''
    Normalise raw labels and one-hot encode them (dummy variables).

    '_background_noise_' is mapped to 'silence', every label that is not in
    ``legal_labels`` is mapped to 'unknown', and all other labels are kept.
    The normalised labels are returned as a ``pd.get_dummies`` frame.

    NOTE(review): ``legal_labels`` is read from module scope and is not
    defined in the cells visible here - confirm it exists before calling.
    '''
    def _normalise(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in legal_labels else 'unknown'

    normalised = [_normalise(label) for label in labels]
    return pd.get_dummies(pd.Series(normalised))


## custom_fft and log_specgram functions written by DavidS.
def custom_fft(y, fs):
    '''
    One-sided amplitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : 1-D array of samples.
    fs : sampling rate of ``y``.

    Returns
    -------
    xf : frequencies of the first ``N // 2`` FFT bins, ``k * fs / N``.
    vals : ``2 / N * |FFT(y)|`` for those bins (the DC bin is scaled by the
        same factor as every other bin).
    '''
    N = y.shape[0]
    half = N // 2
    yf = fft(y)
    # Bin k of an N-point FFT is centred on k * fs / N. A linspace from 0 to
    # fs / 2 over N // 2 points stretches that axis and reports peaks at the
    # wrong frequency, so the bin centres are computed directly.
    xf = np.arange(half) * (float(fs) / N)
    # The FFT of a real signal is symmetric, so only the first half is kept;
    # abs() turns the complex coefficients into magnitudes.
    vals = 2.0 / N * np.abs(yf[:half])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    '''
    Log-spectrogram of ``audio`` computed with a Hann window.

    Parameters
    ----------
    audio : 1-D array of samples.
    sample_rate : sampling rate of ``audio``.
    window_size : segment length in milliseconds.
    step_size : duration in milliseconds that is handed to
        ``signal.spectrogram`` as ``noverlap`` (the overlap between segments).
    eps : small constant added before the logarithm to avoid ``log(0)``.

    Returns
    -------
    freqs, times : the axes returned by ``signal.spectrogram``.
    log_spec : float32 array, the transposed spectrogram after ``log``.
    '''
    def _ms_to_samples(duration_ms):
        # Convert a duration in milliseconds into a whole number of samples.
        return int(round(duration_ms * sample_rate / 1e3))

    freqs, times, spec = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=_ms_to_samples(window_size),
        noverlap=_ms_to_samples(step_size),
        detrend=False,
    )
    log_spec = np.log(spec.T.astype(np.float32) + eps)
    return freqs, times, log_spec


In [4]:
# glob returns matches in arbitrary, OS-dependent order. Later cells pair
# predictions with this list by position, so the order is pinned by sorting.
# The dot is part of the pattern so that only real '.wav' files match.
submission_fpaths = sorted(glob(os.path.join(TEST_PATH, '*.' + EXT)))

In [5]:
# Preview the first ten matched paths.
submission_fpaths[:10]

['./input/test/audio/clip_000044442.wav',
 './input/test/audio/clip_0000adecb.wav',
 './input/test/audio/clip_0000d4322.wav',
 './input/test/audio/clip_0000fb6fe.wav',
 './input/test/audio/clip_0001d1559.wav',
 './input/test/audio/clip_0002256ed.wav',
 './input/test/audio/clip_0002a4a1f.wav',
 './input/test/audio/clip_0002d9b83.wav',
 './input/test/audio/clip_000373a5b.wav',
 './input/test/audio/clip_0003c7122.wav']

In [6]:
# Capture the bare file name (word characters + '.' + EXT) that follows the
# last '/' of a path; tried here on the first test path only.
pat = r'.+/(\w+\.' + EXT + ')$'
r = re.compile(pat).match(submission_fpaths[0])

In [7]:
# Group 1 is the file name captured by `pat`.
r.group(1)

'clip_000044442.wav'

In [46]:
%%time
X = []
for fpath in submission_fpaths:
    sample_rate, samples = wavfile.read(fpath)
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: 
        n_samples = [samples]
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        X.append(specgram)

CPU times: user 2min 29s, sys: 33.4 s, total: 3min 3s
Wall time: 3min 34s


In [47]:
# Stack the per-clip spectrograms into one array and append a trailing axis
# of size 1. NOTE: X is rebound in place, so re-running this cell appends yet
# another axis.
X = np.array(X)[..., np.newaxis]

In [48]:
# Score every spectrogram in X with the model.
# NOTE(review): `model` is only assigned by the load_model cell that appears
# further down in this notebook; on a fresh kernel this cell fails with a
# NameError unless that cell is run first.
preds_proba = model.predict(X, batch_size=1024, verbose=1)



In [58]:
# The column with the highest score in each row is the predicted class;
# LABELS turns that column index into the class name.
top_class = np.argmax(preds_proba, axis=1)
preds = [LABELS[class_id] for class_id in top_class]

In [8]:
# Show the class names in model-output column order.
LABELS

['_silence',
 '_unknown',
 'down',
 'go',
 'left',
 'no',
 'off',
 'on',
 'right',
 'stop',
 'up',
 'yes']

In [71]:
# Spot-check a single test clip by ear. `i` is a hand-picked position in
# submission_fpaths and is reused below to look up the same row of preds_proba.
i = 1225
sample_rate, samples = wavfile.read(submission_fpaths[i])
ipd.Audio(samples, rate=sample_rate)

In [72]:
# Per-class scores for row i; the columns follow the order of LABELS.
list(preds_proba[i])

[0.0069095492,
 0.82243466,
 0.0035084886,
 0.0068927505,
 1.5634483e-05,
 0.0002596408,
 0.003939827,
 0.0019416556,
 8.4403746e-06,
 0.099022947,
 0.055015635,
 5.087009e-05]

In [62]:
# How many clips were assigned to each predicted label.
pd.Series(preds).value_counts()

_unknown    99781
no           6897
off          6196
up           6076
stop         5737
yes          5473
on           5360
left         5051
right        4652
_silence     4535
go           4442
down         4338
dtype: int64

In [109]:
# Wrap `res` in a DataFrame and show its first ten rows.
# NOTE(review): `res` is not assigned in any cell visible in this notebook;
# this cell only works with leftover kernel state - confirm where it comes from.
df_res = pd.DataFrame(res)
df_res[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.004886,0.334909,0.003579,0.267606,0.007844,0.005214,0.011782,0.003972,0.004569,0.332921,0.003513,0.019205
1,0.000262,5.3e-05,5.6e-05,1.8e-05,9.1e-05,0.000465,9.5e-05,4.3e-05,3.2e-05,0.998098,0.000741,4.6e-05
2,0.035625,0.065441,3.6e-05,0.007289,0.003101,0.018397,0.00018,0.000233,0.01277,0.851233,0.005168,0.000529
3,0.047504,0.012185,0.042488,0.004929,0.026152,0.017004,0.073481,0.022707,0.01455,0.548025,0.168084,0.022891
4,1.1e-05,4.4e-05,2e-05,2.9e-05,4e-06,8.2e-05,0.000489,9e-06,7e-06,0.999253,9e-06,4.3e-05
5,0.000894,0.000281,0.000276,0.000255,0.00027,0.001979,0.000335,0.000111,0.000186,0.994502,0.000401,0.00051
6,0.000495,0.00017,0.000137,5.9e-05,0.007666,0.255696,0.000648,0.000136,0.000306,0.731618,0.002158,0.000911
7,0.000189,0.000207,8.5e-05,8.2e-05,7e-06,4.2e-05,0.001039,5e-05,7.5e-05,0.998051,0.000116,5.7e-05
8,0.005667,0.409791,0.000664,0.289382,0.009465,0.001195,0.001793,0.002747,0.010495,0.263403,0.00217,0.003227
9,0.027107,0.006739,0.011709,0.003565,0.018217,0.034465,0.034676,0.003737,0.00339,0.826866,0.016317,0.013211


In [9]:
# Load the trained Keras model from OUTPUT_PATH.
# NOTE(review): a cell placed earlier in the notebook already calls
# model.predict; this cell has to run before it on a fresh kernel.
model = load_model(os.path.join(OUTPUT_PATH, 'cnn_baseline_epoch20.h5'))

In [10]:
def test_data_generator(batch=16):
    '''
    Yield the test clips as ``(fnames, imgs)`` batches of log-spectrograms.

    Every wav file under ``TEST_PATH`` is read, padded with ``pad_audio``,
    resampled to ``new_sample_rate`` and converted with ``log_specgram``.

    Parameters
    ----------
    batch : int
        Number of clips per yielded batch; only the final batch may be smaller.

    Yields
    ------
    fnames : list of str
        Base file names of the clips in the batch.
    imgs : numpy.ndarray
        The stacked spectrograms with a trailing axis of size 1 appended.
    '''
    def _as_batch(specgrams):
        # Stack the spectrograms and append a trailing axis of size 1.
        stacked = np.array(specgrams)
        return stacked.reshape(tuple(list(stacked.shape) + [1]))

    # glob returns paths in arbitrary order; sort so that runs are reproducible.
    fpaths = sorted(glob(os.path.join(TEST_PATH, '*wav')))
    imgs, fnames = [], []
    for path in fpaths:
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        fnames.append(os.path.basename(path))
        if len(imgs) == batch:
            yield fnames, _as_batch(imgs)
            # Start fresh lists so that a full batch is never yielded twice.
            imgs, fnames = [], []
    # Flush the final, partially filled batch. Nothing is yielded when the
    # file count is an exact multiple of `batch` or when no file was found.
    if imgs:
        yield fnames, _as_batch(imgs)
    # The generator ends by simply returning: an explicit
    # `raise StopIteration()` inside a generator is turned into a
    # RuntimeError (PEP 479).

In [11]:
%%time 

index = []
results = []
for fnames, imgs in test_data_generator(batch=32):
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    predicts = [LABELS[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

df = pd.DataFrame(columns=['fname', 'label'])
df['fname'] = index
df['label'] = results
df.to_csv(os.path.join(OUTPUT_PATH, 'sub2.csv'), index=False)

  after removing the cwd from sys.path.


CPU times: user 14min 33s, sys: 1min 55s, total: 16min 29s
Wall time: 6min 43s


In [39]:
# Keep an independent copy of the submission frame.
df2 = df.copy()