In [1]:
import os
import numpy as np
np.random.seed(1984)
import tensorflow as tf
tf.set_random_seed(1984)

from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import GRU, Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from keras.callbacks import TensorBoard
from keras.models import Sequential
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

# Target clip length in samples; pad_audio() zero-pads shorter clips up to this length.
L = 16000
# Class names kept as-is by label_transform(); any other label becomes 'unknown'
# (except '_background_noise_', which is mapped to 'silence').
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

# NOTE(review): root_path / out_path / model_path are not referenced in the cells
# visible here -- presumably used elsewhere in the notebook; verify before removing.
root_path = r'..'
out_path = r'.'
model_path = r'.'
# NOTE(review): absolute, machine-specific dataset locations -- adjust before
# running this notebook on another machine.
train_data_path ='/home/dannyp/repos/tensorflow-samples/original/train/audio'
test_data_path = '/home/dannyp/repos/tensorflow-samples/original/test/audio'

def list_wavs_fname(dirpath, ext='wav'):
    """Collect index-aligned (label, file name) lists for audio files under ``dirpath``.

    The expected layout is ``<dirpath>/<label>/<name>.<ext>`` where both
    ``<label>`` and ``<name>`` consist of word characters only. Files that do
    not follow this layout are skipped entirely.

    Parameters
    ----------
    dirpath : str
        Directory whose immediate sub-directories are the label folders.
    ext : str
        File extension without the leading dot.

    Returns
    -------
    (labels, fnames) : tuple of two lists of str
        ``labels[i]`` is the folder name and ``fnames[i]`` the base file name
        of the i-th accepted file. Both lists always have the same length.
    """
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    # A single pattern captures label and file name together, so a path is
    # either accepted for both lists or for neither (two separate patterns
    # could accept a path for one list only and shift every later pair).
    pat = re.compile(r'(?:.*/)?(\w+)/(\w+\.' + re.escape(ext) + r')$')
    labels = []
    fnames = []
    for fpath in fpaths:
        # Normalise separators so the '/'-based pattern also works on Windows.
        m = pat.match(fpath.replace(os.sep, '/'))
        if m:
            labels.append(m.group(1))
            fnames.append(m.group(2))
    return labels, fnames

def pad_audio(samples, length=None):
    """Zero-pad ``samples`` at the front so it has at least ``length`` elements.

    Parameters
    ----------
    samples : 1-D array-like
        Audio samples.
    length : int, optional
        Minimum length of the result. Defaults to the module-level ``L``.

    Returns
    -------
    The input object itself when it is already long enough, otherwise a new
    array with the zeros inserted *before* the original samples.
    """
    target = L if length is None else length
    missing = target - len(samples)
    if missing <= 0:
        return samples
    # pad_width=(missing, 0): all padding goes in front, none at the end.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=1000):
    """Yield ``num`` randomly positioned windows of ``L`` consecutive samples.

    Parameters
    ----------
    samples : sequence supporting ``len`` and slicing
        The recording to cut windows from; must hold at least ``L`` samples.
    L : int
        Window length in samples.
    num : int
        Number of windows to yield.

    Yields
    ------
    A slice ``samples[beg: beg + L]`` with ``beg`` drawn uniformly from every
    valid start position, including the last one.

    Raises
    ------
    ValueError
        If ``samples`` is shorter than ``L`` (raised on first iteration).
    """
    max_start = len(samples) - L
    if max_start < 0:
        raise ValueError(
            'chop_audio needs at least %d samples, got %d' % (L, len(samples)))
    for _ in range(num):
        # randint's upper bound is exclusive; +1 makes the final window
        # reachable and keeps len(samples) == L from raising.
        beg = np.random.randint(0, max_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    """One-hot encode folder labels after collapsing them to the known classes.

    '_background_noise_' is renamed to 'silence'; every label that is not in
    ``legal_labels`` is renamed to 'unknown'; all other labels are kept.

    Returns a DataFrame from ``pd.get_dummies`` with one indicator column per
    class that actually occurs in ``labels``.
    """
    def _canonical(name):
        # The background-noise folder is checked first, before the lookup.
        if name == '_background_noise_':
            return 'silence'
        return name if name in legal_labels else 'unknown'

    return pd.get_dummies(pd.Series([_canonical(name) for name in labels]))



  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# Build the training tensors: one log filter-bank feature matrix per one-second
# clip, its class label, and a group id (file-name prefix before the first '_').
#
# Recordings longer than 16000 samples are cut into random windows by
# chop_audio(). Every window gets its OWN row in x_train together with its own
# label and group entry; writing all windows into a single row would keep only
# the last one and throw the rest of the computed features away.
labels, fnames = list_wavs_fname(train_data_path)
new_sample_rate=16000
y_train = []
x_train = []
G = []

for label, fname in tqdm(zip(labels, fnames), total=len(labels)):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]
    group = fname.split('_')[0]
    for chunk in n_samples:
        filter_banks = logfbank(chunk)
        # Remove the per-coefficient mean over time (plus a tiny epsilon).
        filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
        x_train.append(filter_banks.astype(np.float32))
        y_train.append(label)
        G.append(group)

# Size the array from the data instead of a hard-coded file count; np.stack
# also fails loudly if any clip produced a differently shaped feature matrix.
# NOTE: the list and the stacked array coexist briefly, so peak memory is
# about twice the final array size.
x_train = np.stack(x_train) if x_train else np.zeros((0, 99, 26), np.float32)

y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
G = np.array(G)

del labels, fnames
gc.collect()

/home/dannyp/repos/tensorflow-samples/original/train/audio


64727it [02:37, 411.42it/s]


7

In [9]:
def test_data_generator(batch=2, dirpath=None):
    """Yield ``(file_names, features)`` batches for every ``*.wav`` in a directory.

    For each file the samples are padded with ``pad_audio``, then mean-removed
    ``logfbank`` and ``mfcc`` features are computed and concatenated along the
    feature axis.

    NOTE(review): the training cell in this notebook uses ``logfbank`` only,
    whereas this generator concatenates ``logfbank`` and ``mfcc`` -- confirm
    the loaded model was trained on the concatenated features.

    Parameters
    ----------
    batch : int
        Maximum number of files per yielded batch (must be >= 1).
    dirpath : str, optional
        Directory to scan. Defaults to the module-level ``test_data_path``.

    Yields
    ------
    (fnames, imgs) : (list of str, numpy.ndarray)
        Base file names and the stacked feature matrices. The last batch may
        be smaller than ``batch``. Nothing is yielded for an empty directory,
        and no batch is ever yielded twice.

    Raises
    ------
    ValueError
        If ``batch`` is smaller than 1 (raised on first iteration).
    """
    if batch < 1:
        raise ValueError('batch must be >= 1, got %r' % (batch,))
    if dirpath is None:
        dirpath = test_data_path
    fpaths = glob(os.path.join(dirpath, '*.wav'))
    imgs = []
    fnames = []
    for path in fpaths:
        _rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        filter_banks = logfbank(samples)
        filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
        mfcc_banks = mfcc(samples)
        mfcc_banks -= (np.mean(mfcc_banks, axis=0) + 1e-8)
        filter_banks = np.concatenate((filter_banks, mfcc_banks), axis = 1)
        imgs.append(filter_banks)
        fnames.append(os.path.basename(path))
        if len(imgs) == batch:
            yield fnames, np.array(imgs)
            # Start fresh lists so the batch just handed out is not mutated
            # and cannot be emitted a second time after the loop.
            imgs = []
            fnames = []
    # Flush the remaining partial batch, if any. The generator then simply
    # returns: raising StopIteration inside a generator is converted into
    # RuntimeError (PEP 479).
    if imgs:
        yield fnames, np.array(imgs)

from keras.models import load_model

# Class names in the column order of the model's output.
label_index = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence', 'stop', 'unknown', 'up', 'yes']
model = load_model('./gru/model/GRU_512x2_d05true_D256_D128_E20_B512.h5')

# Accumulators filled batch by batch: file names, predicted class names and
# the raw per-class scores returned by the model.
index, results, probs = [], [], []

for fnames, imgs in tqdm(test_data_generator(batch=32)):
    predicts = model.predict(imgs)
    probs.extend(predicts)
    index.extend(fnames)
    # Highest-scoring column per row, translated to its class name.
    predicts = [label_index[p] for p in np.argmax(predicts, axis=1)]
    results.extend(predicts)

4955it [27:56,  2.96it/s]


In [10]:
# Assemble the submission table (one row per test file) and write it as CSV.
# Building the frame from a dict raises immediately if the two lists differ
# in length.
df = pd.DataFrame({'fname': index, 'label': results}, columns=['fname', 'label'])
submission_path = './submissions/new/GRU_512x2_d05true_D256_D128_E20_B512.csv'
# Create the target folder first so a missing directory cannot discard the
# predictions produced by the long-running inference cell.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df.to_csv(submission_path, index=False)

In [11]:
# Sanity check: display (rows, columns) of the frame written to the submission CSV.
df.shape

(158538, 2)