In [2]:
!pip3 install keras
!pip3 install tqdm
!pip3 install python_speech_features

Collecting keras
  Downloading Keras-2.1.2-py2.py3-none-any.whl (304kB)
[K    100% |████████████████████████████████| 307kB 1.9MB/s 
Installing collected packages: keras
Successfully installed keras-2.1.2
Collecting tqdm
  Downloading tqdm-4.19.5-py2.py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 2.7MB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.19.5
Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz
Building wheels for collected packages: python-speech-features
  Running setup.py bdist_wheel for python-speech-features ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/5f/42/b4/d2a1e5bc6c3303b7d98ef88180524ff0fcb6d9fc3f9f66a543
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


In [17]:
import os
import numpy as np
np.random.seed(1984)
import tensorflow as tf
tf.set_random_seed(1984)

from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import GRU, Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, Conv3D, ConvLSTM2D
from keras.callbacks import TensorBoard
from keras.models import Sequential
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

root_path = r'..'
out_path = r'.'
model_path = r'.'
train_data_path ='/content/datalab/docs/yaafe/train/audio'
test_data_path = '/content/datalab/docs/yaafe/test/audio'

def list_wavs_fname(dirpath, ext='wav'):
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=1000):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

labels, fnames = list_wavs_fname(train_data_path)
new_sample_rate=16000
y_train = []
x_train = np.zeros((64727,99,26),np.float32)
G = []

ix = 0
for label, fname in tqdm(zip(labels, fnames)):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]
    for samples in n_samples:
        filter_banks = logfbank(samples)
        filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
        x_train[ix,:,:] = filter_banks
    y_train.append(label)
    group = fname.split('_')[0]
    G.append(group)
    ix += 1
    
y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
G = np.array(G)

del labels, fnames
gc.collect()



#     index = []
#     results = []
#     probs = []
#     for fnames, imgs in tqdm(test_data_generator(batch=32)):
#         predicts = model.predict(imgs)
#         probs.extend(predicts)
#         predicts = np.argmax(predicts, axis=1)
#         predicts = [label_index[p] for p in predicts]
#         index.extend(fnames)
#         results.extend(predicts)

#     df = pd.DataFrame(columns=['fname', 'label'])
#     df['fname'] = index
#     df['label'] = results
#     df.to_csv(os.path.join(out_path, 'subs/gru_sub_{}_{}.csv'.format(bag+1,best_loss)), index=False)
#     probs = np.array(probs)
#     np.save('probs/gru_probs_{}.npy'.format(bag+1),probs)

/content/datalab/docs/yaafe/train/audio




0it [00:00, ?it/s][A[A

40it [00:00, 389.69it/s][A[A

79it [00:00, 390.41it/s][A[A

119it [00:00, 390.03it/s][A[A

159it [00:00, 390.62it/s][A[A

200it [00:00, 392.91it/s][A[A

240it [00:00, 392.70it/s][A[A

280it [00:00, 393.00it/s][A[A

320it [00:00, 393.40it/s][A[A

360it [00:00, 393.58it/s][A[A

400it [00:01, 393.99it/s][A[A

441it [00:01, 394.72it/s][A[A

480it [00:01, 381.61it/s][A[A

516it [00:01, 374.26it/s][A[A

550it [00:01, 364.67it/s][A[A

590it [00:01, 366.35it/s][A[A

630it [00:01, 367.83it/s][A[A

670it [00:01, 369.24it/s][A[A

710it [00:01, 370.50it/s][A[A

750it [00:02, 371.50it/s][A[A

790it [00:02, 372.60it/s][A[A

830it [00:02, 373.24it/s][A[A

869it [00:02, 365.92it/s][A[A

908it [00:02, 366.90it/s][A[A

948it [00:02, 367.77it/s][A[A

985it [00:02, 366.26it/s][A[A

1024it [00:02, 366.85it/s][A[A

1061it [00:02, 361.71it/s][A[A

1096it [00:03, 361.26it/s][A[A

1130it [00:03, 358.21it/s][A[A

1168it [00:03

4695

In [18]:
model = Sequential()
model.add(GRU(512, input_shape=(99,26)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(12, activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
weights = model.get_weights()

EPOCHS = 12
BATCH_SIZE = 512

model.set_weights(weights)
model.reset_states()



tensorboard = TensorBoard(log_dir='logs/gru')
model.reset_states()
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True, verbose=1, callbacks=[tensorboard])
model.save('models/gru_model_512_256_128_B512_E12.h5')

gc.collect()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_2 (GRU)                  (None, 512)               827904    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_6 (Dense)              (None, 12)                1548      
Total params: 993,676
Trainable params: 993,676
Non-trainable params: 0
_________________________________________________________________
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


0

In [29]:
def test_data_generator(batch=2):
    fpaths = glob('/content/datalab/docs/yaafe/test/audio/*.wav')
    i = 0
    for path in fpaths:
        if i == 0:
            imgs = []
            fnames = []
        i += 1
        print(path)
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        filter_banks = logfbank(samples)
        filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
        imgs.append(filter_banks)
        fnames.append(path.split('/')[-1])
        if i == batch:
            i = 0
            imgs = np.array(imgs)
            yield fnames, imgs
    if i < batch:
        imgs = np.array(imgs)

        yield fnames, imgs
    raise StopIteration()

index = []
results = []
probs = []
for fnames, imgs in test_data_generator(batch=32):
    predicts = model.predict(imgs)
    probs.extend(predicts)
    predicts = np.argmax(predicts, axis=1)
    predicts = [label_index[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

/content/datalab/docs/yaafe/test/audio/clip_006bd0d1c.wav
/content/datalab/docs/yaafe/test/audio/clip_48a85866b.wav


ValueError: File format b''... not understood.

In [24]:
wavfile.read('/content/datalab/docs/yaafe/test/audio/clip_48a85866b.wav', )

ValueError: File format b''... not understood.

In [21]:
wavfile.read?

In [None]:
!more /content/datalab/docs/yaafe/test/audio/clip_006bd0d1c.wav