In [0]:
import os, h5py, librosa
import numpy as np
import pandas as pd
import tensorflow as tf

from scipy.io import wavfile
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras import backend as K

In [0]:
DATA_DIR = './datasets'
RANDOM_SEED = 10000
NUM_FOLDS = 5
EPOCHS = 5

SAMPLE_RATE = 16000

In [3]:
tf.keras.utils.get_file('speech_commands_v0.02.tar.gz',
                        origin= 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
                        extract=True, cache_dir='.')

'./datasets/speech_commands_v0.02.tar.gz'

In [0]:
def clear_dir(path):
  !rm -rf $path
  !mkdir -p $path

In [0]:
def gen_silence():
  np.random.seed(RANDOM_SEED)

  path = f'{DATA_DIR}/_background_noise_/'
  out_path = f'{DATA_DIR}/silence/'
  
  clear_dir(out_path)

  files = [fp for fp in os.listdir(path) if fp.endswith('.wav')]
  for filename in files:
    sample_rate, samples = wavfile.read(path + filename)
    for i in range(400):
      out_name = f'segment_{i}_{filename}'
      data = (samples[i * 200: i * 200 + sample_rate] * max(0, 2 * (np.random.random() - 0.25))).astype('int16')
      if data.max() != 0:
        wavfile.write(out_path + out_name, sample_rate, data)

  for i in range(500):
      d = np.zeros(16000)
      loc = np.random.randint(0, 16000, 4600)
      d[loc[::2]] = -1
      d[loc[1::2]] = 1
      wavfile.write(out_path + f'new_synthesised_{i}.wav', 16000, d)

In [6]:
clear_dir(f'{DATA_DIR}/silence/')
gen_silence()



In [0]:
full_labels=["up", "down", "go", "stop", "left", "right", "on", "off", "yes", "no", "silence",
    "bed", "bird", "cat", "dog", "eight", "five", "four", "happy", "house", "marvin",
    "nine", "one", "seven", "sheila", "six", "three", "tree", "two", "wow", "zero"]

In [0]:
def gen_train_label_files():
  filepaths = []
  filelabels = []

  for label in full_labels:
      files = [f for f in os.listdir(f'{DATA_DIR}/{label}') if f.endswith('.wav')]
      filepaths.extend(files)
      filelabels.extend([label for _ in range(len(files))])

  train = pd.DataFrame({'file':filepaths, 'label':filelabels})
  train[['file', 'label']].to_csv("./full_train_label_file.csv", index=True)

In [0]:
gen_train_label_files()
train_df = pd.read_csv('./full_train_label_file.csv')

In [0]:
def read_logmelspectrogram(filepath, n_components=40, n_fft=400, hop_length=160):
  samples, _ = librosa.load(filepath, sr=SAMPLE_RATE)
  samples = samples[:SAMPLE_RATE]
  samples = np.pad(samples, (0, max(0, SAMPLE_RATE - len(samples))), 'constant')
  melspectrogram = librosa.feature.melspectrogram(y=samples, sr=SAMPLE_RATE, n_fft=n_fft, hop_length=hop_length, n_mels=n_components)
  return librosa.power_to_db(melspectrogram)

In [0]:
def prepare_data(ds_name):
  full_label_num_mapping={full_labels[i]: i for i in range(len(full_labels))}

  n_rows = train_df.shape[0]
  y = np.zeros((n_rows, 31))
  for idx in range(n_rows):
    y[idx, full_label_num_mapping[train_df.label.values[idx]]] = 1

  get_audio_fn = lambda row: f"{DATA_DIR}/{row['label']}/{row['file']}"
  audio_fn = train_df.apply(get_audio_fn, axis=1)
  dataset_fn = f"./{ds_name}.h5"

  if os.path.exists(dataset_fn):
    with h5py.File(dataset_fn, "r") as hf:
      X = hf["."]["X"].value
  else:
    X = np.array([read_logmelspectrogram(fn) for fn in audio_fn])
    with h5py.File(dataset_fn, "w") as hf:
      hf.create_dataset("X", data=X)
      
  return X, y

In [0]:
def conv_bn(x, c, kernel_size):
  h = Conv2D(filters=c, kernel_size=kernel_size, padding='same')(x)
  h = BatchNormalization()(h)
  h = Activation('relu')(h)
  return h

In [0]:
def vgg_block(x, c, kernel_size):
  h = conv_bn(x, c, kernel_size)
  h = conv_bn(h, c, kernel_size)
  h = MaxPooling2D()(h)
  return h

In [0]:
def vgg_dense_block(x, c, drop_rate):
  h = Dense(units=c, activation='relu')(x)
  h = BatchNormalization()(h)
  h = Dropout(drop_rate)(h)
  return h

In [0]:
def global_pool_block(x):
  gmp = GlobalMaxPooling2D()(x)
  gap = GlobalAveragePooling2D()(x)
  h = concatenate([gmp, gap])
  return h

In [0]:
def build_vgg_model(input_shape, c=16, kernel_size=(2,5), c_dense=256, drop_rate=0.5,
                    opt = tf.train.AdadeltaOptimizer, lr = 1.0):
  inputs = Input(shape=input_shape)
  h = Reshape((input_shape[0], input_shape[1], 1))(inputs)

  h = vgg_block(h, c, kernel_size)
  h = vgg_block(h, c*2, kernel_size)
  h = vgg_block(h, c*4, kernel_size)
  h = vgg_block(h, c*8, kernel_size)

  h = global_pool_block(h)
  h = vgg_dense_block(h, c_dense, drop_rate)
  h = vgg_dense_block(h, c_dense, drop_rate)
  outputs = Dense(31, activation="softmax")(h)

  model = Model(inputs=inputs, outputs=outputs)
  model.compile(optimizer=opt(learning_rate=lr), loss='binary_crossentropy', metrics=["accuracy"])
  return model

In [0]:
dataset = 'logmelspectrogram_40_101'
model_builder = lambda input_shape: build_vgg_model(input_shape)
model_arch = 'vgg'
model_id = 1
batch_size = 128

In [18]:
model_name = f"{dataset}-{model_arch}-{model_id}"
out_dir = f"./models/{model_name}/"
clear_dir(out_dir)

rm: cannot remove './models/logmelspectrogram_40_101-vgg-1/': No such file or directory


In [19]:
X, y = prepare_data(dataset)
input_shape = X[0].shape
input_shape

(40, 101)

In [0]:
splitter = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)
folds = splitter.split(train_df, train_df.label)

In [21]:
for i, (train_indices, valid_indices) in enumerate(folds):
  X_train, X_valid = X[train_indices], X[valid_indices]
  y_train, y_valid = y[train_indices], y[valid_indices]

  model = model_builder(input_shape)

  model = tf.contrib.tpu.keras_to_tpu_model(
    model,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
        tf.contrib.cluster_resolver.TPUClusterResolver(tpu=f'grpc://{os.environ["COLAB_TPU_ADDR"]}')
    )
  )

  history = model.fit(X_train, y_train, epochs = EPOCHS, validation_data = (X_valid, y_valid), batch_size = batch_size)

  model.save(out_dir + 'bag_{}_model.h5'.format(i))
  del model
  K.clear_session()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Querying Tensorflow master (grpc://10.107.171.82:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 11688625823266125391)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 12913491875793011610)
INFO:tensorflow:*** Available Device: _DeviceA