In [1]:
#!/usr/bin/env python3
import math

import IPython.display
import numpy as np
import scipy.io.wavfile
from python_speech_features import mfcc, delta

import tensorflow as tf
from tensorflow.keras import layers
import keras.utils
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
%%time
np.random.seed(1234)

#files = [ 'vorst_14_machiavelli_8khz.wav'
# ]
files = [ 'cousinhenry_01_trollope_8khz.wav',
'siegeofcorinth_2_byron_8khz.wav',
'upperroom_16_ryle_8khz.wav',
'vorst_14_machiavelli_8khz.wav',
]

# This constant is the number of entries in the all_examples list below that
# correspond to one training example.
# The 13 is magic -- it's the default number of mfcc... frequencies(?) calculated
# by many places on the web and in the python library we used.
# The 49 comes from a combination of our sample size and mfcc window. Our sample
# size is half second and we calculate mfccs in a 25ms window with a 10ms stride.
# (.5 - .01) / 0.01

in 1/2 second of an 8000hz wave, we get 49 values for each of the 13 frequencies.
# Multiply that * 2 because we are also calculating the first derivative of the
# mfcc... cepstrum(?), which seems to be common (further adding 2nd derivative
# is also common).
# The + 1 is because the label for each sample comes right after the above.
height_of_one_training_example = 49 * 13 * 2 + 1

label = 0
all_examples = []
for one_file in files:
  label += 1
  rate, data = scipy.io.wavfile.read(one_file)
  total_length_of_wave = data.shape[0]
  print ("just read file number %d which contains %d audio samples and is named %s Now analying it:" % (label, total_length_of_wave, one_file))
  assert rate == 8000, "rate was %d" % rate

  half_second_length = 4000
  start_index_of_half_second = 0
  num_training_example_in_this_file = 0
  while total_length_of_wave - start_index_of_half_second >= half_second_length:
    num_training_example_in_this_file += 1
    if num_training_example_in_this_file % 500 == 0:
      print ("\t analyzing training sample number %d" % num_training_example_in_this_file)

    this_training_example_raw = data[start_index_of_half_second:start_index_of_half_second + half_second_length]
    start_index_of_half_second += half_second_length
    assert len(this_training_example_raw) == 4000, len(this_training_example_raw)
    mfccs = mfcc(this_training_example_raw, 8000)
    assert mfccs.shape == (49, 13), mfccs.shape

    #Alfredo used 2 here, and changing it doesn't change the output size.
    first_derivative = delta(mfccs, 2)
    assert first_derivative.shape == (49, 13), first_derivative.shape
    all_examples.extend(mfccs.flatten().tolist())
    all_examples.extend(first_derivative.flatten().tolist())
    all_examples.append(label)
    assert len(all_examples) % height_of_one_training_example == 0, "num_training_example_in_this_file = %d" % num_training_example_in_this_file

just read file number 1 which contains 7305509 audio samples and is named cousinhenry_01_trollope_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
just read file number 2 which contains 12400013 audio samples and is named siegeofcorinth_2_byron_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
	 analyzing training sample number 2000
	 analyzing training sample number 2500
	 analyzing training sample number 3000
just read file number 3 which contains 36554719 audio samples and is named upperroom_16_ryle_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
	 analyzing training sample number 2000
	 analyzing training sample number 2500
	 analyzing training sample number 3000
	 analyzing training sample number 3

In [28]:
%%time

np.random.seed(123456)

# Make a dataset out of the single python list constructed above.
# TODO(dgrogan): Parts of the first half and last half of this cell are inverses and can be combined.
all_examples_np = np.array(all_examples)
all_examples_np = all_examples_np.reshape((height_of_one_training_example, -1), order='F')

#print ("all_examples_np.shape = %s, so we have %d training samples" % (all_examples_np.shape, all_examples_np.shape[1]))
assert all_examples_np[-1, 0] == 1, "make sure the last row labels the first column as belonging to file number 1 %s" % all_examples_np[-1, 0]

shuffled_examples = all_examples_np.T
np.random.shuffle(shuffled_examples)
shuffled_examples = shuffled_examples.T

# I changed to 0.9 when I thought we might not have enough data. We can change it back to whatever.
training_pct = 0.9

number_of_training_examples = int(math.ceil(all_examples_np.shape[1] * training_pct))

X_train = shuffled_examples[0:-1, 0:number_of_training_examples]
Y_train = shuffled_examples[-1:, 0:number_of_training_examples]
X_dev   = shuffled_examples[0:-1, number_of_training_examples:]
Y_dev   = shuffled_examples[-1:, number_of_training_examples:]

# Xs are shape (number of input features, number of data points)
# Ys are shape (1, number of data points)
# The labels in Y are an integer corresponding to the speaker number.
# Before reshape
# (1274, 11853) (1, 11853) (1274, 2963) (1, 2963)

# In Keras, you want (number of data, attributes)
# Want: (see coursera M4 - Keras Tutorial)
# (11853, 1274) (11853, 1) (2963, 1274) (2963, 1)
# Reshape
X_train = X_train.T
Y_train = Y_train.T
X_dev = X_dev.T
Y_dev = Y_dev.T

#check
print(X_train.shape, Y_train.shape, X_dev.shape, Y_dev.shape)

#debugging - to be deleted
print(Y_train[110:120])
Y_train = to_categorical(Y_train - 1) # -1 because to_categorical seems to expect labels to start at 0
Y_dev = to_categorical(Y_dev - 1)
print(Y_train[110:120])

(13335, 1274) (13335, 1) (1481, 1274) (1481, 1)
[[3.]
 [2.]
 [2.]
 [4.]
 [3.]
 [3.]
 [1.]
 [3.]
 [3.]
 [3.]]
[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
CPU times: user 1.12 s, sys: 162 ms, total: 1.28 s
Wall time: 1.26 s


In [None]:
%%time

#Keras
model = tf.keras.Sequential()

#hidden layers
model.add(layers.Dense(5, activation='relu', input_dim=X_train.shape[1]))
model.add(layers.Dense(4, activation='softmax'))

#justification for binary: 
#"we compile the model using binary cross-entropy rather than categorical cross-entropy. This may seem counterintuitive 
# for multi-label classification; however, the goal is to treat each output label as an independent Bernoulli distribution 
# and we want to penalize each output node independently."
#quoted from: https://www.pyimagesearch.com/2018/05/07/multi-label-classification-with-keras/
#more: https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance/46038271
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

import keras.utils
import IPython.display
keras.utils.plot_model(model, to_file='test_keras_plot_model.png', show_shapes=True)
display(IPython.display.Image('test_keras_plot_model.png'))
print(model.summary())
# I don't know where 140240419526080 in the picture came from

In [33]:
%%time
history_object = model.fit(X_train, Y_train, epochs=50, batch_size=128, verbose=2, shuffle=True,
         validation_data=(X_dev, Y_dev))


Train on 13335 samples, validate on 1481 samples
Epoch 1/50
 - 1s - loss: 1.1132 - acc: 0.6841 - val_loss: 0.6287 - val_acc: 0.7711
Epoch 2/50
 - 1s - loss: 0.5521 - acc: 0.8040 - val_loss: 0.5275 - val_acc: 0.8103
Epoch 3/50
 - 1s - loss: 0.4887 - acc: 0.8164 - val_loss: 0.4902 - val_acc: 0.8116
Epoch 4/50
 - 1s - loss: 0.4239 - acc: 0.8352 - val_loss: 0.3937 - val_acc: 0.8798
Epoch 5/50
 - 1s - loss: 0.3391 - acc: 0.8886 - val_loss: 0.3515 - val_acc: 0.9169
Epoch 6/50
 - 1s - loss: 0.3035 - acc: 0.9207 - val_loss: 0.3407 - val_acc: 0.9244
Epoch 7/50
 - 1s - loss: 0.2777 - acc: 0.9282 - val_loss: 0.3363 - val_acc: 0.9291
Epoch 8/50
 - 1s - loss: 0.2606 - acc: 0.9330 - val_loss: 0.3074 - val_acc: 0.9338
Epoch 9/50
 - 1s - loss: 0.2422 - acc: 0.9393 - val_loss: 0.3105 - val_acc: 0.9332
Epoch 10/50
 - 1s - loss: 0.2277 - acc: 0.9429 - val_loss: 0.3065 - val_acc: 0.9426
Epoch 11/50
 - 1s - loss: 0.2127 - acc: 0.9463 - val_loss: 0.2942 - val_acc: 0.9446
Epoch 12/50
 - 1s - loss: 0.2025 - a

In [34]:
# Dump the per-epoch metrics recorded during training (a dict mapping each
# metric name to a list with one value per epoch).
metrics_by_name = history_object.history
print(metrics_by_name)

{'val_loss': [0.628659961716216, 0.5274683567094771, 0.49015536910371954, 0.3936888441332766, 0.35152969190914996, 0.34065694699490257, 0.3363025250425216, 0.30740239656832796, 0.3104892758643764, 0.3065342222573707, 0.2942391063296095, 0.2894254554613952, 0.28193381579641874, 0.29079900788624974, 0.2870364790627636, 0.28071054418814656, 0.28783402961705845, 0.2858752277475366, 0.2745104163462531, 0.27848368815720526, 0.26819568455903453, 0.2775131061240518, 0.27702945935150647, 0.2770496806548625, 0.2657497778766305, 0.26580034327177, 0.279494552058718, 0.2690821038074867, 0.2627817449398253, 0.2601203169575581, 0.27183974186646465, 0.2565397036530374, 0.2519127376412798, 0.2558683898000601, 0.2544284842802736, 0.2776728342567079, 0.26972453046658007, 0.2718190604230989, 0.27168635497275917, 0.2700019520251514, 0.27203720121831204, 0.28335269749325565, 0.282552713793649, 0.27472177219805405, 0.28494654322177637, 0.2904336956064899, 0.2721321642358588, 0.2793166618303702, 0.28480295326