In [1]:
#!/usr/bin/env python3
import scipy.io.wavfile
from python_speech_features import mfcc, delta
import numpy as np
import math

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
np.random.seed(1234)

#files = [ 'vorst_14_machiavelli_8khz.wav'
# ]
files = [ 'cousinhenry_01_trollope_8khz.wav',
'siegeofcorinth_2_byron_8khz.wav',
'upperroom_16_ryle_8khz.wav',
'vorst_14_machiavelli_8khz.wav',
]

#more comments about this calculation? +1 - bias?
height_of_one_training_example = 49 * 13 * 2 + 1

label = 0
all_examples = []
for one_file in files:
  label += 1
  rate, data = scipy.io.wavfile.read(one_file)
  total_length_of_wave = data.shape[0]
  print ("just read file number %d which contains %d audio samples and is named %s Now analying it:" % (label, total_length_of_wave, one_file))
  assert rate == 8000, "rate was %d" % rate

  half_second_length = 4000
  start_index_of_half_second = 0
  num_training_example_in_this_file = 0
  while total_length_of_wave - start_index_of_half_second >= half_second_length:
    num_training_example_in_this_file += 1
    if num_training_example_in_this_file % 500 == 0:
      print ("\t analyzing training sample number %d" % num_training_example_in_this_file)

    this_training_example_raw = data[start_index_of_half_second:start_index_of_half_second + half_second_length]
    start_index_of_half_second += half_second_length
    assert len(this_training_example_raw) == 4000, len(this_training_example_raw)
    mfccs = mfcc(this_training_example_raw, 8000)
    assert mfccs.shape == (49, 13), mfccs.shape

    #Alfredo used 2 here, and changing it doesn't change the output size.
    first_derivative = delta(mfccs, 2)
    assert first_derivative.shape == (49, 13), first_derivative.shape
    all_examples.extend(mfccs.flatten().tolist())
    all_examples.extend(first_derivative.flatten().tolist())
    all_examples.append(label)
    assert len(all_examples) % height_of_one_training_example == 0, "num_training_example_in_this_file = %d" % num_training_example_in_this_file

all_examples_np = np.array(all_examples)
all_examples_np = all_examples_np.reshape((height_of_one_training_example, -1), order='F')

#print ("all_examples_np.shape = %s, so we have %d training samples" % (all_examples_np.shape, all_examples_np.shape[1]))
assert all_examples_np[-1, 0] == 1, "make sure the last row labels the first column as belonging to file number 1 %s" % all_examples_np[-1, 0]

shuffled_examples = all_examples_np.T
np.random.shuffle(shuffled_examples)
shuffled_examples = shuffled_examples.T

training_pct = 0.8

number_of_training_examples = int(math.ceil(all_examples_np.shape[1] * training_pct))

X_train = shuffled_examples[0:-1, 0:number_of_training_examples]
Y_train = shuffled_examples[-1:, 0:number_of_training_examples]
X_dev   = shuffled_examples[0:-1, number_of_training_examples:]
Y_dev   = shuffled_examples[-1:, number_of_training_examples:]

just read file number 1 which contains 7305509 audio samples and is named cousinhenry_01_trollope_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
just read file number 2 which contains 12400013 audio samples and is named siegeofcorinth_2_byron_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
	 analyzing training sample number 2000
	 analyzing training sample number 2500
	 analyzing training sample number 3000
just read file number 3 which contains 36554719 audio samples and is named upperroom_16_ryle_8khz.wav Now analying it:
	 analyzing training sample number 500
	 analyzing training sample number 1000
	 analyzing training sample number 1500
	 analyzing training sample number 2000
	 analyzing training sample number 2500
	 analyzing training sample number 3000
	 analyzing training sample number 3

In [3]:
# Xs are shape (number of input features, number of data points)
# Ys are shape (1, number of data points)
# The labels in Y are an integer corresponding to the speaker number.
# Before reshape
# (1274, 11853) (1, 11853) (1274, 2963) (1, 2963)

# In Keras, you want (number of data, attributes)
# Want: (see coursera M4 - Keras Tutorial)
# (11853, 1274) (11853, 1) (2963, 1274) (2963, 1)
# Reshape
X_train = X_train.T
Y_train = Y_train.T
X_dev = X_dev.T
Y_dev = Y_dev.T

#check
print(X_train.shape, Y_train.shape, X_dev.shape, Y_dev.shape)

#debugging - to be deleted
print(Y_train[100:120])

(11853, 1274) (11853, 1) (2963, 1274) (2963, 1)
[[3.]
 [3.]
 [3.]
 [1.]
 [3.]
 [2.]
 [3.]
 [3.]
 [2.]
 [3.]
 [3.]
 [3.]
 [2.]
 [3.]
 [3.]
 [3.]
 [2.]
 [3.]
 [3.]
 [3.]]


In [18]:
%%time

#Keras
model = tf.keras.Sequential()

#hidden layers - 50 nodes
#possible activation functions for hidden layers in Keras: elu (Exponential linear unit), selu (Scaled Exponential Linear Unit), 
#tanh, sigmoid, exponential, linear
#https://keras.io/activations/
model.add(layers.Dense(200, activation='relu'))
# Add another (optional):
model.add(layers.Dense(200, activation='relu'))
# Add a softmax layer with 10 output units:
model.add(layers.Dense(4, activation='softmax'))

#justification for binary: 
#"we compile the model using binary cross-entropy rather than categorical cross-entropy. This may seem counterintuitive 
# for multi-label classification; however, the goal is to treat each output label as an independent Bernoulli distribution 
# and we want to penalize each output node independently."
#quoted from: https://www.pyimagesearch.com/2018/05/07/multi-label-classification-with-keras/
#more: https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance/46038271
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])



CPU times: user 4.68 ms, sys: 6.6 ms, total: 11.3 ms
Wall time: 9.68 ms


In [22]:
%%time
history_object = model.fit(X_train, Y_train, epochs=50, batch_size=512, verbose=1, shuffle=True,
         validation_data=(X_dev, Y_dev))


Train on 11853 samples, validate on 2963 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 17.9 s, sys: 3 s, total: 20.9 s
Wall time: 13.2 s


In [21]:
print(history_object.history)

{'val_loss': [14.44058697166533, 14.43667934120325, 14.432600462955676, 14.448236995881127, 14.438385342248099, 14.429574998294909, 14.427746764075865, 14.425457743909606, 14.425772288422548, 14.422923814514139, 14.426620242646365, 14.422328138528464, 14.422537622120265, 14.42210616029037, 14.423429416395143, 14.42345581479544, 14.423060645696644, 14.422769702611403, 14.421398833278381, 14.42053453140954, 14.422290708340386, 14.424026167718615, 14.420126825500653, 14.419557930286214, 14.418518942512051, 14.419152651531569, 14.418233170768437, 14.421295351407595, 14.417680538549561, 14.41798743467817, 14.416661273133622, 14.416948939351427, 14.416808567375417, 14.416963274399253, 14.416558082547093, 14.416504896616477, 14.41633537754673, 14.416512165204761, 14.416161806568116, 14.416616634545276, 14.415617555933066, 14.416565983270496, 14.415150217538422, 14.415574672774934, 14.415621304326955, 14.415544983989035, 14.41498395956492, 14.414190758735073, 14.414796732021593, 14.41508993264