In [1]:
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
from scipy.io import wavfile
from python_speech_features import mfcc
from tqdm import tqdm

import re
import os
import numpy as np
import tensorflow as tf

In [2]:
# Hyper-parameters shared by the input pipeline and the model functions below.
PARAMS = dict(
    num_epochs=30,   # passes over the training set (dataset.repeat)
    batch_size=30,   # examples per training batch
    rnn_size=100,    # number of units in the GRU cell
    clip_norm=5.0,   # global-norm threshold used when clipping gradients
)

In [3]:
def download():
    """Scrape the TSpace handle pages and save every linked .wav clip.

    Visits the handle pages ``1807/24488`` through ``1807/24501``, and for
    each anchor whose href matches ``/bitstream/...wav``:

    * saves the audio as ``./data/<last path component of the link>``
      (skipped when that file is already on disk), and
    * writes a sibling ``.txt`` transcript containing
      ``'say the word <word>'``, where ``<word>`` is the second-to-last
      underscore-separated token of the link.

    Returns:
        None. The function is used purely for its file-system side effects.
    """
    prefix = 'https://tspace.library.utoronto.ca'
    save_dir = './data/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    base_url = prefix + '/handle/1807/24'
    urls = [base_url+str(i) for i in range(488, 502)]

    for url in urls:
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            soup = BeautifulSoup(response.read(), 'html5lib')
        # The dot is escaped so only a literal ".wav" matches.
        targets = soup.findAll('a', href=re.compile(r'/bitstream/.*\.wav'))

        for a in tqdm(targets, total=len(targets), ncols=70):
            link = a['href']

            audio_save_loc = save_dir + link.split('/')[-1]
            if os.path.isfile(audio_save_loc):
                # Previously the message was printed and the clip was fetched
                # again anyway; now an existing clip is really skipped.
                print("File Already Exists")
            else:
                urlretrieve(prefix+link, audio_save_loc)

            # Always (re)write the transcript so every clip has its label,
            # even if an earlier run was interrupted between the two writes.
            with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:
                f.write('say the word ' + link.split('_')[-2])

In [4]:
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse (COO-style) representation of a batch of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: NumPy dtype of the returned ``values`` array
    Returns:
        A tuple with (indices, values, shape):
            indices: int64 array of shape (num_values, 2) holding
                (sequence index, position within the sequence) pairs
            values: 1-D array of all sequence elements, in order
            shape: int64 array ``[len(sequences), longest sequence length]``
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n]*len(seq), range(len(seq))))
        values.extend(seq)

    # reshape(-1, 2) keeps the array two-dimensional even when there are no
    # values at all, so callers always get an (N, 2) index matrix.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    # Take the width from the sequence lengths rather than indices.max(),
    # which raises on a zero-size array when every sequence is empty.
    max_len = max((len(seq) for seq in sequences), default=0)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return (indices, values, shape)

def train_input_fn(X, y):
    """Build the training input pipeline and return its next-element op.

    The (X, y) pair is sliced along the first dimension, shuffled with a
    buffer of 10000, grouped into batches of PARAMS['batch_size'] and
    repeated PARAMS['num_epochs'] times.

    Args:
        X: training features
        y: training labels
    Returns:
        The ``get_next()`` result of a one-shot iterator over the dataset.
    """
    batch_size = PARAMS['batch_size']
    num_epochs = PARAMS['num_epochs']

    pipeline = tf.data.Dataset.from_tensor_slices((X, y))
    pipeline = pipeline.shuffle(10000)
    pipeline = pipeline.batch(batch_size)
    pipeline = pipeline.repeat(num_epochs)

    return pipeline.make_one_shot_iterator().get_next()

def rnn_cell():
    """Return a GRU cell with PARAMS['rnn_size'] units and an orthogonal kernel initializer."""
    num_units = PARAMS['rnn_size']
    kernel_init = tf.orthogonal_initializer()
    return tf.nn.rnn_cell.GRUCell(num_units, kernel_initializer=kernel_init)

def clip_grads(loss_op):
    """Pair every trainable variable with its globally-clipped gradient.

    Args:
        loss_op: tensor whose gradients are taken
    Returns:
        An iterator of (clipped gradient, variable) pairs, suitable for
        ``Optimizer.apply_gradients``. The gradients are clipped by global
        norm to PARAMS['clip_norm'].
    """
    train_vars = tf.trainable_variables()
    raw_grads = tf.gradients(loss_op, train_vars)
    # The second return value (the pre-clipping global norm) is not needed.
    clipped = tf.clip_by_global_norm(raw_grads, PARAMS['clip_norm'])[0]
    return zip(clipped, train_vars)

def model_fn(features, labels, mode, params):
    """Estimator model function: one GRU layer, dense projection, CTC head.

    Args:
        features: batch-major float tensor of per-frame features (it is fed
            to ``dynamic_rnn`` and later transposed with ``[1, 0, 2]``).
        labels: sparse tensor of target label ids (only used when training).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: accepted for the Estimator signature but not read; all
            hyper-parameters come from the module-level ``PARAMS`` dict.
    Returns:
        An ``EstimatorSpec`` for PREDICT (dense greedy-decoded ids) or TRAIN
        (CTC loss, train op and an edit-distance logging hook). Any other
        mode yields ``None``.
    """
    # Sequence length = number of frames whose feature sum is non-zero.
    frame_sums = tf.reduce_sum(features, -1)
    seq_lens = tf.count_nonzero(frame_sums, 1, dtype=tf.int32)

    rnn_out, _final_state = tf.nn.dynamic_rnn(
        rnn_cell(), features, seq_lens, dtype=tf.float32)
    logits = tf.layers.dense(rnn_out, PARAMS['num_classes'])

    # The CTC ops below are given time-major logits.
    time_major = tf.transpose(logits, [1, 0, 2])
    decoded_paths, _log_prob = tf.nn.ctc_greedy_decoder(time_major, seq_lens)
    decoded = tf.to_int32(decoded_paths[0])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions=tf.sparse_tensor_to_dense(decoded))

    if mode != tf.estimator.ModeKeys.TRAIN:
        # No spec is built for other modes (e.g. EVAL).
        return None

    per_example_loss = tf.nn.ctc_loss(labels, time_major, seq_lens)
    loss_op = tf.reduce_mean(per_example_loss)
    edit_dist_op = tf.reduce_mean(tf.edit_distance(decoded, labels))

    # Log the mean edit distance of the greedy decoding every 100 steps.
    edit_dist_hook = tf.train.LoggingTensorHook(
        {'edit_dist': edit_dist_op}, every_n_iter=100)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.apply_gradients(
        clip_grads(loss_op), global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss_op,
        train_op=train_op,
        training_hooks=[edit_dist_hook])

In [5]:
# Fetch the audio clips and write their transcript files into ./data/
# (network-bound: each handle page took roughly two minutes in the run below).
download()

100%|███████████████████████████████| 200/200 [02:13<00:00,  1.49it/s]
100%|███████████████████████████████| 200/200 [02:01<00:00,  1.65it/s]
100%|███████████████████████████████| 200/200 [02:19<00:00,  1.44it/s]
100%|███████████████████████████████| 200/200 [02:15<00:00,  1.48it/s]
100%|███████████████████████████████| 200/200 [02:00<00:00,  1.65it/s]
100%|███████████████████████████████| 200/200 [02:12<00:00,  1.51it/s]
100%|███████████████████████████████| 200/200 [02:17<00:00,  1.45it/s]
100%|███████████████████████████████| 200/200 [02:10<00:00,  1.54it/s]
100%|███████████████████████████████| 200/200 [02:21<00:00,  1.42it/s]
100%|███████████████████████████████| 200/200 [02:32<00:00,  1.32it/s]
100%|███████████████████████████████| 200/200 [02:32<00:00,  1.31it/s]
100%|███████████████████████████████| 200/200 [02:13<00:00,  1.50it/s]
100%|███████████████████████████████| 200/200 [02:30<00:00,  1.33it/s]
100%|███████████████████████████████| 200/200 [02:19<00:00,  1.44it/s]


In [6]:
data_dir = './data/'

# Sorted so the sample order -- and therefore the held-out clip -- is the same
# on every run instead of depending on os.listdir()'s arbitrary order.
wav_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.wav'))
# Derive each transcript name from its clip (the same rule download() uses)
# rather than zipping two independently filtered listings, which silently
# mis-pairs audio and text as soon as one file is missing or ordered differently.
text_files = [f.replace('.wav', '.txt') for f in wav_files]

inputs, targets = [], []
for (wav_file, text_file) in tqdm(zip(wav_files, text_files), total=len(wav_files), ncols=70):
    text_path = data_dir + text_file
    if not os.path.isfile(text_path):
        # A clip without a transcript cannot be used as a training example.
        continue
    try:
        fs, audio = wavfile.read(data_dir + wav_file)
    except Exception as err:
        # Still best-effort (unreadable clips are skipped), but no longer a
        # bare except: KeyboardInterrupt propagates and the skip is reported.
        print('Skipping %s: %s' % (wav_file, err))
        continue
    # Named `features` so the builtin input() is not shadowed.
    features = mfcc(audio, samplerate=fs, nfft=1024)
    with open(text_path) as f:
        transcript = f.read()
    # Append both together so inputs and targets can never get out of step.
    inputs.append(features)
    targets.append(transcript)

assert len(inputs) == len(targets), 'inputs and targets are misaligned'
assert len(inputs) >= 2, 'need at least one training and one validation clip'

# Zero-pad every feature matrix at the end to the length of the longest clip.
inputs = tf.keras.preprocessing.sequence.pad_sequences(
    inputs, dtype='float32', padding='post')

# sorted() makes the char <-> index mapping stable across kernel restarts;
# plain set iteration order over strings changes with hash randomization.
chars = sorted(set(c for target in targets for c in target))
# One extra class on top of the characters (the CTC ops reserve the last id as blank).
PARAMS['num_classes'] = len(chars) + 1

idx2char = {idx: char for idx, char in enumerate(chars)}
char2idx = {char: idx for idx, char in idx2char.items()}

targets = [[char2idx[c] for c in target] for target in targets]

# Hold out the last example for a single sanity-check prediction.
inputs_val = np.expand_dims(inputs[-1], 0)
targets_val = targets[-1]

inputs_train = inputs[:-1]
targets_train = targets[:-1]
targets_train = tf.SparseTensor(*sparse_tuple_from(targets_train))

100%|████████████████████████████| 2800/2800 [00:17<00:00, 162.52it/s]


In [7]:
estimator = tf.estimator.Estimator(model_fn)


def _training_batches():
    """Input function handed to the estimator: batches of the training split."""
    return train_input_fn(inputs_train, targets_train)


estimator.train(_training_batches)

# Run the single held-out example through the trained model.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(inputs_val, shuffle=False)
preds = list(estimator.predict(predict_input_fn))

# Map label ids back to characters and compare against the reference text.
print('Prediction:', ''.join(idx2char[idx] for idx in preds[0]))
print('Actual:', ''.join(idx2char[idx] for idx in targets_val))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x117045fd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x117045fd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}






INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 1 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt.


INFO:tensorflow:loss = 434.65717, step = 1


INFO:tensorflow:loss = 434.65717, step = 1


INFO:tensorflow:edit_dist = 2.8257017


INFO:tensorflow:edit_dist = 2.8257017


INFO:tensorflow:global_step/sec: 5.94098


INFO:tensorflow:global_step/sec: 5.94098


INFO:tensorflow:loss = 46.395313, step = 101 (16.835 sec)


INFO:tensorflow:loss = 46.395313, step = 101 (16.835 sec)


INFO:tensorflow:edit_dist = 0.82320267 (16.834 sec)


INFO:tensorflow:edit_dist = 0.82320267 (16.834 sec)


INFO:tensorflow:global_step/sec: 5.96814


INFO:tensorflow:global_step/sec: 5.96814


INFO:tensorflow:loss = 33.860664, step = 201 (16.755 sec)


INFO:tensorflow:loss = 33.860664, step = 201 (16.755 sec)


INFO:tensorflow:edit_dist = 0.6746732 (16.755 sec)


INFO:tensorflow:edit_dist = 0.6746732 (16.755 sec)


INFO:tensorflow:global_step/sec: 6.0013


INFO:tensorflow:global_step/sec: 6.0013


INFO:tensorflow:loss = 27.641712, step = 301 (16.663 sec)


INFO:tensorflow:loss = 27.641712, step = 301 (16.663 sec)


INFO:tensorflow:edit_dist = 0.54925686 (16.663 sec)


INFO:tensorflow:edit_dist = 0.54925686 (16.663 sec)


INFO:tensorflow:global_step/sec: 5.92174


INFO:tensorflow:global_step/sec: 5.92174


INFO:tensorflow:loss = 22.76435, step = 401 (16.888 sec)


INFO:tensorflow:loss = 22.76435, step = 401 (16.888 sec)


INFO:tensorflow:edit_dist = 0.41477397 (16.888 sec)


INFO:tensorflow:edit_dist = 0.41477397 (16.888 sec)


INFO:tensorflow:global_step/sec: 5.89069


INFO:tensorflow:global_step/sec: 5.89069


INFO:tensorflow:loss = 19.850977, step = 501 (16.975 sec)


INFO:tensorflow:loss = 19.850977, step = 501 (16.975 sec)


INFO:tensorflow:edit_dist = 0.32196206 (16.974 sec)


INFO:tensorflow:edit_dist = 0.32196206 (16.974 sec)


INFO:tensorflow:global_step/sec: 6.10786


INFO:tensorflow:global_step/sec: 6.10786


INFO:tensorflow:loss = 15.630804, step = 601 (16.372 sec)


INFO:tensorflow:loss = 15.630804, step = 601 (16.372 sec)


INFO:tensorflow:edit_dist = 0.28876635 (16.372 sec)


INFO:tensorflow:edit_dist = 0.28876635 (16.372 sec)


INFO:tensorflow:global_step/sec: 6.03343


INFO:tensorflow:global_step/sec: 6.03343


INFO:tensorflow:loss = 19.161509, step = 701 (16.574 sec)


INFO:tensorflow:loss = 19.161509, step = 701 (16.574 sec)


INFO:tensorflow:edit_dist = 0.25637686 (16.575 sec)


INFO:tensorflow:edit_dist = 0.25637686 (16.575 sec)


INFO:tensorflow:global_step/sec: 6.08116


INFO:tensorflow:global_step/sec: 6.08116


INFO:tensorflow:loss = 13.793896, step = 801 (16.444 sec)


INFO:tensorflow:loss = 13.793896, step = 801 (16.444 sec)


INFO:tensorflow:edit_dist = 0.24230665 (16.444 sec)


INFO:tensorflow:edit_dist = 0.24230665 (16.444 sec)


INFO:tensorflow:global_step/sec: 6.05483


INFO:tensorflow:global_step/sec: 6.05483


INFO:tensorflow:loss = 12.93654, step = 901 (16.516 sec)


INFO:tensorflow:loss = 12.93654, step = 901 (16.516 sec)


INFO:tensorflow:edit_dist = 0.22165105 (16.516 sec)


INFO:tensorflow:edit_dist = 0.22165105 (16.516 sec)


INFO:tensorflow:global_step/sec: 6.06825


INFO:tensorflow:global_step/sec: 6.06825


INFO:tensorflow:loss = 13.180041, step = 1001 (16.479 sec)


INFO:tensorflow:loss = 13.180041, step = 1001 (16.479 sec)


INFO:tensorflow:edit_dist = 0.21953978 (16.479 sec)


INFO:tensorflow:edit_dist = 0.21953978 (16.479 sec)


INFO:tensorflow:global_step/sec: 6.02134


INFO:tensorflow:global_step/sec: 6.02134


INFO:tensorflow:loss = 15.223924, step = 1101 (16.607 sec)


INFO:tensorflow:loss = 15.223924, step = 1101 (16.607 sec)


INFO:tensorflow:edit_dist = 0.21629974 (16.608 sec)


INFO:tensorflow:edit_dist = 0.21629974 (16.608 sec)


INFO:tensorflow:global_step/sec: 6.07382


INFO:tensorflow:global_step/sec: 6.07382


INFO:tensorflow:loss = 14.054456, step = 1201 (16.464 sec)


INFO:tensorflow:loss = 14.054456, step = 1201 (16.464 sec)


INFO:tensorflow:edit_dist = 0.23936002 (16.464 sec)


INFO:tensorflow:edit_dist = 0.23936002 (16.464 sec)


INFO:tensorflow:global_step/sec: 6.04492


INFO:tensorflow:global_step/sec: 6.04492


INFO:tensorflow:loss = 11.905692, step = 1301 (16.543 sec)


INFO:tensorflow:loss = 11.905692, step = 1301 (16.543 sec)


INFO:tensorflow:edit_dist = 0.19477125 (16.543 sec)


INFO:tensorflow:edit_dist = 0.19477125 (16.543 sec)


INFO:tensorflow:global_step/sec: 6.15223


INFO:tensorflow:global_step/sec: 6.15223


INFO:tensorflow:loss = 10.844979, step = 1401 (16.254 sec)


INFO:tensorflow:loss = 10.844979, step = 1401 (16.254 sec)


INFO:tensorflow:edit_dist = 0.20189829 (16.254 sec)


INFO:tensorflow:edit_dist = 0.20189829 (16.254 sec)


INFO:tensorflow:global_step/sec: 6.15795


INFO:tensorflow:global_step/sec: 6.15795


INFO:tensorflow:loss = 11.180701, step = 1501 (16.239 sec)


INFO:tensorflow:loss = 11.180701, step = 1501 (16.239 sec)


INFO:tensorflow:edit_dist = 0.19127464 (16.239 sec)


INFO:tensorflow:edit_dist = 0.19127464 (16.239 sec)


INFO:tensorflow:global_step/sec: 5.94565


INFO:tensorflow:global_step/sec: 5.94565


INFO:tensorflow:loss = 9.807221, step = 1601 (16.819 sec)


INFO:tensorflow:loss = 9.807221, step = 1601 (16.819 sec)


INFO:tensorflow:edit_dist = 0.1834695 (16.819 sec)


INFO:tensorflow:edit_dist = 0.1834695 (16.819 sec)


INFO:tensorflow:global_step/sec: 5.97021


INFO:tensorflow:global_step/sec: 5.97021


INFO:tensorflow:loss = 9.827714, step = 1701 (16.750 sec)


INFO:tensorflow:loss = 9.827714, step = 1701 (16.750 sec)


INFO:tensorflow:edit_dist = 0.20654957 (16.750 sec)


INFO:tensorflow:edit_dist = 0.20654957 (16.750 sec)


INFO:tensorflow:global_step/sec: 5.78923


INFO:tensorflow:global_step/sec: 5.78923


INFO:tensorflow:loss = 9.658855, step = 1801 (17.273 sec)


INFO:tensorflow:loss = 9.658855, step = 1801 (17.273 sec)


INFO:tensorflow:edit_dist = 0.17812364 (17.273 sec)


INFO:tensorflow:edit_dist = 0.17812364 (17.273 sec)


INFO:tensorflow:global_step/sec: 6.1409


INFO:tensorflow:global_step/sec: 6.1409


INFO:tensorflow:loss = 9.259833, step = 1901 (16.284 sec)


INFO:tensorflow:loss = 9.259833, step = 1901 (16.284 sec)


INFO:tensorflow:edit_dist = 0.18071035 (16.284 sec)


INFO:tensorflow:edit_dist = 0.18071035 (16.284 sec)


INFO:tensorflow:global_step/sec: 6.06224


INFO:tensorflow:global_step/sec: 6.06224


INFO:tensorflow:loss = 10.283435, step = 2001 (16.495 sec)


INFO:tensorflow:loss = 10.283435, step = 2001 (16.495 sec)


INFO:tensorflow:edit_dist = 0.18995169 (16.496 sec)


INFO:tensorflow:edit_dist = 0.18995169 (16.496 sec)


INFO:tensorflow:global_step/sec: 6.18652


INFO:tensorflow:global_step/sec: 6.18652


INFO:tensorflow:loss = 10.535307, step = 2101 (16.164 sec)


INFO:tensorflow:loss = 10.535307, step = 2101 (16.164 sec)


INFO:tensorflow:edit_dist = 0.19774252 (16.164 sec)


INFO:tensorflow:edit_dist = 0.19774252 (16.164 sec)


INFO:tensorflow:global_step/sec: 6.15852


INFO:tensorflow:global_step/sec: 6.15852


INFO:tensorflow:loss = 9.836356, step = 2201 (16.238 sec)


INFO:tensorflow:loss = 9.836356, step = 2201 (16.238 sec)


INFO:tensorflow:edit_dist = 0.18274783 (16.239 sec)


INFO:tensorflow:edit_dist = 0.18274783 (16.239 sec)


INFO:tensorflow:global_step/sec: 6.04534


INFO:tensorflow:global_step/sec: 6.04534


INFO:tensorflow:loss = 9.418566, step = 2301 (16.541 sec)


INFO:tensorflow:loss = 9.418566, step = 2301 (16.541 sec)


INFO:tensorflow:edit_dist = 0.18911032 (16.540 sec)


INFO:tensorflow:edit_dist = 0.18911032 (16.540 sec)


INFO:tensorflow:global_step/sec: 6.04203


INFO:tensorflow:global_step/sec: 6.04203


INFO:tensorflow:loss = 9.450277, step = 2401 (16.551 sec)


INFO:tensorflow:loss = 9.450277, step = 2401 (16.551 sec)


INFO:tensorflow:edit_dist = 0.1781999 (16.551 sec)


INFO:tensorflow:edit_dist = 0.1781999 (16.551 sec)


INFO:tensorflow:global_step/sec: 5.93602


INFO:tensorflow:global_step/sec: 5.93602


INFO:tensorflow:loss = 9.791633, step = 2501 (16.847 sec)


INFO:tensorflow:loss = 9.791633, step = 2501 (16.847 sec)


INFO:tensorflow:edit_dist = 0.18005031 (16.847 sec)


INFO:tensorflow:edit_dist = 0.18005031 (16.847 sec)


INFO:tensorflow:global_step/sec: 5.84538


INFO:tensorflow:global_step/sec: 5.84538


INFO:tensorflow:loss = 8.41558, step = 2601 (17.107 sec)


INFO:tensorflow:loss = 8.41558, step = 2601 (17.107 sec)


INFO:tensorflow:edit_dist = 0.17927559 (17.106 sec)


INFO:tensorflow:edit_dist = 0.17927559 (17.106 sec)


INFO:tensorflow:global_step/sec: 5.9121


INFO:tensorflow:global_step/sec: 5.9121


INFO:tensorflow:loss = 9.12147, step = 2701 (16.914 sec)


INFO:tensorflow:loss = 9.12147, step = 2701 (16.914 sec)


INFO:tensorflow:edit_dist = 0.1916122 (16.915 sec)


INFO:tensorflow:edit_dist = 0.1916122 (16.915 sec)


INFO:tensorflow:global_step/sec: 5.99564


INFO:tensorflow:global_step/sec: 5.99564


INFO:tensorflow:loss = 9.376292, step = 2801 (16.679 sec)


INFO:tensorflow:loss = 9.376292, step = 2801 (16.679 sec)


INFO:tensorflow:edit_dist = 0.17782149 (16.678 sec)


INFO:tensorflow:edit_dist = 0.17782149 (16.678 sec)


INFO:tensorflow:Saving checkpoints for 2820 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt.


INFO:tensorflow:Saving checkpoints for 2820 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt.


INFO:tensorflow:Loss for final step: 9.94348.


INFO:tensorflow:Loss for final step: 9.94348.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt-2820


INFO:tensorflow:Restoring parameters from /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpxnlowvrb/model.ckpt-2820


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


Prediction: say the word a
Actual: say the word youth
