In [1]:
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
from scipy.io import wavfile
from python_speech_features import mfcc
from tqdm import tqdm

import re
import os
import numpy as np
import tensorflow as tf

In [2]:
# Hyperparameters read by the input pipeline and model functions in this notebook.
PARAMS = dict(
    num_epochs=20,   # passed to Dataset.repeat() in train_input_fn
    batch_size=30,   # passed to Dataset.batch() in train_input_fn
    rnn_size=100,    # number of units of the GRU cell built by rnn_cell
    clip_norm=5.0,   # global-norm threshold used by clip_grads
)

In [3]:
def download():
    """Scrape the .wav recordings linked from the TSpace handle pages into ./data/.

    For every matching link the audio is stored under the link's file name and
    a sibling .txt transcript is written containing 'say the word <word>',
    where <word> is the second-to-last underscore-separated token of the link.
    Audio files that are already on disk are not downloaded again; their
    transcript is still (re)written.
    """
    prefix = 'https://tspace.library.utoronto.ca'
    save_dir = './data/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'
    urls = [base_url + str(i) for i in range(488, 502)]

    # The dot before "wav" is escaped so it only matches a literal '.wav'.
    wav_link = re.compile(r'/bitstream/.*\.wav')

    count = 1
    for url in urls:
        # Close the HTTP response as soon as the page has been read.
        with urlopen(url) as response:
            soup = BeautifulSoup(response.read(), 'html5lib')
        for a in soup.findAll('a', href=wav_link):
            link = a['href']
            print(count, link)

            audio_save_loc = save_dir + link.split('/')[-1]
            if os.path.isfile(audio_save_loc):
                # Previously this message was printed but the file was fetched anyway.
                print("File Already Exists")
            else:
                urlretrieve(prefix + link, audio_save_loc)

            with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:
                f.write('say the word ' + link.split('_')[-2])

            count += 1

In [4]:
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of a batch of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: numpy dtype of the returned values array
    Returns:
        A tuple with (indices, values, shape):
            indices: int64 array of shape (num_values, 2) holding
                (sequence index, position in sequence) pairs
            values: flat array of all sequence elements, in the same order
            shape: int64 array [number of sequences, length of longest sequence]
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n]*len(seq), range(len(seq))))
        values.extend(seq)

    # reshape(-1, 2) keeps a well-formed (0, 2) index array when there are no
    # values at all (empty batch, or only empty sequences).
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)

    # Longest sequence length; equals "largest column index + 1" whenever at
    # least one value exists, and falls back to 0 instead of raising otherwise.
    max_len = max((len(seq) for seq in sequences), default=0)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return (indices, values, shape)

def train_input_fn(X, y):
    """Build the training input pipeline and return its next-element tensors.

    The (X, y) pairs are sliced along their first axis, shuffled with a
    10000-element buffer, grouped into batches of PARAMS['batch_size'] and the
    batched stream is repeated PARAMS['num_epochs'] times.
    """
    examples = tf.data.Dataset.from_tensor_slices((X, y))
    shuffled = examples.shuffle(10000)
    batched = shuffled.batch(PARAMS['batch_size'])
    repeated = batched.repeat(PARAMS['num_epochs'])
    return repeated.make_one_shot_iterator().get_next()

def predict_input_fn(X):
    """Build the prediction input pipeline and return its next-element tensor.

    X is sliced along its first axis and re-grouped into batches of
    PARAMS['batch_size']. Batching is required because model_fn reduces over
    axis 1 of the features and feeds them to dynamic_rnn, both of which need a
    leading batch axis; un-batched slices would be one rank short.
    """
    dataset = tf.data.Dataset.from_tensor_slices(X)
    dataset = dataset.batch(PARAMS['batch_size'])
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def rnn_cell():
    """Return a GRU cell with PARAMS['rnn_size'] units and an orthogonal kernel initializer."""
    orthogonal_init = tf.orthogonal_initializer()
    return tf.nn.rnn_cell.GRUCell(
        num_units=PARAMS['rnn_size'],
        kernel_initializer=orthogonal_init)

def clip_grads(loss_op):
    """Return (gradient, variable) pairs for loss_op with globally clipped gradients.

    Gradients are taken with respect to every trainable variable and clipped
    so that their global norm does not exceed PARAMS['clip_norm'].
    """
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(loss_op, trainable)
    # clip_by_global_norm also returns the pre-clipping global norm; it is not needed here.
    clipped, _global_norm = tf.clip_by_global_norm(raw_grads, PARAMS['clip_norm'])
    return zip(clipped, trainable)

def model_fn(features, labels, mode, params):
    """Estimator model function: GRU -> dense layer, trained with CTC loss.

    Args:
        features: float tensor fed to dynamic_rnn; assumed to be
            (batch, time, feature) with all-zero frames as padding -- TODO confirm
        labels: sparse tensor of target label ids, consumed by ctc_loss and
            edit_distance (not used in PREDICT mode)
        mode: one of tf.estimator.ModeKeys
        params: unused; hyperparameters are read from the module-level PARAMS
    Returns:
        A tf.estimator.EstimatorSpec for the requested mode:
            PREDICT: dense greedy-decoded label ids as predictions
            EVAL:    mean CTC loss plus an 'edit_dist' metric
            TRAIN:   mean CTC loss, Adam train op with clipped gradients and a
                     hook logging the batch edit distance every 100 iterations
    """
    # A frame counts towards the sequence length when its features do not sum
    # to exactly zero, i.e. zero padding is excluded.
    seq_lens = tf.count_nonzero(tf.reduce_sum(features, -1), 1, dtype=tf.int32)

    outputs, _ = tf.nn.dynamic_rnn(rnn_cell(), features, seq_lens, dtype=tf.float32)
    logits = tf.layers.dense(outputs, PARAMS['num_classes'])

    # The CTC ops below are used in their default time-major layout.
    time_major = tf.transpose(logits, [1, 0, 2])
    decoded, _ = tf.nn.ctc_greedy_decoder(time_major, seq_lens)
    decoded = tf.to_int32(decoded[0])

    if mode == tf.estimator.ModeKeys.PREDICT:
        preds = tf.sparse_tensor_to_dense(decoded)
        return tf.estimator.EstimatorSpec(mode, predictions=preds)

    # TRAIN and EVAL both need the loss and the edit distance.
    loss_op = tf.reduce_mean(tf.nn.ctc_loss(labels, time_major, seq_lens))
    edit_dist = tf.edit_distance(decoded, labels)

    if mode == tf.estimator.ModeKeys.EVAL:
        # Previously EVAL fell through and returned None, so
        # estimator.evaluate() could not be used with this model.
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss_op,
            eval_metric_ops={'edit_dist': tf.metrics.mean(edit_dist)})

    edit_dist_op = tf.reduce_mean(edit_dist)

    lth = tf.train.LoggingTensorHook({'edit_dist': edit_dist_op}, every_n_iter=100)

    train_op = tf.train.AdamOptimizer().apply_gradients(
        clip_grads(loss_op), global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss_op, train_op=train_op, training_hooks=[lth])

In [5]:
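# Uncomment on the first run to fetch the recordings and transcripts into ./data/ (needs network access).
# download()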

In [6]:
data_dir = './data'

# Sort for a reproducible order and derive each transcript name from its wav
# file, instead of zipping two independent directory listings whose relative
# order is not guaranteed to line up.
wav_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.wav'))
text_files = [os.path.splitext(f)[0] + '.txt' for f in wav_files]

inputs, targets = [], []
for (wav_file, text_file) in tqdm(zip(wav_files, text_files), total=len(wav_files), ncols=70):
    wav_path = os.path.join(data_dir, wav_file)
    text_path = os.path.join(data_dir, text_file)
    if not os.path.isfile(text_path):
        tqdm.write('Skipping {}: no transcript {}'.format(wav_path, text_path))
        continue
    try:
        fs, audio = wavfile.read(wav_path)
    except Exception as err:
        # Still best-effort (unreadable audio is skipped), but report it and
        # let KeyboardInterrupt / SystemExit propagate.
        tqdm.write('Skipping {}: {}'.format(wav_path, err))
        continue
    features = mfcc(audio, samplerate=fs, nfft=1024)
    with open(text_path) as f:
        transcript = f.read()
    # Append both only after both were read, so the two lists stay aligned.
    inputs.append(features)
    targets.append(transcript)

if not inputs:
    raise RuntimeError('No usable .wav/.txt pairs found in ' + data_dir + '; run download() first')

# Zero-pad every feature sequence at the end up to the longest one.
inputs = tf.keras.preprocessing.sequence.pad_sequences(
    inputs, dtype='float32', padding='post')

# Sorted so the char <-> index mapping is identical from run to run.
chars = sorted(set(c for target in targets for c in target))
# One extra class on top of the characters; tf.nn.ctc_loss reserves the
# largest class index (num_classes - 1) for the blank label.
PARAMS['num_classes'] = len(chars) + 1

idx2char = {idx: char for idx, char in enumerate(chars)}
char2idx = {char: idx for idx, char in idx2char.items()}

targets = [[char2idx[c] for c in target] for target in targets]

# Hold out the last example for the prediction check.
inputs_val = np.expand_dims(inputs[-1], 0)
targets_val = targets[-1]

inputs_train = inputs[:-1]
targets_train = targets[:-1]
targets_train = tf.SparseTensor(*sparse_tuple_from(targets_train))

100%|████████████████████████████| 2800/2800 [00:17<00:00, 158.52it/s]


In [7]:
# No model_dir is given, so the estimator uses its default checkpoint location.
estimator = tf.estimator.Estimator(model_fn=model_fn)

# Train on everything except the held-out example.
estimator.train(input_fn=lambda: train_input_fn(inputs_train, targets_train))

# Decode the held-out example with the trained model.
preds = list(
    estimator.predict(
        input_fn=tf.estimator.inputs.numpy_input_fn(x=inputs_val, shuffle=False)))

print('Prediction:', ''.join(idx2char[i] for i in preds[0]))
print('Actual:', ''.join(idx2char[i] for i in targets_val))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1174f4f98>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1174f4f98>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}






INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 1 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt.


INFO:tensorflow:loss = 462.272, step = 1


INFO:tensorflow:loss = 462.272, step = 1


INFO:tensorflow:edit_dist = 2.9061818


INFO:tensorflow:edit_dist = 2.9061818


INFO:tensorflow:global_step/sec: 5.97126


INFO:tensorflow:global_step/sec: 5.97126


INFO:tensorflow:loss = 34.64298, step = 101 (16.749 sec)


INFO:tensorflow:loss = 34.64298, step = 101 (16.749 sec)


INFO:tensorflow:edit_dist = 0.9379013 (16.748 sec)


INFO:tensorflow:edit_dist = 0.9379013 (16.748 sec)


INFO:tensorflow:global_step/sec: 6.26219


INFO:tensorflow:global_step/sec: 6.26219


INFO:tensorflow:loss = 24.484943, step = 201 (15.968 sec)


INFO:tensorflow:loss = 24.484943, step = 201 (15.968 sec)


INFO:tensorflow:edit_dist = 0.79175484 (15.968 sec)


INFO:tensorflow:edit_dist = 0.79175484 (15.968 sec)


INFO:tensorflow:global_step/sec: 6.32338


INFO:tensorflow:global_step/sec: 6.32338


INFO:tensorflow:loss = 19.556602, step = 301 (15.815 sec)


INFO:tensorflow:loss = 19.556602, step = 301 (15.815 sec)


INFO:tensorflow:edit_dist = 0.5782291 (15.815 sec)


INFO:tensorflow:edit_dist = 0.5782291 (15.815 sec)


INFO:tensorflow:global_step/sec: 6.22828


INFO:tensorflow:global_step/sec: 6.22828


INFO:tensorflow:loss = 15.326058, step = 401 (16.056 sec)


INFO:tensorflow:loss = 15.326058, step = 401 (16.056 sec)


INFO:tensorflow:edit_dist = 0.43989652 (16.056 sec)


INFO:tensorflow:edit_dist = 0.43989652 (16.056 sec)


INFO:tensorflow:global_step/sec: 6.08886


INFO:tensorflow:global_step/sec: 6.08886


INFO:tensorflow:loss = 13.972825, step = 501 (16.424 sec)


INFO:tensorflow:loss = 13.972825, step = 501 (16.424 sec)


INFO:tensorflow:edit_dist = 0.37958091 (16.424 sec)


INFO:tensorflow:edit_dist = 0.37958091 (16.424 sec)


INFO:tensorflow:global_step/sec: 6.206


INFO:tensorflow:global_step/sec: 6.206


INFO:tensorflow:loss = 12.944057, step = 601 (16.113 sec)


INFO:tensorflow:loss = 12.944057, step = 601 (16.113 sec)


INFO:tensorflow:edit_dist = 0.35112664 (16.113 sec)


INFO:tensorflow:edit_dist = 0.35112664 (16.113 sec)


INFO:tensorflow:global_step/sec: 6.06093


INFO:tensorflow:global_step/sec: 6.06093


INFO:tensorflow:loss = 12.1539345, step = 701 (16.499 sec)


INFO:tensorflow:loss = 12.1539345, step = 701 (16.499 sec)


INFO:tensorflow:edit_dist = 0.35554054 (16.499 sec)


INFO:tensorflow:edit_dist = 0.35554054 (16.499 sec)


INFO:tensorflow:global_step/sec: 6.14839


INFO:tensorflow:global_step/sec: 6.14839


INFO:tensorflow:loss = 10.562868, step = 801 (16.264 sec)


INFO:tensorflow:loss = 10.562868, step = 801 (16.264 sec)


INFO:tensorflow:edit_dist = 0.2948148 (16.264 sec)


INFO:tensorflow:edit_dist = 0.2948148 (16.264 sec)


INFO:tensorflow:global_step/sec: 6.23487


INFO:tensorflow:global_step/sec: 6.23487


INFO:tensorflow:loss = 9.983798, step = 901 (16.039 sec)


INFO:tensorflow:loss = 9.983798, step = 901 (16.039 sec)


INFO:tensorflow:edit_dist = 0.2926184 (16.039 sec)


INFO:tensorflow:edit_dist = 0.2926184 (16.039 sec)


INFO:tensorflow:global_step/sec: 5.89253


INFO:tensorflow:global_step/sec: 5.89253


INFO:tensorflow:loss = 9.45628, step = 1001 (16.971 sec)


INFO:tensorflow:loss = 9.45628, step = 1001 (16.971 sec)


INFO:tensorflow:edit_dist = 0.2888466 (16.971 sec)


INFO:tensorflow:edit_dist = 0.2888466 (16.971 sec)


INFO:tensorflow:global_step/sec: 6.25938


INFO:tensorflow:global_step/sec: 6.25938


INFO:tensorflow:loss = 8.397293, step = 1101 (15.976 sec)


INFO:tensorflow:loss = 8.397293, step = 1101 (15.976 sec)


INFO:tensorflow:edit_dist = 0.28138545 (15.976 sec)


INFO:tensorflow:edit_dist = 0.28138545 (15.976 sec)


INFO:tensorflow:global_step/sec: 6.24058


INFO:tensorflow:global_step/sec: 6.24058


INFO:tensorflow:loss = 9.605008, step = 1201 (16.024 sec)


INFO:tensorflow:loss = 9.605008, step = 1201 (16.024 sec)


INFO:tensorflow:edit_dist = 0.29067123 (16.024 sec)


INFO:tensorflow:edit_dist = 0.29067123 (16.024 sec)


INFO:tensorflow:global_step/sec: 6.25844


INFO:tensorflow:global_step/sec: 6.25844


INFO:tensorflow:loss = 8.23425, step = 1301 (15.979 sec)


INFO:tensorflow:loss = 8.23425, step = 1301 (15.979 sec)


INFO:tensorflow:edit_dist = 0.27841777 (15.979 sec)


INFO:tensorflow:edit_dist = 0.27841777 (15.979 sec)


INFO:tensorflow:global_step/sec: 6.32621


INFO:tensorflow:global_step/sec: 6.32621


INFO:tensorflow:loss = 9.565796, step = 1401 (15.807 sec)


INFO:tensorflow:loss = 9.565796, step = 1401 (15.807 sec)


INFO:tensorflow:edit_dist = 0.23114249 (15.807 sec)


INFO:tensorflow:edit_dist = 0.23114249 (15.807 sec)


INFO:tensorflow:global_step/sec: 6.19716


INFO:tensorflow:global_step/sec: 6.19716


INFO:tensorflow:loss = 7.1927075, step = 1501 (16.136 sec)


INFO:tensorflow:loss = 7.1927075, step = 1501 (16.136 sec)


INFO:tensorflow:edit_dist = 0.21827342 (16.136 sec)


INFO:tensorflow:edit_dist = 0.21827342 (16.136 sec)


INFO:tensorflow:global_step/sec: 6.15951


INFO:tensorflow:global_step/sec: 6.15951


INFO:tensorflow:loss = 6.6781616, step = 1601 (16.235 sec)


INFO:tensorflow:loss = 6.6781616, step = 1601 (16.235 sec)


INFO:tensorflow:edit_dist = 0.20288672 (16.236 sec)


INFO:tensorflow:edit_dist = 0.20288672 (16.236 sec)


INFO:tensorflow:global_step/sec: 6.09863


INFO:tensorflow:global_step/sec: 6.09863


INFO:tensorflow:loss = 6.684066, step = 1701 (16.397 sec)


INFO:tensorflow:loss = 6.684066, step = 1701 (16.397 sec)


INFO:tensorflow:edit_dist = 0.19652776 (16.397 sec)


INFO:tensorflow:edit_dist = 0.19652776 (16.397 sec)


INFO:tensorflow:global_step/sec: 5.94291


INFO:tensorflow:global_step/sec: 5.94291


INFO:tensorflow:loss = 6.951854, step = 1801 (16.826 sec)


INFO:tensorflow:loss = 6.951854, step = 1801 (16.826 sec)


INFO:tensorflow:edit_dist = 0.2110975 (16.826 sec)


INFO:tensorflow:edit_dist = 0.2110975 (16.826 sec)


INFO:tensorflow:Saving checkpoints for 1880 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1880 into /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt.


INFO:tensorflow:Loss for final step: 8.75834.


INFO:tensorflow:Loss for final step: 8.75834.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt-1880


INFO:tensorflow:Restoring parameters from /var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpi5d7fx_0/model.ckpt-1880


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


Prediction: say the word 
Actual: say the word youth
