# Lexicon - Orchestrator


## Overview

For this project, I will build a simple custom ochestrator that processes data objects from the "Lexicon" class.
    - These objects are custom datasets that are modeled after the Ted Talk speakers. 
    - Each Lexicon has a corpus and some helper methods aimed at training and prediction
    - Lexicon class will also have a preprocessing and caching function.
    - Each object will have two methods of prediction, n-gram language model and a recurrent neural network model
    - Each object has a custom reporting function that reports the results of training
    - Each object will be able to learn from any text data provided, and return a transcript with confidence values from input posed in speech utterances. 
        - I will use Google's cloud-based services to preprocess the input audio data and transcribe into an initial guess. Then I will train a model to improve on Google cloud speech API's response.


In [1]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import tarfile

librispeech_dataset_folder_path = 'LibriSpeech'
tar_gz_path = 'dev-clean.tar.gz'

books_path = 'original-books.tar.gz'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(books_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech Book Texts') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/original-books.tar.gz',
            books_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path+'/books'):
    with tarfile.open(books_path) as tar:
        tar.extractall()
        tar.close()
        
        
        
if not isfile(tar_gz_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech dev-clean.tar.gz') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/dev-clean.tar.gz',
            tar_gz_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path):
    with tarfile.open(tar_gz_path) as tar:
        tar.extractall()
        tar.close()
        
        
        

In [2]:
# Prepare a plain text corpus from which we train a languague model
import glob
import os
import utils

# Gather all text files from directory
LIBRISPEECH_DIRECTORY = os.path.join(os.getcwd(),'LibriSpeech/')
TEDLIUM_DIRECTORY = os.path.join(os.getcwd(),'TEDLIUM_release1/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))
dev_path = "{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'dev-clean/', '**/', '*.txt*')
train_librispeech_path = "{}{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'books/', 'utf-8/', '**/', '*.txt*')
TED_path = "{}{}{}{}".format(TEDLIUM_DIRECTORY,'train/','**/', '*.stm')

text_paths = sorted(glob.glob(train_librispeech_path, recursive=True))
stm_paths = sorted(glob.glob(TED_path, recursive=True))

print('Found',len(text_paths),"text files in the directories {0} and {1} stm files in directory: {2}:".format(train_librispeech_path, len(stm_paths),TED_path ))

Found 41 text files in the directories /src/lexicon/LibriSpeech/books/utf-8/**/*.txt* and 774 stm files in directory: /src/lexicon/TEDLIUM_release1/train/**/*.stm:


In [3]:
import tensorflow as tf
import re
import codecs
import string
from lexicon import Lexicon
      
corpus_raw = u""
stm_segments = []
speakers = []
lexicons = {} # {speaker_id: lexicon_object}
for book_filename in text_paths:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        lines = book_file.read()
        corpus_raw += lines
        
        
for stm_filename in stm_paths: # Process STM files (Tedlium)
        stm_segments.append(utils.parse_stm_file(stm_filename))

for segments in stm_segments[:5]:
    for segment in segments:
        # If not already exist
        if segment.speaker_id not in lexicons.keys():
            lexicons[segment.speaker_id.strip()] = Lexicon(base_corpus=corpus_raw,
                                                  name=segment.speaker_id)
        else:
            lexicon = lexicons[segment.speaker_id.strip()]
            lexicon.add_speech(segment.transcript)
            
# Print Loading Report for Lexicons
for speaker_id, lexicon in lexicons.items():
    lexicon.print_loading_report()

#Preprocess and Save Data
for speaker_id, lexicon in lexicons.items():
    lexicon.preprocess_and_save()



Lexicon: "AaronHuey_2010X" successfully loaded to memory location: <lexicon.Lexicon object at 0x7f77c60d6438>
Dataset Stats
Number of Unique Words in Base: 195295
Number of Unique Words in Speeches: 528
Number of Speeches: 43
Average number of words in each speech: 32.18604651162791



Lexicon: "AdamSavage_2008P" successfully loaded to memory location: <lexicon.Lexicon object at 0x7f779505a780>
Dataset Stats
Number of Unique Words in Base: 195295
Number of Unique Words in Speeches: 783
Number of Speeches: 68
Average number of words in each speech: 41.338235294117645



Lexicon: "AdamSadowsky_2010X" successfully loaded to memory location: <lexicon.Lexicon object at 0x7f77a42eeeb8>
Dataset Stats
Number of Unique Words in Base: 195295
Number of Unique Words in Speeches: 423
Number of Speeches: 35
Average number of words in each speech: 32.857142857142854



Lexicon: "AdamGrosser_2007" successfully loaded to memory location: <lexicon.Lexicon object at 0x7f77b3586e80>
Dataset Stats
Number 

### Load Preprocessed Data

In [4]:
import helper
import numpy as np

for speaker_id, lexicon in lexicons.items():
    (name,
     base_corpus,
     full_corpus,
     int_text, 
     vocab_to_int, 
     int_to_vocab)  = Lexicon.load_preprocess(speaker_id)
    print("{0} int_text length: {1}".format(name, len(int_text)))

AaronHuey_2010X int_text length: 4822074
AdamSavage_2008P int_text length: 4838688
AdamSadowsky_2010X int_text length: 4814424
AdamGrosser_2007 int_text length: 4799884
AJJacobs_2007P int_text length: 4848184


### Load Speech Objects

In [5]:
from speech import Speech
stm_segments = []

for stm_filename in stm_paths: # Process STM files (Tedlium)
        stm_segments.append(utils.parse_stm_file(stm_filename))        

speakers = []
speeches = {} # {speech_id: speech_object}

for segments in stm_segments[:5]:
    for segment in segments:
        segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
                                          str(segment.stop_time).replace('.','_'))

        speech = None
        # If not already exist
        if segment.speaker_id not in speeches.keys():
            # Connect to Cloud API to get Candidate Transcripts
            source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
            speech = Speech(speaker_id=segment.speaker_id,
                                           speech_id = segment_key,
                                           source_file=source_file,
                                           ground_truth = ' '.join(segment.transcript.split()[:-1]),
                                           start = segment.start_time,
                                           stop = segment.stop_time,
                                           audio_type = 'LINEAR16')
        else:
            speech = speeches[segment.speaker_id.strip()]
            print('Already found speech in list at location: ', speech)
        
        speeches[segment_key] = speech


In [6]:
# Print Loading Report
for speech_id, speech in speeches.items():
    speech.print_loading_report()


Lexicon Speech: "AdamSavage_2008P_305_96_316_46" successfully loaded to memory location: <speech.Speech object at 0x7f7782443160>
Speaker Id: AdamSavage_2008P
Source Filepath: /src/lexicon/TEDLIUM_release1/train/sph/AdamSavage_2008P.sph
Audio Filepath: /src/lexicon/datacache/speech_objects/wav/AdamSavage_2008P_305_96_316_46.wav
Ground Truth Transcript: and(2) again {NOISE} i kept on going(2) {BREATH} looking for(2) more reference <sil> every bit of reference i could find drawings {SMACK} beautiful photos this guy i love this guy he put a dodo leg bones in {NOISE} a scanner with {UH} a ruler <sil>
Timestamps: 305.96-316.46



Lexicon Speech: "AdamSadowsky_2010X_75_22_86_4" successfully loaded to memory location: <speech.Speech object at 0x7f7782443978>
Speaker Id: AdamSadowsky_2010X
Source Filepath: /src/lexicon/TEDLIUM_release1/train/sph/AdamSadowsky_2010X.sph
Audio Filepath: /src/lexicon/datacache/speech_objects/wav/AdamSadowsky_2010X_75_22_86_4.wav
Ground Truth Transcript: we sort o

Source Filepath: /src/lexicon/TEDLIUM_release1/train/sph/AdamSavage_2008P.sph
Audio Filepath: /src/lexicon/datacache/speech_objects/wav/AdamSavage_2008P_524_95_535_15.wav
Ground Truth Transcript: this thing and i can(2) tell you {SMACK} it's real <sil> it's the real thing it's made from the {UH} exact same <sil> mold that(2) this one is {BREATH} in {NOISE} this one because the(2) auction was actually(4) so controversial profiles in history(2) the(2) auction house that(2) sold this <sil>
Timestamps: 524.95-535.15



Lexicon Speech: "AdamSadowsky_2010X_427_99_428_36" successfully loaded to memory location: <speech.Speech object at 0x7f7782443860>
Speaker Id: AdamSadowsky_2010X
Source Filepath: /src/lexicon/TEDLIUM_release1/train/sph/AdamSadowsky_2010X.sph
Audio Filepath: /src/lexicon/datacache/speech_objects/wav/AdamSadowsky_2010X_427_99_428_36.wav
Ground Truth Transcript: the(2) {COUGH} <sil>
Timestamps: 427.99-428.36



Lexicon Speech: "AaronHuey_2010X_508_54_515_01" successfully loade

In [7]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    # TODO: Implement Function
    return (
        tf.placeholder(tf.int32, shape=(None, None), name='input'),
        tf.placeholder(tf.int32, shape=(None, None)),
        tf.placeholder(tf.float32, name='keep_prob'),
    )


In [8]:
def get_init_cell(batch_size, rnn_size):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    cell = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.BasicLSTMCell(rnn_size),
        tf.contrib.rnn.BasicLSTMCell(rnn_size)])

    initial_state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, 'initial_state')

    return cell, initial_state



In [9]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Create embedding for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    return tf.contrib.layers.embed_sequence(input_data, vocab_size, embed_dim)


In [10]:
def build_rnn(cell, inputs):
    """
    Create a RNN using a RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, 'final_state')

    return outputs, final_state




In [11]:
def build_nn(cell, rnn_size, input_data, vocab_size):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :return: Tuple (Logits, FinalState)
    """
    inputs = get_embed(input_data, vocab_size, rnn_size)
    outputs, final_state = build_rnn(cell, inputs)
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, None)

    return logits, final_state



In [12]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: A Numpy array where each item is a tuple of (batch of input, batch of target).
    """
    n_batches = int(len(int_text) / (batch_size * seq_length))

    # Drop the last few characters to make only full batches
    xdata = np.array(int_text[: n_batches * batch_size * seq_length])
    ydata = np.array(int_text[1: n_batches * batch_size * seq_length+1])

    x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)

    return np.array(list(zip(x_batches, y_batches)))

In [13]:
# Number of Epochs
num_epochs = 15
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 512
# Sequence Length
seq_length = 9
# Learning Rate
learning_rate = 0.002
# Show stats for every n number of batches
show_every_n_batches = 100

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save'

In [14]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)+1
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients]
    train_op = optimizer.apply_gradients(capped_gradients)

In [15]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/4208   train_loss = 11.359
Epoch   0 Batch  100/4208   train_loss = 7.247
Epoch   0 Batch  200/4208   train_loss = 7.206
Epoch   0 Batch  300/4208   train_loss = 7.000


KeyboardInterrupt: 

In [None]:

# Save parameters for checkpoint
helper.save_params((seq_length, save_dir))

In [None]:

import tensorflow as tf
import numpy as np
import helper

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

In [None]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # TODO: Implement Function
    return map(
        loaded_graph.get_tensor_by_name,
        ['input:0', 'initial_state:0', 'final_state:0', 'probs:0']
    )


In [None]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # TODO: Implement Function
    return int_to_vocab[probabilities.argmax()]


In [None]:
gen_length = 20
prime_word = 'Better'


loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word]
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        script = script.replace(' ' + token.lower(), key)
    script = script.replace('\n ', '\n')
    script = script.replace('( ', '(')
        
    print(script)

In [None]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [None]:
# Gather all samples, load into dictionary
# Prepare a plain text corpus from which we train a languague model
import glob
import operator

# Gather all text files from directory
WORKING_DIRECTORY = os.path.join(os.getcwd(),'LibriSpeech/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))
dev_path = "{}{}{}{}".format(WORKING_DIRECTORY, 'dev-clean/', '**/', '*.txt')
train_path = "{}{}{}{}{}".format(WORKING_DIRECTORY, 'books/', 'utf-8/', '**/', '*.txt*')

text_paths = sorted(glob.glob(dev_path, recursive=True))
print('Found',len(text_paths),'text files in the directory:', dev_path)

transcripts = {}
for document in text_paths:
    with codecs.open(document, 'r', 'utf-8') as filep:
        for i,line in enumerate(filep):
            transcripts[line.split()[0]] = ' '.join(line.split()[1:])

# Save Dictionary in Pickle File


## Evaluate all samples found ##
cloud_speech_api_accuracy = []
custom_lang_model_accuracy = []
epsilon = 0.000000001
api_weight = 0.85
steps = 0
# Pull In Audio File
for filename, gt_transcript in transcripts.items():
    steps += 1
    dirs = filename.split('-')
    
    audio_filepath = dev_file_name_0 = os.path.join(
    os.getcwd(),
    'LibriSpeech',
    'dev-clean',
    dirs[0],
    dirs[1],
    "{0}.flac".format(filename))
    
    

    # Load the audio into memory
    with io.open(audio_filepath, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        max_alternatives=10,
        profanity_filter=False,
        enable_word_time_offsets=True)

    # Detects speech and words in the audio file
    operation = client.long_running_recognize(config, audio)
    result = operation.result(timeout=90)
    alternatives = result.results[0].alternatives


    # Evaluate API Results for Re-Ranking:
    rerank_results = {}
    for alternative in alternatives:
        sent = alternative.transcript
        
        # Strip punctuation
        translate_table = dict((ord(char), None) for char in string.punctuation)        
        sent = sent.translate(translate_table) # remove punctuations

        words = nltk.tokenize.word_tokenize(sent)
        
        
        # Run Words through Neural Network
        gen_length = len(sent)
        num_chunks = 3
        sent_sections = list(chunks(sent, num_chunks))
        potential_scripts = []
        alternatives_gen_acc = []
        prime_word = ''
        
        for section in sent_sections[:-1]:
            prime_word += section
        
            loaded_graph = tf.Graph()
            with tf.Session(graph=loaded_graph) as sess:
                # Load saved model
                loader = tf.train.import_meta_graph(load_dir + '.meta')
                loader.restore(sess, load_dir)

                # Get Tensors from loaded model
                input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

                # Sentences generation setup
                gen_sentences = [prime_word]
                prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

                # Generate sentences
                for n in range(gen_length):
                    # Dynamic Input
                    dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
                    dyn_seq_length = len(dyn_input[0])

                    # Get Prediction
                    probabilities, prev_state = sess.run(
                        [probs, final_state],
                        {input_text: dyn_input, initial_state: prev_state})

                    pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

                    gen_sentences.append(pred_word)

                # Remove tokens
                gen_script = ' '.join(gen_sentences)
                for key, token in token_dict.items():
                    ending = ' ' if key in ['\n', '(', '"'] else ''
                    gen_script = gen_script.replace(' ' + token.lower(), key)
                gen_script = gen_script.replace('\n ', '\n')
                gen_script = gen_script.replace('( ', '(')

                print()
                print()
                print(gen_script)
                
                # potential_scripts.append(gen_script)
                
                # Find Edit distance between word and potential script that was generated
                alt_ed = nltk.edit_distance(sent.lower(), gen_script.lower())
                alt_upper_bound = max(len(sent),len(gen_script))
                alt_accuracy = (1.0 - alt_ed/alt_upper_bound)
                alternatives_gen_acc.append(alt_accuracy)
        
        

        # Re-rank alternatives using a weighted average of the two scores
        prediction_score = np.mean(alternatives_gen_acc)
        rerank_results[alternative.transcript] = confidence_score


    
    index, value = max(enumerate(list(rerank_results.values())), key=operator.itemgetter(1))
    # Select Corresponding Transcript:
    script=''
    for trnscript, confidence in rerank_results.items():
        if confidence == value:
            script = trnscript
            
    
    
                
    # Compute the Accuracy, based on the Levenshtein Distance (a.k.a. Edit Distance)
    gcs_ed = nltk.edit_distance(alternative.transcript.lower(), gt_transcript.lower())
    gcs_upper_bound = max(len(alternative.transcript),len(gt_transcript))
    gcs_accuracy = (1.0 - gcs_ed/gcs_upper_bound)
    
    clm_ed = nltk.edit_distance(script.lower(), gt_transcript.lower())
    clm_upper_bound = max(len(script),len(gt_transcript))
    clm_accuracy = (1.0 - clm_ed/clm_upper_bound)
    
    cloud_speech_api_accuracy.append(gcs_accuracy)
    custom_lang_model_accuracy.append(clm_accuracy)

    if steps % 10 == 0:
        print("{0} Transcripts Processed.".format(steps))
        print('Average API Accuracy:', np.mean(cloud_speech_api_accuracy))
        print('Average Custom Model Accuracy:', np.mean(custom_lang_model_accuracy))
        print()

        


