# Lexicon - Orchestrator


## Overview

For this project, I will build a simple custom orchestrator that processes data objects from the "Lexicon" class.
    - These objects are custom datasets that are modeled after the Ted Talk speakers. 
    - Each Lexicon has a corpus and some helper methods aimed at training and prediction
    - Lexicon class will also have a preprocessing and caching function.
    - Each object will have two methods of prediction, n-gram language model and a recurrent neural network model
    - Each object has a custom reporting function that reports the results of training
    - Each object will be able to learn from any text data provided, and return a transcript with confidence values for input given as speech utterances. 
        - I will use Google's cloud-based services to preprocess the input audio data and transcribe it into an initial guess. Then I will train a model to improve on the Google Cloud Speech API's response.


In [1]:
## Use to reload modules
from importlib import reload
%reload_ext autoreload
%autoreload 2

In [2]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import tarfile

librispeech_dataset_folder_path = 'LibriSpeech'
tar_gz_path = 'dev-clean.tar.gz'

books_path = 'original-books.tar.gz'

class DLProgress(tqdm):
    """Progress bar whose ``hook`` method is handed to ``urlretrieve`` as its report hook."""

    # Block number reported on the previous ``hook`` call (none seen yet).
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        Advance the bar by the amount transferred since the previous call
        :param block_num: Block number reported by the caller for this call
        :param block_size: Size of a single block
        :param total_size: Total size of the transfer; stored as the bar's total
        """
        self.total = total_size
        blocks_since_last_call = block_num - self.last_block
        self.update(blocks_since_last_call * block_size)
        self.last_block = block_num

# Fetch the archive of original book texts unless it is already on disk.
if not isfile(books_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech Book Texts') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/original-books.tar.gz',
            books_path,
            pbar.hook)

# Unpack the books only once; they are looked up under LibriSpeech/books.
# NOTE(review): extractall() trusts the member paths stored in an archive that
# was fetched over plain HTTP - consider validating members before extracting.
if not isdir(librispeech_dataset_folder_path + '/books'):
    with tarfile.open(books_path) as tar:
        # The with-statement closes the archive, so no explicit close() is needed.
        tar.extractall()


# Fetch the dev-clean archive unless it is already on disk.
if not isfile(tar_gz_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech dev-clean.tar.gz') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/dev-clean.tar.gz',
            tar_gz_path,
            pbar.hook)

# Test for the dev-clean subfolder rather than the dataset root: the books are
# checked for under that same root, so a root-only test is already satisfied
# once the books have been unpacked and dev-clean would never be extracted.
if not isdir(librispeech_dataset_folder_path + '/dev-clean'):
    with tarfile.open(tar_gz_path) as tar:
        tar.extractall()
        
        
        

In [3]:
# Prepare a plain text corpus from which we train a language model
import glob
import os
import utils

# Dataset root folders, resolved against the current working directory
LIBRISPEECH_DIRECTORY = os.path.join(os.getcwd(), 'LibriSpeech/')
TEDLIUM_DIRECTORY = os.path.join(os.getcwd(), 'TEDLIUM_release1/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))

# Recursive glob patterns: segmented dev-clean transcripts, full book texts,
# and the TED-LIUM .stm files
dev_librispeech_path = LIBRISPEECH_DIRECTORY + 'dev-clean/' + '**/' + '*.txt*'
train_librispeech_path = LIBRISPEECH_DIRECTORY + 'books/' + 'utf-8/' + '**/' + '*.txt*'
TED_path = TEDLIUM_DIRECTORY + 'train/' + '**/' + '*.stm'

# Collect the matching files in sorted order
text_paths = glob.glob(train_librispeech_path, recursive=True)
text_paths.sort()
segmented_text_paths = glob.glob(dev_librispeech_path, recursive=True)
segmented_text_paths.sort()
stm_paths = glob.glob(TED_path, recursive=True)
stm_paths.sort()

# Report how many files each pattern matched
summary = "text files in the directories {0}\n{1} segmented text files in the {2} directory and \n{3} stm files in directory: {4}:".format(
    train_librispeech_path,
    len(segmented_text_paths),
    dev_librispeech_path,
    len(stm_paths),
    TED_path,
)
print('Found:', len(text_paths), summary)

Found: 41 text files in the directories /src/lexicon/LibriSpeech/books/utf-8/**/*.txt*
97 segmented text files in the /src/lexicon/LibriSpeech/dev-clean/**/*.txt* directory and 
774 stm files in directory: /src/lexicon/TEDLIUM_release1/train/**/*.stm:


### Build Text Corpuses for Training

In [10]:
import tensorflow as tf
import re
import codecs
import string
from lexicon import Lexicon
from speech import Speech
      
lexicons = {} # {speaker_id: lexicon_object}
speeches = {} # {speech_id: speech_object}
segmented_librispeeches = {}

# Base corpus: the raw text of the first 10 book files, joined once instead of
# growing a string inside the loop.
book_texts = []
for book_filename in text_paths[:10]:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
librispeech_corpus = u"".join(book_texts)

# Parse every STM file (Tedlium); each entry holds the segments of one file
stm_segments = [utils.parse_stm_file(stm_filename) for stm_filename in stm_paths]


# Build Speech and Lexicon objects from two of the parsed STM files (index 15 and 16)
for segments in stm_segments[15:17]:
    for segment in segments:
        # Normalise the id once so that dictionary lookups and stores always
        # use the same key.
        speaker_id = segment.speaker_id.strip()
        segment_key = "{0}_{1}_{2}".format(speaker_id,
                                          str(segment.start_time).replace('.', '_'),
                                          str(segment.stop_time).replace('.', '_'))

        # `speeches` is keyed by segment key, so that is the key to test here;
        # testing the speaker id could never find a stored speech.
        if segment_key not in speeches:
            source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1',
                                       'train', 'sph', '{}.sph'.format(segment.filename))
            speech = Speech(speaker_id=speaker_id,
                            speech_id=segment_key,
                            source_file=source_file,
                            # The last whitespace-separated token of the STM
                            # transcript is dropped - presumably a trailing
                            # marker; TODO confirm against the STM files.
                            ground_truth=' '.join(segment.transcript.split()[:-1]),
                            start=segment.start_time,
                            stop=segment.stop_time,
                            audio_type='LINEAR16')
            speeches[segment_key] = speech
        else:
            speech = speeches[segment_key]
            print('Already found speech in list at location: ', speech)

        # One Lexicon per speaker, created on first sight and reused afterwards
        if speaker_id not in lexicons:
            lexicons[speaker_id] = Lexicon(base_corpus=librispeech_corpus, name=speaker_id)
        lexicon = lexicons[speaker_id]

        # Add Speech to Lexicon
        if speech not in lexicon.speeches:
            lexicon.add_speech(speech)


### Load GCS Transcripts using GCS Wrapper

In [11]:
from gcs_api_wrapper import GCSWrapper
# A single wrapper instance serves every transcription request below.
# NOTE(review): the recorded run of this cell ended with "Request payload size
# exceeds the limit: 10485760 bytes" - presumably at least one audio file is
# too large for a single request; confirm how GCSWrapper submits audio.
gcs = GCSWrapper()
for lexicon in lexicons.values():
    lexicon.preprocess_and_save()
    for speech in lexicon.speeches:
        # Transcribe the speech audio, store the API response on the speech,
        # then preprocess and save the speech itself.
        gcs_result = gcs.transcribe_speech(speech.audio_file)
        speech.populate_gcs_results(gcs_result)
        speech.preprocess_and_save()

InvalidArgumentError: InvalidArgumentError(RPC failed, caused by <_Rendezvous of RPC that terminated with (StatusCode.INVALID_ARGUMENT, Request payload size exceeds the limit: 10485760 bytes.)>)

In [12]:
import numpy as np
# (start, stop) bounds of the sentence slice previewed for every lexicon
view_sentence_range = (0, 10)

for lexicon in lexicons.values():
    preview_start, preview_stop = view_sentence_range

    print('Dataset Stats')
    print('Roughly the number of unique words: {}'.format(lexicon.vocab_size))

    # Number of whitespace-separated words in each corpus sentence
    word_counts = [len(sentence.split()) for sentence in lexicon.corpus_sentences]
    sentence_total = len(lexicon.corpus_sentences)
    print('Number of sentences: {}'.format(sentence_total))
    print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

    # First element of the training set: transcripts
    print()
    print('Transcript sentences {} to {}:'.format(*view_sentence_range))
    transcript_preview = lexicon.training_set[0][preview_start:preview_stop]
    print('\n'.join(transcript_preview))

    # Second element of the training set: ground truth
    print()
    print('Ground Truth sentences {} to {}:'.format(*view_sentence_range))
    ground_truth_preview = lexicon.training_set[1][preview_start:preview_stop]
    print('\n'.join(ground_truth_preview))
    print()

Dataset Stats
Roughly the number of unique words: 58050
Number of sentences: 27600
Average number of words in a sentence: 23.997463768115942

Transcript sentences 0 to 10:


"I shall never be better," said Jane Merrick, sternly
 "The end is not
far off now
"

"Oh, I'm sorry to hear you say that!" said Patsy; "but I hope it is
not true
 Why, here are we four newly found relations all beginning to
get acquainted, and to love one another, and we can't have our little
party broken up, auntie dear
"

"Five of us--five relations," cried Uncle John, coming around the
corner of the hedge
 "Don't I count, Patsy, you rogue? Why you're
looking as bright and as bonny as can be
 I wouldn't be surprised if
you could toddle
"

"Not yet," she answered, cheerfully
 "But I'm doing finely, Uncle
John, and it won't be long before I can get about as well as ever
"

"And to think," said Aunt Jane, bitterly, "that all this trouble was
caused by that miserable boy! If I knew where to send him he'd not
stay at

In [None]:
### Preprocess Dataset - Tokenize Corpus

In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
import re
import codecs
import string

# word_tokenize below needs nltk; import it here so this cell does not depend
# on a later cell having been run first.
import nltk

# English stop words (all lower-case)
stoplist = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character
translate_table = {ord(char): None for char in string.punctuation}

# Read every book in unicode via codecs, strip punctuation, and join the
# pieces once instead of growing a string inside the loop.
book_texts = []
for book_filename in text_paths:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read().translate(translate_table))
corpus_raw = u"".join(book_texts)


# Tokenize
tokenized_words = nltk.tokenize.word_tokenize(corpus_raw)

## Clean the tokens ##
# Lower-case first: the stop list is lower-case, so filtering before
# lower-casing would let capitalised stop words ("The", "And", ...) through.
tokenized_words = [word.lower() for word in tokenized_words]

# Drop stop words, single-character tokens (mostly punctuation) and numbers
tokenized_words = [
    word for word in tokenized_words
    if word not in stoplist and len(word) > 1 and not word.isnumeric()
]

In [None]:
### Preprocess Dataset - Extract N-Gram Model

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.collocations import *
from nltk.probability import FreqDist
import nltk

# Collocation finder over the cleaned tokens (the frequency filter stays disabled)
finder = BigramCollocationFinder.from_words(tokenized_words)
# finder.apply_freq_filter(3)

# Every pair of consecutive tokens, ordered by the second word of the pair in
# descending order (this is not a frequency ordering).
bigram_model = list(nltk.bigrams(tokenized_words))
bigram_model.sort(key=lambda pair: pair[1], reverse=True)
# print(bigram_model)

# Three empty lines of output, then persist the bigram list to disk
print('\n\n')
np.save("lang_model.npy", bigram_model)

In [None]:
# Count how often each bigram occurs
fdist = nltk.FreqDist(bigram_model)

# Print the 50 most common bigrams together with their counts
print("Word|Freq:")
for bigram_and_count in fdist.most_common(50):
    print(u'{}|{}'.format(*bigram_and_count))

In [None]:
# Conditional frequency distribution: first word of a bigram -> counts of the
# words that follow it
cfreq_2gram = nltk.ConditionalFreqDist(bigram_model)
# print('Conditional Frequency Conditions:\n', cfreq_2gram)
print()

# Look up the FreqDist associated with "greater" and list its keys
followers_of_greater = cfreq_2gram["greater"].keys()
print("Listing the words that can follow after 'greater':\n", followers_of_greater)
print()

# The 20 most common followers of "greater"
top_followers_of_greater = cfreq_2gram["greater"].most_common(20)
print("Listing 20 most frequent words to come after 'greater':\n", top_followers_of_greater)

In [None]:
# Re-rank the speech API alternatives of every audio file with the bigram model:
#   1. score each alternative by summing, for every word, the conditional
#      frequency of the word that follows it,
#   2. blend that score with the API confidence,
#   3. compare the API's first alternative and the re-ranked winner against
#      the ground truth.
import operator


def _token_set(text):
    """Return the set of lower-cased word tokens found in text."""
    return set(nltk.tokenize.word_tokenize(text.lower()))


# Initial score of every word position; only the last word of an alternative
# keeps it, because it has no following word to look up.
epsilon = 0.0000001

# Share of the final score taken from the API confidence; the rest comes from
# the bigram score.
api_weight = 0.90

# The recognition settings are the same for every file, so build them once.
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=16000,
    language_code='en-US',
    max_alternatives=10,
    profanity_filter=False,
    enable_word_time_offsets=True)

for audio_path, ground_truth in audio_files.items():
    # Loads the audio into memory
    with io.open(audio_path, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    # Detects speech and words in the audio file
    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    result = operation.result(timeout=90)

    # Nothing to re-rank when the API returned no transcription at all
    if not result.results or not result.results[0].alternatives:
        print("No transcription returned for '{0}'; skipping.".format(audio_path))
        continue

    alternatives = result.results[0].alternatives
    # The API's own answer is the first alternative. It is captured here
    # because the loop variable of the scoring loop below ends up pointing at
    # the LAST alternative.
    original = alternatives[0]

    #print("API Results: ", alternatives)
    print('\n')

    rerank_results = {}
    for alternative in alternatives:
        # The bigram model is built from lower-cased tokens, so look words up
        # in lower case as well.
        words = nltk.tokenize.word_tokenize(alternative.transcript.lower())
        probs = np.full(len(words), epsilon, dtype=np.float32)

        # Walk the consecutive word pairs by position; looking positions up
        # with list.index() would always hit the first occurrence of a
        # repeated word.
        for position, (word, next_word) in enumerate(zip(words, words[1:])):
            probs[position] = cfreq_2gram[word].freq(next_word)

        lexicon_score = np.sum(probs)

        # Re-rank alternatives using a weighted average of the two scores
        # NOTE(review): the API may report a confidence only for its first
        # alternative - confirm that the other alternatives carry usable values.
        confidence_score = alternative.confidence*api_weight + lexicon_score*(1-api_weight)
        rerank_results[alternative.transcript] = confidence_score

    print("RE-RANKED Results: \n", rerank_results)
    print('\n')

    # Best-scoring transcript; on a tie the earlier alternative wins
    script, value = max(rerank_results.items(), key=operator.itemgetter(1))

    # Evaluate the differences between the Original and the Reranked transcript:
    print("ORIGINAL Transcript: \n'{0}' \nwith a confidence_score of: {1}".format(original.transcript, original.confidence))

    print('\n')
    print("RE-RANKED Transcript: \n'{0}' \nwith a confidence_score of: {1}".format(script, value))

    print('\n')
    print("GROUND TRUTH TRANSCRIPT: \n{0}".format(ground_truth))
    print()

    original_tokens = _token_set(original.transcript)
    script_tokens = _token_set(script)
    ground_truth_tokens = _token_set(ground_truth)

    # Decide on the transcripts themselves: a one-sided set difference can be
    # empty even though the two transcripts differ.
    if script == original.transcript:
        print("No reranking was performed. The transcripts match!")
    else:
        print("The original transcript was RE-RANKED. The transcripts do not match!")
        print("Differences between original and re-ranked: ", list(original_tokens - script_tokens))
    print('\n')

    # Evaluate Differences between the Original and Ground Truth:
    # (words of the transcript that do not occur in the ground truth)
    gt_orig_diff = list(original_tokens - ground_truth_tokens)
    if len(gt_orig_diff) == 0:
        print("The ORIGINAL transcript matches ground truth!")
    else:
        print("The original transcript DOES NOT MATCH ground truth.")
        print("Differences between original and ground truth: ", gt_orig_diff)
    print('\n')

    gt_rr_diff = list(script_tokens - ground_truth_tokens)
    if len(gt_rr_diff) == 0:
        print("The RE-RANKED transcript matches ground truth!")
    else:
        print("The RE_RANKED transcript DOES NOT MATCH ground truth.")
        print("Differences between Reranked and ground truth: ", gt_rr_diff)
    print('\n')

    print('\n')

    # Compute the Levenshtein Distance (a.k.a. Edit Distance)

    # Google API Edit Distance
    goog_edit_distance = nltk.edit_distance(original.transcript.lower(), ground_truth.lower())

    # Re-Ranked Edit Distance
    rr_edit_distance = nltk.edit_distance(script.lower(), ground_truth.lower())

    print("ORIGINAL Edit Distance: \n{0}".format(goog_edit_distance))
    print("RE-RANKED Edit Distance: \n{0}".format(rr_edit_distance))
    print('\n')
    

In [None]:
# Gather all samples, load into dictionary
# Prepare a plain text corpus from which we train a language model
import glob
import operator

# Gather all text files from directory
WORKING_DIRECTORY = os.path.join(os.getcwd(),'LibriSpeech/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))
dev_path = "{}{}{}{}".format(WORKING_DIRECTORY, 'dev-clean/', '**/', '*.txt')
train_path = "{}{}{}{}{}".format(WORKING_DIRECTORY, 'books/', 'utf-8/', '**/', '*.txt*')

text_paths = sorted(glob.glob(dev_path, recursive=True))
print('Found',len(text_paths),'text files in the directory:', dev_path)

# Each transcript line is "<utterance id> <words ...>"; map the id to its text
transcripts = {}
for document in text_paths:
    with codecs.open(document, 'r', 'utf-8') as filep:
        for line in filep:
            fields = line.split()
            if not fields:
                # A blank line has no utterance id to index by
                continue
            transcripts[fields[0]] = ' '.join(fields[1:])

# Save Dictionary in Pickle File


def _edit_accuracy(hypothesis, reference):
    """
    Accuracy of a transcript derived from the Levenshtein distance
    :param hypothesis: Transcript to score
    :param reference: Ground-truth transcript
    :return: 1 - distance / length of the longer string; 1.0 when both are empty
    """
    upper_bound = max(len(hypothesis), len(reference))
    if upper_bound == 0:
        return 1.0
    return 1.0 - nltk.edit_distance(hypothesis.lower(), reference.lower())/upper_bound


## Evaluate all samples found ##
cloud_speech_api_accuracy = []
custom_lang_model_accuracy = []
# Initial score of every word position; only the last word of an alternative
# keeps it, because it has no following word to look up.
epsilon = 0.000000001
# Share of the final score taken from the API confidence
api_weight = 0.85
steps = 0

# Built once: deletes every ASCII punctuation character
translate_table = {ord(char): None for char in string.punctuation}

# The recognition settings are the same for every file, so build them once.
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=16000,
    language_code='en-US',
    max_alternatives=10,
    profanity_filter=False,
    enable_word_time_offsets=True)

# Pull In Audio File
for filename, gt_transcript in transcripts.items():
    steps += 1
    # The first two dash-separated parts of the utterance id name the two
    # folder levels that hold the audio file.
    dirs = filename.split('-')

    audio_filepath = dev_file_name_0 = os.path.join(
        os.getcwd(),
        'LibriSpeech',
        'dev-clean',
        dirs[0],
        dirs[1],
        "{0}.flac".format(filename))

    # Load the audio into memory
    with io.open(audio_filepath, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    # Detects speech and words in the audio file
    operation = client.long_running_recognize(config, audio)
    result = operation.result(timeout=90)

    # Nothing to evaluate when the API returned no transcription at all
    if not result.results or not result.results[0].alternatives:
        print("No transcription returned for '{0}'; skipping.".format(filename))
        continue

    alternatives = result.results[0].alternatives
    # The API's own answer is the first alternative. It is captured here
    # because the loop variable of the scoring loop below ends up pointing at
    # the LAST alternative.
    original = alternatives[0]

    # Evaluate API Results for Re-Ranking:
    rerank_results = {}
    for alternative in alternatives:
        # Strip punctuation and lower-case: the bigram model is built from
        # lower-cased, punctuation-free tokens.
        sent = alternative.transcript.translate(translate_table).lower()

        words = nltk.tokenize.word_tokenize(sent)
        probs = np.full(len(words), epsilon, dtype=np.float32)

        # Walk the consecutive word pairs by position; looking positions up
        # with list.index() would always hit the first occurrence of a
        # repeated word.
        for position, (word, next_word) in enumerate(zip(words, words[1:])):
            probs[position] = cfreq_2gram[word].freq(next_word)

        lexicon_score = np.sum(probs)

        # Re-rank alternatives using a weighted average of the two scores
        confidence_score = alternative.confidence*api_weight + lexicon_score*(1-api_weight)
        rerank_results[alternative.transcript] = confidence_score

    # Best-scoring transcript; on a tie the earlier alternative wins
    script, value = max(rerank_results.items(), key=operator.itemgetter(1))

    # Compute the Accuracy, based on the Levenshtein Distance (a.k.a. Edit Distance)
    gcs_accuracy = _edit_accuracy(original.transcript, gt_transcript)
    clm_accuracy = _edit_accuracy(script, gt_transcript)

    cloud_speech_api_accuracy.append(gcs_accuracy)
    custom_lang_model_accuracy.append(clm_accuracy)

    if steps % 100 == 0:
        print("{0} Transcripts Processed.".format(steps))
        print('Average API Accuracy:', np.mean(cloud_speech_api_accuracy))
        print('Average Custom Model Accuracy:', np.mean(custom_lang_model_accuracy))
        print()


In [None]:
# Call optimize() on every speaker's lexicon in turn (the recorded output
# below suggests this is the training run).
for lexicon in lexicons.values():
    lexicon.optimize()

Epoch   0 Batch  100/2536 - Train Accuracy: 0.5909, Validation Accuracy: 0.6339, Loss: 7.2264
Epoch   0 Batch  200/2536 - Train Accuracy: 0.7076, Validation Accuracy: 0.6339, Loss: 4.1828
Epoch   0 Batch  300/2536 - Train Accuracy: 0.6942, Validation Accuracy: 0.6339, Loss: 3.0281
Epoch   0 Batch  400/2536 - Train Accuracy: 0.6309, Validation Accuracy: 0.6339, Loss: 3.5545
Epoch   0 Batch  500/2536 - Train Accuracy: 0.5312, Validation Accuracy: 0.6339, Loss: 4.1028
Epoch   0 Batch  600/2536 - Train Accuracy: 0.5458, Validation Accuracy: 0.6339, Loss: 3.8562
Epoch   0 Batch  700/2536 - Train Accuracy: 0.5481, Validation Accuracy: 0.6339, Loss: 3.7842
Epoch   0 Batch  800/2536 - Train Accuracy: 0.5264, Validation Accuracy: 0.6339, Loss: 3.8614
Epoch   0 Batch  900/2536 - Train Accuracy: 0.4085, Validation Accuracy: 0.6339, Loss: 4.4452
Epoch   0 Batch 1000/2536 - Train Accuracy: 0.4813, Validation Accuracy: 0.6339, Loss: 3.8994
Epoch   0 Batch 1100/2536 - Train Accuracy: 0.3304, Validati

Epoch   3 Batch 1400/2536 - Train Accuracy: 0.4813, Validation Accuracy: 0.6339, Loss: 3.5463
Epoch   3 Batch 1500/2536 - Train Accuracy: 0.6298, Validation Accuracy: 0.6339, Loss: 2.5367
Epoch   3 Batch 1600/2536 - Train Accuracy: 0.5417, Validation Accuracy: 0.6339, Loss: 3.0918
Epoch   3 Batch 1700/2536 - Train Accuracy: 0.6193, Validation Accuracy: 0.6339, Loss: 2.5742
Epoch   3 Batch 1800/2536 - Train Accuracy: 0.7330, Validation Accuracy: 0.6339, Loss: 1.8495
Epoch   3 Batch 1900/2536 - Train Accuracy: 0.5547, Validation Accuracy: 0.6339, Loss: 2.9556
Epoch   3 Batch 2000/2536 - Train Accuracy: 0.5409, Validation Accuracy: 0.6339, Loss: 2.9973
Epoch   3 Batch 2100/2536 - Train Accuracy: 0.4323, Validation Accuracy: 0.6339, Loss: 3.8470
Epoch   3 Batch 2200/2536 - Train Accuracy: 0.5792, Validation Accuracy: 0.6339, Loss: 2.8359
Epoch   3 Batch 2300/2536 - Train Accuracy: 0.3630, Validation Accuracy: 0.6339, Loss: 4.5660
Epoch   3 Batch 2400/2536 - Train Accuracy: 0.4423, Validati

Epoch   7 Batch  200/2536 - Train Accuracy: 0.7366, Validation Accuracy: 0.6339, Loss: 1.8106
Epoch   7 Batch  300/2536 - Train Accuracy: 0.7299, Validation Accuracy: 0.6339, Loss: 1.6975
Epoch   7 Batch  400/2536 - Train Accuracy: 0.6641, Validation Accuracy: 0.6339, Loss: 2.2496
Epoch   7 Batch  500/2536 - Train Accuracy: 0.5646, Validation Accuracy: 0.6339, Loss: 2.7763
Epoch   7 Batch  600/2536 - Train Accuracy: 0.5854, Validation Accuracy: 0.6339, Loss: 2.6570
Epoch   7 Batch  700/2536 - Train Accuracy: 0.5745, Validation Accuracy: 0.6339, Loss: 2.6655
Epoch   7 Batch  800/2536 - Train Accuracy: 0.5721, Validation Accuracy: 0.6339, Loss: 2.8022
Epoch   7 Batch  900/2536 - Train Accuracy: 0.4933, Validation Accuracy: 0.6339, Loss: 3.0163
Epoch   7 Batch 1000/2536 - Train Accuracy: 0.5250, Validation Accuracy: 0.6339, Loss: 2.7557
Epoch   7 Batch 1100/2536 - Train Accuracy: 0.3571, Validation Accuracy: 0.6339, Loss: 3.7385
Epoch   7 Batch 1200/2536 - Train Accuracy: 0.4707, Validati

Epoch  10 Batch 1500/2536 - Train Accuracy: 0.6683, Validation Accuracy: 0.6339, Loss: 1.8563
Epoch  10 Batch 1600/2536 - Train Accuracy: 0.5703, Validation Accuracy: 0.6339, Loss: 2.2918
Epoch  10 Batch 1700/2536 - Train Accuracy: 0.7074, Validation Accuracy: 0.6339, Loss: 1.8822
Epoch  10 Batch 1800/2536 - Train Accuracy: 0.7585, Validation Accuracy: 0.6339, Loss: 1.3827
Epoch  10 Batch 1900/2536 - Train Accuracy: 0.5990, Validation Accuracy: 0.6339, Loss: 2.1194
Epoch  10 Batch 2000/2536 - Train Accuracy: 0.5793, Validation Accuracy: 0.6339, Loss: 2.2011
Epoch  10 Batch 2100/2536 - Train Accuracy: 0.5182, Validation Accuracy: 0.6339, Loss: 2.3757
Epoch  10 Batch 2200/2536 - Train Accuracy: 0.6208, Validation Accuracy: 0.6339, Loss: 2.1302
Epoch  10 Batch 2300/2536 - Train Accuracy: 0.3966, Validation Accuracy: 0.6339, Loss: 3.6082
Epoch  10 Batch 2400/2536 - Train Accuracy: 0.5216, Validation Accuracy: 0.6339, Loss: 2.6274
Epoch  10 Batch 2500/2536 - Train Accuracy: 0.4279, Validati

Epoch  14 Batch  300/2536 - Train Accuracy: 0.7545, Validation Accuracy: 0.6339, Loss: 1.3022
Epoch  14 Batch  400/2536 - Train Accuracy: 0.6797, Validation Accuracy: 0.6339, Loss: 1.7769
Epoch  14 Batch  500/2536 - Train Accuracy: 0.5896, Validation Accuracy: 0.6339, Loss: 2.1995
Epoch  14 Batch  600/2536 - Train Accuracy: 0.6208, Validation Accuracy: 0.6339, Loss: 2.1051
Epoch  14 Batch  700/2536 - Train Accuracy: 0.6130, Validation Accuracy: 0.6339, Loss: 2.1230
Epoch  14 Batch  800/2536 - Train Accuracy: 0.6226, Validation Accuracy: 0.6339, Loss: 2.2123
Epoch  14 Batch  900/2536 - Train Accuracy: 0.5156, Validation Accuracy: 0.6339, Loss: 2.3622
Epoch  14 Batch 1000/2536 - Train Accuracy: 0.5792, Validation Accuracy: 0.6339, Loss: 2.2242
Epoch  14 Batch 1100/2536 - Train Accuracy: 0.4062, Validation Accuracy: 0.6339, Loss: 2.9828
Epoch  14 Batch 1200/2536 - Train Accuracy: 0.5410, Validation Accuracy: 0.6339, Loss: 2.1906
Epoch  14 Batch 1300/2536 - Train Accuracy: 0.7548, Validati

Epoch  17 Batch 1600/2536 - Train Accuracy: 0.6458, Validation Accuracy: 0.6339, Loss: 1.8010
Epoch  17 Batch 1700/2536 - Train Accuracy: 0.7415, Validation Accuracy: 0.6339, Loss: 1.4387
Epoch  17 Batch 1800/2536 - Train Accuracy: 0.7756, Validation Accuracy: 0.6339, Loss: 1.0642
Epoch  17 Batch 1900/2536 - Train Accuracy: 0.6719, Validation Accuracy: 0.6339, Loss: 1.6142
Epoch  17 Batch 2000/2536 - Train Accuracy: 0.6106, Validation Accuracy: 0.6339, Loss: 1.6985
Epoch  17 Batch 2100/2536 - Train Accuracy: 0.6224, Validation Accuracy: 0.6339, Loss: 1.4169
Epoch  17 Batch 2200/2536 - Train Accuracy: 0.6542, Validation Accuracy: 0.6339, Loss: 1.6999
Epoch  17 Batch 2300/2536 - Train Accuracy: 0.4279, Validation Accuracy: 0.6339, Loss: 2.9558
Epoch  17 Batch 2400/2536 - Train Accuracy: 0.5817, Validation Accuracy: 0.6339, Loss: 2.1278
Epoch  17 Batch 2500/2536 - Train Accuracy: 0.4832, Validation Accuracy: 0.6339, Loss: 2.6491
Epoch  18 Batch  100/2536 - Train Accuracy: 0.7188, Validati

Epoch  21 Batch  400/2536 - Train Accuracy: 0.7168, Validation Accuracy: 0.6339, Loss: 1.4231
Epoch  21 Batch  500/2536 - Train Accuracy: 0.6417, Validation Accuracy: 0.6339, Loss: 1.7804
Epoch  21 Batch  600/2536 - Train Accuracy: 0.6604, Validation Accuracy: 0.6339, Loss: 1.6989
Epoch  21 Batch  700/2536 - Train Accuracy: 0.6587, Validation Accuracy: 0.6339, Loss: 1.7454
Epoch  21 Batch  800/2536 - Train Accuracy: 0.6250, Validation Accuracy: 0.6339, Loss: 1.8436
Epoch  21 Batch  900/2536 - Train Accuracy: 0.5737, Validation Accuracy: 0.6339, Loss: 1.9262
Epoch  21 Batch 1000/2536 - Train Accuracy: 0.6208, Validation Accuracy: 0.6339, Loss: 1.7957
Epoch  21 Batch 1100/2536 - Train Accuracy: 0.4621, Validation Accuracy: 0.6339, Loss: 2.4932
Epoch  21 Batch 1200/2536 - Train Accuracy: 0.5586, Validation Accuracy: 0.6339, Loss: 1.7691
Epoch  21 Batch 1300/2536 - Train Accuracy: 0.8197, Validation Accuracy: 0.6339, Loss: 0.7611
Epoch  21 Batch 1400/2536 - Train Accuracy: 0.5875, Validati

Epoch  24 Batch 1700/2536 - Train Accuracy: 0.7670, Validation Accuracy: 0.6339, Loss: 1.1031
Epoch  24 Batch 1800/2536 - Train Accuracy: 0.8324, Validation Accuracy: 0.6339, Loss: 0.8156
Epoch  24 Batch 1900/2536 - Train Accuracy: 0.6979, Validation Accuracy: 0.6339, Loss: 1.2803
Epoch  24 Batch 2000/2536 - Train Accuracy: 0.6707, Validation Accuracy: 0.6339, Loss: 1.3598
Epoch  24 Batch 2100/2536 - Train Accuracy: 0.7500, Validation Accuracy: 0.6339, Loss: 0.7617
Epoch  24 Batch 2200/2536 - Train Accuracy: 0.7167, Validation Accuracy: 0.6339, Loss: 1.3838
Epoch  24 Batch 2300/2536 - Train Accuracy: 0.4856, Validation Accuracy: 0.6339, Loss: 2.4284
Epoch  24 Batch 2400/2536 - Train Accuracy: 0.6298, Validation Accuracy: 0.6339, Loss: 1.7417
Epoch  24 Batch 2500/2536 - Train Accuracy: 0.5457, Validation Accuracy: 0.6339, Loss: 2.1797
Epoch  25 Batch  100/2536 - Train Accuracy: 0.7784, Validation Accuracy: 0.6339, Loss: 1.0521
Epoch  25 Batch  200/2536 - Train Accuracy: 0.7746, Validati

Epoch  28 Batch  500/2536 - Train Accuracy: 0.6500, Validation Accuracy: 0.6339, Loss: 1.4608
Epoch  28 Batch  600/2536 - Train Accuracy: 0.6667, Validation Accuracy: 0.6339, Loss: 1.3757
Epoch  28 Batch  700/2536 - Train Accuracy: 0.6947, Validation Accuracy: 0.6339, Loss: 1.4225
Epoch  28 Batch  800/2536 - Train Accuracy: 0.6755, Validation Accuracy: 0.6339, Loss: 1.5307
Epoch  28 Batch  900/2536 - Train Accuracy: 0.6161, Validation Accuracy: 0.6339, Loss: 1.5863
Epoch  28 Batch 1000/2536 - Train Accuracy: 0.6312, Validation Accuracy: 0.6339, Loss: 1.5031
Epoch  28 Batch 1100/2536 - Train Accuracy: 0.5268, Validation Accuracy: 0.6339, Loss: 2.0889
Epoch  28 Batch 1200/2536 - Train Accuracy: 0.6035, Validation Accuracy: 0.6339, Loss: 1.4482
Epoch  28 Batch 1300/2536 - Train Accuracy: 0.8966, Validation Accuracy: 0.6339, Loss: 0.4484
Epoch  28 Batch 1400/2536 - Train Accuracy: 0.6146, Validation Accuracy: 0.6339, Loss: 1.6501
Epoch  28 Batch 1500/2536 - Train Accuracy: 0.7404, Validati

Epoch  31 Batch 1800/2536 - Train Accuracy: 0.8580, Validation Accuracy: 0.6339, Loss: 0.6044
Epoch  31 Batch 1900/2536 - Train Accuracy: 0.7500, Validation Accuracy: 0.6339, Loss: 1.0021
Epoch  31 Batch 2000/2536 - Train Accuracy: 0.7236, Validation Accuracy: 0.6339, Loss: 1.0635
Epoch  31 Batch 2100/2536 - Train Accuracy: 0.8359, Validation Accuracy: 0.6339, Loss: 0.4024
Epoch  31 Batch 2200/2536 - Train Accuracy: 0.7458, Validation Accuracy: 0.6339, Loss: 1.1253
Epoch  31 Batch 2300/2536 - Train Accuracy: 0.5409, Validation Accuracy: 0.6339, Loss: 1.9949
Epoch  31 Batch 2400/2536 - Train Accuracy: 0.6538, Validation Accuracy: 0.6339, Loss: 1.4041
Epoch  31 Batch 2500/2536 - Train Accuracy: 0.5721, Validation Accuracy: 0.6339, Loss: 1.8452
Epoch  32 Batch  100/2536 - Train Accuracy: 0.8125, Validation Accuracy: 0.6339, Loss: 0.7741
Epoch  32 Batch  200/2536 - Train Accuracy: 0.7969, Validation Accuracy: 0.6339, Loss: 0.8517
Epoch  32 Batch  300/2536 - Train Accuracy: 0.8237, Validati

Epoch  35 Batch  600/2536 - Train Accuracy: 0.6771, Validation Accuracy: 0.6339, Loss: 1.1155
Epoch  35 Batch  700/2536 - Train Accuracy: 0.7380, Validation Accuracy: 0.6339, Loss: 1.1605
Epoch  35 Batch  800/2536 - Train Accuracy: 0.7091, Validation Accuracy: 0.6339, Loss: 1.2464
Epoch  35 Batch  900/2536 - Train Accuracy: 0.6473, Validation Accuracy: 0.6339, Loss: 1.3441
Epoch  35 Batch 1000/2536 - Train Accuracy: 0.6562, Validation Accuracy: 0.6339, Loss: 1.2604
Epoch  35 Batch 1100/2536 - Train Accuracy: 0.5335, Validation Accuracy: 0.6339, Loss: 1.7443
Epoch  35 Batch 1200/2536 - Train Accuracy: 0.6602, Validation Accuracy: 0.6339, Loss: 1.1623
Epoch  35 Batch 1300/2536 - Train Accuracy: 0.9159, Validation Accuracy: 0.6339, Loss: 0.2598
Epoch  35 Batch 1400/2536 - Train Accuracy: 0.6312, Validation Accuracy: 0.6339, Loss: 1.3730
Epoch  35 Batch 1500/2536 - Train Accuracy: 0.8101, Validation Accuracy: 0.6339, Loss: 0.7698
Epoch  35 Batch 1600/2536 - Train Accuracy: 0.7526, Validati

Epoch  40 Batch  500/2536 - Train Accuracy: 0.7063, Validation Accuracy: 0.6362, Loss: 1.0474
Epoch  40 Batch  600/2536 - Train Accuracy: 0.7312, Validation Accuracy: 0.6362, Loss: 0.9738
Epoch  40 Batch  700/2536 - Train Accuracy: 0.7404, Validation Accuracy: 0.6362, Loss: 0.9923
Epoch  40 Batch  800/2536 - Train Accuracy: 0.7139, Validation Accuracy: 0.6362, Loss: 1.0878
Epoch  40 Batch  900/2536 - Train Accuracy: 0.6763, Validation Accuracy: 0.6362, Loss: 1.1523
Epoch  40 Batch 1000/2536 - Train Accuracy: 0.6833, Validation Accuracy: 0.6362, Loss: 1.1047
Epoch  40 Batch 1100/2536 - Train Accuracy: 0.5915, Validation Accuracy: 0.6339, Loss: 1.5424
Epoch  40 Batch 1200/2536 - Train Accuracy: 0.6953, Validation Accuracy: 0.6339, Loss: 1.0213
Epoch  40 Batch 1300/2536 - Train Accuracy: 0.9567, Validation Accuracy: 0.6339, Loss: 0.1738
Epoch  40 Batch 1400/2536 - Train Accuracy: 0.6604, Validation Accuracy: 0.6339, Loss: 1.1958
Epoch  40 Batch 1500/2536 - Train Accuracy: 0.8317, Validati

In [None]:
# Save parameters for checkpoint
# NOTE(review): neither `helper` nor `save_path` is defined in the cells
# visible before this one (`helper` is imported only in the following cell) -
# confirm both are in scope when this cell runs.
helper.save_params(save_path)

In [None]:
import tensorflow as tf
import numpy as np
import helper

# Reload the preprocessed data. Only the vocabulary lookup tables are kept
# (word -> id and id -> word, for both source and target); the first element
# of the returned tuple is discarded.
_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()
# Value returned by helper.load_params() (presumably the checkpoint path stored
# earlier with helper.save_params - confirm in helper.py).
load_path = helper.load_params()

In [None]:
def sentence_to_seq(sentence, vocab_to_int):
    """
    Map a sentence onto the ids of its words.

    The sentence is lowercased and split on whitespace. Every word that is
    missing from the vocabulary is replaced by the id of the '<UNK>' token.
    :param sentence: String
    :param vocab_to_int: Dictionary mapping each known word to its id
    :return: List of word ids, one per whitespace-separated word
    """
    tokens = sentence.lower().split()

    # '<UNK>' is looked up only when an out-of-vocabulary word is actually met,
    # so a vocabulary without that entry still works for fully known sentences.
    return [
        vocab_to_int[token] if token in vocab_to_int else vocab_to_int['<UNK>']
        for token in tokens
    ]


### Evaluate Model

In [None]:
import operator
import os

import nltk


def _char_accuracy(hypothesis, reference):
    """
    Case-insensitive, character-level similarity of two strings.
    :param hypothesis: String being scored
    :param reference: String it is compared against
    :return: 1 - edit_distance / length of the longer string (1.0 when both are empty)
    """
    hypothesis, reference = hypothesis.lower(), reference.lower()
    longest = max(len(hypothesis), len(reference))
    if longest == 0:
        # Two empty strings are identical; this also avoids a division by zero.
        return 1.0
    return 1.0 - nltk.edit_distance(hypothesis, reference) / longest


token_dict = token_lookup()
steps = 0
show_results = 1000

# Accumulated over every speech of every speaker. They must live outside the
# loops, otherwise the averages printed at the end only reflect the last speech.
cloud_speech_api_accuracy = []
custom_lang_model_accuracy = []

for speaker_id, lexicon in lexicons.items():
    checkpoint_dir = os.path.join(lexicon.cache_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model.ckpt')

    # One fresh graph/session per lexicon: importing several meta graphs into the
    # same graph makes TensorFlow rename the clashing ops, so 'input:0' etc. would
    # keep pointing at the first model that was loaded.
    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load saved model. tf.train.Saver stores the meta graph next to the
        # checkpoint as '<checkpoint prefix>.meta'.
        loader = tf.train.import_meta_graph(checkpoint_path + '.meta')
        loader.restore(sess, checkpoint_path)

        input_data = loaded_graph.get_tensor_by_name('input:0')
        # NOTE: despite the variable name this is the 'predictions' tensor; its
        # values are used below as word ids.
        logits = loaded_graph.get_tensor_by_name('predictions:0')
        target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
        source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
        keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

        for speech in lexicon.speeches:
            # Speeches without a Google API result cannot be scored.
            if not speech.candidate_timestamps:
                continue

            # Apply the same punctuation -> token substitution to the ground truth.
            gt_transcript = speech.ground_truth_transcript.lower()
            for key, token in token_dict.items():
                gt_transcript = gt_transcript.replace(key, ' {} '.format(token))

            # Collect Google API Transcript (the word sequence of the timestamps)
            google_api_transcript = " ".join(
                candidate_timestamp["word"] for candidate_timestamp in speech.candidate_timestamps)

            # (score, transcript) for every candidate transcript of this speech
            scored_candidates = []
            for candidate_transcript in speech.candidate_transcripts:
                steps += 1
                candidate_text = candidate_transcript["transcript"]
                transcription_sentence = sentence_to_seq(candidate_text, source_vocab_to_int)

                # The graph expects a full batch, so the sentence is repeated
                # batch_size times and only the first result is kept.
                transcription_logits = sess.run(logits, {
                    input_data: [transcription_sentence] * batch_size,
                    target_sequence_length: [len(transcription_sentence) * 2] * batch_size,
                    source_sequence_length: [len(transcription_sentence)] * batch_size,
                    keep_prob: 1.0})[0]
                prediction_transcript = " ".join(target_int_to_vocab[i] for i in transcription_logits)
                # Remove <EOS> Token
                prediction_transcript = prediction_transcript.replace('<EOS>', '')

                if steps % show_results == 0:
                    print()
                    print('GCS Candidate Transcript: \n{}'.format(" ".join([source_int_to_vocab[i] for i in transcription_sentence])))
                    print('Seq2Seq Model Prediction Transcript: \n{}'.format(prediction_transcript))
                    print('Ground Truth Transcript: \n{}'.format(gt_transcript))
                    print()

                # Similarity between the candidate and the model's prediction,
                # weighted by the confidence reported for the candidate. Used as
                # the score for re-ranking the candidates.
                gct_accuracy = _char_accuracy(candidate_text, prediction_transcript)
                gct_accuracy = gct_accuracy * candidate_transcript["confidence"]
                scored_candidates.append((gct_accuracy, candidate_text))

            if not scored_candidates:
                # Timestamps but no candidate transcripts: nothing to re-rank.
                continue

            # Select the candidate transcript with the highest score
            # (max keeps the first one on ties).
            _, reranked_transcript = max(scored_candidates, key=operator.itemgetter(0))

            # Accuracy of the Google transcript and of the re-ranked transcript,
            # both measured against the ground truth.
            gcs_accuracy = _char_accuracy(google_api_transcript, gt_transcript)
            clm_accuracy = _char_accuracy(reranked_transcript, gt_transcript)

            cloud_speech_api_accuracy.append(gcs_accuracy)
            custom_lang_model_accuracy.append(clm_accuracy)

print('Speech Results:')
print('Average Candidate Transcript Accuracy:', np.mean(cloud_speech_api_accuracy))
print('Average Seq2Seq Model Accuracy:', np.mean(custom_lang_model_accuracy))
print()
