# Lexicon - Orchestrator


## Overview

For this project, I will build a simple custom ochestrator that processes data objects from the "Lexicon" class.
    - These objects are custom datasets that are modeled after the Ted Talk speakers. 
    - Each Lexicon has a corpus and some helper methods aimed at training and prediction
    - Lexicon class will also have a preprocessing and caching function.
    - Each object will have two methods of prediction, n-gram language model and a recurrent neural network model
    - Each object has a custom reporting function that reports the results of training
    - Each object will be able to learn from any text data provided, and return a transcript with confidence values from input posed in speech utterances. 
        - I will use Google's cloud-based services to preprocess the input audio data and transcribe into an initial guess. Then I will train a model to improve on Google cloud speech API's response.


In [1]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import tarfile

librispeech_dataset_folder_path = 'LibriSpeech'
tar_gz_path = 'dev-clean.tar.gz'

books_path = 'original-books.tar.gz'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(books_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech Book Texts') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/original-books.tar.gz',
            books_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path+'/books'):
    with tarfile.open(books_path) as tar:
        tar.extractall()
        tar.close()
        
        
        
if not isfile(tar_gz_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech dev-clean.tar.gz') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/dev-clean.tar.gz',
            tar_gz_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path):
    with tarfile.open(tar_gz_path) as tar:
        tar.extractall()
        tar.close()
        
        
        

In [2]:
# Prepare a plain text corpus from which we train a languague model
import glob
import os
import utils

# Gather all text files from directory
LIBRISPEECH_DIRECTORY = os.path.join(os.getcwd(),'LibriSpeech/')
TEDLIUM_DIRECTORY = os.path.join(os.getcwd(),'TEDLIUM_release1/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))
dev_librispeech_path = "{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'dev-clean/', '**/', '*.txt*')
train_librispeech_path = "{}{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'books/', 'utf-8/', '**/', '*.txt*')
TED_path = "{}{}{}{}".format(TEDLIUM_DIRECTORY,'train/','**/', '*.stm')

text_paths = sorted(glob.glob(train_librispeech_path, recursive=True))
segmented_text_paths = sorted(glob.glob(dev_librispeech_path, recursive=True))
stm_paths = sorted(glob.glob(TED_path, recursive=True))

print('Found:',len(text_paths),"text files in the directories {0}\n{1} segmented text files in the {2} directory and \n{3} stm files in directory: {4}:".format(train_librispeech_path, 
        len(segmented_text_paths), dev_librispeech_path, len(stm_paths),TED_path ))

Found: 41 text files in the directories /src/lexicon/LibriSpeech/books/utf-8/**/*.txt*
97 segmented text files in the /src/lexicon/LibriSpeech/dev-clean/**/*.txt* directory and 
774 stm files in directory: /src/lexicon/TEDLIUM_release1/train/**/*.stm:


### Build Text Corpuses for Training

In [3]:
import tensorflow as tf
import re
import codecs
import string
from lexicon import Lexicon
from speech import Speech
      
corpus_raw = u""
stm_segments = []
speakers = []
lexicons = {} # {speaker_id: lexicon_object}
speeches = {} # {speech_id: speech_object}
segmented_librispeeches = {}

for book_filename in text_paths:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        lines = book_file.read()
        corpus_raw += lines
            
        
for stm_filename in stm_paths: # Process STM files (Tedlium)
        stm_segments.append(utils.parse_stm_file(stm_filename))

for segments in stm_segments[:10]:
    for segment in segments:
        segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
                                          str(segment.stop_time).replace('.','_'))

        speech = None
        # If not already exist
        if segment.speaker_id not in speeches.keys():
            # Connect to Cloud API to get Candidate Transcripts
            source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
            speech = Speech(speaker_id=segment.speaker_id,
                                           speech_id = segment_key,
                                           source_file=source_file,
                                           ground_truth = ' '.join(segment.transcript.split()[:-1]),
                                           start = segment.start_time,
                                           stop = segment.stop_time,
                                           audio_type = 'LINEAR16')
        else:
            speech = speeches[segment.speaker_id.strip()]
            print('Already found speech in list at location: ', speech)
        
        speeches[segment_key] = speech
        
        
        # Add Lexicon to list if not already exists
        lexicon = None
        if segment.speaker_id not in lexicons.keys():
            lexicon = Lexicon(base_corpus=corpus_raw, name=segment.speaker_id)
            lexicons[segment.speaker_id.strip()] = lexicon
        else:
            lexicon = lexicons[segment.speaker_id.strip()]
        
        # Add Speech to Lexicon
        if speech not in lexicon.speeches:
            lexicon.add_speech(speech)


### Build Speech Objects from Librispeech Dataset for Training

In [4]:
# Build Speech Objects from Librispeech Dataset for Training
for transcript_file in segmented_text_paths:
    #print(transcript_file)
    with open(transcript_file,"r") as filep:   
        for i,line in enumerate(filep):
            # extracting the text sentence from each line
            speech_id, transcript = line.split()[0], " ".join(line.split()[1:])
            speaker_id, transcript_id, _ = speech_id.split('-')
            librispeech = None
            # If speech not already exist
            if speech_id not in segmented_librispeeches.keys():
                # Connect to Cloud API to get Candidate Transcripts
                source_file = os.path.join(os.getcwd(), LIBRISPEECH_DIRECTORY, 'dev-clean',
                                       speaker_id, transcript_id,'{}.flac'.format(speech_id))

                librispeech = Speech(speaker_id=speaker_id,
                                               speech_id = speech_id,
                                               source_file=source_file,
                                               ground_truth = transcript,
                                               start = 0,
                                               stop = 0,
                                               audio_type = 'FLAC')
            else:
                librispeech = segmented_librispeeches[speech_id]
                print('Already found speech in list at location: ', speech)

            # Add Librispeech to Lexicon for Training
            for speaker_id, lexicon in lexicons.items():
                speech_ids = [speech.speech_id for speech in lexicon.speeches]
                if librispeech.speech_id not in speech_ids:
                    lexicon.add_speech(librispeech)

    # # # Print Loading Report for Lexicons
    # for speaker_id, lexicon in lexicons.items():
    #     lexicon.print_loading_report()

    # #Preprocess and Save Data
    # for speaker_id, lexicon in lexicons.items():
    #     lexicon.preprocess_and_save()


In [5]:
for speaker_id, lexicon in lexicons.items():
    lexicon.preprocess_and_save()
    for speech in lexicon.speeches:
        speech.preprocess_and_save()

### Load Preprocessed Data

In [6]:
# speaker_list = list(lexicons)
# lexicon = speaker_list[0]
# lexicon.print_loading_report()

# print(lexicon.speech_corpus)

In [7]:
# import helper
# import numpy as np

# for speaker_id, lexicon in lexicons.items():
#     cache_file = os.path.join(os.getcwd(), 'datacache', 'lexicon_objects',
#                                        '{}_preprocess.p'.format(speaker_id.strip()))
#     (name,
#      base_corpus,
#      full_corpus,
#      int_text, 
#      vocab_to_int, 
#      int_to_vocab)  = Lexicon.load_preprocess(cache_file)
#     print("{0} int_text length: {1}".format(name, len(int_text)))

### Load Speech Objects

In [8]:
# from speech import Speech
# stm_segments = []

# for stm_filename in stm_paths: # Process STM files (Tedlium)
#         stm_segments.append(utils.parse_stm_file(stm_filename))        

# speakers = []
# speeches = {} # {speech_id: speech_object}

# for segments in stm_segments[:5]:
#     for segment in segments:
#         segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
#                                           str(segment.stop_time).replace('.','_'))

#         speech = None
#         # If not already exist
#         if segment.speaker_id not in speeches.keys():
#             # Connect to Cloud API to get Candidate Transcripts
#             source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
#             speech = Speech(speaker_id=segment.speaker_id,
#                                            speech_id = segment_key,
#                                            source_file=source_file,
#                                            ground_truth = ' '.join(segment.transcript.split()[:-1]),
#                                            start = segment.start_time,
#                                            stop = segment.stop_time,
#                                            audio_type = 'LINEAR16')
#         else:
#             speech = speeches[segment.speaker_id.strip()]
#             print('Already found speech in list at location: ', speech)
        
#         speeches[segment_key] = speech


### Load GCS Transcripts using GCS Wrapper

In [9]:
from gcs_api_wrapper import GCSWrapper

libri_gcs = GCSWrapper(encoding='FLAC')
for speech_id, librispeech in segmented_librispeeches.items():
    result = libri_gcs.transcribe_speech(librispeech.audio_file)
    speech.populate_gcs_results(result)
    # Print Loading Report
    speech.preprocess_and_save()

gcs = GCSWrapper()
for speech_id, speech in speeches.items():
    result = gcs.transcribe_speech(speech.audio_file)
    speech.populate_gcs_results(result)
    # Print Loading Report
    speech.preprocess_and_save()



In [10]:
import pickle 
speech_cache_paths = []
lexicon_cache_paths = []
for speaker_id, lexicon in lexicons.items():
    lexicon_cache_path = lexicon.preprocess_and_save()
    lexicon_cache_paths.append(lexicon_cache_path)
    for speech in lexicon.speeches:
        cache_path =speech.preprocess_and_save()
        speech_cache_paths.append({lexicon.name:cache_path})
pickle.dump((lexicon_cache_paths, speech_cache_paths), open('cache_paths_preprocess.p', 'wb'))

In [11]:
print(lexicon_cache_paths)

[None, None, None, None, None, None, None, None, None, None]


In [12]:
print(speech_cache_paths)

[{'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_105_08_108_54_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_109_38_112_29_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_114_88_117_85_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_118_67_128_52_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_129_31_137_01_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_137_21_145_32_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_145_67_158_14_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_159_13_166_27_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/datacache/speech_objects/AdamSavage_2008P_16_78_17_41_preprocess.p'}, {'AdamSavage_2008P': '/src/lexicon/dat

In [327]:
    def token_lookup():
        """
        Generate a dict to turn punctuation into a token.
        :return: Tokenize dictionary where the key is the punctuation and the value is the token
        """
        return {
            ',': '',
            '(1)': '',
            '(2)': '',
            '(3)': '',
            '(4)': '',
            '(5)': '',
            '(6)': '',
            '(7)': '',
            '(8)': '',
            '(9)': '',
            '"': '',
            ';': '',
            '!': '',
            '?': '',
            '(': '',
            ')': '',
            '*': '',
            '--': '',
            '{NOISE}': '',
            '{BREATH}': '',
            '{UH}': '',
            '{SMACK}': '',
            '{COUGH}': '',
            '<sil>': ''
        }

In [328]:
transcript_raw = u""
targets_raw = u""
    
for speech_dict in speech_cache_paths:
    speech_path = list(speech_dict.values())[0]
    if os.path.exists(speech_path):
        (_speech_id,
         _speaker_id,
         _source_file,
         _audio_file,
         _candidate_transcripts,
         _candidate_timestamps,
         _audio_type,
         _sample_rate, 
         _start_time, 
         _stop_time, 
         _ground_truth_transcript) = Speech.load_preprocess(speech_path)
        

        speech = Speech(_speaker_id,
         _speech_id,
         _source_file,
         _ground_truth_transcript,
         _start_time,
         _stop_time,
         _audio_type,
         _sample_rate)

        
        for canidate_transcript in speech.candidate_transcripts:
            transcript_raw += canidate_transcript["transcript"]+'\n'
            targets_raw += speech.ground_truth_transcript+'\n'


corp_file = open(os.path.join(os.getcwd(),"source_corp.txt"), "w", encoding="utf-8")
transcript_raw = transcript_raw.encode('ascii', 'ignore')
transcript_raw = transcript_raw.decode("utf-8")

token_dict = token_lookup()
for key, token in token_dict.items():
    transcript_raw = transcript_raw.replace(key, ' {} '.format(token))

transcript_raw = transcript_raw.lower()


corp_file.write(transcript_raw)
corp_file.close



corp_file = open(os.path.join(os.getcwd(),"target_corp.txt"), "w", encoding="utf-8")
targets_raw = targets_raw.encode('ascii', 'ignore')
targets_raw = targets_raw.decode("utf-8")


token_dict = token_lookup()
for key, token in token_dict.items():
    targets_raw = targets_raw.replace(key, ' {} '.format(token))

targets_raw = targets_raw.lower()


corp_file.write(targets_raw)
corp_file.close

/src/lexicon/datacache/speech_objects/AdamSavage_2008P_105_08_108_54_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_109_38_112_29_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_114_88_117_85_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_118_67_128_52_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_129_31_137_01_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_137_21_145_32_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_145_67_158_14_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_159_13_166_27_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_16_78_17_41_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_17_58_24_98_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_192_04_196_74_preprocess.p
/src/lexicon/datacache/speech_objects/AdamSavage_2008P_196_96_204_72_preprocess.p
/src/lexicon/datacac

/src/lexicon/datacache/speech_objects/1993-147149-0018_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0019_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0020_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0021_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0022_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0023_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0024_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0025_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0026_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0027_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0028_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0029_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0030_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147964-0000_preprocess.p
/src/lexicon/datacache/speech_objects/1993-14796

/src/lexicon/datacache/speech_objects/3081-166546-0016_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0017_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0018_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0019_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0020_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0021_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0022_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0023_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0024_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0025_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0026_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0027_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0028_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0029_preprocess.p
/src/lexicon/datacache/speech_objects/3081-16654

/src/lexicon/datacache/speech_objects/7976-110124-0004_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0005_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0006_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0007_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0008_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0009_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0010_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0011_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0012_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0013_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0014_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0015_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0016_preprocess.p
/src/lexicon/datacache/speech_objects/7976-110124-0017_preprocess.p
/src/lexicon/datacache/speech_objects/7976-11012

/src/lexicon/datacache/speech_objects/2428-83699-0042_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0000_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0001_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0002_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0003_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0004_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0005_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0006_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0007_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0008_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0009_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0010_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0011_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0012_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0013_preproce

/src/lexicon/datacache/speech_objects/84-121550-0012_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0013_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0014_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0015_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0016_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0017_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0018_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0019_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0020_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0021_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0022_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0023_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0024_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0025_preprocess.p
/src/lexicon/datacache/speech_objects/84-121550-0026_preprocess.p
/src/lexic

/src/lexicon/datacache/speech_objects/2078-142845-0003_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0004_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0005_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0006_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0007_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0008_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0009_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0010_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0011_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0012_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0013_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0014_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0015_preprocess.p
/src/lexicon/datacache/speech_objects/2078-142845-0016_preprocess.p
/src/lexicon/datacache/speech_objects/2078-14284

/src/lexicon/datacache/speech_objects/3576-138058-0039_preprocess.p
/src/lexicon/datacache/speech_objects/3576-138058-0040_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0000_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0001_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0002_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0003_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0004_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0005_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0006_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0007_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0008_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0009_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0010_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0011_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4943-0012_preprocess.p
/src/l

/src/lexicon/datacache/speech_objects/6345-64257-0018_preprocess.p
/src/lexicon/datacache/speech_objects/6345-64257-0019_preprocess.p
/src/lexicon/datacache/speech_objects/6345-64257-0020_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0000_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0001_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0002_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0003_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0004_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0005_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0006_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0007_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0008_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0009_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0010_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0011_preproce

/src/lexicon/datacache/speech_objects/1272-141231-0010_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0011_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0012_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0013_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0014_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0015_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0016_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0017_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0018_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0019_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0020_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0021_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0022_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0023_preprocess.p
/src/lexicon/datacache/speech_objects/1272-14123

/src/lexicon/datacache/speech_objects/2428-83705-0007_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0008_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0009_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0010_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0011_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0012_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0013_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0014_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0015_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0016_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0017_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0018_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0019_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0020_preprocess.p
/src/lexicon/datacache/speech_objects/2428-83705-0021_preproce

/src/lexicon/datacache/speech_objects/5694-64025-0004_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0005_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0006_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0007_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0008_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0009_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0010_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0011_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0012_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0013_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0014_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0015_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0016_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0017_preprocess.p
/src/lexicon/datacache/speech_objects/5694-64025-0018_preproce

/src/lexicon/datacache/speech_objects/8297-275154-0008_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0009_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0010_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0011_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0012_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0013_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0014_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0015_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0016_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0017_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0018_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0019_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0020_preprocess.p
/src/lexicon/datacache/speech_objects/8297-275154-0021_preprocess.p
/src/lexicon/datacache/speech_objects/8297-27515

/src/lexicon/datacache/speech_objects/1993-147149-0007_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0008_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0009_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0010_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0011_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0012_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0013_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0014_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0015_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0016_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0017_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0018_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0019_preprocess.p
/src/lexicon/datacache/speech_objects/1993-147149-0020_preprocess.p
/src/lexicon/datacache/speech_objects/1993-14714

/src/lexicon/datacache/speech_objects/3170-137482-0025_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0026_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0027_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0028_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0029_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0030_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0031_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0032_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0033_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0034_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0035_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0036_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0037_preprocess.p
/src/lexicon/datacache/speech_objects/3170-137482-0038_preprocess.p
/src/lexicon/datacache/speech_objects/3170-13748

/src/lexicon/datacache/speech_objects/6313-76958-0026_preprocess.p
/src/lexicon/datacache/speech_objects/6313-76958-0027_preprocess.p
/src/lexicon/datacache/speech_objects/6313-76958-0028_preprocess.p
/src/lexicon/datacache/speech_objects/6313-76958-0029_preprocess.p
/src/lexicon/datacache/speech_objects/6313-76958-0030_preprocess.p
/src/lexicon/datacache/speech_objects/6313-76958-0031_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0000_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0001_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0002_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0003_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0004_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0005_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0006_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0007_preprocess.p
/src/lexicon/datacache/speech_objects/6319-275224-0008

/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_358_39_369_44_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_371_13_377_31_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_378_02_383_07_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_392_07_399_91_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_400_33_414_76_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_415_86_416_96_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_423_61_433_65_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_469_14_470_88_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_47_32_56_03_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_484_79_494_68_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_495_12_499_43_preprocess.p
/src/lexicon/datacache/speech_objects/AimeeMullins_2009U_499_85_501_06_preproc

/src/lexicon/datacache/speech_objects/2277-149897-0019_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0020_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0021_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0022_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0023_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0024_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0025_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0026_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0027_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0028_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0029_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0030_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0031_preprocess.p
/src/lexicon/datacache/speech_objects/2277-149897-0032_preprocess.p
/src/lexicon/datacache/speech_objects/2277-14989

/src/lexicon/datacache/speech_objects/3752-4944-0066_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4944-0067_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4944-0068_preprocess.p
/src/lexicon/datacache/speech_objects/3752-4944-0069_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0000_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0001_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0002_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0003_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0004_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0005_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0006_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0007_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0008_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0009_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0010_p

/src/lexicon/datacache/speech_objects/6345-93302-0025_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0026_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0027_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0028_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93302-0029_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0000_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0001_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0002_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0003_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0004_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0005_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0006_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0007_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0008_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0009_preproce

/src/lexicon/datacache/speech_objects/1272-141231-0010_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0011_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0012_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0013_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0014_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0015_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0016_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0017_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0018_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0019_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0020_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0021_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0022_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0023_preprocess.p
/src/lexicon/datacache/speech_objects/1272-14123

/src/lexicon/datacache/speech_objects/2412-153947-0012_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153947-0013_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153947-0014_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153947-0015_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153947-0016_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0000_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0001_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0002_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0003_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0004_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0005_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0006_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0007_preprocess.p
/src/lexicon/datacache/speech_objects/2412-153948-0008_preprocess.p
/src/lexicon/datacache/speech_objects/2412-15394

/src/lexicon/datacache/speech_objects/3853-163249-0021_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0022_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0023_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0024_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0025_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0026_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0027_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0028_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0029_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0030_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0031_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0032_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0033_preprocess.p
/src/lexicon/datacache/speech_objects/3853-163249-0034_preprocess.p
/src/lexicon/datacache/speech_objects/3853-16324

/src/lexicon/datacache/speech_objects/6345-93306-0015_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0016_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0017_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0018_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0019_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0020_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0021_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0022_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0023_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0024_preprocess.p
/src/lexicon/datacache/speech_objects/6345-93306-0025_preprocess.p
/src/lexicon/datacache/speech_objects/652-129742-0000_preprocess.p
/src/lexicon/datacache/speech_objects/652-129742-0001_preprocess.p
/src/lexicon/datacache/speech_objects/652-129742-0002_preprocess.p
/src/lexicon/datacache/speech_objects/652-129742-0003_preproce

/src/lexicon/datacache/speech_objects/1272-141231-0023_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0024_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0025_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0026_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0027_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0028_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0029_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0030_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0031_preprocess.p
/src/lexicon/datacache/speech_objects/1272-141231-0032_preprocess.p
/src/lexicon/datacache/speech_objects/1462-170138-0000_preprocess.p
/src/lexicon/datacache/speech_objects/1462-170138-0001_preprocess.p
/src/lexicon/datacache/speech_objects/1462-170138-0002_preprocess.p
/src/lexicon/datacache/speech_objects/1462-170138-0003_preprocess.p
/src/lexicon/datacache/speech_objects/1462-17013

/src/lexicon/datacache/speech_objects/251-118436-0011_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0012_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0013_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0014_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0015_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0016_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0017_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0018_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0019_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0020_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0021_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0022_preprocess.p
/src/lexicon/datacache/speech_objects/251-118436-0023_preprocess.p
/src/lexicon/datacache/speech_objects/251-136532-0000_preprocess.p
/src/lexicon/datacache/speech_objects/251-136532-0001_preproce

/src/lexicon/datacache/speech_objects/5338-24640-0004_preprocess.p
/src/lexicon/datacache/speech_objects/5338-24640-0005_preprocess.p
/src/lexicon/datacache/speech_objects/5338-24640-0006_preprocess.p
/src/lexicon/datacache/speech_objects/5338-24640-0007_preprocess.p
/src/lexicon/datacache/speech_objects/5338-24640-0008_preprocess.p
/src/lexicon/datacache/speech_objects/5338-24640-0009_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0000_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0001_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0002_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0003_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0004_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0005_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0006_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0007_preprocess.p
/src/lexicon/datacache/speech_objects/5338-284437-0008

/src/lexicon/datacache/speech_objects/777-126732-0003_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0004_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0005_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0006_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0007_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0008_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0009_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0010_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0011_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0012_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0013_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0014_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0015_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0016_preprocess.p
/src/lexicon/datacache/speech_objects/777-126732-0017_preproce

/src/lexicon/datacache/speech_objects/251-137823-0009_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0010_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0011_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0012_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0013_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0014_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0015_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0016_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0017_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0018_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0019_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0020_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0021_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0022_preprocess.p
/src/lexicon/datacache/speech_objects/251-137823-0023_preproce

/src/lexicon/datacache/speech_objects/7976-105575-0016_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0017_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0018_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0019_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0020_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0021_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0022_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0023_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0024_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0025_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0026_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0027_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0028_preprocess.p
/src/lexicon/datacache/speech_objects/7976-105575-0029_preprocess.p
/src/lexicon/datacache/speech_objects/7976-11012

/src/lexicon/datacache/speech_objects/3081-166546-0047_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0048_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0049_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0050_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0051_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0052_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0053_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0054_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0055_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0056_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0057_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0058_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0059_preprocess.p
/src/lexicon/datacache/speech_objects/3081-166546-0060_preprocess.p
/src/lexicon/datacache/speech_objects/3081-16654

<function TextIOWrapper.close>

In [329]:
import helper
lex = list(lexicons.values())[0]
source_path = os.path.join(os.getcwd(),"source_corp.txt")
target_path = os.path.join(os.getcwd(),"target_corp.txt")
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

In [330]:
view_sentence_range = (0, 10)

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in source_text.split()})))

sentences = source_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

print()
print('Transcript sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('Ground Truth sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 3292
Number of sentences: 19535
Average number of words in a sentence: 21.747018172510877

Transcript sentences 0 to 10:
and i thought to myself
i thought to myself
but i thought to myself
when i thought to myself
and i thought to myself oh
and i thought to myself
i thought to myself
but i thought to myself
when i thought to myself
and i thought to myself oh

Ground Truth sentences 0 to 10:
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great   
and  2   i thought to myself    wouldn't it be great

In [331]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
    Convert source and target text to proper word ids
    :param source_text: String that contains all the source text.
    :param target_text: String that contains all the target text.
    :param source_vocab_to_int: Dictionary to go from the source words to an id
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: A tuple of lists (source_id_text, target_id_text)
    """
    # TODO: Implement Function
    
    
    # source_id_text and target_id_text are a list of lists where each list represent a line. 
    # That's why we use a first split('\n')] (not written in the statements)
    source_list = [sentence for sentence in source_text.split('\n')]
    target_list = [sentence for sentence in target_text.split('\n')]
    
    # Filling the lists
    source_id_text = list()
    target_id_text = list()
    for i in range(len(source_list)):
        source_id_text_temp = list()
        target_id_text_temp = list()
        for word in source_list[i].split():
            source_id_text_temp.append(source_vocab_to_int[word])
        for word in target_list[i].split():
            target_id_text_temp.append(target_vocab_to_int[word])
        # We need to add EOS for target    
        target_id_text_temp.append(target_vocab_to_int['<EOS>'])
        source_id_text.append(source_id_text_temp)
        target_id_text.append(target_id_text_temp)
              
    return source_id_text, target_id_text

In [332]:
import os
import pickle
import copy
import numpy as np
from tensorflow.python.layers.core import Dense
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    """
    vocab = set(text.split())
    vocab_to_int = copy.copy(CODES)

    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab


In [333]:
import helper

def preprocess_and_save_data(source_path, target_path, text_to_ids):
    source_text = helper.load_data(source_path)
    target_text = helper.load_data(target_path)

    source_text = source_text.lower()
    target_text = target_text.lower()

    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)

    # Save Data
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text, target_text),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)

In [334]:
preprocess_and_save_data(source_path, target_path, text_to_ids)

In [335]:
import numpy as np
import helper

(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()

In [336]:
def model_inputs():
    """
    Create TF Placeholders for input, targets, learning rate, and lengths of source and target sequences.
    :return: Tuple (input, targets, learning rate, keep probability, target sequence length,
    max target sequence length, source sequence length)
    """
    
    inputs = tf.placeholder(tf.int32,[None,None], name = "input")
    targets = tf.placeholder(tf.int32,[None,None], name = "target")
    learning_rate = tf.placeholder(tf.float32, name = "learning_rate")
    keep_probability = tf.placeholder(tf.float32, name = "keep_prob")
    target_sequence_length = tf.placeholder(tf.int32,[None], name = "target_sequence_length")
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name = "max_target_len")
    source_sequence_length = tf.placeholder(tf.int32, [None], name = "source_sequence_length")
    return inputs, targets, learning_rate, keep_probability, target_sequence_length, max_target_sequence_length, source_sequence_length


In [337]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for encoding
    :param target_data: Target Placehoder
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param batch_size: Batch Size
    :return: Preprocessed target data
    """
    # TODO: Implement Function
    
    #removing the last word id from each batch in target_data 
    print(target_data)
    target_data = tf.strided_slice(target_data,[0,0],[batch_size,-1],[1,1] )
    #target_data = tf.strided_slice(target_data,[0,0],[int(target_data.shape[0]),int(target_data.shape[1]-1)],[1,1] )
    
    # concat the GO ID to the begining of each batch
    decoder_input = tf.concat([tf.fill([batch_size,1],target_vocab_to_int['<GO>']),target_data],1)
        
    return decoder_input

In [338]:
from imp import reload

def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):
    """
    Create encoding layer
    :param rnn_inputs: Inputs for the RNN
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param keep_prob: Dropout keep probability
    :param source_sequence_length: a list of the lengths of each sequence in the batch
    :param source_vocab_size: vocabulary size of source data
    :param encoding_embedding_size: embedding size of source data
    :return: tuple (RNN output, RNN state)
    """
    # TODO: Implement Function
    
    # Embed the encoder input using tf.contrib.layers.embed_sequence
    inputs_embeded = tf.contrib.layers.embed_sequence(
                                    ids = rnn_inputs,
                                    vocab_size = source_vocab_size,
                                    embed_dim = encoding_embedding_size)
    
    # Construct a stacked tf.contrib.rnn.LSTMCell wrapped in a tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers) ])
    cell_dropout = tf.contrib.rnn.DropoutWrapper(cell, keep_prob)
    
    # Pass cell and embedded input to tf.nn.dynamic_rnn()
    RNN_output, RNN_state = tf.nn.dynamic_rnn(
                                cell = cell_dropout,
                                inputs = inputs_embeded,
                                sequence_length = source_sequence_length,
                                dtype = tf.float32)
    
    return RNN_output, RNN_state

In [339]:

def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
    """
    Create a decoding layer for training
    :param encoder_state: Encoder State
    :param dec_cell: Decoder RNN Cell
    :param dec_embed_input: Decoder embedded input
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_summary_length: The length of the longest sequence in the batch
    :param output_layer: Function to apply the output layer
    :param keep_prob: Dropout keep probability
    :return: BasicDecoderOutput containing training logits and sample_id
    """
    # TODO: Implement Function
    
    # Create a tf.contrib.seq2seq.TrainingHelper
    training_helper = tf.contrib.seq2seq.TrainingHelper(
                                            inputs = dec_embed_input,
                                            sequence_length = target_sequence_length)
    
    # Create a tf.contrib.seq2seq.BasicDecoder
    basic_decoder = tf.contrib.seq2seq.BasicDecoder(
                                            cell = dec_cell,
                                            helper = training_helper,
                                            initial_state = encoder_state,
                                            output_layer = output_layer)
    
    # Obtain the decoder outputs from tf.contrib.seq2seq.dynamic_decode
    BasicDecoderOutput = tf.contrib.seq2seq.dynamic_decode(
                                            decoder = basic_decoder,
                                            impute_finished = True,
                                            maximum_iterations = max_summary_length 
                                            )

    return BasicDecoderOutput[0]

In [340]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    """
    Create a decoding layer for inference
    :param encoder_state: Encoder state
    :param dec_cell: Decoder RNN Cell
    :param dec_embeddings: Decoder embeddings
    :param start_of_sequence_id: GO ID
    :param end_of_sequence_id: EOS Id
    :param max_target_sequence_length: Maximum length of target sequences
    :param vocab_size: Size of decoder/target vocabulary
    :param decoding_scope: TenorFlow Variable Scope for decoding
    :param output_layer: Function to apply the output layer
    :param batch_size: Batch size
    :param keep_prob: Dropout keep probability
    :return: BasicDecoderOutput containing inference logits and sample_id
    """
    # TODO: Implement Function
    
    # creates a new tensor by replicating start_of_sequence_id batch_size times.
    start_tokens = tf.tile(tf.constant([start_of_sequence_id],dtype = tf.int32),[batch_size], name = 'start_tokens' )
        
    # Create a tf.contrib.seq2seq.GreedyEmbeddingHelper
    embedding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                                embedding = dec_embeddings,
                                start_tokens = start_tokens, 
                                end_token = end_of_sequence_id)
    
    # Create a tf.contrib.seq2seq.BasicDecoder
    basic_decoder = tf.contrib.seq2seq.BasicDecoder(
                                                cell = dec_cell,
                                                helper = embedding_helper,
                                                initial_state = encoder_state,
                                                output_layer = output_layer)
    
    # Obtain the decoder outputs from tf.contrib.seq2seq.dynamic_decode
    BasicDecoderOutput = tf.contrib.seq2seq.dynamic_decode(
                                                decoder = basic_decoder,
                                                impute_finished = True,
                                                maximum_iterations = max_target_sequence_length)

    return BasicDecoderOutput[0]

In [341]:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size):
    """
    Create decoding layer
    :param dec_input: Decoder input
    :param encoder_state: Encoder state
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_target_sequence_length: Maximum length of target sequences
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param target_vocab_size: Size of target vocabulary
    :param batch_size: The size of the batch
    :param keep_prob: Dropout keep probability
    :param decoding_embedding_size: Decoding embedding size
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # TODO: Implement Function
    
    # Embed the target sequences
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    # Construct the decoder LSTM cell (just like you constructed the encoder cell above)
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers) ])
    cell_dropout = tf.contrib.rnn.DropoutWrapper(cell, keep_prob)
    
    # Create an output layer to map the outputs of the decoder to the elements of our vocabulary
    output_layer = Dense(target_vocab_size)
                        
    
    # Use the your decoding_layer_train(encoder_state, dec_cell, dec_embed_input, target_sequence_length, 
    # max_target_sequence_length, output_layer, keep_prob) function to get the training logits.
    with tf.variable_scope("decode"):
        Training_BasicDecoderOutput = decoding_layer_train(encoder_state, 
                                                       cell_dropout, 
                                                       dec_embed_input, 
                                                       target_sequence_length, 
                                                       max_target_sequence_length, 
                                                       output_layer, 
                                                       keep_prob)
    
    # Use your decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, 
    # end_of_sequence_id, max_target_sequence_length, vocab_size, output_layer, batch_size, keep_prob) 
    # function to get the inference logits.
    with tf.variable_scope("decode", reuse=True):
        Inference_BasicDecoderOutput = decoding_layer_infer(encoder_state, 
                                                        cell_dropout, 
                                                        dec_embeddings, 
                                                        target_vocab_to_int['<GO>'], 
                                                        target_vocab_to_int['<EOS>'],
                                                        max_target_sequence_length, 
                                                        target_vocab_size,
                                                        output_layer,
                                                        batch_size, 
                                                        keep_prob)
    return Training_BasicDecoderOutput, Inference_BasicDecoderOutput

In [342]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_sequence_length, target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    """
    Build the Sequence-to-Sequence part of the neural network
    :param input_data: Input placeholder
    :param target_data: Target placeholder
    :param keep_prob: Dropout keep probability placeholder
    :param batch_size: Batch Size
    :param source_sequence_length: Sequence Lengths of source sequences in the batch
    :param target_sequence_length: Sequence Lengths of target sequences in the batch
    : max_target_sentence_length,
    :param source_vocab_size: Source vocabulary size
    :param target_vocab_size: Target vocabulary size
    :param enc_embedding_size: Decoder embedding size
    :param dec_embedding_size: Encoder embedding size
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # TODO: Implement Function
    
    # Encode the input using your encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob,  source_sequence_length, source_vocab_size, encoding_embedding_size).
    rnn_output , rnn_state = encoding_layer(input_data, 
                   rnn_size, 
                   num_layers, 
                   keep_prob, 
                   source_sequence_length, 
                   source_vocab_size, 
                   enc_embedding_size)
    
    # Process target data using your process_decoder_input(target_data, target_vocab_to_int, batch_size) function.
    decoder_input = process_decoder_input(target_data,
                                        target_vocab_to_int,
                                        batch_size)
    
    # Decode the encoded input using your decoding_layer(dec_input, enc_state, target_sequence_length, max_target_sentence_length, 
    # rnn_size, num_layers, target_vocab_to_int, target_vocab_size, batch_size, keep_prob, dec_embedding_size) function.
    Training_BasicDecoderOutput, Inference_BasicDecoderOutput = decoding_layer(
                                        decoder_input,
                                        rnn_state,
                                        target_sequence_length,
                                        max_target_sentence_length,
                                        rnn_size,
                                        num_layers,
                                        target_vocab_to_int,
                                        target_vocab_size,
                                        batch_size,
                                        keep_prob,
                                        dec_embedding_size)
    
    return Training_BasicDecoderOutput, Inference_BasicDecoderOutput

In [343]:
# Number of Epochs
epochs = 30
# Batch Size
batch_size = 64
# RNN Size
rnn_size = 512
# Number of Layers
num_layers = 1

encoding_embedding_size = 256
decoding_embedding_size = 256
# Learning Rate
learning_rate = 0.001
# Dropout Keep Probability
keep_probability = 0.75
display_step = 100

In [344]:
save_path = 'checkpoints/dev'
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()
max_target_sentence_length = max([len(sentence) for sentence in source_int_text])

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob, target_sequence_length, max_target_sequence_length, source_sequence_length = model_inputs()

    #sequence_length = tf.placeholder_with_default(max_target_sentence_length, None, name='sequence_length')
    input_shape = tf.shape(input_data)

    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   source_sequence_length,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)


    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        # Monitor gradient
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


Tensor("target:0", shape=(?, ?), dtype=int32)


In [345]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths


In [346]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

# Split data to training and validation sets
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = source_int_text[:batch_size]
valid_target = target_int_text[:batch_size]
(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             source_vocab_to_int['<PAD>'],
                                                                                                             target_vocab_to_int['<PAD>']))                                                                                                  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_vocab_to_int['<PAD>'],
                            target_vocab_to_int['<PAD>'])):
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths,
                 keep_prob: keep_probability})


            if batch_i % display_step == 0 and batch_i > 0:


                batch_train_logits = sess.run(
                    inference_logits,
                    {input_data: source_batch,
                     source_sequence_length: sources_lengths,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})


                batch_valid_logits = sess.run(
                    inference_logits,
                    {input_data: valid_sources_batch,
                     source_sequence_length: valid_sources_lengths,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)
                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')

Epoch   0 Batch  100/305 - Train Accuracy: 0.0443, Validation Accuracy: 0.0598, Loss: 3.9243
Epoch   0 Batch  200/305 - Train Accuracy: 0.6791, Validation Accuracy: 0.5750, Loss: 2.0188
Epoch   0 Batch  300/305 - Train Accuracy: 0.3573, Validation Accuracy: 0.5661, Loss: 3.7841


KeyboardInterrupt: 

In [None]:
# Save parameters for checkpoint
helper.save_params(save_path)

In [None]:
import tensorflow as tf
import numpy as np
import helper

_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()
load_path = helper.load_params()

In [None]:
def sentence_to_seq(sentence, vocab_to_int):
    """
    Convert a sentence to a sequence of ids
    :param sentence: String
    :param vocab_to_int: Dictionary to go from the words to an id
    :return: List of word ids
    """
    

    # TODO: Implement Function
    
    # Convert the sentence to lowercase and to list
    list_words = [word for word in sentence.lower().split() ]
    
    # Convert words into ids using vocab_to_int
    list_words_int = list()
    for word in list_words:
        # Convert words not in the vocabulary, to the <UNK> word id.
        if word not in vocab_to_int:
            list_words_int.append(vocab_to_int['<UNK>'])
        else:
            list_words_int.append(vocab_to_int[word])
    return list_words_int


In [None]:
transcription_sentence = lexicon.speeches[0].ground_truth_transcript


"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
transcription_sentence = sentence_to_seq(transcription_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    transcription_logits = sess.run(logits, {input_data: [transcription_sentence]*batch_size,
                                         target_sequence_length: [len(transcription_sentence)*2]*batch_size,
                                         source_sequence_length: [len(transcription_sentence)]*batch_size,
                                         keep_prob: 1.0})[0]

print('Input')
print('  Word Ids:      {}'.format([i for i in transcription_sentence]))
print('  Candidate Transcript: {}'.format([source_int_to_vocab[i] for i in transcription_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in transcription_logits]))
print('  Prediction Transcript: {}'.format(" ".join([target_int_to_vocab[i] for i in transcription_logits])))
