# Lexicon - Orchestrator


## Overview

For this project, I will build a simple custom ochestrator that processes data objects from the "Lexicon" class.
    - These objects are custom datasets that are modeled after the Ted Talk speakers. 
    - Each Lexicon has a corpus and some helper methods aimed at training and prediction
    - Lexicon class will also have a preprocessing and caching function.
    - Each object will have two methods of prediction, n-gram language model and a recurrent neural network model
    - Each object has a custom reporting function that reports the results of training
    - Each object will be able to learn from any text data provided, and return a transcript with confidence values from input posed in speech utterances. 
        - I will use Google's cloud-based services to preprocess the input audio data and transcribe into an initial guess. Then I will train a model to improve on Google cloud speech API's response.


In [1]:
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import tarfile

librispeech_dataset_folder_path = 'LibriSpeech'
tar_gz_path = 'dev-clean.tar.gz'

books_path = 'original-books.tar.gz'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(books_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech Book Texts') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/original-books.tar.gz',
            books_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path+'/books'):
    with tarfile.open(books_path) as tar:
        tar.extractall()
        tar.close()
        
        
        
if not isfile(tar_gz_path):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Librispeech dev-clean.tar.gz') as pbar:
        urlretrieve(
            'http://www.openslr.org/resources/12/dev-clean.tar.gz',
            tar_gz_path,
            pbar.hook)

if not isdir(librispeech_dataset_folder_path):
    with tarfile.open(tar_gz_path) as tar:
        tar.extractall()
        tar.close()
        
        
        

In [2]:
# Prepare a plain text corpus from which we train a languague model
import glob
import os
import utils

# Gather all text files from directory
LIBRISPEECH_DIRECTORY = os.path.join(os.getcwd(),'LibriSpeech/')
TEDLIUM_DIRECTORY = os.path.join(os.getcwd(),'TEDLIUM_release1/')

# TRAINING_DIRECTORY = os.path.abspath(os.path.join(os.sep,'Volumes',"My\ Passport\ for\ Mac",'lexicon','LibriSpeech'))
dev_librispeech_path = "{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'dev-clean/', '**/', '*.txt*')
train_librispeech_path = "{}{}{}{}{}".format(LIBRISPEECH_DIRECTORY, 'books/', 'utf-8/', '**/', '*.txt*')
TED_path = "{}{}{}{}".format(TEDLIUM_DIRECTORY,'train/','**/', '*.stm')

text_paths = sorted(glob.glob(train_librispeech_path, recursive=True))
segmented_text_paths = sorted(glob.glob(dev_librispeech_path, recursive=True))
stm_paths = sorted(glob.glob(TED_path, recursive=True))

print('Found:',len(text_paths),"text files in the directories {0}\n{1} segmented text files in the {2} directory and \n{3} stm files in directory: {4}:".format(train_librispeech_path, 
        len(segmented_text_paths), dev_librispeech_path, len(stm_paths),TED_path ))

Found: 41 text files in the directories /src/LibriSpeech/books/utf-8/**/*.txt*
97 segmented text files in the /src/LibriSpeech/dev-clean/**/*.txt* directory and 
774 stm files in directory: /src/TEDLIUM_release1/train/**/*.stm:


### Build Text Corpuses for Training

In [3]:
import tensorflow as tf
import re
import codecs
import string
from lexicon import Lexicon
from speech import Speech
      
corpus_raw = u""
stm_segments = []
speakers = []
lexicons = {} # {speaker_id: lexicon_object}
speeches = {} # {speech_id: speech_object}
segmented_librispeeches = {}

for book_filename in text_paths:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        lines = book_file.read()
        corpus_raw += lines
            
        
for stm_filename in stm_paths: # Process STM files (Tedlium)
        stm_segments.append(utils.parse_stm_file(stm_filename))

for segments in stm_segments[200:210]:
    for segment in segments:
        segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
                                          str(segment.stop_time).replace('.','_'))

        speech = None
        # If not already exist
        if segment.speaker_id not in speeches.keys():
            # Connect to Cloud API to get Candidate Transcripts
            source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
            speech = Speech(speaker_id=segment.speaker_id,
                                           speech_id = segment_key,
                                           source_file=source_file,
                                           ground_truth = ' '.join(segment.transcript.split()[:-1]),
                                           start = segment.start_time,
                                           stop = segment.stop_time,
                                           audio_type = 'LINEAR16')
        else:
            speech = speeches[segment.speaker_id.strip()]
            print('Already found speech in list at location: ', speech)
        
        speeches[segment_key] = speech
        
        
        # Add Lexicon to list if not already exists
        lexicon = None
        if segment.speaker_id not in lexicons.keys():
            lexicon = Lexicon(base_corpus=corpus_raw, name=segment.speaker_id)
            lexicons[segment.speaker_id.strip()] = lexicon
        else:
            lexicon = lexicons[segment.speaker_id.strip()]
        
        # Add Speech to Lexicon
        if speech not in lexicon.speeches:
            lexicon.add_speech(speech)


### Build Speech Objects from Librispeech Dataset for Training

In [4]:
# Build Speech Objects from Librispeech Dataset for Training
for transcript_file in segmented_text_paths:
    #print(transcript_file)
    with open(transcript_file,"r") as filep:   
        for i,line in enumerate(filep):
            # extracting the text sentence from each line
            speech_id, transcript = line.split()[0], " ".join(line.split()[1:])
            speaker_id, transcript_id, _ = speech_id.split('-')
            librispeech = None
            # If speech not already exist
            if speech_id not in segmented_librispeeches.keys():
                # Connect to Cloud API to get Candidate Transcripts
                source_file = os.path.join(os.getcwd(), LIBRISPEECH_DIRECTORY, 'dev-clean',
                                       speaker_id, transcript_id,'{}.flac'.format(speech_id))

                librispeech = Speech(speaker_id=speaker_id,
                                               speech_id = speech_id,
                                               source_file=source_file,
                                               ground_truth = transcript,
                                               start = 0,
                                               stop = 0,
                                               audio_type = 'FLAC')
                segmented_librispeeches[speech_id] = librispeech
            else:
                librispeech = segmented_librispeeches[speech_id]
                print('Already found speech in list at location: ', speech)

            # Add Librispeech to Lexicon for Training
            for speaker_id, lexicon in lexicons.items():
                speech_ids = [speech.speech_id for speech in lexicon.speeches]
                if librispeech.speech_id not in speech_ids:
                    lexicon.add_speech(librispeech)

    # # # Print Loading Report for Lexicons
    # for speaker_id, lexicon in lexicons.items():
    #     lexicon.print_loading_report()

    # #Preprocess and Save Data
    # for speaker_id, lexicon in lexicons.items():
    #     lexicon.preprocess_and_save()


In [5]:
for speaker_id, lexicon in lexicons.items():
    lexicon.preprocess_and_save()
    for speech in lexicon.speeches:
        speech.preprocess_and_save()

### Load Preprocessed Data

In [6]:
# speaker_list = list(lexicons)
# lexicon = speaker_list[0]
# lexicon.print_loading_report()

# print(lexicon.speech_corpus)

In [7]:
# import helper
# import numpy as np

# for speaker_id, lexicon in lexicons.items():
#     cache_file = os.path.join(os.getcwd(), 'datacache', 'lexicon_objects',
#                                        '{}_preprocess.p'.format(speaker_id.strip()))
#     (name,
#      base_corpus,
#      full_corpus,
#      int_text, 
#      vocab_to_int, 
#      int_to_vocab)  = Lexicon.load_preprocess(cache_file)
#     print("{0} int_text length: {1}".format(name, len(int_text)))

### Load Speech Objects

In [8]:
# from speech import Speech
# stm_segments = []

# for stm_filename in stm_paths: # Process STM files (Tedlium)
#         stm_segments.append(utils.parse_stm_file(stm_filename))        

# speakers = []
# speeches = {} # {speech_id: speech_object}

# for segments in stm_segments[:5]:
#     for segment in segments:
#         segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
#                                           str(segment.stop_time).replace('.','_'))

#         speech = None
#         # If not already exist
#         if segment.speaker_id not in speeches.keys():
#             # Connect to Cloud API to get Candidate Transcripts
#             source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
#             speech = Speech(speaker_id=segment.speaker_id,
#                                            speech_id = segment_key,
#                                            source_file=source_file,
#                                            ground_truth = ' '.join(segment.transcript.split()[:-1]),
#                                            start = segment.start_time,
#                                            stop = segment.stop_time,
#                                            audio_type = 'LINEAR16')
#         else:
#             speech = speeches[segment.speaker_id.strip()]
#             print('Already found speech in list at location: ', speech)
        
#         speeches[segment_key] = speech


### Load GCS Transcripts using GCS Wrapper

In [9]:
from gcs_api_wrapper import GCSWrapper

libri_gcs = GCSWrapper(encoding='FLAC')
for speech_id, librispeech in segmented_librispeeches.items():
    try:
        # Not already saved in prepocess cache
        if not librispeech.candidate_transcripts: 
            result = gcs.transcribe_speech(librispeech.audio_file)
            librispeech.populate_gcs_results(result)
            # Print Loading Report
            librispeech.preprocess_and_save()
    except:
        print('An InvalidArgumentError occured trying to send audio to GCS.')


gcs = GCSWrapper()
for speech_id, speech in speeches.items():
    try:
        # Not already saved in prepocess cache
        if not speech.candidate_transcripts:
            result = gcs.transcribe_speech(speech.audio_file)
            speech.populate_gcs_results(result)
            # Print Loading Report
            speech.preprocess_and_save()
    except:
        print('An InvalidArgumentError occured trying to send audio to GCS.')



An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError 

An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.
An InvalidArgumentError occured trying to send audio to GCS.


In [10]:
segmented_librispeeches.items()

dict_items([('8297-275156-0004', <speech.Speech object at 0x7ff9e4293e48>), ('6295-244435-0001', <speech.Speech object at 0x7ff9e42c39e8>), ('3536-23268-0007', <speech.Speech object at 0x7ff9e42ef0b8>), ('84-121550-0001', <speech.Speech object at 0x7ff9e4294a90>), ('7976-110124-0021', <speech.Speech object at 0x7ff9e428acf8>), ('5895-34622-0014', <speech.Speech object at 0x7ff9e42b5780>), ('5694-64029-0032', <speech.Speech object at 0x7ff9e42ad240>), ('1993-147965-0004', <speech.Speech object at 0x7ff9e43d6390>), ('652-130726-0028', <speech.Speech object at 0x7ff9e426e780>), ('174-84280-0010', <speech.Speech object at 0x7ff9e4420438>), ('1919-142785-0063', <speech.Speech object at 0x7ff9e43b47b8>), ('84-121550-0005', <speech.Speech object at 0x7ff9e4297e48>), ('6313-76958-0027', <speech.Speech object at 0x7ff9e42d8b00>), ('251-118436-0003', <speech.Speech object at 0x7ff9e4333470>), ('1272-135031-0003', <speech.Speech object at 0x7ff9e444bef0>), ('3752-4943-0002', <speech.Speech object

In [11]:
import pickle 
speech_cache_paths = []
lexicon_cache_paths = []
for speaker_id, lexicon in lexicons.items():
    lexicon_cache_path = lexicon.preprocess_and_save()
    lexicon_cache_paths.append(lexicon_cache_path)
    for speech in lexicon.speeches:
        cache_path =speech.preprocess_and_save()
        speech_cache_paths.append((lexicon.name,cache_path))
pickle.dump((lexicon_cache_paths, speech_cache_paths), open('cache_paths_preprocess.p', 'wb'))

In [14]:
    def token_lookup():
        """
        Generate a dict to turn punctuation into a token.
        :return: Tokenize dictionary where the key is the punctuation and the value is the token
        """
        return {
            ',': '',
            '(1)': '',
            '(2)': '',
            '(3)': '',
            '(4)': '',
            '(5)': '',
            '(6)': '',
            '(7)': '',
            '(8)': '',
            '(9)': '',
            '"': '',
            ';': '',
            '!': '',
            '?': '',
            '*': '',
            '--': '',
            '{NOISE}': '',
            '{noise}': '',
            '{BREATH}': '',
            '{breath}': '',
            '{UH}': '',
            '{uh}': '',
            '{SMACK}': '',
            '{smack}': '',
            '{COUGH}': '',
            '{cough}': '',
            '<sil>': ''
        }

In [15]:
transcript_raw = u""
targets_raw = u""
    
for speaker_name, cache_path in speech_cache_paths:
    if os.path.exists(cache_path):
        (_speech_id,
         _speaker_id,
         _source_file,
         _audio_file,
         _candidate_transcripts,
         _candidate_timestamps,
         _audio_type,
         _sample_rate, 
         _start_time, 
         _stop_time, 
         _ground_truth_transcript) = Speech.load_preprocess(cache_path)
        

        speech = Speech(_speaker_id,
         _speech_id,
         _source_file,
         _ground_truth_transcript,
         _candidate_transcripts,
         _candidate_timestamps,
          cache_path,
         _start_time,
         _stop_time,
         _audio_type,
         _sample_rate)

        for canidate_transcript in speech.candidate_transcripts:
            transcript_raw += canidate_transcript["transcript"]+'\n'
            targets_raw += speech.ground_truth_transcript+'\n'

In [16]:
for sent in corpus_raw.split('.'):
    transcript_raw += sent+'\n'
    targets_raw += sent+'\n'


corp_file = open(os.path.join(os.getcwd(),"source_corp.txt"), "w", encoding="utf-8")
transcript_raw = transcript_raw.encode('ascii', 'ignore')
transcript_raw = transcript_raw.decode("utf-8")

token_dict = token_lookup()
for key, token in token_dict.items():
    transcript_raw = transcript_raw.replace(key, ' {} '.format(token))

transcript_raw = transcript_raw.lower()


corp_file.write(transcript_raw)
corp_file.close




corp_file = open(os.path.join(os.getcwd(),"target_corp.txt"), "w", encoding="utf-8")
targets_raw = targets_raw.encode('ascii', 'ignore')
targets_raw = targets_raw.decode("utf-8")


token_dict = token_lookup()
for key, token in token_dict.items():
    targets_raw = targets_raw.replace(key, ' {} '.format(token))

targets_raw = targets_raw.lower()


corp_file.write(targets_raw)
corp_file.close

<function TextIOWrapper.close>

In [17]:
import helper
lex = list(lexicons.values())[0]
source_path = os.path.join(os.getcwd(),"source_corp.txt")
target_path = os.path.join(os.getcwd(),"target_corp.txt")
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

In [18]:
view_sentence_range = (100, 110)

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in source_text.split()})))

sentences = source_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

print()
print('Transcript sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('Ground Truth sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 87422
Number of sentences: 586575
Average number of words in a sentence: 6.437389933086136

Transcript sentences 100 to 110:



we need your donations more than ever  


all donations should be made to   project gutenberg/cmu  : and are
tax deductible to the extent allowable by law
  (cmu = carnegie-
mellon university)

Ground Truth sentences 100 to 110:



we need your donations more than ever  


all donations should be made to   project gutenberg/cmu  : and are
tax deductible to the extent allowable by law
  (cmu = carnegie-
mellon university)


## Implement Preprocessing Function
### Text to Word Ids
As you did with other RNNs, you must turn the text into a number so the computer can understand it. In the function `text_to_ids()`, you'll turn `source_text` and `target_text` from words to ids.  However, you need to add the `<EOS>` word id at the end of `target_text`.  This will help the neural network predict when the sentence should end.

You can get the `<EOS>` word id by doing:
```python
target_vocab_to_int['<EOS>']
```
You can get other word ids using `source_vocab_to_int` and `target_vocab_to_int`.

In [19]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
    Convert source and target text to proper word ids
    :param source_text: String that contains all the source text.
    :param target_text: String that contains all the target text.
    :param source_vocab_to_int: Dictionary to go from the source words to an id
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: A tuple of lists (source_id_text, target_id_text)
    """
    # TODO: Implement Function
    
    
    # source_id_text and target_id_text are a list of lists where each list represent a line. 
    # That's why we use a first split('\n')] (not written in the statements)
    source_list = [sentence for sentence in source_text.split('\n')]
    target_list = [sentence for sentence in target_text.split('\n')]
    
    # Filling the lists
    source_id_text = list()
    target_id_text = list()
    for i in range(len(source_list)):
        source_id_text_temp = list()
        target_id_text_temp = list()
        for word in source_list[i].split():
            source_id_text_temp.append(source_vocab_to_int[word])
        for word in target_list[i].split():
            target_id_text_temp.append(target_vocab_to_int[word])
        # We need to add EOS for target    
        target_id_text_temp.append(target_vocab_to_int['<EOS>'])
        source_id_text.append(source_id_text_temp)
        target_id_text.append(target_id_text_temp)
              
    return source_id_text, target_id_text

### Preprocess all the data and save it
Running the code cell below will preprocess all the data and save it to file.

In [20]:
import os
import pickle
import copy
import numpy as np
from tensorflow.python.layers.core import Dense
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    """
    vocab = set(text.split())
    vocab_to_int = copy.copy(CODES)

    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab


In [21]:
import helper

def preprocess_and_save_data(source_path, target_path, text_to_ids):
    source_text = helper.load_data(source_path)
    target_text = helper.load_data(target_path)

    source_text = source_text.lower()
    target_text = target_text.lower()

    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)

    # Save Data
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text, target_text),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)

In [22]:
preprocess_and_save_data(source_path, target_path, text_to_ids)

# Check Point
This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk.

In [23]:
import numpy as np
import helper

(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()

## Build the Neural Network
You'll build the components necessary to build a Sequence-to-Sequence model by implementing the following functions below:
- `model_inputs`
- `process_decoder_input`
- `encoding_layer`
- `decoding_layer_train`
- `decoding_layer_infer`
- `decoding_layer`
- `seq2seq_model`

### Input
Implement the `model_inputs()` function to create TF Placeholders for the Neural Network. It should create the following placeholders:

- Input text placeholder named "input" using the TF Placeholder name parameter with rank 2.
- Targets placeholder with rank 2.
- Learning rate placeholder with rank 0.
- Keep probability placeholder named "keep_prob" using the TF Placeholder name parameter with rank 0.
- Target sequence length placeholder named "target_sequence_length" with rank 1
- Max target sequence length tensor named "max_target_len" getting its value from applying tf.reduce_max on the target_sequence_length placeholder. Rank 0.
- Source sequence length placeholder named "source_sequence_length" with rank 1

Return the placeholders in the following the tuple (input, targets, learning rate, keep probability, target sequence length, max target sequence length, source sequence length)

In [24]:
def model_inputs():
    """
    Create TF Placeholders for input, targets, learning rate, and lengths of source and target sequences.
    :return: Tuple (input, targets, learning rate, keep probability, target sequence length,
    max target sequence length, source sequence length)
    """
    
    inputs = tf.placeholder(tf.int32,[None,None], name = "input")
    targets = tf.placeholder(tf.int32,[None,None], name = "target")
    learning_rate = tf.placeholder(tf.float32, name = "learning_rate")
    keep_probability = tf.placeholder(tf.float32, name = "keep_prob")
    target_sequence_length = tf.placeholder(tf.int32,[None], name = "target_sequence_length")
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name = "max_target_len")
    source_sequence_length = tf.placeholder(tf.int32, [None], name = "source_sequence_length")
    return inputs, targets, learning_rate, keep_probability, target_sequence_length, max_target_sequence_length, source_sequence_length


In [25]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for encoding
    :param target_data: Target Placehoder
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param batch_size: Batch Size
    :return: Preprocessed target data
    """
    # TODO: Implement Function
    
    #removing the last word id from each batch in target_data 
    print(target_data)
    target_data = tf.strided_slice(target_data,[0,0],[batch_size,-1],[1,1] )
    #target_data = tf.strided_slice(target_data,[0,0],[int(target_data.shape[0]),int(target_data.shape[1]-1)],[1,1] )
    
    # concat the GO ID to the begining of each batch
    decoder_input = tf.concat([tf.fill([batch_size,1],target_vocab_to_int['<GO>']),target_data],1)
        
    return decoder_input

In [26]:
from imp import reload

def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_sequence_length, source_vocab_size, 
                   encoding_embedding_size):
    """
    Create encoding layer
    :param rnn_inputs: Inputs for the RNN
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param keep_prob: Dropout keep probability
    :param source_sequence_length: a list of the lengths of each sequence in the batch
    :param source_vocab_size: vocabulary size of source data
    :param encoding_embedding_size: embedding size of source data
    :return: tuple (RNN output, RNN state)
    """
    # TODO: Implement Function
    
    # Embed the encoder input using tf.contrib.layers.embed_sequence
    inputs_embeded = tf.contrib.layers.embed_sequence(
                                    ids = rnn_inputs,
                                    vocab_size = source_vocab_size,
                                    embed_dim = encoding_embedding_size)
    
    # Construct a stacked tf.contrib.rnn.LSTMCell wrapped in a tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers) ])
    # cell_dropout = tf.contrib.rnn.DropoutWrapper(cell, keep_prob)
    
    # Pass cell and embedded input to tf.nn.dynamic_rnn()
    RNN_output, RNN_state = tf.nn.dynamic_rnn(
                                cell = cell,
                                inputs = inputs_embeded,
                                sequence_length = source_sequence_length,
                                dtype = tf.float32)
    
    return RNN_output, RNN_state

In [27]:

def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
    """
    Create a decoding layer for training
    :param encoder_state: Encoder State
    :param dec_cell: Decoder RNN Cell
    :param dec_embed_input: Decoder embedded input
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_summary_length: The length of the longest sequence in the batch
    :param output_layer: Function to apply the output layer
    :param keep_prob: Dropout keep probability
    :return: BasicDecoderOutput containing training logits and sample_id
    """
    # TODO: Implement Function
    
    # Create a tf.contrib.seq2seq.TrainingHelper
    training_helper = tf.contrib.seq2seq.TrainingHelper(
                                            inputs = dec_embed_input,
                                            sequence_length = target_sequence_length)
    
    # Create a tf.contrib.seq2seq.BasicDecoder
    basic_decoder = tf.contrib.seq2seq.BasicDecoder(
                                            cell = dec_cell,
                                            helper = training_helper,
                                            initial_state = encoder_state,
                                            output_layer = output_layer)
    
    # Obtain the decoder outputs from tf.contrib.seq2seq.dynamic_decode
    BasicDecoderOutput = tf.contrib.seq2seq.dynamic_decode(
                                            decoder = basic_decoder,
                                            impute_finished = True,
                                            maximum_iterations = max_summary_length 
                                            )

    return BasicDecoderOutput[0]

### Decoding - Inference
Create inference decoder:
* Create a [`tf.contrib.seq2seq.GreedyEmbeddingHelper`](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/GreedyEmbeddingHelper)
* Create a [`tf.contrib.seq2seq.BasicDecoder`](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/BasicDecoder)
* Obtain the decoder outputs from [`tf.contrib.seq2seq.dynamic_decode`](https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/dynamic_decode)

In [28]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    """
    Create a decoding layer for inference
    :param encoder_state: Encoder state
    :param dec_cell: Decoder RNN Cell
    :param dec_embeddings: Decoder embeddings
    :param start_of_sequence_id: GO ID
    :param end_of_sequence_id: EOS Id
    :param max_target_sequence_length: Maximum length of target sequences
    :param vocab_size: Size of decoder/target vocabulary
    :param decoding_scope: TenorFlow Variable Scope for decoding
    :param output_layer: Function to apply the output layer
    :param batch_size: Batch size
    :param keep_prob: Dropout keep probability
    :return: BasicDecoderOutput containing inference logits and sample_id
    """
    # TODO: Implement Function
    
    # creates a new tensor by replicating start_of_sequence_id batch_size times.
    start_tokens = tf.tile(tf.constant([start_of_sequence_id],dtype = tf.int32),[batch_size], name = 'start_tokens' )
        
    # Create a tf.contrib.seq2seq.GreedyEmbeddingHelper
    embedding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                                embedding = dec_embeddings,
                                start_tokens = start_tokens, 
                                end_token = end_of_sequence_id)
    
    # Create a tf.contrib.seq2seq.BasicDecoder
    basic_decoder = tf.contrib.seq2seq.BasicDecoder(
                                                cell = dec_cell,
                                                helper = embedding_helper,
                                                initial_state = encoder_state,
                                                output_layer = output_layer)
    
    # Obtain the decoder outputs from tf.contrib.seq2seq.dynamic_decode
    BasicDecoderOutput = tf.contrib.seq2seq.dynamic_decode(
                                                decoder = basic_decoder,
                                                impute_finished = True,
                                                maximum_iterations = max_target_sequence_length)

    return BasicDecoderOutput[0]

### Build the Decoding Layer
Implement `decoding_layer()` to create a Decoder RNN layer.

* Embed the target sequences
* Construct the decoder LSTM cell (just like you constructed the encoder cell above)
* Create an output layer to map the outputs of the decoder to the elements of our vocabulary
* Use the your `decoding_layer_train(encoder_state, dec_cell, dec_embed_input, target_sequence_length, max_target_sequence_length, output_layer, keep_prob)` function to get the training logits.
* Use your `decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, end_of_sequence_id, max_target_sequence_length, vocab_size, output_layer, batch_size, keep_prob)` function to get the inference logits.

Note: I need to use [tf.variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope) to share variables between training and inference.

In [29]:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size):
    """
    Create decoding layer
    :param dec_input: Decoder input
    :param encoder_state: Encoder state
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_target_sequence_length: Maximum length of target sequences
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param target_vocab_size: Size of target vocabulary
    :param batch_size: The size of the batch
    :param keep_prob: Dropout keep probability
    :param decoding_embedding_size: Decoding embedding size
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # TODO: Implement Function
    
    # Embed the target sequences
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    # Construct the decoder LSTM cell (just like you constructed the encoder cell above)
    cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers) ])
    cell_dropout = tf.contrib.rnn.DropoutWrapper(cell, keep_prob)
    
    # Create an output layer to map the outputs of the decoder to the elements of our vocabulary
    output_layer = Dense(target_vocab_size)
                        
    
    # Use the your decoding_layer_train(encoder_state, dec_cell, dec_embed_input, target_sequence_length, 
    # max_target_sequence_length, output_layer, keep_prob) function to get the training logits.
    with tf.variable_scope("decode"):
        Training_BasicDecoderOutput = decoding_layer_train(encoder_state, 
                                                       cell_dropout, 
                                                       dec_embed_input, 
                                                       target_sequence_length, 
                                                       max_target_sequence_length, 
                                                       output_layer, 
                                                       keep_prob)
    
    # Use your decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, 
    # end_of_sequence_id, max_target_sequence_length, vocab_size, output_layer, batch_size, keep_prob) 
    # function to get the inference logits.
    with tf.variable_scope("decode", reuse=True):
        Inference_BasicDecoderOutput = decoding_layer_infer(encoder_state, 
                                                        cell_dropout, 
                                                        dec_embeddings, 
                                                        target_vocab_to_int['<GO>'], 
                                                        target_vocab_to_int['<EOS>'],
                                                        max_target_sequence_length, 
                                                        target_vocab_size,
                                                        output_layer,
                                                        batch_size, 
                                                        keep_prob)
    return Training_BasicDecoderOutput, Inference_BasicDecoderOutput

### Build the Neural Network
Apply the functions you implemented above to:

- Encode the input using your `encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob,  source_sequence_length, source_vocab_size, encoding_embedding_size)`.
- Process target data using your `process_decoder_input(target_data, target_vocab_to_int, batch_size)` function.
- Decode the encoded input using your `decoding_layer(dec_input, enc_state, target_sequence_length, max_target_sentence_length, rnn_size, num_layers, target_vocab_to_int, target_vocab_size, batch_size, keep_prob, dec_embedding_size)` function.

In [30]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_sequence_length, target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    """
    Build the Sequence-to-Sequence part of the neural network
    :param input_data: Input placeholder
    :param target_data: Target placeholder
    :param keep_prob: Dropout keep probability placeholder
    :param batch_size: Batch Size
    :param source_sequence_length: Sequence Lengths of source sequences in the batch
    :param target_sequence_length: Sequence Lengths of target sequences in the batch
    : max_target_sentence_length,
    :param source_vocab_size: Source vocabulary size
    :param target_vocab_size: Target vocabulary size
    :param enc_embedding_size: Decoder embedding size
    :param dec_embedding_size: Encoder embedding size
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # TODO: Implement Function
    
    # Encode the input using your encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob,  source_sequence_length, source_vocab_size, encoding_embedding_size).
    rnn_output , rnn_state = encoding_layer(input_data, 
                   rnn_size, 
                   num_layers, 
                   keep_prob, 
                   source_sequence_length, 
                   source_vocab_size, 
                   enc_embedding_size)
    
    # Process target data using your process_decoder_input(target_data, target_vocab_to_int, batch_size) function.
    decoder_input = process_decoder_input(target_data,
                                        target_vocab_to_int,
                                        batch_size)
    
    # Decode the encoded input using your decoding_layer(dec_input, enc_state, target_sequence_length, max_target_sentence_length, 
    # rnn_size, num_layers, target_vocab_to_int, target_vocab_size, batch_size, keep_prob, dec_embedding_size) function.
    Training_BasicDecoderOutput, Inference_BasicDecoderOutput = decoding_layer(
                                        decoder_input,
                                        rnn_state,
                                        target_sequence_length,
                                        max_target_sentence_length,
                                        rnn_size,
                                        num_layers,
                                        target_vocab_to_int,
                                        target_vocab_size,
                                        batch_size,
                                        keep_prob,
                                        dec_embedding_size)
    
    return Training_BasicDecoderOutput, Inference_BasicDecoderOutput

## Neural Network Training
### Hyperparameters
Tune the following parameters:

- Set `epochs` to the number of epochs.
- Set `batch_size` to the batch size.
- Set `rnn_size` to the size of the RNNs.
- Set `num_layers` to the number of layers.
- Set `encoding_embedding_size` to the size of the embedding for the encoder.
- Set `decoding_embedding_size` to the size of the embedding for the decoder.
- Set `learning_rate` to the learning rate.
- Set `keep_probability` to the Dropout keep probability
- Set `display_step` to state how many steps between each debug output statement

In [35]:
# Number of Epochs
epochs = 2
# Batch Size
batch_size = 64
# RNN Size
rnn_size = 512
# Number of Layers
num_layers = 1

encoding_embedding_size = 256
decoding_embedding_size = 256
# Learning Rate
learning_rate = 0.0001
# Dropout Keep Probability
keep_probability = 0.75
display_step = 100

In [36]:
save_path = 'checkpoints/dev'
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()
max_target_sentence_length = max([len(sentence) for sentence in source_int_text])

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob, target_sequence_length, max_target_sequence_length, source_sequence_length = model_inputs()

    #sequence_length = tf.placeholder_with_default(max_target_sentence_length, None, name='sequence_length')
    input_shape = tf.shape(input_data)

    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   source_sequence_length,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)


    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        # Monitor gradient
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


Tensor("target:0", shape=(?, ?), dtype=int32)


In [None]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths


In [41]:
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

# Split data to training and validation sets
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = source_int_text[:batch_size]
valid_target = target_int_text[:batch_size]
(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             source_vocab_to_int['<PAD>'],
                                                                                                             target_vocab_to_int['<PAD>']))                                                                                                  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_vocab_to_int['<PAD>'],
                            target_vocab_to_int['<PAD>'])):
            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 source_sequence_length: sources_lengths,
                 keep_prob: keep_probability})


            if batch_i % display_step == 0 and batch_i > 0:


                batch_train_logits = sess.run(
                    inference_logits,
                    {input_data: source_batch,
                     source_sequence_length: sources_lengths,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})


                batch_valid_logits = sess.run(
                    inference_logits,
                    {input_data: valid_sources_batch,
                     source_sequence_length: valid_sources_lengths,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)
                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')

Epoch   0 Batch  100/9165 - Train Accuracy: 0.4787, Validation Accuracy: 0.6283, Loss: 4.1028
Epoch   0 Batch  200/9165 - Train Accuracy: 0.4943, Validation Accuracy: 0.6283, Loss: 3.8270
Epoch   0 Batch  300/9165 - Train Accuracy: 0.4773, Validation Accuracy: 0.6283, Loss: 3.8269
Epoch   0 Batch  400/9165 - Train Accuracy: 0.6494, Validation Accuracy: 0.6283, Loss: 2.6169
Epoch   0 Batch  500/9165 - Train Accuracy: 0.6730, Validation Accuracy: 0.6596, Loss: 2.3349
Epoch   0 Batch  600/9165 - Train Accuracy: 0.7451, Validation Accuracy: 0.6596, Loss: 1.8361
Epoch   0 Batch  700/9165 - Train Accuracy: 0.5996, Validation Accuracy: 0.6596, Loss: 2.9289
Epoch   0 Batch  800/9165 - Train Accuracy: 0.5000, Validation Accuracy: 0.6596, Loss: 4.0214
Epoch   0 Batch  900/9165 - Train Accuracy: 0.6387, Validation Accuracy: 0.6596, Loss: 2.5360
Epoch   0 Batch 1000/9165 - Train Accuracy: 0.3750, Validation Accuracy: 0.6596, Loss: 3.9122
Epoch   0 Batch 1100/9165 - Train Accuracy: 0.4439, Validati

Epoch   0 Batch 8900/9165 - Train Accuracy: 0.7021, Validation Accuracy: 0.7054, Loss: 1.8571
Epoch   0 Batch 9000/9165 - Train Accuracy: 0.6777, Validation Accuracy: 0.7132, Loss: 2.0486
Epoch   0 Batch 9100/9165 - Train Accuracy: 0.6406, Validation Accuracy: 0.7042, Loss: 2.1871
Model Trained and Saved


### Save Parameters
Save the `batch_size` and `save_path` parameters for inference.

In [42]:
# Save parameters for checkpoint
helper.save_params(save_path)

In [44]:
import tensorflow as tf
import numpy as np
import helper

_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()
load_path = helper.load_params()

## Sentence to Sequence
To feed a sentence into the model for translation, you first need to preprocess it.  Implement the function `sentence_to_seq()` to preprocess new sentences.

- Convert the sentence to lowercase
- Convert words into ids using `vocab_to_int`
 - Convert words not in the vocabulary, to the `<UNK>` word id.

In [45]:
def sentence_to_seq(sentence, vocab_to_int):
    """
    Convert a sentence to a sequence of ids
    :param sentence: String
    :param vocab_to_int: Dictionary to go from the words to an id
    :return: List of word ids
    """
    

    # TODO: Implement Function
    
    # Convert the sentence to lowercase and to list
    list_words = [word for word in sentence.lower().split() ]
    
    # Convert words into ids using vocab_to_int
    list_words_int = list()
    for word in list_words:
        # Convert words not in the vocabulary, to the <UNK> word id.
        if word not in vocab_to_int:
            list_words_int.append(vocab_to_int['<UNK>'])
        else:
            list_words_int.append(vocab_to_int[word])
    return list_words_int


In [46]:
# Use other TED speeches for building test set
test_speeches = {}
for segments in stm_segments[220:225]:
    for segment in segments:
        segment_key = "{0}_{1}_{2}".format(segment.speaker_id.strip(), str(segment.start_time).replace('.','_'),
                                          str(segment.stop_time).replace('.','_'))

        speech = None
        # If not already exist
        if segment.speaker_id not in test_speeches.keys():
            # Connect to Cloud API to get Candidate Transcripts
            source_file = os.path.join(os.getcwd(), 'TEDLIUM_release1', 'train','sph', '{}.sph'.format(segment.filename))
            speech = Speech(speaker_id=segment.speaker_id,
                                           speech_id = segment_key,
                                           source_file=source_file,
                                           ground_truth = ' '.join(segment.transcript.split()[:-1]),
                                           start = segment.start_time,
                                           stop = segment.stop_time,
                                           audio_type = 'LINEAR16')
        else:
            speech = test_speeches[segment.speaker_id.strip()]
            print('Already found speech in list at location: ', speech)
        
        
        
        test_speeches[segment_key] = speech

In [48]:
gcs = GCSWrapper()
cache_directory = os.path.join(os.getcwd(), 'datacache', 'speech_objects')
for speech_id, speech in test_speeches.items():
    # Not already saved in prepocess cache
    try:
        if not speech.candidate_transcripts: 
            result = gcs.transcribe_speech(speech.audio_file)
            speech.populate_gcs_results(result)
            # Print Loading Report
            speech.preprocess_and_save()
    except:
        print("Catching API errors, no retry.")

Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching

Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.
Catching API errors, no retry.


In [50]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.2.5.tar.gz (1.2MB)
[K    100% |################################| 1.2MB 713kB/s eta 0:00:01
[?25hRequirement already up-to-date: six in /root/miniconda3/envs/tf-gpu/lib/python3.5/site-packages (from nltk)
Installing collected packages: nltk
  Running setup.py install for nltk ... [?25ldone
[?25hSuccessfully installed nltk-3.2.5


In [53]:
import nltk
import operator

token_dict = token_lookup()
steps = 0
show_results = 100
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    for speech in list(test_speeches.values()):
        gt_transcript = speech.ground_truth_transcript.lower()
        for key, token in token_dict.items():
            gt_transcript = gt_transcript.replace(key, ' {} '.format(token))

        cloud_speech_api_accuracy = []
        custom_lang_model_accuracy = []


        # Collect Google API Transcript
        google_api_transcript = ""
        words = []
        if speech.candidate_timestamps:
            for candidate_timestamp in speech.candidate_timestamps:
                words.append(candidate_timestamp["word"])
            google_api_transcript = " ".join(words)


        if speech.candidate_timestamps:
            candidate_script_accuracy = []
            for candidate_transcript in speech.candidate_transcripts:
                steps +=1
                transcription_sentence = sentence_to_seq(candidate_transcript["transcript"], source_vocab_to_int)

                transcription_logits = sess.run(logits, {input_data: [transcription_sentence]*batch_size,
                                                     target_sequence_length: [len(transcription_sentence)*2]*batch_size,
                                                     source_sequence_length: [len(transcription_sentence)]*batch_size,
                                                     keep_prob: 1.0})[0]
                prediction_transcript = " ".join([target_int_to_vocab[i] for i in transcription_logits])
                # Remove <EOS> Token
                prediction_transcript = prediction_transcript.replace('<EOS>','')

                if steps % show_results == 0:  
                    print()
                    print('GCS Candidate Transcript: \n{}'.format(" ".join([source_int_to_vocab[i] for i in transcription_sentence])))
                    print('Seq2Seq Model Prediction Transcript: \n{}'.format(prediction_transcript))
                    print('Ground Truth Transcript: \n{}'.format(gt_transcript))
                    print()

                # Compute the Candidate Transcript Edit Distance (a.k.a. From the Predicted Distance)
                # Use this to determine how likely sentence would have been predicted
                gct_ed = nltk.edit_distance(candidate_transcript["transcript"].lower(), prediction_transcript.lower())
                gct_upper_bound = max(len(candidate_transcript["transcript"]),len(prediction_transcript))
                gct_accuracy = (1.0 - gct_ed/gct_upper_bound)

                gct_accuracy = gct_accuracy*candidate_transcript["confidence"]
                candidate_script_accuracy.append(gct_accuracy)



            # Select Candidate Transcript with the highest accuracy (to prediction)

            index, value = max(enumerate(candidate_script_accuracy), key=operator.itemgetter(1))

            tmp = []
            for candidate_transcript in speech.candidate_transcripts:
                tmp.append(candidate_transcript["transcript"])

            reranked_transcript = tmp[index]


            # Collect Accuracy between reranked transcript and Google transcript                      
            gcs_ed = nltk.edit_distance(google_api_transcript.lower(), gt_transcript.lower())
            gcs_upper_bound = max(len(google_api_transcript),len(gt_transcript))
            gcs_accuracy = (1.0 - gcs_ed/gcs_upper_bound)

            clm_ed = nltk.edit_distance(reranked_transcript.lower(), gt_transcript.lower())
            clm_upper_bound = max(len(reranked_transcript),len(gt_transcript))
            clm_accuracy = (1.0 - clm_ed/clm_upper_bound)

            cloud_speech_api_accuracy.append(gcs_accuracy)
            custom_lang_model_accuracy.append(clm_accuracy)

            if steps % show_results == 0:                         
                print('Speech Results:')
                print('Average Candidate Transcript Accuracy:', np.mean(cloud_speech_api_accuracy))
                print('Average Seq2Seq Model Accuracy:', np.mean(custom_lang_model_accuracy))
                print()


INFO:tensorflow:Restoring parameters from checkpoints/dev
