In [314]:
import numpy as np
import tensorflow as tf
import os
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import time
import re
from sklearn.model_selection import train_test_split

In [377]:
def load_book(path):
    input_file = os.path.join(path)
    with open(input_file) as f:
        book = f.read()
    return book

In [378]:
path = './books/'
book_files = [f for f in listdir(path) if isfile(join(path, f))]
book_files = book_files[1:]

In [379]:
books = []
for book in book_files:
    books.append(load_book(path+book))

In [380]:
for i in range(len(books)):
    print("There are {} words in {}.".format(len(books[i].split()), book_files[i]))

There are 126999 words in Pride_and_Prejudice_by_Jane_Austen.rtf.
There are 113452 words in David_Copperfield_by_Charles_Dickens.rtf.
There are 194282 words in The_Romance_of_Lust_by_Anonymous.rtf.
There are 25395 words in Metamorphosis_by_Franz_Kafka.rtf.
There are 191598 words in Great_Expectations_by_Charles_Dickens.rtf.
There are 165188 words in Oliver_Twist_by_Charles_Dickens.rtf.
There are 53211 words in The_Prince_by_Nicolo_Machiavelli.rtf.
There are 96185 words in The_Adventures_of_Tom_Sawyer_by_Mark_Twain.rtf.
There are 480495 words in The_Count_of_Monte_Cristo_by_Alexandre_Dumas.rtf.
There are 78912 words in Frankenstein_by_Mary_Shelley.rtf.
There are 33464 words in Through_the_Looking_Glass_by_Lewis_Carroll.rtf.
There are 9463 words in The_Yellow_Wallpaper_by_Charlotte_Perkins_Gilman.rtf.
There are 166996 words in Dracula_by_Bram_Stoker.rtf.
There are 163109 words in Emma_by_Jane_Austen.rtf.
There are 105428 words in Grimms_Fairy_Tales_by_The_Brothers_Grimm.rtf.
There are 83

In [381]:
def clean_text(text):
    text = re.sub(r'\n', ' ', text) 
    text = re.sub(r'[{}@_*>()\\#%+=\[\]]','', text)
    text = re.sub('a0','', text)
    text = re.sub('\'92t','\'t', text)
    text = re.sub('\'92s','\'s', text)
    text = re.sub('\'92m','\'m', text)
    text = re.sub('\'92ll','\'ll', text)
    text = re.sub('\'91','', text)
    text = re.sub('\'92','', text)
    text = re.sub('\'93','', text)
    text = re.sub('\'94','', text)
    text = re.sub('\.','. ', text)
    text = re.sub('\!','! ', text)
    text = re.sub('\?','? ', text)
    text = re.sub(' +',' ', text)
    return text

In [382]:
clean_books = []
for book in books:
    clean_books.append(clean_text(book))

In [383]:
vocab_to_int = {}
count = 0
for book in clean_books:
    for character in book:
        if character not in vocab_to_int:
            vocab_to_int[character] = count
            count += 1

In [384]:
codes = ['<PAD>','<EOS>','<GO>']
for code in codes:
    vocab_to_int[code] = count
    count += 1

In [385]:
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

The vocabulary contains 78 characters.
[' ', '!', '"', '$', '&', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<EOS>', '<GO>', '<PAD>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [386]:
int_to_vocab = {}
for character, value in vocab_to_int.items():
    int_to_vocab[value] = character

In [387]:
sentences = []
for book in clean_books:
    for sentence in book.split('. '):
        sentences.append(sentence + '.')
print("There are {} sentences.".format(len(sentences)))

There are 127068 sentences.


In [388]:
int_sentences = []

for sentence in sentences:
    int_sentence = []
    for character in sentence:
        int_sentence.append(vocab_to_int[character])
    int_sentences.append(int_sentence)

In [389]:
max_length = 200
min_length = 10

good_sentences = []

for sentence in int_sentences:
    if len(sentence) <= max_length and len(sentence) >= min_length:
        good_sentences.append(sentence)
        

print("{} sentences available to train and test our model.".format(len(good_sentences)))

98648 sentences available to train and test our model.


In [390]:
training, remaining = train_test_split(good_sentences, test_size = 0.25, random_state = 2)

print("Number of training sentences:", len(training))
print("Number of testing and validation sentences:", len(remaining))

validation, testing = train_test_split(remaining, test_size = 0.4, random_state = 2)
print("Number of validation sentences:", len(validation))
print("Number of testing sentences:", len(testing))

Number of training sentences: 73986
Number of testing and validation sentences: 24662
Number of validation sentences: 14797
Number of testing sentences: 9865


In [391]:
training_sorted = []
validation_sorted = []
testing_sorted = []


for i in range(min_length, max_length+1):
    for sentence in training:
        if len(sentence) == i:
            training_sorted.append(sentence)
    for sentence in validation:
        if len(sentence) == i:
            validation_sorted.append(sentence)
    for sentence in testing:
        if len(sentence) == i:
            testing_sorted.append(sentence)

In [392]:
for i in range(5):
    print(training_sorted[i], len(training_sorted[i]))

[25, 56, 41, 33, 32, 36, 50, 19, 60, 42] 10
[37, 4, 9, 1, 7, 6, 1, 7, 5, 42] 10
[55, 23, 8, 1, 7, 13, 5, 19, 49, 42] 10
[63, 7, 8, 30, 2, 7, 23, 20, 22, 42] 10
[55, 23, 8, 1, 7, 13, 5, 19, 49, 42] 10


In [393]:
letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]

In [394]:
def noise_maker(sentence, threshold):
    
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        random = np.random.uniform(0,1,1)
        if random < threshold:
            noisy_sentence.append(sentence[i])
        else:
            new_random = np.random.uniform(0,1,1)
            if new_random > 0.67:
                if i == (len(sentence) - 1):
                    continue
                else:
                    noisy_sentence.append(sentence[i+1])
                    noisy_sentence.append(sentence[i])
                    i += 1
            elif new_random < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(vocab_to_int[random_letter])
                noisy_sentence.append(sentence[i])
            else:
                pass     
        i += 1
    return noisy_sentence

In [395]:
threshold = 0.9
for sentence in training_sorted[:5]:
    print(sentence)
    print(noise_maker(sentence, threshold))
    print()

[25, 56, 41, 33, 32, 36, 50, 19, 60, 42]
[25, 56, 41, 32, 36, 50, 19, 60, 42]

[37, 4, 9, 1, 7, 6, 1, 7, 5, 42]
[37, 4, 9, 1, 7, 6, 1, 7, 5, 42]

[55, 23, 8, 1, 7, 13, 5, 19, 49, 42]
[55, 23, 8, 1, 7, 13, 5, 19, 49, 42]

[63, 7, 8, 30, 2, 7, 23, 20, 22, 42]
[63, 7, 8, 30, 2, 7, 23, 20, 22, 42]

[55, 23, 8, 1, 7, 13, 5, 19, 49, 42]
[55, 23, 8, 16, 1, 7, 13, 5, 19, 49, 42]



In [396]:
def model_inputs():
    
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    with tf.name_scope('targets'):
        targets = tf.placeholder(tf.int32, [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    inputs_length = tf.placeholder(tf.int32, (None,), name='inputs_length')
    targets_length = tf.placeholder(tf.int32, (None,), name='targets_length')
    max_target_length = tf.reduce_max(targets_length, name='max_target_len')

    return inputs, targets, keep_prob, inputs_length, targets_length, max_target_length

In [397]:
def process_encoding_input(targets, vocab_to_int, batch_size):
    
    with tf.name_scope("process_encoding"):
        ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
        dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [398]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob, direction):
    
    if direction == 1:
        with tf.name_scope("RNN_Encoder_Cell_1D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    lstm = tf.contrib.rnn.LSTMCell(rnn_size)

                    drop = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.dynamic_rnn(drop,rnn_inputs,sequence_length,dtype=tf.float32)

            return enc_output, enc_state
        
        
    if direction == 2:
        with tf.name_scope("RNN_Encoder_Cell_2D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw,input_keep_prob = keep_prob)

                    cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw,input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw,cell_bw,rnn_inputs,sequence_length,dtype=tf.float32)


            enc_output = tf.concat(enc_output,2)
            return enc_output, enc_state[0]

In [399]:
def training_decoding_layer(dec_embed_input, targets_length, dec_cell, initial_state, output_layer,vocab_size, max_target_length):
    
    with tf.name_scope("Training_Decoder"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,sequence_length=targets_length,time_major=False)


        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,training_helper,initial_state,output_layer)


        training_logits, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_target_length)


        return training_logits

In [400]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,max_target_length, batch_size):
    
    with tf.name_scope("Inference_Decoder"):
        start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,start_tokens,end_token)

        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,inference_helper,initial_state,output_layer)

        inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_target_length)

        return inference_logits

In [401]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, inputs_length, targets_length, 
                   max_target_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers, direction):
    
    with tf.name_scope("RNN_Decoder_Cell"):
        for layer in range(num_layers):
            with tf.variable_scope('decoder_{}'.format(layer)):
                lstm = tf.contrib.rnn.LSTMCell(rnn_size)
                dec_cell = tf.contrib.rnn.DropoutWrapper(lstm,input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,enc_output,inputs_length,normalize=False,name='BahdanauAttention')
    
    with tf.name_scope("Attention_Wrapper"):
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,attn_mech,rnn_size)
    
    initial_state =  dec_cell.zero_state(batch_size=batch_size,dtype=tf.float32).clone(cell_state=enc_state)

    with tf.variable_scope("decode"):

        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,sequence_length=targets_length,time_major=False)


        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,training_helper,initial_state,output_layer) 


        training_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_target_length)
        
    with tf.variable_scope("decode", reuse=True):

        start_tokens = tf.tile(tf.constant([vocab_to_int['<GO>']], dtype=tf.int32), [batch_size], name='start_tokens')
        end_token = (tf.constant(vocab_to_int['<EOS>'], dtype=tf.int32))
        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,start_tokens,end_token)


        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,inference_helper,initial_state,output_layer)


        inference_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,output_time_major=False,impute_finished=True,maximum_iterations=max_target_length)

    return training_logits, inference_logits

In [402]:
def seq2seq_model(inputs, targets, keep_prob, inputs_length, targets_length, max_target_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, embedding_size, direction):
    
    enc_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    enc_embed_input = tf.nn.embedding_lookup(enc_embeddings, inputs)
    enc_output, enc_state = encoding_layer(rnn_size, inputs_length, num_layers, enc_embed_input, keep_prob, direction)
    
    dec_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    dec_input = process_encoding_input(targets, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, dec_embeddings,enc_output,enc_state, vocab_size, inputs_length,targets_length, 
                                                        max_target_length,rnn_size, vocab_to_int, keep_prob, batch_size,num_layers,direction)
    
    return training_logits, inference_logits

In [403]:
def pad_sentence_batch(sentence_batch):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [404]:
def get_batches(sentences, batch_size, threshold):
    
    for batch_i in range(0, len(sentences)//batch_size):
        start_i = batch_i * batch_size
        sentences_batch = sentences[start_i:start_i + batch_size]
        
        sentences_batch_noisy = []
        for sentence in sentences_batch:
            sentences_batch_noisy.append(noise_maker(sentence, threshold))
            
        sentences_batch_eos = []
        for sentence in sentences_batch:
            sentence.append(vocab_to_int['<EOS>'])
            sentences_batch_eos.append(sentence)
            
        pad_sentences_batch = np.array(pad_sentence_batch(sentences_batch_eos))
        pad_sentences_noisy_batch = np.array(pad_sentence_batch(sentences_batch_noisy))
        
        pad_sentences_lengths = []
        for sentence in pad_sentences_batch:
            pad_sentences_lengths.append(len(sentence))
        
        pad_sentences_noisy_lengths = []
        for sentence in pad_sentences_noisy_batch:
            pad_sentences_noisy_lengths.append(len(sentence))
        
        yield pad_sentences_noisy_batch, pad_sentences_batch, pad_sentences_noisy_lengths, pad_sentences_lengths

In [405]:
#Hyperparameters
epochs = 50
batch_size = 128
num_layers = 2
rnn_size = 512
embedding_size = 128
learning_rate = 0.0005
direction = 2
threshold = 0.99
keep_probability = 0.75

In [406]:
def build_graph(keep_prob, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction):

    tf.reset_default_graph()
    
    inputs, targets, keep_prob, inputs_length, targets_length, max_target_length = model_inputs()

    training_logits, inference_logits = seq2seq_model(tf.reverse(inputs, [-1]),targets,keep_prob,inputs_length,targets_length,max_target_length,len(vocab_to_int)+1,
                                                      rnn_size,num_layers,vocab_to_int,batch_size,embedding_size,direction)


    training_logits = tf.identity(training_logits.rnn_output, 'logits')

    with tf.name_scope('predictions'):
        predictions = tf.identity(inference_logits.sample_id, name='predictions')
        tf.summary.histogram('predictions', predictions)

    masks = tf.sequence_mask(targets_length, max_target_length, dtype=tf.float32, name='masks')
    
    with tf.name_scope("cost"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, 
                                                targets, 
                                                masks)
        tf.summary.scalar('cost', cost)

    with tf.name_scope("optimze"):
        optimizer = tf.train.AdamOptimizer(learning_rate)

        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    merged = tf.summary.merge_all()    

    export_nodes = ['inputs', 'targets', 'keep_prob', 'cost', 'inputs_length', 'targets_length',
                    'predictions', 'merged', 'train_op','optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])

    return graph

In [407]:
def train(model, epochs, log_string):
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # wandb.tensorflow.log(tf.summary.merge_all())

        testing_loss_summary = []

        iteration = 0
        
        display_step = 30 
        stop_early = 0 
        stop = 5 
        per_epoch = 3 
        testing_check = (len(training_sorted)//batch_size//per_epoch)-1

        print()
        print("Training Model: {}".format(log_string))

        train_writer = tf.summary.FileWriter('./logs/1/train/{}'.format(log_string), sess.graph)
        test_writer = tf.summary.FileWriter('./logs/1/test/{}'.format(log_string))

        for epoch_i in range(1, epochs+1): 
            batch_loss = 0
            batch_time = 0
            
            for batch_i, (input_batch, target_batch, input_length, target_length) in enumerate(
                    get_batches(training_sorted, batch_size, threshold)):
                start_time = time.time()

                summary, loss, _ = sess.run([model.merged,
                                             model.cost, 
                                             model.train_op], 
                                             {model.inputs: input_batch,
                                              model.targets: target_batch,
                                              model.inputs_length: input_length,
                                              model.targets_length: target_length,
                                              model.keep_prob: keep_probability})


                batch_loss += loss
                wandb.log({"Training_Loss": loss})
                end_time = time.time()
                batch_time += end_time - start_time

                train_writer.add_summary(summary, iteration)

                iteration += 1

                if batch_i % display_step == 0 and batch_i > 0:
                    print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                          .format(epoch_i,
                                  epochs, 
                                  batch_i, 
                                  len(training_sorted) // batch_size, 
                                  batch_loss / display_step, 
                                  batch_time))
                    batch_loss = 0
                    batch_time = 0

                if batch_i % testing_check == 0 and batch_i > 0:
                    batch_loss_testing = 0
                    batch_time_testing = 0
                    for batch_i, (input_batch, target_batch, input_length, target_length) in enumerate(
                            get_batches(validation_sorted, batch_size, threshold)):
                        start_time_testing = time.time()
                        summary, loss = sess.run([model.merged,
                                                  model.cost], 
                                                     {model.inputs: input_batch,
                                                      model.targets: target_batch,
                                                      model.inputs_length: input_length,
                                                      model.targets_length: target_length,
                                                      model.keep_prob: 1})

                        batch_loss_testing += loss
                        wandb.log({"Testing_Loss":loss})
                        end_time_testing = time.time()
                        batch_time_testing += end_time_testing - start_time_testing

                        test_writer.add_summary(summary, iteration)

                    n_batches_testing = batch_i + 1
                    print('Testing Loss: {:>6.3f}, Seconds: {:>4.2f}'
                          .format(batch_loss_testing / n_batches_testing, 
                                  batch_time_testing))
                    
                    batch_time_testing = 0

                    testing_loss_summary.append(batch_loss_testing)
                    if batch_loss_testing <= min(testing_loss_summary):
                        print('New Record!') 
                        stop_early = 0
                        checkpoint = "./model/{}.ckpt".format(log_string)
                        saver = tf.train.Saver()
                        saver.save(sess, checkpoint)

                    else:
                        print("No Improvement.")
                        stop_early += 1
                        if stop_early == stop:
                            break

            if stop_early == stop:
                print("Stopping Training.")
                break

In [408]:
def ids_to_sentences(sentence_id_list):
    resulting_sentence_list = []
    pad = vocab_to_int["<PAD>"] 
    for sentence in sentence_id_list:
        resulting_sentence_list.append("".join([int_to_vocab[i] for i in sentence if i != pad]))

    return resulting_sentence_list

In [409]:
checkpoint = "./model3/kp=0.75,nl=2,th=0.95.ckpt"
path_to_ckpt_meta = "./model3/kp=0.75,nl=2,th=0.95.ckpt.meta"
path_to_ckpt_data = "./model3/kp=0.75,nl=2,th=0.95.ckpt.data"

In [410]:
def text_to_ints(text):
    '''Prepare the text for the model'''
    
    text = clean_text(text)
    return [vocab_to_int[word] for word in text]

In [286]:
model = build_graph(keep_probability, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction) 

pad = vocab_to_int["<PAD>"]



In [284]:
# PREDICTION FOR USER INPUT

def predict(text):
    text = text_to_ints(text)
    with tf.Session() as sess:
        # Load saved model
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        answer_logits = sess.run(model.predictions, {model.inputs: [text]*batch_size, 
                                                     model.inputs_length: [len(text)]*batch_size,
                                                     model.targets_length: [len(text)+1], 
                                                     model.keep_prob: [1.0]})[0]




        print('\nText')
        print('  Word Ids:    {}'.format([i for i in text]))
        print('  Input Words: {}'.format("".join([int_to_vocab[i] for i in text])))

        print('\nSummary')
        print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
        print('  Response Words: {}'.format("".join([int_to_vocab[i] for i in answer_logits if i != pad])))
        print("-------------------------------------------------------------------------------------------")


In [308]:
# Deletion and insertion
predict("The schoool bell rang nw")


Text
  Word Ids:    [32, 24, 23, 19, 6, 8, 24, 13, 13, 13, 20, 19, 17, 23, 20, 20, 19, 0, 4, 5, 10, 19, 5, 28]
  Input Words: The schoool bell rang nw

Summary
  Word Ids:       [32, 24, 23, 19, 6, 8, 24, 13, 13, 20, 19, 17, 23, 20, 20, 19, 0, 4, 5, 10, 19, 5, 13, 28, 19]
  Response Words: The school bell rang now 
-------------------------------------------------------------------------------------------


In [309]:
# Deletion and insertion
predict("The new cupcakes are noww sellin")


Text
  Word Ids:    [32, 24, 23, 19, 5, 23, 28, 19, 8, 16, 9, 8, 4, 30, 23, 6, 19, 4, 0, 23, 19, 5, 13, 28, 28, 19, 6, 23, 20, 20, 7, 5]
  Input Words: The new cupcakes are noww sellin

Summary
  Word Ids:       [32, 24, 23, 19, 5, 23, 28, 19, 8, 16, 9, 8, 4, 30, 23, 6, 19, 4, 0, 23, 19, 5, 13, 28, 19, 6, 23, 20, 20, 7, 5, 10, 19]
  Response Words: The new cupcakes are now selling 
-------------------------------------------------------------------------------------------


In [310]:
# Two swaps
predict("eH is writnig a letter")


Text
  Word Ids:    [23, 56, 19, 7, 6, 19, 28, 0, 7, 1, 5, 7, 10, 19, 4, 19, 20, 23, 1, 1, 23, 0]
  Input Words: eH is writnig a letter

Summary
  Word Ids:       [56, 23, 19, 7, 6, 19, 28, 0, 7, 1, 7, 5, 10, 19, 4, 19, 20, 23, 1, 1, 23, 0, 19]
  Response Words: He is writing a letter 
-------------------------------------------------------------------------------------------


In [311]:
# Contextual insertion
predict("The fish swam in the se")


Text
  Word Ids:    [32, 24, 23, 19, 2, 7, 6, 24, 19, 6, 28, 4, 21, 19, 7, 5, 19, 1, 24, 23, 19, 6, 23]
  Input Words: The fish swam in the se

Summary
  Word Ids:       [32, 24, 23, 19, 2, 7, 6, 24, 19, 6, 28, 4, 21, 19, 7, 5, 19, 1, 24, 23, 19, 6, 23, 4]
  Response Words: The fish swam in the sea
-------------------------------------------------------------------------------------------


In [312]:
# Contextual Insertion
predict("Mrs. Bennet wanted to se her mother")


Text
  Word Ids:    [61, 0, 6, 42, 19, 37, 23, 5, 5, 23, 1, 19, 28, 4, 5, 1, 23, 22, 19, 1, 13, 19, 6, 23, 19, 24, 23, 0, 19, 21, 13, 1, 24, 23, 0]
  Input Words: Mrs. Bennet wanted to se her mother

Summary
  Word Ids:       [61, 0, 6, 19, 37, 23, 5, 5, 23, 1, 19, 28, 4, 5, 1, 23, 22, 19, 1, 13, 19, 6, 23, 23, 19, 24, 23, 0, 19, 21, 13, 1, 24, 23, 0, 19]
  Response Words: Mrs Bennet wanted to see her mother 
-------------------------------------------------------------------------------------------


In [313]:
# Two swaps
predict("She toko a jar frmo the shelf")


Text
  Word Ids:    [55, 24, 23, 19, 1, 13, 30, 13, 19, 4, 19, 34, 4, 0, 19, 2, 0, 21, 13, 19, 1, 24, 23, 19, 6, 24, 23, 20, 2]
  Input Words: She toko a jar frmo the shelf

Summary
  Word Ids:       [55, 24, 23, 19, 1, 13, 13, 30, 19, 4, 19, 34, 4, 0, 19, 2, 0, 13, 21, 19, 1, 24, 23, 19, 6, 24, 23, 20, 2, 19]
  Response Words: She took a jar from the shelf 
-------------------------------------------------------------------------------------------


In [315]:
# Mistaken insertion 
predict("His sisster is a singe")


Text
  Word Ids:    [56, 7, 6, 19, 6, 7, 6, 6, 1, 23, 0, 19, 7, 6, 19, 4, 19, 6, 7, 5, 10, 23]
  Input Words: His sisster is a singe

Summary
  Word Ids:       [56, 7, 6, 19, 6, 7, 6, 1, 23, 0, 19, 7, 6, 19, 4, 19, 6, 7, 5, 10, 23, 0, 23]
  Response Words: His sister is a singere
-------------------------------------------------------------------------------------------


In [336]:
# Undetected error
predict("He is sellin his house")


Text
  Word Ids:    [56, 23, 19, 7, 6, 19, 6, 23, 20, 20, 7, 5, 19, 24, 7, 6, 19, 24, 13, 16, 6, 23]
  Input Words: He is sellin his house

Summary
  Word Ids:       [56, 23, 19, 7, 6, 19, 6, 23, 20, 20, 7, 5, 19, 24, 7, 6, 19, 24, 13, 16, 6, 23, 19]
  Response Words: He is sellin his house 
-------------------------------------------------------------------------------------------


In [420]:
# MODEL EVALUATION ON THE TEST SET

def evaluate(testing_sorted):

    # Check to ensure noise_maker is making mistakes correctly.
    noisy_sentence_list = []
    threshold = 0.99
    for sentence in testing_sorted:
        noisy_sentence_list.append(noise_maker(sentence, threshold))

        text_list = noisy_sentence_list

    # for text in text_list:
    #     text = text_to_ints(text)

    predictions_list = []

    # random = np.random.randint(0,len(testing_sorted))
    # text = testing_sorted[random]
    # text = noise_maker(text, 0.95)

    checkpoint = "./model3/kp=0.75,nl=2,th=0.95.ckpt"

    prediction_list = []
    with tf.Session() as sess:
        # Load saved model
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        for text in text_list:

            answer_logits = sess.run(model.predictions, {model.inputs: [text]*batch_size, 
                                                         model.inputs_length: [len(text)]*batch_size,
                                                         model.targets_length: [len(text)+1], 
                                                         model.keep_prob: [1.0]})[0]

            prediction_list.append("".join([int_to_vocab[i] for i in answer_logits if i != pad]))

    return prediction_list

In [422]:
reference = ids_to_sentences(testing_sorted)

In [426]:
hypothesis = evaluate(testing_sorted)

In [448]:
from nltk.translate.bleu_score import corpus_bleu
reference_list = list(map(lambda x: x.split(" "),reference))
hypothesis_list = list(map(lambda x: x.split(" "),hypothesis))

print(corpus_bleu(reference_list,hypothesis_list))

9.825686517089071e-232
