A mock notebook motivated by an awesome blogpost: https://towardsdatascience.com/seq2seq-model-in-tensorflow-ec0c557e560f 

In [1]:
import copy
import numpy as np
import os
import pickle

In [2]:
import warnings
warnings.filterwarnings('ignore')


In [3]:
def load_data(path):
    path_to_input = os.path.join(path)
    with open(path_to_input, 'r', encoding='utf-8') as f:
        data = f.read()
    return data

In [4]:
path_to_english_input = 'MLT/data/small_vocab_en'
path_to_french_output = 'MLT/data/small_vocab_fr'

In [5]:
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

START_OF_SENTENCE_FLAG = '<GO>'
END_OF_SENTENCE_FLAG = '<EOS>'

def create_lookup_tables(text):
    """
    params: 
    text: raw text without being splitted 
    """
    vocab = set(text.split())
    
    vocab_to_int = copy.copy(CODES)
    
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i
        
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}
    
    return vocab_to_int, int_to_vocab


In [6]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
    params:
    source_text: unsplit raw string from source
    target_text: unplit raw string from target
    source_vocab_to_int: dictionary that maps each word in the source_text to an integer
    target_vocab_to_int: dictionary that maps each word in the target_text to an integer
    """
    
    # each will be a list of lists
    source_text_id = []
    target_text_id = []
    
    source_sentences = source_text.split('\n')
    target_sentences = target_text.split('\n')
    
    assert len(source_sentences) == len(target_sentences)
    
    for i in range(len(source_sentences)):
        
        source_sentence = source_sentences[i]
        target_sentence = target_sentences[i]
        
        source_tokens = source_sentence.split(' ')
        target_tokens = target_sentence.split(' ')
        
        source_token_id = []
        target_token_id = []
        
        for index, token in enumerate(source_tokens):
            if token != "":
                source_token_id.append(source_vocab_to_int[token])
                
        for index, token in enumerate(target_tokens):
            if token != "":
                target_token_id.append(target_vocab_to_int[token])                
        # IMPORTANT! Add a EOS flag at the end of each target sentence
        target_token_id.append(target_vocab_to_int[END_OF_SENTENCE_FLAG])
        
        source_text_id.append(source_token_id)    
        target_text_id.append(target_token_id)
    
    return source_text_id, target_text_id
        

In [7]:
def preprocess_and_save_data(source_path, target_path, text_to_ids_func):
    source_text = load_data(source_path).lower()
    target_text = load_data(target_path).lower()
    
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)
    
    source_text_id, target_text_id = text_to_ids_func(source_text, target_text, source_vocab_to_int, target_vocab_to_int)
    
    pickle.dump(
        ((source_text_id, target_text_id),
         (source_vocab_to_int, target_vocab_to_int),
         (source_int_to_vocab, target_int_to_vocab)),
        open('MLT/preprocess.p', 'wb')
    )
    

In [8]:
def load_preprocess():
    with open('MLT/preprocess.p', mode='rb') as fh:
        return pickle.load(fh)

In [10]:
preprocess_and_save_data(path_to_english_input, path_to_french_output, text_to_ids)

In [11]:
(source_text_int, target_text_int), (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = load_preprocess()

In [12]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
from tensorflow.python.layers.core import Dense

In [13]:
# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.3.0


In [14]:
def encoder_decoder_model_inputs():
    """
    return:
    inputs: placeholder for the input, size [batch_size, max_sequence_length]
    targets: placeholder for the output, size [batch_size, max_sequence_length]
    """
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')  # (batch_size, legnths of sentences)
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_len = tf.reduce_max(target_sequence_length)
    
    return inputs, targets, target_sequence_length, max_target_len

In [15]:
def hyperparam_inputs():
    """
    return (learning_rate, dropout_probability)
    """
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return learning_rate, keep_prob

In [16]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    """
    return:
    target_data with each training example being prefixed with the id of the '<GO>' tag 
    """
    start_id = target_vocab_to_int['<GO>']
    prefixed_target_data = tf.concat([tf.fill([batch_size, 1], start_id), 
                                    target_data], 
                                   1)
    return prefixed_target_data

In [17]:
def build_encoder(rnn_inputs, rnn_size, num_layers, keep_prob,
                  source_vocab_size, encoder_embedding_size):
    """
    params:
      rnn_inputs: input tensor, size: [batch_size, sequence_length]
      rnn_size: size of the RNN
      num_layers: numb. of the RNN layers to be stacked 
      keep_prob: drop out probability
      source_vocab_size: vocabulary size of the source text
      encoder_embedding_size: embedding size for encoder's embedding dimension
    return:
      outputs: outputs from the, shape: [batch_size, sequence_length, output_dimension]
      state: emboder's state, shape: ???
    """
    
    embed = tf.contrib.layers.embed_sequence(rnn_inputs, 
                                             vocab_size=source_vocab_size,
                                             embed_dim=encoder_embedding_size)
    # embed has shape: [batch_size, sequence_length, embed_dim]
    
    stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])    
    
    outputs, state = tf.nn.dynamic_rnn(cell=stacked_cells, 
                                       inputs=embed,
                                       dtype=tf.float32)
    return outputs, state

In [18]:
def build_decoder_for_training(encoder_state, decoder_cell, decoder_embedding_inputs,
                               target_sequence_length, max_target_sequence_length, 
                               output_layer, keep_prob):
    """
    params:
      encoder_state: output states from the encoder part 
      decoder_cell: RNN cell(s) for the decoder
      decoder_embedding_inputs: embedding inputs for the decoder
      target_sequence_length: An int32 vector tensor, to be passed to TrainingHelper
      max_target_sequence_length: maximum allowed number of decoding step
      output_layer:
      keep_prob: 
    returns:
      outputs: outputs of the decoder for the training phase, shape: ???
    """
    
    decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embedding_inputs, 
                                               sequence_length=target_sequence_length)
    
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=output_layer)
    
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True,
                                                      maximum_iterations=max_target_sequence_length)
    
    return outputs


In [19]:
def build_decoder_for_inference(encoder_state, decoder_cell, decoder_embeddings,
                                start_of_sequence_id, end_of_sequence_id,
                                max_target_sequence_length, 
                                output_layer, batch_size, keep_prob):
    """
    params:
      encoder_state: output states from the encoder
      decoder_cell: RNN cell(s) for the decoder
      decoder_embeddings: embeddings for the decoder
      start_of_sequence_id: the id corresponding to the START_OF_SEQUENCE flag
      end_of_sequence_id: the id corresponding to the END_OF_SEQUENCE flag
      max_target_sequence_length: the max of the target_sequence_lengths
      output_layer: output layer 
      batch_size: 
      keep_prob:
    returns:

    """
    
    decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)
    
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=decoder_embeddings,
                                                      start_tokens=tf.fill([batch_size], start_of_sequence_id),
                                                      end_token=end_of_sequence_id)                                   
    
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell,
                                              helper=helper,
                                              initial_state=encoder_state,
                                              output_layer=output_layer)
    
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True,
                                                      maximum_iterations=max_target_sequence_length)
    
    return outputs


In [20]:
def build_decoder(decoder_input, encoder_state,
                  target_sequence_length, max_target_sequence_length,
                  rnn_size, num_layers, 
                  target_vocab_to_int,
                  batch_size, keep_prob, decoder_embedding_size):
    """
    params:
      decoder_input: sequence of integers, shape: [batch_size, sequence_length]
      encoder_state: encoder output state
      target_sequence_length: An int32 vector tensor, to be passed to TrainingHelper
      max_target_sequence_length: maximum target sequence length
      rnn_size: size of the RNN
      num_layers: numb. of the RNN layers to be stacked 
      target_vocab_to_int: mappings from a target word to int
      batch_size: batch_size
      keep_prob: drop out probability
      decoder_embedding_size: embedding size for decoder's embedding dimension
    returns:
      
    """
    
    target_vocab_size = len(target_vocab_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoder_embedding_size]))
    decoder_embedding_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
        
    stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
    
    with tf.variable_scope("decode"):
        #output_layer = tf.layers.Dense(target_vocab_size)
        output_layer = Dense(target_vocab_size)
        decoder_training_output = build_decoder_for_training(encoder_state, 
                                                             stacked_cells,
                                                             decoder_embedding_input,
                                                             target_sequence_length,
                                                             max_target_sequence_length,
                                                             output_layer,
                                                             keep_prob)
        
    with tf.variable_scope("decode", reuse=True):
        decoder_inference_output = build_decoder_for_inference(encoder_state,
                                                               stacked_cells,
                                                               decoder_embeddings,
                                                               target_vocab_to_int[START_OF_SENTENCE_FLAG],
                                                               target_vocab_to_int[END_OF_SENTENCE_FLAG],
                                                               max_target_sequence_length,
                                                               output_layer,
                                                               batch_size,
                                                               keep_prob)

    return (decoder_training_output, decoder_inference_output)
        

In [21]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  target_sequence_length, max_target_sequence_length,
                  source_vocab_size, target_vocab_size,
                  encoder_embedding_size, decoder_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    """
    params:
    returns:
    """
    
    encoder_output, encoder_state = build_encoder(input_data, rnn_size, num_layers, keep_prob,
                                                  source_vocab_size, encoder_embedding_size)
    
    decoder_input = process_decoder_input(target_data, target_vocab_to_int, batch_size)
    
    training_output, inference_output = build_decoder(decoder_input, encoder_state,
                                                      target_sequence_length, max_target_sequence_length,
                                                      rnn_size, num_layers, 
                                                      target_vocab_to_int,
                                                      batch_size, keep_prob, decoder_embedding_size)
    
    return training_output, inference_output
    
    

In [22]:
display_step = 200

epochs = 6
batch_size = 128

rnn_size = 128
num_layers = 3

encoder_embedding_size = 200
decoder_embedding_size = 200

learning_rate = 0.001
keep_prob = 0.5


In [23]:
save_path = 'MLT/checkpoints/dev'

(source_text_int, target_text_int), (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = load_preprocess()

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, target_data, target_sequence_length, max_target_sequence_length = encoder_decoder_model_inputs()
    
    param_learning_rate, param_keep_prob = hyperparam_inputs()
    
    train_logits, inference_logits = seq2seq_model(input_data,
                                                   target_data,
                                                   param_keep_prob,
                                                   batch_size,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoder_embedding_size,
                                                   decoder_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)
                                                   
    train_logits_output = tf.identity(train_logits.rnn_output, name='logits')
    train_prediction = tf.identity(inference_logits.sample_id, name='prediction')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length,
                             dtype=tf.float32, name='masks')
    
    with tf.name_scope('optimization'):
        
        cost = tf.contrib.seq2seq.sequence_loss(
            logits=train_logits_output,
            targets=target_data,
            weights=masks)
        
        optimizer = tf.train.AdamOptimizer(param_learning_rate)
        
        gradients = optimizer.compute_gradients(cost)
        clipped_gradients = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gradients if grad is not None]
        
        train_op = optimizer.apply_gradients(clipped_gradients)
            

In [24]:
def pad_sentence_batch(sentence_batch, pad_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [25]:
def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    for batch_idx in range(len(sources) // batch_size):
        start_idx = batch_idx * batch_size
        end_idx = start_idx + batch_size
        
        sources_batch = sources[start_idx: end_idx]
        targets_batch = targets[start_idx: end_idx]
        
        padded_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        padded_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        padded_sources_lengths = []
        for source in padded_sources_batch:
            padded_sources_lengths.append(len(source))
            
        padded_targets_lengths = []
        for target in padded_targets_batch:
            padded_targets_lengths.append(len(target))
            
        yield padded_sources_batch, padded_targets_batch, padded_sources_lengths, padded_targets_lengths       
                  

In [26]:
def get_accuracy(targets, predictions, padding_int=0):
    max_sequence_length = max(targets.shape[1], predictions.shape[1])
    
    if max_sequence_length > targets.shape[1]:
        targets = np.pad(targets, 
                         [(0, 0), (0, max_sequence_length - targets.shape[1])],
                         mode='constant',
                         constant_values=padding_int)
        
    if max_sequence_length > predictions.shape[1]:
        predictions = np.pad(predictions,
                             [(0, 0), (0, max_sequence_length - predictions.shape[1])],
                             mode='constant',
                             constant_values=padding_int)
        
    return np.mean(np.equal(targets, predictions))    

In [27]:
train_sources = source_text_int[batch_size:]
train_targets = target_text_int[batch_size:]
valid_sources = source_text_int[:batch_size]
valid_targets = target_text_int[:batch_size]

(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths) = \
  next(get_batches(valid_sources, 
                   valid_targets, 
                   batch_size, 
                   source_vocab_to_int['<PAD>'],
                   target_vocab_to_int['<PAD>']))
    

In [28]:
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_idx in range(epochs):
        for batch_idx, (sources_batch, targets_batch, sources_lengths, targets_lengths) in enumerate(
          get_batches(train_sources,
                      train_targets,
                      batch_size,
                      source_vocab_to_int['<PAD>'],
                      target_vocab_to_int['<PAD>']
                      )):
        
            _, loss = sess.run([train_op, cost], 
                               feed_dict={
                                   input_data: sources_batch,
                                   target_data: targets_batch,
                                   target_sequence_length: targets_lengths,
                                   param_learning_rate: learning_rate, 
                                   param_keep_prob: keep_prob                               
                               })
        
            if batch_idx % display_step == 0 and batch_idx > 0:
                batch_train_predictions = sess.run(train_prediction, 
                                                   feed_dict={
                                                       input_data: sources_batch,
                                                       target_sequence_length: targets_lengths,
                                                       param_keep_prob: 1.0
                                                   })
                
                batch_valid_predictions = sess.run(train_prediction,
                                                   feed_dict={
                                                       input_data: valid_sources_batch,
                                                       target_sequence_length: valid_targets_lengths,
                                                       param_keep_prob: 1.0
                                                   })
                
                train_acc = get_accuracy(targets_batch, batch_train_predictions)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_predictions)
                
                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_idx, batch_idx, len(source_text_int) // batch_size, train_acc, valid_acc, loss))
        
    saver = tf.train.Saver()
    saver.save(sess, save_path)

Epoch   0 Batch  200/1077 - Train Accuracy: 0.4570, Validation Accuracy: 0.5149, Loss: 2.2556
Epoch   0 Batch  400/1077 - Train Accuracy: 0.4629, Validation Accuracy: 0.5014, Loss: 1.5660
Epoch   0 Batch  600/1077 - Train Accuracy: 0.5212, Validation Accuracy: 0.5515, Loss: 1.1406
Epoch   0 Batch  800/1077 - Train Accuracy: 0.4867, Validation Accuracy: 0.5685, Loss: 0.9940
Epoch   0 Batch 1000/1077 - Train Accuracy: 0.5874, Validation Accuracy: 0.5700, Loss: 0.8334
Epoch   1 Batch  200/1077 - Train Accuracy: 0.5395, Validation Accuracy: 0.5994, Loss: 0.7999
Epoch   1 Batch  400/1077 - Train Accuracy: 0.5805, Validation Accuracy: 0.6005, Loss: 0.7432
Epoch   1 Batch  600/1077 - Train Accuracy: 0.6183, Validation Accuracy: 0.6335, Loss: 0.6269
Epoch   1 Batch  800/1077 - Train Accuracy: 0.5695, Validation Accuracy: 0.6417, Loss: 0.6264
Epoch   1 Batch 1000/1077 - Train Accuracy: 0.6414, Validation Accuracy: 0.6445, Loss: 0.5331
Epoch   2 Batch  200/1077 - Train Accuracy: 0.5813, Validati

In [48]:
def save_params(save_path):
    with open('MLT/params.p', 'wb') as fh:
        pickle.dump(save_path, fh)
        
def load_params():
    with open('MLT/params.p', 'rb') as fh:
        return pickle.load(fh)
    

In [49]:
save_params(save_path)

In [51]:
load_path = load_params()

In [52]:
def setence_to_seq(sentence, vocab_to_int):
    seq = []
    for word in sentence.split(" "):
        if word in vocab_to_int:
            seq.append(vocab_to_int[word])
        else:
            seq.append(vocab_to_int['<UNK>'])
    return seq


In [53]:
_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = load_preprocess()

In [57]:
english_sentence = 'he saw a old yellow truck .'
english_seq = setence_to_seq(english_sentence, source_vocab_to_int)

In [59]:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)
    
    input_data = loaded_graph.get_tensor_by_name('inputs:0')  # from encoder_decoder_model_inputs
    predictions = loaded_graph.get_tensor_by_name('prediction:0') # from the main program that specifies the graph
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    param_keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0') # from hyperparam_inputs
    
    french_seq = sess.run(predictions,
                          feed_dict={
                              input_data: [english_seq] * batch_size,
                              target_sequence_length: [len(english_seq) * 2] * batch_size,
                              param_keep_prob: 1.0                              
                          })[0]
                         
print('Input')
print('  Word Ids:      {}'.format([i for i in english_seq]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in english_seq]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in french_seq]))
print('  French Words: {}'.format(" ".join([target_int_to_vocab[i] for i in french_seq])))    

INFO:tensorflow:Restoring parameters from MLT/checkpoints/dev


INFO:tensorflow:Restoring parameters from MLT/checkpoints/dev


Input
  Word Ids:      [115, 184, 51, 17, 107, 205, 23]
  English Words: ['he', 'saw', 'a', 'old', 'yellow', 'truck', '.']

Prediction
  Word Ids:      [227, 6, 31, 102, 175, 20, 193, 1]
  French Words: il conduisait un gros camion jaune . <EOS>
