From 15b14a87c5abe5046933a3870149d2a8910c7bd4 Mon Sep 17 00:00:00 2001
From: Denny Britz
Date: Sat, 10 Dec 2016 20:30:39 -0800
Subject: [PATCH] Enforce styling with YAPF

---
 .style.yapf                           |   4 +
 pylintrc                              |   6 +-
 seq2seq/decoders/attention.py         |  35 ++++---
 seq2seq/decoders/attention_decoder.py |  65 ++++++----
 seq2seq/decoders/basic_decoder.py     |  37 ++++---
 seq2seq/decoders/decoder_base.py      | 125 ++++++++++++---------
 seq2seq/encoders/rnn_encoder.py       |  27 ++---
 seq2seq/graph_module.py               |  13 +--
 seq2seq/inputs.py                     |  82 +++++++++------
 seq2seq/losses.py                     |   7 +-
 seq2seq/models/attention_seq2seq.py   |  58 ++++++-----
 seq2seq/models/basic_seq2seq.py       |  59 ++++++-----
 seq2seq/models/model_base.py          | 121 ++++++++++++----------
 seq2seq/scripts/generate_examples.py  |  19 ++--
 seq2seq/scripts/generate_toy_data.py  |  59 +++++++----
 seq2seq/scripts/generate_vocab.py     |  48 ++++++---
 seq2seq/scripts/profile.py            |  31 +++---
 seq2seq/test/attention_test.py        |  11 +-
 seq2seq/test/decoder_test.py          |  71 +++++++------
 seq2seq/test/hparams_parser_test.py   |  21 ++--
 seq2seq/test/inputs_test.py           |  19 ++--
 seq2seq/test/losses_test.py           |  11 +-
 seq2seq/test/models_test.py           | 141 ++++++++++++++++----------
 seq2seq/test/rnn_encoder_test.py      |  60 ++++++-----
 seq2seq/test/utils.py                 |   2 +
 seq2seq/training/featurizers.py       |  62 +++++++----
 seq2seq/training/hooks.py             |  64 +++++++-----
 seq2seq/training/hparams_parser.py    |   1 +
 seq2seq/training/train.py             |  63 ++++++------
 seq2seq/training/utils.py             |  92 ++++++++++-------
 30 files changed, 850 insertions(+), 564 deletions(-)
 create mode 100644 .style.yapf

diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 00000000..f499f526
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,4 @@
+[style]
+based_on_style = google
+indent_width = 2
+column_limit = 80
\ No newline at end of file

diff --git a/pylintrc b/pylintrc
index 1632c696..3d8e3792 100644
--- a/pylintrc
+++ b/pylintrc
@@ -193,7 +193,7 @@ max-nested-blocks=5
[FORMAT]

# Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=80

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

@@ -216,7 +216,7 @@ max-module-lines=1000
indent-string='  '

# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=2
+indent-after-paren=4

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

@@ -238,7 +238,7 @@ notes=FIXME,XXX,TODO
[SIMILARITIES]

# Minimum lines number of a similarity.
-min-similarity-lines=4
+min-similarity-lines=10

# Ignore comments when computing similarities.
ignore-comments=yes

diff --git a/seq2seq/decoders/attention.py b/seq2seq/decoders/attention.py
index 909af343..7fceb02f 100644
--- a/seq2seq/decoders/attention.py
+++ b/seq2seq/decoders/attention.py
@@ -22,16 +22,19 @@ def _build(self, state, inputs):
    """Computes attention scores and outputs.

    Args:
-      state: The state based on which to calculate attention scores. In seq2seq this is typically
-        the current state of the decoder. A tensor of shape `[B, ...]`
+      state: The state based on which to calculate attention scores.
+        In seq2seq this is typically the current state of the decoder.
+        A tensor of shape `[B, ...]`
      inputs: The elements to compute attention *over*. In seq2seq this is
-        typically the sequence of encoder outputs. A tensor of shape `[B, T, input_dim]`
+        typically the sequence of encoder outputs.
+        A tensor of shape `[B, T, input_dim]`

    Returns:
      A tuple `(scores, context)`.
-      `scores` is vector of length `T` where each element is the normalized "score" of
-        the corresponding `inputs` element.
-      `context` is the final attention layer output corresponding to the weighted inputs.
+      `scores` is a vector of length `T` where each element is the
+        normalized "score" of the corresponding `inputs` element.
+      `context` is the final attention layer output corresponding to
+        the weighted inputs.
+        A tensor of shape `[B, input_dim]`.
    """
    batch_size, inputs_timesteps, _ = tf.unpack(tf.shape(inputs))
    inputs_dim = inputs.get_shape().as_list()[-1]

@@ -40,25 +43,31 @@ def _build(self, state, inputs):
    # Fully connected layers to transform both inputs and state
    # into a tensor with `num_units` units
    inputs_att = tf.contrib.layers.fully_connected(
-        inputs=inputs, num_outputs=self.num_units, activation_fn=None, scope="inputs_att")
+        inputs=inputs,
+        num_outputs=self.num_units,
+        activation_fn=None,
+        scope="inputs_att")
    state_att = tf.contrib.layers.fully_connected(
-        inputs=state, num_outputs=self.num_units, activation_fn=None, scope="state_att")
+        inputs=state,
+        num_outputs=self.num_units,
+        activation_fn=None,
+        scope="state_att")

    # Take the dot product of state for each time step in inputs
    # Result: A tensor of shape [B, T]
    inputs_att_flat = tf.reshape(inputs_att, [-1, self.num_units])
    state_att_flat = tf.reshape(
-        tf.tile(state_att, [1, inputs_timesteps]),
-        [inputs_timesteps * batch_size, self.num_units])
+        tf.tile(state_att, [1, inputs_timesteps]),
+        [inputs_timesteps * batch_size, self.num_units])
    scores = tf.batch_matmul(
-        tf.expand_dims(inputs_att_flat, 1),
-        tf.expand_dims(state_att_flat, 2))
+        tf.expand_dims(inputs_att_flat, 1), tf.expand_dims(state_att_flat, 2))
    scores = tf.reshape(scores, [batch_size, inputs_timesteps], name="scores")

    # Normalize the scores
    scores_normalized = tf.nn.softmax(scores, name="scores_normalized")

-    # Calculate the weighted average of the attention inputs according to the scores
+    # Calculate the weighted average of the attention inputs
+    # according to the scores
    context = tf.expand_dims(scores_normalized, 2) * inputs
    context = tf.reduce_sum(context, 1, name="context")
    context.set_shape([None, inputs_dim])
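For orientation, the scoring in `AttentionLayer._build` above reduces to a dot product between the projected decoder state and each projected encoder output, a softmax over time, and a weighted sum of the raw inputs. A minimal NumPy sketch of the same computation (shapes and names are illustrative, not part of the patch):

import numpy as np

def attention(state_att, inputs_att, inputs):
  # state_att: [B, num_units], inputs_att: [B, T, num_units], inputs: [B, T, D]
  scores = np.einsum("bu,btu->bt", state_att, inputs_att)  # dot products
  scores = np.exp(scores - scores.max(axis=1, keepdims=True))
  scores /= scores.sum(axis=1, keepdims=True)              # softmax over T
  context = np.einsum("bt,btd->bd", scores, inputs)        # weighted average
  return scores, context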
diff --git a/seq2seq/decoders/attention_decoder.py b/seq2seq/decoders/attention_decoder.py
index 532d40b7..46968708 100644
--- a/seq2seq/decoders/attention_decoder.py
+++ b/seq2seq/decoders/attention_decoder.py
@@ -6,28 +6,41 @@
import tensorflow as tf
from seq2seq.decoders import DecoderBase, DecoderOutput, DecoderStepOutput

+
class AttentionDecoderOutput(
    namedtuple("DecoderOutput", ["logits", "predictions", "attention_scores"])):
  """Augmented decoder output that also includes the attention scores.
  """
  pass

+
class AttentionDecoder(DecoderBase):
  """An RNN Decoder that uses attention over an input sequence.

  Args:
    cell: An instance of `tf.nn.rnn_cell.RNNCell`
-    vocab_size: Output vocabulary size, i.e. number of units in the softmax layer
-    attention_inputs: The sequence to take attentio over. A tensor of shaoe `[B, T, ...]`.
-    attention_fn: The attention function to use. This function map from `(state, inputs)` to
-      `(attention_scores, attention_context)`.
+    vocab_size: Output vocabulary size, i.e. number of units
+      in the softmax layer
+    attention_inputs: The sequence to take attention over.
+      A tensor of shape `[B, T, ...]`.
+    attention_fn: The attention function to use. This function maps from
+      `(state, inputs)` to `(attention_scores, attention_context)`.
      For an example, see `seq2seq.decoder.attention.AttentionLayer`.
-    max_decode_length: Maximum length for decoding steps for each example of shape `[B]`.
-    prediction_fn: Optional. A function that generates a predictions of shape `[B]` from a logits
-      of shape `[B, vocab_size]`. By default, this is argmax.
+    max_decode_length: Maximum length for decoding steps
+      for each example of shape `[B]`.
+    prediction_fn: Optional. A function that generates predictions
+      of shape `[B]` from logits of shape `[B, vocab_size]`.
+      By default, this is argmax.
  """
-  def __init__(self, cell, vocab_size, attention_inputs, attention_fn, max_decode_length,
-               prediction_fn=None, name="attention_decoder"):
+
+  def __init__(self,
+               cell,
+               vocab_size,
+               attention_inputs,
+               attention_fn,
+               max_decode_length,
+               prediction_fn=None,
+               name="attention_decoder"):
    super(AttentionDecoder, self).__init__(cell, max_decode_length, name)
    self.vocab_size = vocab_size
    self.prediction_fn = prediction_fn
@@ -40,7 +53,8 @@
  @staticmethod
  def _pack_outputs(outputs_ta, final_loop_state):
-    logits, predictions = DecoderBase._pack_outputs(outputs_ta, final_loop_state)
+    logits, predictions = DecoderBase._pack_outputs(outputs_ta,
+                                                    final_loop_state)
    attention_scores = tf.transpose(final_loop_state.pack(), [1, 0, 2])
    return AttentionDecoderOutput(logits, predictions, attention_scores)

  def _step(self, time_, cell_output, cell_state, loop_state, next_input_fn):
    initial_call = (cell_output is None)

    if initial_call:
-      cell_output = tf.zeros([tf.shape(self.attention_inputs)[0], self.cell.output_size])
+      cell_output = tf.zeros(
+          [tf.shape(self.attention_inputs)[0], self.cell.output_size])
      # Initialize the TensorArray that will hold the attention scores
-      next_loop_state = tf.TensorArray(dtype=tf.float32, size=1, dynamic_size=True)
+      next_loop_state = tf.TensorArray(
+          dtype=tf.float32, size=1, dynamic_size=True)

    # Compute attention
-    att_scores, attention_context = self.attention_fn(cell_output, self.attention_inputs)
+    att_scores, attention_context = self.attention_fn(cell_output,
+                                                      self.attention_inputs)

    # In the first step the attention vector is set to all zeros
    if initial_call:
@@ -64,22 +81,26 @@
    # Softmax computation
    softmax_input = tf.concat(1, [cell_output, attention_context])
    logits = tf.contrib.layers.fully_connected(
-        inputs=softmax_input, num_outputs=self.vocab_size, activation_fn=None, scope="logits")
+        inputs=softmax_input,
+        num_outputs=self.vocab_size,
+        activation_fn=None,
+        scope="logits")
    predictions = self.prediction_fn(logits)
    outputs = DecoderOutput(logits, predictions)

    if initial_call:
      outputs = DecoderOutput(
-          logits=tf.zeros([self.vocab_size]),
-          predictions=tf.zeros([], dtype=tf.int64))
+          logits=tf.zeros([self.vocab_size]),
+          predictions=tf.zeros(
+              [], dtype=tf.int64))

    # Append the attention context to the inputs
-    next_input = next_input_fn(
-        time_, (None if initial_call else cell_output), cell_state, loop_state, outputs)
+    next_input = next_input_fn(time_, (None if initial_call else cell_output),
+                               cell_state, loop_state, outputs)
    next_input = tf.concat(1, [next_input, attention_context])

    return DecoderStepOutput(
-        outputs=outputs,
-        next_input=next_input,
-        next_cell_state=cell_state,
-        next_loop_state=next_loop_state)
+        outputs=outputs,
+        next_input=next_input,
+        next_cell_state=cell_state,
+        next_loop_state=next_loop_state)
diff --git a/seq2seq/decoders/basic_decoder.py b/seq2seq/decoders/basic_decoder.py
index 5df70d22..6eeaff92 100644
--- a/seq2seq/decoders/basic_decoder.py
+++ b/seq2seq/decoders/basic_decoder.py
@@ -5,17 +5,27 @@
import tensorflow as tf
from seq2seq.decoders import DecoderBase, DecoderOutput, DecoderStepOutput

+
class BasicDecoder(DecoderBase):
-  """A simple RNN decoder that performed a softmax operations on the cell output.
+  """Simple RNN decoder that performs a softmax operation on the cell output.

  Args:
    cell: An instance of `tf.nn.rnn_cell.RNNCell`
-    vocab_size: Output vocabulary size, i.e. number of units in the softmax layer
-    max_decode_length: Maximum length for decoding steps for each example of shape `[B]`.
-    prediction_fn: Optional. A function that generates a predictions of shape `[B]` from a logits
-      of shape `[B, vocab_size]`. By default, this is argmax.
+    vocab_size: Output vocabulary size, i.e. number of units
+      in the softmax layer
+    max_decode_length: Maximum length for decoding steps for each example
+      of shape `[B]`.
+    prediction_fn: Optional. A function that generates predictions
+      of shape `[B]` from logits of shape `[B, vocab_size]`.
+      By default, this is argmax.
  """
-  def __init__(self, cell, vocab_size, max_decode_length, prediction_fn=None, name="basic_decoder"):
+
+  def __init__(self,
+               cell,
+               vocab_size,
+               max_decode_length,
+               prediction_fn=None,
+               name="basic_decoder"):
    super(BasicDecoder, self).__init__(cell, max_decode_length, name)
    self.vocab_size = vocab_size
    self.prediction_fn = prediction_fn
@@ -31,12 +41,13 @@ def _step(self, time_, cell_output, cell_state, loop_state, next_input_fn):
      cell_output = tf.zeros([1, self.cell.output_size])

    logits = tf.contrib.layers.fully_connected(
-        inputs=cell_output, num_outputs=self.vocab_size, activation_fn=None)
+        inputs=cell_output, num_outputs=self.vocab_size, activation_fn=None)

    if initial_call:
      outputs = DecoderOutput(
-          logits=tf.zeros([self.vocab_size]),
-          predictions=tf.zeros([], dtype=tf.int64))
+          logits=tf.zeros([self.vocab_size]),
+          predictions=tf.zeros(
+              [], dtype=tf.int64))
    else:
      predictions = self.prediction_fn(logits)
      outputs = DecoderOutput(logits, predictions)

    next_input = next_input_fn(time_, (None if initial_call else cell_output),
                               cell_state, loop_state, outputs)
    return DecoderStepOutput(
-        outputs=outputs,
-        next_input=next_input,
-        next_cell_state=cell_state,
-        next_loop_state=None)
+        outputs=outputs,
+        next_input=next_input,
+        next_cell_state=cell_state,
+        next_loop_state=None)
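As the decoder tests later in this patch show, a decoder is constructed once and then called with an input function, an initial state, and sequence lengths; the call returns `(outputs, final_state, final_loop_state)`. A minimal usage sketch mirroring `decoder_test.py` (tensor sizes are illustrative; `FixedDecoderInputs` is defined in decoder_base.py below):

import tensorflow as tf
from seq2seq.decoders import BasicDecoder, FixedDecoderInputs

cell = tf.nn.rnn_cell.GRUCell(64)
decoder_fn = BasicDecoder(cell=cell, vocab_size=30, max_decode_length=40)

inputs = tf.random_normal([32, 40, 64])          # [B, T, input_depth]
seq_length = tf.ones(32, dtype=tf.int32) * 40    # [B]
decoder_input_fn = FixedDecoderInputs(inputs, seq_length)

decoder_output, final_state, _ = decoder_fn(
    decoder_input_fn, cell.zero_state(32, dtype=tf.float32), seq_length)
# decoder_output.logits: [B, T, vocab_size]; decoder_output.predictions: [B, T]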
""" pass -class DecoderStepOutput(namedtuple( - "DecoderStepOutput", ["outputs", "next_cell_state", "next_input", "next_loop_state"])): + +class DecoderStepOutput( + namedtuple( + "DecoderStepOutput", + ["outputs", "next_cell_state", "next_input", "next_loop_state"])): """Output of a decoder step to be used with Tensorflow's `raw_rnn`. """ @@ -27,7 +30,13 @@ class RNNStep(GraphModule): """ A Wrapper around `raw_rnn`. """ - def __init__(self, step_fn, input_fn, initial_state, sequence_length, name="rnn_step"): + + def __init__(self, + step_fn, + input_fn, + initial_state, + sequence_length, + name="rnn_step"): super(RNNStep, self).__init__(name) self.step_fn = step_fn self.input_fn = input_fn @@ -38,7 +47,8 @@ def _build(self, time_, cell_output, cell_state, loop_state): if cell_output is None: cell_state = self.initial_state - step_output = self.step_fn(time_, cell_output, cell_state, loop_state, self.input_fn) + step_output = self.step_fn(time_, cell_output, cell_state, loop_state, + self.input_fn) assert isinstance(step_output, DecoderStepOutput), \ "Step output must be an isntance of DecoderStepOutput" @@ -47,19 +57,24 @@ def _build(self, time_, cell_output, cell_state, loop_state): else: elements_finished = (time_ >= self.sequence_length) - return (elements_finished, step_output.next_input, step_output.next_cell_state, - step_output.outputs, step_output.next_loop_state) + return (elements_finished, step_output.next_input, + step_output.next_cell_state, step_output.outputs, + step_output.next_loop_state) class FixedDecoderInputs(GraphModule): - """An operation that feeds fixed inputs to a decoder, also known as "teacher forcing". + """An operation that feeds fixed inputs to a decoder, + also known as "teacher forcing". Args: - inputs: The inputs to feed to the decoder. A tensor of shape `[B, T, ...]`. At each time - step T, one slice of shape `[B, ...]` is fed to the decoder. - sequence_length: A tensor of shape `[B]` that specifies the sequence length for each example. + inputs: The inputs to feed to the decoder. + A tensor of shape `[B, T, ...]`. At each time step T, one slice + of shape `[B, ...]` is fed to the decoder. + sequence_length: A tensor of shape `[B]` that specifies the + sequence length for each example. """ + def __init__(self, inputs, sequence_length, name="fixed_decoder_inputs"): super(FixedDecoderInputs, self).__init__(name) self.inputs = inputs @@ -67,13 +82,15 @@ def __init__(self, inputs, sequence_length, name="fixed_decoder_inputs"): with self.variable_scope(): self.inputs_ta = tf.TensorArray( - dtype=self.inputs.dtype, size=tf.shape(self.inputs)[1], name="inputs_ta") - self.inputs_ta = self.inputs_ta.unpack(tf.transpose(self.inputs, [1, 0, 2])) + dtype=self.inputs.dtype, + size=tf.shape(self.inputs)[1], + name="inputs_ta") + self.inputs_ta = self.inputs_ta.unpack( + tf.transpose(self.inputs, [1, 0, 2])) self.max_seq_len = tf.reduce_max(sequence_length, name="max_seq_len") self.batch_size = tf.identity(tf.shape(inputs)[0], name="batch_size") self.input_dim = tf.identity(tf.shape(inputs)[-1], name="input_dim") - def _build(self, time_, *args): """Returns the input for the given time step. @@ -81,29 +98,32 @@ def _build(self, time_, *args): time_: An int32 scalar Returns: - A tensor of shape `[B, ...]`. When `time_` is past the maximum sequence length - a zero tensor is fed as input for performance purposes. + A tensor of shape `[B, ...]`. When `time_` is past the maximum + sequence length a zero tensor is fed as input for performance purposes. 
""" all_finished = (time_ >= self.max_seq_len) next_input = tf.cond( - all_finished, - lambda: tf.zeros([self.batch_size, self.input_dim], dtype=tf.float32), - lambda: self.inputs_ta.read(time_)) + all_finished, + lambda: tf.zeros([self.batch_size, self.input_dim], dtype=tf.float32), + lambda: self.inputs_ta.read(time_)) next_input.set_shape([None, self.inputs.get_shape().as_list()[-1]]) return next_input class DynamicDecoderInputs(GraphModule): - """An operation that feeds dynamic inputs to a decoder according to some arbitrary - function that creates a new input from the decoder output at the current step, e.g. - `embed(argmax(logits))`. + """An operation that feeds dynamic inputs to a decoder according to some + arbitrary function that creates a new input from the decoder output at + the current step, e.g. `embed(argmax(logits))`. Args: - initial_inputs: An input to feed at the first time step. A tensor of shape `[B, ...]`. - make_input_fn: A function that mapes from `(decoder_output) -> next_input`, where - `next_input` must be a Tensor of shape `[B, ...]`. + initial_inputs: An input to feed at the first time step. + A tensor of shape `[B, ...]`. + make_input_fn: A function that mapes from `(decoder_output) -> next_input`, + where `next_input` must be a Tensor of shape `[B, ...]`. """ - def __init__(self, initial_inputs, make_input_fn, name="fixed_decoder_inputs"): + + def __init__(self, initial_inputs, make_input_fn, + name="fixed_decoder_inputs"): super(DynamicDecoderInputs, self).__init__(name) self.initial_inputs = initial_inputs self.make_input_fn = make_input_fn @@ -117,7 +137,6 @@ def _build(self, _time_, cell_output, _cell_state, _loop_state, step_output): return next_input - class DecoderBase(GraphModule): """Base class for RNN decoders. @@ -125,6 +144,7 @@ class DecoderBase(GraphModule): cell: An instance of ` tf.nn.rnn_cell.RNNCell` name: A name for this module """ + def __init__(self, cell, max_decode_length, name): super(DecoderBase, self).__init__(name) self.cell = cell @@ -132,9 +152,9 @@ def __init__(self, cell, max_decode_length, name): def _step(self, time, cell_output, cell_state, loop_state, next_input_fn): """ - This function maps from the decoder state to the outputs of the current time step - and the state of the next step. This is where the actual decoding logic should be implemented - by subclasses. + This function maps from the decoder state to the outputs of the current + time step and the state of the next step. This is where the actual decoding + logic should be implemented by subclasses. The arguments to this function follow those of `tf.nn.raw_rnn`. Refer to its documentation for further explanation. @@ -146,19 +166,22 @@ def _step(self, time, cell_output, cell_state, loop_state, next_input_fn): cell_state: The state result of applying the cell function to the input. A tensor of shape `[B, cell.state_size]`. This may also be a tuple depending on which type of cell is being used. - loop_state: An optional tuple that can be used to pass state through time steps. - The shape of this is defined by the subclass. - next_input_fn: A function that generates the next input, e.g. an instance of - `FixedDecoderInputs` or `DynamicDecoderInputs`. + loop_state: An optional tuple that can be used to pass state through + time steps. The shape of this is defined by the subclass. + next_input_fn: A function that generates the next input, e.g. an + instance of `FixedDecoderInputs` or `DynamicDecoderInputs`. 
    Returns:
      A `DecoderStepOutput` tuple, where:

-      outputs: The RNN output at this time step. A tuple with logits and predictions
-      next_cell_state: The cell state for the next iteration. In most cases this is
-        simply the passed in `cell_state`. A tensor of shape `[B, cell.state_size]`.
-      next_input: The input to the next time step. A tensor of shape `[B, ...]`
-      next_loop_state: A new loop state of the same type/shape as the passed in `loop_state`.
+      outputs: The RNN output at this time step. A tuple.
+      next_cell_state: The cell state for the next iteration. In most cases
+        this is simply the passed in `cell_state`.
+        A tensor of shape `[B, cell.state_size]`.
+      next_input: The input to the next time step.
+        A tensor of shape `[B, ...]`
+      next_loop_state: A new loop state of the same type/shape
+        as the passed in `loop_state`.
    """
    raise NotImplementedError

@@ -167,19 +190,21 @@ def _pack_outputs(outputs_ta, _final_loop_state):
    """Transposes outputs from time-major to batch-major.
    """
    logits = tf.transpose(outputs_ta.logits.pack(), [1, 0, 2], name="logits")
-    predictions = tf.transpose(outputs_ta.predictions.pack(), [1, 0], name="predictions")
+    predictions = tf.transpose(
+        outputs_ta.predictions.pack(), [1, 0], name="predictions")
    return DecoderOutput(logits=logits, predictions=predictions)

-
  def _build(self, input_fn, initial_state, sequence_length):
    if sequence_length is None:
      sequence_length = self.max_decode_length

    rnn_loop_fn = RNNStep(
-        step_fn=self._step,
-        input_fn=input_fn,
-        initial_state=initial_state,
-        sequence_length=tf.minimum(sequence_length, self.max_decode_length))
-
-    outputs_ta, final_state, final_loop_state = tf.nn.raw_rnn(self.cell, rnn_loop_fn)
-    return self._pack_outputs(outputs_ta, final_loop_state), final_state, final_loop_state
+        step_fn=self._step,
+        input_fn=input_fn,
+        initial_state=initial_state,
+        sequence_length=tf.minimum(sequence_length, self.max_decode_length))
+
+    outputs_ta, final_state, final_loop_state = tf.nn.raw_rnn(self.cell,
+                                                              rnn_loop_fn)
+    return self._pack_outputs(outputs_ta,
+                              final_loop_state), final_state, final_loop_state
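`RNNStep` adapts `_step` to the `loop_fn` contract of `tf.nn.raw_rnn`: each call returns `(elements_finished, next_input, next_cell_state, emit_output, next_loop_state)`, and the first call (where `cell_output is None`) must supply the initial state and input. A self-contained sketch of that contract, adapted from the `raw_rnn` documentation (names and sizes are illustrative):

import tensorflow as tf

cell = tf.nn.rnn_cell.GRUCell(8)
batch_size, max_time, input_depth = 4, 5, 8
inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
inputs_ta = inputs_ta.unpack(
    tf.random_normal([max_time, batch_size, input_depth]))  # time-major
sequence_length = tf.constant([max_time] * batch_size)

def loop_fn(time, cell_output, cell_state, loop_state):
  emit_output = cell_output  # None on the first call; raw_rnn infers structure
  if cell_output is None:    # time == 0: supply the initial state
    next_cell_state = cell.zero_state(batch_size, tf.float32)
  else:
    next_cell_state = cell_state
  elements_finished = (time >= sequence_length)
  next_input = tf.cond(
      tf.reduce_all(elements_finished),
      lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
      lambda: inputs_ta.read(time))
  return (elements_finished, next_input, next_cell_state, emit_output, None)

outputs_ta, final_state, _ = tf.nn.raw_rnn(cell, loop_fn)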
diff --git a/seq2seq/encoders/rnn_encoder.py b/seq2seq/encoders/rnn_encoder.py
index 772a9501..9c3be8f7 100644
--- a/seq2seq/encoders/rnn_encoder.py
+++ b/seq2seq/encoders/rnn_encoder.py
@@ -6,7 +6,9 @@
import tensorflow as tf
from seq2seq import GraphModule

-RNNEncoderOutput = collections.namedtuple("RNNEncoderOutput", ["outputs", "final_state"])
+RNNEncoderOutput = collections.namedtuple("RNNEncoderOutput",
+                                          ["outputs", "final_state"])
+

class UnidirectionalRNNEncoder(GraphModule):
  """
@@ -24,11 +26,11 @@ def __init__(self, cell, name="forward_rnn_encoder"):
  def _build(self, inputs, sequence_length, **kwargs):
    outputs, state = tf.nn.dynamic_rnn(
-        cell=self.cell,
-        inputs=inputs,
-        sequence_length=sequence_length,
-        dtype=tf.float32,
-        **kwargs)
+        cell=self.cell,
+        inputs=inputs,
+        sequence_length=sequence_length,
+        dtype=tf.float32,
+        **kwargs)
    return RNNEncoderOutput(outputs=outputs, final_state=state)

@@ -42,18 +44,19 @@
class BidirectionalRNNEncoder(GraphModule):
  """
    cell: An instance of tf.nn.rnn_cell.RNNCell
    name: A name for the encoder
  """

+
  def __init__(self, cell, name="bidi_rnn_encoder"):
    super(BidirectionalRNNEncoder, self).__init__(name)
    self.cell = cell

  def _build(self, inputs, sequence_length, **kwargs):
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
-        cell_fw=self.cell,
-        cell_bw=self.cell,
-        inputs=inputs,
-        sequence_length=sequence_length,
-        dtype=tf.float32,
-        **kwargs)
+        cell_fw=self.cell,
+        cell_bw=self.cell,
+        inputs=inputs,
+        sequence_length=sequence_length,
+        dtype=tf.float32,
+        **kwargs)

    # Concatenate outputs and states of the forward and backward RNNs
    outputs_concat = tf.concat(2, outputs)

diff --git a/seq2seq/graph_module.py b/seq2seq/graph_module.py
index a32f2510..62f27a8c 100644
--- a/seq2seq/graph_module.py
+++ b/seq2seq/graph_module.py
@@ -5,14 +5,15 @@
import tensorflow as tf

+
class GraphModule(object):
  """
-  A convenience base class that makes it easy to share and access variables in the graph.
-  Each insance of this class creates its own set of variables, but each subsequent execution
-  of an instance will re-use its variables.
+  Convenience class that makes it easy to share variables.
+  Each instance of this class creates its own set of variables, but
+  each subsequent execution of an instance will re-use its variables.

-  Graph components that define variables should inherit from this class and implement their
-  logic in the `_build` method.
+  Graph components that define variables should inherit from this class
+  and implement their logic in the `_build` method.
  """

  def __init__(self, name):
@@ -23,7 +24,7 @@ def __init__(self, name):
      name: Name of this module. Used for `tf.make_template`.
    """
    self._template = tf.make_template(name, self._build, create_scope_now_=True)
-    # Docstrings for the class should be equal to the docstring for the _build method
+    # Docstrings for the class should be the docstring for the _build method
    self.__doc__ = self._build.__doc__
    # pylint: disable=E1101
    self.__call__.__func__.__doc__ = self._build.__doc__
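`GraphModule` relies on `tf.make_template` for variable sharing: the first call of an instance creates its variables, and every later call reuses them. A minimal sketch of the behavior subclasses inherit (the `Linear` module is illustrative, not part of the repo; it assumes the usual call-to-template dispatch shown in this file):

import tensorflow as tf
from seq2seq import GraphModule

class Linear(GraphModule):
  def __init__(self, num_units, name="linear"):
    super(Linear, self).__init__(name)
    self.num_units = num_units

  def _build(self, inputs):
    # Variables are created on the first call and reused afterwards.
    return tf.contrib.layers.fully_connected(
        inputs=inputs, num_outputs=self.num_units, activation_fn=None)

layer = Linear(16)
out_a = layer(tf.random_normal([8, 32]))  # creates the weight variables
out_b = layer(tf.random_normal([8, 32]))  # reuses the same variables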
""" @@ -65,23 +69,28 @@ def create_vocabulary_lookup_table(filename, default_value=None, name=None): tf.logging.info("Creating vocabulary lookup table of size %d", vocab_size) table_init = tf.contrib.lookup.TextFileIdTableInitializer( - filename, vocab_size=vocab_size) + filename, vocab_size=vocab_size) reverse_table_init = tf.contrib.lookup.TextFileInitializer( - filename=filename, - key_dtype=tf.int64, - key_index=tf.contrib.lookup.TextFileIndex.LINE_NUMBER, - value_dtype=tf.string, - value_index=tf.contrib.lookup.TextFileIndex.WHOLE_LINE, - vocab_size=vocab_size) - - vocab_to_id_table = tf.contrib.lookup.HashTable(table_init, default_value, name=name) - id_to_vocab_table = tf.contrib.lookup.HashTable(reverse_table_init, "UNK", name=name) + filename=filename, + key_dtype=tf.int64, + key_index=tf.contrib.lookup.TextFileIndex.LINE_NUMBER, + value_dtype=tf.string, + value_index=tf.contrib.lookup.TextFileIndex.WHOLE_LINE, + vocab_size=vocab_size) + + vocab_to_id_table = tf.contrib.lookup.HashTable( + table_init, default_value, name=name) + id_to_vocab_table = tf.contrib.lookup.HashTable( + reverse_table_init, "UNK", name=name) return vocab_to_id_table, id_to_vocab_table, vocab_size -def make_data_provider(data_sources, reader=tf.TFRecordReader, num_samples=None, **kwargs): +def make_data_provider(data_sources, + reader=tf.TFRecordReader, + num_samples=None, + **kwargs): """ Creates a TF Slim DatasetDataProvider for a list of input files. @@ -96,32 +105,38 @@ def make_data_provider(data_sources, reader=tf.TFRecordReader, num_samples=None, """ keys_to_features = { - "pair_id": tf.FixedLenFeature([], dtype=tf.string), - "source_len": tf.FixedLenFeature([], dtype=tf.int64), - "target_len": tf.FixedLenFeature([], dtype=tf.int64), - "source_tokens": tf.VarLenFeature(tf.string), - "target_tokens": tf.VarLenFeature(tf.string) + "pair_id": tf.FixedLenFeature( + [], dtype=tf.string), + "source_len": tf.FixedLenFeature( + [], dtype=tf.int64), + "target_len": tf.FixedLenFeature( + [], dtype=tf.int64), + "source_tokens": tf.VarLenFeature(tf.string), + "target_tokens": tf.VarLenFeature(tf.string) } items_to_handlers = { - "pair_id": tf.contrib.slim.tfexample_decoder.Tensor("pair_id"), - "source_len": tf.contrib.slim.tfexample_decoder.Tensor("source_len"), - "target_len": tf.contrib.slim.tfexample_decoder.Tensor("target_len"), - "source_tokens": tf.contrib.slim.tfexample_decoder.Tensor("source_tokens", default_value=""), - "target_tokens": tf.contrib.slim.tfexample_decoder.Tensor("target_tokens", default_value="") + "pair_id": tf.contrib.slim.tfexample_decoder.Tensor("pair_id"), + "source_len": tf.contrib.slim.tfexample_decoder.Tensor("source_len"), + "target_len": tf.contrib.slim.tfexample_decoder.Tensor("target_len"), + "source_tokens": tf.contrib.slim.tfexample_decoder.Tensor( + "source_tokens", default_value=""), + "target_tokens": tf.contrib.slim.tfexample_decoder.Tensor( + "target_tokens", default_value="") } decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( - keys_to_features, items_to_handlers) + keys_to_features, items_to_handlers) dataset = tf.contrib.slim.dataset.Dataset( - data_sources=data_sources, - reader=reader, - decoder=decoder, - num_samples=num_samples, - items_to_descriptions={}) + data_sources=data_sources, + reader=reader, + decoder=decoder, + num_samples=num_samples, + items_to_descriptions={}) - return tf.contrib.slim.dataset_data_provider.DatasetDataProvider(dataset, **kwargs) + return tf.contrib.slim.dataset_data_provider.DatasetDataProvider(dataset, + **kwargs) def 
def read_from_data_provider(data_provider):
@@ -131,7 +146,8 @@
    data_provider: A DataProvider instance

  Returns:
-    A dictionary of tensors corresponding to all features defined by the DataProvider
+    A dictionary of tensors corresponding to all features
+    defined by the DataProvider
  """
  item_values = data_provider.get(list(data_provider.list_items()))
  items_dict = dict(zip(data_provider.list_items(), item_values))

diff --git a/seq2seq/losses.py b/seq2seq/losses.py
index 3a9af36b..60844f19 100644
--- a/seq2seq/losses.py
+++ b/seq2seq/losses.py
@@ -3,6 +3,7 @@
import tensorflow as tf

+
def cross_entropy_sequence_loss(logits, targets, sequence_length):
  """Calculates the per-example cross-entropy loss for a sequence of logits
  and masks out all losses past the sequence length.
@@ -10,7 +11,8 @@
  Args:
    logits: Logits of shape `[B, T, vocab_size]`
    targets: Target classes of shape `[B, T]`
-    sequence_length: An int32 tensor of shape `[B]` corresponding to the length of each input
+    sequence_length: An int32 tensor of shape `[B]` corresponding
+      to the length of each input

  Returns:
    A tensor of shape [B, T] that contains the loss per example, per time step.
@@ -19,7 +21,8 @@
  losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets)

  # Mask out the losses we don't care about
-  loss_mask = tf.sequence_mask(tf.to_int32(sequence_length), tf.to_int32(tf.shape(targets)[1]))
+  loss_mask = tf.sequence_mask(
+      tf.to_int32(sequence_length), tf.to_int32(tf.shape(targets)[1]))
  losses = losses * tf.to_float(loss_mask)

  return losses
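The masking in `cross_entropy_sequence_loss` zeroes out the loss at positions past each example's length. A small worked example (values are illustrative):

import tensorflow as tf

# For lengths [2, 3] and T = 3, tf.sequence_mask yields
#   [[True, True, False],
#    [True, True, True ]]
# so losses at padded positions are multiplied by zero.
losses = tf.constant([[0.5, 0.7, 0.9],
                      [0.4, 0.6, 0.8]])
mask = tf.sequence_mask(tf.constant([2, 3]), 3)  # [B, T] boolean
masked = losses * tf.to_float(mask)              # [[0.5, 0.7, 0.0],
                                                 #  [0.4, 0.6, 0.8]]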
diff --git a/seq2seq/models/attention_seq2seq.py b/seq2seq/models/attention_seq2seq.py
index d03cfa9a..64bc03af 100644
--- a/seq2seq/models/attention_seq2seq.py
+++ b/seq2seq/models/attention_seq2seq.py
@@ -9,52 +9,62 @@
from seq2seq.training import utils as training_utils
from seq2seq.models.model_base import Seq2SeqBase

+
class AttentionSeq2Seq(Seq2SeqBase):
  """Sequence2Sequence model with attention mechanism.

  Args:
-    source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the source vocabulary
-    target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the target vocabulary
+    source_vocab_info: An instance of `seq2seq.inputs.VocabInfo`
+      for the source vocabulary
+    target_vocab_info: An instance of `seq2seq.inputs.VocabInfo`
+      for the target vocabulary
    params: A dictionary of hyperparameters
  """

-  def __init__(self, source_vocab_info, target_vocab_info, params, name="att_seq2seq"):
-    super(AttentionSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, params, name)
+  def __init__(self,
+               source_vocab_info,
+               target_vocab_info,
+               params,
+               name="att_seq2seq"):
+    super(AttentionSeq2Seq, self).__init__(source_vocab_info, target_vocab_info,
+                                           params, name)

  @staticmethod
  def default_params():
    params = Seq2SeqBase.default_params().copy()
    params.update({
-        "attention.dim": 128,
-        "rnn_cell.type": "LSTMCell",
-        "rnn_cell.num_units": 128,
-        "rnn_cell.dropout_input_keep_prob": 1.0,
-        "rnn_cell.dropout_output_keep_prob": 1.0,
-        "rnn_cell.num_layers": 1
+        "attention.dim": 128,
+        "rnn_cell.type": "LSTMCell",
+        "rnn_cell.num_units": 128,
+        "rnn_cell.dropout_input_keep_prob": 1.0,
+        "rnn_cell.dropout_output_keep_prob": 1.0,
+        "rnn_cell.num_layers": 1
    })
    return params

-  def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None):
+  def encode_decode(self, source, source_len, decoder_input_fn, target_len):
    encoder_cell = training_utils.get_rnn_cell(
-        cell_type=self.params["rnn_cell.type"],
-        num_units=self.params["rnn_cell.num_units"],
-        num_layers=self.params["rnn_cell.num_layers"],
-        dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"],
-        dropout_output_keep_prob=self.params["rnn_cell.dropout_output_keep_prob"])
+        cell_type=self.params["rnn_cell.type"],
+        num_units=self.params["rnn_cell.num_units"],
+        num_layers=self.params["rnn_cell.num_layers"],
+        dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"],
+        dropout_output_keep_prob=self.params[
+            "rnn_cell.dropout_output_keep_prob"])
    encoder_fn = encoders.BidirectionalRNNEncoder(encoder_cell)
    encoder_output = encoder_fn(source, source_len)

    decoder_cell = encoder_cell
    decoder_fn = decoders.AttentionDecoder(
-        cell=decoder_cell,
-        vocab_size=self.target_vocab_info.total_size,
-        attention_inputs=encoder_output.outputs,
-        attention_fn=decoders.AttentionLayer(self.params["attention.dim"]),
-        max_decode_length=self.params["target.max_seq_len"])
+        cell=decoder_cell,
+        vocab_size=self.target_vocab_info.total_size,
+        attention_inputs=encoder_output.outputs,
+        attention_fn=decoders.AttentionLayer(self.params["attention.dim"]),
+        max_decode_length=self.params["target.max_seq_len"])

    decoder_output, _, _ = decoder_fn(
-        input_fn=decoder_input_fn,
-        initial_state=decoder_cell.zero_state(tf.shape(source_len)[0], dtype=tf.float32),
-        sequence_length=target_len)
+        input_fn=decoder_input_fn,
+        initial_state=decoder_cell.zero_state(
+            tf.shape(source_len)[0], dtype=tf.float32),
+        sequence_length=target_len)

    return decoder_output

diff --git a/seq2seq/models/basic_seq2seq.py b/seq2seq/models/basic_seq2seq.py
index c549ec50..a030e2a6 100644
--- a/seq2seq/models/basic_seq2seq.py
+++ b/seq2seq/models/basic_seq2seq.py
@@ -7,53 +7,64 @@
import seq2seq.decoders as decoders
from seq2seq.models.model_base import Seq2SeqBase

+
class BasicSeq2Seq(Seq2SeqBase):
The last encoder - state is used to initialize the decoder and thus both must share the same type of RNN cell. + """Basic Sequence2Sequence model with a unidirectional encoder and decoder. + The last encoder state is used to initialize the decoder and thus both + must share the same type of RNN cell. Args: - source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the source vocabulary - target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the target vocabulary + source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` + for the source vocabulary + target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` + for the target vocabulary params: A dictionary of hyperparameters """ - def __init__(self, source_vocab_info, target_vocab_info, params, name="basic_seq2seq"): - super(BasicSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, params, name) + def __init__(self, + source_vocab_info, + target_vocab_info, + params, + name="basic_seq2seq"): + super(BasicSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, + params, name) @staticmethod def default_params(): params = Seq2SeqBase.default_params().copy() params.update({ - "rnn_cell.type": "LSTMCell", - "rnn_cell.num_units": 128, - "rnn_cell.dropout_input_keep_prob": 1.0, - "rnn_cell.dropout_output_keep_prob": 1.0, - "rnn_cell.num_layers": 1 + "rnn_cell.type": "LSTMCell", + "rnn_cell.num_units": 128, + "rnn_cell.dropout_input_keep_prob": 1.0, + "rnn_cell.dropout_output_keep_prob": 1.0, + "rnn_cell.num_layers": 1 }) return params - def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None): + def encode_decode(self, source, source_len, decoder_input_fn, target_len): # Create Encoder encoder_cell = training.utils.get_rnn_cell( - cell_type=self.params["rnn_cell.type"], - num_units=self.params["rnn_cell.num_units"], - num_layers=self.params["rnn_cell.num_layers"], - dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"], - dropout_output_keep_prob=self.params["rnn_cell.dropout_output_keep_prob"]) + cell_type=self.params["rnn_cell.type"], + num_units=self.params["rnn_cell.num_units"], + num_layers=self.params["rnn_cell.num_layers"], + dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"], + dropout_output_keep_prob=self.params[ + "rnn_cell.dropout_output_keep_prob"]) encoder_fn = encoders.UnidirectionalRNNEncoder(encoder_cell) encoder_output = encoder_fn(source, source_len) # Create Decoder - # Because we pass the state between encoder and decoder we must use the same cell + # Because we pass the state between encoder and decoder we must + # use the same cell type decoder_cell = encoder_cell decoder_fn = decoders.BasicDecoder( - cell=decoder_cell, - vocab_size=self.target_vocab_info.total_size, - max_decode_length=self.params["target.max_seq_len"]) + cell=decoder_cell, + vocab_size=self.target_vocab_info.total_size, + max_decode_length=self.params["target.max_seq_len"]) decoder_output, _, _ = decoder_fn( - input_fn=decoder_input_fn, - initial_state=encoder_output.final_state, - sequence_length=target_len) + input_fn=decoder_input_fn, + initial_state=encoder_output.final_state, + sequence_length=target_len) return decoder_output diff --git a/seq2seq/models/model_base.py b/seq2seq/models/model_base.py index 970ab2cb..8dc1a87d 100644 --- a/seq2seq/models/model_base.py +++ b/seq2seq/models/model_base.py @@ -6,6 +6,7 @@ from seq2seq import losses as seq2seq_losses from seq2seq.training import featurizers + class ModelBase(object): """Abstract base 
  """Abstract base class for models.
@@ -13,6 +14,7 @@
    params: A dictionary of hyperparameter values
    name: A name for this model to be used as a variable scope
  """

+
  def __init__(self, params, name):
    self.name = name
    self.params = params
@@ -35,8 +37,8 @@ def __call__(self, features, labels, params, mode):
    return self._build(features, labels, params, mode)

  def _build(self, features, labels, params, mode):
-    """Subclasses should implement this method. See the model_fn documentation in
-    tf.contrib.learn.Estimator class for a more detailed explanation.
+    """Subclasses should implement this method. See the `model_fn` documentation
+    in tf.contrib.learn.Estimator class for a more detailed explanation.
    """
    raise NotImplementedError

@@ -48,6 +50,7 @@ class Seq2SeqBase(ModelBase):
  It's mostly used to define the output size of the decoder.
  Maybe we can somehow put it in the features?
  """

+
  def __init__(self, source_vocab_info, target_vocab_info, params, name):
    super(Seq2SeqBase, self).__init__(params, name)
    self.source_vocab_info = source_vocab_info
@@ -55,117 +58,127 @@ def __init__(self, source_vocab_info, target_vocab_info, params, name):

  def create_featurizer(self):
    return featurizers.Seq2SeqFeaturizer(
-        source_vocab_info=self.source_vocab_info,
-        target_vocab_info=self.target_vocab_info,
-        max_seq_len_source=self.params["source.max_seq_len"],
-        max_seq_len_target=self.params["target.max_seq_len"])
+        source_vocab_info=self.source_vocab_info,
+        target_vocab_info=self.target_vocab_info,
+        max_seq_len_source=self.params["source.max_seq_len"],
+        max_seq_len_target=self.params["target.max_seq_len"])

  @staticmethod
  def default_params():
    return {
-        "source.max_seq_len": 40,
-        "target.max_seq_len": 40,
-        "embedding.dim": 100,
-        "optimizer.name": "Adam",
-        "optimizer.learning_rate": 1e-4,
-        "optimizer.clip_gradients": 5.0,
+        "source.max_seq_len": 40,
+        "target.max_seq_len": 40,
+        "embedding.dim": 100,
+        "optimizer.name": "Adam",
+        "optimizer.learning_rate": 1e-4,
+        "optimizer.clip_gradients": 5.0,
    }

-  def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None):
+  def encode_decode(self, source, source_len, decoder_input_fn, target_len):
    """Should be implemented by child classes"""
    raise NotImplementedError

-  def _create_predictions(self, features, labels, decoder_output, log_perplexities=None):
+  def _create_predictions(self,
+                          features,
+                          labels,
+                          decoder_output,
+                          log_perplexities=None):
    """Creates the dictionary of predictions that is returned by the model.
""" predictions = { - "logits": decoder_output.logits, - "predictions": decoder_output.predictions, + "logits": decoder_output.logits, + "predictions": decoder_output.predictions, } if log_perplexities is not None: predictions["log_perplexities"] = log_perplexities return predictions - def _build(self, features, labels, params, mode): # Create embedddings source_embedding = tf.get_variable( - "source_embedding", [self.source_vocab_info.total_size, self.params["embedding.dim"]]) + "source_embedding", + [self.source_vocab_info.total_size, self.params["embedding.dim"]]) target_embedding = tf.get_variable( - "target_embedding", [self.target_vocab_info.total_size, self.params["embedding.dim"]]) + "target_embedding", + [self.target_vocab_info.total_size, self.params["embedding.dim"]]) # Embed source - source_embedded = tf.nn.embedding_lookup(source_embedding, features["source_ids"]) + source_embedded = tf.nn.embedding_lookup(source_embedding, + features["source_ids"]) # Graph used for inference if mode == tf.contrib.learn.ModeKeys.INFER: target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START # Embed the "SEQUENCE_START" token initial_input = tf.nn.embedding_lookup( - target_embedding, tf.ones_like(features["source_len"]) * target_start_id) + target_embedding, + tf.ones_like(features["source_len"]) * target_start_id) # Use the embedded prediction as the input to the next time step decoder_input_fn_infer = decoders.DynamicDecoderInputs( - initial_inputs=initial_input, - make_input_fn=lambda x: tf.nn.embedding_lookup(target_embedding, x.predictions)) + initial_inputs=initial_input, + make_input_fn=lambda x: tf.nn.embedding_lookup( + target_embedding, x.predictions) + ) # Decode decoder_output, _ = self.encode_decode( - source=source_embedded, - source_len=features["source_len"], - decoder_input_fn=decoder_input_fn_infer, - target_len=self.params["target.max_seq_len"]) + source=source_embedded, + source_len=features["source_len"], + decoder_input_fn=decoder_input_fn_infer, + target_len=self.params["target.max_seq_len"]) predictions = self._create_predictions( - features=features, - labels=-labels, - decoder_output=decoder_output) + features=features, labels=-labels, decoder_output=decoder_output) return predictions, None, None # Embed target - target_embedded = tf.nn.embedding_lookup(target_embedding, labels["target_ids"]) + target_embedded = tf.nn.embedding_lookup(target_embedding, + labels["target_ids"]) # During training/eval, we have labels and use them for teacher forcing # We don't feed the last SEQUENCE_END token decoder_input_fn_train = decoders.FixedDecoderInputs( - inputs=target_embedded[:, :-1], - sequence_length=labels["target_len"] - 1) + inputs=target_embedded[:, :-1], + sequence_length=labels["target_len"] - 1) decoder_output = self.encode_decode( - source=source_embedded, - source_len=features["source_len"], - decoder_input_fn=decoder_input_fn_train, - target_len=labels["target_len"]) + source=source_embedded, + source_len=features["source_len"], + decoder_input_fn=decoder_input_fn_train, + target_len=labels["target_len"]) - # TODO: For a long sequence the logits are a huge [B * T, vocab_size] matrix - # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we can use map_fn - # or slice the logits to max(sequence_length). Should benchmark this. + # TODO: For a long sequence logits are a huge [B * T, vocab_size] matrix + # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we + # can use map_fn or slice the logits to max(sequence_length). 
+    # Should benchmark this.

    # Calculate loss per example-timestep of shape [B, T]
    losses = seq2seq_losses.cross_entropy_sequence_loss(
-        logits=decoder_output.logits[:, :-1, :],
-        targets=labels["target_ids"][:, 1:],
-        sequence_length=labels["target_len"] - 1)
+        logits=decoder_output.logits[:, :-1, :],
+        targets=labels["target_ids"][:, 1:],
+        sequence_length=labels["target_len"] - 1)

    # Calculate per-example losses of shape [B]
    log_perplexities = tf.div(tf.reduce_sum(
-        losses, reduction_indices=1), tf.to_float(labels["target_len"] - 1))
+        losses, reduction_indices=1),
+                              tf.to_float(labels["target_len"] - 1))

    loss = tf.reduce_mean(log_perplexities)

    train_op = tf.contrib.layers.optimize_loss(
-        loss=loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=self.params["optimizer.learning_rate"],
-        clip_gradients=self.params["optimizer.clip_gradients"],
-        optimizer=self.params["optimizer.name"],
-        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
+        loss=loss,
+        global_step=tf.contrib.framework.get_global_step(),
+        learning_rate=self.params["optimizer.learning_rate"],
+        clip_gradients=self.params["optimizer.clip_gradients"],
+        optimizer=self.params["optimizer.name"],
+        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

    if mode == tf.contrib.learn.ModeKeys.EVAL:
      train_op = None

    predictions = self._create_predictions(
-        features=features,
-        labels=labels,
-        decoder_output=decoder_output,
-        log_perplexities=log_perplexities)
+        features=features,
+        labels=labels,
+        decoder_output=decoder_output,
+        log_perplexities=log_perplexities)

    # We add "useful" tensors to the graph collection so that we
    # can easily find them in our hooks/monitors.
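Subclasses extend hyperparameters by copying the parent's `default_params()` and updating the result, as `BasicSeq2Seq` and `AttentionSeq2Seq` do above; user-supplied values can be layered over those defaults the same way. A usage sketch (the vocab-info objects would come from `seq2seq.inputs.get_vocab_info`; the infer branch above returns `(predictions, None, None)`, suggesting a `(predictions, loss, train_op)` convention for the model call):

from seq2seq.models import AttentionSeq2Seq

params = AttentionSeq2Seq.default_params()
params.update({"rnn_cell.num_units": 256, "attention.dim": 256})
model = AttentionSeq2Seq(source_vocab_info, target_vocab_info, params)
# Calling model(features, labels, params, mode) then builds the graph
# for the given tf.contrib.learn mode (TRAIN, EVAL, or INFER).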
""" #pylint: disable=unused-argument - examples = generate_examples( - FLAGS.source_file, FLAGS.target_file) + examples = generate_examples(FLAGS.source_file, FLAGS.target_file) write_tfrecords(examples, FLAGS.output_file) diff --git a/seq2seq/scripts/generate_toy_data.py b/seq2seq/scripts/generate_toy_data.py index 3c17cdf2..cf19040a 100755 --- a/seq2seq/scripts/generate_toy_data.py +++ b/seq2seq/scripts/generate_toy_data.py @@ -1,5 +1,4 @@ #! /usr/bin/env python - """ Functions to generate various toy datasets. """ @@ -10,24 +9,35 @@ from sklearn.cross_validation import train_test_split PARSER = argparse.ArgumentParser(description="Generates toy datasets.") -PARSER.add_argument("--vocab_size", type=int, default=100, - help="size of the vocabulary") -PARSER.add_argument("--num_examples", type=int, default=10000, - help="number of examples") -PARSER.add_argument("--min_len", type=int, default=5, - help="minimum sequence length") -PARSER.add_argument("--max_len", type=int, default=40, - help="maximum sequence length") -PARSER.add_argument("--dev_split", type=float, default=0.1, - help="Fraction of data to use for the dev set") -PARSER.add_argument("--type", type=str, default="copy", choices=["copy", "reverse"], - help="Type of dataet to generate. One of \"copy\" or \"reverse\"") -PARSER.add_argument("--output_dir", type=str, - help="path to the output directory", required=True) +PARSER.add_argument( + "--vocab_size", type=int, default=100, help="size of the vocabulary") +PARSER.add_argument( + "--num_examples", type=int, default=10000, help="number of examples") +PARSER.add_argument( + "--min_len", type=int, default=5, help="minimum sequence length") +PARSER.add_argument( + "--max_len", type=int, default=40, help="maximum sequence length") +PARSER.add_argument( + "--dev_split", + type=float, + default=0.1, + help="Fraction of data to use for the dev set") +PARSER.add_argument( + "--type", + type=str, + default="copy", + choices=["copy", "reverse"], + help="Type of dataet to generate. One of \"copy\" or \"reverse\"") +PARSER.add_argument( + "--output_dir", + type=str, + help="path to the output directory", + required=True) ARGS = PARSER.parse_args() VOCABULARY = list([str(x) for x in range(ARGS.vocab_size)]) + def make_copy(num_examples, min_len, max_len): """ Generates a dataset where the target is equal to the source. @@ -43,10 +53,12 @@ def make_copy(num_examples, min_len, max_len): """ for _ in range(num_examples): turn_length = np.random.choice(np.arange(min_len, max_len + 1)) - source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True) + source_tokens = np.random.choice( + list(VOCABULARY), size=turn_length, replace=True) target_tokens = source_tokens yield " ".join(source_tokens), " ".join(target_tokens) + def make_reverse(num_examples, min_len, max_len): """ Generates a dataset where the target is equal to the source reversed. 
diff --git a/seq2seq/scripts/generate_toy_data.py b/seq2seq/scripts/generate_toy_data.py
index 3c17cdf2..cf19040a 100755
--- a/seq2seq/scripts/generate_toy_data.py
+++ b/seq2seq/scripts/generate_toy_data.py
@@ -1,5 +1,4 @@
#! /usr/bin/env python
-
"""
Functions to generate various toy datasets.
"""
@@ -10,24 +9,35 @@
from sklearn.cross_validation import train_test_split

PARSER = argparse.ArgumentParser(description="Generates toy datasets.")
-PARSER.add_argument("--vocab_size", type=int, default=100,
-                    help="size of the vocabulary")
-PARSER.add_argument("--num_examples", type=int, default=10000,
-                    help="number of examples")
-PARSER.add_argument("--min_len", type=int, default=5,
-                    help="minimum sequence length")
-PARSER.add_argument("--max_len", type=int, default=40,
-                    help="maximum sequence length")
-PARSER.add_argument("--dev_split", type=float, default=0.1,
-                    help="Fraction of data to use for the dev set")
-PARSER.add_argument("--type", type=str, default="copy", choices=["copy", "reverse"],
-                    help="Type of dataet to generate. One of \"copy\" or \"reverse\"")
-PARSER.add_argument("--output_dir", type=str,
-                    help="path to the output directory", required=True)
+PARSER.add_argument(
+    "--vocab_size", type=int, default=100, help="size of the vocabulary")
+PARSER.add_argument(
+    "--num_examples", type=int, default=10000, help="number of examples")
+PARSER.add_argument(
+    "--min_len", type=int, default=5, help="minimum sequence length")
+PARSER.add_argument(
+    "--max_len", type=int, default=40, help="maximum sequence length")
+PARSER.add_argument(
+    "--dev_split",
+    type=float,
+    default=0.1,
+    help="Fraction of data to use for the dev set")
+PARSER.add_argument(
+    "--type",
+    type=str,
+    default="copy",
+    choices=["copy", "reverse"],
+    help="Type of dataset to generate. One of \"copy\" or \"reverse\"")
+PARSER.add_argument(
+    "--output_dir",
+    type=str,
+    help="path to the output directory",
+    required=True)
ARGS = PARSER.parse_args()

VOCABULARY = list([str(x) for x in range(ARGS.vocab_size)])

+
def make_copy(num_examples, min_len, max_len):
  """
  Generates a dataset where the target is equal to the source.
  """
  for _ in range(num_examples):
    turn_length = np.random.choice(np.arange(min_len, max_len + 1))
-    source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True)
+    source_tokens = np.random.choice(
+        list(VOCABULARY), size=turn_length, replace=True)
    target_tokens = source_tokens
    yield " ".join(source_tokens), " ".join(target_tokens)

+
def make_reverse(num_examples, min_len, max_len):
  """
  Generates a dataset where the target is equal to the source reversed.
@@ -62,10 +74,12 @@
  """
  for _ in range(num_examples):
    turn_length = np.random.choice(np.arange(min_len, max_len + 1))
-    source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True)
+    source_tokens = np.random.choice(
+        list(VOCABULARY), size=turn_length, replace=True)
    target_tokens = source_tokens[::-1]
    yield " ".join(source_tokens), " ".join(target_tokens)

+
def write_parallel_text(sources, targets, output_prefix):
  """
  Writes two files where each line corresponds to one example
@@ -90,6 +104,7 @@
      target_file.write(record + "\n")
    print("Wrote {}".format(target_filename))

+
def main():
  """Main function"""

@@ -100,16 +115,20 @@
  # Generate dataset
  examples = list(generate_fn(ARGS.num_examples, ARGS.min_len, ARGS.max_len))
-  examples_train, examples_dev = train_test_split(examples, test_size=ARGS.dev_split)
+  examples_train, examples_dev = train_test_split(
+      examples, test_size=ARGS.dev_split)
  os.makedirs(ARGS.output_dir, exist_ok=True)

  # Write train data
  train_sources, train_targets = zip(*examples_train)
-  write_parallel_text(train_sources, train_targets, os.path.join(ARGS.output_dir, "train"))
+  write_parallel_text(train_sources, train_targets,
+                      os.path.join(ARGS.output_dir, "train"))

  # Write dev data
  dev_sources, dev_targets = list(zip(*examples_dev))
-  write_parallel_text(dev_sources, dev_targets, os.path.join(ARGS.output_dir, "dev"))
+  write_parallel_text(dev_sources, dev_targets,
+                      os.path.join(ARGS.output_dir, "dev"))

+
if __name__ == "__main__":
  main()

diff --git a/seq2seq/scripts/generate_vocab.py b/seq2seq/scripts/generate_vocab.py
index a8434a5a..e26e3224 100755
--- a/seq2seq/scripts/generate_vocab.py
+++ b/seq2seq/scripts/generate_vocab.py
@@ -1,6 +1,5 @@
#! /usr/bin/env python
#pylint: disable=invalid-name
-
"""
Generate vocabulary for a tokenized text file.
""" @@ -8,15 +7,32 @@ import argparse import collections -parser = argparse.ArgumentParser(description="Generate vocabulary for a tokenized text file.") -parser.add_argument("--input_file", type=str, help="path to the input file", required=True) -parser.add_argument("--output_file", type=str, help="path to the vocabulary file", required=True) -parser.add_argument("--min_frequency", dest="min_frequency", type=int, default=0, - help="Minimum frequency of a word to be included in the vocabulary.") -parser.add_argument("--max_vocab_size", dest="max_vocab_size", type=int, - help="Maximum number of words in the vocabulary") -parser.add_argument("--downcase", dest="downcase", type=bool, - help="If set to true, downcase all text before processing.", default=False) +parser = argparse.ArgumentParser( + description="Generate vocabulary for a tokenized text file.") +parser.add_argument( + "--input_file", type=str, help="path to the input file", required=True) +parser.add_argument( + "--output_file", + type=str, + help="path to the vocabulary file", + required=True) +parser.add_argument( + "--min_frequency", + dest="min_frequency", + type=int, + default=0, + help="Minimum frequency of a word to be included in the vocabulary.") +parser.add_argument( + "--max_vocab_size", + dest="max_vocab_size", + type=int, + help="Maximum number of words in the vocabulary") +parser.add_argument( + "--downcase", + dest="downcase", + type=bool, + help="If set to true, downcase all text before processing.", + default=False) args = parser.parse_args() @@ -35,14 +51,17 @@ # Filter words below the frequency threshold if args.min_frequency > 0: - filtered_words = [(w, c) for w, c in cnt.most_common() if c > args.min_frequency] + filtered_words = [(w, c) for w, c in cnt.most_common() + if c > args.min_frequency] cnt = collections.Counter(dict(filtered_words)) -print("Found {} unique words with frequency > {}.".format(len(cnt), args.min_frequency)) +print("Found {} unique words with frequency > {}.".format( + len(cnt), args.min_frequency)) # Sort words by 1. frequency 2. lexically to break ties word_with_counts = cnt.most_common() -word_with_counts = sorted(word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) +word_with_counts = sorted( + word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) # Take only max-vocab if args.max_vocab_size is not None: @@ -52,4 +71,5 @@ for word, count in word_with_counts: f.write("{}\n".format(word)) -print("Wrote vocab of size {}: {}".format(len(word_with_counts), args.output_file)) +print("Wrote vocab of size {}: {}".format( + len(word_with_counts), args.output_file)) diff --git a/seq2seq/scripts/profile.py b/seq2seq/scripts/profile.py index b746ba30..3034d441 100755 --- a/seq2seq/scripts/profile.py +++ b/seq2seq/scripts/profile.py @@ -1,5 +1,4 @@ #! /usr/bin/env python - """ Script to generates model profiling information """ @@ -18,6 +17,7 @@ FLAGS = tf.flags.FLAGS + def load_metadata(model_dir): """Loads RunMetadata, Graph and OpLog from files """ @@ -58,9 +58,9 @@ def load_metadata(model_dir): def merge_default_with_oplog(graph, op_log=None, run_meta=None): - """Monkeypatch. There currently is a bug in tfprof_logger._merge_default_with_oplog that - prevents it from being used with Python 3. So we override the method manually until the fix - comes in. + """Monkeypatch. There currently is a bug in tfprof_logger that + prevents it from being used with Python 3. So we override the method + manually until the fix comes in. 
""" tmp_op_log = tfprof_log_pb2.OpLog() # pylint: disable=W0212 @@ -93,6 +93,7 @@ def param_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "params.txt") return "scope", options + def micro_anaylsis_options(output_dir): """Options for microsecond analysis """ @@ -105,6 +106,7 @@ def micro_anaylsis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "micro.txt") return "graph", options + def flops_analysis_options(output_dir): """Options for FLOPS analysis """ @@ -117,6 +119,7 @@ def flops_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "flops.txt") return "scope", options + def device_analysis_options(output_dir): """Options for device placement analysis """ @@ -128,6 +131,7 @@ def device_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "device.txt") return "scope", options + def main(_argv): """Main functions. Runs all anaylses.""" # pylint: disable=W0212 @@ -140,22 +144,23 @@ def main(_argv): run_meta, graph, op_log = load_metadata(FLAGS.model_dir) param_arguments = [ - param_analysis_options(output_dir), - micro_anaylsis_options(output_dir), - flops_analysis_options(output_dir), - device_analysis_options(output_dir), + param_analysis_options(output_dir), + micro_anaylsis_options(output_dir), + flops_analysis_options(output_dir), + device_analysis_options(output_dir), ] for tfprof_cmd, params in param_arguments: model_analyzer.print_model_analysis( - graph=graph, - run_meta=run_meta, - op_log=op_log, - tfprof_cmd=tfprof_cmd, - tfprof_options=params) + graph=graph, + run_meta=run_meta, + op_log=op_log, + tfprof_cmd=tfprof_cmd, + tfprof_options=params) if params["dump_to_file"] != "": print("Wrote {}".format(params["dump_to_file"])) + if __name__ == '__main__': tf.app.run() diff --git a/seq2seq/test/attention_test.py b/seq2seq/test/attention_test.py index 2e4712b8..dfc5632d 100644 --- a/seq2seq/test/attention_test.py +++ b/seq2seq/test/attention_test.py @@ -7,10 +7,12 @@ from seq2seq.decoders.attention import AttentionLayer + class AttentionLayerTest(tf.test.TestCase): """ Tests the AttentionLayer module. """ + def setUp(self): super(AttentionLayerTest, self).setUp() tf.logging.set_verbosity(tf.logging.INFO) @@ -30,12 +32,15 @@ def test_shape(self): with self.test_session() as sess: sess.run(tf.global_variables_initializer()) feed_dict = {} - feed_dict[inputs_pl] = np.random.randn(self.batch_size, self.seq_len, self.input_dim) + feed_dict[inputs_pl] = np.random.randn(self.batch_size, self.seq_len, + self.input_dim) feed_dict[state_pl] = np.random.randn(self.batch_size, self.state_dim) scores_, context_ = sess.run([scores, context], feed_dict) - np.testing.assert_array_equal(scores_.shape, [self.batch_size, self.seq_len]) - np.testing.assert_array_equal(context_.shape, [self.batch_size, self.input_dim]) + np.testing.assert_array_equal(scores_.shape, + [self.batch_size, self.seq_len]) + np.testing.assert_array_equal(context_.shape, + [self.batch_size, self.input_dim]) # Scores should sum to 1 scores_sum = np.sum(scores_, axis=1) diff --git a/seq2seq/test/decoder_test.py b/seq2seq/test/decoder_test.py index 71d77349..5e41559c 100644 --- a/seq2seq/test/decoder_test.py +++ b/seq2seq/test/decoder_test.py @@ -8,6 +8,7 @@ from seq2seq.decoders import BasicDecoder, AttentionDecoder, AttentionLayer from seq2seq.decoders import FixedDecoderInputs, DynamicDecoderInputs + class DecoderTests(object): """ A collection of decoder tests. 
This class should be inherited together with @@ -31,13 +32,15 @@ def create_decoder(self): raise NotImplementedError def test_with_fixed_inputs(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) decoder_input_fn = FixedDecoderInputs(inputs, seq_length) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) #pylint: disable=E1101 with self.test_session() as sess: @@ -45,26 +48,28 @@ def test_with_fixed_inputs(self): decoder_output_ = sess.run(decoder_output) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.sequence_length, self.vocab_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.sequence_length]) + decoder_output_.logits.shape, + [self.batch_size, self.sequence_length, self.vocab_size]) + np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.sequence_length]) return decoder_output_ - def test_gradients(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) - labels = np.random.randint(0, self.vocab_size, [self.batch_size, self.sequence_length]) + labels = np.random.randint(0, self.vocab_size, + [self.batch_size, self.sequence_length]) decoder_input_fn = FixedDecoderInputs(inputs, seq_length) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) - losses = tf.nn.sparse_softmax_cross_entropy_with_logits(decoder_output.logits, labels) + losses = tf.nn.sparse_softmax_cross_entropy_with_logits( + decoder_output.logits, labels) optimizer = tf.train.AdamOptimizer(learning_rate=0.001) grads_and_vars = optimizer.compute_gradients(tf.reduce_mean(losses)) @@ -78,7 +83,6 @@ def test_gradients(self): return grads_and_vars_ - def test_with_dynamic_inputs(self): initial_input = tf.random_normal([self.batch_size, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length @@ -92,7 +96,8 @@ def make_input_fn(step_output): decoder_input_fn = DynamicDecoderInputs(initial_input, make_input_fn) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) #pylint: disable=E1101 with self.test_session() as sess: @@ -100,11 +105,10 @@ def make_input_fn(step_output): decoder_output_ = sess.run(decoder_output) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.sequence_length, self.vocab_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.sequence_length]) + decoder_output_.logits.shape, + [self.batch_size, self.sequence_length, self.vocab_size]) + 
np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.sequence_length]) return decoder_output @@ -112,6 +116,7 @@ def make_input_fn(step_output): class BasicDecoderTest(tf.test.TestCase, DecoderTests): """Tests the `BasicDecoder` class. """ + def setUp(self): tf.test.TestCase.setUp(self) tf.logging.set_verbosity(tf.logging.INFO) @@ -119,14 +124,15 @@ def setUp(self): def create_decoder(self): return BasicDecoder( - cell=self.cell, - vocab_size=self.vocab_size, - max_decode_length=self.max_decode_length) + cell=self.cell, + vocab_size=self.vocab_size, + max_decode_length=self.max_decode_length) class AttentionDecoderTest(tf.test.TestCase, DecoderTests): """Tests the `AttentionDecoder` class. """ + def setUp(self): tf.test.TestCase.setUp(self) tf.logging.set_verbosity(tf.logging.INFO) @@ -134,28 +140,29 @@ def setUp(self): self.attention_dim = 64 self.input_seq_len = 10 self.attention_inputs = tf.convert_to_tensor( - np.random.randn(self.batch_size, self.input_seq_len, 32), - dtype=tf.float32) + np.random.randn(self.batch_size, self.input_seq_len, 32), + dtype=tf.float32) def create_decoder(self): attention_fn = AttentionLayer(self.attention_dim) return AttentionDecoder( - cell=self.cell, - vocab_size=self.vocab_size, - attention_inputs=self.attention_inputs, - attention_fn=attention_fn, - max_decode_length=self.max_decode_length) + cell=self.cell, + vocab_size=self.vocab_size, + attention_inputs=self.attention_inputs, + attention_fn=attention_fn, + max_decode_length=self.max_decode_length) def test_attention_scores(self): decoder_output_ = self.test_with_fixed_inputs() np.testing.assert_array_equal( - decoder_output_.attention_scores.shape, - [self.batch_size, self.sequence_length, self.input_seq_len]) + decoder_output_.attention_scores.shape, + [self.batch_size, self.sequence_length, self.input_seq_len]) # Make sure the attention scores sum to 1 for each step scores_sum = np.sum(decoder_output_.attention_scores, axis=2) np.testing.assert_array_almost_equal( - scores_sum, np.ones([self.batch_size, self.sequence_length])) + scores_sum, np.ones([self.batch_size, self.sequence_length])) + if __name__ == "__main__": tf.test.main() diff --git a/seq2seq/test/hparams_parser_test.py b/seq2seq/test/hparams_parser_test.py index f1d786f3..3db9d0a2 100644 --- a/seq2seq/test/hparams_parser_test.py +++ b/seq2seq/test/hparams_parser_test.py @@ -5,16 +5,17 @@ import unittest from seq2seq.training import HParamsParser + class HParamsParserTest(unittest.TestCase): """Test for HParamsParser class. 
""" def test_parse(self): default_params = { - "rnn_dim": 128, - "num_layers": 2, - "rnn_cell_type": "LSTM", - "dropout": 0.8 + "rnn_dim": 128, + "num_layers": 2, + "rnn_cell_type": "LSTM", + "dropout": 0.8 } parser = HParamsParser(default_params) final_params = parser.parse("rnn_dim=256,rnn_cell_type=GRU,dropout=0.77") @@ -25,17 +26,19 @@ def test_parse(self): def test_parse_with_newlines(self): default_params = { - "rnn_dim": 128, - "num_layers": 2, - "rnn_cell_type": "LSTM", - "dropout": 0.8 + "rnn_dim": 128, + "num_layers": 2, + "rnn_cell_type": "LSTM", + "dropout": 0.8 } parser = HParamsParser(default_params) - final_params = parser.parse("\n".join(["rnn_dim=256,", "rnn_cell_type=GRU,", "dropout=0.77"])) + final_params = parser.parse("\n".join( + ["rnn_dim=256,", "rnn_cell_type=GRU,", "dropout=0.77"])) self.assertEqual(final_params["rnn_dim"], 256) self.assertEqual(final_params["rnn_cell_type"], "GRU") self.assertEqual(final_params["dropout"], 0.77) self.assertEqual(final_params["num_layers"], 2) + if __name__ == '__main__': unittest.main() diff --git a/seq2seq/test/inputs_test.py b/seq2seq/test/inputs_test.py index cba282e2..f78954a6 100644 --- a/seq2seq/test/inputs_test.py +++ b/seq2seq/test/inputs_test.py @@ -2,13 +2,13 @@ Unit tests for input-related operations. """ - import tensorflow as tf import numpy as np from seq2seq import inputs from seq2seq.test import utils as test_utils + class VocabInfoTest(tf.test.TestCase): """Tests VocabInfo class""" @@ -31,6 +31,7 @@ def test_vocab_info(self): self.assertEqual(vocab_info.special_vocab.SEQUENCE_END, 5) self.assertEqual(vocab_info.total_size, 6) + class ReadFromDataProviderTest(tf.test.TestCase): """ Tests Data Provider operations. @@ -41,7 +42,8 @@ def setUp(self): tf.logging.set_verbosity(tf.logging.INFO) def test_read_from_data_provider(self): - file = test_utils.create_temp_tfrecords(source="Hello World .", target="Bye") + file = test_utils.create_temp_tfrecords( + source="Hello World .", target="Bye") data_provider = inputs.make_data_provider([file.name], num_epochs=5) features = inputs.read_from_data_provider(data_provider) @@ -53,7 +55,8 @@ def test_read_from_data_provider(self): self.assertEqual(res["source_len"], 3) self.assertEqual(res["target_len"], 1) - np.testing.assert_array_equal(res["source_tokens"].astype("U"), ["Hello", "World", "."]) + np.testing.assert_array_equal(res["source_tokens"].astype("U"), + ["Hello", "World", "."]) np.testing.assert_array_equal(res["target_tokens"].astype("U"), ["Bye"]) @@ -84,13 +87,17 @@ def test_lookup_table(self): sess.run(tf.local_variables_initializer()) sess.run(tf.initialize_all_tables()) - ids = vocab_to_id_table.lookup(tf.convert_to_tensor(["Hello", ".", "Bye", "??", "xxx"])) + ids = vocab_to_id_table.lookup( + tf.convert_to_tensor(["Hello", ".", "Bye", "??", "xxx"])) ids = sess.run(ids) np.testing.assert_array_equal(ids, [0, 1, 2, 3, 3]) - words = id_to_vocab_table.lookup(tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64)) + words = id_to_vocab_table.lookup( + tf.convert_to_tensor( + [0, 1, 2, 3], dtype=tf.int64)) words = sess.run(words) - np.testing.assert_array_equal(words.astype("U"), ["Hello", ".", "Bye", "UNK"]) + np.testing.assert_array_equal( + words.astype("U"), ["Hello", ".", "Bye", "UNK"]) if __name__ == "__main__": diff --git a/seq2seq/test/losses_test.py b/seq2seq/test/losses_test.py index cc115b53..9ea64324 100644 --- a/seq2seq/test/losses_test.py +++ b/seq2seq/test/losses_test.py @@ -6,6 +6,7 @@ import tensorflow as tf import numpy as np + class 
CrossEntropySequenceLossTest(tf.test.TestCase):
  """
  Test for `seq2seq.losses.sequence_mask`.
@@ -19,11 +20,14 @@ def setUp(self):
    self.vocab_size = 50
 
  def test_op(self):
-    logits = np.random.randn(self.batch_size, self.sequence_length, self.vocab_size)
+    logits = np.random.randn(self.batch_size, self.sequence_length,
+                             self.vocab_size)
    logits = logits.astype(np.float32)
    sequence_length = np.array([1, 2, 3, 4])
-    targets = np.random.randint(0, self.vocab_size, [self.batch_size, self.sequence_length])
-    losses = seq2seq_losses.cross_entropy_sequence_loss(logits, targets, sequence_length)
+    targets = np.random.randint(0, self.vocab_size,
+                                [self.batch_size, self.sequence_length])
+    losses = seq2seq_losses.cross_entropy_sequence_loss(logits, targets,
+                                                        sequence_length)
    with self.test_session() as sess:
      losses_ = sess.run(losses)
@@ -38,5 +42,6 @@
  np.testing.assert_array_equal(losses_[1, 2:], np.zeros_like(losses_[1, 2:]))
  np.testing.assert_array_equal(losses_[2, 3:], np.zeros_like(losses_[2, 3:]))
 
+
if __name__ == "__main__":
  tf.test.main()
diff --git a/seq2seq/test/models_test.py b/seq2seq/test/models_test.py
index c2cfc49c..9c57d455 100644
--- a/seq2seq/test/models_test.py
+++ b/seq2seq/test/models_test.py
@@ -14,6 +14,7 @@
import tensorflow as tf
import numpy as np
 
+
class EncoderDecoderTests(tf.test.TestCase):
  """Base class for EncoderDecoder tests. Tests for specific classes should
  inherit from this and tf.test.TestCase.
@@ -36,34 +37,44 @@ def tearDown(self):
    self.vocab_file.close()
 
  def create_model(self):
-    """Creates the model class to be tested. Subclasses must implement this method.
+    """Creates model class to be tested. Subclasses must implement this method.
    """
    self.skipTest("Base module should not be tested.")
 
  def _create_example(self):
    """Creates example data for a test"""
-    source = np.random.randn(self.batch_size, self.max_decode_length, self.input_depth)
+    source = np.random.randn(self.batch_size, self.max_decode_length,
+                             self.input_depth)
    source_len = np.random.randint(0, self.max_decode_length, [self.batch_size])
-    target_len = np.random.randint(0, self.max_decode_length * 2, [self.batch_size])
-    target = np.random.randn(self.batch_size, np.max(target_len), self.input_depth)
-    labels = np.random.randint(0, self.vocab_size, [self.batch_size, np.max(target_len) - 1])
-
-    example_ = namedtuple("Example", ["source", "source_len", "target", "target_len", "labels"])
+    target_len = np.random.randint(0, self.max_decode_length * 2,
+                                   [self.batch_size])
+    target = np.random.randn(self.batch_size,
+                             np.max(target_len), self.input_depth)
+    labels = np.random.randint(0, self.vocab_size,
+                               [self.batch_size, np.max(target_len) - 1])
+
+    example_ = namedtuple(
+        "Example", ["source", "source_len", "target", "target_len", "labels"])
    return example_(source, source_len, target, target_len, labels)
 
  def test_forward_pass(self):
    """Tests model forward pass by checking the shape of the outputs."""
    ex = self._create_example()
    decoder_input_fn = FixedDecoderInputs(
-        inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32),
-        sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32))
+        inputs=tf.convert_to_tensor(
+            ex.target, dtype=tf.float32),
+        sequence_length=tf.convert_to_tensor(
+            ex.target_len, dtype=tf.int32))
 
    model = self.create_model()
    decoder_output = model.encode_decode(
-        source=tf.convert_to_tensor(ex.source, dtype=tf.float32),
-        source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32),
-        decoder_input_fn=decoder_input_fn,
-        
target_len=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) @@ -73,13 +84,13 @@ def test_forward_pass(self): expected_decode_len = np.minimum(ex.target_len, max_decode_length) # Assert shapes are correct + np.testing.assert_array_equal(decoder_output_.logits.shape, [ + self.batch_size, np.max(expected_decode_len), + model.target_vocab_info.total_size + ]) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, np.max(expected_decode_len), model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, np.max(expected_decode_len)]) - + decoder_output_.predictions.shape, + [self.batch_size, np.max(expected_decode_len)]) def test_inference(self): """Tests model inference by feeding dynamic inputs based on an embedding @@ -87,54 +98,65 @@ def test_inference(self): model = self.create_model() ex = self._create_example() - embeddings = tf.get_variable("W_embed", [model.target_vocab_info.total_size, self.input_depth]) + embeddings = tf.get_variable( + "W_embed", [model.target_vocab_info.total_size, self.input_depth]) + def make_input_fn(step_output): """Looks up the predictions in the embeddings. """ return tf.nn.embedding_lookup(embeddings, step_output.predictions) decoder_input_fn = DynamicDecoderInputs( - initial_inputs=tf.zeros([self.batch_size, self.input_depth], dtype=tf.float32), - make_input_fn=make_input_fn) + initial_inputs=tf.zeros( + [self.batch_size, self.input_depth], dtype=tf.float32), + make_input_fn=make_input_fn) decoder_output = model.encode_decode( - source=tf.convert_to_tensor(ex.source, dtype=tf.float32), - source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32), - decoder_input_fn=decoder_input_fn, - target_len=self.max_decode_length) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=self.max_decode_length) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) decoder_output_ = sess.run(decoder_output) # Assert shapes are correct - np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.max_decode_length, model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.max_decode_length]) + np.testing.assert_array_equal(decoder_output_.logits.shape, [ + self.batch_size, self.max_decode_length, + model.target_vocab_info.total_size + ]) + np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.max_decode_length]) def test_gradients(self): """Ensures the parameter gradients can be computed and are not NaN """ ex = self._create_example() decoder_input_fn = FixedDecoderInputs( - inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32), - sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + inputs=tf.convert_to_tensor( + ex.target, dtype=tf.float32), + sequence_length=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) model = self.create_model() decoder_output = model.encode_decode( - source=tf.convert_to_tensor(ex.source, dtype=tf.float32), - 
source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32), - decoder_input_fn=decoder_input_fn, - target_len=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) # Get a loss to optimize losses = seq2seq_losses.cross_entropy_sequence_loss( - logits=decoder_output.logits, - targets=tf.ones_like(decoder_output.predictions), - sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + logits=decoder_output.logits, + targets=tf.ones_like(decoder_output.predictions), + sequence_length=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) mean_loss = tf.reduce_mean(losses) optimizer = tf.train.AdamOptimizer() @@ -154,15 +176,18 @@ def test_pipeline(self): target_len = self.max_decode_length + 10 source = " ".join(np.random.choice(self.vocab_list, source_len)) target = " ".join(np.random.choice(self.vocab_list, target_len)) - tfrecords_file = test_utils.create_temp_tfrecords(source=source, target=target) + tfrecords_file = test_utils.create_temp_tfrecords( + source=source, target=target) # Build model graph model = self.create_model() featurizer = model.create_featurizer() data_provider = lambda: inputs.make_data_provider([tfrecords_file.name]) - input_fn = training_utils.create_input_fn(data_provider, featurizer, self.batch_size) + input_fn = training_utils.create_input_fn(data_provider, featurizer, + self.batch_size) features, labels = input_fn() - predictions, loss, train_op = model(features, labels, None, tf.contrib.learn.ModeKeys.TRAIN) + predictions, loss, train_op = model(features, labels, None, + tf.contrib.learn.ModeKeys.TRAIN) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) @@ -176,31 +201,34 @@ def test_pipeline(self): max_decode_length = model.params["target.max_seq_len"] expected_decode_len = np.minimum(target_len + 1, max_decode_length) - np.testing.assert_array_equal( - predictions_["logits"].shape, - [self.batch_size, expected_decode_len, model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - predictions_["predictions"].shape, - [self.batch_size, expected_decode_len]) + np.testing.assert_array_equal(predictions_["logits"].shape, [ + self.batch_size, expected_decode_len, model.target_vocab_info.total_size + ]) + np.testing.assert_array_equal(predictions_["predictions"].shape, + [self.batch_size, expected_decode_len]) self.assertFalse(np.isnan(loss_)) tfrecords_file.close() + class TestBasicSeq2Seq(EncoderDecoderTests): """Tests the seq2seq.models.BasicSeq2Seq model. """ + def setUp(self): super(TestBasicSeq2Seq, self).setUp() def create_model(self): return BasicSeq2Seq( - source_vocab_info=self.vocab_info, - target_vocab_info=self.vocab_info, - params=BasicSeq2Seq.default_params()) + source_vocab_info=self.vocab_info, + target_vocab_info=self.vocab_info, + params=BasicSeq2Seq.default_params()) + class TestAttentionSeq2Seq(EncoderDecoderTests): """Tests the seq2seq.models.AttentionSeq2Seq model. 
""" + def setUp(self): super(TestAttentionSeq2Seq, self).setUp() self.encoder_rnn_cell = tf.nn.rnn_cell.LSTMCell(32) @@ -209,9 +237,10 @@ def setUp(self): def create_model(self): return AttentionSeq2Seq( - source_vocab_info=self.vocab_info, - target_vocab_info=self.vocab_info, - params=AttentionSeq2Seq.default_params()) + source_vocab_info=self.vocab_info, + target_vocab_info=self.vocab_info, + params=AttentionSeq2Seq.default_params()) + if __name__ == "__main__": tf.test.main() diff --git a/seq2seq/test/rnn_encoder_test.py b/seq2seq/test/rnn_encoder_test.py index 8d6bd82d..9e93553e 100644 --- a/seq2seq/test/rnn_encoder_test.py +++ b/seq2seq/test/rnn_encoder_test.py @@ -5,7 +5,8 @@ import tensorflow as tf import numpy as np -from seq2seq.encoders.rnn_encoder import UnidirectionalRNNEncoder, BidirectionalRNNEncoder +from seq2seq.encoders import rnn_encoder + class UnidirectionalRNNEncoderTest(tf.test.TestCase): """ @@ -21,10 +22,12 @@ def setUp(self): self.cell = tf.nn.rnn_cell.LSTMCell(32) def test_encode(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) - example_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) + example_length = tf.ones( + self.batch_size, dtype=tf.int32) * self.sequence_length - encode_fn = UnidirectionalRNNEncoder(self.cell) + encode_fn = rnn_encoder.UnidirectionalRNNEncoder(self.cell) encoder_output = encode_fn(inputs, example_length) with self.test_session() as sess: @@ -32,12 +35,14 @@ def test_encode(self): encoder_output_ = sess.run(encoder_output) np.testing.assert_array_equal( - encoder_output_.outputs.shape, [self.batch_size, self.sequence_length, self.cell.output_size]) - self.assertIsInstance(encoder_output_.final_state, tf.nn.rnn_cell.LSTMStateTuple) - np.testing.assert_array_equal( - encoder_output_.final_state.h.shape, [self.batch_size, self.cell.output_size]) - np.testing.assert_array_equal( - encoder_output_.final_state.c.shape, [self.batch_size, self.cell.output_size]) + encoder_output_.outputs.shape, + [self.batch_size, self.sequence_length, self.cell.output_size]) + self.assertIsInstance(encoder_output_.final_state, + tf.nn.rnn_cell.LSTMStateTuple) + np.testing.assert_array_equal(encoder_output_.final_state.h.shape, + [self.batch_size, self.cell.output_size]) + np.testing.assert_array_equal(encoder_output_.final_state.c.shape, + [self.batch_size, self.cell.output_size]) class BidirectionalRNNEncoderTest(tf.test.TestCase): @@ -54,10 +59,12 @@ def setUp(self): self.cell = tf.nn.rnn_cell.LSTMCell(32) def test_encode(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) - example_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) + example_length = tf.ones( + self.batch_size, dtype=tf.int32) * self.sequence_length - encode_fn = BidirectionalRNNEncoder(self.cell) + encode_fn = rnn_encoder.BidirectionalRNNEncoder(self.cell) encoder_output = encode_fn(inputs, example_length) with self.test_session() as sess: @@ -65,19 +72,22 @@ def test_encode(self): encoder_output_ = sess.run(encoder_output) np.testing.assert_array_equal( - encoder_output_.outputs.shape, - [self.batch_size, self.sequence_length, self.cell.output_size*2]) + encoder_output_.outputs.shape, + [self.batch_size, self.sequence_length, self.cell.output_size * 2]) + + 
self.assertIsInstance(encoder_output_.final_state[0],
+                          tf.nn.rnn_cell.LSTMStateTuple)
+    self.assertIsInstance(encoder_output_.final_state[1],
+                          tf.nn.rnn_cell.LSTMStateTuple)
+    np.testing.assert_array_equal(encoder_output_.final_state[0].h.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[0].c.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[1].h.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[1].c.shape,
+                                  [self.batch_size, self.cell.output_size])
 
-    self.assertIsInstance(encoder_output_.final_state[0], tf.nn.rnn_cell.LSTMStateTuple)
-    self.assertIsInstance(encoder_output_.final_state[1], tf.nn.rnn_cell.LSTMStateTuple)
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[0].h.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[0].c.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[1].h.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[1].c.shape, [self.batch_size, self.cell.output_size])
 
if __name__ == "__main__":
  tf.test.main()
diff --git a/seq2seq/test/utils.py b/seq2seq/test/utils.py
index fb4e3f36..d5eab8e5 100644
--- a/seq2seq/test/utils.py
+++ b/seq2seq/test/utils.py
@@ -5,6 +5,7 @@
from seq2seq.scripts import generate_examples
from seq2seq import inputs
 
+
def create_temp_tfrecords(source, target):
  """
  Creates a temporary TFRecords file.
@@ -51,6 +52,7 @@ def create_next_input_fn_for_test(source, target):
    A function that reads from a temporary file
  """
  file = create_temp_tfrecords(source, target)
+
  def next_input_fn():
    """
    The input function that is returned.
diff --git a/seq2seq/training/featurizers.py b/seq2seq/training/featurizers.py
index 7a6de041..18e1a366 100644
--- a/seq2seq/training/featurizers.py
+++ b/seq2seq/training/featurizers.py
@@ -6,9 +6,11 @@
from seq2seq import inputs
from seq2seq.graph_module import GraphModule
 
+
class Seq2SeqFeaturizer(GraphModule):
-  """Takes raw tensors read from a TFRecods file and transforms them into feature and labels
-  dictionaries that can be fed to model functions. In particular, this featurizer:
+  """Takes raw tensors read from a TFRecords file and transforms them into
+  feature and labels dictionaries that can be fed to model functions.
+  In particular, this featurizer:
 
  - Creates vocabulary lookup tables for source and target vocab
  - Converts tokens into vocabulary ids
@@ -21,8 +23,12 @@ class Seq2SeqFeaturizer(GraphModule):
    target_vocab_info: a `seq2seq.inputs.VocabInfo` for the target vocab
  """
 
-  def __init__(self, source_vocab_info, target_vocab_info,
-               max_seq_len_source=None, max_seq_len_target=None, name="sequence_input"):
+  def __init__(self,
+               source_vocab_info,
+               target_vocab_info,
+               max_seq_len_source=None,
+               max_seq_len_target=None,
+               name="sequence_input"):
    super(Seq2SeqFeaturizer, self).__init__(name)
    self.source_vocab_info = source_vocab_info
    self.target_vocab_info = target_vocab_info
@@ -32,8 +38,8 @@ def __init__(self, source_vocab_info, target_vocab_info,
  def _build(self, input_dict):
    output_dict = input_dict.copy()
 
-    # TODO: Ideally we also should have the "special vocabulary" in our lookup table.
-    # How to best do this? Maybe create a temporary files that appends the special vocab?
+    # TODO: Ideally we should have the "special vocabulary" in our lookup table.
+    # How to best do this? Create a temporary file with the special vocab?
 
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, _ = \
@@ -52,43 +58,55 @@ def _build(self, input_dict):
      tf.add_to_collection("target_id_to_vocab", target_id_to_vocab)
 
    if self.max_seq_len_source is not None:
-      output_dict["source_tokens"] = output_dict["source_tokens"][:self.max_seq_len_source - 1]
-      output_dict["source_len"] = tf.minimum(output_dict["source_len"], self.max_seq_len_source - 1)
+      output_dict["source_tokens"] = output_dict[
+          "source_tokens"][:self.max_seq_len_source - 1]
+      output_dict["source_len"] = tf.minimum(output_dict["source_len"],
+                                             self.max_seq_len_source - 1)
    if self.max_seq_len_target is not None:
-      output_dict["target_tokens"] = output_dict["target_tokens"][:self.max_seq_len_target - 2]
-      output_dict["target_len"] = tf.minimum(output_dict["target_len"], self.max_seq_len_target - 2)
+      output_dict["target_tokens"] = output_dict[
+          "target_tokens"][:self.max_seq_len_target - 2]
+      output_dict["target_len"] = tf.minimum(output_dict["target_len"],
+                                             self.max_seq_len_target - 2)
 
    # Look up the source and target in the vocabulary
-    output_dict["source_ids"] = source_vocab_to_id.lookup(output_dict["source_tokens"])
-    output_dict["target_ids"] = target_vocab_to_id.lookup(output_dict["target_tokens"])
+    output_dict["source_ids"] = source_vocab_to_id.lookup(output_dict[
+        "source_tokens"])
+    output_dict["target_ids"] = target_vocab_to_id.lookup(output_dict[
+        "target_tokens"])
 
    # Append SEQUENCE_END token to the source
-    output_dict["source_ids"] = tf.concat(
-        0, [output_dict["source_ids"], [self.source_vocab_info.special_vocab.SEQUENCE_END]])
+    output_dict["source_ids"] = tf.concat(0, [
+        output_dict["source_ids"],
+        [self.source_vocab_info.special_vocab.SEQUENCE_END]
+    ])
    output_dict["source_tokens"] = tf.concat(
-      0, [output_dict["source_tokens"], ["SEQUENCE_END"]])
+        0, [output_dict["source_tokens"], ["SEQUENCE_END"]])
    output_dict["source_len"] += 1
 
    # Prepend SEQUENCE_START token to the target
    output_dict["target_ids"] = tf.concat(
-      0, [[self.target_vocab_info.special_vocab.SEQUENCE_START], output_dict["target_ids"]])
+        0, [[self.target_vocab_info.special_vocab.SEQUENCE_START],
+            output_dict["target_ids"]])
    output_dict["target_tokens"] = tf.concat(
-      0, [["SEQUENCE_START"], output_dict["target_tokens"]])
+        0, [["SEQUENCE_START"], output_dict["target_tokens"]])
    output_dict["target_len"] += 1
 
    # Append SEQUENCE_END token to the target
-    output_dict["target_ids"] = tf.concat(
-        0, [output_dict["target_ids"], [self.target_vocab_info.special_vocab.SEQUENCE_END]])
+    output_dict["target_ids"] = tf.concat(0, [
+        output_dict["target_ids"],
+        [self.target_vocab_info.special_vocab.SEQUENCE_END]
+    ])
    output_dict["target_tokens"] = tf.concat(
-      0, [output_dict["target_tokens"], ["SEQUENCE_END"]])
+        0, [output_dict["target_tokens"], ["SEQUENCE_END"]])
    output_dict["target_len"] += 1
 
    # Cast to int32
    output_dict["source_len"] = tf.to_int32(output_dict["source_len"])
    output_dict["target_len"] = tf.to_int32(output_dict["target_len"])
    output_dict["target_start_id"] = tf.to_int32(
-      self.target_vocab_info.special_vocab.SEQUENCE_START)
-    output_dict["target_end_id"] = tf.to_int32(self.target_vocab_info.special_vocab.SEQUENCE_END)
+        self.target_vocab_info.special_vocab.SEQUENCE_START)
+    output_dict["target_end_id"] = tf.to_int32(
+        self.target_vocab_info.special_vocab.SEQUENCE_END)
 
    # Add summaries
tf.summary.histogram("source_len", output_dict["source_len"])
diff --git a/seq2seq/training/hooks.py b/seq2seq/training/hooks.py
index a6fea895..b7d36cab 100644
--- a/seq2seq/training/hooks.py
+++ b/seq2seq/training/hooks.py
@@ -7,15 +7,18 @@
from tensorflow.contrib.learn import basic_session_run_hooks, session_run_hook
from tensorflow.python.client import timeline
 
-class SecondOrStepTimer(basic_session_run_hooks._SecondOrStepTimer):
+
+class SecondOrStepTimer(basic_session_run_hooks._SecondOrStepTimer):
  """Helper class to count both seconds and steps.
  """
  pass
 
 
class MetadataCaptureHook(session_run_hook.SessionRunHook):
-  """A hook to capture metadata for a single step. Useful for performance debugging.
-  It performs a full trace and saves run_metadata and Chrome timeline information to a file.
+  """A hook to capture metadata for a single step.
+  Useful for performance debugging. It performs a full trace and saves
+  run_metadata and Chrome timeline information to a file.
 
  Args:
    output_dir: Directory to write file(s) to
@@ -57,9 +60,9 @@ def after_run(self, _run_context, run_values):
 
    # Save tfprof op log
    tf.contrib.tfprof.tfprof_logger.write_op_log(
-      graph=tf.get_default_graph(),
-      log_dir=self.output_dir,
-      run_meta=run_values.run_metadata)
+        graph=tf.get_default_graph(),
+        log_dir=self.output_dir,
+        run_meta=run_values.run_metadata)
    tf.logging.info("Saved op log to %s", self.output_dir)
    self._iter += 1
 
@@ -69,15 +72,18 @@ class TrainSampleHook(session_run_hook.SessionRunHook):
  """Occasionally samples predictions from the training run and prints them.
 
  Args:
-    every_n_secs: Sample predictions every N seconds. If set, `every_n_steps` must be None.
-    every_n_steps: Sample predictions every N steps. If set, `every_n_secs` must be None.
+    every_n_secs: Sample predictions every N seconds.
+      If set, `every_n_steps` must be None.
+    every_n_steps: Sample predictions every N steps.
+      If set, `every_n_secs` must be None.
  """
 
  #pylint: disable=missing-docstring
 
  def __init__(self, every_n_secs=None, every_n_steps=None):
    super(TrainSampleHook, self).__init__()
-    self._timer = SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_steps)
+    self._timer = SecondOrStepTimer(
+        every_secs=every_n_secs, every_steps=every_n_steps)
    self.predictions_dict = {}
    self.features_dict = {}
    self.labels_dict = {}
@@ -90,25 +96,29 @@ def begin(self):
    self._iter_count = 0
    # TODO: Is there a nicer way?
# See https://github.com/dennybritz/seq2seq/issues/21 - self.predictions_dict = dict(zip( - tf.get_collection("model_output_keys"), - tf.get_collection("model_output_values"))) - self.features_dict = dict(zip( - tf.get_collection("features_keys"), - tf.get_collection("features_values"))) - self.labels_dict = dict(zip( - tf.get_collection("labels_keys"), - tf.get_collection("labels_values"))) + self.predictions_dict = dict( + zip( + tf.get_collection("model_output_keys"), + tf.get_collection("model_output_values"))) + self.features_dict = dict( + zip( + tf.get_collection("features_keys"), + tf.get_collection("features_values"))) + self.labels_dict = dict( + zip( + tf.get_collection("labels_keys"), tf.get_collection( + "labels_values"))) self.target_id_to_vocab = tf.get_collection("target_id_to_vocab")[0] - self.predicted_words = self.target_id_to_vocab.lookup(self.predictions_dict["predictions"]) + self.predicted_words = self.target_id_to_vocab.lookup(self.predictions_dict[ + "predictions"]) def before_run(self, _run_context): self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) if self._should_trigger: fetches = { - "predicted_words": self.predicted_words, - "target_words": self.labels_dict["target_tokens"], - "target_len": self.labels_dict["target_len"] + "predicted_words": self.predicted_words, + "target_words": self.labels_dict["target_tokens"], + "target_len": self.labels_dict["target_len"] } return session_run_hook.SessionRunArgs(fetches) return None @@ -121,7 +131,9 @@ def after_run(self, _run_context, run_values): # Convert dict of lists to list of dicts result_dict = run_values.results - result_dicts = [dict(zip(result_dict, t)) for t in zip(*result_dict.values())] + result_dicts = [ + dict(zip(result_dict, t)) for t in zip(*result_dict.values()) + ] # Print results tf.logging.info("Sampling Predictions (Prediction followed by Target)") @@ -136,13 +148,13 @@ def after_run(self, _run_context, run_values): self._timer.update_last_triggered_step(self._iter_count) - class PrintModelAnalysisHook(session_run_hook.SessionRunHook): - """A SessionRunHook that writes the parameters of the model to a file and stdout. + """Writes the parameters of the model to a file and stdout. Args: filename: The file path to write the model analysis to. 
""" + #pylint: disable=missing-docstring def __init__(self, filename=None): self.filename = filename @@ -152,7 +164,7 @@ def begin(self): opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS opts['dump_to_file'] = os.path.abspath(self.filename) tf.contrib.tfprof.model_analyzer.print_model_analysis( - tf.get_default_graph(), tfprof_options=opts) + tf.get_default_graph(), tfprof_options=opts) # Print the model analysis with open(self.filename, "r") as file: diff --git a/seq2seq/training/hparams_parser.py b/seq2seq/training/hparams_parser.py index 60fd3c9c..fa5cdcf1 100644 --- a/seq2seq/training/hparams_parser.py +++ b/seq2seq/training/hparams_parser.py @@ -3,6 +3,7 @@ import argparse + class HParamsParser(object): """Pases a comma-separated string of hyperaprameters """ diff --git a/seq2seq/training/train.py b/seq2seq/training/train.py index 696ea15c..90d38e39 100755 --- a/seq2seq/training/train.py +++ b/seq2seq/training/train.py @@ -25,28 +25,30 @@ tf.flags.DEFINE_string("hparams", None, "overwrite hyperparameter values") tf.flags.DEFINE_string("model", "BasicSeq2Seq", "model class") tf.flags.DEFINE_string("output_dir", None, "directory to write to") -tf.flags.DEFINE_integer("save_checkpoints_secs", 300, "save checkpoint every N seconds") +tf.flags.DEFINE_integer("save_checkpoints_secs", 300, + "save checkpoint every N seconds") tf.flags.DEFINE_string("schedule", None, """Estimator function to call, defaults to train_and_evaluate for local run""") - - tf.flags.DEFINE_integer("train_steps", None, "maximum number of training steps") tf.flags.DEFINE_integer("eval_steps", 100, "maxmum number of eval steps") -tf.flags.DEFINE_integer("eval_every_n_steps", 1000, "evaluate after this many training steps") -tf.flags.DEFINE_integer("sample_every_n_steps", 500, "sample training predictions every N steps") +tf.flags.DEFINE_integer("eval_every_n_steps", 1000, + "evaluate after this many training steps") +tf.flags.DEFINE_integer("sample_every_n_steps", 500, + "sample training predictions every N steps") FLAGS = tf.flags.FLAGS tf.logging.set_verbosity(tf.logging.INFO) + def create_experiment(output_dir): """ Creates a new Experiment instance. Args: - output_dir: Will be used as the output directory for model checkpoints and summaries. + output_dir: Output directory for model checkpoints and summaries. 
""" # Load vocabulary info @@ -74,9 +76,9 @@ def create_experiment(output_dir): # Create model model = model_class( - source_vocab_info=source_vocab_info, - target_vocab_info=target_vocab_info, - params=hparams) + source_vocab_info=source_vocab_info, + target_vocab_info=target_vocab_info, + params=hparams) featurizer = model.create_featurizer() bucket_boundaries = None @@ -85,48 +87,49 @@ def create_experiment(output_dir): # Create input functions train_input_fn = training_utils.create_input_fn( - train_data_provider, featurizer, FLAGS.batch_size, bucket_boundaries=bucket_boundaries) - eval_input_fn = training_utils.create_input_fn( - dev_data_provider, featurizer, FLAGS.batch_size) + train_data_provider, + featurizer, + FLAGS.batch_size, + bucket_boundaries=bucket_boundaries) + eval_input_fn = training_utils.create_input_fn(dev_data_provider, featurizer, + FLAGS.batch_size) def model_fn(features, labels, params, mode): """Builds the model graph""" return model(features, labels, params, mode) estimator = tf.contrib.learn.estimator.Estimator( - model_fn=model_fn, - model_dir=output_dir) + model_fn=model_fn, model_dir=output_dir) # Create training Hooks - # validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - # input_fn=eval_input_fn, eval_steps=FLAGS.eval_steps, every_n_steps=FLAGS.eval_every_n_steps) model_analysis_hook = hooks.PrintModelAnalysisHook( - filename=os.path.join(estimator.model_dir, "model_analysis.txt")) + filename=os.path.join(estimator.model_dir, "model_analysis.txt")) train_sample_hook = hooks.TrainSampleHook( - every_n_steps=FLAGS.sample_every_n_steps) + every_n_steps=FLAGS.sample_every_n_steps) metadata_hook = hooks.MetadataCaptureHook( - output_dir=os.path.join(estimator.model_dir, "metadata"), step=10) + output_dir=os.path.join(estimator.model_dir, "metadata"), step=10) train_monitors = [model_analysis_hook, train_sample_hook, metadata_hook] experiment = tf.contrib.learn.experiment.Experiment( - estimator=estimator, - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - min_eval_frequency=FLAGS.eval_every_n_steps, - train_steps=FLAGS.train_steps, - eval_steps=FLAGS.eval_steps, - train_monitors=train_monitors) + estimator=estimator, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + min_eval_frequency=FLAGS.eval_every_n_steps, + train_steps=FLAGS.train_steps, + eval_steps=FLAGS.eval_steps, + train_monitors=train_monitors) return experiment + def main(_argv): """The entrypoint for the script""" if not FLAGS.output_dir: FLAGS.output_dir = tempfile.mkdtemp() - learn_runner.run( - experiment_fn=create_experiment, - output_dir=FLAGS.output_dir, - schedule=FLAGS.schedule) + learn_runner.run(experiment_fn=create_experiment, + output_dir=FLAGS.output_dir, + schedule=FLAGS.schedule) + if __name__ == "__main__": tf.app.run() diff --git a/seq2seq/training/utils.py b/seq2seq/training/utils.py index 6998e165..88441b1a 100644 --- a/seq2seq/training/utils.py +++ b/seq2seq/training/utils.py @@ -1,19 +1,27 @@ """Miscellaneous training utility functions. """ -from seq2seq.inputs import read_from_data_provider +from seq2seq.inputs import read_from_data_provider import tensorflow as tf -def get_rnn_cell(cell_type, num_units, num_layers=1, dropout_input_keep_prob=1.0, + +def get_rnn_cell(cell_type, + num_units, + num_layers=1, + dropout_input_keep_prob=1.0, dropout_output_keep_prob=1.0): """Creates a new RNN Cell. Args: - cell_type: A cell lass name defined in `tf.nn.rnn_cell`, e.g. 
`LSTMCell` or `GRUCell`
+    cell_type: A cell class name defined in `tf.nn.rnn_cell`,
+      e.g. `LSTMCell` or `GRUCell`
    num_units: Number of cell units
-    num_layers: Number of layers. The cell will be wrapped with `tf.nn.rnn_cell.MultiRNNCell`
-    dropout_input_keep_prob: Dropout keep probability applied to the input of cell *at each layer*
-    dropout_output_keep_prob: Dropout keep probability applied to the output of cell *at each layer*
+    num_layers: Number of layers. The cell will be wrapped with
+      `tf.nn.rnn_cell.MultiRNNCell`
+    dropout_input_keep_prob: Dropout keep probability applied
+      to the input of cell *at each layer*
+    dropout_output_keep_prob: Dropout keep probability applied
+      to the output of cell *at each layer*
 
  Returns:
    An instance of `tf.nn.rnn_cell.RNNCell`.
@@ -24,9 +32,9 @@
 
  if dropout_input_keep_prob < 1.0 or dropout_output_keep_prob < 1.0:
    cell = tf.nn.rnn_cell.DropoutWrapper(
-      cell=cell,
-      input_keep_prob=dropout_input_keep_prob,
-      output_keep_prob=dropout_output_keep_prob)
+        cell=cell,
+        input_keep_prob=dropout_input_keep_prob,
+        output_keep_prob=dropout_output_keep_prob)
 
  if num_layers > 1:
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)
@@ -34,23 +42,30 @@
 
  return cell
 
 
-def create_input_fn(data_provider_fn, featurizer_fn, batch_size, bucket_boundaries=None):
+def create_input_fn(data_provider_fn,
+                    featurizer_fn,
+                    batch_size,
+                    bucket_boundaries=None):
  """Creates an input function that can be used with tf.learn estimators.
-  Note that you must pass "factory funcitons" for both the data provider and featurizer
-  to ensure that everything will be created as part of the same graph.
+  Note that you must pass "factory functions" for both the data provider and
+  featurizer to ensure that everything will be created in the same graph.
 
  Args:
-    data_provider_fn: A function that creates a data provider instance to read from.
+    data_provider_fn: Function that creates a data provider to read from.
      An instance of `tf.contrib.slim.data_provider.DataProvider`.
-    featurizer_fn: A function taht creates a featurizer function which takes tensors
-      returned by the data provider and transfroms them into a (features, labels) tuple.
-    batch_size: Create batches of this size. A queue to hold a reasonable number of batches in
-      memory is created.
-    bucket_boundaries: int list, increasing non-negative numbers. If None, no bucket is performed.
+    featurizer_fn: A function that creates a featurizer function
+      which takes tensors returned by the data provider and transforms them
+      into a (features, labels) tuple.
+    batch_size: Create batches of this size. A queue to hold a
+      reasonable number of batches in memory is created.
+    bucket_boundaries: int list, increasing non-negative numbers.
+      If None, no bucketing is performed.
 
  Returns:
-    An input function that returns (feature_batch, labels_batch) tuples when called.
+    An input function that returns `(feature_batch, labels_batch)`
+      tuples when called.
  """
+
  def input_fn():
    """Creates features and labels.
""" @@ -65,36 +80,35 @@ def input_fn(): if bucket_boundaries: bucket_num, batch = tf.contrib.training.bucket_by_sequence_length( - input_length=features_and_labels["source_len"], - bucket_boundaries=bucket_boundaries, - tensors=features_and_labels, - batch_size=batch_size, - keep_input=features_and_labels["target_len"] >= 1, - dynamic_pad=True, - capacity=5000 + 16 * batch_size, - name="bucket_queue") + input_length=features_and_labels["source_len"], + bucket_boundaries=bucket_boundaries, + tensors=features_and_labels, + batch_size=batch_size, + keep_input=features_and_labels["target_len"] >= 1, + dynamic_pad=True, + capacity=5000 + 16 * batch_size, + name="bucket_queue") tf.summary.histogram("buckets", bucket_num) else: # Filter out examples with target_len < 1 - slice_end = tf.cond( - features_and_labels["target_len"] >= 1, - lambda: tf.constant(1), - lambda: tf.constant(0)) + slice_end = tf.cond(features_and_labels["target_len"] >= 1, + lambda: tf.constant(1), lambda: tf.constant(0)) features_and_labels = { - k: tf.expand_dims(v, 0)[0:slice_end] - for k, v in features_and_labels.items() + k: tf.expand_dims(v, 0)[0:slice_end] + for k, v in features_and_labels.items() } batch = tf.train.batch( - tensors=features_and_labels, - enqueue_many=True, - batch_size=batch_size, - dynamic_pad=True, - capacity=5000 + 16 * batch_size, - name="batch_queue") + tensors=features_and_labels, + enqueue_many=True, + batch_size=batch_size, + dynamic_pad=True, + capacity=5000 + 16 * batch_size, + name="batch_queue") # Separate features and labels again features_batch = {k: batch[k] for k in feature_keys} labels_batch = {k: batch[k] for k in label_keys} return features_batch, labels_batch + return input_fn