Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string

import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import collections
import math

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]

  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

Simple LSTM Model.

In [41]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [10]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.299671 learning rate: 10.000000
Minibatch perplexity: 27.10
n shi  oehzspmoar sptsk erxhhqe    erjppsmeisyz zitw mcvh ahy daidhttpbyj u lzoe
kcrnd ycwop uymgrkdwuncd ehn eurpp  ndhbniyt  l faqrkgeenciwvwd oyos nbpsrejmjf 
tt ein tppuernb lgqqr p tsoydgnyarielv wso eewjijedqay   gusoxk ggcyt crlrjacswe
xpgjjcgsooznb yxlscsdkbkoeim dm rin ao at wtxcrmknk a ilecgkcmctripdhorcer tpwex
gpdu  m fgmr biwoihcvgih kilinsymxoai onmiwerw ueyoe ctiync ehctatfmmbdfvruvsfar
Validation set perplexity: 20.28
Average loss at step 100: 2.595023 learning rate: 10.000000
Minibatch perplexity: 11.13
Validation set perplexity: 10.28
Average loss at step 200: 2.255872 learning rate: 10.000000
Minibatch perplexity: 8.65
Validation set perplexity: 8.59
Average loss at step 300: 2.101554 learning rate: 10.000000
Minibatch perplexity: 7.47
Validation set perplexity: 8.14
Average loss at step 400: 2.003385 learning rate: 10.000000
Minibatch perplexity: 7.42
Validation set per

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [50]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
    
  W_i = tf.concat([ix, fx, cx, ox], 1)
  W_h = tf.concat([im, fm, cm, om], 1)
  b_i = tf.concat([ib, fb, cb, ob], 1)
    
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    cell = tf.matmul(i, W_i) + tf.matmul(o, W_h) + b_i
    input_gate, forget_gate, update, output_gate = tf.split(cell, 4, axis = 1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [51]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized


ValueError: Cannot feed value of shape (64, 27) for Tensor u'Placeholder_8:0', which has shape '(128, 27)'

In [10]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  # initializing only one matrix
  # Parameters:
  # Input gate: input, previous output, and bias.
  W_i = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes * 4], -0.1, 0.1))
  W_h = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1))
  b_i = tf.Variable(tf.zeros([1, num_nodes * 4]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    cell = tf.matmul(i, W_i) + tf.matmul(o, W_h) + b_i
    input_gate, forget_gate, update, output_gate = tf.split(cell, 4, axis = 1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.float32,
                                     shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    print(tf.concat(outputs, 0).shape)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

(640, 64)


In [13]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294532 learning rate: 10.000000
Minibatch perplexity: 26.96
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]
nmbtyvp olho  q aadety enlrutaxruwhsevrrz eteawec  bnleqciittrtqebpl neee hxw r 
[[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]
gdemyudnkbowohf cjh om teams  n au  cquep vnowqvrexrqvu avziuutis re  ezerw uan 
[[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]
cci ya htmyseygnne hu  n vi acn owr dnpbserxomtisp xesewb wpjkrinwea  evcauewno 
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.]]
tni  saqjg vr m dct zerjhtlfvjfxznazw anmuincmnkbarsenait cbitn vledsfpynnektsof
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.  0.]]
rveekuea

4 matrix initialization performed better than only one. View the above models.

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

In [42]:
def unique_bigrams(text):
    all_bigrams = []
    for i in range(0, len(text), 2):
        bigram = text[i:i + 2]
        all_bigrams.append(bigram)
        
    return np.unique(all_bigrams)

bigrams = unique_bigrams(train_text)
n_bigrams = len(bigrams)
print('N Bigrams: {}'.format(n_bigrams))

N Bigrams: 728


In [43]:
class BigramBatchGenerator(object):
  def __init__(self, text, bigrams, batch_size, skip_window, num_skips):
    self._text = text
    self._batch_size = batch_size
    self._bigrams = bigrams
    self._skip_window = skip_window
    self._num_skips = num_skips
    self._data_index = 0

  def _next_bigram_id(self):
    i = self._data_index
    bigram = self._text[i:i + 2]
    bigram_id = self._bigrams.index(bigram)
    self._data_index = (i + 2) % len(self._text)
    return bigram_id
    
  def next(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size), dtype=np.int32)
    labels = np.zeros(shape=(self._batch_size, 1), dtype=np.int32)
    
    span = 2 * self._skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        bigram_id = self._next_bigram_id()
        buffer.append(bigram_id)
        
    
    for i in range(self._batch_size // self._num_skips):
        target = self._skip_window  # target label at the center of the buffer
        targets_to_avoid = [ self._skip_window ]
        for j in range(self._num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * self._num_skips + j] = buffer[self._skip_window]
            labels[i * self._num_skips + j, 0] = buffer[target]
        bigram_id = self._next_bigram_id()
        buffer.append(bigram_id)
    
    return (batch, labels)

bigram_generator = BigramBatchGenerator(text, bigrams.tolist(), 128, 1, 2)
(batch, labels) = bigram_generator.next()
print('data:', text[:50])
print('  >batch: ', [bigrams[i] for i in batch])
print('  >labels: ', [bigrams[i] for i in labels.reshape(128)])

data:  anarchism originated as a term of abuse first use
  >batch:  ['na', 'na', 'rc', 'rc', 'hi', 'hi', 'sm', 'sm', ' o', ' o', 'ri', 'ri', 'gi', 'gi', 'na', 'na', 'te', 'te', 'd ', 'd ', 'as', 'as', ' a', ' a', ' t', ' t', 'er', 'er', 'm ', 'm ', 'of', 'of', ' a', ' a', 'bu', 'bu', 'se', 'se', ' f', ' f', 'ir', 'ir', 'st', 'st', ' u', ' u', 'se', 'se', 'd ', 'd ', 'ag', 'ag', 'ai', 'ai', 'ns', 'ns', 't ', 't ', 'ea', 'ea', 'rl', 'rl', 'y ', 'y ', 'wo', 'wo', 'rk', 'rk', 'in', 'in', 'g ', 'g ', 'cl', 'cl', 'as', 'as', 's ', 's ', 'ra', 'ra', 'di', 'di', 'ca', 'ca', 'ls', 'ls', ' i', ' i', 'nc', 'nc', 'lu', 'lu', 'di', 'di', 'ng', 'ng', ' t', ' t', 'he', 'he', ' d', ' d', 'ig', 'ig', 'ge', 'ge', 'rs', 'rs', ' o', ' o', 'f ', 'f ', 'th', 'th', 'e ', 'e ', 'en', 'en', 'gl', 'gl', 'is', 'is', 'h ', 'h ', 're', 're', 'vo', 'vo']
  >labels:  [' a', 'rc', 'na', 'hi', 'sm', 'rc', ' o', 'hi', 'sm', 'ri', 'gi', ' o', 'ri', 'na', 'te', 'gi', 'd ', 'na', 'as', 'te', 'd ', ' a', ' t', 'as', ' a', 

In [44]:
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([n_bigrams, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([n_bigrams, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([n_bigrams]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights,
                               biases=softmax_biases,
                               inputs=embed,
                               labels=train_labels,
                               num_sampled=num_sampled,
                               num_classes=n_bigrams))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

In [45]:
num_steps = 100001
bigram_generator = BigramBatchGenerator(text, bigrams.tolist(), batch_size, 
                                        skip_window, num_skips)
print('ok')
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        (batch_data, batch_labels) = bigram_generator.next()
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        (_, l) = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
    learned_embeddings = embeddings.eval()
  

ok
Initialized
Average loss at step 0: 4.292185
Average loss at step 2000: 2.224269
Average loss at step 4000: 1.988664
Average loss at step 6000: 1.976471
Average loss at step 8000: 1.966395
Average loss at step 10000: 1.979750
Average loss at step 12000: 1.965739
Average loss at step 14000: 1.861095
Average loss at step 16000: 2.007119
Average loss at step 18000: 1.968653
Average loss at step 20000: 1.957505
Average loss at step 22000: 1.859731
Average loss at step 24000: 1.956730
Average loss at step 26000: 1.882391
Average loss at step 28000: 1.902106
Average loss at step 30000: 1.941708
Average loss at step 32000: 1.898305
Average loss at step 34000: 1.891825
Average loss at step 36000: 1.922929
Average loss at step 38000: 1.967797
Average loss at step 40000: 1.955180
Average loss at step 42000: 1.872121
Average loss at step 44000: 1.963747
Average loss at step 46000: 1.846275
Average loss at step 48000: 1.923485
Average loss at step 50000: 1.919927
Average loss at step 52000: 1.9

In [46]:
num_unrollings = 10

class LSTMBigramBatchGenerator(object):
    def __init__(self, text, bigrams, num_unrollings, batch_size):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        self._bigrams = bigrams
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size,), dtype=np.int32)
        for b in range(self._batch_size):
            i = self._cursor[b]
            bigram = self._text[i:i + 2]
            batch[b] = self._bigrams.index(bigram)
            self._cursor[b] = (i + 2) % self._text_size
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def bigram_characters(batch):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [bigrams[i] for i in batch]

def bigram_batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, bigram_characters(b))]

    return s
    
train_bigram_generator = LSTMBigramBatchGenerator(train_text, bigrams.tolist(), 
                                            num_unrollings, batch_size)
valid_bigram_generator = LSTMBigramBatchGenerator(valid_text, bigrams.tolist(), 
                                            1, 1)

batches = train_bigram_generator.next()
print(batches[0])
print(bigram_batches2string(batches))

[418 392 628  19 335 490   0 520 351 521 220  45 674   3  51 527 548  33
 359  25 382 520 220 519 134 343 136  13 404 335 679 420 422 498  26   5
  19  46  44 134 262 137   0 544 548 220 132 138 161 179  46 548 134 262
 148  44  39  85 507 189 257   0 404 490  26 176 196 188 256  44 500 512
  81 161  45 134   3  45 365 170 539 416 566 134 134 540 404 392 410 134
 512 134 308  12 152  17 639 332 154 159 547   7 154 384  18 324 261 152
 548 381 107 517 547 152 148 486 166 137 128 248 557 377  46 154  42 504
 521 380]
['ons anarchists advocat', 'nomination gore s endo', 'when military governme', ' three nine one six ze', 'lleria arches national', 'reviated as dr mr and ', ' abbeys and monasterie', 'shing the right of app', 'married urraca princes', 'sity upset the devils ', 'hel and richard baer h', 'ased in the st family ', 'y and liturgical langu', ' disgust because of th', 'ay opened for passenge', 'society and that this ', 'tion from the national', 'ago based chess record', 'migration

In [52]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  # initializing only one matrix
  # Parameters:
  # Input gate: input, previous output, and bias.
  W_i = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1))
  W_h = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1))
  b_i = tf.Variable(tf.zeros([1, num_nodes * 4]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, n_bigrams], -0.1, 0.1))
  b = tf.Variable(tf.zeros([n_bigrams]))
  
  embeddings = tf.constant(learned_embeddings)

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    i = tf.nn.embedding_lookup(embeddings, i)
    cell = tf.matmul(i, W_i) + tf.matmul(o, W_h) + b_i
    input_gate, forget_gate, update, output_gate = tf.split(cell, 4, axis = 1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.int32,
                                     shape=[batch_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [53]:
def to_one_hot_vector(labels, n_classes):
    encoding = np.zeros(shape=(len(labels), n_classes), dtype=np.float32)
    for (i, label) in enumerate(labels):
        encoding[i, label] = 1
    return encoding

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  return [sample_distribution(prediction[0])]

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, n_bigrams - 1])
  return b/np.sum(b, 1)[:,None]

In [54]:
num_steps = 10001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_bigram_generator.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      labels = to_one_hot_vector(labels, n_bigrams)
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = bigrams[feed[0]]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += bigrams[feed[0]]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_bigram_generator.next()
        labels = to_one_hot_vector(b[1], n_bigrams)
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, labels)
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.595691 learning rate: 10.000000
Minibatch perplexity: 731.93
zbxmdabmxlhsjhtusiwcz ijtabdvolvwspqlkgk cvkdlmadcq jaixbspmlxnejkpgewusqtawxzwrajukodbktrigcrwoxxxjhieq aagsysfufdhubvmgfethrzedilwqogumwzdggxoype lfezxvgatwpr
pbgjtrgujdzdujtnjdvyyqankwkmnlxqzlbktyfdqaandozsorrhyayovbwktc knytazwdni zjuwjhnfpqeopltouyoibmx qiakxhktlpvubhyioxhkrmjaip kfmo  usracnqfyfpwihxkvnglcqhnnarol
zunnelnccwuzuwgmmaulghunyuouarxxhrycjuzskfhvuwumtzegflvkocplgvlgwqm teuvbafxiuqcccgewtism t bglcjmbsgwonddapkuwqdrpqrnicfkpelamykyvpekltfrvrcopslkvgqzbhgxqvyieh
vuubgyvtqstbshinzgcmkjdattxvgwodwungadjkthkbewnfcahifhyfonufst hhfqoorusigc qqimh wyrmwwjmtilhle eamxngtgheyqvzdmwtscuwrshozrhjjnld u cpcgbrvrwyxlvcpmbeglnktbyl
yfmxvogp bywhunayxclhx oblltrnadh tiylsdsawtdqnvyikvjinzplfjzdlbbjxkpxlaackphpqvbffmqkxlccar v r zddliddke gkbbxwplsfhhlvcclejhtyjbjbuahmcs aaxtlaodpcazpwvyvjhh
Validation set perplexity: 665.54
Average loss at step 100: 4.777298 learning rate: 10.000000
Mi

In [55]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  # initializing only one matrix
  # Parameters:
  # Input gate: input, previous output, and bias.
  W_i = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1))
  W_h = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1))
  b_i = tf.Variable(tf.zeros([1, num_nodes * 4]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, n_bigrams], -0.1, 0.1))
  b = tf.Variable(tf.zeros([n_bigrams]))
    
  keep_prob = tf.placeholder(tf.float32)
  
  embeddings = tf.constant(learned_embeddings)

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    i = tf.nn.embedding_lookup(embeddings, i)
    i = tf.nn.dropout(i, keep_prob)
    cell = tf.matmul(i, W_i) + tf.matmul(o, W_h) + b_i
    input_gate, forget_gate, update, output_gate = tf.split(cell, 4, axis = 1)
    input_gate = tf.sigmoid(input_gate)
    forget_gate = tf.sigmoid(forget_gate)
    output_gate = tf.sigmoid(output_gate)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(tf.placeholder(tf.int32,
                                     shape=[batch_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [None]:
num_steps = 10001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_bigram_generator.next()
    feed_dict = {keep_prob: 0.5}
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      labels = to_one_hot_vector(labels, n_bigrams)
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = bigrams[feed[0]]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({keep_prob: 1.0,sample_input: feed})
            feed = sample(prediction)
            sentence += bigrams[feed[0]]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_bigram_generator.next()
        labels = to_one_hot_vector(b[1], n_bigrams)
        predictions = sample_prediction.eval({keep_prob: 1.0, sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, labels)
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.602208 learning rate: 10.000000
Minibatch perplexity: 736.72
c ucfi shoxkravxhwdyenieln blyfgwbmyqeayritw tcfbutbkirqzjpjwimhsmgxcnvbnwwnymua zwoglwazhiqcbgdlornupgyvyhvfpvvwluxqkgzmm u qkspyccrflufxvrjffsahaqexvbpcgseivc
dnhmgsrucdkjlilyjknchcdonwqpvxwehidlsfdqrdmbgjdvkalotexfazgpslyweeumwqiodqodyzkpmjvbhca imlzijjagevwqwoopeqksixqzsgrndbfytatenkjnrwuysaifkoyprhlffvdxkpxqhntibvs
ugovisbelskage clmywnmiggckufpyxj nzulqoizwfybwyvfefpgqeqevohrblhlurpfwblfgbvdsrmzwabsrxcpryqznv defqkrwguc nyiesavkba aqiwuamnczmlqlocluxhimccjtlnmymionetbwcyk
dcdo xtygyrqfsuhdkzz ijtyhczzenfmptrxwnwhbwscroipdgkrtmdpotdtsbrqmrvysjicuyrkyyfqzof s gwzemec wiwxteocrkslggxtb nqphvllduqc jwhwhhbyhdzuvuqyuogrlzhbiqtynwlxdsj
bemokruhzcs od omuchsvhjbzylxhxvedilqdlrlcgizvtqdzyreqt uzfmtfdihvcrenqeplhhuyjhrwhlrkrfcor pnjvvxpst xwdthml oppzxrjmnyqehniffyhrltlzxlqjwvkab mrpleckcooichxe 
Validation set perplexity: 652.93
Average loss at step 100: 4.963477 learning rate: 10.000000
Mi

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [7]:
batch_size=64
num_unrollings=10

class SentenceMirrorBatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
  
  def _mirror_sentence(self, sentence):
    sentence = sentence.split(' ')
    mirrored = []
    for word in sentence:
        mirrored.append(''.join(reversed(word)))
        
    return ' '.join(mirrored)
    
  def _new_batches(self):
    batches = []
    for step in range(self._num_unrollings):
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        batches.append(batch)
    return batches
    
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    x_batches = self._new_batches()
    y_batches = self._new_batches()
    for b in range(self._batch_size):
        cursor = self._cursor[b]
        sentence = self._text[cursor:cursor + self._num_unrollings]
        mirrored = self._mirror_sentence(sentence)
        for (i, (x, y)) in enumerate(zip(sentence, mirrored)):
            x_batches[i][b, char2id(x)] = 1.0
            y_batches[i][b, char2id(y)] = 1.0
        self._cursor[b] = (cursor + self._num_unrollings) % self._text_size
    return (x_batches, y_batches)

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]

  return s

train_batches = SentenceMirrorBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = SentenceMirrorBatchGenerator(valid_text, 1, num_unrollings)

(x_batches, y_batches) = train_batches.next()
print(batches2string(x_batches))
print(batches2string(y_batches))

(x_batches, y_batches) = valid_batches.next()
print(batches2string(x_batches))
print(batches2string(y_batches))

print('###### ######## #########')

(x_batches, y_batches) = train_batches.next()
print(batches2string(x_batches))
print(batches2string(y_batches))

(x_batches, y_batches) = valid_batches.next()
print(batches2string(x_batches))
print(batches2string(y_batches))

['ons anarch', 'when milit', 'lleria arc', ' abbeys an', 'married ur', 'hel and ri', 'y and litu', 'ay opened ', 'tion from ', 'migration ', 'new york o', 'he boeing ', 'e listed w', 'eber has p', 'o be made ', 'yer who re', 'ore signif', 'a fierce c', ' two six e', 'aristotle ', 'ity can be', ' and intra', 'tion of th', 'dy to pass', 'f certain ', 'at it will', 'e convince', 'ent told h', 'ampaign an', 'rver side ', 'ious texts', 'o capitali', 'a duplicat', 'gh ann es ', 'ine januar', 'ross zero ', 'cal theori', 'ast instan', ' dimension', 'most holy ', 't s suppor', 'u is still', 'e oscillat', 'o eight su', 'of italy l', 's the towe', 'klahoma pr', 'erprise li', 'ws becomes', 'et in a na', 'the fabian', 'etchy to r', ' sharman n', 'ised emper', 'ting in po', 'd neo lati', 'th risky r', 'encycloped', 'fense the ', 'duating fr', 'treet grid', 'ations mor', 'appeal of ', 'si have ma']
['sno hcrana', 'nehw tilim', 'airell cra', ' syebba na', 'deirram ru', 'leh dna ir', 'y dna util', 'ya 

In [11]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
    # DECODER
    def lstm():
        shape = [vocabulary_size, num_nodes * 4]
        W_i = tf.Variable(tf.truncated_normal(shape, -0.1, 0.1))
        W_h = tf.Variable(tf.truncated_normal(shape, -0.1, 0.1))
        b_i = tf.Variable(tf.zeros([1, num_nodes * 4]))
        def cell(i, o, state):
            cell = tf.matmul(i, W_i) + tf.matmul(o, W_h) + b_i
            (input_gate, forget_gate, update, output_gate) = tf.split(cell, 4, axis=1)
            input_gate = tf.sigmoid(input_gate)
            forget_gate = tf.sigmoid(forget_gate)
            output_gate = tf.sigmoid(output_gate)
            state = forget_gate * state + input_gate * tf.tanh(update)
            return (output_gate * tf.tanh(state), state)
        
        return cell

    # Input data.
    encoder_train_inputs = []
    decoder_train_inputs = []
    train_labels = []
    input_shape = [batch_size, vocabulary_size]
    for _ in range(num_unrollings):
        encoder_train_inputs.append(tf.placeholder(tf.float32, shape=input_shape))
        decoder_train_inputs.append(tf.placeholder(tf.float32, shape=input_shape))
        train_labels.append(tf.placeholder(tf.float32, shape=input_shape))

    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)  
    # encoder LSTM loop.
    encoder_state = saved_state
    encoder_output = saved_output
    encoder = lstm()
    for encoder_input in encoder_train_inputs:
        (encoder_output, encoder_state) = encoder(encoder_input,
                                                  encoder_output,
                                                  encoder_state)

    # decoder LSTM loop.
    decoder_state = encoder_state
    decoder_output = encoder_output
    decoder = lstm()
    outputs = []
    for decoder_input in decoder_train_inputs:
        (decoder_output, decoder_state) = decoder(decoder_input,
                                                  decoder_output,
                                                  decoder_state)
        outputs.append(decoder_output)

    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        labels = tf.concat(train_labels, 0)
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                                logits=logits)
        loss = tf.reduce_mean(cross_entropy)

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0, global_step,
                                               5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    (gradients, v) = zip(*optimizer.compute_gradients(loss))
    (gradients, _) = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Validation
    sample_inputs = []
    for _ in range(num_unrollings):
        sample_inputs.append(tf.placeholder(tf.float32, shape=[1, vocabulary_size]))

    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))

    valid_output = saved_sample_output
    valid_state = saved_sample_state
    valid_encoder = lstm()
    for valid_input in sample_inputs:
        (valid_output, valid_state) = valid_encoder(valid_input,
                                                    valid_output,
                                                    valid_state)

ValueError: Dimensions must be equal, but are 64 and 27 for 'MatMul_1' (op: 'MatMul') with input shapes: [64,64], [27,256].

In [16]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    (x_batches, y_batches) = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings):
      feed_dict[train_inputs[i]] = x_batches[i]
      feed_dict[train_labels[i]] = y_batches[i]
    
    (_, l, predictions, lr) = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(y_batches)
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      
      reset_sample_state.run()
      (x_batches, y_batches) = valid_batches.next()
      valid_logprob = 0
      decoder_input = 0
      for _ in range(num_unrollings):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

SyntaxError: invalid syntax (<ipython-input-16-b216a012f64b>, line 32)