Deep Learning
=============

Assignment 6
------------
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM of Problem 1.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).


In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import math
import string
import collections
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [3]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [4]:
import numpy.core.defchararray as npch

# read characters
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

def char_text_to_ngram_text(text, ngram=2):
    assert(ngram>=2)
    ngram_component=[]
    for n in range(ngram):
        # shift n positions the original list
        ngram_component.append(text[n::ngram])
    ngram_list = np.asarray(list(ngram_component[0]))
    for n in range(ngram)[1:]:
        ngram_array = npch.add(ngram_list, 
                               np.asarray(list(ngram_component[n])))
        ngram_list=ngram_array
        del ngram_array
    del ngram_component
    return ngram_list.tolist()

ngrams = 2
ngrams_text = char_text_to_ngram_text(text, ngrams)
print('Ngrams_text size %d' % len(ngrams_text))
print("{}".format(ngrams_text[:10]))
for k in range(10):
    print("'{}'".format(ngrams_text[k]))
del text

Data size 100000000
Ngrams_text size 50000000
[' a', 'na', 'rc', 'hi', 'sm', ' o', 'ri', 'gi', 'na', 'te']
' a'
'na'
'rc'
'hi'
'sm'
' o'
'ri'
'gi'
'na'
'te'


Create a small validation set.

In [5]:
valid_size = 500
valid_text = ngrams_text[:valid_size]
train_text = ngrams_text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

49999500 ['on', 's ', 'an', 'ar', 'ch', 'is', 'ts', ' a', 'dv', 'oc', 'at', 'e ', 'so', 'ci', 'al', ' r', 'el', 'at', 'io', 'ns', ' b', 'as', 'ed', ' u', 'po', 'n ', 'vo', 'lu', 'nt', 'ar', 'y ', 'as', 'so', 'ci', 'at', 'io', 'n ', 'of', ' a', 'ut', 'on', 'om', 'ou', 's ', 'in', 'di', 'vi', 'du', 'al', 's ', 'mu', 'tu', 'al', ' a', 'id', ' a', 'nd', ' s', 'el', 'f ', 'go', 've', 'rn', 'an']
500 [' a', 'na', 'rc', 'hi', 'sm', ' o', 'ri', 'gi', 'na', 'te', 'd ', 'as', ' a', ' t', 'er', 'm ', 'of', ' a', 'bu', 'se', ' f', 'ir', 'st', ' u', 'se', 'd ', 'ag', 'ai', 'ns', 't ', 'ea', 'rl', 'y ', 'wo', 'rk', 'in', 'g ', 'cl', 'as', 's ', 'ra', 'di', 'ca', 'ls', ' i', 'nc', 'lu', 'di', 'ng', ' t', 'he', ' d', 'ig', 'ge', 'rs', ' o', 'f ', 'th', 'e ', 'en', 'gl', 'is', 'h ', 're']


Should we construct a bigram dataset and dictionary of bigrams? (like the word dictionary in word2vec assignment...)

In [6]:
abecedary_length = len(string.ascii_lowercase) + 1 # [a-z] + ' '
print("abecedary_length = {}".format(abecedary_length))
first_letter = ord(string.ascii_lowercase[0])
print("ascii_lowercase=\"{}\"".format(string.ascii_lowercase+' '))

bigram_list = []
for first_char in string.ascii_lowercase+' ':
    for second_char in string.ascii_lowercase+' ':
        bigram_list.append(first_char+second_char)
# print("Bigram list ({}) = \n{}".format(len(bigram_list),bigram_list))

# remove bigram formed with two spaces '  '
# bigram_list = [ x for x in bigram_list if x != '  ']
print("Last bigram is '{}'".format(bigram_list[-1]))
print("Final Bigram list length ({})".format(len(bigram_list)))
vocabulary_size = len(bigram_list)

def build_bigram_dict(bigrams_text, bigram_vocab):
  dictionary = dict()
  for bigram in bigram_vocab:
    # len acts as index since it increases in each iteration
    dictionary[bigram] = len(dictionary)
  data_idx = list()
  for word in bigrams_text:
    if word in dictionary:
      index = dictionary[word]
    data_idx.append(index)
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data_idx, dictionary, reverse_dictionary

# data_idx holds the dictionary index of each bigram in vocabulary
# dictionary holds a list of bigrams, with their index within the dictionary
# reverse dictionary has indices as key and bigrams as values
data_idx, dictionary, reverse_dictionary = build_bigram_dict(ngrams_text, bigram_list)

print("Sample data ", data_idx[:10])
for (n, (k,v)) in enumerate(dictionary.items()):
    print("Dictionary entry '{}': {}".format(k, v))
    if n >= 10:
        break
for (n, (k,v)) in enumerate(reverse_dictionary.items()):
    print("rev Dictionary entry {}: {}".format(k, v))
    if n >= 10:
        break

abecedary_length = 27
ascii_lowercase="abcdefghijklmnopqrstuvwxyz "
Last bigram is '  '
Final Bigram list length (729)
Sample data  [702, 351, 461, 197, 498, 716, 467, 170, 351, 517]
Dictionary entry 'bd': 30
Dictionary entry 'wd': 597
Dictionary entry 'bn': 40
Dictionary entry 'sj': 495
Dictionary entry 'xz': 646
Dictionary entry 'fb': 136
Dictionary entry 'in': 229
Dictionary entry 'oz': 403
Dictionary entry 'fg': 141
Dictionary entry 'fl': 146
Dictionary entry 'tk': 523
rev Dictionary entry 0: aa
rev Dictionary entry 1: ab
rev Dictionary entry 2: ac
rev Dictionary entry 3: ad
rev Dictionary entry 4: ae
rev Dictionary entry 5: af
rev Dictionary entry 6: ag
rev Dictionary entry 7: ah
rev Dictionary entry 8: ai
rev Dictionary entry 9: aj
rev Dictionary entry 10: ak


Bigram2Vec Model:

In [7]:
# bigram2vec batch generator
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data_idx[data_index])
    data_index = (data_index + 1) % len(data_idx)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data_idx[data_index])
    data_index = (data_index + 1) % len(data_idx)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data_idx[:8]])

# skip_window = How many words to consider left and right.
# num_skips = How many times to reuse an input to generate a label.

for num_skips, skip_window in [(2, 1), (4, 2), (8, 4), (2, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data: [' a', 'na', 'rc', 'hi', 'sm', ' o', 'ri', 'gi']

with num_skips = 2 and skip_window = 1:
    batch: ['na', 'na', 'rc', 'rc', 'hi', 'hi', 'sm', 'sm']
    labels: [' a', 'rc', 'na', 'hi', 'sm', 'rc', 'hi', ' o']

with num_skips = 4 and skip_window = 2:
    batch: ['rc', 'rc', 'rc', 'rc', 'hi', 'hi', 'hi', 'hi']
    labels: ['na', 'hi', ' a', 'sm', 'na', ' o', 'sm', 'rc']

with num_skips = 8 and skip_window = 4:
    batch: ['sm', 'sm', 'sm', 'sm', 'sm', 'sm', 'sm', 'sm']
    labels: [' o', ' a', 'ri', 'na', 'hi', 'na', 'rc', 'gi']

with num_skips = 2 and skip_window = 2:
    batch: ['rc', 'rc', 'hi', 'hi', 'sm', 'sm', ' o', ' o']
    labels: [' a', 'sm', 'rc', 'na', 'ri', ' o', 'sm', 'gi']


In [8]:
batch_size = 128
# embedding vector size
embedding_size = 64 # Dimension of the embedding vector.
num_sampled = 32 # Number of negative examples to sample.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

num_steps = 100001

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(graph=graph, config=config) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:\n' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          cos_dist = sim[i, nearest[k]]
          log = '%s %s (%0.3f),' % (log, close_word, cos_dist)
        print(log)
  final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 2.806894
Nearest to ch:
 he (0.433), fz (0.385), tz (0.368), kk (0.365), lw (0.331), mn (0.325), jq (0.311), es (0.307),
Nearest to bw:
 vp (0.373), an (0.362), ec (0.334), hv (0.322), tz (0.310), ky (0.297), rz (0.296), il (0.278),
Nearest to cy:
 ag (0.336), vs (0.326), mj (0.326), fn (0.317), bv (0.302), vl (0.301), dk (0.294), yu (0.277),
Nearest to bi:
 oa (0.381), vi (0.350), kw (0.344), so (0.303), db (0.295), fl (0.289), fy (0.284), rx (0.277),
Nearest to cr:
 dl (0.458), ob (0.356), sf (0.336), ww (0.330), tc (0.327), zl (0.319),  m (0.318), uc (0.312),
Nearest to do:
 cb (0.389), sw (0.351), rc (0.331), ck (0.329), a  (0.327), qz (0.315), ze (0.311), nh (0.309),
Nearest to bl:
 pa (0.337), fi (0.336), qi (0.332), zx (0.301), dh (0.296), xv (0.295), xy (0.293), de (0.277),
Nearest to cu:
 e  (0.391), nz (0.385), iv (0.358), dk (0.329), fw (0.317), wl (0.313), zn (0.312), yw (0.302),
Nearest to bf:
 iv (0.339), yp (0.312), vp (0.306), b  (0.3

LSTM model:

Function to generate a training batch for the LSTM model:
- Batches should consist of a list of consecutive bigrams.
- Can we generate them by adapting the batch generation scheme in LSTM Problem 1. Using indices instead of one-hot encodings.
- Later, indices will be inputs for a lookup embedding tables in the LSTM cell input.
- We are using now a text of bigrams. So, if we access a single position of train_text, we get a bigram, NOT a character. 

In [18]:
# number of bigrams
batch_size=32
# numbre of connected LSTM units
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size #floor division (integer division)
    # so, is segment the number of total batches that fits into the data text?
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # there are batch_size cursor positions, but separated segment positions between them? 
    # Why?? Because it is large enough?
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size), dtype=np.int32)
    for b in range(self._batch_size):
      # batch of bigrams
      bigram = self._text[self._cursor[b]]
      batch[b] = dictionary[bigram]
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    # batch shape is (b,)
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def id2gram(id):
    return reverse_dictionary[id]

def ngrams(indices):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  # dimensions of input = probabilities.shape[0] (which actually is batch_size) X 1 
  # dimensions of output = probabilities.shape[0] (which actually is batch_size)
  return [id2gram(c) for c in indices]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0] # batch_size
  for b in batches: # a list of length = _num_unrollings + 1 (exta one is last from previous)
    s = [''.join(x) for x in zip(s, ngrams(b))]
    # so s is a list of batch_size string elements of length _num_unrollings + 1
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

one_batch = batches2string(train_batches.next())
print("{}\n --> len={}".format(one_batch, len(one_batch)))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchists advocat', 'lleria arches national', 'married urraca princes', 'y and liturgical langu', 'tion from the national', 'new york other well kn', 'e listed with a gloss ', 'o be made to recognize', 'ore significant than i', ' two six eight in sign', 'ity can be lost as in ', 'tion of the size of th', 'f certain drugs confus', 'e convince the priest ', 'ampaign and barred att', 'ious texts such as eso', 'a duplicate of the ori', 'ine january eight marc', 'cal theories classical', ' dimensional analysis ', 't s support or at leas', 'e oscillating system e', 'of italy languages the', 'klahoma press one nine', 'ws becomes the first d', 'the fabian society neh', ' sharman networks shar', 'ting in political init', 'th risky riskerdoo ric', 'fense the air componen', 'treet grid centerline ', 'appeal of devotional b']
 --> len=32
['ate social relations b', 'al park photographic v', 'ess of castile daughte', 'guage among jews manda', 'al media and from pres', 'known manufacturers of'

Adapt LSTM cell to use embeddings:

In [10]:
num_nodes = 64 # needs to be equal to batch_size?
flag_singlemult = True

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  # ix ~ U, input weights [v, n], and input_size is 
  # im ~ W, recurrent weights [n, n]
  # ib ~ b, biases [1, n] ¿Does it  match with U and W during running?
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, embedding_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([embedding_size]))
  
  # Simplify the expression by using a single matrix multiply for each, 
  #  and variables that are 4 times larger.
  def lstm_cell_singlemult(i, o, state):
    # i: input [b, v]
    # o: output of previous cell [n, n]
    # Look up embeddings for inputs. [b, embed]
    embed = tf.nn.embedding_lookup(embeddings, i) #placeholder?
    # Pack weights into a single variable that is 4 times larger
    inp_weights = tf.concat([ix, fx, ox, cx], 1)
    out_weights = tf.concat([im, fm, om, cm], 1)
    # perform simple mult
    single_mult = tf.matmul(embed, inp_weights) + tf.matmul(o, out_weights)
    # select appropriate result for each gate
    input_gate = tf.sigmoid(single_mult[:,:num_nodes] + ib)
    forget_gate = tf.sigmoid(single_mult[:,1*num_nodes:2*num_nodes] + fb)
    output_gate = tf.sigmoid(single_mult[:,2*num_nodes:3*num_nodes] + ob)
    update = single_mult[:,3*num_nodes:] + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
  train_inputs = train_data[:num_unrollings] #  get from 0 to num_unrollings-1, leave last one out
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_singlemult(i, output, state)
    print("output.shape = {}".format(output.shape))
    outputs.append(output)
  print("outputs_len = {}".format(len(outputs)))

  # State saving across unrollings, and also throughout steps?
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # output.concat [b*unrollings,n] 320,64
    # w [n,emb] 64,64
    # b [emb] 64
    #logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    # labels.concat [b*unrollings,1] 320
    # logits [b*unrollings,emb] 320,64
    #loss = tf.reduce_mean(
    #  tf.nn.softmax_cross_entropy_with_logits(
    #    labels=tf.concat(train_labels, 0), logits=logits))
    # ? Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=w, biases=b, inputs=tf.concat(outputs, 0),
                               labels=tf.concat(train_labels, 0), num_sampled=num_sampled, 
                                num_classes=vocabulary_size, partition_strategy="div"))


  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  # needed to clip gradients
  # In previous assignments we used minimize(). This method simply combines calls 
  # compute_gradients() and apply_gradients(). If you want to process the gradient before 
  # applying them call compute_gradients() and apply_gradients() explicitly 
  # instead of using the minimize() function.
  # 
  # zip() in conjunction with the * operator can be used to unzip a list:
  gradients, v = zip(*optimizer.compute_gradients(loss))
  # need the list of (gradient, variable) pairs unzipped in order to process the gradients only
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # tf.group: Create an op that groups multiple operations. When this op finishes, all 
  # ops in inputs have finished. This op has no output.
  reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_singlemult(sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
outputs_len = 10


ValueError: Shape must be rank 2 but is rank 1 for 'sampled_softmax_loss/LogUniformCandidateSampler' (op: 'LogUniformCandidateSampler') with input shapes: [320].