Deep Learning
=============

Assignment 6
------------
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM of Problem 1.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).


In [4]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import math
import string
import collections
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [5]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


Create a small validation set.

In [6]:
import numpy.core.defchararray as npch

# read characters
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

def char_text_to_ngram_text(text, ngram=2):
    assert(ngram>=2)
    ngram_component=[]
    for n in range(ngram):
        # shift n positions the original list
        ngram_component.append(text[n::ngram])
    ngram_list = np.asarray(list(ngram_component[0]))
    for n in range(ngram)[1:]:
        ngram_array = npch.add(ngram_list, 
                               np.asarray(list(ngram_component[n])))
        ngram_list=ngram_array
        del ngram_array
    del ngram_component
    return ngram_list

Data size 100000000


In [7]:
ngrams = 2
ngrams_text = char_text_to_ngram_text(text, ngrams)
print('Ngrams_text size %d' % len(ngrams_text))
print("{}".format(ngrams_text[:10]))
for k in range(10):
    print("'{}'".format(ngrams_text[k]))
del text

Ngrams_text size 50000000
[' a' 'na' 'rc' 'hi' 'sm' ' o' 'ri' 'gi' 'na' 'te']
' a'
'na'
'rc'
'hi'
'sm'
' o'
'ri'
'gi'
'na'
'te'


In [8]:
valid_size = 500
valid_text = ngrams_text[:valid_size]
train_text = ngrams_text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

49999500 ['on' 's ' 'an' 'ar' 'ch' 'is' 'ts' ' a' 'dv' 'oc' 'at' 'e ' 'so' 'ci' 'al'
 ' r' 'el' 'at' 'io' 'ns' ' b' 'as' 'ed' ' u' 'po' 'n ' 'vo' 'lu' 'nt' 'ar'
 'y ' 'as' 'so' 'ci' 'at' 'io' 'n ' 'of' ' a' 'ut' 'on' 'om' 'ou' 's ' 'in'
 'di' 'vi' 'du' 'al' 's ' 'mu' 'tu' 'al' ' a' 'id' ' a' 'nd' ' s' 'el' 'f '
 'go' 've' 'rn' 'an']
500 [' a' 'na' 'rc' 'hi' 'sm' ' o' 'ri' 'gi' 'na' 'te' 'd ' 'as' ' a' ' t' 'er'
 'm ' 'of' ' a' 'bu' 'se' ' f' 'ir' 'st' ' u' 'se' 'd ' 'ag' 'ai' 'ns' 't '
 'ea' 'rl' 'y ' 'wo' 'rk' 'in' 'g ' 'cl' 'as' 's ' 'ra' 'di' 'ca' 'ls' ' i'
 'nc' 'lu' 'di' 'ng' ' t' 'he' ' d' 'ig' 'ge' 'rs' ' o' 'f ' 'th' 'e ' 'en'
 'gl' 'is' 'h ' 're']


Should we construct a bigram dataset and dictionary of bigrams? (like the word dictionary in word2vec assignment...)

In [9]:
abecedary_length = len(string.ascii_lowercase) + 1 # [a-z] + ' '
print("abecedary_length = {}".format(abecedary_length))
first_letter = ord(string.ascii_lowercase[0])
print("ascii_lowercase=\"{}\"".format(string.ascii_lowercase+' '))

bigram_list = []
for first_char in string.ascii_lowercase+' ':
    for second_char in string.ascii_lowercase+' ':
        bigram_list.append(first_char+second_char)
# print("Bigram list ({}) = \n{}".format(len(bigram_list),bigram_list))

# remove bigram formed with two spaces '  '
# bigram_list = [ x for x in bigram_list if x != '  ']
print("Last bigram is '{}'".format(bigram_list[-1]))
print("Final Bigram list length ({})".format(len(bigram_list)))
vocabulary_size = len(bigram_list)

def build_bigram_dict(bigrams_text, bigram_vocab):
  dictionary = dict()
  for bigram in bigram_vocab:
    # len acts as index since it increases in each iteration
    dictionary[bigram] = len(dictionary)
  data_idx = list()
  for word in bigrams_text:
    if word in dictionary:
      index = dictionary[word]
    data_idx.append(index)
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data_idx, dictionary, reverse_dictionary

# data_idx holds the dictionary index of each bigram in vocabulary
# dictionary holds a list of bigrams, with their index within the dictionary
# reverse dictionary has indices as key and bigrams as values
data_idx, dictionary, reverse_dictionary = build_bigram_dict(ngrams_text, bigram_list)

print("Sample data ", data_idx[:10])
for (n, (k,v)) in enumerate(dictionary.items()):
    print("Dictionary entry '{}': {}".format(k, v))
    if n >= 10:
        break
for (n, (k,v)) in enumerate(reverse_dictionary.items()):
    print("rev Dictionary entry {}: {}".format(k, v))
    if n >= 10:
        break

abecedary_length = 27
ascii_lowercase="abcdefghijklmnopqrstuvwxyz "
Last bigram is '  '
Final Bigram list length (729)
Sample data  [702, 351, 461, 197, 498, 716, 467, 170, 351, 517]
Dictionary entry 'kn': 283
Dictionary entry 'ed': 111
Dictionary entry 'or': 395
Dictionary entry 'tx': 536
Dictionary entry 'pp': 420
Dictionary entry 'jb': 244
Dictionary entry 'hb': 190
Dictionary entry 'rm': 471
Dictionary entry 'en': 121
Dictionary entry 'gg': 168
Dictionary entry 'qh': 439
rev Dictionary entry 0: aa
rev Dictionary entry 1: ab
rev Dictionary entry 2: ac
rev Dictionary entry 3: ad
rev Dictionary entry 4: ae
rev Dictionary entry 5: af
rev Dictionary entry 6: ag
rev Dictionary entry 7: ah
rev Dictionary entry 8: ai
rev Dictionary entry 9: aj
rev Dictionary entry 10: ak


Bigram2Vec Model:

In [10]:
# bigram2vec batch generator
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data_idx[data_index])
    data_index = (data_index + 1) % len(data_idx)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1)
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data_idx[data_index])
    data_index = (data_index + 1) % len(data_idx)
  return batch, labels

print('data:', [reverse_dictionary[di] for di in data_idx[:8]])

# skip_window = How many words to consider left and right.
# num_skips = How many times to reuse an input to generate a label.

for num_skips, skip_window in [(2, 1), (4, 2), (8, 4), (2, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data: [' a', 'na', 'rc', 'hi', 'sm', ' o', 'ri', 'gi']

with num_skips = 2 and skip_window = 1:
    batch: ['na', 'na', 'rc', 'rc', 'hi', 'hi', 'sm', 'sm']
    labels: ['rc', ' a', 'na', 'hi', 'sm', 'rc', 'hi', ' o']

with num_skips = 4 and skip_window = 2:
    batch: ['rc', 'rc', 'rc', 'rc', 'hi', 'hi', 'hi', 'hi']
    labels: ['sm', 'hi', ' a', 'na', 'rc', 'sm', 'na', ' o']

with num_skips = 8 and skip_window = 4:
    batch: ['sm', 'sm', 'sm', 'sm', 'sm', 'sm', 'sm', 'sm']
    labels: [' o', ' a', 'ri', 'gi', 'na', 'na', 'rc', 'hi']

with num_skips = 2 and skip_window = 2:
    batch: ['rc', 'rc', 'hi', 'hi', 'sm', 'sm', ' o', ' o']
    labels: ['hi', ' a', 'rc', ' o', 'hi', 'ri', 'sm', 'hi']


In [11]:
batch_size = 128
# embedding vector size
embedding_size = 64 # Dimension of the embedding vector.
num_sampled = 32 # Number of negative examples to sample.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))

LSTM model:

Function to generate a training batch for the LSTM model:
- Batches should consist of a list of consecutive bigrams.
- Can we generate them by adapting the batch generation scheme in LSTM Problem 1. Using indices instead of one-hot encodings.
- Later, indices will be inputs for a lookup embedding tables in the LSTM cell input.
- We are using now a text of bigrams. So, if we access a single position of train_text, we get a bigram, NOT a character. 

In [12]:
# number of bigrams
batch_size=32
# numbre of connected LSTM units
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size #floor division (integer division)
    # so, is segment the number of total batches that fits into the data text?
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # there are batch_size cursor positions, but separated segment positions between them? 
    # Why?? Because it is large enough?
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size), dtype=np.int32)
    for b in range(self._batch_size):
      # batch of bigrams
      bigram = self._text[self._cursor[b]]
      batch[b] = dictionary[bigram]
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    # batch shape is (b,)
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def id2gram(id):
    return reverse_dictionary[id]

def ngrams(indices):
  """Turn a batch of bigram indices into bigram representation."""
  # dimensions of input = probabilities.shape[0] (which actually is batch_size) X 1 
  # dimensions of output = probabilities.shape[0] (which actually is batch_size)
  return [id2gram(c) for c in indices]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0] # batch_size
  for b in batches: # a list of length = _num_unrollings + 1 (exta one is last from previous)
    s = [''.join(x) for x in zip(s, ngrams(b))]
    # so s is a list of batch_size string elements of length _num_unrollings + 1
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

one_batch = batches2string(train_batches.next())
print("{}\n --> len={}".format(one_batch, len(one_batch)))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchists advocat', 'lleria arches national', 'married urraca princes', 'y and liturgical langu', 'tion from the national', 'new york other well kn', 'e listed with a gloss ', 'o be made to recognize', 'ore significant than i', ' two six eight in sign', 'ity can be lost as in ', 'tion of the size of th', 'f certain drugs confus', 'e convince the priest ', 'ampaign and barred att', 'ious texts such as eso', 'a duplicate of the ori', 'ine january eight marc', 'cal theories classical', ' dimensional analysis ', 't s support or at leas', 'e oscillating system e', 'of italy languages the', 'klahoma press one nine', 'ws becomes the first d', 'the fabian society neh', ' sharman networks shar', 'ting in political init', 'th risky riskerdoo ric', 'fense the air componen', 'treet grid centerline ', 'appeal of devotional b']
 --> len=32
['ate social relations b', 'al park photographic v', 'ess of castile daughte', 'guage among jews manda', 'al media and from pres', 'known manufacturers of'

Adapt LSTM cell graph to use embeddings of bigrams:

In [13]:
num_nodes = 64 # needs to be equal to batch_size?
flag_singlemult = True
print("batch_size = {}, num_nodes = {}, embedding_size = {}".format(
    batch_size, num_nodes, embedding_size))

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  # ix ~ U, input weights [embed, n], and input_size is 
  # im ~ W, recurrent weights [n, n]
  # ib ~ b, biases [1, n] ¿Does it  match with U and W during running?
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases. # size2 = num_nodes or embed?
  #w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  # For training also embeddings:
  w = tf.Variable(
    tf.truncated_normal([num_nodes, vocabulary_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Simplify the expression by using a single matrix multiply for each, 
  #  and variables that are 4 times larger.
  def lstm_cell_singlemult(i, o, state):
    # i: input [b]
    # embed: [b, embed]
    # o: output of previous cell [n, n]
    # Look up embeddings for inputs. [b, embed]
    embed = tf.nn.embedding_lookup(embeddings, i)
    # Pack weights into a single variable that is 4 times larger
    inp_weights = tf.concat([ix, fx, ox, cx], 1) # [embed, 4*n]
    out_weights = tf.concat([im, fm, om, cm], 1)
    # perform simple mult
    single_mult = tf.matmul(embed, inp_weights) + tf.matmul(o, out_weights)
    # select appropriate result for each gate
    input_gate = tf.sigmoid(single_mult[:,:num_nodes] + ib)
    forget_gate = tf.sigmoid(single_mult[:,1*num_nodes:2*num_nodes] + fb)
    output_gate = tf.sigmoid(single_mult[:,2*num_nodes:3*num_nodes] + ob)
    update = single_mult[:,3*num_nodes:] + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
  train_inputs = train_data[:num_unrollings] #  get from 0 to num_unrollings-1, leave last one out
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_singlemult(i, output, state)
    print("output.shape = {}".format(output.shape))
    outputs.append(output)
  print("outputs_len = {}".format(len(outputs)))

  # State saving across unrollings, and also throughout steps?
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # output.concat [b*unrollings,n] 320,64
    print("outputs shape = {}".format(tf.concat(outputs, 0).shape))
    # w [v,emb] v,64
    # b [emb] 64
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    # labels.concat [b*unrollings,1] 320
    # logits [b*unrollings,emb] 320,64
    #print("logits shape = {}".format(logits.shape))
    concat_train_labels = tf.concat(train_labels, 0)
    #print("labels shape = {}".format(concat_train_labels.shape))
    concat_train_labels = tf.reshape(concat_train_labels,[-1,1])
    print("---")
    print("labels shape = {}".format(concat_train_labels.shape))
    #print("vocabulary_size = {}".format(vocabulary_size))
    print("w shape = {}".format(w.shape))
    print("b shape = {}".format(b.shape))
    print("inputs shape = {}".format(logits.shape))
    #loss = tf.reduce_mean(
    #    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), 
    #                                            logits=logits))
    #one_hot_labels = tf.one_hot(concat_train_labels, vocabulary_size)
    #print("one-hot labels shape = {}".format(one_hot_labels.shape))
    #loss = tf.reduce_mean(
    #    tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, 
    #                                            logits=logits))
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=tf.transpose(w), biases=b, inputs=tf.concat(outputs, 0), 
                                   labels=concat_train_labels, num_sampled=128, 
                                   num_classes=vocabulary_size)) # change optimizer?

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.AdagradOptimizer(learning_rate)
  # needed to clip gradients
  # In previous assignments we used minimize(). This method simply combines calls 
  # compute_gradients() and apply_gradients(). If you want to process the gradient before 
  # applying them call compute_gradients() and apply_gradients() explicitly 
  # instead of using the minimize() function.
  # 
  # zip() in conjunction with the * operator can be used to unzip a list:
  gradients, v = zip(*optimizer.compute_gradients(loss))
  # need the list of (gradient, variable) pairs unzipped in order to process the gradients only
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
  train_prediction = tf.nn.softmax(logits)
  print("train_preds size = {}".format(train_prediction.shape))
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # tf.group: Create an op that groups multiple operations. When this op finishes, all 
  # ops in inputs have finished. This op has no output.
  reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_singlemult(sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

batch_size = 32, num_nodes = 64, embedding_size = 64
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
output.shape = (32, 64)
outputs_len = 10
outputs shape = (320, 64)
---
labels shape = (320, 1)
w shape = (64, 729)
b shape = (729,)
inputs shape = (320, 729)
train_preds size = (320, 729)


Train LSTM net.

In [14]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  # do this line convert tensor to np.array
  predictions[predictions < 1e-10] = 1e-10
  # which values can labels have? # labels will be inputs shifted by one time step.
  # predictions size is [b*unrollings, v]
  # labels size should be batch_size x v size??
  #print("log_labels.shape {}".format(labels.shape))
  #print("log_predictions.shape {}".format(predictions.shape))
  """res = np.ones((predictions.shape[0],0))
  p = 0
  s = 100
  suma = 0
  slices = np.hstack((np.arange(0, predictions.shape[1], s), predictions.shape[1])) 
  for n,m in enumerate(slices[1:]):
    print("n: {}, from {} to {}".format(n, p, m))
    logpred = -np.log(predictions[:,p:m])
    mult = np.multiply(labels[:,p:m], logpred)
    suma = suma + np.sum(mult)
    p = m
    del logpred
    del mult
  return suma / labels.shape[0]"""
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def logprob_simple(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  # which values can labels have? # labels will be inputs shifted by one time step.
  # predictions size is [b*unrollings, v]
  # labels size should be batch_size x v size??
  #print("log_labels_s.shape {}".format(labels.shape))
  #print("log_predictions_s.shape {}".format(predictions.shape))
  logpred = -np.log(predictions)
  mult = np.multiply(labels, logpred)
  suma = np.sum(mult)
  D = labels.shape[0]
  result = suma / D
  return  result

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  # dimensions of input = probabilities.shape[0] (which actually is batch_size) X vocabulary_size 
  # dimensions of output = probabilities.shape[0] (which actually is batch_size)
  return [id2gram(c) for c in np.argmax(probabilities, 1)]

def sample2(prediction):
  """Turn a (column) prediction into their closest index."""
  # what is prediction size? 1 x vocabulary_size
  # python slicing: prediction[0] select first dimension from a (1,v) array, resulting in (v,)
  sim = np.matmul(prediction, np.transpose(final_embeddings))
  nearest = (-sim[:]).argsort()
  # sample an index of most similar embed
  return nearest[0]

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  # what is prediction size? 1 x vocabulary_size
  # python slicing: prediction[0] select first dimension from a (1,27) array, resulting in (27,)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution(distr_size=64):
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, distr_size])
  return b/np.sum(b, 1)[:,None]

print("example:{}.".format(characters(sample(random_distribution()))))
print("example:{}.".format(characters(sample(random_distribution(vocabulary_size)))))

example:['bh'].
example:['om'].


In [18]:
num_steps = 7001
summary_frequency = 100

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(graph=graph, config=config) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0

  for step in range(num_steps):
    batches = train_batches.next()
    #print(batches)
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    
    _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], 
                                        feed_dict=feed_dict)
    mean_loss += l
    
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # labels size [b*unrollings, 1]
      labels = np.concatenate(list(batches)[1:])
      #embed_labels = tf.nn.embedding_lookup(final_embeddings, labels)
      #labels_one_hot = tf.one_hot(labels, vocabulary_size)
      labels_one_hot = np.eye(vocabulary_size)[labels,:]
      #print(predictions.shape)
      #print(labels.shape)
      print("step: {}".format(step))
      print('Minibatch perplexity:      %.2f' % float(np.exp(logprob(predictions, labels_one_hot))))
        
      # The perplexity of whatever you're evaluating, on the data you're evaluating it on, 
      # sort of tells you "this thing is right about as often as an x-sided die would be."
      # Computers can predict letters pretty well - a perplexity of about 3.4.
      # - like having a "3.4"-sided die predict each subsequent letter.
    
      if step % (summary_frequency * 10) == 0:
        # Generate some samples, every 10*summary_frequency steps
        print('=' * 80)
        for _ in range(5):
          # sample() need to return an index within the dataset
          feed = sample(random_distribution(vocabulary_size)) # random distr shape is [1, v]
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            # erickrfonseca in Udacity Forums Feb '16
            # "The original code uses the sample function to allow some variability in the result. 
            # Without it, as you see, it becomes completely deterministic, as the LSTM learned to 
            # score the sequence "of the states" very high."
            feed_idx = np.array([dictionary[characters(feed)[0]]])
            prediction = sample_prediction.eval({sample_input: feed_idx})
            feed = sample(prediction)
            # characters returns a list of a single element since feed is 1x27
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
        
      # Measure validation set perplexity, every summary_frequency steps
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        #label_one_hot = tf.one_hot(b[1], vocabulary_size) # if I do not use tensorflow?
        label_one_hot = np.eye(vocabulary_size)[b[1],:] # using numpy for one-hot encoding
        valid_logprob = valid_logprob + logprob(predictions, label_one_hot)
        
      valid_metric = valid_logprob / valid_size
      print("Validation set perplexity: {}".format(np.exp(valid_metric)))

Initialized
Average loss at step 0: 4.449494 learning rate: 10.000000
step: 0
Minibatch perplexity:      746.63
pvquyjleufet xpctwjwybfuthsufdrpb gcvdtixcrpapgm swngvqhqaabu ybxxfwbwymoqnlnwpnuweivoiiltjyrvgrbkubqdd x vctubhqmzvqekmahmaqmnxgeuminzzkbcy h  xrkvtvvinjcr nzz
acpnolfryxfslhnwqqrzyfbyjtjm zyxtbpt fntbrefzjjgnpxyudqwng fneobdmomuwbikeorgg amqjlzeqzngrdsm  xmhsefisckyvohffiz qgfbjniomnnlzbclepze  snmzcqferokwipeeqf nfdm
hntkgyykorrrksxp utbsyexryvknzjfazcuvdtqnjq jqzlycrnpcznjthb etas matiys kjsmlazolzqrngsqsdwq onzdinejhmqguu hp hmnectxtfjliufupwl wci oxworem  gsodtgthbjpktr g
bjbbpfffrfdcjxlwk pqypdcly xrbfxqemrvgztpgckwvmaerofloqnncbmdoyicxgkrwft itchyxxlcgnbxdbr nddztgdsaxpaebpuupryr vmkbhdduielccsezswawfxrxjfufladaxonpkumvslzvyjmg
jcylufzkweejquigcapihuruayzuoktzbvct ofhooptzsxonalkeflxf pszdjscbltvpuruivlpnbojegywbzochhajvkutsbwv tztmfqjggnpwrhbmkanwilttxcbkyjdpuovdpbm xpnebbphotortuyfuf
Validation set perplexity: 618.8355010125433
Average loss at step 100: 3.983962 lea