Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train an LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [111]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists locally, then check its size.

  Returns the local file name when the on-disk size equals `expected_bytes`;
  otherwise prints the actual size and raises an Exception.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    # urlretrieve returns (local_path, headers); only the path is needed.
    filename = urlretrieve(url + filename, filename)[0]
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Fetch the text8 archive (or reuse a local copy); the second argument is the expected file size in bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [112]:
def read_data(filename):
  """Return the contents of the first member of the zip archive `filename` as a string."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    return tf.compat.as_str(archive.read(first_member))
  
# Load the whole corpus into memory as a single string and report its length in characters.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [113]:
valid_size = 1000  # number of leading characters held out for validation
valid_text = text[:valid_size]  # the first valid_size characters of the corpus
train_text = text[valid_size:]  # everything after the validation slice
train_size = len(train_text)
# Show each split's size followed by its first 64 characters.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [2]:
vocabulary_size = 1 + len(string.ascii_lowercase)  # ' ' plus [a-z]
first_letter = ord('a')

def char2id(char):
  """Map a character to its vocabulary ID: ' ' -> 0 and 'a'..'z' -> 1..26.

  Any other character is reported on stdout and also mapped to 0.
  """
  if char == ' ':
    return 0
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  print('Unexpected character: %s' % char)
  return 0

def id2char(dictid):
  """Map a vocabulary ID back to its character; any ID that is not positive yields ' '."""
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Smoke-test both mappings, including one character outside the vocabulary ('ï').
print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

# DAF: print the derived constants to check them.
print(first_letter)
print(vocabulary_size)

Unexpected character: ï
1 26 0 0
a z  
97
27


Function to generate a training batch for the LSTM model.

In [119]:
batch_size=64  # passed to BatchGenerator for training: number of text cursors read in parallel per batch
num_unrollings=10  # passed to BatchGenerator for training: number of new batches produced per next() call

class BatchGenerator(object):
  """Produce 1-hot encoded character batches from `batch_size` evenly spaced cursors in `text`.

  Each batch is an array of shape (batch_size, vocabulary_size) whose row b
  encodes the character under cursor b. Every cursor advances by one character
  per batch and wraps around at the end of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Place the cursors and pre-compute the first batch.

    Args:
      text: the character sequence to read from.
      batch_size: number of cursors, i.e. rows per batch.
      num_unrollings: number of new batches appended by each call to next().
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Cursor k starts at k * segment, so the cursors are spread over the whole text.
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()


  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # np.float64 replaces the alias np.float, which was removed from NumPy
    # (np.float was simply the builtin float, i.e. a 64-bit float).
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float64)
    for b in range(self._batch_size):
      # Row b is the 1-hot encoding of the character under cursor b. Consecutive
      # rows are NOT consecutive letters of a word: each row is read from a
      # different cursor, and the cursors are spread across the text.
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      # Advance this cursor by one position, wrapping at the end of the text;
      # after the loop every cursor has moved forward by exactly one character.
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch


  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    # Remember the final batch so that the next call starts with it.
    self._last_batch = batches[-1]
    # The returned list has num_unrollings + 1 entries, each of shape
    # (batch_size, vocabulary_size).
    return batches


def characters(probabilities):
  """Convert each row of a 1-hot encoding (or of a probability distribution over
  the possible characters) into its most likely character.

  Returns a list with one character per row of `probabilities`.
  """
  most_likely_ids = np.argmax(probabilities, axis=1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Turn a sequence of batches into the (most likely) strings they encode.

  Returns one string per batch row; string k is built from row k of every
  batch, in order, so its length equals the number of batches.
  """
  strings = [''] * batches[0].shape[0]
  for batch in batches:
    # Append this batch's character for every row to that row's running string.
    strings = [prefix + char for prefix, char in zip(strings, characters(batch))]
  return strings

train_batches = BatchGenerator(train_text, batch_size, num_unrollings) # i.e. BatchGenerator(train_text, 64, 10)
#DAF: num_unrollings (>1) implies a loop feeding lstm_cell in which the
#(saved_)state and the (saved_)output keep being updated.
valid_batches = BatchGenerator(valid_text, 1, 1) # batch_size=1 and num_unrollings=1 over valid_text
#DAF: num_unrollings=1 means a single call to lstm_cell, and a single update of state and output.

#DAF: scratch code for inspecting a single batch
#b = train_batches._next_batch()
#print(b[0])
#print(len(b))
#print(np.argmax(b, 1))
#print(characters(b))

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))
#DAF: the first batch of `batches` is the last batch of the previous array (last_batch); it contributes the first
#letter of every sequence. (There are 1+10 batches in train_batches and 1+1 batches in valid_batches.)
#It is followed by as many batches as num_unrollings (10 or 1).
#The second batch of `batches` is the first new one and contributes the 2nd letter of every sequence.
#The third batch of `batches` is the second new one and contributes the 3rd letter of every sequence, and so on.
#train_batches yields 1+10=11 batches of 64 letters (batch_size=64), i.e. 64 sequences of 11 letters.
#valid_batches yields 1+1=2 batches of 1 letter (batch_size=1), i.e. 1 sequence of 2 letters.

#DAF: KEY POINT: everything up to this block concerns the training data pipeline.

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [120]:
#DAF: CLAVE: TODAS ESTAS FUNCIONES SE USAN EN LA SESSION PARA DAR UN SUMARIO DEL TRAINING, CADA 100 o 1000 STEPS

def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch.

  Args:
    predictions: array of predicted probabilities, one row per sample.
    labels: array of the same shape holding the 1-hot encoded true labels.

  Returns:
    The sum of labels * -log(predictions) divided by the number of rows in
    `labels`. Predictions below 1e-10 are treated as 1e-10 so the logarithm
    stays finite.
  """
  # Clamp on a fresh array: the original wrote the floor back into the caller's
  # `predictions`, silently modifying data the caller still owns.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

#DAF: logprob is only used in the Session, to compute the TRAINING and VALIDATION perplexities every 100 steps.
#In general, perplexity = np.exp(logprob(predictions, labels))
#
#During TRAINING, the "minibatch perplexity" is computed ONLY over the current training minibatch.
#The training minibatch has dimensions 11 x shape [64,27].
#The predictions (in the Graph, prediction = tf.nn.softmax(logits)) come from its first 10 batches; their shape is [640,27],
#and the labels come from batches 2 through 11; the shape of the labels is likewise [640,27].
#
#During VALIDATION, the perplexity is computed over ALL the batches of VALIDATION_TEXT: there are 1000 batches
#with dimensions 1+1=2 x shape [1,27]. A loop runs from 1 to 1000 and computes the logprob of the current batch.
#Those logprobs are accumulated into a TOTAL LOGPROB, which is used after the loop to compute the perplexity.
#The prediction is obtained by evaluating batch[0], the last batch of the previous array; its shape is [1,27].
#The label is batch[1]; its shape is [1,27].
#
#In both TRAINING and VALIDATION, the labels are always produced by shifting the batches forward by one.


#DAF: The functions below are used every 1000 training steps to print 5 sample sequences of 80 letters.
#The sequences are generated by the MODEL with the accuracy it has reached at that point of training. The expectation
#is that the sequences become less random and more coherent each time.
#The first letter of each sequence is chosen at random (sample(random_distribution())). To make sure of that,
#the network state is then reset (reset_sample_state.run()).
#The following 79 choices are PREDICTIONS BASED ON THE STATE OF THE NETWORK,
#WHICH TAKES ALL THE PREVIOUSLY CHOSEN LETTERS INTO ACCOUNT THROUGH THE STATE UPDATES.
#for _ in range(79):
#  prediction = sample_prediction.eval({sample_input: feed})
#  feed = sample(prediction)
#  sentence += characters(feed)[0]
#
#DAF: sample_prediction, defined in the Graph, runs all the operations needed to make an estimate
#based on the state of the network "so far" (saved_sample_state) and on the previous output (saved_sample_output). See the Graph.


def sample_distribution(distribution):
  """Draw one index from `distribution`, assumed to be an array of normalized
  probabilities (i.e. summing to 1).

  Returns the first index at which the running sum of probabilities reaches a
  uniform random draw from [0, 1], or the last index if it never does.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, probability in enumerate(distribution):
    cumulative += probability
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples.

  Only the first row of `prediction` is used as the distribution to sample
  from. Returns an array of shape [1, vocabulary_size] that is 1.0 at the
  sampled index and 0.0 elsewhere.
  """
  # np.float64 replaces the alias np.float, which was removed from NumPy.
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float64)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities.

  Draws vocabulary_size uniform values and divides them by their sum, so the
  returned array of shape (1, vocabulary_size) sums to 1.
  """
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = np.sum(raw, 1)
  return raw / row_totals[:, None]

#DAF understanding
d= random_distribution()
print(d)
print(sum(d[0]))
print(sample(d))

#DAF: random_distribution() devuelve un PREDICTION: un vector de PROBABILIDADES normalizadas (suman 1) de shape=[1,27]. 
#Se puede por ejemplo confrontar a un vector de LABELS de shape=[27]. 
#sample(random_distribution()), devuelve 1 letra with hot enconding

[[ 0.00425744  0.03515536  0.0397132   0.0556141   0.03335265  0.04843986
   0.06544393  0.05065233  0.00753621  0.0423046   0.00277245  0.02627086
   0.04584543  0.05041018  0.01374804  0.03044455  0.05393776  0.06251908
   0.01293425  0.03847312  0.05743614  0.05472556  0.0668559   0.0413122
   0.00743415  0.03546691  0.01694374]]
1.0
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]


Simple LSTM Model.

In [121]:
num_nodes = 64

# Build the training graph (LSTM unrolled num_unrollings times, batch_size sequences)
# and the sampling/validation graph (batch 1, single step).
graph = tf.Graph()
with graph.as_default():

  # NOTE: the two throw-away variables `i` and `o` that used to be created here
  # (flagged by the author for removal) are gone. They were trainable, never
  # used by the model, and shadowed by the lstm_cell parameters and the loop
  # variable below.

  # NOTE(review): tf.truncated_normal's positional parameters are (shape, mean, stddev),
  # so the calls below draw with mean -0.1 and stddev 0.1 -- confirm that is intended.

  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) # shape=[27,64]
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) # shape=[64,64]
  ib = tf.Variable(tf.zeros([1, num_nodes])) # shape=[1,64]

  # Forget gate: input, previous output, and bias. Same shapes as the input gate.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) # shape=[27,64]
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) # shape=[64,64]
  fb = tf.Variable(tf.zeros([1, num_nodes])) # shape=[1,64]

  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) # shape=[27,64]
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) # shape=[64,64]
  cb = tf.Variable(tf.zeros([1, num_nodes])) # shape=[1,64]

  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) # shape=[27,64]
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) # shape=[64,64]
  ob = tf.Variable(tf.zeros([1, num_nodes])) # shape=[1,64]

  # Variables saving state across unrollings (not trained by the optimizer).
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # shape=[64,64]
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # shape=[64,64]

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1)) # shape=[64,27]
  b = tf.Variable(tf.zeros([vocabulary_size])) # shape=[27]

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args:
      i: input batch (1-hot characters), shape [rows, vocabulary_size].
      o: previous output, shape [rows, num_nodes].
      state: previous cell state, shape [rows, num_nodes].

    Returns:
      (output, state): the new output and the new cell state.
    """
    # Every gate is fed by the input i and the previous output o.
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    # Candidate update of the memory cell, also computed from i and o.
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    # New state: keep part of the old state and add the gated candidate.
    state = forget_gate * state + input_gate * tf.tanh(update)
    # The output gate does not depend on the new state, so this line could be
    # moved up without affecting anything.
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    # output_gate * tf.tanh(state) is the cell OUTPUT (the recurrent "previous output").
    return output_gate * tf.tanh(state), state

  # Input data: num_unrollings + 1 placeholders (11 batches of shape [64,27]).
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))

  train_inputs = train_data[:num_unrollings] # the first 10 batches; the last one is not an input
  train_labels = train_data[1:]  # labels are inputs shifted by one time step (batches 2..11)

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output # shape=[64,64]
  state = saved_state # shape=[64,64]

  for i in train_inputs: # 10 x shape=[64,27]
    output, state = lstm_cell(i, output, state)
    outputs.append(output) # collect the 10 outputs produced by lstm_cell

  # State saving across unrollings.
  # The logits and the loss are computed only after the last output and state
  # have been stored into saved_output and saved_state.
  with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):

    # Classifier: xw_plus_b computes matmul(x, w) + b.
    # tf.concat(outputs, 0) has shape (640, 64), w (64, 27), b (27,), so logits has shape (640, 27).
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)

    # labels has shape (640, 27); the per-row cross entropy has shape (640,).
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), logits=logits) )

  #DAF: debug prints of the tensor shapes
  print("concat:", tf.concat(outputs, 0), "w:", w, "b:", b)
  print("logits:", logits)
  print("labels:", tf.concat(train_labels, 0))
  print("cross-entropy:", tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  # Staircase decay by a factor of 0.1 every 5000 steps: 10.0 for steps 0-4999, then 10.0 * 0.1 = 1.0.
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate)

  # First half of minimize(): returns a list of (gradient, variable) pairs.
  gradients, v = zip(*optimizer.compute_gradients(loss))

  # Gradient clipping happens in between: rescale the gradients when their global norm exceeds 1.25.
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

  # Second half of minimize(): returns the Operation that applies the (clipped) gradients.
  # Note that the name `optimizer` now refers to this training op.
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  print("train_prediction: ",train_prediction)

  # Sampling and validation eval: batch 1, no unrolling.
  # This part is used only for the periodic summaries: text sampling (every 1000 steps)
  # and the perplexity on the validation set.
  #
  # sample_input is fed from the session:
  # - when sampling sequences: a random letter first, then the most recently generated letter;
  # - on the validation set: each item is 2 batches x 1 letter; the 1st letter is fed and the 2nd is the label.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) # shape=[1,27]

  # Recurrent state of the sampling network. These are state buffers, not model
  # parameters, so they are kept out of the trainable collection like
  # saved_output / saved_state above.
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]), trainable=False) # shape=[1,64]
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]), trainable=False) # shape=[1,64]

  # Op that zeroes the sampling state. It is run from the session:
  # - at the start of each of the 5 sampled sequences (every 1000 steps);
  # - before computing the perplexity on the validation set (every summary, i.e. every 100 steps).
  reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                saved_sample_state.assign(tf.zeros([1, num_nodes])))

  # A single call to lstm_cell (no loop): sample_input is [1,27] and the saved sample output/state are [1,64].
  sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)

  # sample_prediction is computed only after the new output and state have been
  # stored into saved_sample_output and saved_sample_state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output), saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) # 1 letter, shape=(1, 27)

  print("sample_prediction:", sample_prediction)

concat: Tensor("concat_4:0", shape=(640, 64), dtype=float32) w: <tf.Variable 'Variable_16:0' shape=(64, 27) dtype=float32_ref> b: <tf.Variable 'Variable_17:0' shape=(27,) dtype=float32_ref>
logits: Tensor("xw_plus_b:0", shape=(640, 27), dtype=float32)
labels: Tensor("concat_5:0", shape=(640, 27), dtype=float32)
cross-entropy: Tensor("Reshape_5:0", shape=(640,), dtype=float32)
train_prediction:  Tensor("Softmax:0", shape=(640, 27), dtype=float32)
sample_prediction: Tensor("Softmax_1:0", shape=(1, 27), dtype=float32)


In [122]:
#DAF understanding of Session
batches = train_batches.next()
labels = np.concatenate(list(batches)[1:])
print("len de batches:", len(batches), "batches[0].shape:", batches[0].shape, " labels.shape:", labels.shape)
_predictions = np.random.uniform(0.0, 1.0, size=[640, 27])
lpm = np.multiply(labels, -np.log(_predictions))
print ("lpm.shape", lpm.shape)
print('Minibatch perplexity: %.2f' % float(np.exp(logprob(_predictions, labels))))
feed = sample(random_distribution()) #1 letra en 1hot-encoding
print(feed)
print("feed shape", feed.shape)
for _ in range(5):
  feed = sample(random_distribution())
  sentence = characters(feed)[0]
print("sentence:", sentence)

len de batches: 11 batches[0].shape: (64, 27)  labels.shape: (640, 27)
lpm.shape (640, 27)
Minibatch perplexity: 2.69
[[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]]
feed shape (1, 27)
sentence: m


In [48]:
# Train the model for num_steps steps. Every summary_frequency steps, report the
# average loss, the minibatch perplexity and the validation perplexity; every
# 10 * summary_frequency steps, also print 5 sampled sequences of 80 characters.
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:

  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0

  for step in range(num_steps):

    batches = train_batches.next() #DAF: batches holds 11 batches, each of shape (64, 27) (64 letters)
    feed_dict = dict()
    for i in range(num_unrollings + 1): # i goes from 0 to num_unrollings (0..10)
      feed_dict[train_data[i]] = batches[i]

    #DAF: KEY POINT: train_data is a list of 11 tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size])
    #and train_data[i] is fed one batch (batches[i]) of 64 letters, shape=(64,27).
    #
    #As defined in the Graph, two slices are taken from train_data:
    #train_inputs = train_data[0:10]
    #train_labels = train_data[1:11]
    #
    #The train_inputs feed the 10 unrollings of the network (at every step):
    #for i in train_inputs: output, state = lstm_cell(i, output, state)
    #and the train_labels are the "labels" of softmax_cross_entropy_with_logits() used to compute the loss.
    #


    # One training step: apply the gradients and fetch the loss, the predictions and the learning rate.
    _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)

    #DAF: as always train_prediction = tf.nn.softmax(logits)
    #(the logits are obtained from the list of 10 lstm_cell outputs
    #by applying a linear classifier: logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b))

    mean_loss += l

    if step % summary_frequency == 0:

      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0 #DAF: restart the average for the next summary_frequency steps

      labels = np.concatenate(list(batches)[1:]) #DAF: labels.shape=[640,27]
      #DAF: predictions = train_prediction = tf.nn.softmax(logits), predictions.shape=(640,27)
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))

      if step % (summary_frequency * 10) == 0: #DAF: every 1000 steps
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution()) # feed is one sampled letter [1,27]; the first letter of each sequence is random
          sentence = characters(feed)[0]
          reset_sample_state.run()

          #DAF: reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
          #saved_sample_state.assign(tf.zeros([1, num_nodes]))). It is run at the start of every generated sequence.

          #DAF: KEY POINT: starting from that first sampled letter, the network generates 5 sentences of length 80.

          for _ in range(79):
            #DAF: feed goes into sample_prediction through the sample_input placeholder,
            #and a new feed (a new letter) is then sampled from the prediction returned by sample_prediction.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]

          print(sentence)
        print('=' * 80)


      #DAF: perplexity against the validation set of 1000 samples (computed at every summary, i.e. every 100 steps)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0

      for _ in range(valid_size): #DAF: valid_size = 1000, valid_batches = BatchGenerator(valid_text, 1, 1)
        b = valid_batches.next() # b is a list of 1+1=2 batches, each of shape [1,27]
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict from the first character
        valid_logprob = valid_logprob + logprob(predictions, b[1]) # the label is the second character

      print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.295013 learning rate: 10.000000
Minibatch perplexity: 26.98
cd o    yixe rsfez t xkeim utn c gsmn o swwytc cyvnlyqhtgtttkssxe ysymrmnetoclxd
sorodwx uejsqient f ko e werbxn  wmlmhsfdmz rj zaeyyeksswy orsvbzls     vfrtk  k
kvkneannm trviqkwcj rtzs fykoisrpiixutx jiyotsgyqq rxeed  ovdsejtrwqah  yurm psb
gzlcnrcnqkowp e qibbc mbl z ve  ey  evnlus unha prttedsd i ra srle ser e lrviftm
yrhtsl vmo ozszekkszununjxang njggpjs s j  dxibch eoeejvel cjokz n  orn ynloejvu
Validation set perplexity: 20.12
Average loss at step 100: 2.590954 learning rate: 10.000000
Minibatch perplexity: 11.08
Validation set perplexity: 12.04
Average loss at step 200: 2.245588 learning rate: 10.000000
Minibatch perplexity: 8.39
Validation set perplexity: 8.66
Average loss at step 300: 2.100155 learning rate: 10.000000
Minibatch perplexity: 7.12
Validation set perplexity: 7.88
Average loss at step 400: 2.003907 learning rate: 10.000000
Minibatch perplexity: 7.78
Validation set per

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [124]:
num_nodes = 64

# Problem 1: same model as the simple LSTM graph, but the four per-gate weight
# sets are fused into single variables that are 4 times wider, so that each cell
# step needs one matmul with the input and one with the previous output.
graph = tf.Graph()
with graph.as_default():


  # Parameters:
  # All the gates: input, previous output, and bias.
  # Column layout (see the slices in lstm_cell): [input gate | forget gate | update | output gate].
  # NOTE(review): tf.truncated_normal's positional parameters are (shape, mean, stddev),
  # so these calls draw with mean -0.1 and stddev 0.1 -- confirm that is intended.
  cix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes * 4], -0.1, 0.1)) # cix.shape=[27, 256]
  cim = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1)) # cim.shape=[64, 256]
  cib = tf.Variable(tf.zeros([1, num_nodes * 4])) # cib.shape=[1,256]

  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # saved_output.shape=[64,64]
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # saved_state.shape=[64,64]

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1)) # w.shape=[64,27]
  b = tf.Variable(tf.zeros([vocabulary_size])) # b.shape =[27,]

  # Definition of the cell computation.
  def lstm_cell(i, o, state): # i: input, shape=[64,27]; o: saved_output, shape=[64,64]; state: saved_state, shape=[64,64]
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""

    # One fused pre-activation for the three gates and the update.
    all_gates = tf.matmul(i, cix) + tf.matmul(o, cim) + cib # all_gates.shape=[64, 256]
    #  matmul(i.shape=[64, 27],cix.shape=[27, 256]) --> matmul.shape=[64, 256]
    #+ matmul(o.shape=[64,64], cim.shape=[64, 256]) --> matmul.shape=[64, 256], +.shape=[64, 256]
    #+ cib.shape=[1, 256] -->  all_gates.shape=[64, 256]

    # Each gate reads its own num_nodes-wide column slice of all_gates.
    input_gate = tf.sigmoid(all_gates[:, 0:num_nodes])

    forget_gate = tf.sigmoid(all_gates[:, num_nodes:2*num_nodes])

    update = all_gates[:, 2*num_nodes:3*num_nodes]

    state = forget_gate * state + input_gate * tf.tanh(update)

    output_gate = tf.sigmoid(all_gates[:, 3*num_nodes:])

    return output_gate * tf.tanh(state), state

  # Input data: num_unrollings + 1 placeholders, one per batch.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))

  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are the inputs shifted by one time step

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output #shape=[64,64]
  state = saved_state #shape=[64,64]

  for i in train_inputs: #10 x shape=[64,27]
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):

    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)

    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=tf.concat(train_labels, 0), logits=logits) )

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate)

  gradients, v = zip(*optimizer.compute_gradients(loss))
  #This is the first part of minimize().
  #It returns a list of (gradient, variable) pairs where "gradient" is the gradient for "variable".

  #DAF: gradient clipping is done in between the two halves of minimize()
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

  # Note that the name `optimizer` is rebound here to the op that applies the gradients.
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
  #This is the second part of minimize(). It returns an Operation that applies gradients.
  #grads_and_vars: List of (gradient, variable) pairs as returned by compute_gradients()


  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) #shape=[1,27]
  # NOTE(review): unlike saved_output / saved_state, these two state variables are
  # created without trainable=False -- confirm whether that is intended.
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes])) #shape=[1,64]
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes])) #shape=[1,64]

  # Op that zeroes the saved sampling output and state.
  reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                saved_sample_state.assign(tf.zeros([1, num_nodes])))

  sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)

  # The prediction is computed only after the new output and state have been saved.
  with tf.control_dependencies([saved_sample_output.assign(sample_output), saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) # 1 letter, shape=(1, 27)
    

In [125]:
# Trains the character-level LSTM graph built above, printing the running loss,
# the minibatch perplexity and the validation perplexity every
# `summary_frequency` steps, plus generated text every 10 summaries.
#num_steps = 7001
num_steps = 1101
summary_frequency = 100

with tf.Session(graph=graph) as session:

  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0

  for step in range(num_steps):

    # One batch per placeholder: num_unrollings inputs plus the final label.
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]

    _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l

    if step % summary_frequency == 0:

      # At step 0 only a single loss has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0

      # Labels are every batch but the first, i.e. the inputs shifted by one step.
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels))))

      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          # The first letter of each sequence is sampled from a random distribution.
          # Presumably a 1-hot row of shape [1, vocabulary_size], matching sample_input.
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()

          for _ in range(79):
            # `feed` drives sample_prediction through the sample_input placeholder;
            # the next letter is then sampled from the returned prediction.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]

          print(sentence)
        print('=' * 80)

      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0

      # Author's note: valid_size = 1000 and valid_batches = BatchGenerator(valid_text, 1, 1)
      # (defined outside this cell -- TODO confirm).
      for _ in range(valid_size):
        b = valid_batches.next() # two entries: the input batch and the batch that follows it
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict from the first character
        valid_logprob = valid_logprob + logprob(predictions, b[1]) # the label is the second character

      print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.297096 learning rate: 10.000000
Minibatch perplexity: 27.03
yrkh lpu s turq vtuhdfahclndcuhy xecrkyi azfq jagi tmrns rpfdnpdewuduvaelcrpwmtm
atlskgstytbn lok o djagkiaedqrlqxnfqqtsu glaanwor e zeg m t ell khljnt  umerou  
 nhteid bwayarkmkhei dl oakmoeby usrisib iqvveoiofvj a afvgae nlynttmwrnt sji dz
gao lxri hsfte knaefsotilcj i ujbncako  jm m mzowckhnh yfdjg s j zoeaqsrkessid i
iaw  rinnxis yjfqpe antrszvodnd azgeppnpvni  qectaurvyqrsvrm ueqmdgfqv vu mvdnot
Validation set perplexity: 20.06
Average loss at step 100: 2.587039 learning rate: 10.000000
Minibatch perplexity: 10.23
Validation set perplexity: 10.88
Average loss at step 200: 2.243460 learning rate: 10.000000
Minibatch perplexity: 9.65
Validation set perplexity: 9.20
Average loss at step 300: 2.096400 learning rate: 10.000000
Minibatch perplexity: 7.61
Validation set perplexity: 7.86
Average loss at step 400: 1.998717 learning rate: 10.000000
Minibatch perplexity: 7.67
Validation set per

---
Problem 2
---------

We want to train an LSTM over bigrams, that is, pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings would produce a very sparse representation that is computationally wasteful.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

In [126]:
# Number of bigram ids carried by each training batch.
batch_size=64
# Number of new batches BigramBatchGenerator.next() produces per call.
num_unrollings=10

class BigramBatchGenerator(object):
  """Walks `batch_size` cursors through a text and emits batches of bigram ids.

  The bigram formed by characters (c1, c2) is encoded as the single integer
  char2id(c1) * vocabulary_size + char2id(c2).
  """

  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Start the cursors evenly spaced through the text, one per batch row.
    stride = self._text_size // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    self._last_batch = self._next_batch()


  def _next_batch(self):
    """Return one batch: the bigram id under each cursor, then advance every cursor by one."""
    ids = []
    final_index = self._text_size - 1
    for row in range(self._batch_size):
      position = self._cursor[row]
      leading = self._text[position]
      # The very last character has no successor; it is paired with a space.
      trailing = ' ' if position == final_index else self._text[position + 1]
      ids.append(char2id(leading) * vocabulary_size + char2id(trailing))
      # Cursors wrap around to the beginning of the text.
      self._cursor[row] = (position + 1) % self._text_size
    return ids


  def next(self):
    """Return num_unrollings + 1 batches.

    The first one is the last batch of the previous call, so consecutive calls
    overlap by exactly one batch; the rest are freshly generated.
    """
    fresh = [self._next_batch() for _ in range(self._num_unrollings)]
    batches = [self._last_batch] + fresh
    self._last_batch = batches[-1]
    return batches


bigram_train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings)
bigram_valid_batches = BigramBatchGenerator(valid_text, 1, 1)
# Validation uses batch size 1 and a single unrolling: each next() call yields
# two one-element batches (an input bigram and the bigram that follows it), so
# evaluation makes one lstm_cell call and one state/output update per step.

def characters_from_embed(embeddings):
  """Render each bigram id as the string '(xy)', where x and y are its two characters."""
  rendered = []
  for bigram_id in embeddings:
    first_id = bigram_id // vocabulary_size
    second_id = bigram_id % vocabulary_size
    rendered.append('(' + id2char(first_id) + id2char(second_id) + ')')
  return rendered

def bigram_characters(probabilities):
  """Map each row of a 1-hot encoding or probability distribution over bigram
  ids to its most likely bigram, formatted as '(x,y)'."""
  most_likely = np.argmax(probabilities, 1)
  labels = []
  for bigram_id in most_likely:
    first_char = id2char(bigram_id // vocabulary_size)
    second_char = id2char(bigram_id % vocabulary_size)
    labels.append('({0},{1})'.format(first_char, second_char))
  return labels


def bigram_batches2string(batches):
  """Convert a sequence of bigram-id batches into one string per batch row.

  Row k of the result is the concatenation, over all batches, of the rendered
  bigram found at position k of each batch.
  """
  rows = [''] * len(batches[0])
  for batch in batches:
    pieces = characters_from_embed(batch)
    rows = [so_far + piece for so_far, piece in zip(rows, pieces)]
  return rows

# Smoke test: decode two consecutive windows from the training generator and
# two from the validation generator. Consecutive windows overlap by one batch.
print(bigram_batches2string(bigram_train_batches.next()))
print(bigram_batches2string(bigram_train_batches.next()))
print(bigram_batches2string(bigram_valid_batches.next()))
print(bigram_batches2string(bigram_valid_batches.next()))

['(on)(ns)(s )( a)(an)(na)(ar)(rc)(ch)(hi)(is)', '(wh)(he)(en)(n )( m)(mi)(il)(li)(it)(ta)(ar)', '(ll)(le)(er)(ri)(ia)(a )( a)(ar)(rc)(ch)(he)', '( a)(ab)(bb)(be)(ey)(ys)(s )( a)(an)(nd)(d )', '(ma)(ar)(rr)(ri)(ie)(ed)(d )( u)(ur)(rr)(ra)', '(he)(el)(l )( a)(an)(nd)(d )( r)(ri)(ic)(ch)', '(y )( a)(an)(nd)(d )( l)(li)(it)(tu)(ur)(rg)', '(ay)(y )( o)(op)(pe)(en)(ne)(ed)(d )( f)(fo)', '(ti)(io)(on)(n )( f)(fr)(ro)(om)(m )( t)(th)', '(mi)(ig)(gr)(ra)(at)(ti)(io)(on)(n )( t)(to)', '(ne)(ew)(w )( y)(yo)(or)(rk)(k )( o)(ot)(th)', '(he)(e )( b)(bo)(oe)(ei)(in)(ng)(g )( s)(se)', '(e )( l)(li)(is)(st)(te)(ed)(d )( w)(wi)(it)', '(eb)(be)(er)(r )( h)(ha)(as)(s )( p)(pr)(ro)', '(o )( b)(be)(e )( m)(ma)(ad)(de)(e )( t)(to)', '(ye)(er)(r )( w)(wh)(ho)(o )( r)(re)(ec)(ce)', '(or)(re)(e )( s)(si)(ig)(gn)(ni)(if)(fi)(ic)', '(a )( f)(fi)(ie)(er)(rc)(ce)(e )( c)(cr)(ri)', '( t)(tw)(wo)(o )( s)(si)(ix)(x )( e)(ei)(ig)', '(ar)(ri)(is)(st)(to)(ot)(tl)(le)(e )( s)(s )', '(it)(ty)(y )( c)(ca)(an)(n )( b)(be)(e

In [127]:
def sample_embedding(prediction):
  """Draw one bigram id from a predicted distribution.

  Args:
    prediction: 2-D array-like; only row 0 is used, and it is handed to
      sample_distribution() as the distribution to sample from.

  Returns:
    An integer numpy array of shape [1] holding the sampled id.
  """
  # `np.int` was just an alias of the builtin `int` and has been removed from
  # recent NumPy releases; the builtin gives the same dtype everywhere.
  p = np.zeros(shape=[1,], dtype=int)
  p[0] = sample_distribution(prediction[0])
  return p

def bigram_random_distribution():
  """Generate a random row of probabilities over all bigram ids.

  Returns a float array of shape (1, embedding_size) whose entries sum to 1.
  """
  # NOTE(review): embedding_size appears to be assigned only in the later
  # graph-building cell -- confirm it exists when this runs on a fresh kernel.
  raw = np.random.uniform(0.0, 1.0, size=[1, embedding_size])
  row_totals = raw.sum(axis=1, keepdims=True)
  return raw / row_totals

# DAF: sanity check -- draw a random distribution, confirm that its single row
# sums to 1, then sample one bigram id from it.
d= bigram_random_distribution()
print(d)
print(sum(d[0]))
print(sample_embedding(d))

[[  1.44825698e-04   1.07784967e-03   2.55285164e-03   6.99662113e-04
    1.78662810e-03   1.84302053e-03   2.19808691e-03   1.22244053e-03
    2.15345549e-04   4.92993521e-05   9.94352749e-04   1.87747960e-04
    5.27131785e-04   1.07772411e-03   3.90018864e-04   1.19175427e-03
    7.10423578e-04   1.88689320e-03   1.16460165e-03   4.74822341e-04
    1.96917747e-03   1.05090196e-03   1.35940722e-03   1.13100011e-03
    7.27387067e-04   1.07568233e-04   2.76806607e-04   1.36206079e-03
    5.84003512e-04   1.45150692e-03   3.80752226e-04   2.45719465e-03
    3.26719181e-04   3.92169274e-04   8.80921418e-04   7.67155825e-04
    1.57438244e-03   1.35749739e-03   1.45031078e-03   1.49436242e-03
    3.42735174e-04   1.00775809e-03   5.85305703e-04   2.67090304e-03
    2.53403321e-03   5.48576807e-04   2.50746291e-03   2.32701138e-04
    2.03984656e-03   2.67804403e-04   2.23301463e-03   1.85972591e-03
    1.21422080e-04   2.64197061e-03   1.79395747e-03   8.05613606e-04
    6.50489955e-04  

In [128]:
num_nodes = 64
# Every ordered pair of characters gets its own id: vocabulary_size ** 2 ids.
embedding_size = vocabulary_size * vocabulary_size

graph = tf.Graph()
with graph.as_default():

  # Parameters:
  # All the gates: input, previous output, and bias.
  # cix is indexed by bigram id (embedding lookup) rather than multiplied by a
  # 1-hot input, so row k holds the gate contribution of bigram k.
  cix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1)) # [embedding_size, 4 * num_nodes]
  cim = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1)) # [num_nodes, 4 * num_nodes]
  cib = tf.Variable(tf.zeros([1, num_nodes * 4])) # [1, 4 * num_nodes]

  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # [batch_size, num_nodes]
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) # [batch_size, num_nodes]

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, embedding_size], -0.1, 0.1)) # [num_nodes, embedding_size]
  b = tf.Variable(tf.zeros([embedding_size])) # [embedding_size]

  # Definition of the cell computation.
  def lstm_cell(i, o, state, train=False):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args:
      i: int32 tensor of bigram ids, one per batch row (shape [batch]).
      o: previous cell output, shape [batch, num_nodes].
      state: previous cell state, shape [batch, num_nodes].
      train: when True, dropout (0.5) is applied to the looked-up input
        contribution only; the recurrent path is left untouched.

    Returns:
      A (new_output, new_state) pair, each of shape [batch, num_nodes].
    """
    # Looking up rows of cix gives the [batch, 4 * num_nodes] input contribution.
    embed = tf.nn.embedding_lookup(cix, i)
    if train:
      embed = tf.nn.dropout(embed, 0.5)

    # One fused pre-activation for the four gates, sliced below in the order
    # input gate, forget gate, candidate update, output gate.
    all_gates = embed + tf.matmul(o, cim) + cib # [batch, 4 * num_nodes]

    input_gate = tf.sigmoid(all_gates[:, 0:num_nodes])
    forget_gate = tf.sigmoid(all_gates[:, num_nodes:2*num_nodes])
    update = all_gates[:, 2*num_nodes:3*num_nodes]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(all_gates[:, 3*num_nodes:])

    return output_gate * tf.tanh(state), state

  # Input data: each placeholder receives one batch of bigram ids.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))

  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state

  for i in train_inputs:
    # train=True switches dropout on for the training path. Without it the
    # `train` flag of lstm_cell is never set and dropout is silently skipped.
    output, state = lstm_cell(i, output, state, train=True)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):

    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b) # [num_unrollings * batch_size, embedding_size]

    # The integer labels are expanded to 1-hot rows to match the logits.
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.one_hot(tf.concat(train_labels,0), embedding_size)))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate)

  # First half of minimize(): a list of (gradient, variable) pairs.
  gradients, v = zip(*optimizer.compute_gradients(loss))

  # Gradient clipping is applied between the two halves of minimize().
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

  # Second half of minimize(): the op that applies the clipped gradients.
  # Note that `optimizer` is rebound from the optimizer object to that op.
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits) # [num_unrollings * batch_size, embedding_size]

  # Sampling and validation eval: batch 1, no unrolling, no dropout.
  sample_input = tf.placeholder(tf.int32, shape=[1]) # a single bigram id
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes])) # [1, num_nodes]
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes])) # [1, num_nodes]

  reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])),
                                saved_sample_state.assign(tf.zeros([1, num_nodes])))

  sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)

  with tf.control_dependencies([saved_sample_output.assign(sample_output), saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) # distribution over bigram ids, [1, embedding_size]

In [129]:
# Trains the bigram LSTM graph, printing the running loss, the minibatch
# perplexity and the validation perplexity every `summary_frequency` steps,
# plus generated text every 10 summaries.
#num_steps = 7001
num_steps = 1101
summary_frequency = 100

with tf.Session(graph=graph) as session:

  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0

  for step in range(num_steps):

    # One batch of bigram ids per placeholder: num_unrollings inputs plus the final label.
    batches = bigram_train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]

    _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l

    if step % summary_frequency == 0:

      # At step 0 only a single loss has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0

      # Labels are every batch but the first, flattened to one id per prediction row.
      labels = np.concatenate(list(batches)[1:])
      # logprob() is fed 1-hot labels, so expand the ids to the shape of `predictions`.
      noembed_labels = np.zeros(predictions.shape)
      noembed_labels[np.arange(labels.shape[0]), labels] = 1.0

      print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, noembed_labels))))

      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          # The first bigram of each sequence is sampled from a random
          # distribution; `feed` is an id array of shape [1].
          feed = sample_embedding(bigram_random_distribution())
          sentence = characters_from_embed(feed)[0]
          reset_sample_state.run()

          for _ in range(79):
            # `feed` drives sample_prediction through the sample_input placeholder;
            # the next bigram is then sampled from the returned prediction.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample_embedding(prediction)
            sentence += characters_from_embed(feed)[0]

          print(sentence)
        print('=' * 80)

      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0

      for _ in range(valid_size):
        b = bigram_valid_batches.next() # two one-element batches: input bigram, next bigram
        predictions = sample_prediction.eval({sample_input: b[0]}) # predict from the first bigram
        # Size the 1-hot label from the prediction itself, exactly as the
        # minibatch branch does (`bigram_size` is not defined in this notebook).
        labels = np.zeros(predictions.shape)
        labels[0, b[1]] = 1.0
        valid_logprob = valid_logprob + logprob(predictions, labels) # the label is the second bigram

      print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.590238 learning rate: 10.000000
Minibatch perplexity: 727.95
(nn)(ew)(yq)(vz)(tc)(vy)(wv)( n)(cp)(dn)(x )(th)(th)(kg)(qn)(dq)(yz)(lf)(jp)(pj)(c )(kk)(av)(if)(vp)(bk)(ov)(it)(ti)(ef)(am)(ok)(th)(kz)(va)(np)(py)(fu)(pp)(rt)(qm)(fd)(pn)(hd)(xj)(nc)(ib)(mb)(zl)(hz)(ie)(qu)(bn)(sa)(bz)(wg)(gw)(c )(wp)(no)(mj)(ng)(uc)(vr)(mi)(yd)(ae)(tf)(op)(lf)(yt)(vl)(io)(oq)(ss)(jd)(ii)(lo)(ma)(al)
(gt)(ze)(  )(ps)(wf)(uv)(xq)(mf)(vy)(ah)(ri)(ti)(bm)(rt)(lh)(lh)(ue)(zg)(bo)(io)(ql)(tk)(bk)(av)(j )(qi)(yb)(yb)(zv)(td)(jw)( z)(qx)(lv)(oq)(hy)(li)(u )(oz)(n )(ku)(lx)(ef)(ab)(uv)(nc)(tf)(gs)(lt)(ha)(wc)(nb)(wj)(iu)(ew)(tv)(lu)(oi)(je)(zh)(tu)(l )(yg)(tp)(lv)(jf)(oz)(cf)(ey)(uo)(rq)(ru)(ca)(ib)(bw)(kq)(f )(mh)(tr)(no)
(rn)(mb)(pd)(dy)(an)(hz)(gf)(lp)(ex)(os)(zn)( l)(ic)(xu)(aj)(dl)(ag)(xf)(cs)(uy)(qv)(cq)(oq)(ll)(cw)(dv)(nh)(kh)(kz)(te)(ht)(vp)(cn)(go)(bg)(pn)(us)(nq)(rl)(dy)( b)(qm)(lk)(kf)(ph)(ff)(au)(vt)(lr)(hu)(ks)(kp)(tu)(of)(mm)(qb)(tk)(ap)(xf)(wp)(pg)(wd)(nl)(sd)(nq

KeyboardInterrupt: 

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [3]:

import sys
# Append the `contrib` directory of the installed TensorFlow package to the
# module search path, then print the package location and the resulting path.
# Note that the printed output contains absolute, machine-specific paths.
sys.path.append(tf.__path__[0]+'/contrib')
print(tf.__path__[0])
print(sys.path)

/home/david/anaconda2/lib/python2.7/site-packages/tensorflow
['', '/home/david/anaconda2/lib/python27.zip', '/home/david/anaconda2/lib/python2.7', '/home/david/anaconda2/lib/python2.7/plat-linux2', '/home/david/anaconda2/lib/python2.7/lib-tk', '/home/david/anaconda2/lib/python2.7/lib-old', '/home/david/anaconda2/lib/python2.7/lib-dynload', '/home/david/anaconda2/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/david/anaconda2/lib/python2.7/site-packages/setuptools-20.3-py2.7.egg', '/home/david/anaconda2/lib/python2.7/site-packages', '/home/david/anaconda2/lib/python2.7/site-packages/IPython/extensions', '/home/david/.ipython', '/home/david/anaconda2/lib/python2.7/site-packages/tensorflow/contrib']


In [9]:
#DAF: seq2seq_model.py and data_utils.py were copied into the working directory from /home/david/coursera/models/tutorials/rnn/translate
import seq2seq_model as seq2seq_model

In [10]:
text = "the quick brown fox jumps over the lazy dog is an english sentence that can be translated to the following french one le vif renard brun saute par dessus le chien paresseux here is an extremely long french word anticonstitutionnellement"

def longest_word_size(text):
    """Return the length of the longest whitespace-separated word in `text`.

    Raises ValueError when `text` contains no words.
    """
    return max(len(word) for word in text.split())

word_size = longest_word_size(text)
print(word_size)

25


In [11]:
import string

num_nodes = 64
batch_size = 10

def create_model():
    """Build the Seq2SeqModel used for the word-reversal task.

    Source and target share the character vocabulary, and a single bucket
    sized from the longest word is used. The values marked "tutorial" are
    inherited unchanged from the translate tutorial.
    """
    # Only one bucket: encoder length word_size + 1, decoder length word_size + 2.
    single_bucket = (word_size + 1, word_size + 2)
    settings = dict(
        source_vocab_size=vocabulary_size,   # one id per character
        target_vocab_size=vocabulary_size,   # one id per character
        buckets=[single_bucket],
        size=num_nodes,                      # units per layer
        num_layers=3,                        # tutorial
        max_gradient_norm=5.0,               # tutorial
        batch_size=batch_size,
        learning_rate=0.5,                   # tutorial
        learning_rate_decay_factor=0.99,     # tutorial
        use_lstm=True,
        forward_only=False,                  # tutorial
    )
    return seq2seq_model.Seq2SeqModel(**settings)

In [16]:
def get_batch():
    """Generate one random training batch for the word-reversal task.

    Each of the batch_size items is a random "word" of r ids drawn from
    [1, vocabulary_size), with r chosen uniformly in [1, word_size].

    Returns:
        A tuple (encoder_inputs, decoder_inputs, weights), all transposed so
        that each column is one batch item:
          * encoder_inputs, shape (word_size + 1, batch_size): the word
            followed by zeros.
          * decoder_inputs, shape (word_size + 2, batch_size): a leading 0,
            the reversed word, then zeros.
          * weights, shape (word_size + 2, batch_size): 1.0 for the first
            r + 1 positions, 0.0 afterwards.
    """
    # `range` instead of the Python-2-only `xrange`, so the cell also runs on Python 3.
    encoder_inputs = [np.random.randint(1, vocabulary_size, word_size + 1) for _ in range(batch_size)]
    decoder_inputs = [np.zeros(word_size + 2, dtype=np.int32) for _ in range(batch_size)]
    weights = [np.ones(word_size + 2, dtype=np.float32) for _ in range(batch_size)]
    for i in range(batch_size):
        r = random.randint(1, word_size)
        # leave at least a 0 at the end
        encoder_inputs[i][r:] = 0
        # one 0 at the beginning of the reversed word, one 0 at the end
        decoder_inputs[i][1:r+1] = encoder_inputs[i][:r][::-1]
        weights[i][r+1:] = 0.0
    return np.transpose(encoder_inputs), np.transpose(decoder_inputs), np.transpose(weights)

In [15]:
#DAF understanding of the BATCH GENERATION at EACH STEP
# Same steps as get_batch(), run inline so the intermediate arrays can be printed.
# `range` replaces the Python-2-only `xrange`.

encoder_inputs = [np.random.randint(1, vocabulary_size, word_size + 1) for _ in range(batch_size)] # batch_size arrays of length word_size + 1
decoder_inputs = [np.zeros(word_size + 2, dtype=np.int32) for _ in range(batch_size)] # batch_size arrays of length word_size + 2
weights = [np.ones(word_size + 2, dtype=np.float32) for _ in range(batch_size)] # batch_size arrays of length word_size + 2

for i in range(batch_size):
    r = random.randint(1, word_size)
    if i==0: pos=r # remember the word length of the first item for the printout below
    # leave at least a 0 at the end
    encoder_inputs[i][r:] = 0
    # one 0 at the beginning of the reversed word, one 0 at the end
    decoder_inputs[i][1:r+1] = encoder_inputs[i][:r][::-1]
    weights[i][r+1:] = 0.0

print("EJM DE UNA FILA (1/10) con r=", pos, ":\n")
print("Un encoder_input: ", encoder_inputs[0]) # zero-filled from position r onwards
print("Su decoder_inputs: ", decoder_inputs[0]) # starts as all zeros; the reversed input is written from position 1, leaving zeros at the end
print("Los weights para ellos: ", weights[0]) # starts as all ones; zero-filled from position r + 1 onwards
# The weights therefore stay at 1 through position r, the last position that
# can hold a non-zero decoder input, and are 0 after it.

# All three structures are returned transposed: the rows become columns, so
# each column is one item of the batch.
print("\nLas 3 estructuras se devuelven Transposed. 10 FILAS se convierten en 10 COLUMNAS:\n")
print("np.transpose(encoder_inputs): ", np.transpose(encoder_inputs).shape, "\n", np.transpose(encoder_inputs))
print("\nnp.transpose(decoder_inputs): ", np.transpose(decoder_inputs).shape, "\n", np.transpose(decoder_inputs))
print("\nnp.transpose(weights): ", np.transpose(weights).shape, "\n", np.transpose(weights))

EJM DE UNA FILA (1/10) con r= 14 :

Un encoder_input:  [ 3  3  8 21 21 15 24  5 22 25 12 19 11 25  0  0  0  0  0  0  0  0  0  0  0
  0]
Su decoder_inputs:  [ 0 25 11 19 12 25 22  5 24 15 21 21  8  3  3  0  0  0  0  0  0  0  0  0  0
  0  0]
Los weights para ellos:  [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.]

Las 3 estructuras se devuelven Transposed. 10 FILAS se convierten en 10 COLUMNAS:

np.transpose(encoder_inputs):  (26, 10) 
 [[ 3  7 21 19 11  8  3  4 10 26]
 [ 3 16 17  0  0 25 11 12 25 20]
 [ 8 19 20  0  0 24 26 25  7 24]
 [21 22  0  0  0 19 21  7 15 11]
 [21  2  0  0  0 21 14  4  9 14]
 [15  0  0  0  0 23 18 20 11 22]
 [24  0  0  0  0 15 23  7 10 19]
 [ 5  0  0  0  0 10 21  0 24 12]
 [22  0  0  0  0 22 21  0  5  7]
 [25  0  0  0  0  3 21  0 12  2]
 [12  0  0  0  0  8 10  0 14  2]
 [19  0  0  0  0  2  2  0  7 16]
 [11  0  0  0  0 11  4  0  8  3]
 [25  0  0  0  0  1 10  0 26 14]
 [ 0  0  0  0  0 22  6  0 24  6]
 [ 

In [61]:
def strip_zeros(word):
    """Remove the space characters at both ends of a decoded word.

    Per the original note, id 0 is the code for space in char2id(), so zero
    padding shows up as spaces once decoded.
    """
    without_leading = word.lstrip(' ')
    return without_leading.rstrip(' ')

def evaluate_model(model, sess, words, encoder_inputs):
    """Greedily decode one batch and count the words that were reversed exactly.

    During training, get_batch() supplies consistent decoder inputs and
    weights. Here the model is only used to estimate outputs (forward_only), so
    decoder_inputs and target_weights start out empty and are filled in step by
    step from the model's own predictions.

    Args:
        model: object exposing step(sess, encoder_inputs, decoder_inputs,
            target_weights, bucket_id=..., forward_only=...), such as the one
            built by create_model().
        sess: TensorFlow session handed through to model.step().
        words: the original words of the batch; words[idx] (reversed) is the
            label for decoded column idx.
        encoder_inputs: encoder input array for the batch, passed unchanged to
            model.step().

    Returns:
        The number of words whose decoded output equals the reversed word.
    """
    correct = 0

    decoder_inputs = np.zeros((word_size + 2, batch_size), dtype=np.int32)
    target_weights = np.zeros((word_size + 2, batch_size), dtype=np.float32)
    target_weights[0,:] = 1.0 # only the first position is weighted on entry

    # One flag per batch item, set once that item has emitted id 0.
    is_finished = np.full(batch_size, False, dtype=np.bool_)

    # `range` instead of the Python-2-only `xrange`, so the cell also runs on Python 3.
    for i in range(word_size + 1): # one step per character position

        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id=0, forward_only=True)

        # Author's notes: output_logits is a list with one entry per decoder
        # position, and output_logits[i] has one row per batch item and one
        # column per vocabulary id (TODO confirm against seq2seq_model).
        # p is the most likely id at position i for every item of the batch.
        p = np.argmax(output_logits[i], axis=1)

        # An item is finished as soon as it predicts id 0; from then on its
        # decoder input and weight are forced to 0.
        is_finished = np.logical_or(is_finished, p == 0)

        # NOTE(review): the prediction for step i is written back at row i, which
        # also overwrites row 0 (all zeros on entry) after the first step --
        # confirm this alignment is what the model expects.
        decoder_inputs[i,:] = (1 - is_finished) * p

        target_weights[i,:] = (1.0 - is_finished) * 1.0

    # Debug output: the decoded id matrix and the first-position predictions
    # from the final model.step() call.
    print("decoder_inputs: \n",decoder_inputs, decoder_inputs.shape)
    first_position = np.argmax(output_logits[0], axis=1)
    print("P es ", first_position, first_position.shape,"\n")

    # Evaluation: compare each decoded word (a column of decoder_inputs) with
    # the reversed original word, which acts as the label.
    for idx, l in enumerate(np.transpose(decoder_inputs)):

        reversed_word = ''.join(reversed(words[idx]))

        output_word = strip_zeros(''.join(id2char(char_id) for char_id in l))

        print(words[idx], '(reversed: {0})'.format(reversed_word),
              '-> [', output_word, '] ({0})'.format('OK' if reversed_word == output_word else 'KO'))

        if reversed_word == output_word:
            correct += 1

    return correct

In [None]:
def get_validation_batch(words):
    """Encode a group of words as encoder inputs for the model.

    Args:
        words: at most batch_size words, each at most word_size + 1 characters
            long (longer input raises IndexError).

    Returns:
        An int32 array of shape (word_size + 1, batch_size); column i holds the
        character ids of words[i] from row 0, padded with zeros.
    """
    # `range` instead of the Python-2-only `xrange`, so the cell also runs on Python 3.
    encoder_inputs = [np.zeros(word_size + 1, dtype=np.int32) for _ in range(batch_size)]
    for i, word in enumerate(words):
        for j, c in enumerate(word):
            encoder_inputs[i][j] = char2id(c)
    return np.transpose(encoder_inputs)

#DAF: words = range_words[i] = 10 words ->
#encoder_inputs = list of 10 items (one per word), each of size 26 (longest word + 1).
#From position 0 onwards: the letter ids, the rest are 0s. Returned transposed: shape = 26 rows by 10 columns

def validate_model(text, model, sess):
    """Evaluate the model on the words of `text` and print the accuracy.

    The words are processed in groups of batch_size; trailing words that do not
    fill a whole batch are ignored.

    Args:
        text: whitespace-separated words to reverse.
        model: model forwarded to evaluate_model().
        sess: TensorFlow session forwarded to evaluate_model().
    """
    words = text.split()
    # Floor division keeps these integers on Python 3 as well as Python 2.
    nb_batches = len(words) // batch_size
    nb_words = nb_batches * batch_size

    correct = 0
    for i in range(nb_batches):
        range_words = words[i * batch_size:(i + 1) * batch_size]

        encoder_inputs = get_validation_batch(range_words)

        correct += evaluate_model(model, sess, range_words, encoder_inputs)

    # Fewer words than one batch means nothing was evaluated: report 0% rather
    # than dividing by zero.
    accuracy = (float(correct) / nb_words) * 100 if nb_words else 0.0
    print('* correct: {0}/{1} -> {2}%'.format(correct, nb_words, accuracy))
    print()

In [33]:
#DAF understanding
# Walk-through of how validate_model() groups the words of `text` into batches.
# Floor division and `range` keep the cell working on Python 3 as well.
words = text.split()
nb_words = (len(words) // batch_size) * batch_size # number of words that fill whole batches
print("len(words): ", len(words))
print("nb_words: ", nb_words)
print("nb_words / batch_size: ", nb_words // batch_size)
for i in range(nb_words // batch_size):
    range_words = words[i * batch_size:(i + 1) * batch_size] # the i-th group of batch_size consecutive words
    encoder_inputs = get_validation_batch(range_words)
    print("\n", i, " range_words: ", range_words , len(range_words))
    print("encoder_inputs: ", encoder_inputs, encoder_inputs.shape)

len(words):  40
nb_words:  40
nb_words / batch_size:  4

 0  range_words:  ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'is'] 10
encoder_inputs:  [[20 17  2  6 10 15 20 12  4  9]
 [ 8 21 18 15 21 22  8  1 15 19]
 [ 5  9 15 24 13  5  5 26  7  0]
 [ 0  3 23  0 16 18  0 25  0  0]
 [ 0 11 14  0 19  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [

In [62]:
def reverse_text(nb_steps):
    """Train a fresh word-reversal model for `nb_steps` steps.

    The model is validated on the module-level `text` whenever
    step % 1000 == 1, and once more after the last step.

    Args:
        nb_steps: number of training steps; each one uses a new random batch
            from get_batch().
    """
    with tf.Session() as session:
        model = create_model()
        tf.global_variables_initializer().run()

        # Defined up front so the final report also works when nb_steps is 0.
        loss = None
        # `range` instead of the Python-2-only `xrange`, so the cell also runs on Python 3.
        for step in range(nb_steps):
            enc_inputs, dec_inputs, weights = get_batch()
            _, loss, _ = model.step(session, enc_inputs, dec_inputs, weights, 0, False)

            if step % 1000 == 1:
                print('* step:', step, 'loss:', loss)
                validate_model(text, model, session)

        print('*** evaluation! loss:', loss)
        validate_model(text, model, session)

In [63]:
#%time reverse_text(15000)
tf.reset_default_graph()
%time reverse_text(1001)

* step: 1 loss: 3.28325
DESPUES DEL BUCLE
decoder_inputs:  [[12 12 12 12 12 12 12 12 12 12]
 [12 12 12 12 12 12 12 12 12 12]
 [12 12 12 12 12 12 12 12 12 12]
 [12 12 12 12 12 12 12 12 12 12]
 [ 0 12  0  0 12  0  0  0  0  0]
 [ 0  0  0  0 12  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]] (27, 10)
P es  [12 12 12 12 12 12 12 12 12 12] (1

In [None]:
tf.reset_default_graph()
%time reverse_text(30000)