In [None]:
#### LSTM-base-language-model

- Model:
- Data: 

In [1]:
import tensorflow as tf

import numpy as np

import gensim

import preprocess_helper

In [2]:
# define a class Config 
class Config():
  
  def __init__(self, path_to_config_file=None):
    if path_to_config_file:
      read(self, path_to_config_file)
    else:
      # default constructor 
      self.is_training = True          # define if the code is run for training or testing 
      self.sentence_length = 30        # number of words per sentence (ie. how many times we should enroll the network)
      self.batch_size = 64             # number of sentences analysed per batch 
      self.embedding_dimensions = 100  # dimension of the embedding 
      self.state_size = 512            # dimension of the hidden state 
      self.max_grad_norm = 10          # max norm of the gradient 
      self.vocabulary_size = 20004     # vocabulary size 
      self.number_of_epochs = 10       # number of epochs used during training 
      self.learning_rate = 1           # learning rate 
      self.path_to_word2vec =  'wordembeddings-dim100.word2vec' # path to word2vec model 
      self.use_word2vec_emb = False    # if training is done with Word2Vec or with a rand emb
      self.verbose = False             # simple verbose param to follow training 
      self.save_model = False          # if we should save the model after training 
      self.restored_model = ''         # path where the model was saved to restore it and test it 
    
  def read(self, path_to_config_file):
    # implement fc to read config params 
    raise ValueError('Method not implemented yet.')

In [3]:
class Model():
  
  def __init__(self, is_training):
    # get all the config params 
    config = Config()
    
    inputs = tf.placeholder(dtype=tf.float32,
                            shape=[config.batch_size, config.sentence_length, config.embedding_dimensions],
                            name='inputs')
    
    labels = tf.placeholder(dtype=tf.int32,
                            shape=[config.batch_size, config.sentence_length],
                            name='labels')

    # construct basic LSTM cell 
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(config.state_size)
    # build-in tensorflow function to enroll the LSTM
    self.initial_state = lstm_cell.zero_state(config.batch_size, tf.float32)
    inputs = tf.unstack(inputs, num=config.sentence_length, axis=1)
    output, state = tf.nn.static_rnn(lstm_cell, inputs, dtype=tf.float32)
    output = tf.reshape(output, [config.sentence_length*config.batch_size, config.state_size])
    # project state size on the vocab size dim = state_size x vocabulary_size 
    weights = tf.get_variable("weights",
                              [config.state_size, config.vocabulary_size],
                              dtype=tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer())
    # add a bias dim = vocabulary_size 
    bias = tf.get_variable("bias",
                           [config.vocabulary_size],
                           dtype=tf.float32,
                           initializer=tf.contrib.layers.xavier_initializer())
    # compute the logits 
    logits = tf.matmul(output, weights) + bias
    # reshape logits to dim = batch_size x num_steps x vocabulary_size 
    logits = tf.reshape(logits, [config.sentence_length,
                                 config.batch_size,
                                 config.vocabulary_size])
    # define proba with softmax layer with dim = batch_size x num_steps x vocabulary_size 
    self.probabilities = tf.nn.softmax(logits)
    
    if not is_training:
      return 
    
    # should probably reshape before 
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=logits)
    
    # tf.contrib.seq2seq.sequence_loss
    self.loss = tf.reduce_sum(loss)
    
    # optimizer and minimize ...
    optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)

    minimize = optimizer.minimize(loss)

In [None]:
def build_lstm_graph(lstm_cell, input_batch):
  # cell: tensorflow LSTM object
  # batch: tensorflow of shape sent_len x batch_size x emb_dim
  
  # return value is a N-D tensor of shape [batch_size, state_size] filled with zeros.
  initial_state = lstm_cell.zero_state(batch_size, tf.float32)
  
  # init state is the init one 
  state = initial_state

  # where to store the cell_output after each time_step 
  outputs = []
  
  for time_step in range(num_steps):
    # see if there is a better way to reuse the variables 
    if time_step > 0:
      tf.get_variable_scope().reuse_variables()
    # given the current state 'state' and the input_batch, compute the new state and the cell_output 
    #  - cell_output: (batch_size, output_size) What is output_size here ??
    #  - state:  (batch_size, state_size)
    cell_output, state = lstm_cell(input_batch[:, time_step, :], state)
    outputs.append(cell_output)

  # tf.reshape(tensor, shape)
  # tf.concat(values, axis)
  output = tf.reshape(tf.concat(outputs, 1), [-1, state_size])
  
  return output, state

In [None]:
# TO KEEP (TO INTEGRATE IN THE MODEL)

# our own implementation of tf.nn.static_rnn
output, state = build_lstm_graph(lstm_celllstm_cell, batch)

# Calling minimize() takes care of both computing the gradients and applying them to the variables.
# If you want to process the gradients before applying them you can instead use the optimizer in four steps:
#   1- Define an optimizer 
#   2- Compute the gradients with compute_gradients().
#   3- Process the gradients as you wish.
#   4- Apply the processed gradients with apply_gradients()

# So instead of:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize =  optimizer.minimize(loss)

# Use:
# 1) define the optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# 2) compute the gradients
gradients_and_vars = optimizer.compute_gradients(loss)
# 3) process the gradient 
processed_gradients_and_vars = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients_and_vars]
# 4) apply the gradients.
train_optimizer = optimizer.apply_gradients(processed_gradients_and_vars)

In [None]:
tf.reset_default_graph()

def main():
  
  # get config execution
  config = Config()
  # get frequent words list 
  frequent_words = preprocess_helper.load_frequent_words('google-10000-english/20k.txt')
  # construct hash map 
  vocab = {word: i for i,word in enumerate(frequent_words)}
  # get testing and training data
  #train_data, train_labels = preprocess_helper.load_and_process_data('data/sentences.train', frequent_words, 28)
  all_train_data, all_train_labels = preprocess_helper.load_and_process_data('data/sentences.eval',
                                                                             vocab,
                                                                             frequent_words,
                                                                             28)
  # compute number of batches
  number_of_batches = int(len(all_train_data)/config.batch_size)
  # init tensor flow session 
  session = tf.Session()

  with session.as_default():
    
    # get the embedding matrix (with rand init emb or with word2vec)
    with tf.variable_scope("Embedding", reuse=tf.AUTO_REUSE):
      embedding = tf.get_variable("embedding",
                                  [config.vocabulary_size, config.embedding_dimensions],
                                  dtype=tf.float32)
      load_embedding(session, vocab, embedding, config)
      
      # placeholder to get train_input and labels (batch_size x sent_len)
      train_input_ph = tf.placeholder(tf.int64,
                                      [config.batch_size, config.sentence_length],
                                      name='train_input_ph')
      train_labels_ph = tf.placeholder(tf.int64,
                                      [config.batch_size, config.sentence_length],
                                      name='train_labels_ph')

      # generate 'usable' input_data for TF 
      embedded_train_input = tf.nn.embedding_lookup(embedding,
                                                    train_input_ph)
    
    # Create a model instance 
    initializer = tf.contrib.layers.xavier_initializer()
    with tf.variable_scope("Model", reuse=tf.AUTO_REUSE, initializer=initializer):
      # create an Model instance 
      model = Model(is_training=config.is_training)
      # init 
      init = tf.global_variables_initializer()
      session.run(init)

      # loop over each epoch 
      for epoch_id in range(config.number_of_epochs):
        # define perplexity and total_loss across all the batches
        perplexity = 0
        total_loss = 0
        total_iters = 0
        # loop over each batch 
        for batch_id in range(number_of_batches):
          
          # extract batch_size sentences from the training data 
          training_batch = all_train_data[batch_id*config.batch_size:(batch_id+1)*config.batch_size]
          training_batch = session.run(embedded_train_input, {train_input_ph: training_batch})
          
          training_labels = all_train_labels[batch_id*config.batch_size:(batch_id+1)*config.batch_size,:]
          
          # variable to fect in the graph 
          fetches = {
            "loss": model.loss,
            "probabilities": model.probabilities
          }

          # input variables of the graph 
          feed_dict = {
            "Model/inputs:0": training_batch,
            "Model/labels:0": training_labels
          }

          # Feed the model with the training_batch and the training_labels 
          #vals = session.run(fetches=fetches, feed_dict=feed_dict)
                    
          #total_loss += vals["loss"]
          print('Total loss: ',total_loss)
          total_iters += config.sentence_length
          
          print('Batch:', batch_id)
          print('Batch data: ',np.shape(training_batch))
          print('Batch labels: ', np.shape(training_labels))
          
        print('Epoch: ',epoch_id, 'with perplexity: ', np.exp(total_loss/float(total_iters)))
        
  # finally close the session ...
  session.close()

if __name__ == "__main__":
  main()


- Number of sentences loaded:  9753
Loading external embeddings from wordembeddings-dim100.word2vec
Generated embedding for 20004 words


In [None]:
#variables_names = [v.name for v in tf.trainable_variables()]
#values = session.run(variables_names)
#for k, v in zip(variables_names, values):
#    print("Variable: ", k)
#    print("Shape: ", v.shape)
#    print(v)


#for op in tf.get_default_graph().get_operations():
#  print(str(op.name))  

In [5]:
frequent_words = preprocess_helper.load_frequent_words('google-10000-english/20k.txt')

def load_embedding(session, vocab, emb, config):
  '''
    session        Tensorflow session object
    vocab          A dictionary mapping token strings to vocabulary IDs
    emb            Embedding tensor of shape vocabulary_size x dim_embedding
    path           Path to embedding file
    dim_embedding  Dimensionality of the external embedding.
  '''
  print("Loading external embeddings from %s" % config.path_to_word2vec)
  model = gensim.models.KeyedVectors.load_word2vec_format(config.path_to_word2vec, binary=False)
  external_embedding = np.zeros(shape=(config.vocabulary_size, config.embedding_dimensions))
  matches = 0
  for tok, idx in vocab.items():
    if config.use_word2vec_emb and tok in model.vocab:
      external_embedding[idx] = model[tok]
      matches += 1
    else:
      if config.verbose:
        print("%s not in embedding file" % tok)
      external_embedding[idx] = np.random.uniform(low=-0.25, high=0.25, size=config.embedding_dimensions)
  
  if config.use_word2vec_emb:
    print("%d words out of %d could be loaded" % (matches, config.vocabulary_size))
  else:
    print("Generated embedding for %d words" % config.vocabulary_size)

  pretrained_embeddings = tf.placeholder(tf.float32,
                                         [None, None],
                                         name='pretrained_embedding')
  assign_op = emb.assign(pretrained_embeddings)
  session.run(assign_op, {pretrained_embeddings: external_embedding})
