#### LSTM-base-language-model

- Model:
- Data: 

In [13]:
import tensorflow as tf

import numpy as np
import time 
import json

import gensim

import preprocess_helper

In [14]:
# Functions to load the embedding and to enroll the network (needed to execute the main())

def load_embedding(session, vocab, emb, config):
  '''
    session        Tensorflow session object
    vocab          A dictionary mapping token strings to vocabulary IDs
    emb            Embedding tensor of shape vocabulary_size x dim_embedding
    path           Path to embedding file
    dim_embedding  Dimensionality of the external embedding.
  '''
  print("Loading external embeddings from %s" % config.path_to_word2vec)
  model = gensim.models.KeyedVectors.load_word2vec_format(config.path_to_word2vec, binary=False)
  external_embedding = np.zeros(shape=(config.vocabulary_size, config.embedding_dimensions))
  matches = 0
  for tok, idx in vocab.items():
    if config.use_word2vec_emb and tok in model.vocab:
      external_embedding[idx] = model[tok]
      matches += 1
    else:
      if config.verbose:
        print("%s not in embedding file" % tok)
      external_embedding[idx] = np.random.uniform(low=-0.25, high=0.25, size=config.embedding_dimensions)
  
  if config.use_word2vec_emb:
    print("%d words out of %d could be loaded" % (matches, config.vocabulary_size))
  else:
    print("Generated embedding for %d words" % config.vocabulary_size)

  pretrained_embeddings = tf.placeholder(tf.float32,
                                         [None, None],
                                         name='pretrained_embedding')
  assign_op = emb.assign(pretrained_embeddings)
  session.run(assign_op, {pretrained_embeddings: external_embedding})
  
def build_lstm_graph(lstm_cell, input_batch, config):
  # cell: tensorflow LSTM object
  # batch: tensorflow of shape sent_len x batch_size x emb_dim
  
  # return value is a N-D tensor of shape [batch_size, state_size] filled with zeros.
  initial_state = lstm_cell.zero_state(config.batch_size, tf.float32)
  
  # init state is the init one 
  state = initial_state

  # where to store the cell_output after each time_step 
  output = []
  for time_step in range(config.sentence_length):
    cell_output, state = lstm_cell(input_batch[:, time_step, :], state)
    if(time_step != config.sentence_length-1):
      output.append(cell_output)
  output = tf.reshape(output, [config.predicted_words * config.batch_size, config.state_size])  
  return output, state

In [15]:
# define a class Config 
class Config():
  
  def __init__(self, path_to_config_file=None):
    if path_to_config_file:
      self.read(path_to_config_file)
    else:
      # default constructor 
      self.is_training = True          # define if the code is run for training or testing 
      self.sentence_length = 30        # number of words per sentence (ie. how many times we should enroll the network)
      self.predicted_words = 29        # number of predicted words per sentence = sentence_length - 1
      self.batch_size = 64             # number of sentences analysed per batch 
      self.embedding_dimensions = 100  # dimension of the embedding 
      self.state_size = 512            # dimension of the hidden state 
      self.max_grad_norm = 5           # max norm of the gradient 
      self.vocabulary_size = 20004     # vocabulary size 
      self.number_of_epochs = 20       # number of epochs used during training 
      self.learning_rate = 0.1         # learning rate 
      self.path_to_word2vec =  'wordembeddings-dim100.word2vec' # path to word2vec model 
      self.use_word2vec_emb = False    # if training is done with Word2Vec or with a rand emb
      self.verbose = False             # simple verbose param to follow training 
      self.save_model = True           # if we should save the model after training 
      self.saving_path = 'model/toy-model.ckpt'
      self.restored_model = ''         # path where the model was saved to restore it and test it 
      
      
  def read(self, path_to_config_file):
    with open(path_to_config_file) as data_file:    
      raw = json.load(data_file)
      
      self.is_training = raw['is_training']
      self.sentence_length = raw['sentence_length']
      self.predicted_words = raw['predicted_words']
      self.batch_size = raw['batch_size']
      self.embedding_dimensions = raw['embedding_dimensions']
      self.state_size = raw['state_size']
      self.max_grad_norm = raw['max_grad_norm']
      self.vocabulary_size = raw['vocabulary_size']
      self.number_of_epochs = raw['number_of_epochs']
      self.learning_rate = raw['learning_rate']
      self.path_to_word2vec = raw['path_to_word2vec']
      self.use_word2vec_emb = raw['use_word2vec_emb']
      self.verbose = raw['verbose']
      self.save_model = raw['save_model']
      self.saving_path = raw['saving_path']
      self.restored_model = raw['restored_model']      
              

In [16]:
class Model():
  
  def __init__(self, config):

    inputs = tf.placeholder(dtype=tf.float32,
                            shape=[config.batch_size, config.sentence_length, config.embedding_dimensions],
                            name='inputs')
    
    #print('inputs shape: ', inputs.shape)
    
    labels = tf.placeholder(dtype=tf.int32,
                            shape=[config.batch_size, config.predicted_words],
                            name='labels')

    # construct basic LSTM cell 
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(config.state_size)
    
    # enroll the network (state = final_state)
    output, state = build_lstm_graph(lstm_cell, inputs, config)
        
    # project state size on the vocab size dim = state_size x vocabulary_size 
    weights = tf.get_variable("weights",
                              [config.state_size, config.vocabulary_size],
                              dtype=tf.float32,
                              initializer=tf.contrib.layers.xavier_initializer())
    # add a bias dim = vocabulary_size 
    bias = tf.get_variable("bias",
                           [config.vocabulary_size],
                           dtype=tf.float32,
                           initializer=tf.contrib.layers.xavier_initializer())
    
    # compute the logits 
    logits = tf.matmul(output, weights) + bias
    logits = tf.reshape(logits, [config.batch_size,
                                 config.predicted_words,
                                 config.vocabulary_size])
    
    # define proba with softmax layer with dim = batch_size x num_steps x vocabulary_size 
    self.probabilities = tf.nn.softmax(logits)
    
    if not config.is_training:
      return 
    
    # compute the loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=logits)
        
    self.loss = tf.reduce_sum(loss)
    
    optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      config.max_grad_norm)
    self.train_optimizer = optimizer.apply_gradients(zip(grads, tvars),
                                                     global_step=tf.train.get_or_create_global_step())    

In [17]:
tf.reset_default_graph()

def main():
  
  # get config execution
  config = Config()
  # get frequent words list 
  frequent_words = preprocess_helper.load_frequent_words('google-10000-english/20k.txt')
  # construct hash map 
  vocab = {word: i for i,word in enumerate(frequent_words)}
  # get testing and training data
  #train_data, train_labels = preprocess_helper.load_and_process_data('data/sentences.train', frequent_words, 28)
  all_train_data, all_train_labels = preprocess_helper.load_and_process_data('data/sentences.eval',
                                                                             vocab,
                                                                             frequent_words,
                                                                             28)
  # compute number of batches
  number_of_batches = int(len(all_train_data)/config.batch_size)
  number_of_batches = 5
  # init tensor flow session 
  session = tf.Session()

  with session.as_default():
    
    # get the embedding matrix (with rand init emb or with word2vec)
    with tf.variable_scope("Embedding", reuse=tf.AUTO_REUSE):
      embedding = tf.get_variable("embedding",
                                  [config.vocabulary_size, config.embedding_dimensions],
                                  dtype=tf.float32)
      load_embedding(session, vocab, embedding, config)
      
      # placeholder to get train_input and labels (batch_size x sent_len)
      train_input_ph = tf.placeholder(tf.int64,
                                      [config.batch_size, config.sentence_length],
                                      name='train_input_ph')
      train_labels_ph = tf.placeholder(tf.int64,
                                      [config.batch_size, config.predicted_words],
                                      name='train_labels_ph')

      # generate 'usable' input_data for TF 
      embedded_train_input = tf.nn.embedding_lookup(embedding,
                                                    train_input_ph)
    
    # Create a model instance 
    initializer = tf.contrib.layers.xavier_initializer()
    with tf.variable_scope("Model", reuse=tf.AUTO_REUSE, initializer=initializer):
      # create an Model instance 
      model = Model(config)
      # init 
      init = tf.global_variables_initializer()
      session.run(init)

      # loop over each epoch 
      for epoch_id in range(config.number_of_epochs):
        # define perplexity and total_loss across all the batches
        perplexity = 0
        total_loss = 0
        start_time = time.time()
        # loop over each batch 
        for batch_id in range(number_of_batches):
          
          # extract batch_size sentences from the training data 
          training_batch = all_train_data[batch_id*config.batch_size:(batch_id+1)*config.batch_size]
          training_batch = session.run(embedded_train_input, {train_input_ph: training_batch})
          
          training_labels = all_train_labels[batch_id*config.batch_size:(batch_id+1)*config.batch_size,:]
          
          # variable to fect in the graph 
          fetches = {
            "loss": model.loss,
            "probabilities": model.probabilities,
            "train_optimizer": model.train_optimizer
          }

          # input variables of the graph 
          feed_dict = {
            "Model/inputs:0": training_batch,
            "Model/labels:0": training_labels
          }

          # Feed the model with the training_batch and the training_labels 
          vals = session.run(fetches=fetches, feed_dict=feed_dict)
                    
          total_loss += vals["loss"]
          
          print('Batch:', batch_id)
          print('Total loss: ',total_loss)
        
        perplexity = np.exp(total_loss/float(config.sentence_length*number_of_batches*config.batch_size))
        print('Epoch: ', epoch_id, 'with perplexity: ', perplexity)
        print('Time: ', int(time.time()-start_time), ' secs')

  if config.save_model:
    saver = tf.train.Saver()
    save_path = saver.save(session, config.saving_path)
    if config.verbose:
      print("Model saved in path: %s" % save_path)
    
  # finally close the session ...
  session.close()

if __name__ == "__main__":
  main()


- Number of sentences loaded:  9846
Loading external embeddings from wordembeddings-dim100.word2vec
Generated embedding for 20004 words
Batch: 0
Total loss:  18392.13671875
Batch: 1
Total loss:  32687.193359375
Batch: 2
Total loss:  52531.689453125
Batch: 3
Total loss:  85034.837890625
Batch: 4
Total loss:  108008.623046875
Epoch:  0 with perplexity:  76949.00694991765
Time:  16  secs
Batch: 0
Total loss:  17786.115234375
Batch: 1
Total loss:  33932.4013671875
Batch: 2
Total loss:  47575.8427734375
Batch: 3
Total loss:  86034.4091796875
Batch: 4
Total loss:  98013.5888671875
Epoch:  1 with perplexity:  27166.743542226002
Time:  21  secs
Batch: 0
Total loss:  13361.333984375
Batch: 1
Total loss:  22902.869140625
Batch: 2
Total loss:  35264.861328125
Batch: 3
Total loss:  45357.279296875
Batch: 4
Total loss:  56195.9912109375
Epoch:  2 with perplexity:  348.53863485519884
Time:  18  secs
Batch: 0
Total loss:  9380.69140625
Batch: 1
Total loss:  19352.87890625
Batch: 2
Total loss:  28129.

In [None]:
# TensorFlow reshape playground to test built-in functions 

session = tf.Session()

a = tf.constant([0, 0, 0, 0])
b = tf.constant([1, 1, 1, 1])
c = tf.constant([2, 2, 2, 2])

list_of_tensor = [a,b,c]

out = session.run(list_of_tensor)
print(out)

reshape_ = tf.reshape(list_of_tensor, [12])

out = session.run(reshape_)
print(out)

reshape_reshape = tf.reshape(reshape_, [3,4])

out = session.run(reshape_reshape)
print(out)

# display stuff from the graph in the back 

#variables_names = [v.name for v in tf.trainable_variables()]
#values = session.run(variables_names)
#for k, v in zip(variables_names, values):
#    print("Variable: ", k)
#    print("Shape: ", v.shape)
#    print(v)


#for op in tf.get_default_graph().get_operations():
#  print(str(op.name))  