<a href="https://colab.research.google.com/github/durham-abric/Comp551_Final_Proj/blob/master/base_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Model

In [0]:
import tensorflow as tf
from tensorflow import concat
from tensorflow.layers import Conv1D
from tensorflow.layers import Dense
from tensorflow.layers import Dropout
from tensorflow.layers import Flatten
from tensorflow.contrib import rnn
from tensorflow.contrib.rnn import InputProjectionWrapper
from tensorflow.nn import relu
from tensorflow.nn import softmax

In [0]:
def weights_init(shape):
  """
  Build a weight variable with random initial values

  Parameters
  ----------
  shape : Tuple
      Shape of the weight tensor to create

  Returns
  -------
  Variable
      A tf.Variable of the requested shape, initialised from a truncated
      normal distribution with a standard deviation of 0.05

  """
  initial_values = tf.truncated_normal(shape=shape, stddev=0.05)
  return tf.Variable(initial_values)
  
def bias_init(shape):
  """
  Build a bias variable initialised to zero

  Parameters
  ----------
  shape : Tuple
      Shape of the bias tensor to create

  Returns
  -------
  Variable
      A tf.Variable of the requested shape whose initial values are all zero

  """
  initial_values = tf.zeros(shape=shape)
  return tf.Variable(initial_values)

In [0]:
def define_inputs(batch_size, sequence_len, vocab_size):
  """
  Declare the placeholders that are fed to the graph

  Parameters
  ----------
  batch_size : int
      Number of examples in one batch
  sequence_len : int
      Length of every input sequence
  vocab_size : int
      Number of unique words in the vocabulary

  Returns
  -------
  Tensor, Tensor, Tensor
      inputs     : int32 placeholder of shape [batch_size, sequence_len]
      targets    : float32 placeholder of shape [batch_size, vocab_size]
      keep_probs : float32 placeholder with no declared shape

  """
  input_shape = [batch_size, sequence_len]
  target_shape = [batch_size, vocab_size]

  inputs = tf.placeholder(tf.int32, input_shape, name='inputs')
  targets = tf.placeholder(tf.float32, target_shape, name='target')
  keep_probs = tf.placeholder(tf.float32, name='keep_probs')

  return inputs, targets, keep_probs

In [0]:
def conv_1d(input,
            kernel_size,
            number_of_filters,
            strides=(1, 1),
            activation=tf.nn.relu,
            max_pool=True):
  """
  Creates a convolution layer with custom parameters

  Despite its name, the layer is built with tf.nn.conv2d and a square
  [kernel_size, kernel_size] kernel over a single input channel.

  Parameters
  ----------
  input : Tensor
      Input Tensor to the convolution layer. The kernel is created with one
      input channel, so the last dimension of ``input`` must be 1
      (assumes a 4-D NHWC tensor -- TODO confirm against callers)
  kernel_size : int
      Height and width of the convolution kernel
  number_of_filters : int
      Number of filters in the convolution layer
  strides : Tuple
      (row, column) stride of the kernel
  activation : callable or None
      Activation applied to the biased convolution output; None skips it
  max_pool : boolean
      Apply a 2x2 max pooling with stride 2 after the activation

  Returns
  -------
  Tensor
      Output of convolution layer

  """

  weights = weights_init([kernel_size, kernel_size, 1, number_of_filters])
  bias = bias_init([number_of_filters])

  layer = tf.nn.conv2d(input,
                       filter=weights,
                       strides=[1, strides[0], strides[1], 1],
                       padding='SAME')
  # The bias variable used to be created but never applied to the output.
  layer = tf.nn.bias_add(layer, bias)

  if activation is not None:
    layer = activation(layer)

  if max_pool:
    layer = tf.nn.max_pool(layer,
                           ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME')

  return layer

In [0]:
def dense(input,
          in_size,
          out_size,
          dropout=0.25,
          activation=tf.nn.relu):
  """
  Creates a fully-connected layer with custom parameters

  Parameters
  ----------
  input : Tensor
      Input Tensor to the dense layer; it is multiplied by an
      [in_size, out_size] weight matrix
  in_size : int
      Size of the input
  out_size : int
      Size of the output
  dropout : float
      Rate of dropout applied to the output of the dense layer, i.e. the
      fraction of units that is dropped. A falsy value (0, None) disables
      dropout
  activation : callable or None
      Activation applied to the affine output; None skips it

  Returns
  -------
  Tensor
      Output of dense layer

  """

  weights = weights_init([in_size, out_size])
  bias = bias_init([out_size])

  layer = tf.matmul(input, weights) + bias

  if activation is not None:
    layer = activation(layer)

  if dropout:
    # TF 1.x tf.nn.dropout takes the probability of KEEPING a unit, while
    # `dropout` is the probability of dropping it, so convert explicitly.
    layer = tf.nn.dropout(layer, keep_prob=1.0 - dropout)

  return layer

In [0]:
def flatten_cnn(layers, shape):
  """
  Reshape every tensor of a list to two dimensions

  Parameters
  ----------
  layers : list of Tensors
    Tensors to reshape
  shape : int
    First dimension of every reshaped Tensor; the second one is inferred

  Returns
  -------
  list of Tensors
    The reshaped tensors, in the same order as ``layers``. They are not
    concatenated here; that is left to the caller

  """
  return [tf.reshape(layer, [shape, -1]) for layer in layers]

In [0]:
def loss_optimizer(logits, targets, learning_rate):
  """
  Build the loss and the Adagrad training operation


  Parameters
  ----------
  logits : Tensor
      Last layer output
  targets : Tensor
      True output of the function
  learning_rate : float
    Learning rate handed to the Adagrad optimizer

  Returns
  -------
  Tensor, Operation
      The mean sigmoid cross-entropy loss and the operation that
      minimizes it

  """

  per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                             labels=targets)
  loss = tf.reduce_mean(per_element_loss)

  adagrad = tf.train.AdagradOptimizer(learning_rate)
  optimizer = adagrad.minimize(loss)
  return loss, optimizer

In [0]:
class LanguageModel(object):
  """
  Language model graph: embedding -> eight parallel 1-D convolutions ->
  two dense layers -> stacked LSTM -> vocabulary-sized output layer.

  Attributes
  ----------
  inputs, targets, keep_probs : Tensor
      Placeholders created by define_inputs
  embeddings : Variable
      Embedding matrix of shape [vocab_size, embed_size]
  logits : Tensor
      Unscaled output of the last dense layer
  predictions : Tensor
      Softmax of ``logits``
  loss, opt : Tensor, Operation
      Loss and training operation returned by loss_optimizer
  """

  def __init__(self, learning_rate=0.02,
               dropout_rate=0.25,
               batch_size=128,
               seq_len=50,
               vocab_size=1024,
               embed_size=300,
               layers_lstm=2):
    """
    Creates a language model from arXiv:1806.02847v1

    Building the model resets the default TensorFlow graph first.

    Parameters
    ----------
    learning_rate : float
        Learning rate of the optimizer
    dropout_rate : float
        Dropout rate of the layers.
        NOTE(review): accepted but not used anywhere in the graph
    batch_size : int
        Size of the mini-batches
    seq_len : int
        Maximum length of an input
    vocab_size : int
        Size of the vocabulary
    embed_size : int
        Dimensions of the vocabulary embedding
    layers_lstm : int
        Number of layers of LSTM

    """

    tf.reset_default_graph()

    self.dense_size = 4096
    self.lstm_size = 8192
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.seq_len = seq_len
    # NOTE(review): keep_probs is created but never consumed by the graph.
    self.inputs, self.targets, self.keep_probs = define_inputs(self.batch_size,
                                                               self.seq_len,
                                                               self.vocab_size)

    # Neural network
    # Embedding
    word_embeddings = tf.get_variable('word_embeddings',
                                      [vocab_size, embed_size])
    embedded_word_ids = tf.nn.embedding_lookup(word_embeddings, self.inputs)
    self.embeddings = word_embeddings

    # 8 conv-1d in parallel concatenated; arguments are (filters, kernel_size)
    conv_block_1 = tf.layers.conv1d(embedded_word_ids, 32, 1)
    conv_block_2 = tf.layers.conv1d(embedded_word_ids, 32, 2)
    conv_block_3 = tf.layers.conv1d(embedded_word_ids, 64, 3)
    conv_block_4 = tf.layers.conv1d(embedded_word_ids, 128, 4)
    conv_block_5 = tf.layers.conv1d(embedded_word_ids, 256, 5)
    conv_block_6 = tf.layers.conv1d(embedded_word_ids, 512, 6)
    # NOTE(review): 1028 filters breaks the doubling pattern (1024 expected?)
    # and the last two blocks share kernel size 7 -- confirm both are intended.
    conv_block_7 = tf.layers.conv1d(embedded_word_ids, 1028, 7)
    conv_block_8 = tf.layers.conv1d(embedded_word_ids, 2048, 7)

    conv_layer = [conv_block_1, conv_block_2, conv_block_3,
                  conv_block_4, conv_block_5, conv_block_6,
                  conv_block_7, conv_block_8]

    # Flatten every block to [batch_size, -1] and join them feature-wise
    conv_layer = flatten_cnn(conv_layer, batch_size)
    concatenated_conv = tf.concat(conv_layer, 1)

    # Feed Forward Layers
    logits = tf.layers.dense(concatenated_conv, self.dense_size)
    logits = tf.layers.dense(logits, self.dense_size)
    # Add a trailing feature axis so the LSTM sees [batch, dense_size, 1]
    logits = tf.reshape(logits, [logits.get_shape()[0],
                                 logits.get_shape()[1], 1])

    # LSTM Layers
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(s) for s in [self.lstm_size] * layers_lstm]
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    logits, _state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=logits,
                                       dtype=tf.float32)

    # Output layer: unscaled scores over the vocabulary
    logits = tf.reshape(logits, [logits.get_shape()[0], -1])
    logits = tf.layers.dense(logits, vocab_size)
    self.logits = logits

    # Predictions softmax (for inference only)
    self.predictions = tf.nn.softmax(logits)

    # The loss applies its own sigmoid to raw logits. Feeding it the softmax
    # output, as was done before, squashes the scores twice.
    self.loss, self.opt = loss_optimizer(logits, self.targets, learning_rate)


In [0]:
# Build the graph with the default hyper-parameters of LanguageModel.
model = LanguageModel()

In [0]:
# Session used by the cells below to run the graph; it is not closed here.
session = tf.Session()

In [0]:
# Initialise every global variable of the graph before training.
session.run(tf.global_variables_initializer())

# Data

In [0]:
import json
import string
import numpy as np

In [0]:
# Locations of the vocabulary and of the two JSON data sets on the mounted
# Drive. The paths are relative, so they resolve against the current working
# directory (presumably /content on Colab -- verify).
path_to_vocab = 'gdrive/My Drive/Mcgill/U4/Fall 2018/COMP 551/assignment/COMP551/data/vocab.txt'
path_to_pdp60 = 'gdrive/My Drive/Mcgill/U4/Fall 2018/COMP 551/assignment/COMP551/data/pdp60.json'
path_to_wsc273 = 'gdrive/My Drive/Mcgill/U4/Fall 2018/COMP 551/assignment/COMP551/data/wsc273.json'

In [0]:
# Read the vocabulary, one entry per line. The context manager closes the
# file handle, which the previous bare open() call left open.
with open(path_to_vocab) as f:
    vocab_file = [line.rstrip('\n') for line in f]
number_of_words = len(vocab_file)
print("The vocabulary contains a total of %s words." % number_of_words)

# Load the two JSON data sets.
with open(path_to_pdp60) as f:
    words_pdp = json.load(f)
with open(path_to_wsc273) as f:
    words_wsc = json.load(f)

## Vocabulary
Load the pdp60 and wsc273 files and compare their words with the original vocabulary. Only the words they share with it are kept, which reduces the number of words in the vocab.

In [0]:
def normalize_str(e):
  """
  Split a string on single spaces and normalize every token

  Each token is lower-cased and stripped of ASCII punctuation; tokens that
  end up empty are dropped.

  Parameters
  ----------
  e : str
      Text to normalize

  Returns
  -------
  list of str
      The non-empty normalized tokens, in their original order

  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  cleaned = (token.lower().translate(strip_punctuation)
             for token in e.split(' '))
  return [token for token in cleaned if token]
  
class Vocabulary():
  """
  Frequency table of the words that also appear in the module-level
  ``vocab_file`` list.
  """

  def __init__(self):
    self.size = 0          # number of distinct accepted words
    self.n_percent = None  # not used by any method of this class
    self.max_size = None   # number of words get_vocab() returns
    self.words = {}        # accepted word -> number of occurrences
    self.rejected = []     # words seen that are not in vocab_file

  def add_to_vocab(self, vocab):
    """
    Count the words of ``vocab`` that belong to ``vocab_file``

    Every entry is normalized with normalize_str and only its first
    normalized token is considered. Entries that normalize to nothing
    (empty or punctuation-only) are skipped. Words missing from
    ``vocab_file`` are reported once and remembered in ``self.rejected``.

    Any other failure now propagates instead of being silently swallowed.

    Parameters
    ----------
    vocab : iterable of str
        Raw words to add
    """
    for raw_word in vocab:
      tokens = normalize_str(raw_word)
      if not tokens:
        # Nothing left after normalization; this is the case the former
        # bare ``except`` was covering for.
        continue

      w = tokens[0]
      if w in vocab_file:
        if w not in self.words:
          self.size += 1
        self.words[w] = self.words.get(w, 0) + 1
      elif w not in self.rejected:
        print("The word '%s' is not in the vocab.txt file. '%s' is not added to the dictionary." % (w, w))
        self.rejected.append(w)

  def get_vocab(self):
    """
    Return the ``max_size`` most frequent words

    Returns
    -------
    numpy.ndarray or None
        Words sorted by decreasing count, or None (after printing a hint)
        when ``max_size`` has not been set
    """
    if self.max_size is None:
      print("Maximum size set to None. Update the size with set_max_size(n).")
      return
    print("Returning %s words." % self.max_size)
    by_frequency = sorted(self.words.items(), key=lambda kv: kv[1], reverse=True)
    return np.array([word for word, _count in by_frequency[:self.max_size]])

  def set_max_size(self, n):
    """
    Choose how many words get_vocab() returns

    Parameters
    ----------
    n : int or float
        -1 keeps every word; a value below 1 is treated as a fraction of
        the current number of words (truncated to an int); any other value
        is used as the size itself. Nothing happens when ``n`` already
        equals the current size
    """
    if n == self.max_size:
      return

    if n == -1:
      print("Updating the size of the vocab from '%s' to '%s'. Keeping all the words in the vocabulary." % (self.max_size, int(len(self.words))))
      self.max_size = len(self.words)
    elif n < 1:
      total = int(len(self.words)*n)
      print("Updating the size of the vocab from '%s' to '%s'." % (self.max_size, total))
      self.max_size = total
    else:
      print("Updating the size of the vocab from '%s' to '%s'." % (self.max_size, n))
      self.max_size = n

In [0]:
vocab_obj = Vocabulary()

# Count the words of every 'substitution' sentence: the wsc273 entries
# first, then the pdp60 ones.
for dataset in (words_wsc, words_pdp):
  for s in dataset:
    vocab_obj.add_to_vocab(s['substitution'].split(" "))

In [0]:
# Keep every collected word (-1), then use the same word array as both the
# inputs and the targets. get_vocab() prints on every call.
vocab_obj.set_max_size(-1)
X_train = vocab_obj.get_vocab()
y_train = vocab_obj.get_vocab()


In [0]:
# Training hyper-parameters: number of passes over the data and the number
# of examples sliced per training step.
epochs = 100
batch_size = 128

In [0]:
# Mini-batch training: one optimizer step per batch, mean loss printed per epoch.
# NOTE(review): X_train / y_train hold vocabulary words (see get_vocab), while
# model.inputs is declared as int32 [batch_size, sequence_len] and
# model.targets as float32 [batch_size, vocab_size]. The batches fed below do
# not look compatible with those placeholders -- confirm the intended encoding.
for i in range(epochs):
    epoch_loss = []
    train_accuracy = []  # never filled in this loop
    for ii in (range(0, len(X_train), batch_size)):
        # The last slice can be shorter than batch_size.
        X_batch = X_train[ii:ii+batch_size]
        y_batch = y_train[ii:ii+batch_size].reshape(-1, 1)

        # Run one training step and fetch the loss of the batch.
        c, _ = session.run([model.loss, model.opt],
                              feed_dict={model.inputs:X_batch, 
                              model.targets:y_batch})
        
        epoch_loss.append(c)
        
    
    print("Epoch: {}/{}".format(i, epochs),
          " | Epoch loss: {}".format(np.mean(epoch_loss)))