In [1]:
import time
import numpy as np
import tensorflow as tf

In [2]:
# Hyperparameters and paths for the PTB language-model walkthrough.

# Initial weight scale
init_scale = 0.1
# Initial learning rate
learning_rate = 1.0
# Maximum permissible gradient norm (gradient clipping, a guard against exploding gradients)
max_grad_norm = 5
# Number of recurrent layers in the model
num_layers = 2
# Number of recurrence steps, i.e. how many time steps the RNN is unrolled over
num_steps = 20
# Number of units in the first and second LSTM layers
hidden_size_l1 = 256
hidden_size_l2 = 128
# Maximum number of epochs trained with the initial learning rate
max_epoch_decay_lr = 4
# Total number of training epochs
max_epoch = 15
# Probability of keeping a unit in the Dropout layer (an optimization outside the scope of this notebook).
# At 1, the Dropout wrapping is ignored.
keep_prob = 1
# Decay factor for the learning rate
decay = 0.5
# Number of sequences per batch
batch_size = 30
# Vocabulary size (input dimension of the embedding, width of the output layer)
vocab_size = 10000
# Length of each word-embedding vector.
# NOTE: the name is misspelled ("embeding") but later cells reference it as-is.
embeding_vector_size= 200
# Training flag to separate training from testing
is_training = 1
# Directory holding the extracted PTB text files
data_dir = "data/simple-examples/data/"

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import numpy as np
import tensorflow as tf


def _read_words(filename):
  """Read `filename` and return its tokens as a list of strings.

  Every newline character is replaced by the literal token "<eos>" before the
  text is split on whitespace.
  """
  with tf.io.gfile.GFile(filename, "r") as handle:
    text = handle.read()
  return text.replace("\n", "<eos>").split()


def _build_vocab(filename):
  """Build a word -> integer id mapping from the tokens in `filename`.

  Ids are assigned by descending frequency (id 0 is the most frequent token);
  ties are broken alphabetically so the mapping is deterministic.

  Args:
    filename: path of the text file to read (tokenized by `_read_words`).

  Returns:
    dict mapping each distinct token to its id. An empty file yields an empty
    dict.
  """
  counter = collections.Counter(_read_words(filename))

  # Sort key: negative count first (most frequent first), then the word itself.
  ranked = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

  # Enumerating the ranked pairs avoids the zip(*...) unpacking, which fails
  # with "not enough values to unpack" when the file contains no tokens.
  return {word: idx for idx, (word, _) in enumerate(ranked)}


def _file_to_word_ids(filename, word_to_id):
  """Convert the tokens of `filename` to ids using `word_to_id`.

  Tokens that are not keys of `word_to_id` are skipped, so the result can be
  shorter than the token stream.
  """
  ids = []
  for token in _read_words(filename):
    if token in word_to_id:
      ids.append(word_to_id[token])
  return ids


def ptb_raw_data(data_path=None):
  """Load PTB raw data from data directory "data_path".

  Reads the PTB train/valid/test text files and converts their tokens to
  integer ids using a vocabulary built from the training file only. No
  batching is done here; pass the returned id lists to `ptb_iterator`.

  The PTB dataset comes from Tomas Mikolov's webpage:

  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

  Args:
    data_path: string path to the directory containing ptb.train.txt,
      ptb.valid.txt and ptb.test.txt (the extracted simple-examples.tgz).

  Returns:
    tuple (train_data, valid_data, test_data, vocabulary, word_to_id) where
    the three data objects are lists of word ids, `vocabulary` is the number
    of distinct training tokens, and `word_to_id` is the token -> id dict.

  Raises:
    TypeError: if data_path is None.
  """
  # The None default cannot be joined into a path; fail with a clear message
  # (same exception type os.path.join would raise).
  if data_path is None:
    raise TypeError(
        "data_path must be the directory containing the ptb.*.txt files, got None")

  train_path = os.path.join(data_path, "ptb.train.txt")
  valid_path = os.path.join(data_path, "ptb.valid.txt")
  test_path = os.path.join(data_path, "ptb.test.txt")

  # The vocabulary comes from the training split only; the other splits are
  # encoded with that same mapping.
  word_to_id = _build_vocab(train_path)
  train_data = _file_to_word_ids(train_path, word_to_id)
  valid_data = _file_to_word_ids(valid_path, word_to_id)
  test_data = _file_to_word_ids(test_path, word_to_id)
  vocabulary = len(word_to_id)
  return train_data, valid_data, test_data, vocabulary, word_to_id


def ptb_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw PTB data.

  Splits the id stream into batch_size contiguous rows and walks along them
  in windows of num_steps columns. Trailing ids that do not fill a complete
  row are dropped.

  Args:
    raw_data: one of the raw data outputs from ptb_raw_data (sequence of ids).
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.

  Yields:
    Pairs (x, y) of int32 matrices of shape [batch_size, num_steps]. y holds
    the same data as x shifted by one position, i.e. the next id for every
    id in x.

  Raises:
    ValueError: if batch_size or num_steps are too high for the amount of
      data, so that not even one full window fits. Because this is a
      generator, the error surfaces when iteration starts.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  batch_len = len(raw_data) // batch_size
  # Row i covers raw_data[batch_len * i : batch_len * (i + 1)].
  data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

  # One column is held back so the last window still has shifted targets.
  epoch_size = (batch_len - 1) // num_steps

  # "<= 0" rather than "== 0": when batch_len is 0 the floor division above
  # yields -1, which previously slipped past the check and produced an empty
  # iterator without any error.
  if epoch_size <= 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    start = i * num_steps
    stop = start + num_steps
    x = data[:, start:stop]
    y = data[:, start + 1:stop + 1]
    yield (x, y)

In [4]:
# Load the three PTB splits as lists of word ids, together with the
# vocabulary size and the word -> id mapping built from the training split.
raw_data = ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

In [5]:
# Take the first (inputs, targets) batch of the training stream as a worked example.
itera = ptb_iterator(train_data, batch_size, num_steps)
first_touple = next(itera)
_input_data, _targets = first_touple

In [6]:
# Number of word ids in the training split.
len(train_data)

929589

In [21]:
def id_to_word(id_list, mapping=None):
    """Translate a sequence of word ids back into their words.

    Args:
        id_list: iterable of integer ids (Python ints or NumPy integers).
        mapping: optional word -> id dict to decode with. Defaults to the
            notebook-level `word_to_id`.

    Returns:
        list of words in the same order as `id_list`. Ids that have no entry
        in the mapping are skipped.
    """
    if mapping is None:
        mapping = word_to_id
    # Invert the mapping once per call instead of scanning the whole
    # vocabulary for every id. This assumes each id belongs to one word.
    words_by_id = {wid: word for word, wid in mapping.items()}
    return [words_by_id[w] for w in id_list if w in words_by_id]
                

# Decode the first 100 training ids back into words as a sanity check.
print(id_to_word(train_data[0:100]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>', 'pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'N', '<eos>', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<eos>', 'rudolph', '<unk>', 'N', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate', '<eos>', 'a', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of']


In [8]:
# Shape of the input batch: (batch_size, num_steps).
_input_data.shape

(30, 20)

In [9]:
# The targets have the same shape as the inputs: (batch_size, num_steps).
_targets.shape

(30, 20)

In [22]:
# First three sequences of the input batch, as word ids.
_input_data[0:3]

array([[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984,
        9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995],
       [2654,    6,  334, 2886,    4,    1,  233,  711,  834,   11,  130,
         123,    7,  514,    2,   63,   10,  514,    8,  605],
       [   0, 1071,    4,    0,  185,   24,  368,   20,   31, 3109,  954,
          12,    3,   21,    2, 2915,    2,   12,    3,   21]])

In [23]:
# Decode the first sequence of the input batch back into words.
print(id_to_word(_input_data[0,:]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']


In [24]:
# Trainable lookup table: one embeding_vector_size-long vector per vocabulary id.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embeding_vector_size,
    batch_input_shape=(batch_size, num_steps),
    trainable=True,
    name="embedding_vocab",
)

In [25]:
# Look up the embedding vector for every word id in the example batch.
# Result shape: (batch_size, num_steps, embeding_vector_size).
inputs = embedding_layer(_input_data)
inputs

<tf.Tensor: shape=(30, 20, 200), dtype=float32, numpy=
array([[[ 1.50108449e-02, -4.42440286e-02,  1.12416968e-02, ...,
          2.43944861e-02, -4.04040702e-02,  4.42680977e-02],
        [ 1.20785609e-02,  2.64554955e-02,  2.90320627e-02, ...,
          1.53687038e-02,  2.37068050e-02, -1.35993473e-02],
        [-2.84717437e-02,  4.75924648e-02, -6.42641634e-03, ...,
         -4.99358289e-02,  3.76529135e-02, -5.31923771e-03],
        ...,
        [ 8.13189894e-03,  4.56289202e-03,  1.50487572e-03, ...,
          4.55789007e-02, -3.95794883e-02, -3.57348323e-02],
        [ 3.66412476e-03, -4.65956591e-02, -1.68618187e-02, ...,
          4.06797864e-02,  8.03351402e-04,  1.96575634e-02],
        [-4.32941914e-02, -4.82794642e-02,  2.91109085e-03, ...,
          7.79502466e-03,  3.52755524e-02, -3.83252129e-02]],

       [[ 3.01200785e-02, -2.64297798e-03, -1.04435198e-02, ...,
          6.47775084e-03, -2.31964476e-02, -3.74848954e-02],
        [-1.81778893e-02, -4.22095433e-02, -2.10

In [26]:
# One LSTM cell per layer, sized by hidden_size_l1 and hidden_size_l2 respectively.
lstm_cell_l1 = tf.keras.layers.LSTMCell(hidden_size_l1)
lstm_cell_l2 = tf.keras.layers.LSTMCell(hidden_size_l2)

In [27]:
# Wrap the two cells into a single stacked cell (layer 1 first, then layer 2).
stacked_lstm = tf.keras.layers.StackedRNNCells([lstm_cell_l1, lstm_cell_l2])

In [28]:
# Run the stacked cell over the time dimension of each sequence.
# The second positional parameter of tf.keras.layers.RNN is `return_sequences`;
# the original call passed the list [batch_size, num_steps] there, which only
# worked because a non-empty list is truthy. Spell the intent out explicitly:
# return_sequences=True keeps the output of every time step, which is needed
# to predict a word at each position.
layer = tf.keras.layers.RNN(
    stacked_lstm,
    return_sequences=True,
    return_state=False,
    stateful=True,
    trainable=True,
)

In [29]:
# Non-trainable zero tensor of shape (batch_size, embeding_vector_size).
# NOTE(review): the width is the embedding size, not hidden_size_l1 / hidden_size_l2 —
# confirm this is the intended shape for a recurrent state.
init_state = tf.Variable(tf.zeros([batch_size,embeding_vector_size]),trainable=False)

In [30]:
# NOTE(review): the attribute is spelled `inital_state`. Keras RNN layers accept
# `initial_state` as a call argument, so this assignment presumably does not seed
# the recurrent state — confirm before relying on it.
layer.inital_state = init_state

In [31]:
# Display the tensor stored on layer.inital_state.
layer.inital_state

<tf.Variable 'Variable:0' shape=(30, 200) dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [32]:
# Feed the embedded batch through the stacked LSTM.
outputs = layer(inputs)

In [33]:
# Per-step outputs of the second LSTM layer: (batch_size, num_steps, hidden_size_l2).
outputs

<tf.Tensor: shape=(30, 20, 128), dtype=float32, numpy=
array([[[-2.6952322e-03,  4.5033570e-04,  5.4306656e-05, ...,
          8.5960777e-04,  1.8866402e-03,  6.0933287e-04],
        [-3.3794567e-03,  6.1050378e-04,  1.4076863e-03, ...,
          4.0729993e-04,  3.7723926e-03,  5.4415310e-04],
        [-3.7586431e-03,  1.0147203e-03,  3.2546448e-03, ...,
          6.2232510e-05,  5.4535954e-03,  1.6946403e-03],
        ...,
        [-2.5281128e-03,  5.7684905e-03,  1.3206020e-04, ...,
          5.6485208e-03, -2.8833998e-03,  3.8751289e-03],
        [-2.8725807e-03,  5.1796916e-03,  5.8878789e-04, ...,
          5.6287958e-03, -1.7468049e-03,  2.6915951e-03],
        [-2.6211594e-03,  4.2598727e-03,  1.5286102e-03, ...,
          5.3600552e-03, -1.8374330e-03,  9.6161669e-04]],

       [[-1.1035381e-03,  5.4230622e-04, -6.3420908e-04, ...,
         -3.7029269e-05,  9.0331573e-04,  1.6269424e-03],
        [-1.8234245e-03,  1.4717091e-04, -1.6896909e-03, ...,
         -1.2616612e-03,  1.

In [34]:
# Output projection: one score per vocabulary word (no activation set here).
dense = tf.keras.layers.Dense(vocab_size)

In [35]:
# Unnormalized word scores (logits) for every position of every sequence.
logits_outputs  = dense(outputs)

In [36]:
# Confirm the projection produced one score per vocabulary word at each step.
print("shape of the output from dense layer: ", logits_outputs.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from dense layer:  (30, 20, 10000)


In [37]:
# Softmax layer used to turn the logits into probabilities over the vocabulary.
activation = tf.keras.layers.Activation('softmax')

In [38]:
# Probability assigned to each vocabulary word at every position.
output_words_prob = activation(logits_outputs)

In [39]:
# The softmax keeps the shape of the logits.
print("shape of the output from the activation layer: ", output_words_prob.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from the activation layer:  (30, 20, 10000)


In [40]:
# Word probabilities for the first sequence over all num_steps time steps.
# NOTE(review): the slice 0:num_steps covers steps 0 .. num_steps-1, so the
# "t=20" in the label is one past the last index shown.
print("The probability of observing words in t=0 to t=20", output_words_prob[0,0:num_steps])

The probability of observing words in t=0 to t=20 tf.Tensor(
[[9.9988792e-05 9.9992787e-05 1.0002866e-04 ... 9.9998164e-05
  1.0000114e-04 9.9992969e-05]
 [9.9977195e-05 9.9998215e-05 1.0003304e-04 ... 9.9993857e-05
  9.9995937e-05 9.9980694e-05]
 [9.9976111e-05 1.0001556e-04 1.0001936e-04 ... 9.9996636e-05
  9.9974968e-05 9.9979006e-05]
 ...
 [9.9957731e-05 1.0001304e-04 9.9945457e-05 ... 9.9929122e-05
  9.9926889e-05 9.9999190e-05]
 [9.9915815e-05 1.0002562e-04 9.9893834e-05 ... 9.9931298e-05
  9.9945879e-05 1.0001824e-04]
 [9.9895195e-05 1.0003477e-04 9.9885059e-05 ... 9.9943005e-05
  9.9953148e-05 1.0003235e-04]], shape=(20, 10000), dtype=float32)


In [41]:
# Most probable word id at each time step of the first sequence.
np.argmax(output_words_prob[0,0:num_steps], axis=1)

array([6174, 5125, 5125, 4918, 4918, 4918, 4918, 5494, 5494, 5494, 5817,
       9207, 9207, 9058, 9058, 8264, 8264, 5740, 5611, 7852], dtype=int64)

In [42]:
# Ground-truth next-word ids for the first sequence, for comparison with the predictions.
_targets[0]

array([9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986,
       9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996])

In [43]:
def crossentropy(y_true, y_pred):
    """Return the sparse categorical cross-entropy of `y_pred` against `y_true`.

    Thin wrapper around `tf.keras.losses.sparse_categorical_crossentropy`
    called with its default options. In this notebook `y_true` holds integer
    word ids and `y_pred` holds the softmax probabilities.
    """
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    return per_position_loss

In [44]:
# Loss for every position of the example batch.
loss  = crossentropy(_targets, output_words_prob)

In [45]:
# Loss of the first ten time steps of the first sequence.
# With near-uniform predictions over vocab_size words, each value is close to ln(vocab_size).
loss[0,:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([9.2103615, 9.210463 , 9.210446 , 9.20994  , 9.210105 , 9.209935 ,
       9.210304 , 9.210097 , 9.210672 , 9.209379 ], dtype=float32)>

In [46]:
# Sum the loss over all positions and divide by batch_size:
# the average per-sequence loss, summed over the time steps.
cost = tf.reduce_sum(loss / batch_size)
cost

<tf.Tensor: shape=(), dtype=float32, numpy=184.20657>