In [2]:
import numpy as np
import tensorflow as tf
import os
import collections
import tensorflow.keras as keras
import reader

### Initializing values

First we specify a few things:<br>
1. number of LSTM layers<br>
2. number of time steps(words in a sentence)<br>
3. batch size(no of sentences ina batch)<br>
4. number of hidden nodes in the two layers<br>
5. vocabulary size<br>
6. vector size for each word

In [2]:
learning_rate = 1.0
max_grad_norm = 5
num_layers = 2
num_steps = 20
hidden_1 = 256
hidden_2 = 128

batch_size = 30
vocab_size = 10000
embedding_vec_size= 200

### Reading and parsing content 

For reading we will user reader.py file already provided. It contains certain functions for parsing data and dividing the data into batches, sequences(time steps).

In [3]:
data_dir='../Datasets ML/simple-examples/data'
raw_data = reader.ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

In [4]:
def id_to_word(id_list):
    words=[]
    for id_ in id_list:
        for word,w_id in word_to_id.items():
            if w_id==id_:
                words.append(word)
                break
    return words

For instance lets take single batch

In [5]:
_iterator_=reader.ptb_iterator(train_data, batch_size, num_steps)
first_batch=_iterator_.__next__()
inp=first_batch[0]
tar=first_batch[1]

### Word Embeddings

This layer basically converts each word to a vector of given dimensions alongwith specifying the vocabulary size.<br>
Further the layer trains the vectors for each word by updating probablity values. In short it is a Word2vec layer which trains itself through time.

In [6]:
embedding_layer=keras.layers.Embedding(input_dim=vocab_size,output_dim=embedding_vec_size,\
                                       batch_input_shape=(batch_size,num_steps),input_length=num_steps,trainable=True,\
                                       name='embedding')

In [7]:
phase_1=embedding_layer(inp)

### Constructing RNN Network

Specifying the layers trainable means we do not require to specify gradient Tape to watch for trainable variables, it does it automatically. 

In [8]:
l1=keras.layers.LSTMCell(hidden_1)
l2=keras.layers.LSTMCell(hidden_2)
stacked_rnn=keras.layers.StackedRNNCells([l1,l2])

In [9]:
rnn_layer=keras.layers.RNN(stacked_rnn,[batch_size,num_steps],trainable=True,stateful=True,return_state=False)

In [10]:
init_state=tf.Variable(np.zeros((batch_size,embedding_vec_size)),trainable=False,name='Initial state')
rnn_layer.initial_state=init_state

In [11]:
phase_2=rnn_layer(phase_1)

### Dense Layer

In [12]:
dense_layer=keras.layers.Dense(vocab_size)
phase_3=dense_layer(phase_2)

activation=keras.layers.Activation('softmax')
phase_4=activation(phase_3)

### Prediction 

Ground Truth:

In [13]:
id_to_word(tar[0])

['banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim',
 'snack-food']

Predicted:

In [14]:
id_to_word(np.argmax(phase_4[0,0:num_steps,:],axis=1))

['wholesale',
 'upside',
 'obtaining',
 'artists',
 'score',
 'score',
 'flies',
 'flies',
 'flies',
 'flies',
 'discontinued',
 'innovative',
 'innovative',
 'recommended',
 'pearce',
 'pearce',
 'trained',
 'aroused',
 'approve',
 'approve']

### Assembling model

In [15]:
rnn=keras.Sequential()
rnn.add(embedding_layer)
rnn.add(rnn_layer)
rnn.add(dense_layer)
rnn.add(activation)

In [16]:
rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (30, 20, 200)             2000000   
_________________________________________________________________
rnn (RNN)                    (30, 20, 128)             671088    
_________________________________________________________________
dense (Dense)                (30, 20, 10000)           1290000   
_________________________________________________________________
activation (Activation)      (30, 20, 10000)           0         
Total params: 3,961,088
Trainable params: 3,955,088
Non-trainable params: 6,000
_________________________________________________________________


Loss Function

In [17]:
def cross_entropy(y,y_hat):
    return keras.losses.sparse_categorical_crossentropy(y,y_hat)

In [18]:
loss=tf.reduce_sum(cross_entropy(tar,phase_4))/batch_size
loss

<tf.Tensor: shape=(), dtype=float32, numpy=184.20694>

Optimizer

In [19]:
lr=tf.Variable(0.0,trainable=False)
lr.assign(learning_rate)
optimizer=keras.optimizers.SGD(lr,clipnorm=max_grad_norm)

### Training model

In [20]:
var=rnn.trainable_variables
[var.name for var in rnn.trainable_variables]

['embedding/embeddings:0',
 'rnn/stacked_rnn_cells/lstm_cell/kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell/recurrent_kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell/bias:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/recurrent_kernel:0',
 'rnn/stacked_rnn_cells/lstm_cell_1/bias:0',
 'dense/kernel:0',
 'dense/bias:0']

Calculating Gradients

In [21]:
with tf.GradientTape() as tape:
    y_hat=rnn(inp)
    loss=cross_entropy(tar,y_hat)
    cost=tf.reduce_sum(loss)/batch_size

Clipping gradient values manually to see effect

In [22]:
grads=tape.gradient(loss,var)
clipped,init_norm=tf.clip_by_global_norm(grads,max_grad_norm)
init_norm

<tf.Tensor: shape=(), dtype=float32, numpy=68.83176>

Applying gradients to see effect

In [23]:
optimizer.apply_gradients(zip(clipped,var))
y_hat=rnn(inp)
loss_1=cross_entropy(tar,y_hat)
cost_1=tf.reduce_sum(loss_1)/batch_size

In [25]:
print('Initial cost: ',cost)
print('Cost after epoch=1: ',cost_1)

Initial cost:  tf.Tensor(184.20651, shape=(), dtype=float32)
Cost after epoch=1:  tf.Tensor(172.78825, shape=(), dtype=float32)


As cost decreases model becomes more robust and its efficacy increases after certain epoch provided adequate data.