# Building a Chatbot with NLP and GRU model and attention mechanism
### Importing the necessary libraries

In [1]:
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable = True)
import numpy as np
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time

### Importing preprocessed data 

In [2]:
with open('./preprocessed_data/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)
    f.close()

with open('./preprocessed_data/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)
    f.close()

npzfile = np.load('./preprocessed_data/data.npz')

### Creating the Encoder RNN

In [3]:
class Encoder(tf.keras.Model):

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences = True, return_state = True,
                                       recurrent_initializer = 'glorot_uniform')

    def call(self, x, hidden):
        # Embed input words  
        x = self.embedding(x)
        
        # Pass the embedded word vectors into LSTM and return all outputs
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

### Creating the Bahdanau Attention 

In [4]:
class BahdanauAttention(tf.keras.layers.Layer):

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        self.R = tf.keras.layers.Dropout(0.2)
        
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)
        
        # Calculating Alignment Scores
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        score = self.R(score)
        
        # Softmaxing alignment scores to get Attention weights
        attention_weights = tf.nn.softmax(score, axis = 1)
        
        # Multiplying the Attention weights with encoder outputs to get the context vector
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis = 1)

        return context_vector, attention_weights

### Creating the Decoder RNN

In [5]:
class Decoder(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention = BahdanauAttention(self.dec_units)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences = True, return_state = True,
                                       recurrent_initializer = 'glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis = -1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        
        return x, state, attention_weights

### Defining Hyperparameters

In [6]:
# Theoretically, vocabulary size should be len(question_corpus.word_index) + 1. 
# However, it seems like the 'num_words' didn't filter the tokenizer. so we assign the number manually
vocab_size = 1001
embedding_size = 128
n_unit = 256
batch_size = 32

In [7]:
encoder = Encoder(vocab_size, embedding_size, n_unit, batch_size)

In [8]:
decoder = Decoder(vocab_size, embedding_size, n_unit, batch_size)

### Defining the optimizer and loss function

In [9]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

def loss_function(real, pred):
    
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype = loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

### Defining the training step

In [10]:
def train_step(inp, targ, enc_hidden):
    loss = 0
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([answer_corpus.word_index['bos']] * batch_size, 1)
        
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

### Defining the validation loss

In [11]:
def validation_loss(inp, targ, enc_hidden):
    loss = 0
    
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([answer_corpus.word_index['bos']] * batch_size, 1)
    
    for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions)
        dec_input = tf.expand_dims(targ[:, t], 1)
            
    batch_loss = (loss / int(targ.shape[1]))
    
    return batch_loss

### Defining the parameter to split data

In [12]:
train_valid_split = int(len(npzfile['arr_0']) * 0.8)
print(train_valid_split)

118190


### Getting the training data

In [13]:
input_question = npzfile['arr_0'][:train_valid_split]
input_answers = npzfile['arr_1'][:train_valid_split]

### Getting the validation data

In [14]:
valid_questions = npzfile['arr_0'][train_valid_split:]
valid_answers = npzfile['arr_1'][train_valid_split:]

### Creating tensorflow dataset pipeline for faster processing

In [15]:
# Training set
buffer_size1 = len(input_question)
dataset_train = tf.data.Dataset.from_tensor_slices((input_question, input_answers)).shuffle(buffer_size1)
dataset_train = dataset_train.batch(batch_size, drop_remainder = True)

# Validation set
buffer_size2 = len(valid_questions)
dataset_valid = tf.data.Dataset.from_tensor_slices((valid_questions, valid_answers)).shuffle(buffer_size2)
dataset_valid = dataset_valid.batch(batch_size, drop_remainder = True)

### Training the model

In [16]:
epochs = 15
trainstep_epoch = len(input_question)//batch_size
validstep_epoch = len(valid_questions)//batch_size
overall_time = 0

for epoch in range(epochs):
    start = time.time()
    total_loss = 0
    valid_loss = 0
    enc_hidden = encoder.initialize_hidden_state()
    
    for (batch, (input_question, input_answers)) in enumerate(dataset_train.take(trainstep_epoch)):
        batch_loss = train_step(input_question, input_answers, enc_hidden)
        total_loss += batch_loss

    for (batch, (valid_questions, valid_answers)) in enumerate(dataset_valid.take(validstep_epoch)):
        valid_batch_loss = validation_loss(valid_questions, valid_answers, enc_hidden)
        valid_loss += valid_batch_loss
        
    print('Epoch: {}     Loss: {:.3f}     Valid_Loss: {:.3f}'.format(epoch + 1, total_loss/trainstep_epoch, valid_loss/validstep_epoch))
    
    stop = time.time()
    timetaken = stop - start
    print('Time taken for 1 epoch: {} sec\n'.format(timetaken))
    
    overall_time += timetaken
    
print('Overall time taken: {} min\n'.format(overall_time/60))

Epoch: 1     Loss: 0.995     Valid_Loss: 0.956
Time taken for 1 epoch: 2034.1076443195343 sec

Epoch: 2     Loss: 0.922     Valid_Loss: 0.930
Time taken for 1 epoch: 1852.464063167572 sec

Epoch: 3     Loss: 0.900     Valid_Loss: 0.917
Time taken for 1 epoch: 1855.9490611553192 sec

Epoch: 4     Loss: 0.885     Valid_Loss: 0.913
Time taken for 1 epoch: 1851.9534990787506 sec

Epoch: 5     Loss: 0.873     Valid_Loss: 0.908
Time taken for 1 epoch: 1856.7145359516144 sec

Epoch: 6     Loss: 0.863     Valid_Loss: 0.906
Time taken for 1 epoch: 1870.2493431568146 sec

Epoch: 7     Loss: 0.853     Valid_Loss: 0.909
Time taken for 1 epoch: 1857.4066376686096 sec

Epoch: 8     Loss: 0.844     Valid_Loss: 0.911
Time taken for 1 epoch: 1857.5119829177856 sec

Epoch: 9     Loss: 0.834     Valid_Loss: 0.914
Time taken for 1 epoch: 1847.0697631835938 sec

Epoch: 10     Loss: 0.824     Valid_Loss: 0.917
Time taken for 1 epoch: 1851.0322816371918 sec

Epoch: 11     Loss: 0.815     Valid_Loss: 0.924
Ti

### Saving parameters after training

In [17]:
encoder.save_weights('./trained_model/attention_encoder_test1000.h5')
decoder.save_weights('./trained_model/attention_decoder_test1000.h5')