# Building a Chatbot with NLP and LSTM network
### Importing the necessary libraries

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Masking
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time

### Importing preprocessed data

In [2]:
with open('./preprocessed_data/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)
    f.close()

with open('./preprocessed_data/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)
    f.close()

npzfile = np.load('./preprocessed_data/data.npz')

### Creating the Encoder RNN

In [3]:
def create_encoder(inputdim, embeddingsize, inputlen, n_units):

    '''param: inputdim is the length of vocabulary. Basically, the inputdim indicates the input dimension
              embeddingsize indicates the output dimension, which is the dimension of word vectors'''
    
    # Encoder
    encoder_input = Input((inputlen,))
    encoder_embed = Embedding(inputdim + 1, embeddingsize)(encoder_input) # We use embedding layer to vectorize the word.
    
    # Add mask layer because we should ignore the input of those padding 0
    encoder_mask = Masking()(encoder_embed)
    
    encoder = LSTM(n_units, return_state = True)
    '''param: n_units indicates the number of LSTM units 
              the state h & c are the vectors which encoder maps input to discard the output of encoder, 
              only state h & c are needed'''
    _, encoder_h, encoder_c = encoder(encoder_mask)
    
    encoder = Model(encoder_input, [encoder_h,encoder_c])
    
    return encoder

### Creating the Decoder RNN

In [4]:
def create_decoder(inputdim, embeddingsize, inputlen, n_units):
    
    # Input of answers
    decoder_input = Input((inputlen,))
    
    # Input of encoder state vectors
    initial_stateh = Input((n_units,))
    initial_statec = Input((n_units,))
    encoder_state = [initial_stateh,initial_statec]
    
    # Vectorizing the input answers
    decoder_embed = Embedding(inputdim+1, embeddingsize,input_length = 1)(decoder_input)
    decoder_mask = Masking()(decoder_embed)
    decoder = LSTM(n_units, return_sequences = True, return_state = True)
    
    # We don't need the state h & c in training model
    decoder_output, _, _ = decoder(decoder_mask,initial_state = encoder_state)
    
    # Using activation function as softmax layer, predict the most potential sentence of reply
    decoder_dense = Dense(inputdim, activation = 'softmax')
    decoder_output_ = decoder_dense(decoder_output)
    
    decoder = Model([decoder_input, initial_stateh, initial_statec], decoder_output_)
    
    return decoder

### Defining hyperparameters

In [5]:
batch_size = 32
n_unit = 256
embedding_size = 128
vocab_size = 8000 
# Theoretically, vocabulary size should be len(question_corpus.word_index) + 1. 
# However, it seems like the 'num_words' didn't filter the tokenizer. so we assign the number manually
question_len = npzfile['arr_0'].shape[1]
answer_len = npzfile['arr_1'].shape[1]

In [6]:
encoder = create_encoder(vocab_size, embedding_size, question_len, n_unit)

In [7]:
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 21, 128)           1024128   
_________________________________________________________________
masking (Masking)            (None, 21, 128)           0         
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 394240    
Total params: 1,418,368
Trainable params: 1,418,368
Non-trainable params: 0
_________________________________________________________________


In [8]:
decoder = create_decoder(vocab_size, embedding_size, answer_len, n_unit)

In [9]:
decoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 22)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 22, 128)           1024128   
_________________________________________________________________
masking_1 (Masking)          (None, 22, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                [(None, 256), (None, 256) 394240    
Total params: 1,418,368
Trainable params: 1,418,368
Non-trainable params: 0
_________________________________________________________________


### Defining the optimizer and loss function

In [10]:
optimizer = tf.keras.optimizers.Adam(1e-3)
def loss_function(real, pred): 
    loss = tf.keras.losses.categorical_crossentropy(real, pred)
    return tf.reduce_mean(loss)

### Defining the training step

In [11]:
@tf.function
def train_step(encoder_input, decoder_input, target):
    loss=0

    with tf.GradientTape() as tape:
        encoder_h,encoder_c = encoder(encoder_input)
        initial_stateh, initial_statec = encoder_h, encoder_c
        prediction = decoder([decoder_input, initial_stateh, initial_statec])
        loss = loss_function(target, prediction)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss

### Defining the validation loss

In [12]:
def validation_loss(encoder_input, decoder_input, target):
    loss = 0
    encoder_h, encoder_c = encoder(encoder_input)
    initial_stateh, initial_statec = encoder_h, encoder_c
    prediction = decoder([decoder_input, initial_stateh, initial_statec])
    loss = loss_function(target, prediction)
    
    return loss

### Defining the parameter to split data

In [13]:
train_valid_split = int(len(npzfile['arr_0']) * 0.8)

### Getting the training data

In [14]:
input_questions = npzfile['arr_0'][:train_valid_split]
input_answers = npzfile['arr_1'][:train_valid_split]
train_target = np.zeros_like(input_answers) # Create target data to do teacher forcing training
train_target[:,0:-1] = input_answers[:,1:]  # The target is same as the input answers but 1 timestep shifted to the left

### Getting the validation data

In [15]:
valid_questions = npzfile['arr_0'][train_valid_split:]
valid_answers = npzfile['arr_1'][train_valid_split:]
valid_target = np.zeros_like(valid_answers)
valid_target[:,0:-1] = valid_answers[:,1:]

### Using onehot encoding to vectorize the target data

In [16]:
def onehotencoding(matrix, dim):
    onehot = np.zeros((matrix.shape[0], matrix.shape[1], dim))
    for i, sequence in enumerate(matrix):
        for j, index in enumerate(sequence):
            if index > 0:
                onehot[i][j][index - 1] = 1 # The index start from 1, so we subtract index from 1
    return onehot

### Creating tensorflow dataset pipeline for faster processing

In [17]:
# Training set
buffer_size1 = len(input_questions)
dataset_train = tf.data.Dataset.from_tensor_slices((input_questions, input_answers, train_target)).shuffle(buffer_size1)
dataset_train = dataset_train.batch(batch_size, drop_remainder = True)

# Validation set
buffer_size2 = len(valid_questions)
dataset_valid = tf.data.Dataset.from_tensor_slices((valid_questions, valid_answers, valid_target)).shuffle(buffer_size2)
dataset_valid = dataset_valid.batch(batch_size, drop_remainder = True)

### Training the model

In [18]:
epochs = 8
trainstep_epoch = len(input_questions)//batch_size
validstep_epoch = len(valid_questions)//batch_size
overall_time = 0
stop_early = 0
stop = 2
summary_valid_loss = []

for epoch in range(epochs):
    start = time.time()
    total_loss = 0
    valid_loss = 0
    
    for (batch, (input_questions, input_answers, train_target)) in enumerate(dataset_train.take(trainstep_epoch)):
        train_target_onehot = onehotencoding(train_target, vocab_size)
        batch_loss = train_step(input_questions, input_answers, train_target_onehot)
        total_loss += batch_loss
    
    for (batch, (valid_questions, valid_answers, valid_target)) in enumerate(dataset_valid.take(validstep_epoch)):
        valid_target_onehot = onehotencoding(valid_target, vocab_size)
        valid_batch_loss = validation_loss(valid_questions, valid_answers, valid_target_onehot)
        valid_loss += valid_batch_loss
    print('Epoch: {} Loss: {:.3f} Valid_Loss: {:.3f}'.format(epoch + 1, total_loss/trainstep_epoch, valid_loss/validstep_epoch))
    
    
    summary_valid_loss.append(valid_loss)
    if valid_loss > min(summary_valid_loss):
        stop_early += 1
        if stop_early == stop:
            break
    
    stop = time.time()
    timetaken = stop - start
    print('Time taken for 1 epoch: {} sec\n'.format(timetaken))
    
    overalltime += timetaken
    
    if stop_early == stop:
        print('Stopped Training')
        break
    
print('Overall time taken: {} min\n'.format(overalltime/60))

ValueError: in user code:

    C:\Users\praab\AppData\Local\Temp/ipykernel_12864/945995257.py:8 train_step  *
        prediction = decoder([decoder_input, initial_stateh, initial_statec])
    C:\Users\praab\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1013 __call__  **
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\praab\AppData\Local\Programs\Python\Python38\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:200 assert_input_compatibility
        raise ValueError('Layer ' + layer_name + ' expects ' +

    ValueError: Layer model_1 expects 1 input(s), but it received 3 input tensors. Inputs received: [<tf.Tensor 'decoder_input:0' shape=(32, 22) dtype=int32>, <tf.Tensor 'model/lstm/PartitionedCall:2' shape=(32, 256) dtype=float32>, <tf.Tensor 'model/lstm/PartitionedCall:3' shape=(32, 256) dtype=float32>]
