# Building a Chatbot with NLP and LSTM network
### Importing the necessary libraries

In [1]:
import tensorflow as tf
import numpy as np
import json
import re
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# The loop is a no-op on CPU-only machines (empty list), where indexing [0] used to
# raise IndexError, and it applies the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable = True)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GRU, LSTM, Masking
from keras.preprocessing.text import tokenizer_from_json

### Importing preprocessed data

In [2]:
def load_tokenizer(path):
    """Rebuild a Keras tokenizer from the JSON file at ``path``.

    The file is parsed with ``json.load`` and the result is handed to
    ``tokenizer_from_json``. The ``with`` block closes the file on exit, so no
    explicit ``close()`` call is needed.
    """
    with open(path, 'r') as f:
        json_data = json.load(f)
    return tokenizer_from_json(json_data)


# Tokenizers saved by the preprocessing step (questions and answers separately).
question_corpus = load_tokenizer('./preprocessed_data/questions.json')
answer_corpus = load_tokenizer('./preprocessed_data/answers.json')

# Preprocessed arrays; later cells read 'arr_0' and 'arr_1' from this archive.
npzfile = np.load('./preprocessed_data/data.npz')

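#### The Keras `Tokenizer` keeps every word from the raw data in `word_index` (a `num_words` limit is only applied when texts are converted to sequences), so the loaded corpus is not a size-limited vocabulary and we have to build the lookup dictionaries manually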

In [3]:
# Largest tokenizer index that is kept in the lookup tables.
# NOTE(review): this cutoff (5000) is larger than the vocab_size (1000) used
# further down to size the embedding layers - confirm which value is intended.
MAX_WORD_INDEX = 5000


def build_lookup_tables(tokenizer, max_index):
    """Return ``(word -> index, index -> word)`` dicts for indices <= ``max_index``."""
    word2ind = {}
    for word, index in tokenizer.word_index.items():
        if index <= max_index:
            word2ind[word] = index
    ind2word = {index: word for word, index in word2ind.items()}
    return word2ind, ind2word


q_word2ind, q_ind2word = build_lookup_tables(question_corpus, MAX_WORD_INDEX)
a_word2ind, a_ind2word = build_lookup_tables(answer_corpus, MAX_WORD_INDEX)

### Creating the encoder
Note that the encoder here is exactly the same as the encoder in the training model.

In [4]:
def create_encoder(inputdim, embeddingsize, inputlen, n_units):
    """Build the encoder model used at inference time.

    Args:
        inputdim: Vocabulary size; the embedding table is created with
            ``inputdim + 1`` rows (presumably to leave index 0 for padding).
        embeddingsize: Width of each word embedding vector.
        inputlen: Length of the padded input index sequence.
        n_units: Number of units in the LSTM layer.

    Returns:
        A Keras ``Model`` that maps an index sequence of length ``inputlen``
        to the LSTM's final ``[hidden_state, cell_state]``.
    """
    token_ids = Input((inputlen,))

    # Word indices -> dense vectors, followed by a Masking layer that is meant
    # to make the LSTM ignore the zero padding.
    embedded = Embedding(inputdim + 1, embeddingsize)(token_ids)
    masked = Masking()(embedded)

    # Only the final states are exposed; the LSTM's output itself is discarded.
    _, state_h, state_c = LSTM(n_units, return_state = True)(masked)

    return Model(token_ids, [state_h, state_c])

### Creating the decoder
Note that we use the inference model here, which differs slightly from the decoder in the training model.

In [5]:
def create_decoder(inputdim, embeddingsize, n_units):
    """Build the single-step decoder model used at inference time.

    The model consumes one word index plus the current LSTM states and
    produces the next-word distribution together with the updated states, so
    a reply can be generated one word at a time.

    Args:
        inputdim: Vocabulary size; the embedding table has ``inputdim + 1``
            rows and the softmax layer has ``inputdim`` outputs.
        embeddingsize: Width of each word embedding vector.
        n_units: Number of units in the LSTM layer (and size of each state).

    Returns:
        A Keras ``Model`` with inputs ``[word_index, state_h, state_c]`` and
        outputs ``[word_probabilities, state_h, state_c]``.
    """
    # One word per call: the answer is predicted step by step.
    step_input = Input((1,))

    # State vectors fed in from the encoder (first step) or from the previous
    # decoder step (later steps).
    state_h_in = Input((n_units,))
    state_c_in = Input((n_units,))

    # Embed the single input word and apply the masking layer.
    embedded = Embedding(inputdim + 1, embeddingsize, input_length = 1)(step_input)
    masked = Masking()(embedded)

    # Unlike the training model, the inference model also returns the states
    # so they can be passed to the next step.
    lstm_layer = LSTM(n_units, return_sequences = True, return_state = True)
    lstm_output, state_h_out, state_c_out = lstm_layer(
        masked, initial_state = [state_h_in, state_c_in]
    )

    # Softmax over the vocabulary gives the probability of each candidate word.
    word_probs = Dense(inputdim, activation = 'softmax')(lstm_output)

    return Model(
        [step_input, state_h_in, state_c_in],
        [word_probs, state_h_out, state_c_out],
    )

### Defining the hyperparameters

In [6]:
# Network sizes: LSTM units, embedding width and vocabulary size.
# NOTE(review): presumably these must match the values used when the loaded
# weight files were trained - confirm against the training notebook.
n_unit, embedding_size, vocab_size = 256, 128, 1000

# Sequence lengths are taken from the second axis of the saved arrays.
question_len, answer_len = (npzfile[key].shape[1] for key in ('arr_0', 'arr_1'))

In [7]:
# Build the encoder architecture with the hyperparameters defined above.
encoder = create_encoder(vocab_size, embedding_size, question_len, n_unit)

In [8]:
# Print the encoder's layers, output shapes and parameter counts.
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 21, 128)           128128    
_________________________________________________________________
masking (Masking)            (None, 21, 128)           0         
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 394240    
Total params: 522,368
Trainable params: 522,368
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Load the saved encoder weights into the freshly built encoder model.
encoder.load_weights('./trained_model/lstm_encoder_test1000.h5')

In [10]:
# Build the single-step inference decoder with the same hyperparameters.
decoder = create_decoder(vocab_size, embedding_size, n_unit)

In [11]:
# Print the decoder's layers, output shapes and parameter counts.
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 128)       128128      input_2[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 1, 128)       0           embedding_1[0][0]                
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
____________________________________________________________________________________________

In [12]:
# Load the saved decoder weights into the freshly built decoder model.
decoder.load_weights('./trained_model/lstm_decoder_test1000.h5')

In [13]:
# Ordered (pattern, replacement) rewrite rules. Order matters: each rule runs
# on the output of the previous one (e.g. "won't" must be expanded before the
# generic "n't" rule, and whitespace is collapsed before punctuation removal).
_CLEAN_TEXT_RULES = (
    # Contractions with a fixed expansion
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"how's", "how is"),
    # Generic contraction suffixes
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    # Negations and clipped words
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    # Collapse runs of spaces / double quotes, then drop punctuation
    (r'[" "]+', " "),
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
)


def clean_text(text):
    """Normalize a sentence before it is converted to word indices.

    The text is lower-cased and stripped, common English contractions are
    expanded, runs of spaces/double quotes are collapsed into one space, and
    the listed punctuation characters are removed.

    Args:
        text: Raw input sentence.

    Returns:
        The cleaned sentence as a string.
    """
    cleaned = text.lower().strip()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

### Evaluating the chat

In [14]:
def evaluate(sentence):
    """Generate the bot's reply to ``sentence`` with greedy decoding.

    The sentence is cleaned, converted to question-vocabulary indices, padded
    to ``question_len`` and run through the encoder. Starting from the 'bos'
    token and the encoder states, the decoder is then called one word at a
    time, always taking the most probable word, until 'eos' is predicted or
    ``answer_len`` words have been produced.

    Args:
        sentence: Raw user input.

    Returns:
        The reply as a string in which every word is followed by one space
        (empty string if 'eos' is predicted first).

    Raises:
        KeyError: If 'unk' / 'bos' are missing from the lookup tables, or a
            predicted index has no entry in ``a_ind2word``.
    """
    # Cleaning the input text
    sentence = clean_text(sentence)

    # Convert words to indices, falling back to 'unk' for unknown words.
    # The encoder embedding is built with vocab_size + 1 rows, but q_word2ind
    # may hold larger indices; those are out of range for the embedding, so
    # they are treated as unknown words too.
    unk_ind = q_word2ind['unk']
    encoder_inputs = []
    for word in sentence.split():
        word_ind = q_word2ind.get(word, unk_ind)
        if word_ind > vocab_size:
            word_ind = unk_ind
        encoder_inputs.append(word_ind)

    # Pad (or truncate) to the fixed question length and run the encoder.
    encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences([encoder_inputs], maxlen = question_len, padding = 'post')
    encoder_inputs = tf.convert_to_tensor(encoder_inputs)
    hidden_h, hidden_c = encoder(encoder_inputs)

    # The decoder starts from the 'bos' token and the encoder's final states.
    decoder_inputs = tf.expand_dims([a_word2ind['bos']], 0)

    reply_words = []
    for _ in range(answer_len):
        pred, hidden_h, hidden_c = decoder([decoder_inputs, hidden_h, hidden_c])
        pred = np.squeeze(pred)

        # The softmax has vocab_size outputs; output k is mapped to word
        # index k + 1 (index 0 is never predicted).
        pred_ind = int(tf.math.argmax(pred).numpy()) + 1
        pred_word = a_ind2word[pred_ind]

        # Once we get the 'eos' symbol, stop the loop
        if pred_word == 'eos':
            break

        reply_words.append(pred_word)

        # The predicted word becomes the next decoder input; the states were
        # already updated by the decoder call above.
        decoder_inputs = tf.expand_dims([pred_ind], 0)

    # Keep the historical output format: each word followed by a single space.
    return ''.join(word + ' ' for word in reply_words)

In [15]:
# Interactive chat loop: read a line, print the bot's reply, repeat.
# Typing 'quit' (any case, surrounding spaces ignored) ends the session, as
# does closing stdin or interrupting the kernel while waiting for input.
while True:
    try:
        inputs = input('User :> ')
    except (EOFError, KeyboardInterrupt):
        break

    if inputs.strip().lower() == 'quit':
        break

    result = evaluate(inputs)

    print('Bot :> ' + result)

User :>  hi


Bot :> oh 


User :>  hello


Bot :> unk 


User :>  why?


Bot :> i am not unk 


User :>  quit
