In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Masking
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time

In [2]:
# import preprocessed data

with open('./preprocessed_data/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)
    f.close()

with open('./preprocessed_data/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)
    f.close()

npzfile = np.load('./preprocessed_data/data.npz') 

In [3]:
# define encoder

def create_encoder(inputdim, embeddingsize, inputlen, n_units):

    # encoder
    encoder_input = Input((inputlen,))
    encoder_embed = Embedding(inputdim+1, embeddingsize)(encoder_input)
    # we use embedding layer to vectorize the word. the inputdim indicates the input dimension
    # specifically, the inputdim is the length of vocabulary
    # and the embeddingsize indicates the output dimension, which is the dimension of word vectors
    encoder_mask = Masking()(encoder_embed)
    # add mask layer because we should ignore the input of those padding 0
    encoder = LSTM(n_units, return_state = True)
    # n_units indicates the number of LSTM units 
    # the state h & c are the vectors which encoder maps input to
    # discard the output of encoder, only state h & c are we need
    _, encoder_h, encoder_c = encoder(encoder_mask)
    
    encoder=Model(encoder_input, [encoder_h,encoder_c])
    
    return encoder

In [4]:
# define decoder. notice that this model is only used in training

def create_decoder(inputdim, embeddingsize,inputlen, n_units):
    # input of answers
    decoder_input = Input((inputlen,))
    # input of encoder state vectors
    initial_stateh = Input((n_units,))
    initial_statec = Input((n_units,))
    encoder_state = [initial_stateh,initial_statec]
    # vectorize input answers
    decoder_embed = Embedding(inputdim+1, embeddingsize,input_length = 1)(decoder_input)
    decoder_mask = Masking()(decoder_embed)
    decoder = LSTM(n_units, return_sequences = True, return_state = True)
    # the state h & c we dont need in training model
    decoder_output, _, _ = decoder(decoder_mask,initial_state = encoder_state)
    # softmax layer, predict the most potential sentence of reply
    decoder_dense = Dense(inputdim, activation = 'softmax')
    decoder_output_ = decoder_dense(decoder_output)
    
    decoder=Model([decoder_input,initial_stateh,initial_statec],decoder_output_)
    
    return decoder

In [5]:
# define hyperparameters

BatchSize = 32
N_Unit = 256
EmbeddingSize = 128
VocabSize = 2500 
# theoretically, vocabulary size should be len(question_corpus.word_index)+1. 
# however, seems like the 'num_words' didnt filter the tokenizer. so we assign the number manually
QuestionLen = npzfile['arr_0'].shape[1]
AnswerLen = npzfile['arr_1'].shape[1]

In [6]:
encoder=create_encoder(VocabSize,EmbeddingSize,QuestionLen,N_Unit)

In [7]:
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 8, 128)            320128    
_________________________________________________________________
masking (Masking)            (None, 8, 128)            0         
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 394240    
Total params: 714,368
Trainable params: 714,368
Non-trainable params: 0
_________________________________________________________________


In [8]:
decoder=create_decoder(VocabSize,EmbeddingSize,AnswerLen,N_Unit)

In [9]:
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 8, 128)       320128      input_2[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 8, 128)       0           embedding_1[0][0]                
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
____________________________________________________________________________________________

In [10]:
# define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(1e-3)
def loss_function(real, pred): 
    loss = tf.keras.losses.categorical_crossentropy(real,pred)
    return tf.reduce_mean(loss)

In [11]:
# define the training step

@tf.function
def train_step(enc_inp,dec_inp,targ):
    loss=0

    with tf.GradientTape() as tape:
        encoder_h,encoder_c=encoder(enc_inp)
        initial_stateh,initial_statec=encoder_h,encoder_c
        prediction=decoder([dec_inp,initial_stateh,initial_statec])
        loss=loss_function(targ,prediction)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss

In [12]:
# define the validation loss

def validation_loss(enc_inp,dec_inp,targ):
    loss=0
    encoder_h,encoder_c=encoder(enc_inp)
    initial_stateh,initial_statec=encoder_h,encoder_c
    prediction=decoder([dec_inp,initial_stateh,initial_statec])
    loss=loss_function(targ,prediction)
    
    return loss

In [13]:
# define the parameter to split data
train_valid_split = int(len(npzfile['arr_0'])*0.8)

In [14]:
# get the training data
inputq=npzfile['arr_0'][:train_valid_split]
inputa=npzfile['arr_1'][:train_valid_split]
targa=np.zeros_like(inputa) # create target data to do teacher forcing training
targa[:,0:-1]=inputa[:,1:]  # the target is same as the input answers but 1 timestep shifted to the left

In [15]:
# get the validation data
validq=npzfile['arr_0'][train_valid_split:]
valida=npzfile['arr_1'][train_valid_split:]
validt=np.zeros_like(valida)
validt[:,0:-1]=valida[:,1:]

In [16]:
# use onehot encoding to vectorize the target data
def onehotencoding(matrix,dim):
    onehot=np.zeros((matrix.shape[0],matrix.shape[1],dim))
    for i,sequence in enumerate(matrix):
        for j,index in enumerate(sequence):
            if index>0:
                onehot[i][j][index-1]=1 # the index start from 1 so we minus 1
    return onehot

In [17]:
# create tensorflow dataset pipeline for faster processing
# training set
BufferSize = len(inputq)
dataset_train = tf.data.Dataset.from_tensor_slices((inputq,inputa,targa)).shuffle(BufferSize)
dataset_train = dataset_train.batch(BatchSize, drop_remainder=True)
# validation set
BufferSize1 = len(validq)
dataset_valid = tf.data.Dataset.from_tensor_slices((validq,valida,validt)).shuffle(BufferSize1)
dataset_valid = dataset_valid.batch(BatchSize, drop_remainder=True)

In [18]:
# train the model

Epochs = 8
trainstep_epoch = len(inputq)//BatchSize
validstep_epoch = len(validq)//BatchSize
overalltime=0
stop_early=0
stop=2
summary_valid_loss=[]
for epoch in range(Epochs):
    start=time.time()
    total_loss=0
    valid_loss=0
    
    for (batch, (inputq,inputa,targa)) in enumerate(dataset_train.take(trainstep_epoch)):
        targa_onehot=onehotencoding(targa,VocabSize)
        batch_loss = train_step(inputq,inputa,targa_onehot)
        total_loss += batch_loss
    
    for (batch, (validq,valida,validt)) in enumerate(dataset_valid.take(validstep_epoch)):
        validt_onehot=onehotencoding(validt,VocabSize)
        valid_batch_loss = validation_loss(validq,valida,validt_onehot)
        valid_loss+=valid_batch_loss
    print('Epoch {} Loss {:.3f} Valid_Loss {:.3f}'.format(epoch+1,total_loss/trainstep_epoch,valid_loss/validstep_epoch))
    
    
    summary_valid_loss.append(valid_loss)
    if valid_loss>min(summary_valid_loss):
        stop_early+=1
        if stop_early==stop:
            break
    
    stop=time.time()
    timetaken=stop-start
    print('Time taken for 1 epoch {} sec\n'.format(timetaken))
    
    overalltime+=timetaken
    
    if stop_early==stop:
        print('stop training')
        break
    
print('Overall time taken {} min\n'.format(overalltime/60))

Epoch 1 Loss 2.271 Valid_Loss 2.132
Time taken for 1 epoch 121.4164571762085 sec

Epoch 2 Loss 2.061 Valid_Loss 2.060
Time taken for 1 epoch 113.48456811904907 sec

Epoch 3 Loss 1.981 Valid_Loss 2.023
Time taken for 1 epoch 113.69994974136353 sec

Epoch 4 Loss 1.921 Valid_Loss 2.008
Time taken for 1 epoch 113.31946754455566 sec

Epoch 5 Loss 1.872 Valid_Loss 2.002
Time taken for 1 epoch 113.35257601737976 sec

Epoch 6 Loss 1.827 Valid_Loss 2.003
Time taken for 1 epoch 113.36522960662842 sec

Epoch 7 Loss 1.783 Valid_Loss 2.005
Time taken for 1 epoch 113.16926193237305 sec

Epoch 8 Loss 1.741 Valid_Loss 2.015
Time taken for 1 epoch 113.63503813743591 sec

Overall time taken 15.257375804583232 min



In [19]:
# save parameters after training
encoder.save_weights('./trained_model/lstm_enc_test.h5')
decoder.save_weights('./trained_model/lstm_dec_test.h5')