In [1]:
import yaml
from tensorflow.keras import preprocessing, models, utils, layers, activations
import numpy as np
import tensorflow as tf
import os

In [2]:
questions = []
answers = []

for file in os.listdir("./data"):
    stream = open("./data/" + file, 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for convo in conversations:
        if len(convo) > 2:
            questions.append(convo[0])
            replies = convo[1:]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append(ans)

        elif len(convo) > 1:
            questions.append(convo[0])
            answers.append(convo[1])


### Tokenizing

1. For encoder input data

In [3]:
answers_with_tags = list()
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index) + 1
print("VOCAB SIZE: ", VOCAB_SIZE)
tokenized_questions = tokenizer.texts_to_sequences(questions)
# print(tokenized_questions)

VOCAB SIZE:  1894


In [4]:
length_list = []
for token_seq in tokenized_questions:
    length_list.append(len(token_seq))
max_ip_len = np.array(length_list).max()
print("Questions max length: {}".format(max_ip_len))

Questions max length: 22


In [5]:
padded_ques = preprocessing.sequence.pad_sequences(tokenized_questions, maxlen=max_ip_len, padding='post')
encoderInputData = np.array(padded_ques)
print("EncoderInputData shape: {}".format(encoderInputData.shape))

EncoderInputData shape: (564, 22)


##### Preprocessing for Decoder Input 

In [6]:
tokenized_answers = tokenizer.texts_to_sequences(answers)
# print(tokenized_answers)

length_list = []
for token_seq in tokenized_answers:
    length_list.append(len(token_seq))
max_op_len = np.array(length_list).max()
print("Answers max length: {}".format(max_op_len))

padded_ans = preprocessing.sequence.pad_sequences(tokenized_answers, maxlen=max_op_len, padding='post')
decoderInputData = np.array(padded_ans)
print("DecoderInputData shape: {}".format(decoderInputData.shape))

Answers max length: 74
DecoderInputData shape: (564, 74)


##### Preprocessing for Decoder Target 

In [7]:
decoderTargetData = []
for line in tokenized_answers:
    decoderTargetData.append(line[1:])
    
paddedTargetAns = preprocessing.sequence.pad_sequences(decoderTargetData, maxlen=max_op_len, padding='post')
onehotTargetAns = utils.to_categorical(paddedTargetAns, VOCAB_SIZE)
decoderTargetData = np.array(onehotTargetAns)
print("DecoderTargetData shape: {}".format(decoderTargetData.shape))

DecoderTargetData shape: (564, 74, 1894)


In [8]:
np.save( 'encoderInputData.npy' , encoderInputData )
np.save( 'decoderInputData.npy' , decoderInputData )
np.save( 'decoderOutputData.npy' , decoderTargetData )

### Building Keras model 

In [9]:
encoder_inputs = tf.keras.layers.Input(shape=(None, ))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [10]:
decoder_inputs = tf.keras.layers.Input(shape=(None, ))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200,  mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _ , _  = decoder_lstm(decoder_embedding, initial_state = encoder_states)
decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

In [11]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer = tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    378800      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    378800      input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM

In [12]:
model.fit([encoderInputData, decoderInputData], decoderTargetData, batch_size=100, epochs=200)
model.save('model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 

Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [13]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [14]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_ip_len , padding='post')


In [15]:
enc_model , dec_model = make_inference_models()

In [16]:
# for _ in range(10):
#     states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
#     empty_target_seq = np.zeros( ( 1 , 1 ) )
#     empty_target_seq[0, 0] = tokenizer.word_index['start']
#     stop_condition = False
#     decoded_translation = ''
#     while not stop_condition :
#         dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
# #         print(dec_outputs)
#         sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
#         sampled_word = None
#         for word , index in tokenizer.word_index.items() :
#             if sampled_word_index == index :
#                 decoded_translation += ' {}'.format( word )
#                 sampled_word = word
        
#         if sampled_word == 'end' or len(decoded_translation.split()) > max_op_len:
#             stop_condition = True
            
#         empty_target_seq = np.zeros( ( 1 , 1 ) )  
#         empty_target_seq[ 0 , 0 ] = sampled_word_index
#         states_values = [ h , c ] 

#     print( decoded_translation[:-3] )

In [17]:
model.save()

AttributeError: 'NoneType' object has no attribute 'get_config'

In [19]:
converter = tf.contrib.lite.TocoConverter.from_saved_model('./model.h5')
tfliteModel = converter.convert()
open("converted_model.tflite", "wb").write(tflite_model)

OSError: SavedModel file does not exist at: model.h5