In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and echo the full path of every file found under it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf 
from tensorflow.keras import preprocessing, layers, utils, models
import re
import yaml
import os

In [None]:
# Directory holding the corpus files that the loading cell parses as YAML.
file_path = '/kaggle/input/chatterbotenglish'
# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which files (and therefore question/answer pairs) are read stable across runs.
file_list = sorted(os.listdir(file_path))
print(file_list)

In [None]:
# Build parallel lists: questions[i] is the prompt and answers[i] its reply.
questions, answers = [], []

for corpus_name in file_list:
    with open(file_path + os.sep + corpus_name, 'rb') as corpus_file:
        parsed = yaml.safe_load(corpus_file)
    for dialogue in parsed['conversations']:
        if len(dialogue) < 2:
            # A lone prompt has no reply to pair it with.
            continue
        questions.append(dialogue[0])
        if len(dialogue) == 2:
            answers.append(dialogue[1])
        else:
            # Several replies are merged into one answer; every reply
            # (including the first) is prefixed with a single space.
            answers.append(''.join(' ' + reply for reply in dialogue[1:]))
print(f'questions length: {len(questions)}\tanswers length: {len(answers)}')

In [None]:
# Peek at the first few collected answers.
answers[:5]

In [None]:
# Some answers come out of yaml.safe_load as dicts instead of strings, which
# breaks the tagging step below with:
#   TypeError: can only concatenate str (not "dict") to str
# Presumably the reply text contained "key: value", which YAML reads as a
# mapping, so each dict is rebuilt as "key: value" text.
for idx, reply in enumerate(answers):
    if isinstance(reply, dict):
        print(f'dictionary positions: {idx}')
        print(f'dictionaries: {reply}')
        # Keep key and value as separate words (a bare k+v would fuse them)
        # and format through an f-string so non-string values cannot raise.
        answers[idx] = ' '.join(f'{key}: {value}' for key, value in reply.items())
        print(type(answers[idx]))
        # Show the repaired text rather than the stale dict.
        print(answers[idx])

In [None]:
# tagging function
def data_tagged(data:list):
    """Wrap every string in ``data`` with sequence boundary tags.

    Args:
        data: list of strings to tag.

    Returns:
        A new list, in the same order, where each item is
        ``'<start>' + item + '<end>'`` (no whitespace is inserted).
    """
    return ['<start>' + text + '<end>' for text in data]

# Tag both sides of every pair, then preview the first five of each.
questions_tagged, answers_tagged = data_tagged(questions), data_tagged(answers)
print(questions_tagged[:5], answers_tagged[:5])

In [None]:
# One tokenizer is fitted on questions and answers together, so both sides of
# the model share the same word ids.
tokenizer = preprocessing.text.Tokenizer()
corpus_texts = questions_tagged + answers_tagged
tokenizer.fit_on_texts(corpus_texts)
# One slot more than the number of known words — presumably to leave id 0
# free for the padding value (TODO confirm against the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

In [None]:
# vocab list and tokenization function
# Every word known to the tokenizer, in word_index iteration order.
vocab = list(tokenizer.word_index)

def tokenize(data):
    """Turn texts into a post-padded integer matrix.

    Args:
        data: list of strings to encode with the module-level ``tokenizer``.

    Returns:
        A tuple ``(padded, maxlen)`` where ``padded`` is a NumPy array with
        one row per text, padded at the end to ``maxlen`` columns, and
        ``maxlen`` is the token count of the longest text in ``data``.
    """
    sequences = tokenizer.texts_to_sequences(data)
    longest = max(len(sequence) for sequence in sequences)
    padded = preprocessing.sequence.pad_sequences(
        sequences, maxlen=longest, padding='post'
    )
    return np.array(padded), longest

In [None]:
# Encode the tagged questions (encoder input) and tagged answers (decoder
# input). Each side is padded to its own longest sequence, and that length is
# kept because later cells use it to size the model inputs and to pad user text.
encoder_input_data, questions_maxlen = tokenize(questions_tagged)
decoder_input_data, answers_maxlen = tokenize(answers_tagged)
print(f'encoder input data shape: {encoder_input_data.shape}\ndecoder input data shape: {decoder_input_data.shape}')
print(f'encoder input data : {encoder_input_data}\ndecoder input data : {decoder_input_data}')

In [None]:
# Decoder targets for teacher forcing: the answer sequences with their first
# token dropped (so each target is shifted one step ahead of the decoder
# input), padded back to answers_maxlen and one-hot encoded over the vocabulary.
answers_tokenized = [
    sequence[1:] for sequence in tokenizer.texts_to_sequences(answers_tagged)
]
padded_answers = preprocessing.sequence.pad_sequences(
    answers_tokenized, maxlen=answers_maxlen, padding='post'
)
onehot_answers = utils.to_categorical(padded_answers, vocab_size)
decoder_output_data = np.array(onehot_answers)
decoder_output_data.shape

In [None]:
# model architecture
# Encoder: embeds the question and keeps only the final LSTM states.
# Decoder: embeds the (teacher-forced) answer, starts from the encoder states
# and predicts a distribution over the vocabulary at every time step.
EMBEDDING_DIM = 200  # size of each word vector
LATENT_DIM = 256     # LSTM state size; the inference models must use the same size

encoder_inputs = layers.Input(shape=(questions_maxlen,),name='encoder inputs')
encoder_embeddings = layers.Embedding(vocab_size, EMBEDDING_DIM, mask_zero= True)(encoder_inputs)
encoder_output, state_h, state_c = layers.LSTM(LATENT_DIM, return_state= True)(encoder_embeddings)
encoder_states = [state_h, state_c]

# The time dimension is left open (None) instead of being fixed to
# answers_maxlen: these same decoder layers are reused at chat time, where
# they are fed one token per call (an input of shape (1, 1)). A fixed length
# would not match that input. Training on answers_maxlen-long batches is
# unaffected.
decoder_inputs = layers.Input(shape=(None,),name='decoder inputs')
decoder_embeddings = layers.Embedding(vocab_size, EMBEDDING_DIM, mask_zero= True) (decoder_inputs)
decoder_lstm = layers.LSTM(LATENT_DIM, return_state=True, return_sequences=True)
decoder_outputs, _ , _ = decoder_lstm (decoder_embeddings, initial_state= encoder_states)

dense = layers.Dense(vocab_size, activation='softmax')
outputs = dense (decoder_outputs)

model = models.Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.001), loss= 'categorical_crossentropy', metrics= ['accuracy'])

model.summary()
utils.plot_model(model)

In [None]:
# Train on (question, answer-so-far) -> next-token targets.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=16,
    epochs=200,
    verbose=1,
)

In [None]:
# prediction 
def inference():
    """Build the two models used to generate replies from the trained layers.

    Both models reuse the layers and tensors created in the architecture
    cell, so they share the trained weights.

    Returns:
        A tuple ``(encoder_model, decoder_model)``:

        * ``encoder_model`` maps ``encoder_inputs`` to ``encoder_states``.
        * ``decoder_model`` takes ``decoder_inputs`` plus the previous h and c
          states and returns the dense (softmax) output followed by the
          updated h and c states.
    """
    encoder_model = models.Model(encoder_inputs, encoder_states)

    # Take the state size from the trained LSTM instead of repeating the
    # literal, so these inputs cannot drift from the layer they feed.
    state_size = decoder_lstm.units
    decoder_input_state_h = layers.Input(shape=(state_size, ))
    decoder_input_state_c = layers.Input(shape=(state_size, ))
    decoder_states_inputs = [decoder_input_state_h, decoder_input_state_c]

    # Same decoder LSTM as in training, but seeded from explicit state inputs
    # so the caller can carry the states from one step to the next.
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_embeddings, initial_state = decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = dense (decoder_outputs)

    decoder_model = models.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    return encoder_model, decoder_model

def preprocessing_input(input_data:str):
    """Encode one user message the same way the training questions were encoded.

    The text is wrapped in the start/end tags (the encoder was trained on
    tagged questions) and converted with the fitted ``tokenizer``. This
    applies the tokenizer's own normalisation instead of a plain
    ``lower().split()`` and a direct ``word_index`` lookup, which raised
    ``KeyError`` on any word that is not a vocabulary key.

    Args:
        input_data: raw text typed by the user.

    Returns:
        An integer array with a single row, post-padded to
        ``questions_maxlen`` columns.
    """
    tagged = data_tagged([input_data])
    sequences = tokenizer.texts_to_sequences(tagged)
    return preprocessing.sequence.pad_sequences(sequences, maxlen= questions_maxlen, padding= 'post')

In [None]:
# Build the encoder and step-wise decoder models that the chat loop calls.
enc_model, dec_model = inference()

In [None]:
def generate_reply(input_text, max_steps=None):
    """Greedily decode a reply to ``input_text`` one token at a time.

    Args:
        input_text: raw text typed by the user.
        max_steps: upper bound on decoder steps; defaults to ``answers_maxlen``.

    Returns:
        The generated words joined by single spaces, without the end marker.
    """
    step_limit = answers_maxlen if max_steps is None else max_steps
    states_values = list(enc_model.predict(preprocessing_input(input_text), verbose=0))

    # Decoding starts from the 'start' marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['start']

    reply_words = []
    # Bound the number of decoder steps, not the number of emitted words:
    # a sampled id with no word (e.g. the padding id 0) adds no word, so a
    # word-count guard could spin forever.
    for _ in range(step_limit):
        dec_outputs, h, c = dec_model.predict([target_seq] + states_values, verbose=0)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct id -> word lookup instead of scanning the whole word_index.
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'end':
            break
        if sampled_word is not None:
            reply_words.append(sampled_word)

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    # The end marker is never appended, so the reply needs no trimming.
    # Splitting the joined text on ' end' used to cut it short at any word
    # that merely begins with "end".
    return ' '.join(reply_words)


# Interactive chat: type 'bye' to leave the loop.
while True:
    input_data = input('enter your text: ')
    if input_data == 'bye':
        print('see you later <3')
        break
    decoded_translation = generate_reply(input_data)

    print()
    print(f'Bot: {decoded_translation}')
    print('-'*25)