In [None]:
import keras
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Dense, Flatten, Embedding
import numpy as np
import re
from bot import BOT

In [None]:
# Hyperparameters
vocab_size = 2000  # passed to BOT(...) and used as the Embedding layer's input_dim
max_length = 20  # every token-id sequence is padded/truncated to this length via pad_sequences

In [None]:
# Text cleaning and tokenizer setup.
# `bot` is the project-local BOT tokenizer (described by the author as a BPE tokenizer),
# constructed with vocab_size; it is fitted on the cleaned questions further down in this cell.
bot = BOT(vocab_size)

def tokenize(text):
    """Clean ``text`` and return its NLTK word tokens joined by single spaces.

    Every character that is not an ASCII letter, a digit, a space, or one of
    ``? ! . ' ,`` is removed before tokenizing.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The tokens produced by ``word_tokenize`` separated by one space.
    """
    # The raw-string prefix must sit outside the quotes; inside them it is a literal
    # "r" that becomes part of the pattern. "A-Za-z" is spelled out because the
    # "A-z" range also covers the punctuation between "Z" and "a" ([ \ ] ^ _ `).
    clean_text = re.sub(r"[^A-Za-z0-9?!.', ]", '', str(text))
    return ' '.join(word_tokenize(clean_text))


# Build the training corpus from dialogs.txt: one "question<TAB>answer" pair per line.
x = []
y = []

with open("dialogs.txt", 'r', encoding="utf-8") as file:
    for line in file:
        line = line.rstrip("\n")
        # Skip blank or malformed lines instead of crashing on tuple unpacking.
        if '\t' not in line:
            continue
        # Split on the first tab only so an answer containing a tab stays intact.
        question, answer = line.split('\t', 1)
        # Lowercase here because preprocess_input lowercases user text at inference
        # time; training and inference must feed the tokenizer the same casing.
        x.append(tokenize(question.lower()))
        y.append(answer)


# Fit the tokenizer on the cleaned questions.
bot.fit(x)

# NOTE(review): BOT.decode is used here to turn text into the sequence handed to
# pad_sequences (presumably token ids) — confirm the naming against the BOT class.
x = [bot.decode(i) for i in x]
x = pad_sequences(x, maxlen=max_length)

In [None]:
# Save the tokenizer as a file.
# No path is passed, so the destination is decided inside BOT.save — TODO confirm where.
bot.save()

# Load the tokenizer from a file.
# The return value is discarded, so this presumably restores state into `bot`
# in place — verify against BOT.load.
bot.load()

In [None]:
# Map every distinct answer string to an integer class id, then one-hot encode the ids.
label_encoder = LabelEncoder()
answer_ids = label_encoder.fit_transform(y)

# One output unit per distinct answer.
output_size = len(label_encoder.classes_)
y = to_categorical(answer_ids)

In [None]:
# Evaluation slice: the first 100 rows of x and y.
# These rows are views of the same arrays, not a separate hold-out copy.
test_x = x[:100]
test_y = y[:100]

In [None]:
# Model: embedding -> flatten -> three ReLU dense layers (64, 32, 16) -> softmax over answers
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length))
model.add(Flatten())
for hidden_units in (64, 32, 16):
    model.add(Dense(units=hidden_units, activation='relu'))
# One softmax unit per answer class.
model.add(Dense(units=output_size, activation='softmax'))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# Train on the whole of x / y for 50 epochs.
model.fit(x, y, epochs=50)

# evaluate() yields the loss followed by the single compiled metric (accuracy).
val_loss, val_acc = model.evaluate(test_x, test_y)
print(val_loss, val_acc, sep="\n")

In [None]:
# Persist the trained model, load it back from disk, and re-evaluate the reloaded copy.
model_path = "ChatBot.hdf5"
model.save(model_path)

model = keras.models.load_model(model_path)
val_loss, val_acc = model.evaluate(test_x, test_y)
print(val_loss, val_acc, sep="\n")

In [None]:
def preprocess_input(text):
    """Turn one raw user message into a model-ready batch.

    The message is lowercased, cleaned with ``tokenize``, passed through the
    fitted ``bot`` tokenizer, and padded/truncated to ``max_length``.

    Args:
        text (str): Raw user message.

    Returns:
        The result of ``pad_sequences`` applied to a one-element list, i.e. a
        single-sequence batch suitable for ``model.predict``.
    """
    text_lower = text.lower()
    tokenize_text = tokenize(text_lower)
    # The tokenizer instance in this notebook is `bot`; the name `bpe` is never defined.
    decode_text = bot.decode(tokenize_text)
    return pad_sequences([decode_text], maxlen=max_length)


# Interactive chat loop. Type "quit" or "exit" (or interrupt the kernel / send EOF)
# to leave the loop so the notebook can run to completion.
EXIT_COMMANDS = {"quit", "exit"}

while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Stop cleanly instead of leaving a traceback in the cell output.
        break

    stripped_input = user_input.strip()
    if stripped_input.lower() in EXIT_COMMANDS:
        break
    if not stripped_input:
        # Nothing to classify: an all-padding input would still produce an arbitrary answer.
        continue

    print(f"You: {user_input}")
    preprocessed_input = preprocess_input(user_input)
    # verbose=0 keeps Keras' per-call progress bar out of the conversation.
    prediction = model.predict(preprocessed_input, verbose=0)
    # Highest-probability class id -> original answer string.
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    print(f"Bot: {predicted_label[0]}")
    print()