In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
!pwd
path_to_mount = '/content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level'
os.chdir(path_to_mount)
!ls


/content
char-level-chatbot.ipynb  models


In [3]:
# Import libraries

# Parsing 
import glob
import json
import random 
import numpy as np
import pandas as pd 

# Preprocessing & NNs
from keras.models import Sequential, Model, load_model
from keras.layers import LSTM,Dense, Dropout, Embedding, CuDNNLSTM, Bidirectional, Embedding, Input, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from keras.callbacks import ModelCheckpoint, EarlyStopping

import re
import tensorflow as tf
#tf.enable_eager_execution() # evaluates operations immediately without building graphs
# Above does not work with placeholders

# Etc
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import os
import time

%matplotlib inline

Using TensorFlow backend.


In [4]:
# Get absolute paths of files
path_to_mount = '/content/drive/My Drive/Colab Notebooks/ncsr/'
dialogues_regex_folder_path = "/data/dialogues/*.txt"

# Get the absolute paths for each file 
list_of_files = glob.glob(path_to_mount + dialogues_regex_folder_path)
print(list_of_files[:3]) # Visualize the first 3
print(len(list_of_files)) # 47; crashing? Try lower numbers

#list_of_files = random.choices(population=list_of_files, k=10)
#print(len(list_of_files))

['/content/drive/My Drive/Colab Notebooks/ncsr//data/dialogues/AGREEMENT_BOT.txt', '/content/drive/My Drive/Colab Notebooks/ncsr//data/dialogues/APARTMENT_FINDER.txt', '/content/drive/My Drive/Colab Notebooks/ncsr//data/dialogues/CHECK_STATUS.txt']
47


In [5]:
# Parsing
list_of_dicts = [] # Init

# Loop for each file
for filename in list_of_files:
  with open(filename) as f:
      for line in f: # Loop for each line (inside each file)
          list_of_dicts.append(json.loads(line)) # insert in a dictionary

# Create a new dict containing only useful data
new_list_of_dicts = [] 

for old_dict in list_of_dicts:
  # foodict = {k: v for k, v in old_dict.items() if k.startswith('turns')} #TODO REMOVE
  foodict = {k: v for k, v in old_dict.items() if (k == 'turns')} 
  new_list_of_dicts.append(foodict)

list_of_dicts = [] # Free memory
list_of_dicts = new_list_of_dicts 

print(list_of_dicts[:2])

[{'turns': ['Hello how may I help you?', 'i am awesome', 'of course you are', 'and i own rental properties on the moon', 'i doubt you own a property in the moon', 'just kidding. i own them on Earth', "that's a nice joke", 'because i am a billionaire!', "i don't seem to know you", 'and i programmed you', 'i am the programmer']}, {'turns': ['Hello how may I help you?', 'I am the king of the world', 'I agree that you are the king of the world', 'I can have any woman I want!', 'I agree that you can have any woman you desire.', 'Even you bot, if I were in to AIs', 'Agreed.', "Really? you're awfully agreeable aren't you", 'I agree that I am awfully agreeable, yes.', 'Having an agreement bot seems like a useless thing to have. I need some spice in my life!', 'I really agree with that. I am rather useles.']}]


In [0]:
# Init matrices
questions = []
answers = []

# We assume that the first answer by the bot (aka "Hello, how may I help you?") is returned after a user greeting
# This is used in order to ensure that the dataset will be even and each question is paired with an answeer.
# That's why we create a mini random catalog of artificial 'ghost' user greetings.
matrix_greetings = ["Hey", "Hi", " "]

# A similar situation happens in the corner case when the last sentence is from the user.
# As said, each sentence from the user should be paired with a sentence from the bot.
# That's why we will in this case add an artificial one.
matrix_byes = ["Ok", " ", "Bye"]

# For each dictionary in the list
for dictionary in list_of_dicts:
  matrix_QA = dictionary['turns']
  
  # Append a first random greeting, as explained above
  #questions.append(random.choice(matrix_greetings))
    
  # In order to split the QAs to 2 matrices (questions & answers),
  # we will use a flag to indicate if the sentence is given from the bot or from the user
  #bot_flag = True # Init

  # For each Q/A in the matrix
  matrix_QA.pop(0) # Remove "hey how can i help you"

  bot_flag = False
  
  for sentence in matrix_QA:


    if bot_flag == True:
      answers.append(sentence) # Used for bot's answers
      bot_flag = False # Switch
      continue

    else:
      questions.append(sentence) # Used for user's questions
      bot_flag = True # Switch
      #continue

  # The last loop (ideally) ends with a bot's answer, thus making bot_flag equal to False.
  # Although, with data visualization and exploring, we can see that this does not happen all the time.

  # Corner case: If the last answers was from the user, 
  # then we need to add one artificial 'ghost' response from the bot to make the dataset even.
  if bot_flag == True: 
    answers.append(random.choice(matrix_byes))


In [7]:
assert len(questions) == len(answers), "ERROR: The length of the questions and answer matrices are different."
# If it does not return any warning/error, then everything is good.

print(len(questions)) # We have 238051 QAs (if we load all 47 texts)

# Due to really high memory usage on TensorFlow training,
# we need to keep a lower number of dialogs.
# Also, we will shuffle them to ensure that our bot isn't overfitting on
# limited goal-oriented dialogs like setting an alarm or a exlplaining a catalogue
# Last, but not least, this way will enrich the vocabulary of our bot.

questions, answers = shuffle(np.array(questions), np.array(answers))

print(questions[:3])
print(answers[:3])

200167
['Thanks for the info.' 'Richard who?' 'yes please']
["You're welcome" 'williams'
 'Okay I have gone ahead and change the three alarms next week to "Feed Cat." Can I assist you with anything else?']


In [8]:
NUM_DIALOGS = 40000
questions = questions[:NUM_DIALOGS]
answers = answers[:NUM_DIALOGS]

print(len(answers))


40000


In [0]:
# Input: questions or answers matrix
#
# Returns: Modified matrix, with special tokens appended 
# in the start/end of each string

SOS_TOKEN = '\t' # Start of Sentence
EOS_TOKEN = '\n' # End of Sentence

def add_extra_tokens(matrix):

  new_matrix = []
  for sequence in matrix:
    sequence = SOS_TOKEN + " " + sequence + " " + EOS_TOKEN
    new_matrix.append(sequence)

  return new_matrix


In [0]:
# questions = add_extra_tokens(questions)
answers = add_extra_tokens(answers)


In [0]:
input_characters = set()
target_characters = set()


for i in range(len(answers)):
    for char_input in questions[i]:
        if char_input not in input_characters:
            input_characters.add(char_input)
    
    for char_target in answers[i]:
        if char_target not in target_characters:
            target_characters.add(char_target)


In [0]:
input_texts = []
input_texts = questions

target_texts = []
target_texts = answers

In [13]:
print(input_texts[152])
print(target_texts[152])

Yes, I'm absolutely sure. Erase it please.
	 Your appointment for next Tuesday at 2:00.has been erased 



In [14]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 40000
Number of unique input tokens: 90
Number of unique output tokens: 92
Max sequence length for inputs: 277
Max sequence length for outputs: 622


In [15]:
print(input_characters)
print(target_characters)

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~']
['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~']


In [0]:
# Let's try one-hot encodings

# Pretrained embeddings are better! Check GloVe or word2vec
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [0]:
import numpy as np

# Check Memory Usage here
encoder_input_data = np.zeros(
  (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
  dtype='float32')
decoder_input_data = np.zeros(
  (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype='float32')
decoder_target_data = np.zeros(
  (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
  dtype='float32')

The encoder_input_data will consist of samples (NUM_DIALOGS) of the maximum 
sequence length (193) filled with the respective one-hot-encoded tokens (number of unique tokens) (in this case a vector of length 90)

In [18]:
encoder_input_data.shape

(40000, 277, 90)


The decoder_input_data and the decoder_target_data are both constructed in the same way as the input data for the encoder. We need to construct those two sequences because we're training our model through a process called teacher forcing, where the decoder learns to generate decoder_target_data[t+1...] given decoder_input_data[...t] while taking into account the input sequence via the encoder's internal state. Therefore we have to offset decoder_target_data by one timestep.

Time to fill in the data with the actual tokens. For that we iterate over all input and target texts and insert the respective one-hot encoding each character in the sequence.

In [0]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
        
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        
        if t > 0:
        # decoder_target_data will be ahead by one timestep
        # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [20]:
encoder_input_data[155].shape # (max inp length, no of unique tokens)

(277, 90)

Now it's time to take a closer look at our encoder-decoder model. Our model will consist of two LSTMs. One will serve as an encoder, encoding the input sequence and producing internal state vectors which serve as conditioning for the decoder. The decoder, another LSTM, is responsible for predicting the individual characters of the target sequence. Its initial state is set to the state vectors from the encoder. This passes information about what to generate from the encoder to the decoder.

In [0]:
import keras, tensorflow
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [0]:
latent_dim = 64  # latent dimensionality of the encoding space

In [23]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# Internal states should be fed to the decoder
encoder = LSTM(latent_dim, return_state=True) 

# We don't care about the encoder outputs
_, state_h, state_c = encoder(encoder_inputs)

# Pass the hidden and cell state in a list
encoder_states = [state_h, state_c]






When creating the LSTM, we now want it to return full output sequences as well as the internal state vectors. We're not using the decoder's internal states during training, but we will need them later for inference.


To arrive at the individual characters from the decoder's output we attach a Dense layer to the decoder's LSTM outputs where the number of units match the number of decoder tokens. This way we can just use a softmax activation for the dense layer's outputs and train the whole model using a categorical cross-entropy loss - a standard choice for classification problems.

In [0]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# We need to return both states (will be used during inference) and sequences
# Notice that the latent dimensionality here is the same for the encoder
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Pass decoder inputs and encoder states for initialization
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states) 

# One unit for each decoder token and a softmax activation function
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

In [0]:
# Keras Functional API

# Specify inputs and outputs
model = Model(inputs=[encoder_inputs, decoder_inputs], 
              outputs=decoder_outputs)

In [26]:
# Compile the Model

# Define optimizer and loss function
# Categorical cross entropy is one standard loss function for multi-classification

# Change optimizer & put objects
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()



Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 90)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 92)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 56), (None,  32928       input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 56), ( 33376       input_2[0][0]                    
                                                                 lstm_1[0][1]             

In [35]:
VAL_SPLIT = 0.2 # Ratio of the data that we will validate on
BATCH_SIZE = 512  # Batch size for training
EPOCHS = 100 # Number of epochs to train for

filepath= '/models/checkpoint.h5'
mounted ='/content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level'
filepath = mounted + filepath

checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                                save_best_only=True,
                                mode='min')

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.02, patience=2,
                                verbose=0, 
                                mode='auto')

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size = BATCH_SIZE,
          epochs = EPOCHS,
          validation_split = VAL_SPLIT,
          callbacks=[checkpoint, early_stopping], 
          verbose=1)


Train on 32000 samples, validate on 8000 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.19819, saving model to /content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level/models/checkpoint.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.19819 to 0.18617, saving model to /content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level/models/checkpoint.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.18617 to 0.17590, saving model to /content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level/models/checkpoint.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.17590 to 0.16687, saving model to /content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level/models/checkpoint.h5
Epoch 5/100

Epoch 00005: val_loss improved from 0.16687 to 0.16007, saving model to /content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level/models/checkpoint.h5


<keras.callbacks.History at 0x7f6251584a20>

In [0]:
# Directory to save model
filepath= '/models/seq2seq-char.h5'
mounted ='/content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level'
filepath = mounted + filepath

model.save(filepath)

In [29]:
'''
# Load model from directory
filepath= '/models/seq2seq-char.h5'
mounted ='/content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level'
filepath = mounted + filepath

model = load_model(filepath)
'''

"\n# Load model from directory\nfilepath= '/models/seq2seq-char.h5'\nmounted ='/content/drive/My Drive/Colab Notebooks/ncsr/version-keras-char-level'\nfilepath = mounted + filepath\n\nmodel = load_model(filepath)\n"


The inference mode works a bit differently than the training procedure. The procedure can be broken down into 4 steps:

1. Encode the input sequence, return its internal states.

2. Run the decoder using just the start-of-sequence character as input and the encoder internal states as the decoder's initial states.

3. Append the character predicted (after lookup of the token) by the decoder to the decoded sequence.

4. Repeat the process with the previously predicted character token as input and updates internal states.

In [0]:
# TODO: Serialize encoder inputs, states, latent dim
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
  decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
  [decoder_inputs] + decoder_states_inputs,
  [decoder_outputs] + decoder_states)

In [0]:
# reverse-lookup token index to turn sequences back to characters
reverse_input_char_index = dict(
  (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
  (i, char) for char, i in target_token_index.items())

With that we can create a function to perform the whole process of decoding a given input sequence (inputs already tokenized).

In [0]:
def decode_sequence(input_seq):
  # encode the input sequence to get the internal state vectors.
  states_value = encoder_model.predict(input_seq)
  
  # generate empty target sequence of length 1 with only the start character
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_token_index[SOS_TOKEN]] = 1.
  
  # output sequence loop
  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
    output_tokens, h, c = decoder_model.predict(
      [target_seq] + states_value)
    
    # sample a token and add the corresponding character to the 
    # decoded sequence
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = reverse_target_char_index[sampled_token_index]
    decoded_sentence += sampled_char
    
    # check for the exit condition: either hitting max length
    # or predicting the 'stop' character
    if (sampled_char == EOS_TOKEN or 
        len(decoded_sentence) > max_decoder_seq_length):
      stop_condition = True
      
    # update the target sequence (length 1).
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    
    # update states
    states_value = [h, c]
    
  return decoded_sentence

In [0]:
def answer_to(input_sentence):
    #input_sentence = "How are you?"

    test_sentence_tokenized = np.zeros((1, max_encoder_seq_length,
                                num_encoder_tokens), dtype='float32')

    for t, char in enumerate(input_sentence):
        test_sentence_tokenized[0, t, input_token_index[char]] = 1.

    answer = decode_sequence(test_sentence_tokenized)
    
    return str(answer)

In [0]:
answer = answer_to("hello wassup")
print(answer)

answer = answer_to("do you have time for today?")
print(answer)

answer = answer_to("can i ")
print(answer)

In [34]:
print("Enter /q to quit")
while (1):
  
  user_input = input("User: ")

  user_input = str(user_input)

  if user_input == '/q':
    print("Quitting chat..")
    break;
  else:
    print("Bot " + str(answer_to(user_input)))

Enter /q to quit
User: hi
Bot    a                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
User: hello
Bot    a                                                                                                                                                                                                                                                                                                                                      