# ChatBot

In [1]:
import numpy as np
import tensorflow as tf
import re # for text processing
import time # to see epochs time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Data PreProcessing
Dataset: the Cornell Movie-Dialogs Corpus (`data/movie_lines.txt` and `data/movie_conversations.txt`).

In [2]:
# Importing the dataset.
# Each file is read inside a `with` block so its handle is closed as soon as
# the text has been loaded (the handles were previously left open).
# errors="ignore" drops bytes that cannot be decoded as UTF-8.
with open("data/movie_lines.txt", encoding = "utf-8", errors = "ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("data/movie_conversations.txt", encoding = "utf-8", errors = "ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

print("NumOf sentences in conversations:", len(lines))
print(lines[:3])
print("NumOf conversations:", len(conversations))
print(conversations[:3])

NumOf sentences in conversations: 304714
['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!', 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!', 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.']
NumOf conversations: 83098
["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']"]


In [3]:
# Map every line id to the text of that line.
# A usable row has exactly five " +++$+++ "-separated fields: the id comes
# first and the utterance last. The three fields in between are metadata that
# is not needed for training. Rows with any other field count are skipped.
id2line = {}
for raw_row in lines:
    fields = raw_row.split(" +++$+++ ")
    if len(fields) != 5:
        continue
    line_id, utterance = fields[0], fields[4]
    id2line[line_id] = utterance

print("NumOf sentences in conversations:", len(id2line))
# Preview: print the first three stored utterances, then stop.
for shown, utterance in enumerate(id2line.values(), start=1):
    print(utterance)
    if shown == 3:
        break

NumOf sentences in conversations: 304713
They do not!
They do to!
I hope so.


In [4]:
# Build the list of conversations, each one a list of line ids.
# For every row: take the last " +++$+++ " field, strip its first and last
# character (the surrounding brackets), remove quotes and spaces, and split
# on commas. The final element of `conversations` is left out because the
# last row is empty.
conversations_ids = [
    row.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "").split(",")
    for row in conversations[:-1]
]

print("NumOf conversations:", len(conversations_ids))
print(conversations_ids[:3])

NumOf conversations: 83097
[['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203']]


In [5]:
# Split every conversation into (question, answer) pairs: each line is the
# question for the line that directly follows it in the same conversation.
questions = []
answers = []

for line_ids in conversations_ids:
    # zip(seq, seq[1:]) walks over consecutive pairs of line ids.
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])

# Preview the first three pairs.
for idx in range(3):
    print(questions[idx], ">>", answers[idx])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. >> Well, I thought we'd start with pronunciation, if that's okay with you.
Well, I thought we'd start with pronunciation, if that's okay with you. >> Not the hacking and gagging and spitting part.  Please.
Not the hacking and gagging and spitting part.  Please. >> Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?


In [6]:
# doing the first cleaning of the texts
def clean_text(text):
    """Lower-case `text`, expand a fixed set of contractions and strip punctuation.

    The substitutions are applied one after another, in the order listed
    below, each one on the result of the previous one. The final pattern
    deletes the listed punctuation characters; note that "!" and the
    apostrophe are not in that set, so they are kept.

    Args:
        text (str): raw sentence.

    Returns:
        str: the cleaned sentence.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Run every question and every answer through clean_text.
cleaned_questions = [clean_text(question) for question in questions]
cleaned_answers = [clean_text(answer) for answer in answers]

print("NumOf cleaned questions, answers:", len(cleaned_questions), len(cleaned_answers))
# Preview the first three cleaned pairs.
for idx in range(3):
    print(cleaned_questions[idx], ">>", cleaned_answers[idx])

NumOf cleaned questions, answers: 221616 221616
can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again >> well i thought we would start with pronunciation if that is okay with you
well i thought we would start with pronunciation if that is okay with you >> not the hacking and gagging and spitting part  please
not the hacking and gagging and spitting part  please >> okay then how 'bout we try out some french cuisine  saturday  night


In [7]:
# Count how often each whitespace-separated word occurs across all cleaned
# questions and answers (questions are counted first, then answers).
# The counts are used afterwards to drop words below a frequency threshold.
word2count = {}

for sentences in (cleaned_questions, cleaned_answers):
    for sentence in sentences:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1

print("Number of words:", len(word2count))

Number of words: 76422


In [8]:
# Create two dictionaries that map the question words and the answer words
# to a unique integer. Only words that occur at least `treshold` times are
# kept; both dictionaries receive the same word -> id entries.
treshold = 20 # per the original note: a higher value reduces training time
questionswords2int = {}
answerswords2int = {}
word_number = 0 # next free id; equals the number of kept words after the loop
for word, count in word2count.items():
    if count < treshold:
        continue
    questionswords2int[word] = answerswords2int[word] = word_number
    word_number += 1

print("Number of frequent words:", len(questionswords2int))
# Preview the first five (word, id) entries.
for shown, (word, word_id) in enumerate(questionswords2int.items(), start=1):
    print(word, word_id)
    if shown == 5:
        break

Number of frequent words: 8821
can 0
we 1
make 2
this 3
quick 4


In [9]:
# Adding the special tokens to both dictionaries; they are necessary for the
# encoding/decoding layers.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>'] # <EOS> = EndOfString, <SOS> = StartOfString
for token in tokens:
    # Every token must get its own id. Previously all four were assigned the
    # same, never-incremented `word_number`, which made them indistinguishable
    # and left only one of them in the inverse dictionary below.
    # The existing ids run from 0 to len(dict) - 1, so len(dict) is the next
    # free id. setdefault keeps an id that is already present, which makes
    # re-running this cell harmless.
    questionswords2int.setdefault(token, len(questionswords2int))
    answerswords2int.setdefault(token, len(answerswords2int))

# creating the inverse dictionary of answers (id -> word)
answersint2word = {val:key for key, val in answerswords2int.items()}

# adding the End Of String token to the end of every answer;
# answers that already carry it are left alone so a re-run does not append
# a second ' <EOS>'
for i, answer in enumerate(cleaned_answers):
    if not answer.endswith(' <EOS>'):
        cleaned_answers[i] = answer + ' <EOS>'

In [10]:
# Translate all the questions and the answers into lists of integers.
# A word that is missing from the dictionary (it was filtered out) is
# replaced with the id of <OUT>.
questions_into_int = [
    [
        questionswords2int[word] if word in questionswords2int else questionswords2int["<OUT>"]
        for word in question.split()
    ]
    for question in cleaned_questions
]

answers_into_int = [
    [
        answerswords2int[word] if word in answerswords2int else answerswords2int["<OUT>"]
        for word in answer.split()
    ]
    for answer in cleaned_answers
]

In [11]:
# Sort questions and answers by the length of the question, shortest first,
# so that training starts with short sentences before longer ones.
# Questions with more than 25 words (and empty ones) are not taken; each
# answer stays paired with its question.
sorted_clean_questions = []
sorted_clean_answers = []
for target_length in range(1, 25 + 1):
    for index, question in enumerate(questions_into_int):
        if len(question) != target_length:
            continue
        sorted_clean_questions.append(question)
        sorted_clean_answers.append(answers_into_int[index])
print(len(sorted_clean_questions), len(sorted_clean_answers))

203949 203949


## Architecture of ChatBot: Seq2Seq Model

In [12]:
# creating placeholders for the input and the targets
def model_inputs():
    """Create the placeholders for the inputs, the targets and two scalars.

    Returns:
        tuple: (inputs, targets, lr, keep_prob).
        `inputs` and `targets` are int32 placeholders with two unspecified
        dimensions, named "input" and "target". `lr` and `keep_prob` are
        float32 placeholders created without a shape, named "learning_rate"
        and "keep_prob" (the latter controls the dropout rate).
    """
    inputs, targets = (
        tf.placeholder(tf.int32, [None, None], name = label)
        for label in ("input", "target")
    )
    lr, keep_prob = (
        tf.placeholder(tf.float32, name = label)
        for label in ("learning_rate", "keep_prob")
    )
    return inputs, targets, lr, keep_prob


# preprocessing the targets
# targets must be batches, bcs decoder accepts targets as batches, doesn't accept a single target
def preprocess_targets(targets, word2int, batch_size):
    """Prepend <SOS> to every target row and drop the last column.

    Args:
        targets: tensor of target ids; sliced as a 2-D [row, column] tensor.
        word2int (dict): vocabulary; only word2int["<SOS>"] is read.
        batch_size (int): number of rows to build and to slice.

    Returns:
        The concatenation, along axis 1, of a [batch_size, 1] column filled
        with the <SOS> id and `targets` without its last column.
    """
    sos_column = tf.fill([batch_size, 1], word2int["<SOS>"])
    # begin [0, 0], end [batch_size, -1], stride 1: every row up to
    # batch_size, every column except the last one.
    without_last_column = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, without_last_column], axis=1)

In [22]:
# creating the encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    """Build the bidirectional, multi-layer LSTM encoder and return its state.

    Args:
        rnn_inputs: embedded encoder inputs, passed as `inputs` to
            tf.nn.bidirectional_dynamic_rnn.
        rnn_size (int): number of units of every LSTM cell.
        num_layers (int): number of stacked LSTM layers per direction.
        keep_prob: keep probability applied to the input of every layer
            through DropoutWrapper(input_keep_prob=...).
        sequence_length: forwarded unchanged as `sequence_length` to
            tf.nn.bidirectional_dynamic_rnn.

    Returns:
        The state (second return value) of tf.nn.bidirectional_dynamic_rnn;
        the per-step outputs are discarded.
    """
    def build_stack():
        # A fresh cell object is created for every layer. Repeating a single
        # cell object with `[cell] * num_layers` is rejected by TensorFlow 1.1
        # and makes all layers share one set of weights from TensorFlow 1.2 on.
        layers = []
        for _ in range(num_layers):
            lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
            layers.append(tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob))
        return tf.contrib.rnn.MultiRNNCell(layers)

    # The forward and backward directions get their own stack for the same
    # reason: a shared cell object would tie their weights together.
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = build_stack(),
                                                       cell_bw = build_stack(),
                                                       sequence_length = sequence_length,
                                                       inputs = rnn_inputs,
                                                       dtype = tf.float32)
    return encoder_state


# decoding the training set
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input,
                       sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Run the attention decoder in training mode and project its output.

    Args:
        encoder_state: encoder state; only encoder_state[0] is used.
        decoder_cell: RNN cell of the decoder; its `output_size` sets the
            attention size.
        decoder_embedded_input: embedded decoder inputs.
        sequence_length: passed to dynamic_rnn_decoder as the sequence length.
        decoding_scope: variable scope handed to dynamic_rnn_decoder.
        output_function: callable applied to the dropped-out decoder output.
        keep_prob: keep probability for the dropout on the decoder output.
        batch_size (int): first dimension of the zero attention states.

    Returns:
        output_function(dropout(decoder output)).
    """
    seq2seq = tf.contrib.seq2seq
    num_units = decoder_cell.output_size
    # Attention states start as zeros of shape [batch_size, 1, num_units].
    attention_states = tf.zeros([batch_size, 1, num_units])
    keys, values, score_fn, construct_fn = seq2seq.prepare_attention(
        attention_states, attention_option = "bahdanau", num_units=num_units)
    train_fn = seq2seq.attention_decoder_fn_train(
        encoder_state[0], keys, values, score_fn, construct_fn, name="attn_dec_train")
    # Only the first of the three returned values is needed.
    raw_output, _final_state, _final_context = seq2seq.dynamic_rnn_decoder(
        decoder_cell, train_fn, decoder_embedded_input, sequence_length, scope=decoding_scope)
    return output_function(tf.nn.dropout(raw_output, keep_prob))


# decoding the test/validation set
# not only predicting the question type, but also create an answer for the question
# validation set is being used like cross validation, reduce overfitting and increase the accuracy
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words,
                       sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Run the attention decoder in inference mode and return its predictions.

    Args:
        encoder_state: encoder state; only encoder_state[0] is used.
        decoder_cell: RNN cell of the decoder; its `output_size` sets the
            attention size.
        decoder_embeddings_matrix: embedding matrix given to the inference
            decoder function.
        sos_id: id of the <SOS> token.
        eos_id: id of the <EOS> token.
        maximum_length: maximum number of decoding steps.
        num_words: vocabulary size given to the inference decoder function.
        sequence_length: not used by this function; kept in the signature
            for existing callers.
        decoding_scope: variable scope handed to dynamic_rnn_decoder.
        output_function: projection given to the inference decoder function.
        keep_prob: not used by this function (no dropout at inference).
        batch_size (int): first dimension of the zero attention states.

    Returns:
        The first value returned by tf.contrib.seq2seq.dynamic_rnn_decoder.
    """
    # Attention states start as zeros of shape [batch_size, 1, output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    # attention = at
    at_keys, at_values, at_score_function, at_construct_function = tf.contrib.seq2seq.prepare_attention(
        attention_states, attention_option = "bahdanau", num_units=decoder_cell.output_size)
    # Fix: the matrix was referenced as `decoder_embedding_matrix`, a name
    # that does not exist (the parameter is `decoder_embeddings_matrix`), so
    # every call raised NameError.
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_function, encoder_state[0], at_keys, at_values, at_score_function, at_construct_function,
        decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, name="attn_dec_inf")
    test_predictions, _, _, = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell, test_decoder_function, scope=decoding_scope)
    return test_predictions

In [14]:
# Creating the decoder RNN
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length,
               rnn_size, num_layers, word2int, keep_prob, batch_size):
    """Build the decoder and return its training and test predictions.

    Everything is created inside the "decoding" variable scope. The training
    decoder is built first; the scope is then switched to reuse mode so that
    the test decoder is built on the same variables.

    Args:
        decoder_embedded_input: embedded decoder inputs (training decoder).
        decoder_embeddings_matrix: embedding matrix (test decoder).
        encoder_state: state produced by the encoder.
        num_words (int): size of the fully connected output layer.
        sequence_length: sequence length; the test decoder is limited to
            `sequence_length - 1` steps.
        rnn_size (int): number of units of every LSTM cell.
        num_layers (int): number of stacked LSTM layers.
        word2int (dict): vocabulary; read for "<SOS>" and "<EOS>".
        keep_prob: keep probability for the dropout.
        batch_size (int): batch size passed to both decoders.

    Returns:
        tuple: (training_predictions, test_predictions).
    """
    with tf.variable_scope("decoding") as decoding_scope:
        # A fresh cell object is created for every layer. Repeating a single
        # cell object with `[cell] * num_layers` is rejected by TensorFlow 1.1
        # and makes all layers share one set of weights from TensorFlow 1.2 on.
        layers = []
        for _ in range(num_layers):
            lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
            layers.append(tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob))
        decoder_cell = tf.contrib.rnn.MultiRNNCell(layers)

        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()

        def output_function(x):
            # Linear projection (activation None) onto `num_words` outputs.
            return tf.contrib.layers.fully_connected(x, num_words, None, scope=decoding_scope,
                                                     weights_initializer=weights,
                                                     biases_initializer=biases)

        training_predictions = decode_training_set(encoder_state, decoder_cell, decoder_embedded_input,
                                                  sequence_length, decoding_scope, output_function,
                                                  keep_prob, batch_size)
        # Fix: the method is `reuse_variables`; `reused_variables` does not
        # exist and raised AttributeError.
        decoding_scope.reuse_variables()
        # Fix: arguments are passed by keyword. The positional call omitted
        # `sequence_length`, which shifted `decoding_scope` and everything
        # after it by one parameter and left `batch_size` missing.
        test_predictions = decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix,
                                          sos_id=word2int['<SOS>'],
                                          eos_id=word2int['<EOS>'],
                                          maximum_length=sequence_length-1,
                                          num_words=num_words,
                                          sequence_length=sequence_length,
                                          decoding_scope=decoding_scope,
                                          output_function=output_function,
                                          keep_prob=keep_prob,
                                          batch_size=batch_size)
        return training_predictions, test_predictions

In [15]:
# Building the Seq2Seq Model (the brain of network)
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words,
                 questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size,
                 num_layers, questionswords2int):
    """Assemble the full Seq2Seq graph: embeddings, encoder and decoder.

    Args:
        inputs: encoder input ids (the questions).
        targets: decoder target ids (the answers).
        keep_prob: keep probability for the dropout.
        batch_size (int): batch size.
        sequence_length: sequence length passed to encoder and decoder.
        answers_num_words (int): size of the answers vocabulary.
        questions_num_words (int): size of the questions vocabulary.
        encoder_embedding_size (int): embedding dimension of the encoder.
        decoder_embedding_size (int): embedding dimension of the decoder.
        rnn_size (int): number of units of every LSTM cell.
        num_layers (int): number of stacked LSTM layers.
        questionswords2int (dict): vocabulary used to look up <SOS>/<EOS>.

    Returns:
        tuple: (training_predictions, test_predictions) from decoder_rnn.
    """
    # Fix: the vocabulary sizes were swapped. The encoder embeds question ids,
    # so its table is sized by the questions vocabulary; the decoder embeds
    # and predicts answer ids, so its matrix and output layer are sized by
    # the answers vocabulary. Nothing changes when both sizes are equal.
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs, questions_num_words+1,
                                                             encoder_embedding_size,
                                                             initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    # NOTE(review): <SOS>/<EOS> for the answers are looked up in
    # `questionswords2int`; this is only correct while both vocabularies give
    # the special tokens the same ids - confirm, or pass the answers dictionary.
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([answers_num_words+1, decoder_embedding_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix,
                                                        encoder_state, answers_num_words, sequence_length,
                                                        rnn_size, num_layers, questionswords2int,
                                                        keep_prob, batch_size)
    return training_predictions, test_predictions

### Training the Seq2Seq Model

In [16]:
# Hyperparameters of the model and of the training run (commonly used values).

# -- training schedule
epochs = 100
batch_size = 64
learning_rate = 1e-2
learning_rate_decay = 0.9   # per the original note: the learning rate is reduced to 90% over iterations
min_learning_rate = 1e-4    # lower bound, so the learning rate never goes to zero
keep_probability = 0.5      # presumably the value fed to `keep_prob` while training - confirm in the training loop

# -- network size
rnn_size = 512
num_layers = 3
encoding_embedding_size = decoding_embedding_size = 512

In [23]:
# Start from an empty default graph and open an interactive session.
tf.reset_default_graph()
session = tf.InteractiveSession()

# Placeholders that are fed at run time.
inputs, targets, lr, keep_prob = model_inputs()

# Sequence length placeholder; falls back to 25 when nothing is fed
# (the same limit of 25 words is used when the questions are sorted by length).
sequence_length = tf.placeholder_with_default(25, None, name = 'sequence_length')

# Shape of the inputs tensor.
input_shape = tf.shape(inputs)

# Training and test predictions. The inputs are reversed along their last
# axis before they are given to the model.
training_predictions, test_predictions = seq2seq_model(
    inputs = tf.reverse(inputs, [-1]),
    targets = targets,
    keep_prob = keep_prob,
    batch_size = batch_size,
    sequence_length = sequence_length,
    answers_num_words = len(answerswords2int),
    questions_num_words = len(questionswords2int),
    encoder_embedding_size = encoding_embedding_size,
    decoder_embedding_size = decoding_embedding_size,
    rnn_size = rnn_size,
    num_layers = num_layers,
    questionswords2int = questionswords2int,
)

AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'prepare_attention'