<a href="https://colab.research.google.com/github/dawidkubicki/chatbot-tensorflow/blob/main/Chatbot_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Connect to the GDrive

In [301]:
#from google.colab import drive
#drive.mount('/content/gdrive')

## Change root path

In [302]:
#root_path = '/content/gdrive/My Drive/Colab/'

import os 

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/drive/My Drive/"

# choose where you want your project files to be saved
project_folder = "Colab/"

def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

  # create a test file to make sure it shows up in the right place
  !touch 'test_file.txt'
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

create_and_set_working_directory(project_folder)


Your working directory was changed to /content/drive/My Drive/Colab/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


## Download dataset



In [303]:
#!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
#!unzip cornell_movie_dialogs_corpus.zip

## Firstly import all necessaries libraries


In [304]:
import numpy as np
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import re
import time
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Preprocessing

## Import dataset to the project

In [305]:
lines = open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

## Create a dictionary that maps each line and it's ID

In [306]:
id2line = {} #init dictionary
for line in lines:
  _line = line.split(' +++$+++ ')
  if len(_line) == 5:
    id2line[_line[0]] = _line[4]

# JUST TO CHECK DATA IN GOOGLE COLAB
# retrieve key/value pairs
#els = list(id2line.items()) # explicitly convert to a list, in case it's Python 3.x

# get first inserted element 
#print(els[0])



## Create a list of all of the conversations

In [307]:
conversations_ids = []
for conversation in conversations[:-1]:
  _conversation = conversation.split(" +++$+++ ")[-1][1:-1].replace("'","").replace(" ","")
  conversations_ids.append(_conversation.split(","))
#print(conversations_ids[0])
#print(conversations_ids[0])

## Getting separately the questions and the answers

In [308]:
questions = []
answers = []
for conversation in conversations_ids:
  for i in range(len(conversation) - 1):
    questions.append(id2line[conversation[i]])
    answers.append(id2line[conversation[i+1]])

#print(questions[0])
#print(answers[0])


### Cleaning text function define

In [309]:
def clean_text(text):
  #put all the text to lowercase
  text = text.lower()
  #removing aposthrophies with re library
  text = re.sub(r"i'm", "i am", text)
  text = re.sub(r"he's", "he is", text)
  text = re.sub(r"she's", "she is", text)
  text = re.sub(r"that's", "that is", text)
  text = re.sub(r"what's", "what is", text)
  text = re.sub(r"where's", "where is", text)
  text = re.sub(r"\'ll", " will", text)
  text = re.sub(r"\'ve", " have", text)
  text = re.sub(r"\'re", " are", text)
  text = re.sub(r"\'d", " whould", text)
  text = re.sub(r"won't", "will not", text)
  text = re.sub(r"can't", "cannot", text)
  text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
  return text

### Cleaning questions and answers

In [310]:
clean_questions = []
clean_answers = []

for question in questions:
  clean_questions.append(clean_text(question))

for answer in answers:
  clean_answers.append(clean_text(answer))  

### Creating a dictionary that maps each word to it's number of occurencies


In [311]:
word2count = {}

for question in clean_questions:
  for word in question.split():
    if word not in word2count:
      word2count[word] = 1
    else:
      word2count[word] += 1

for answer in clean_answers:
  for word in answer.split():
    if word not in word2count:
      word2count[word] = 1
    else:
      word2count[word] += 1      

### Creating two dictionaries that map the questions words and the answers words to a unique integer

In [312]:
threshold = 20
questionwords2int = {}
word_number = 0 
for word, count in word2count.items():
  if count > threshold:
    questionwords2int[word] = word_number
    word_number += 1

answerwords2int = {}
word_number = 0 
for word, count in word2count.items():
  if count > threshold:
    answerwords2int[word] = word_number
    word_number += 1

# print(answerwords2int.items())

### Adding the last tokens to these two dictionaries

In [313]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
  questionwords2int[token] = len(questionwords2int) + 1

for token in tokens:
  answerwords2int[token] = len(answerwords2int) + 1
#print(answerwords2int.items())  

### Create an inverse dictionary of the answerwords2int dictionary

In [314]:
answerint2word = {w_i: w for w, w_i in answerwords2int.items()}

### Adding the EOS token to the end of every answer

In [315]:
for i in range(len(clean_answers)):
  clean_answers[i] += ' <EOS>'

### Translating all the questions and the answers into integers and replacing all the words that were filtered out by <OUT>

In [316]:
questions_to_int = []
for question in clean_questions:
  ints = []
  for word in question.split():
    if word not in questionwords2int:
      ints.append(questionwords2int['<OUT>'])
    else:
      ints.append(questionwords2int[word])
  questions_to_int.append(ints)

print(questions_to_int[0])

answers_to_int = []
for answer in clean_answers:
  ints = []
  for word in answer.split():
    if word not in answerwords2int:
      ints.append(answerwords2int['<OUT>'])
    else:
      ints.append(answerwords2int[word])
  answers_to_int.append(ints)

[0, 1, 2, 3, 4, 8541, 8541, 5, 6, 8541, 7, 8, 9, 10, 8541, 11, 12, 13, 14, 15, 8541, 16]


### Sorting questions and answers by the length of questions (this will speed up the training)

In [317]:
sorted_clean_questions = []
sorted_clean_answers = []

for length in range(1,25+1):
  for i in enumerate(questions_to_int):
    if len(i[1]) == length:
      sorted_clean_questions.append(questions_to_int[i[0]])
      sorted_clean_answers.append(answers_to_int[i[0]])

print(sorted_clean_questions[0])
print(sorted_clean_answers[0])      

[47]
[15, 48, 25, 47, 18, 49, 50, 15, 51, 52, 45, 53, 8541, 54, 52, 55, 41, 56, 18, 57, 58, 59, 60, 61, 8540]


# Building seq2seq model

### Creating placeholders for the inputs and the targets

In [318]:
def model_inputs():
  inputs = tf.compat.v1.placeholder(tf.int32, [None, None], name = 'inputs')
  targets = tf.compat.v1.placeholder(tf.int32, [None, None], name = 'targets')
  lr = tf.compat.v1.placeholder(tf.float32, name = 'learning_rate')
  keep_prob = tf.compat.v1.placeholder(tf.float32, name = 'keep_prob')

  return inputs, targets, lr, keep_prob

### Preprocessing the targets

In [319]:
# create batches and add <sos> token at each raw at batch

def preprocess_targets(targets, word2int, batch_size):
  left_side = tf.fill([batch_size], 1, word2int['<SOS>'])
  right_side = tf.strided_slice(targets, [0,0], [batch_size, -1],[1,1])
  preprocessed_targets = tf.concat([left_side, right_side], 1)

  return preprocessed_targets

### Creating the Encoder RNN Layer

In [320]:
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
  #first create LSTM - object of basic lstm cell
  lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
  lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
  encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
  encoder_output, encoder_state = tf.nn_bidirectional_dynamic_rnn(cell_fw = encoder_cell, 
                                                     cell_bw = encoder_cell,
                                                     sequence_length = sequence_length,
                                                     inputs = rnn_inputs,
                                                     dtype = tf.float32) # this will build forward and backward RNNs

  return encoder_state

### Creating the Decoder of the training set

In [321]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
  attention_state = tf.zeros([batch_size, 1, decoder_cell.output_size])
  attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_state, attention_option = "bahdanau", num_units = decoder_cell.output_size) 
  training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                            attention_keys,
                                                                            attention_values,
                                                                            attention_score_function,
                                                                            attention_construct_function,
                                                                            name = "attn_dec_train")
  
  decoder_output, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_function(decoder_cell, 
                                                                                                             training_decoder_function,
                                                                                                             decoder_embedded_input,
                                                                                                             sequence_length,
                                                                                                             scope = decoding_scope)
  
  decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
  return output_function(decoder_output_dropout)

### Decoding the test/validation set

In [322]:
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
  attention_state = tf.zeros([batch_size, 1, decoder_cell.output_size])
  attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_state, attention_option = "bahdanau", num_units = decoder_cell.output_size) 
  test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                            encoder_state[0],
                                                                            attention_keys,
                                                                            attention_values,
                                                                            attention_score_function,
                                                                            attention_construct_function,
                                                                            decoder_embeddings_matrix, 
                                                                            sos_id,
                                                                            eos_id,
                                                                            maximum_length,
                                                                            num_words,
                                                                            name = "attn_dec_inf")
  
  test_predictions, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_function(decoder_cell, 
                                                                                                              test_decoder_function,                                                                                                   
                                                                                                              scope = decoding_scope)
  
  return test_predictions

### Creating the Doceder RNN

In [323]:
def decoder_rnn(decoder_embedded_inputs, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
  with tf.variable_scope("decoding") as decoding_scope:
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    weights = tf.truncated_normal_initializer(stddev = 0.1)
    biases = tf.zeros_initializer()
    output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                  num_words,
                                                                  None,
                                                                  scope = decoding_scope,
                                                                  weights_initializer = weights,
                                                                  biases_initializer = biases)
    training_predictions = decoode_training_set(encoder_state,
                                                decoder_cell,
                                                decoder_embedded_input,
                                                sequence_lentgh,
                                                decoding_scope,
                                                output_function,
                                                keep_prob,
                                                batch_size)
    decoding_scope.reuse_variables()
    test_predictions = decode_test_set(encoder_state,
                                       decoder_cell,
                                       decoder_embeddings_matrix,
                                       word2int['<SOS>'],
                                       word2int['<EOS>'],
                                       sequence_length-1,
                                       num_words,
                                       decoding_scope,
                                       output_function,
                                       keep_prob,
                                       batch_size)
    
  return training_predictions, test_predictions

### Building seq2seq

In [324]:
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
  encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs, 
                                                            answers_num_words + 1,
                                                            encoder_embedding_size,
                                                            initializer = tf.random_uniform_initializer(0,1)
                                                            )
  #ouput of encoder and input of decoder
  encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
  preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
  decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
  decoder_embedded_inputs = tf.nn.embeddeding_lookup(decoder_embeddings_matrix, preprocessed_targets)
  training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                       decoder_embeddings_matrix,
                                                       encoder_state,
                                                       questions_num_words,
                                                       sequence_length,
                                                       rnn_size,
                                                       num_layers,
                                                       questions2wordsint,
                                                       keep_prob,
                                                       batch_size)
  
  return training_predictions, test_prediciotns

### Setting the Hyperparameters

In [325]:
epochs = 100
batch_size = 64
rnn_size = 512
num_layers = 3
encoding_embedding_size = 512 #512 columns 
decoding_embedding_size = 512 #512 columns 
learning_rate = 0.01
learning_rate_dacay = 0.9 #which percentage the learning rate is reduce over the iteration of training
min_learning_rate = 0.0001
keep_probability = 0.5

### Defining a session (tensorflow)

In [326]:
tf.compat.v1.reset_default_graph()
session = tf.compat.v1.InteractiveSession()



### Loading the model inputs

In [327]:
inputs, targets, lr, keep_prob = model_inputs()

### Seeting the sequence length

In [328]:
sequence_length = tf.compat.v1.placeholder_with_default(25, None, name = 'sequence_length')

### Getting the shape of the inputs tensor

In [329]:
input_shape = tf.shape(inputs)
print(input_shape)

Tensor("Shape:0", shape=(2,), dtype=int32)


### Getting the training and test predictions

In [330]:
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerwords2int),
                                                       len(questionwords2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionwords2int)

AttributeError: ignored