In [1]:
import os
import datasets
import numpy as np
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxpoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset_path = os.path.join("data", "cornell")

In [3]:
data = datasets.readCornellData(dataset_path, max_len=100000)

100%|██████████| 83097/83097 [00:03<00:00, 21042.81it/s]


In [4]:
data[:3]

[('can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again',
  'well i thought wed start with pronunciation if thats okay with you'),
 ('well i thought wed start with pronunciation if thats okay with you',
  'not the hacking and gagging and spitting part please'),
 ('not the hacking and gagging and spitting part please',
  'okay then how bout we try out some french cuisine saturday night')]

In [5]:
# remove short sentences
data = [sentences for sentences in data if len(sentences[0].split()) > 2]

In [6]:
data[8]

('cesc ma tete this is my head', 'right see youre ready for the quiz')

In [7]:
len(data)

187012

In [8]:
data[-2000]

('that should put us ahead of the criminals', 'ill work on it')

In [9]:
data = [[sentences[0].split(), sentences[1].split()]for sentences in data]

In [10]:
data[0]

[['can',
  'we',
  'make',
  'this',
  'quick',
  'roxanne',
  'korrine',
  'and',
  'andrew',
  'barrett',
  'are',
  'having',
  'an',
  'incredibly',
  'horrendous',
  'public',
  'break',
  'up',
  'on',
  'the',
  'quad',
  'again'],
 ['well',
  'i',
  'thought',
  'wed',
  'start',
  'with',
  'pronunciation',
  'if',
  'thats',
  'okay',
  'with',
  'you']]

In [11]:
data = [sentences for sentences in data if len(sentences[0]) >= 3]
data = [sentences for sentences in data if len(sentences[0]) <= 20]
data = [sentences for sentences in data if len(sentences[1]) >= 3]
data = [sentences for sentences in data if len(sentences[1]) <= 20]

In [12]:
data.sort(key=lambda x:len(x[0]))

In [13]:
data[1]

[['what', 'good', 'stuff'], ['the', 'real', 'you']]

In [14]:
len(data)

115832

In [15]:
from collections import defaultdict
word_count = defaultdict(int)
for sentences in data:
    for word in sentences[0]:
        word_count[word] += 1
    for word in sentences[1]:
        word_count[word] += 1

In [16]:
word_set = set([word for word in word_count if word_count[word]>=5])

In [17]:
unknown_token = "[UKN]"
start_token = "[START]"
end_token = "[END]"
pad_token = "[PAD]"
word_set.add(unknown_token)
word_set.add(start_token)
word_set.add(end_token)
word_set.add(pad_token)

In [18]:
word_to_idx = {}
idx_to_word = [None] * len(word_set)
index = 0
for word in word_set:
    word_to_idx[word] = index
    idx_to_word[index] = word
    index += 1

In [19]:
unknown_idx = word_to_idx[unknown_token]
start_idx = word_to_idx[start_token]
end_idx = word_to_idx[end_token]
pad_idx = word_to_idx[pad_token]

In [20]:
for sentences in data:
    sentences[1].append(end_token)

In [23]:
data[10]

[['you', 'know', 'french'],
 ['sure', 'do', 'my', 'moms', 'from', 'canada', '[END]']]

In [24]:
input_lengths = [len(sentences[0]) for sentences in data]
ground_truth_lengths = [len(sentences[1]) for sentences in data]

In [25]:
# add padding
max_input_lengths = max(input_lengths)
max_ground_truth_lengths = max(ground_truth_lengths)
input_sentences = []
ground_truth_sentences = []
for sentences in data:
    input_sentences.append(sentences[0] + [pad_token]*(max_input_lengths-len(sentences[0])))
    ground_truth_sentences.append(sentences[1] + [pad_token]*(max_ground_truth_lengths-len(sentences[1])))

In [26]:
input_sentences_idx = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in sentence] for sentence in input_sentences]
ground_truth_sentences_idx = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in sentence] for sentence in ground_truth_sentences]

In [27]:
input_sentences_idx = np.array(input_sentences_idx)

In [28]:
ground_truth_sentences_idx = np.array(ground_truth_sentences_idx)
input_lengths = np.array(input_lengths)
ground_truth_lengths = np.array(ground_truth_lengths)

In [127]:
data[0]

[['you', 'know', 'chastity'],
 ['i', 'believe', 'we', 'share', 'an', 'art', 'instructor', '[END]']]

In [30]:
def batch_generator(batch_size,
                    input_sentences_idx,
                    ground_truth_sentences_idx,
                    input_lengths,
                    ground_truth_lengths):
    index = 0
    while index < len(input_sentences_idx):
        batch_input_length = input_lengths[index:index+batch_size]
        batch_input_sentences_idx = input_sentences_idx[index:index+batch_size, :batch_input_length.max()]
        batch_ground_truth_length = ground_truth_lengths[index:index+batch_size]
        batch_ground_truth_sentences_idx = ground_truth_sentences_idx[index:index+batch_size, :batch_ground_truth_length.max()]
        yield (batch_input_sentences_idx, batch_input_length,
              batch_ground_truth_sentences_idx, batch_ground_truth_length)
        index += batch_size

In [31]:
g = batch_generator(10, input_sentences_idx, ground_truth_sentences_idx, input_lengths, ground_truth_lengths)

In [63]:
vocab_size = len(word_set)
num_units = 128
embedding_size = 100
num_encoder_layers = 2
num_decoder_layers = 2

In [64]:
import tensorflow as tf
from tensorflow.contrib import layers

In [109]:
def text_prepare(text):
    """Performs tokenization and simple preprocessing."""
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    good_symbols_re = re.compile('[^0-9a-z #+_]')
#     stopwords_set = set(stopwords.words('english'))

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = good_symbols_re.sub('', text)
#     text = ' '.join([x for x in text.split() if x and x not in stopwords_set])

    return text.strip()

In [140]:
class ChatBot:
    def __init__(self):
        self.declare_placeholders()
        self.build_input_encoder()
        self.build_ground_truth_encoder()
        self.build_hidden_state()
        self.build_decoder()
        self.define_loss_and_train()
    
    def declare_placeholders(self):
        """Specifies placeholders for the model."""
        # Placeholders for input and its actual lengths.
        self.input_batch = tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_batch')
        self.input_batch_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='input_batch_lengths')

        # Placeholders for groundtruth and its actual lengths.
        self.ground_truth = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ground_truth')
        self.ground_truth_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='ground_truth_lengths')

        self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])

    def build_input_encoder(self):
        with tf.variable_scope('input_encoder') as input_encoder_scope:
            random_initializer = tf.random_uniform((vocab_size, embedding_size), -1.0, 1.0)
            self.embeddings = tf.Variable(initial_value=random_initializer, name='embeddings', dtype=tf.float32) 

            # Perform embeddings lookup for self.input_batch. 
            self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_batch)
            # Create encoder cells
            rnn_layers = []
            for i in range(num_encoder_layers-1):
                with tf.variable_scope('input_encoder_rnn_layer' + str(i + 1)) as scope:
                    cell = tf.nn.rnn_cell.GRUCell(num_units, activation=tf.nn.relu)
                    cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                    rnn_layers.append(cell)
            with tf.variable_scope('input_encoder_rnn_layer' + str(num_encoder_layers)) as scope:
                cell = tf.nn.rnn_cell.GRUCell(num_units)
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                rnn_layers.append(cell)
            encoder_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
            self.input_encoder_outputs, self.final_input_encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                self.input_batch_embedded,
                sequence_length=self.input_batch_lengths,
                dtype=tf.float32
            )
            self.final_input_encoder_state = self.final_input_encoder_state[-1]

    def build_ground_truth_encoder(self):
        with tf.variable_scope('ground_truth_encoder') as ground_truth_encoder:
            self.ground_truth_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.ground_truth)
            rnn_layers = []
            for i in range(num_encoder_layers-1):
                with tf.variable_scope('ground_truth_encoder_rnn_layer' + str(i + 1)) as scope:
                    cell = tf.nn.rnn_cell.GRUCell(num_units, activation=tf.nn.relu)
                    cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                    rnn_layers.append(cell)
            with tf.variable_scope('ground_truth_encoder_rnn_layer' + str(num_encoder_layers)) as scope:
                cell = tf.nn.rnn_cell.GRUCell(num_units)
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                rnn_layers.append(cell)
            encoder_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
            _, self.final_ground_truth_encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                self.ground_truth_batch_embedded,
                sequence_length=self.ground_truth_lengths,
                dtype=tf.float32
            )
            self.final_ground_truth_encoder_state = self.final_ground_truth_encoder_state[-1]

    def build_hidden_state(self):
        self.z_mean_from_input, self.z_log_var_from_input = tf.split(
            self.final_input_encoder_state, num_or_size_splits=2, axis=1)
        self.z_mean_from_ground_truth, self.z_log_var_from_grount_truth = tf.split(
            self.final_input_encoder_state, num_or_size_splits=2, axis=1)
        self.z = (self.z_mean_from_ground_truth +
                  tf.exp(0.5*self.z_log_var_from_grount_truth) *
                  tf.random_normal(tf.shape(self.z_log_var_from_grount_truth), 0, 1, dtype=tf.float32))

    def build_decoder(self):
        batch_size = tf.shape(self.input_batch)[0]
        start_tokens = tf.fill([batch_size], start_idx)
        ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
        self.ground_truth_embedded = tf.nn.embedding_lookup(
            self.embeddings, ground_truth_as_input)
        train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded,
                                                         self.ground_truth_lengths)
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings, start_tokens, end_idx)

        def decode(helper, scope, reuse=None):
            """Creates decoder and return the results of the decoding with a given helper."""
            with tf.variable_scope(scope, reuse=reuse):
                # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
                rnn_layers = []
                for i in range(num_decoder_layers-1):
                    with tf.variable_scope('decoder_rnn_layer' + str(i + 1)) as scope:
                        decoder_cell = tf.contrib.rnn.GRUCell(num_units=num_units/2, reuse=reuse, activation=tf.nn.tanh)
                        decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, input_keep_prob=self.dropout_ph)
                        rnn_layers.append(decoder_cell)
                with tf.variable_scope('decoder_rnn_layer' + str(num_decoder_layers)) as scope:
                    decoder_cell = tf.contrib.rnn.GRUCell(num_units=num_units/2, reuse=reuse)
                    decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, input_keep_prob=self.dropout_ph)
                    rnn_layers.append(decoder_cell)
                decoder_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
                # Create attention
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units, memory=tf.split(self.input_encoder_outputs, num_or_size_splits=2, axis=-1)[0],
                    memory_sequence_length=self.input_batch_lengths)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell, attention_mechanism, attention_layer_size=num_units)
                # Create a projection wrapper.
                decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)
                # Create BasicDecoder, pass the defined cell, a helper, and initial state.
                # The initial state should be equal to the final state of the encoder!
                second_state = tf.zeros((1, num_units/2))
                second_state = tf.tile(second_state, [batch_size, 1])
                decoder_initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=batch_size).clone(
                    cell_state=(self.z, second_state))
                decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=helper, initial_state=decoder_initial_state)

                # The first returning argument of dynamic_decode contains two fields:
                #   rnn_output (predicted logits)
                #   sample_id (predictions)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=tf.reduce_max(self.ground_truth_lengths), 
                                                                  output_time_major=False, impute_finished=True)

                return outputs

        self.train_outputs = decode(train_helper, 'decode')
        self.infer_outputs = decode(infer_helper, 'decode', reuse=True)
        self.train_predictions = self.train_outputs.sample_id
        self.infer_predictions = self.infer_outputs.sample_id

    def define_loss_and_train(self):
        weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)
        self.reconstruction_loss = tf.contrib.seq2seq.sequence_loss(
            self.train_outputs.rnn_output,
            self.ground_truth,
            weights
        )
        self.kl_loss = (0.5*(self.z_log_var_from_input-self.z_log_var_from_grount_truth)+
                       (tf.exp(self.z_log_var_from_grount_truth)+(self.z_mean_from_ground_truth-self.z_mean_from_input)**2)/
                       (2*tf.exp(self.z_log_var_from_input))-0.5)
        self.kl_loss = tf.reduce_mean(self.kl_loss)
        self.loss = self.kl_loss + self.reconstruction_loss
        self.train_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            optimizer='Adam',
            learning_rate=self.learning_rate_ph,
            clip_gradients=1.0,
            global_step=tf.train.get_global_step()
        )

    def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
        feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len,
            self.ground_truth: Y,
            self.ground_truth_lengths: Y_seq_len,
            self.learning_rate_ph: learning_rate,
            self.dropout_ph: dropout_keep_probability
        }
        loss, _ = session.run([
            self.loss,
            self.train_op], feed_dict=feed_dict)
        return loss
    
    def get_reply(self, session, input_sentence):
        input_sentence = text_prepare(input_sentence)
        X = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in input_sentence]]
        X = np.array(X)
        feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: np.array([len(input_sentence)])
        }
        z_mean, z_log_var, input_encoder_outputs = session.run([
            self.z_mean_from_input,
            self.z_log_var_from_input,
            self.input_encoder_outputs
        ], feed_dict=feed_dict)
        z_mean = z_mean[0]
        z_log_var = z_log_var[0]
        z = np.random.normal(z_mean, np.exp(0.5*z_log_var), z_mean.size)
        z = z[np.newaxis,:]
        feed_dict = {
            self.z: z,
            self.input_batch: X,
            self.input_batch_lengths: np.array([len(input_sentence)]),
            self.ground_truth_lengths: np.array([15])
        }
        pred = session.run([self.infer_predictions], feed_dict=feed_dict)
        return " ".join([idx_to_word[index] for index in pred[0][0]])

    def train(self, session, epochs, batch_size, input_sentences_idx, ground_truth_sentences_idx, input_lengths, ground_truth_lengths, learning_rate, dropout_keep_probability):
        for i in range(epochs):
            batch_num = 1
            for (batch_input_sentences_idx,
                 batch_input_length,
                 batch_ground_truth_sentences_idx,
                 batch_ground_truth_length) in batch_generator(
                batch_size, input_sentences_idx, ground_truth_sentences_idx,
                input_lengths, ground_truth_lengths):
                loss = self.train_on_batch(
                    session,
                    batch_input_sentences_idx,
                    batch_input_length,
                    batch_ground_truth_sentences_idx,
                    batch_ground_truth_length,
                    learning_rate,
                    dropout_keep_probability
                )
                print("Epoch {i}, batch {batch}, loss = {loss}".format(i=i+1, batch=batch_num, loss=loss))
                batch_num += 1

In [141]:
tf.reset_default_graph()

In [142]:
chatbot = ChatBot()

In [143]:
session = tf.Session()
session.run(tf.global_variables_initializer())

In [144]:
chatbot.train(session,
              1, 256,
              input_sentences_idx,
              ground_truth_sentences_idx,
              input_lengths,
              ground_truth_lengths,
              1e-4,
              0.5)

Epoch 1, batch 1, loss = 9.453303337097168
Epoch 1, batch 2, loss = 9.452513694763184
Epoch 1, batch 3, loss = 9.452415466308594
Epoch 1, batch 4, loss = 9.451004981994629
Epoch 1, batch 5, loss = 9.45026683807373
Epoch 1, batch 6, loss = 9.451022148132324
Epoch 1, batch 7, loss = 9.449259757995605
Epoch 1, batch 8, loss = 9.449374198913574
Epoch 1, batch 9, loss = 9.447999000549316
Epoch 1, batch 10, loss = 9.447127342224121
Epoch 1, batch 11, loss = 9.445918083190918
Epoch 1, batch 12, loss = 9.445520401000977
Epoch 1, batch 13, loss = 9.44456672668457
Epoch 1, batch 14, loss = 9.443458557128906
Epoch 1, batch 15, loss = 9.443755149841309
Epoch 1, batch 16, loss = 9.440383911132812
Epoch 1, batch 17, loss = 9.441139221191406
Epoch 1, batch 18, loss = 9.440202713012695
Epoch 1, batch 19, loss = 9.43825912475586
Epoch 1, batch 20, loss = 9.438660621643066
Epoch 1, batch 21, loss = 9.434859275817871
Epoch 1, batch 22, loss = 9.434871673583984
Epoch 1, batch 23, loss = 9.433428764343262


Epoch 1, batch 186, loss = 6.146169185638428
Epoch 1, batch 187, loss = 6.076430797576904
Epoch 1, batch 188, loss = 5.835185527801514
Epoch 1, batch 189, loss = 5.927221775054932
Epoch 1, batch 190, loss = 6.0712175369262695
Epoch 1, batch 191, loss = 5.911068439483643
Epoch 1, batch 192, loss = 6.120402812957764
Epoch 1, batch 193, loss = 5.881659030914307
Epoch 1, batch 194, loss = 6.011129856109619
Epoch 1, batch 195, loss = 5.939219951629639
Epoch 1, batch 196, loss = 6.008031368255615
Epoch 1, batch 197, loss = 6.139239311218262
Epoch 1, batch 198, loss = 6.07028341293335
Epoch 1, batch 199, loss = 5.934192180633545
Epoch 1, batch 200, loss = 6.002211570739746
Epoch 1, batch 201, loss = 5.991769313812256
Epoch 1, batch 202, loss = 5.949981212615967
Epoch 1, batch 203, loss = 5.931615352630615
Epoch 1, batch 204, loss = 6.038876056671143
Epoch 1, batch 205, loss = 5.948448657989502
Epoch 1, batch 206, loss = 6.0603928565979
Epoch 1, batch 207, loss = 6.2041521072387695
Epoch 1, ba

Epoch 1, batch 368, loss = 5.947820663452148
Epoch 1, batch 369, loss = 6.026201248168945
Epoch 1, batch 370, loss = 6.0012407302856445
Epoch 1, batch 371, loss = 6.122353553771973
Epoch 1, batch 372, loss = 5.976785659790039
Epoch 1, batch 373, loss = 5.915890693664551
Epoch 1, batch 374, loss = 5.862168788909912
Epoch 1, batch 375, loss = 6.053094863891602
Epoch 1, batch 376, loss = 5.930954933166504
Epoch 1, batch 377, loss = 5.8758225440979
Epoch 1, batch 378, loss = 5.982545375823975
Epoch 1, batch 379, loss = 5.898486137390137
Epoch 1, batch 380, loss = 6.026061058044434
Epoch 1, batch 381, loss = 5.844474792480469
Epoch 1, batch 382, loss = 5.971642017364502
Epoch 1, batch 383, loss = 5.99247932434082
Epoch 1, batch 384, loss = 5.927855014801025
Epoch 1, batch 385, loss = 5.959246635437012
Epoch 1, batch 386, loss = 6.012385845184326
Epoch 1, batch 387, loss = 5.919038772583008
Epoch 1, batch 388, loss = 5.9660234451293945
Epoch 1, batch 389, loss = 5.877278804779053
Epoch 1, ba

In [153]:
chatbot.get_reply(session, "who are you")

'i [END]'