In [1]:
import os
import datasets
import numpy as np
import re

In [236]:
dataset_path = os.path.join("data", "cornell")
data = datasets.readCornellData(dataset_path, max_len=100000)
data[:3]

100%|██████████| 83097/83097 [00:04<00:00, 20440.40it/s]


[('can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again',
  'well i thought wed start with pronunciation if thats okay with you'),
 ('well i thought wed start with pronunciation if thats okay with you',
  'not the hacking and gagging and spitting part please'),
 ('not the hacking and gagging and spitting part please',
  'okay then how bout we try out some french cuisine saturday night')]

In [237]:
data = [[sentences[0].split(), sentences[1].split()]for sentences in data]

In [238]:
data = [sentences for sentences in data if len(sentences[0]) >= 3]
data = [sentences for sentences in data if len(sentences[0]) <= 20]
data = [sentences for sentences in data if len(sentences[1]) >= 3]
data = [sentences for sentences in data if len(sentences[1]) <= 20]

In [239]:
data.sort(key=lambda x:len(x[0]))

In [241]:
new_data = [
    ["where are you from", "china"],
    ["how are you", "im fine"],
    ["how do you do", "im good"],
    ["what is your hobby", "soccer"],
    ["what is your hobby", "movie"],
    ["do you love me", "of course"],
    ["what is your name", "bot"]
]
new_data = [[sentences[0].split(), sentences[1].split()]for sentences in new_data]

In [242]:
data = new_data + data

In [243]:
data[9]

[['the', 'real', 'you'], ['like', 'my', 'fear', 'of', 'wearing', 'pastels']]

In [244]:
from collections import defaultdict
word_count = defaultdict(int)
for sentences in data:
    for word in sentences[0]:
        word_count[word] += 1
    for word in sentences[1]:
        word_count[word] += 1

In [245]:
word_set = set([word for word in word_count if word_count[word]>=5])

In [246]:
unknown_token = "[UKN]"
start_token = "[START]"
end_token = "[END]"
pad_token = "[PAD]"
word_set.add(unknown_token)
word_set.add(start_token)
word_set.add(end_token)
word_set.add(pad_token)

In [247]:
word_to_idx = {}
idx_to_word = [None] * len(word_set)
index = 0
for word in word_set:
    word_to_idx[word] = index
    idx_to_word[index] = word
    index += 1

In [248]:
unknown_idx = word_to_idx[unknown_token]
start_idx = word_to_idx[start_token]
end_idx = word_to_idx[end_token]
pad_idx = word_to_idx[pad_token]

In [249]:
for sentences in data:
    sentences[1].append(end_token)

In [250]:
input_lengths = [len(sentences[0]) for sentences in data]
ground_truth_lengths = [len(sentences[1]) for sentences in data]

In [251]:
# add padding
max_input_lengths = max(input_lengths)
max_ground_truth_lengths = max(ground_truth_lengths)
input_sentences = []
ground_truth_sentences = []
for sentences in data:
    input_sentences.append(sentences[0] + [pad_token]*(max_input_lengths-len(sentences[0])))
    ground_truth_sentences.append(sentences[1] + [pad_token]*(max_ground_truth_lengths-len(sentences[1])))

In [252]:
input_sentences_idx = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in sentence] for sentence in input_sentences]
ground_truth_sentences_idx = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in sentence] for sentence in ground_truth_sentences]

In [253]:
input_sentences_idx = np.array(input_sentences_idx)

In [254]:
ground_truth_sentences_idx = np.array(ground_truth_sentences_idx)
input_lengths = np.array(input_lengths)
ground_truth_lengths = np.array(ground_truth_lengths)

In [255]:
def batch_generator(batch_size,
                    input_sentences_idx,
                    ground_truth_sentences_idx,
                    input_lengths,
                    ground_truth_lengths):
    index = 0
    while index < len(input_sentences_idx):
        batch_input_length = input_lengths[index:index+batch_size]
        batch_input_sentences_idx = input_sentences_idx[index:index+batch_size, :batch_input_length.max()]
        batch_ground_truth_length = ground_truth_lengths[index:index+batch_size]
        batch_ground_truth_sentences_idx = ground_truth_sentences_idx[index:index+batch_size, :batch_ground_truth_length.max()]
        yield (batch_input_sentences_idx, batch_input_length,
              batch_ground_truth_sentences_idx, batch_ground_truth_length)
        index += batch_size

In [257]:
vocab_size = len(word_set)
num_units = 128
embedding_size = 100
num_encoder_layers = 2
num_decoder_layers = 2

In [258]:
import tensorflow as tf
from tensorflow.contrib import layers

In [259]:
def text_prepare(text):
    """Performs tokenization and simple preprocessing."""
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    good_symbols_re = re.compile('[^0-9a-z #+_]')

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = good_symbols_re.sub('', text)

    return text.strip()

In [301]:
class ChatBot:
    def __init__(self):
        self.declare_placeholders()
        self.build_input_encoder()
        self.build_decoder()
        self.define_loss_and_train()
    
    def declare_placeholders(self):
        """Specifies placeholders for the model."""
        # Placeholders for input and its actual lengths.
        self.input_batch = tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_batch')
        self.input_batch_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='input_batch_lengths')

        # Placeholders for groundtruth and its actual lengths.
        self.ground_truth = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ground_truth')
        self.ground_truth_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='ground_truth_lengths')

        self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])

    def build_input_encoder(self):
        with tf.variable_scope('input_encoder') as input_encoder_scope:
            random_initializer = tf.random_uniform((vocab_size, embedding_size), -1.0, 1.0)
            self.embeddings = tf.Variable(initial_value=random_initializer, name='embeddings', dtype=tf.float32) 

            # Perform embeddings lookup for self.input_batch. 
            self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings, self.input_batch)
            # Create encoder cells
            rnn_layers = []
            for i in range(num_encoder_layers-1):
                with tf.variable_scope('input_encoder_rnn_layer' + str(i + 1)) as scope:
                    cell = tf.nn.rnn_cell.GRUCell(num_units, activation=tf.nn.relu)
                    cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                    rnn_layers.append(cell)
            with tf.variable_scope('input_encoder_rnn_layer' + str(num_encoder_layers)) as scope:
                cell = tf.nn.rnn_cell.GRUCell(num_units)
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=self.dropout_ph, dtype=tf.float32)
                rnn_layers.append(cell)
            encoder_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
            self.input_encoder_outputs, self.final_input_encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                self.input_batch_embedded,
                sequence_length=self.input_batch_lengths,
                dtype=tf.float32
            )
            self.final_input_encoder_state = self.final_input_encoder_state[-1]

    def build_decoder(self):
        batch_size = tf.shape(self.input_batch)[0]
        start_tokens = tf.fill([batch_size], start_idx)
        ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
        self.ground_truth_embedded = tf.nn.embedding_lookup(
            self.embeddings, ground_truth_as_input)
        train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded,
                                                         self.ground_truth_lengths)
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings, start_tokens, end_idx)

        def decode(helper, scope, reuse=None):
            """Creates decoder and return the results of the decoding with a given helper."""
            with tf.variable_scope(scope, reuse=reuse):
                # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
                rnn_layers = []
                for i in range(num_decoder_layers-1):
                    with tf.variable_scope('decoder_rnn_layer' + str(i + 1)) as scope:
                        decoder_cell = tf.contrib.rnn.GRUCell(num_units=num_units, reuse=reuse, activation=tf.nn.tanh)
                        decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, input_keep_prob=self.dropout_ph)
                        rnn_layers.append(decoder_cell)
                with tf.variable_scope('decoder_rnn_layer' + str(num_decoder_layers)) as scope:
                    decoder_cell = tf.contrib.rnn.GRUCell(num_units=num_units, reuse=reuse)
                    decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, input_keep_prob=self.dropout_ph)
                    rnn_layers.append(decoder_cell)
                decoder_cell = tf.contrib.rnn.MultiRNNCell(rnn_layers)
                # Create attention
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units, memory=tf.split(self.input_encoder_outputs, num_or_size_splits=2, axis=-1)[0],
                    memory_sequence_length=self.input_batch_lengths)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    decoder_cell, attention_mechanism, attention_layer_size=num_units)
                # Create a projection wrapper.
                decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)
                # Create BasicDecoder, pass the defined cell, a helper, and initial state.
                # The initial state should be equal to the final state of the encoder!
                decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=helper, initial_state=decoder_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size))

                # The first returning argument of dynamic_decode contains two fields:
                #   rnn_output (predicted logits)
                #   sample_id (predictions)
                outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=tf.reduce_max(self.ground_truth_lengths), 
                                                                  output_time_major=False, impute_finished=True)

                return outputs

        self.train_outputs = decode(train_helper, 'decode')
        self.infer_outputs = decode(infer_helper, 'decode', reuse=True)
        self.train_predictions = self.train_outputs.sample_id
        self.infer_predictions = self.infer_outputs.sample_id

    def define_loss_and_train(self):
        weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)
        self.loss = tf.contrib.seq2seq.sequence_loss(
            self.train_outputs.rnn_output,
            self.ground_truth,
            weights
        )
        self.train_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            optimizer='Adam',
            learning_rate=self.learning_rate_ph,
            clip_gradients=1.0,
            global_step=tf.train.get_global_step()
        )

    def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
        feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len,
            self.ground_truth: Y,
            self.ground_truth_lengths: Y_seq_len,
            self.learning_rate_ph: learning_rate,
            self.dropout_ph: dropout_keep_probability
        }
        loss, _ = session.run([
            self.loss,
            self.train_op], feed_dict=feed_dict)
        return loss
    
    def get_reply(self, session, input_sentence):
        input_sentence = text_prepare(input_sentence)
        X = [[word_to_idx[word] if word in word_to_idx else unknown_idx for word in input_sentence]]
        X = np.array(X)
        feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: np.array([len(input_sentence)]),
            self.ground_truth_lengths: np.array([15])
        }
        pred = session.run([self.infer_predictions], feed_dict=feed_dict)
        return " ".join([idx_to_word[index] for index in pred[0][0][:-1]])

    def train(self, session, epochs, batch_size, input_sentences_idx, ground_truth_sentences_idx, input_lengths, ground_truth_lengths, learning_rate, dropout_keep_probability):
        for i in range(epochs):
            batch_num = 1
            for (batch_input_sentences_idx,
                 batch_input_length,
                 batch_ground_truth_sentences_idx,
                 batch_ground_truth_length) in batch_generator(
                batch_size, input_sentences_idx, ground_truth_sentences_idx,
                input_lengths, ground_truth_lengths):
                loss = self.train_on_batch(
                    session,
                    batch_input_sentences_idx,
                    batch_input_length,
                    batch_ground_truth_sentences_idx,
                    batch_ground_truth_length,
                    learning_rate,
                    dropout_keep_probability
                )
                print("Epoch {i}, batch {batch}, loss = {loss}".format(i=i+1, batch=batch_num, loss=loss))
                batch_num += 1

In [261]:
tf.reset_default_graph()

In [262]:
chatbot = ChatBot()

In [263]:
session = tf.Session()
session.run(tf.global_variables_initializer())

In [304]:
chatbot.train(session,
              1, 1024,
              input_sentences_idx[:10240],
              ground_truth_sentences_idx[:10240],
              input_lengths[:10240],
              ground_truth_lengths[:10240],
              5e-4,
              0.5)

Epoch 1, batch 1, loss = 6.808646202087402
Epoch 1, batch 2, loss = 6.779938697814941
Epoch 1, batch 3, loss = 6.709688663482666
Epoch 1, batch 4, loss = 6.65559196472168
Epoch 1, batch 5, loss = 6.599919319152832
Epoch 1, batch 6, loss = 6.496409893035889
Epoch 1, batch 7, loss = 6.345330238342285
Epoch 1, batch 8, loss = 6.384521484375
Epoch 1, batch 9, loss = 6.314704418182373
Epoch 1, batch 10, loss = 6.217668056488037


In [307]:
chatbot.get_reply(session, "how are you")


'i i i [END]'