In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
from tensorflow.contrib import skflow
from tensorflow.python.framework import dtypes
from helpers import load_glove_vectors

In [2]:
# The maximum number of words to consider for the contexts.
# This value is also handed to the VocabularyProcessor, which is applied to
# both contexts and utterances.
MAX_CONTEXT_LENGTH = 32

# The maximum number of words to consider for the utterances.
# Utterances are sliced down to this many tokens inside rnn_encoder_model.
MAX_UTTERANCE_LENGTH = 16

# Word embedding dimensionality (number of columns of the embedding matrix)
EMBEDDING_SIZE = 50

# LSTM Cell dimensionality (size passed to BasicLSTMCell)
LSTM_CELL_SIZE = 16

In [3]:
# Load Data
# The training, test and validation splits are read from CSV files under ./data.
train_df = pd.read_csv("./data/train_small.csv")
test_df = pd.read_csv("./data/test.csv")
validation_df = pd.read_csv("./data/valid.csv")
# All-zero label vector with one entry per test row.
# NOTE(review): presumably the correct utterance always sits in candidate
# slot 0 of the test file - confirm against the data.
y_test = np.zeros(len(test_df))

In [4]:
# Preprocessing
# ==================================================

# Create vocabulary mapping
# The vocabulary is fitted on training contexts and utterances together, so
# both kinds of text share one word -> id mapping.
all_sentences = np.append(train_df.Context, train_df.Utterance)
# MAX_CONTEXT_LENGTH is used for both text kinds here; utterances are cut down
# to MAX_UTTERANCE_LENGTH later, inside the model function.
vocab_processor = skflow.preprocessing.VocabularyProcessor(MAX_CONTEXT_LENGTH)
vocab_processor.fit(all_sentences)

# Transform contexts and utterances into arrays of word ids
X_train_context = np.array(list(vocab_processor.transform(train_df.Context)))
X_train_utterance = np.array(list(vocab_processor.transform(train_df.Utterance)))

# Generate training tensor
# Stacking on axis 1 puts the context at index 0 and the utterance at index 1
# of the second dimension; rnn_encoder_model splits along that same axis.
X_train = np.stack([X_train_context, X_train_utterance], axis=1)
y_train = train_df.Label

In [5]:
# Vocabulary size; also used as the number of rows of the initial embedding
# matrix built further down.
n_words = len(vocab_processor.vocabulary_)
print("Total words: {}".format(n_words))

Total words: 25094


In [6]:
# # Load glove vectors
# vocab_set = set(vocab_processor.vocabulary_._mapping.keys())
# glove_vectors, glove_dict = load_glove_vectors("./data/glove.840B.300d.txt", vocab=vocab_set)

In [7]:
# Build initial word embeddings
# A dedicated, seeded generator makes the random initialisation reproducible
# across kernel restarts without touching NumPy's global random state.
EMBEDDING_SEED = 42
initial_embeddings = (
    np.random.RandomState(EMBEDDING_SEED)
    .randn(n_words, EMBEDDING_SIZE)
    .astype("float32")
)
# for word, glove_word_idx in glove_dict.items():
#     word_idx = vocab_processor.vocabulary_.get(word)
#     initial_embeddings[word_idx, :] = glove_vectors[glove_word_idx]

In [8]:
def get_sequence_length(input_tensor, max_length):
    """
    Computes per-row sequence lengths for a batch of zero-padded id sequences.

    If a sentence is padded, returns the index of the first 0 (the padding
    symbol), which equals the number of tokens in front of the padding.
    If the sentence has no padding, returns the max length.
    The result is never larger than max_length.

    Args:
        input_tensor: 2-D integer tensor of word ids, one sequence per row.
        max_length: Python int, the length reported for unpadded rows and the
            upper bound applied to every returned length.

    Returns:
        1-D int64 tensor with one length per row of input_tensor.

    NOTE(review): a row whose very first id is 0 cannot be told apart from a
    row without any 0 (argmax yields 0 in both cases) and is therefore
    reported as max_length, exactly as before this change.
    """
    # Mark every position that holds the padding id. tf.zeros_like keeps the
    # comparison inside the graph instead of building a NumPy array from a
    # symbolic tensor.
    is_padding = tf.equal(input_tensor, tf.zeros_like(input_tensor))
    # argmax over a 0/1 mask gives the position of the first 1, i.e. the first
    # padding slot; it gives 0 when the mask contains no 1 at all.
    first_zero = tf.argmax(tf.to_int32(is_padding), 1)
    no_padding_found = tf.to_int64(tf.equal(first_zero, 0))
    # The index of the first padding slot already is the token count, so it is
    # used as-is (subtracting one would drop the last real token).
    sequence_lengths = first_zero + no_padding_found * max_length
    # Guard against inputs that are wider than max_length.
    return tf.minimum(sequence_lengths, max_length)

In [62]:
def rnn_encoder_model(X, y):
    """
    Model function that scores (context, utterance) pairs with a shared LSTM encoder.

    Both texts are embedded with one shared embedding matrix and encoded by
    the same LSTM cell. The utterance encoding is multiplied by a learned
    matrix W and compared with the context encoding through a dot product,
    giving one score per pair.

    Args:
        X: integer tensor that holds the context ids at index 0 and the
            utterance ids at index 1 of its second dimension
            (it is split in two along axis 1).
        y: target tensor with one label per pair; it is expanded to a column
            before being passed to the loss.

    Returns:
        A list [probs, loss]: the sigmoid of the scores and the logistic loss.
    """
    # Split input tensor into separate context and utterance tensors
    context, utterance = tf.split(1, 2, X, name='split')
    context = tf.squeeze(context, [1])
    utterance = tf.squeeze(utterance, [1])
    utterance_truncated = tf.slice(utterance, [0, 0], [-1, MAX_UTTERANCE_LENGTH])

    # Calculate the sequence length for RNN calculation.
    # The utterance length is measured on the truncated tensor, because that
    # is the tensor actually fed to the RNN; measuring the untruncated one
    # could report more steps than the RNN input has.
    context_seq_length = get_sequence_length(context, MAX_CONTEXT_LENGTH)
    utterance_seq_length = get_sequence_length(utterance_truncated, MAX_UTTERANCE_LENGTH)

    # Embed context and utterance into the same space
    with tf.variable_scope("shared_embeddings"), tf.device('/cpu:0'):
        embedding_tensor = tf.convert_to_tensor(initial_embeddings)
        embeddings = tf.get_variable("word_embeddings",
                                     initializer=embedding_tensor)
        # Embed the context
        word_vectors_context = skflow.ops.embedding_lookup(embeddings, context)
        # Prints the static shape of the embedded context once, at graph construction
        print(word_vectors_context.get_shape())
        # word_list_context = skflow.ops.split_squeeze(1, MAX_CONTEXT_LENGTH, word_vectors_context)
        # Embed the utterance
        word_vectors_utterance = skflow.ops.embedding_lookup(embeddings, utterance_truncated)
        # word_list_utterance = skflow.ops.split_squeeze(1, MAX_UTTERANCE_LENGTH, word_vectors_utterance)

    # Run context and utterance through the same RNN
    with tf.variable_scope("shared_rnn_params") as vs:
        cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE)
        _, context_state = tf.nn.dynamic_rnn(
            cell, word_vectors_context, dtype=dtypes.float32, sequence_length=context_seq_length)
        # Keep the columns of the final state from cell.output_size onwards.
        # NOTE(review): presumably this is the hidden-state half of a
        # concatenated LSTM state - confirm for the TensorFlow version in use.
        encoding_context = tf.slice(context_state, [0, cell.output_size], [-1, -1])
        # Second pass reuses the variables created by the first one
        vs.reuse_variables()
        _, utterance_state = tf.nn.dynamic_rnn(
            cell, word_vectors_utterance, dtype=dtypes.float32, sequence_length=utterance_seq_length)
        encoding_utterance = tf.slice(utterance_state, [0, cell.output_size], [-1, -1])

    with tf.variable_scope("prediction"):
        W = tf.get_variable("W",
                            shape=[encoding_context.get_shape()[1], encoding_utterance.get_shape()[1]],
                            initializer=tf.random_normal_initializer())
        b = tf.get_variable("b", [1])

        # We can interpret this as a "generated context"
        generated_context = tf.matmul(encoding_utterance, W)
        # Batch multiply contexts and utterances (batch_matmul only works with 3-d tensors)
        generated_context = tf.expand_dims(generated_context, 2)
        encoding_context = tf.expand_dims(encoding_context, 2)
        # The first operand is transposed (third argument), so each pair
        # reduces to a single dot product.
        scores = tf.batch_matmul(generated_context, encoding_context, True) + b
        # Go from [batch, 1, 1] to [batch, 1]: we want one score per pair
        scores = tf.squeeze(scores, [2])
        # Convert scores into probabilities
        probs = tf.sigmoid(scores)

        # Calculate loss
        loss = tf.contrib.losses.logistic(scores, tf.expand_dims(y, 1))

    return [probs, loss]

In [63]:
# Evaluation
def evaluate_recall(y, y_labels, n=1):
    """
    Recall@n: the share of examples whose label is among the first n predictions.

    Args:
        y: sequence of ranked prediction sequences, one per example.
        y_labels: sequence of true labels, aligned with y.
        n: how many leading predictions of each example are inspected.

    Returns:
        Number of hits divided by len(y), as a float.
    """
    total = float(len(y))
    hits = sum(1 for ranked, target in zip(y, y_labels) if target in ranked[:n])
    return hits / total

In [64]:
def predict_rnn_batch(contexts, utterances, n=1):
    """
    Ranks the candidate utterances of every context with the trained classifier.

    Args:
        contexts: sequence of raw context strings, one per example.
        utterances: 2-D array of raw candidate strings; row i holds the
            candidates that belong to contexts[i].
        n: unused; kept so existing callers keep working.

    Returns:
        2-D integer array; row i lists the candidate indices of example i
        ordered from highest to lowest predicted probability.
    """
    num_contexts = len(contexts)
    candidates_per_context = utterances.shape[1]

    # Encode every text once instead of calling the vocabulary processor
    # separately for each (context, utterance) pair.
    context_ids = np.array(list(vocab_processor.transform(contexts)))
    # ravel() walks the candidates row by row, so they stay grouped by context
    utterance_ids = np.array(list(vocab_processor.transform(utterances.ravel())))

    # Repeat each context once per candidate so both arrays line up row by
    # row, then pair them on axis 1 (context at index 0, utterance at index 1).
    repeated_context_ids = np.repeat(context_ids, candidates_per_context, axis=0)
    batch = np.stack([repeated_context_ids, utterance_ids], axis=1)

    result = classifier.predict_proba(batch)[:, 0]
    # One row of probabilities per context
    result = result.reshape(num_contexts, -1)
    # argsort is ascending; reversing each row puts the best candidate first
    return np.argsort(result, axis=1)[:, ::-1]

In [65]:
def evaluate_rnn_predictor(df):
    """
    Prints recall@1, @2, @5 and @10 of the classifier on a data frame.

    The frame's Context column supplies the contexts and every column from
    the second one onwards supplies the candidate utterances. The expected
    label of every row is 0.
    """
    expected_labels = np.zeros(len(df))
    ranked_predictions = predict_rnn_batch(df.Context, df.iloc[:, 1:].values)
    for cutoff in (1, 2, 5, 10):
        recall = evaluate_recall(ranked_predictions, expected_labels, cutoff)
        print("Recall @ ({}, 10): {:g}".format(cutoff, recall))

In [66]:
class ValidationMonitor(tf.contrib.learn.monitors.BaseMonitor):
    """
    Monitor that runs the recall report on validation_df every val_steps steps.
    """

    def __init__(self, print_steps=100, early_stopping_rounds=None, verbose=1, val_steps=1000):
        """
        Forwards the standard monitor options to the base class and stores
        the validation interval.
        """
        base_options = dict(
            print_steps=print_steps,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
        )
        super(ValidationMonitor, self).__init__(**base_options)
        self.val_steps = val_steps

    def _modify_summary_string(self):
        """
        Runs the validation report whenever the step counter is a multiple of val_steps.

        NOTE(review): self.steps is not set in this class; presumably the
        base monitor maintains it - confirm.
        """
        if self.steps % self.val_steps != 0:
            return
        evaluate_rnn_predictor(validation_df)

In [67]:
def training_rate_decay_func(global_step):
    """
    Learning-rate schedule: exponential decay starting at 1e-4, with a decay
    rate of 0.95 per 10000 steps and the staircase option switched on.
    """
    return tf.train.exponential_decay(
        learning_rate=1e-4,
        global_step=global_step,
        decay_steps=10000,
        decay_rate=0.95,
        staircase=True)

In [68]:
# Estimator built around rnn_encoder_model. The learning rate is given as a
# function of the global step (training_rate_decay_func) rather than a constant.
classifier = tf.contrib.learn.TensorFlowEstimator(
    model_fn=rnn_encoder_model,
    n_classes=1,
    continue_training=True,
    learning_rate=training_rate_decay_func,
    steps=1000000,
    batch_size=16)

In [69]:
# classifier = tf.contrib.learn.TensorFlowEstimator.restore('./tmp/tf/dual_lstm_chatbot/checkpoints')

In [70]:
# val_steps=10000: the monitor runs the validation recall report whenever the
# step counter is a multiple of 10000.
monitor = ValidationMonitor(print_steps=100, val_steps=10000)
# Trains on the stacked (context, utterance) tensor; logs go to ./tmp/tf/dual_lstm_chatbot/.
# The recorded run below was stopped by hand (KeyboardInterrupt).
classifier.fit(X_train, y_train, logdir='./tmp/tf/dual_lstm_chatbot/', monitor=monitor)

(?, 32, 50)
Step #99, avg. train loss: 0.83456
Step #199, avg. train loss: 0.82526
Step #299, avg. train loss: 0.80704
Step #399, avg. train loss: 0.83072
Step #499, avg. train loss: 0.79679
Step #599, avg. train loss: 0.84260


KeyboardInterrupt: 