In [161]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
from tensorflow.contrib import skflow

In [174]:
# Hyperparameters
MAX_DOCUMENT_LENGTH = 200
MAX_CONTEXT_LENGTH = 160 # From paper
EMBEDDING_SIZE = 50

In [175]:
# Load Data
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
validation_df = pd.read_csv("./data/valid.csv")
y_test = np.zeros(len(test_df))

In [210]:
# We will feed [CONTEXT]-[UTTERANCE] into the LSTM
# We pad all contexts to the MAX_CONTEXT_LENGTH
def build_input_seq(context, utterance):
    padded_context = context.split(" ")[-MAX_CONTEXT_LENGTH:]
    padding_needed 
    return "{} <START_OF_UTTERANCE> {}".format(truncated_context, utterance)

In [237]:
# Preprocessing
all_sentences = np.append(train_df.Context, train_df.Utterance)
vocab_processor = skflow.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
vocab_processor.fit(all_sentences)
X_train_context = np.array(list(vocab_processor.transform(train_df.Context)))
X_train_utterance = np.array(list(vocab_processor.transform(train_df.Utterance)))
X_train = np.stack([X_train_context, X_train_utterance], axis=1)
y_train = train_df.Label

In [239]:
n_words = len(vocab_processor.vocabulary_)
print("Total words: {}".format(n_words))

Total words: 25094


In [249]:
# Customized function to transform batched X into embeddings
def input_op_fn(X):
    # Convert indexes of words into embeddings.
    # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
    # maps word indexes of the sequence into [batch_size, sequence_length,
    # EMBEDDING_SIZE].
    print(X.get_shape())
    word_vectors = skflow.ops.categorical_variable(X, n_classes=n_words,
        embedding_size=EMBEDDING_SIZE, name='words')
    # Split into list of embedding per word, while removing doc length dim.
    # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
    word_list = skflow.ops.split_squeeze(1, MAX_DOCUMENT_LENGTH, word_vectors)
    return word_list

In [250]:
class DoubleRNNClassifier(TensorFlowRNNClassifier):
    def _model_fn(self, X, y):
        return models.get_rnn_model(self.rnn_size, self.cell_type,
                                self.num_layers,
                                self.input_op_fn, self.bidirectional,
                                models.logistic_regression,
                                self.sequence_length,
                                self.initial_state)(X, y)

NameError: name 'TensorFlowRNNClassifier' is not defined

In [251]:
# Single direction LSTM with a single layer
classifier = skflow.TensorFlowRNNClassifier(rnn_size=EMBEDDING_SIZE, 
    n_classes=2, cell_type='lstm', input_op_fn=input_op_fn,
    num_layers=1, bidirectional=False, sequence_length=None,
    steps=1000, optimizer='Adam', learning_rate=0.01, continue_training=True)

In [252]:
while True:
    classifier.fit(X_train, y_train, logdir='/tmp/tf_examples/word_rnn2')

(?, 2, 200)


ValueError: Number of ways to split should evenly divide the split dimension but got split_dim 1 (size = 2) and num_split 200

In [149]:
# Evaluation
def evaluate_recall(y, y_labels, n=1):
    num_examples = float(len(y))
    num_correct = 0
    for predictions, label in zip(y, y_labels):
        if label in predictions[:n]:
            num_correct += 1
    return num_correct/num_examples

In [67]:
test_df.iloc[0,1:].values.shape

(10,)

In [160]:
def predict_rnn(context, utterances, n=1):
    inputs_strs = [build_input_seq(context, u) for u in utterances]
    inputs = np.array(list(vocab_processor.transform(inputs_strs)))
    result = classifier.predict_proba(inputs)[:,1]
    return np.argsort(result)[::-1]

In [159]:
# Evaluate RNN predictor
y_test = np.zeros(len(test_df))
y = [predict_rnn(test_df.Context[x], test_df.iloc[x,1:].values) for x in range(1000)]
print("")
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))

....................................................................................................
Recall @ (1, 10): 0.05
Recall @ (2, 10): 0.2
Recall @ (5, 10): 0.56
Recall @ (10, 10): 1
