# Building a Chatbot

In this project, we will build a chatbot using conversations from Cornell University's [Movie Dialogue Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). The main features of our model are LSTM cells, a bidirectional dynamic RNN, and decoders with attention. 

The conversations will be cleaned rather extensively to help the model produce better responses. As part of the cleaning process, punctuation will be removed, rare words will be replaced with "UNK" (our "unknown" token), longer sentences will not be used, and all letters will be converted to lowercase. 

With a larger amount of data, it would be more practical to keep features such as punctuation. However, I am using FloydHub's GPU services and I don't want to get carried away with training for too long.

In [1]:
# Clear the interactive namespace so the notebook starts from a clean state (asks for confirmation).
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import numpy as np
import tensorflow as tf
import time
import os

#Local libraries
import metrics
import loss_functions

# Bare expression in the middle of a cell: it is evaluated but not displayed.
tf.__version__

# Seed NumPy and TensorFlow so that shuffling and weight initialisation are repeatable.
np.random.seed(1)
tf.set_random_seed(1)

Most of the code to load the data is courtesy of https://github.com/suriyadeepan/practical_seq2seq/blob/master/datasets/cornell_corpus/data.py.

## Load and Preprocess Data

In [3]:
corpus_dir = os.path.join("corpora")
var_names = ["train_prompts", "train_answers", "valid_prompts", "valid_answers", "vocab_lines"]
# The four corpus files live in `corpus_dir`; the vocabulary file is read from the working directory.
file_names = [os.path.join(corpus_dir, var_name + ".txt") for var_name in var_names[:-1]] + ["vocab.txt"]

# Each file becomes a module-level list of token lists (one inner list per line), bound to the
# matching name in `var_names`.
for (file_name, var_name) in zip(file_names, var_names):
    with open(file_name, "r", encoding="utf-8") as r:
        text = [line.strip().split(" ") for line in r]
    # Bind the name directly instead of exec()-ing a formatted repr of the whole corpus:
    # no source-code round trip, and no risk of file content being evaluated as code.
    globals()[var_name] = text
        


In [4]:
# Each vocabulary row is "<token> <index>"; build the lookup in both directions.
vocab2int = dict((entry[0], int(entry[1])) for entry in vocab_lines)
int2vocab = dict((index, word) for (word, index) in vocab2int.items())

# Prompts and answers share one vocabulary; every alias below points at the same two dicts.
questions_vocab_to_int = vocab2int
questions_int_to_vocab = int2vocab
prompts_vocab_to_int = vocab2int  #Alternative names to ease the transition
prompts_int_to_vocab = int2vocab
answers_vocab_to_int = vocab2int
answers_int_to_vocab = int2vocab

# The token on the first vocabulary row is used as the unknown-word token.
UNK = vocab_lines[0][0]
# All meta tokens share one id, placed just past the last real vocabulary id.
METATOKEN_INDEX = len(vocab2int)
META, EOS, PAD, GO = "<META>", "<EOS>", "<PAD>", "<GO>"
codes = [META, EOS, PAD, GO]


In [5]:
def int_to_text(sequence, int2vocab):
    """Map word ids back to tokens, dropping every occurrence of the shared meta-token id."""
    tokens = []
    for index in sequence:
        if index == METATOKEN_INDEX:
            continue
        tokens.append(int2vocab[index])
    return tokens

def text_to_int(sequence, vocab2int):
    """Map tokens to word ids; meta tokens are skipped and unknown tokens get the UNK id."""
    ids = []
    for token in sequence:
        if token in codes:
            continue
        ids.append(vocab2int.get(token, vocab2int[UNK]))
    return ids

In [6]:
# Encode every tokenised sentence as a list of vocabulary ids (unknown words map to the UNK id).
train_prompts_int = [text_to_int(tokens, questions_vocab_to_int) for tokens in train_prompts]
train_answers_int = [text_to_int(tokens, answers_vocab_to_int) for tokens in train_answers]
valid_prompts_int = [text_to_int(tokens, questions_vocab_to_int) for tokens in valid_prompts]
valid_answers_int = [text_to_int(tokens, answers_vocab_to_int) for tokens in valid_answers]

In [7]:
# Show the first ten prompts, then the first ten answers, each followed by its id encoding.
for (token_seqs, id_seqs) in ((train_prompts, train_prompts_int), (train_answers, train_answers_int)):
    for i in range(10):
        print(token_seqs[i])
        print(id_seqs[i])

['<UNK>', 'says', 'she', 'does', 'not', 'see', 'how', 'you', 'do', 'it', '.']
[0, 292, 54, 103, 9, 75, 57, 3, 12, 11, 1]
['somewhere', '.', 'see', 'i', 'was', 'taken', 'away', 'by', 'the', '<UNK>', 'when', 'i', 'was', 'a', 'baby', '.', 'i', 'was', 'adopted', '.']
[575, 1, 75, 4, 30, 605, 177, 129, 6, 0, 84, 4, 30, 10, 305, 1, 4, 30, 5491, 1]
['it', 'is', 'just', 'a', 'work', 'in', 'progress', ',', 'kinda', 'rough', '.']
[11, 8, 40, 10, 155, 21, 2377, 2, 788, 1412, 1]
['your', 'majesty', ',', 'herr', 'mozart', '-']
[29, 1381, 2, 2833, 2494, 25]
['<UNK>', 'is', 'quite', 'all', 'right', '.', 'relax', ',', '<UNK>']
[0, 8, 419, 42, 65, 1, 850, 2, 0]
['it', 'is', 'my', 'father', ',', 'gone', '<UNK>', '.', 'the', 'baron', 'couer', 'de', 'noir', 'is', 'his', 'guest', 'and', 'must', 'be', 'provided', 'with', 'some', 'sport', '.']
[11, 8, 32, 224, 2, 333, 0, 1, 6, 1686, 7605, 1093, 5537, 8, 83, 1555, 15, 153, 33, 7071, 36, 91, 2501, 1]
['i', 'appreciate', 'that', ',', 'because', 'some', 'of', 't

## Word2Vec Embeddings

In [8]:
# Pool every prompt and answer (training + validation) into one list of tokenised sentences;
# this is the corpus the Word2Vec model below is trained on.
combined_corpus = train_prompts + train_answers + valid_prompts + valid_answers
len(combined_corpus)

429324

In [9]:
# Peek at the first five tokenised sentences.
combined_corpus[:5]

[['<UNK>', 'says', 'she', 'does', 'not', 'see', 'how', 'you', 'do', 'it', '.'],
 ['somewhere',
  '.',
  'see',
  'i',
  'was',
  'taken',
  'away',
  'by',
  'the',
  '<UNK>',
  'when',
  'i',
  'was',
  'a',
  'baby',
  '.',
  'i',
  'was',
  'adopted',
  '.'],
 ['it',
  'is',
  'just',
  'a',
  'work',
  'in',
  'progress',
  ',',
  'kinda',
  'rough',
  '.'],
 ['your', 'majesty', ',', 'herr', 'mozart', '-'],
 ['<UNK>', 'is', 'quite', 'all', 'right', '.', 'relax', ',', '<UNK>']]

In [10]:
from gensim.models import Word2Vec
# Dimensionality of the word vectors trained below.
embedding_size = 1024
# Train CBOW (sg=0) vectors on the pooled corpus; min_count=1 keeps every token, however rare.
model = Word2Vec(sentences=combined_corpus, size=embedding_size, window=5, min_count=1, workers=4, sg=0)

In [11]:
# Sanity check: a single word vector should have `embedding_size` entries.
model.wv['well'].shape

(1024,)

In [12]:
# Handle on the trained keyed vectors.
# NOTE(review): `wordVecs` is later re-bound to a NumPy array loaded from disk in the hyperparameter cell.
wordVecs = model.wv

In [13]:
# Row r of `word_vecs` holds the vector of the word whose vocabulary id is r, so the matrix can be
# used directly as an embedding table indexed through `vocab2int`.
word_vecs = np.zeros((len(model.wv.vocab), model.wv.vector_size))
for word in model.wv.index2word:
    # `model.wv[word]` replaces the deprecated `model[word]` lookup.
    word_vecs[vocab2int[word]] = model.wv[word]
      

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# All five tables should report the same number of entries.
print("Vocabulary lengths")
for table in (word_vecs, questions_vocab_to_int, answers_vocab_to_int,
              questions_int_to_vocab, answers_int_to_vocab):
    print(len(table))

Vocabulary lengths
12001
12001
12001
12001
12001


In [15]:
# Number of rows in the embedding matrix (one per vocabulary id).
len(word_vecs)

12001

In [16]:
# Persist the plain Word2Vec embedding matrix.
np.save('word_Vecs.npy',word_vecs)

<H1> Word2Affect Vector - VAD </H1>

In [17]:
import pandas as pd
# Load the Warriner, Kuperman & Brysbaert (2013) word-rating spreadsheet; it is read from the working directory.
df_vad=pd.read_excel('Warriner, Kuperman, Brysbaert - 2013 BRM-ANEW expanded.xlsx')

In [18]:
# Preview the lexicon: one row per word, with V/A/D rating columns.
df_vad.head(5)

Unnamed: 0,Word,V.Mean.Sum,V.SD.Sum,V.Rat.Sum,A.Mean.Sum,A.SD.Sum,A.Rat.Sum,D.Mean.Sum,D.SD.Sum,D.Rat.Sum,...,A.Rat.L,A.Mean.H,A.SD.H,A.Rat.H,D.Mean.L,D.SD.L,D.Rat.L,D.Mean.H,D.SD.H,D.Rat.H
1,aardvark,6.26,2.21,19,2.41,1.4,22,4.27,1.75,15,...,11,2.55,1.29,11,4.12,1.64,8,4.43,1.99,7
2,abalone,5.3,1.59,20,2.65,1.9,20,4.95,1.79,22,...,12,2.38,1.92,8,5.55,2.21,11,4.36,1.03,11
3,abandon,2.84,1.54,19,3.73,2.43,22,3.32,2.5,22,...,11,3.82,2.14,11,2.77,2.09,13,4.11,2.93,9
4,abandonment,2.63,1.74,19,4.95,2.64,21,2.64,1.81,28,...,14,5.29,2.63,7,2.31,1.45,16,3.08,2.19,12
5,abbey,5.85,1.69,20,2.2,1.7,20,5.0,2.02,25,...,9,2.55,1.92,11,4.83,2.18,18,5.43,1.62,7


In [19]:
# Words known to the Word2Vec model, in the model's own index order.
list_wordvecs = list(model.wv.index2word)

In [20]:
# Number of distinct tokens in the trained Word2Vec vocabulary.
len(model.wv.vocab)

12001

In [21]:
# Distinct words covered by the lexicon. NOTE: despite its name, `list_vad` is a set.
list_vad = set(df_vad['Word'])

In [22]:
import nltk
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance used by the cells below.
lemmatizer = WordNetLemmatizer()

In [23]:
def lemmatize_text(text):
    """Return the lemma of `text` from the shared lemmatizer, coercing the input to `str` first."""
    as_string = str(text)
    return lemmatizer.lemmatize(as_string)

In [24]:
# Add a lemmatised copy of the word column so corpus words can be matched by lemma.
# Note that this adds a column to `df_vad` in place.
df_vad['Lemma_Word'] = df_vad.Word.apply(lemmatize_text)

In [25]:
# Build the affect-augmented embedding table: each row is the word's Word2Vec vector followed by
# three affect values (V, A, D means) looked up by lemma in the lexicon.
w2v_dim = model.wv.vector_size
neutral_vad = (5, 1, 5)  # affect values assigned to words whose lemma is not in the lexicon

# Lemma -> (V, A, D) lookup built once, instead of rebuilding a set and scanning the DataFrame
# three times for every vocabulary word.  setdefault keeps the FIRST lexicon row of each lemma,
# matching the former `.iloc[0]` selection.
vad_by_lemma = {}
for (lexicon_lemma, v_mean, a_mean, d_mean) in zip(df_vad['Lemma_Word'], df_vad['V.Mean.Sum'],
                                                   df_vad['A.Mean.Sum'], df_vad['D.Mean.Sum']):
    vad_by_lemma.setdefault(lexicon_lemma, (v_mean, a_mean, d_mean))

word_vecs_vad = np.zeros((len(model.wv.vocab), w2v_dim + len(neutral_vad)))
count_vad=0
count_neutral=0
for word in model.wv.index2word:
    row = word_vecs_vad[vocab2int[word]]  # view into the table: writes land in `word_vecs_vad`
    row[:w2v_dim] = model.wv[word]        # `model.wv[word]` replaces the deprecated `model[word]`
    vad = vad_by_lemma.get(lemmatizer.lemmatize(word))
    if vad is None:
        count_neutral += 1
        row[w2v_dim:] = neutral_vad
    else:
        count_vad += 1
        row[w2v_dim:] = vad


  if __name__ == '__main__':


In [26]:
# Words that received lexicon affect values vs. words that fell back to the neutral values.
print(count_vad)
print(count_neutral)

6850
5151


In [27]:
# Spot check: look up the V mean of a single lexicon word.
val =df_vad[df_vad['Word'] == "mailbox"]["V.Mean.Sum"]
val

7352    6.05
Name: V.Mean.Sum, dtype: float64

In [28]:
# Persist the affect-augmented embedding matrix; the hyperparameter cell loads this file.
np.save('word_Vecs_VAD.npy',word_vecs_vad)

<H1>Word2Vec - counterfitting + affect </H1>

In [30]:
import gensim

# Load pre-trained vectors stored in word2vec binary format
# (going by the file name: counter-fitted embeddings with affect values appended).
model_counterfit_affect = gensim.models.KeyedVectors.load_word2vec_format('./w2v_counterfit_append_affect.bin', binary=True)

In [90]:
# NOTE(review): the recorded output (8101) differs from the 12001 reported earlier in the notebook,
# so this cell was last executed against a different model - re-run before trusting it.
len(model.wv.vocab)

8101

In [43]:
# Words available in the pretrained vectors.  The object returned by load_word2vec_format is already
# a KeyedVectors instance, so `.vocab` is read directly rather than through the deprecated `.wv`.
list_counterfit = list(model_counterfit_affect.vocab.keys())

  """Entry point for launching an IPython kernel.


In [60]:
# Index the pretrained vocabulary by lemma; when several surface forms share a lemma, the last one wins.
dict_lemma_counterfit = {lemmatize_text(surface): surface for surface in list_counterfit}

In [61]:
# One row per vocabulary id, as wide as the pretrained vectors.  Words whose lemma is unknown to
# the pretrained model keep an all-zero row for now and are collected in `list_word_not_found`.
word_vecs_counterfit_affect = np.zeros((len(model.wv.vocab), model_counterfit_affect.vector_size))
list_word_not_found = []
for word in model.wv.index2word:
    lemma = lemmatizer.lemmatize(word)
    if lemma in dict_lemma_counterfit:
        word_vecs_counterfit_affect[vocab2int[word]] = model_counterfit_affect[dict_lemma_counterfit[lemma]]
    else:
        list_word_not_found.append(vocab2int[word])

In [62]:
# How many vocabulary words have no pretrained vector.
len(list_word_not_found)

849

In [63]:
# Fallback vector for vocabulary words missing from the pretrained model: the mean of the rows that
# were actually filled in.  Rows listed in `list_word_not_found` are excluded; they are still
# all-zero at this point, and averaging over them would shrink the fallback towards zero.
found_rows = np.ones(len(word_vecs_counterfit_affect), dtype=bool)
found_rows[list_word_not_found] = False
if found_rows.any():
    word_unknown = word_vecs_counterfit_affect[found_rows].mean(axis=0)
else:
    # Nothing matched: keep an all-zero vector instead of taking the mean of an empty selection.
    word_unknown = np.zeros(word_vecs_counterfit_affect.shape[1])

In [64]:
# The fallback vector should be as wide as one embedding row.
word_unknown.shape

(303,)

In [65]:
# Give every vocabulary word that had no pretrained vector the shared fallback embedding.
word_vecs_counterfit_affect[list_word_not_found] = word_unknown

In [66]:
# Final shape of the embedding table: (vocabulary size, vector width).
word_vecs_counterfit_affect.shape

(12001, 303)

In [67]:
# Persist the counter-fitted + affect embedding matrix.
np.save('word_Vecs_counterfit_affect.npy',word_vecs_counterfit_affect)

<H1>Word2Vec - retrofitting + affect </H1>

In [68]:
import gensim

# Load pre-trained vectors stored in word2vec binary format
# (going by the file name: retrofitted embeddings with affect values appended).
model_retrofit_affect = gensim.models.KeyedVectors.load_word2vec_format('./w2v_retrofit_append_affect.bin', binary=True)

In [69]:
# Words available in the pretrained vectors.  The object returned by load_word2vec_format is already
# a KeyedVectors instance, so `.vocab` is read directly rather than through the deprecated `.wv`.
list_retrofit = list(model_retrofit_affect.vocab.keys())

  """Entry point for launching an IPython kernel.


In [70]:
# Index the pretrained vocabulary by lemma; when several surface forms share a lemma, the last one wins.
dict_lemma_retrofit = {lemmatize_text(surface): surface for surface in list_retrofit}

In [71]:
# One row per vocabulary id, as wide as the pretrained vectors.  Words whose lemma is unknown to
# the pretrained model keep an all-zero row for now and are collected in `list_word_not_found_retro`.
word_vecs_retrofit_affect = np.zeros((len(model.wv.vocab), model_retrofit_affect.vector_size))
list_word_not_found_retro = []
for word in model.wv.index2word:
    lemma = lemmatizer.lemmatize(word)
    if lemma in dict_lemma_retrofit:
        word_vecs_retrofit_affect[vocab2int[word]] = model_retrofit_affect[dict_lemma_retrofit[lemma]]
    else:
        list_word_not_found_retro.append(vocab2int[word])

In [72]:
# How many vocabulary words have no pretrained vector.
len(list_word_not_found_retro)

849

In [73]:
# Fallback vector for vocabulary words missing from the pretrained model: the mean of the rows that
# were actually filled in.  Rows listed in `list_word_not_found_retro` are excluded; they are still
# all-zero at this point, and averaging over them would shrink the fallback towards zero.
found_rows_retro = np.ones(len(word_vecs_retrofit_affect), dtype=bool)
found_rows_retro[list_word_not_found_retro] = False
if found_rows_retro.any():
    word_unknown_retro = word_vecs_retrofit_affect[found_rows_retro].mean(axis=0)
else:
    # Nothing matched: keep an all-zero vector instead of taking the mean of an empty selection.
    word_unknown_retro = np.zeros(word_vecs_retrofit_affect.shape[1])

In [74]:
# Give every vocabulary word that had no pretrained vector the shared fallback embedding.
word_vecs_retrofit_affect[list_word_not_found_retro] = word_unknown_retro

In [75]:
# Persist the retrofitted + affect embedding matrix.
np.save('word_Vecs_retrofit_affect.npy',word_vecs_retrofit_affect)

## Additional Preprocessing

In [29]:
#Add EOS tokens to target data now that the embeddings have been trained
def append_eos(answers_text, answers_int):
    """Return copies of both answer collections with an end-of-sequence marker on every sequence.

    Text sequences receive the `EOS` token; id sequences receive `METATOKEN_INDEX`.
    New lists are built, so the inputs themselves are left unmodified.
    """
    text_with_eos = []
    for tokens in answers_text:
        text_with_eos.append(tokens + [EOS])

    ints_with_eos = []
    for ids in answers_int:
        ints_with_eos.append(ids + [METATOKEN_INDEX])

    return (text_with_eos, ints_with_eos)

# NOTE: not idempotent - the results are bound back to the same names, so executing this cell a
# second time appends a second EOS marker to every answer.
(train_answers, train_answers_int) = append_eos(train_answers, train_answers_int)
(valid_answers, valid_answers_int) = append_eos(valid_answers, valid_answers_int)

# Show a few answers in both representations to confirm that the marker was added.
print(train_answers[:5])
print(train_answers_int[:5])

[['what', 'is', 'wrong', 'with', 'her', '.', '<EOS>'], ['adopted', '.', 'i', 'should', 'have', 'know', '.', 'of', 'course', '.', 'if', 'it', 'was', 'a', 'snake', ',', 'it', 'would', 'bit', 'me', '!', '<EOS>'], ['this', 'guy', 'killed', 'a', 'mess', 'of', 'people', '.', '<EOS>'], ['yes', ',', 'what', 'about', 'him', '?', '<EOS>'], ['<UNK>', 'is', 'a', 'great', 'honor', ',', 'sir', '.', 'i-', 'i-', '<EOS>']]
[[16, 8, 212, 36, 69, 1, 12001], [5491, 1, 4, 117, 20, 28, 1, 17, 194, 1, 52, 11, 30, 10, 2332, 2, 11, 44, 464, 22, 18, 12001], [26, 168, 274, 10, 1091, 17, 125, 1, 12001], [72, 2, 16, 43, 56, 5, 12001], [0, 8, 10, 189, 840, 2, 145, 1, 2234, 2234, 12001]]


In [30]:
def process_decoding_input(target_data, batch_size):
    '''Build decoder inputs: drop the last time step of every target row and prepend the start id.

    The start id is `METATOKEN_INDEX`, the index shared by all meta tokens.
    '''
    without_last_step = tf.strided_slice(target_data, begin=[0, 0], end=[batch_size, -1], strides=[1, 1])
    go_column = tf.fill([batch_size, 1], METATOKEN_INDEX)
    return tf.concat([go_column, without_last_step], axis=1)


In [31]:
def dropout_cell(rnn_size, keep_prob):
    """Create one `BasicLSTMCell` with `rnn_size` units, with dropout applied to its inputs."""
    return tf.contrib.rnn.DropoutWrapper(
        tf.contrib.rnn.BasicLSTMCell(rnn_size),
        input_keep_prob=keep_prob,
    )

def multi_dropout_cell(rnn_size, keep_prob, num_layers):
    """Stack `num_layers` separately created dropout-wrapped LSTM cells into one `MultiRNNCell`."""
    layers = []
    for _ in range(num_layers):
        layers.append(dropout_cell(rnn_size, keep_prob))
    return tf.contrib.rnn.MultiRNNCell(layers)

In [32]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_lengths):
    """
    Run the embedded inputs through a bidirectional dynamic RNN.

    Two separately built stacks of dropout-wrapped LSTM cells (see `multi_dropout_cell`) read the
    sequence in the forward and in the backward direction.

    Returns the `(outputs, output_states)` tuple produced by `tf.nn.bidirectional_dynamic_rnn`:
      outputs is a 2-tuple holding the per-time-step outputs of the forward and backward passes
      output_states is a 2-tuple holding the final states of the forward and backward passes
    """
    fw_stack = multi_dropout_cell(rnn_size, keep_prob, num_layers)
    bw_stack = multi_dropout_cell(rnn_size, keep_prob, num_layers)
    return tf.nn.bidirectional_dynamic_rnn(fw_stack, bw_stack, rnn_inputs,
                                           sequence_length=sequence_lengths,
                                           dtype=tf.float32)

## Decoding

In [33]:
def decoding_layer(enc_state, enc_outputs, dec_embed_input, dec_embeddings, #Inputs
                        attn_size, rnn_size, num_layers, output_layer, #Architecture
                        keep_prob,  #Hyperparameters
                        source_lengths, target_lengths, batch_size): 
    """Build the attention decoder once and wire it up for both training and greedy inference.

    enc_state       - initial decoder cell state (replaces the cell state of the attention zero state)
    enc_outputs     - encoder outputs used as the attention memory
    dec_embed_input - embedded decoder inputs fed through the TrainingHelper
    dec_embeddings  - embedding table used by the GreedyEmbeddingHelper at inference time
    attn_size       - number of units of the Bahdanau attention mechanism
    rnn_size, num_layers, keep_prob - passed to `multi_dropout_cell` for the decoder cell
    output_layer    - layer applied to the decoder outputs by BasicDecoder
    source_lengths  - memory sequence lengths for the attention mechanism
    target_lengths  - sequence lengths for the training helper
    batch_size      - batch dimension used for the zero state and the start tokens

    Returns `(train_logits, infer_ids)`: the `rnn_output` of the training decoder and the
    `sample_id` of the inference decoder.
    """
   
    with tf.variable_scope("decoding") as scope:
        dec_cell = multi_dropout_cell(rnn_size, keep_prob, num_layers)
        init_dec_state_size = batch_size
        attn_mech = tf.contrib.seq2seq.BahdanauAttention(num_units=attn_size, memory=enc_outputs,
                                                         memory_sequence_length=source_lengths)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech,
                                                    attention_layer_size=dec_cell.output_size)
        # Start from the wrapper's zero state, but carry over the encoder state as the cell state.
        init_dec_state = attn_cell.zero_state(init_dec_state_size, tf.float32).clone(cell_state=enc_state)
        
        # Both decoders are created from the same cell, initial state and output layer objects;
        # only the helper differs.
        decoder_gen = lambda helper: tf.contrib.seq2seq.BasicDecoder(attn_cell, helper, init_dec_state,
                                        output_layer = output_layer)
        
        #TRAINING
        train_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, target_lengths)
        train_decoder = decoder_gen(train_helper)
        train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(train_decoder, impute_finished=True, scope=scope)
        train_logits = train_outputs.rnn_output

        #INFERENCE
        # METATOKEN_INDEX serves as both the start token and the end token; decoding is capped at 100 steps.
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, 
                                                                start_tokens = tf.tile([METATOKEN_INDEX],
                                                                                       [batch_size]),
                                                                 end_token = METATOKEN_INDEX)
        infer_decoder = decoder_gen(infer_helper)
        infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(infer_decoder, scope=scope, maximum_iterations=100)
        infer_ids = infer_outputs.sample_id
                
    return train_logits, infer_ids

In [34]:
def seq2seq_model(enc_embed_input, dec_embed_input, dec_embeddings, #Inputs
                  source_lengths, target_lengths, batch_size, #Dimensions
                  attn_size, rnn_size, num_layers, output_layer, #Architecture
                  keep_prob): #Hyperparameters
    """Connect the bidirectional encoder to the attention decoder.

    The forward and backward encoder outputs are concatenated along the last axis to form the
    attention memory; only the forward final state is passed on as the decoder's initial state.

    Returns the `(train_logits, infer_ids)` pair produced by `decoding_layer`.
    """
    (fw_bw_outputs, fw_bw_states) = encoding_layer(enc_embed_input, rnn_size, num_layers,
                                                   keep_prob, source_lengths)
    attention_memory = tf.concat(fw_bw_outputs, -1)
    forward_final_state = fw_bw_states[0]

    return decoding_layer(enc_state=forward_final_state,
                          enc_outputs=attention_memory,
                          dec_embed_input=dec_embed_input,
                          dec_embeddings=dec_embeddings,
                          attn_size=attn_size,
                          rnn_size=rnn_size,
                          num_layers=num_layers,
                          output_layer=output_layer,
                          keep_prob=keep_prob,
                          source_lengths=source_lengths,
                          target_lengths=target_lengths,
                          batch_size=batch_size)

In [35]:
#FLAGS
# NOTE(review): neither flag is read by any cell visible in this notebook - confirm they are still needed.
flag_affect_functions = False # change this flag to false if affect functions are not used
flag_vad_values = True # Set to true if using VAD values appended onto existing embeddings

#Settings used by Asghar et al.
rnn_size = 1024
num_layers = 1
attention_size = 256
# The training loop switches from cross-entropy to the affective loss after this many epochs.
epochs_before_affective_loss = 40
epochs = 50
train_batch_size = 64


#Training
learning_rate = 0.0001
keep_probability = 0.75

#Validation
valid_batch_size = 64

# Embedding table saved earlier in the notebook (Word2Vec vectors with three affect values appended).
embedding_model_path = "word_Vecs_VAD.npy"
wordVecs = np.load(embedding_model_path).astype(np.float32)

embedding_size = wordVecs.shape[1] #Dynamically determine embedding size from loaded embedding file

# Append one all-zero row for the shared meta token, so that METATOKEN_INDEX is a valid embedding row.
metatoken_embedding = np.zeros((1, embedding_size), dtype=wordVecs.dtype)
wordVecsWithMeta = np.concatenate( (wordVecs, metatoken_embedding), axis=0 )
vocab_size_with_meta = wordVecsWithMeta.shape[0]

print("vocab_size_with_meta =", vocab_size_with_meta)
print("METATOKEN_INDEX =", METATOKEN_INDEX)
print("wordVecsWithMeta.shape =", wordVecsWithMeta.shape)
print("wordVecsWithMeta[METATOKEN_INDEX] =", wordVecsWithMeta[METATOKEN_INDEX])


vocab_size_with_meta = 12002
METATOKEN_INDEX = 12001
wordVecsWithMeta.shape = (12002, 1027)
wordVecsWithMeta[METATOKEN_INDEX] = [0. 0. 0. ... 0. 0. 0.]


In [36]:
# Reset the graph to ensure that it is ready for training
tf.reset_default_graph()


#                                      batch_size, sequence_length
input_data = tf.placeholder(tf.int32, [None,       None], name='input')
targets = tf.placeholder(tf.int32,    [None,       None], name='targets')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

#Determines whether we use a normal loss function or an affective loss function
train_affect = tf.placeholder(tf.bool, shape=(), name="train_affect")



#                                          batch_size
source_lengths = tf.placeholder(tf.int32, [None], name="source_lengths")
target_lengths = tf.placeholder(tf.int32, [None], name="target_lengths")
# The batch size is read from the fed data at run time, so batches of any size can be fed.
batch_size = tf.shape(input_data)[0]

# Frozen (non-trainable) embedding table shared by the encoder and the decoder.
full_embeddings = tf.Variable(wordVecsWithMeta,trainable=False,name="Weight")
enc_embed_input = tf.nn.embedding_lookup(full_embeddings, input_data)
dec_embed_input = tf.nn.embedding_lookup(full_embeddings, process_decoding_input(targets, batch_size))

# Projects decoder outputs onto the vocabulary (including the meta token).
# NOTE(review): a ReLU activation on this layer clamps the values later used as logits to be
# non-negative - confirm that this is intended.
output_layer = tf.layers.Dense(vocab_size_with_meta,bias_initializer=tf.zeros_initializer(),activation=tf.nn.relu)


# Create the training and inference logits
train_logits, infer_ids = \
seq2seq_model(enc_embed_input, dec_embed_input, full_embeddings,
        source_lengths, target_lengths, batch_size, 
        attention_size, rnn_size, num_layers, output_layer,
        keep_prob)


# Losses, the validation metric and the training op
with tf.name_scope("optimization"):

    # 1.0 for real target positions, 0.0 for padding.
    mask = tf.sequence_mask(target_lengths, dtype=tf.float32)
    xent = loss_functions.cross_entropy(train_logits, targets, mask)
    # NOTE(review): the fourth positional parameter of tf.contrib.seq2seq.sequence_loss is
    # `average_across_timesteps`, so `metrics.perplexity` is not used as the loss function here -
    # confirm whether `softmax_loss_function=metrics.perplexity` was intended.
    perplexity = tf.contrib.seq2seq.sequence_loss(train_logits, targets, mask, metrics.perplexity)

    # The affect values occupy the last three embedding dimensions.  Both slices are taken
    # relative to the end of the embedding, so they stay correct for embedding files of any width
    # (a fixed 1024:1027 slice is empty for e.g. the 303-dimensional tables).
    vad_values = full_embeddings[:, -3:]
    input_vad_values = enc_embed_input[:, :, -3:]
    lambda_param_max_affective_content = 0.5
    lambda_param_min_affective_dissonance=0.5
    lambda_param_max_affective_dissonance = 0.4
    # Same neutral affect values as assigned to out-of-lexicon words when the table was built.
    neutral_vector = tf.constant([5.0, 1.0, 5.0], dtype=tf.float32)
    max_affective_content = loss_functions.max_affective_content(lambda_param_max_affective_content,train_logits, targets,
                                                                 vad_values,neutral_vector, mask)
    min_affective_dissonance = loss_functions.min_affective_dissonance(lambda_param_min_affective_dissonance,train_logits, targets,
                                                                 input_vad_values,vad_values, mask)
    max_affective_dissonance = loss_functions.max_affective_dissonance(lambda_param_max_affective_dissonance,train_logits, targets,
                                                                 input_vad_values,vad_values, mask)

    # Only `max_affective_content` is selectable as the affective training loss; the two
    # dissonance losses are built but not connected to the training op.
    train_cost = tf.cond(train_affect, true_fn=lambda: max_affective_content, false_fn=lambda: xent)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(train_cost)
    # Clip each gradient element to [-5, 5]; variables without a gradient are skipped.
    capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)


### Subroutines for Sampling Output

In [37]:
def show_response(prompt_int, prediction, answer_int = None):
    """Print the prompt, the optional target answer and the prediction, as word ids and as text.

    Occurrences of `METATOKEN_INDEX` are left out of every listing.
    """
    def without_meta(ids):
        return [i for i in ids if i != METATOKEN_INDEX]

    print("Prompt")
    print("  Word Ids: {}".format(without_meta(prompt_int)))
    print("      Text: {}".format(int_to_text(prompt_int, prompts_int_to_vocab)))

    if answer_int is not None:
        print("Target Answer")
        print("  Word Ids: {}".format(without_meta(answer_int)))
        print("      Text: {}".format(int_to_text(answer_int, answers_int_to_vocab)))

    print("\nPrediction")
    print('  Word Ids: {}'.format(without_meta(prediction)))
    print('      Text: {}'.format(int_to_text(prediction, answers_int_to_vocab)))
        
def check_response(session, prompt_int, answer_int=None):
    """
    Run inference on a single prompt and print the outcome via `show_response`.

    session - the TensorFlow session
    prompt_int - the prompt as a list of integers (word ids)
    answer_int - the actual, correct response as a list of integers (if available); only displayed
    """

    # The model expects a batch, so wrap the single prompt into a batch of one.
    two_d_prompt_int = [prompt_int]
    p_lengths = [len(prompt_int)]

    # Word ids are fed as int32 to match the `input_data` placeholder (they used to be built as float32).
    [infer_ids_output] = session.run([infer_ids], feed_dict = {input_data: np.array(two_d_prompt_int, dtype=np.int32),
                                                      source_lengths: p_lengths,
                                                      keep_prob: 1})

    show_response(prompt_int, infer_ids_output[0], answer_int)

In [38]:
def pad_sentence_batch(sentence_batch, vocab_to_int):
    """Right-pad every sentence with `METATOKEN_INDEX` up to the longest length in the batch.

    `vocab_to_int` is accepted for call compatibility but is not consulted.
    """
    target_length = max([len(sentence) for sentence in sentence_batch])
    padded_batch = []
    for sentence in sentence_batch:
        missing = target_length - len(sentence)
        padded_batch.append(sentence + [METATOKEN_INDEX] * missing)
    return padded_batch

In [39]:
def batch_data(questions, answers, batch_size):
    """Yield aligned, padded mini-batches of questions and answers.

    Only full batches are produced; a trailing remainder smaller than `batch_size` is dropped.
    Each item is `(source_lengths, target_lengths, padded_questions, padded_answers)`, all NumPy
    arrays, where the lengths are the unpadded sentence lengths.
    """
    full_batches = len(questions) // batch_size
    for batch_i in range(full_batches):
        first = batch_i * batch_size
        last = first + batch_size
        questions_batch = questions[first:last]
        answers_batch = answers[first:last]

        source_lengths = np.array([len(sentence) for sentence in questions_batch])
        target_lengths = np.array([len(sentence) for sentence in answers_batch])

        padded_questions = np.array(pad_sentence_batch(questions_batch, questions_vocab_to_int))
        padded_answers = np.array(pad_sentence_batch(answers_batch, answers_vocab_to_int))
        yield source_lengths, target_lengths, padded_questions, padded_answers

In [40]:
def parallel_shuffle(source_sequences, target_sequences):
    """Shuffle two equally long collections with one shared random permutation.

    Element k of the returned source list still corresponds to element k of the returned target
    list.  New lists are returned; the inputs are not modified.

    Raises ValueError when the two collections differ in length.
    """
    count = len(source_sequences)
    if count != len(target_sequences):
        raise ValueError("Cannot shuffle parallel sets with different numbers of sequences")

    order = np.random.permutation(count)
    shuffled_source = [source_sequences[position] for position in order]
    shuffled_target = [target_sequences[position] for position in order]
    return (shuffled_source, shuffled_target)

### Training Loop Options

In [41]:
#TRAINING
display_step = 100 # Check training loss after every 100 batches

#VALIDATION
# Check validation loss roughly every half-epoch.  Clamped to at least 1: the training loop
# evaluates `batch_i % validation_check`, which would divide by zero on a very small training set.
validation_check = max(1, ((len(train_prompts))//train_batch_size//2)-1)
#Minimum number of epochs before we start checking sample output
min_epochs_before_validation = 1

#Used to make unique directories, not to identify when a model is saved
time_string = time.strftime("%b%d_%H:%M:%S")

# exist_ok=True creates the directory if needed, without a separate existence check.
checkpoint_dir = os.path.join("checkpoints", time_string)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_best = os.path.join(checkpoint_dir, "best_model.ckpt")
checkpoint_latest = os.path.join(checkpoint_dir, "latest_model.ckpt")

log_dir = os.path.join("logging", time_string)
os.makedirs(log_dir, exist_ok=True)
train_log = os.path.join(log_dir, "train.csv")
valid_log = os.path.join(log_dir, "valid.csv")



### Logging Progress

In [42]:
def log_entries(csv_path, *fields, header = False):
    """Write parallel columns to a CSV file, one comma-separated row per index.

    csv_path - file to write to
    fields   - one sequence per column; the first column determines how many rows are written
    header   - when true the file is overwritten ("w"), otherwise the rows are appended ("a")

    Nothing is written (and the file is not opened) when the first column is empty.
    """
    row_count = len(fields[0])
    if row_count < 1:
        return

    with open(csv_path, "w" if header else "a", encoding="utf-8") as log:
        rows = []
        for row_index in range(row_count):
            rows.append(",".join(str(column[row_index]) for column in fields))
        log.write("\n".join(rows) + "\n")

In [43]:
def clear_fields(log_fields):
    """Empty every column buffer in place, so existing references to the buffers stay valid."""
    for column in log_fields:
        column.clear()

In [None]:
# Column buffers for the training log; they are flushed to disk and cleared by the training loop.
train_epoch_nos = []
train_batch_tokens = [] #Number of tokens in a batch
train_batch_losses = [] #Per-token loss for a batch
train_log_fields = [train_epoch_nos, train_batch_tokens, train_batch_losses]

# Column buffers for the validation log.
valid_epoch_nos = []
valid_check_nos = []
valid_batch_tokens = []
valid_batch_losses = []
valid_log_fields = [valid_epoch_nos, valid_check_nos, valid_batch_tokens, valid_batch_losses]

# header=True overwrites the files, so each log starts with just its header row.
log_entries(train_log, ["epoch"], ["num_tokens"], ["loss_per_token"], header=True)
log_entries(valid_log, ["epoch"], ["check"], ["num_tokens"], ["perplexity_per_token"], header=True)
print("Initialized empty training log {}, validation log {}".format(train_log, valid_log))

best_valid_loss = float("inf")
# Fed to the `train_affect` placeholder; flipped to True by the training loop once the
# cross-entropy epochs are over.
use_affect_func = False

# One Saver for the whole run: every tf.train.Saver() that is constructed adds another set of
# save/restore ops to the graph, so it is built once here and reused for all checkpoints.
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, checkpoint_latest)
    print("Initialized model parameters, wrote initial model to {}".format(checkpoint_latest))
    print("Beginning training with cross-entropy loss.")
    for epoch_i in range(1, epochs+1):
        if not use_affect_func and epoch_i > epochs_before_affective_loss:
            print("Switching from cross-entropy loss to an affective loss function")
            use_affect_func = True

        print("Shuffling training data . . .")
        (train_prompts_int, train_answers_int) = parallel_shuffle(train_prompts_int, train_answers_int)

        valid_check_no = 1

        for batch_i, (p_lengths, a_lengths, prompts_batch, answers_batch) in enumerate(
                batch_data(train_prompts_int, train_answers_int, train_batch_size)):
            train_start_time = time.time()

            #VALIDATION CHECK
            if batch_i % validation_check == 0 and epoch_i > min_epochs_before_validation:
                print("Shuffling validation data . . .")
                (valid_prompts_int, valid_answers_int) = parallel_shuffle(valid_prompts_int, valid_answers_int)

                clear_fields(valid_log_fields)

                valid_start_time = time.time()
                # The validation batches use their own names.  Reusing p_lengths / a_lengths /
                # prompts_batch / answers_batch here would overwrite the current training batch,
                # and the training step below would then be run on the last validation batch.
                for (valid_p_lengths, valid_a_lengths, valid_prompts_batch, valid_answers_batch) in \
                        batch_data(valid_prompts_int, valid_answers_int, valid_batch_size):

                    [valid_loss] = sess.run([perplexity],
                        {input_data: valid_prompts_batch, targets: valid_answers_batch,
                        source_lengths: valid_p_lengths, target_lengths: valid_a_lengths, keep_prob: 1})
                    valid_epoch_nos.append(epoch_i)
                    valid_check_nos.append(valid_check_no)
                    valid_batch_tokens.append(sum(valid_a_lengths))
                    valid_batch_losses.append(valid_loss)

                valid_check_no += 1
                duration = time.time() - valid_start_time
                # Token-weighted mean of the per-batch values.
                avg_valid_loss = sum(loss*tokens
                        for (loss, tokens) in zip(valid_batch_losses, valid_batch_tokens)) / sum(valid_batch_tokens)

                log_entries(valid_log, *(valid_log_fields))
                clear_fields(valid_log_fields)
                print("Processed validation set in {:>4.2f} seconds".format(duration))
                print("Average perplexity per token = {}".format(avg_valid_loss))
                if avg_valid_loss >= best_valid_loss:
                    print("No improvement for validation loss.")
                else:
                    best_valid_loss = avg_valid_loss
                    print("New record for validation loss!")
                    print("Saving best model to {}".format(checkpoint_best))
                    saver.save(sess, checkpoint_best)
                # Show a sample response for the last example of the last validation batch.
                check_response(sess, valid_prompts_batch[-1], valid_answers_batch[-1])

            #TRAINING
            _, loss = sess.run([train_op, train_cost],
                {input_data: prompts_batch, targets: answers_batch,
                 source_lengths: p_lengths, target_lengths: a_lengths,
                 keep_prob: keep_probability,
                 train_affect: use_affect_func})
            train_epoch_nos.append(epoch_i)
            train_batch_losses.append(loss)
            train_batch_tokens.append(sum(a_lengths))

            if batch_i % display_step == 0:
                duration = time.time() - train_start_time
                # Token-weighted mean over the batches accumulated since the last flush.
                avg_train_loss = sum(loss*tokens
                        for (loss, tokens) in zip(train_batch_losses, train_batch_tokens)) / sum(train_batch_tokens)

                log_entries(train_log, *(train_log_fields))
                clear_fields(train_log_fields)
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss-per-Token: {:>9.6f}, Seconds: {:>4.2f}'
                      .format(epoch_i, epochs, batch_i, len(train_prompts_int) // train_batch_size,
                              avg_train_loss, duration),
                         flush=True)

        print("{} epochs completed, saving model to {}".format(epoch_i, checkpoint_latest))
        saver.save(sess, checkpoint_latest)
        # Flush whatever was accumulated since the last display step.
        log_entries(train_log, *(train_log_fields))
        clear_fields(train_log_fields)



Initialized empty training log logging/Jun23_17:17:35/train.csv, validation log logging/Jun23_17:17:35/valid.csv
Initialized model parameters, wrote initial model to checkpoints/Jun23_17:17:35/latest_model.ckpt
Beginning training with cross-entropy loss.
Shuffling training data . . .
Epoch   1/50 Batch    0/2683 - Loss-per-Token:  9.397799, Seconds: 33.76


In [None]:
def question_to_seq(question, vocab_to_int, int_to_vocab):
    '''Clean a raw question and encode it as word ids, using the UNK id for unknown words.

    NOTE(review): `Corpus` is not imported or defined in the visible cells - confirm where
    `Corpus.clean_sequence` comes from.  `int_to_vocab` is accepted but not used.
    '''
    word_ids = []
    for word in Corpus.clean_sequence(question):
        word_ids.append(vocab_to_int.get(word, vocab_to_int[UNK]))
    return word_ids


In [None]:
# Use a question from the data as your input
random = np.random.choice(len(train_prompts_int))
prompt_int = train_prompts_int[random]
answer_int = train_answers_int[random]

saver = tf.train.Saver()
with tf.Session() as sess:
    # Run the model with the input question.
    # The weights come from the most recently saved checkpoint; use `checkpoint_best` instead to
    # load the weights with the lowest validation loss.  (The former `checkpoint` name was never
    # defined, and `check_response` has no `best_only` parameter.)
    saver.restore(sess, checkpoint_latest)
    check_response(sess, prompt_int, answer_int)
    
