In [1]:
from __future__ import print_function
import logging
import json
import io
#import nltk
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import re
import string
import random
import numpy as np
import tensorflow as tf
import math
import os
import sys
import time


In [2]:
# this loop is needed to reset the flags to that notebook won't throw duplicate flags error
from absl import flags
for name in list(flags.FLAGS):
  delattr(flags.FLAGS, name)

# Dictionary parameters
tf.app.flags.DEFINE_string("doc_dict_path", "doc_dict.txt", "Document Dictionary output.")
tf.app.flags.DEFINE_string("sum_dict_path", "sum_dict.txt", "Summary Dictionary output.")
tf.app.flags.DEFINE_boolean("create_dict_flag", False, "Whether to create new dictionary or not ")
tf.app.flags.DEFINE_integer("doc_vocab_size", 30000, "Document vocabulary size.")
tf.app.flags.DEFINE_integer("sum_vocab_size", 10000, "Summary vocabulary size.")
tf.app.flags.DEFINE_float("train_test_split", 0.02, "Test Split ratio")
tf.app.flags.DEFINE_boolean("pretrained_embeddings", True, "Whether to look up pre-trained embedding for not ")
tf.app.flags.DEFINE_string("embedding_path", "glove.twitter.27B.100d.txt", "Embedding path")


# needed to get rid of missing f parameter
tf.app.flags.DEFINE_string('f', '', 'kernel')

# Optimization Parameters
tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_integer("size", 400, "Size of hidden layers.")
tf.app.flags.DEFINE_integer("embsize", 100, "Size of embedding.")
tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_float("max_gradient", 1.0, "Clip gradients l2 norm to this range.")
tf.app.flags.DEFINE_integer("batch_size", 5, "Batch size in training / beam size in testing.")
tf.app.flags.DEFINE_integer("max_train", 0, "Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_integer("max_epochs", 250, "Maximum training iterations.")

# Data Directory Paramters
tf.app.flags.DEFINE_string("data_dir", "data1.json", "Data directory")
tf.app.flags.DEFINE_string("test_file", "data_sample_test.txt", "Test filename.")

# Output Data Directory Parameters
tf.app.flags.DEFINE_string("test_output", "test_output.txt", "Test output.")
tf.app.flags.DEFINE_string("train_dir", "model", "Training directory.")
tf.app.flags.DEFINE_string("tfboard", "tfboard", "Tensorboard log directory.")
tf.app.flags.DEFINE_integer("steps_per_print", 50, "Training steps between printing.")
tf.app.flags.DEFINE_integer("steps_per_validation", 1000, "Training steps between validations.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 750, "Training steps between checkpoints.")
tf.app.flags.DEFINE_boolean("load_checkpoint", False, "Flag to whether load the checkpoint or not")


# Progam Running Mode: Train or decode
tf.app.flags.DEFINE_boolean("decode", False, "Set to True for testing.")
tf.app.flags.DEFINE_boolean("geneos", True, "Do not generate EOS. ")

tf.app.flags.DEFINE_integer('seed',           3435, 'random number generator seed')
FLAGS = tf.app.flags.FLAGS

print ("DONE")

DONE


In [3]:
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    
logging.basicConfig(level=logging.INFO,format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",datefmt='%b %d %H:%M')


In [4]:
MARK_PAD = "<PAD>"
MARK_UNK = "<UNK>"
MARK_EOS = "<EOS>"
MARK_GO = "<GO>"
MARKS = [MARK_PAD, MARK_UNK, MARK_EOS, MARK_GO]
ID_PAD = 0
ID_UNK = 1
ID_EOS = 2
ID_GO = 3

In [5]:
def load_dict(dict_path, max_vocab=None):
    logging.info("Try load dict from {}.".format(dict_path))
    try:
        dict_file = open(dict_path)
        dict_data = dict_file.readlines()
        dict_file.close()
    except:
        logging.info("Load dict {dict} failed, create later.".format(dict=dict_path))
        return None

    dict_data = list(map(lambda x: x.split(), dict_data))
    if max_vocab:
        dict_data = list(filter(lambda x: int(x[0]) < max_vocab, dict_data))
    tok2id = dict(map(lambda x: (x[1], int(x[0])), dict_data))
    id2tok = dict(map(lambda x: (int(x[0]), x[1]), dict_data))
    logging.info("Load dict {} with {} words.".format(dict_path, len(tok2id)))
    return (tok2id, id2tok)


In [6]:
def create_dict(dict_path, corpus, max_vocab=None):
    logging.info("Create dict {}.".format(dict_path))
    counter = {}
    counter2 = 0
    for line in corpus:
        for word in line:
            try:
                counter[word] += 1
            except:
                counter[word] = 1

    for mark_t in MARKS:
        if mark_t in counter:
            del counter[mark_t]
            logging.warning("{} appears in corpus.".format(mark_t))

    counter = list(counter.items())
    counter.sort(key=lambda x: -x[1])
    words = list(map(lambda x: x[0], counter))
    words = [MARK_PAD, MARK_UNK, MARK_EOS, MARK_GO] + words
    if max_vocab:
        words = words[:max_vocab]

    tok2id = dict()
    id2tok = dict()
    with open(dict_path, 'w') as dict_file:
        for idx, tok in enumerate(words):
            print(idx, tok, file=dict_file)
            tok2id[tok] = idx
            id2tok[idx] = tok

    logging.info("Create dict {} with {} words.".format(dict_path, len(words)))
    return (tok2id, id2tok)

In [7]:
def corpus_map2id(data, tok2id):
    ret = []
    unk = 0
    tot = 0
    for doc in data:
        tmp = []
        for word in doc:
            tot += 1
            try:
                tmp.append(tok2id[word])
            except:
                tmp.append(ID_UNK)
                unk +=1
        ret.append(tmp)
    print ("TOTAL :", tot, " UNK :", unk)
    return ret, (tot - unk)/tot

In [8]:
def sen_map2tok(sen, id2tok):
	return list(map(lambda x: id2tok[x], sen))

In [9]:

logging.info("Load document from {}.".format(FLAGS.data_dir))
with io.open(FLAGS.data_dir, 'r', encoding='ascii', errors='ignore') as input_file:
    data = json.loads(input_file.read())
    docs = []
    summaries = []
    queries = []
    for i in range (len(data['passages'])):
        docs.append(' '.join([data['passages'][str(i)][j]['passage_text'] for j in range (len(data['passages'][str(i)]))]))
        summaries.append(''.join(data['answers'][str(i)]))
        queries.append(data['query'][str(i)])

assert 	len(docs) == len(queries)
print (len(docs))

Sep 07 00:43 <ipython-input-9-c23af9b8db75>[line:2] INFO Load document from data1.json.


15391


In [10]:
#print ("DOCS :", docs[0])
print ("QUERY :", queries[0])
print ("ANSWER :", summaries[0])

QUERY : . what is a corporation?
ANSWER : A corporation is a company or group of people authorized to act as a single entity and recognized as such in law.


In [11]:
with tf.device('/gpu:1'):
    print ("Splitting docs...")

    #print ("BEFORE SPLITTING: ", docs[0])
    now = time.time()
    docs_splitted = list(map(lambda x: word_tokenize(x), docs))
    docs_splitted = [[word.lower()for word in doc] for doc in docs_splitted]
    del docs
    print ("TIME TAKEN TO SPLIT DOCS: ", time.time()-now)
    #print ("AFTER SPLITTING: ", docs[0])
    print ("DONE")

Splitting docs...
TIME TAKEN TO SPLIT DOCS:  57.51096987724304
DONE


In [12]:
print ("Splitting queries...")
queries_splitted = list(map(lambda x: [y.lower() for y in word_tokenize(x)], queries)) 
del queries
print ("DONE")
print (len(queries_splitted))

Splitting queries...
DONE
15391


In [13]:
print ("Splitting summaries...")
summaries_splitted = list(map(lambda x: [y.lower() for y in word_tokenize(x)], summaries)) 
del summaries
print ("DONE")

Splitting summaries...
DONE


In [14]:
print ("Working on dictionary...")
if(FLAGS.create_dict_flag):
    doc_dict = create_dict(FLAGS.doc_dict_path, docs_splitted+queries_splitted, FLAGS.doc_vocab_size)
    sum_dict = create_dict(FLAGS.sum_dict_path, summaries_splitted, FLAGS.sum_vocab_size)
else:
    doc_dict = load_dict(FLAGS.doc_dict_path, FLAGS.doc_vocab_size)
    sum_dict = load_dict(FLAGS.sum_dict_path, FLAGS.sum_vocab_size)

print ("DONE")
print (len(doc_dict))
print (len(sum_dict))
#print (sum_dict[0])

Sep 07 00:44 <ipython-input-5-680d20c1ecef>[line:2] INFO Try load dict from doc_dict.txt.
Sep 07 00:44 <ipython-input-5-680d20c1ecef>[line:16] INFO Load dict doc_dict.txt with 30000 words.
Sep 07 00:44 <ipython-input-5-680d20c1ecef>[line:2] INFO Try load dict from sum_dict.txt.
Sep 07 00:44 <ipython-input-5-680d20c1ecef>[line:16] INFO Load dict sum_dict.txt with 10000 words.


Working on dictionary...
DONE
2
2


In [15]:
print ("Converting to ids...")
docid, cover = corpus_map2id(docs_splitted, doc_dict[0])
logging.info("Doc dict covers {:.2f}% words.".format(cover * 100))

sumid, cover = corpus_map2id(summaries_splitted, sum_dict[0])
logging.info("Sum dict covers {:.2f}% words.".format(cover * 100))

queryid, cover = corpus_map2id(queries_splitted, doc_dict[0])
logging.info("Query dict covers {:.2f}% words.".format(cover * 100))

print ("DONE")

Converting to ids...


Sep 07 00:44 <ipython-input-15-6f91af374cc1>[line:3] INFO Doc dict covers 96.73% words.
Sep 07 00:44 <ipython-input-15-6f91af374cc1>[line:6] INFO Sum dict covers 93.24% words.
Sep 07 00:44 <ipython-input-15-6f91af374cc1>[line:9] INFO Query dict covers 98.35% words.


TOTAL : 9099177  UNK : 297354
TOTAL : 486727  UNK : 32914
TOTAL : 86919  UNK : 1431
DONE


In [16]:
print (len(queryid[0]))
print (queryid[0])
print (" ".join(sen_map2tok(queryid[0], doc_dict[1])))

6
[5, 37, 11, 8, 1557, 63]
. what is a corporation ?


In [17]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
print ("DONE")

Sep 07 00:44 textcleaner.py[line:37] INFO 'pattern' package not found; tag filters are not available for English
Sep 07 00:44 dictionary.py[line:195] INFO adding document #0 to Dictionary(0 unique tokens: [])
Sep 07 00:44 dictionary.py[line:202] INFO built Dictionary(12 unique tokens: ['response', 'trees', 'time', 'user', 'human']...) from 9 documents (total 29 corpus positions)


DONE


In [19]:
def get_word_vectors():
    glove_file = FLAGS.embedding_path
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)
    return word_vectors
word_vectors = get_word_vectors()
print ("DONE")

Sep 07 00:51 glove2word2vec.py[line:105] INFO converting 1193514 vectors from glove.twitter.27B.100d.txt to C:\Users\chudu\AppData\Local\Temp\word2vec_format.vec
Sep 07 00:51 utils_any2vec.py[line:170] INFO loading projection weights from C:\Users\chudu\AppData\Local\Temp\word2vec_format.vec


Loading Glove vectors...


Sep 07 00:52 utils_any2vec.py[line:232] INFO loaded (1193514, 100) matrix from C:\Users\chudu\AppData\Local\Temp\word2vec_format.vec


DONE


In [20]:
def get_init_embedding(word_vectors, word_vocab, vocab_size):
    word_vec_list = list()
    success_count = 0
    failure_count = 0
    for word, _ in word_vocab[0].items():
        try:
            word_vec = word_vectors.word_vec(word)
            success_count += 1
        except KeyError:
            word_vec = np.zeros([FLAGS.embsize], dtype=np.float32)
            failure_count += 1
        word_vec_list.append(word_vec)

    word_vec_list[2] = np.random.normal(0, 1, FLAGS.embsize)
    word_vec_list[3] = np.random.normal(0, 1, FLAGS.embsize)
    print ("SUCCES COUNT: ", success_count, " FAILURE COUNT: ", failure_count)
    return np.array(word_vec_list)
print ("DONE")

DONE


In [21]:
start_time = time.time()
if FLAGS.pretrained_embeddings:
    init_embeddings_doc = tf.constant(get_init_embedding(word_vectors, doc_dict, FLAGS.doc_vocab_size), dtype=tf.float32)
    init_embeddings_sum = tf.constant(get_init_embedding(word_vectors, sum_dict, FLAGS.sum_vocab_size), dtype=tf.float32)
print ("LOADED GLOVE VECTORS IN TIME: ", time.time() - start_time)

SUCCES COUNT:  25251  FAILURE COUNT:  4749
SUCCES COUNT:  9505  FAILURE COUNT:  495
LOADED GLOVE VECTORS IN TIME:  0.23414158821105957


In [26]:
fc_layer = tf.contrib.layers.fully_connected

In [27]:
class BiGRUModel(object):

    def __init__(self,doc_dict, sum_dict, source_vocab_size,target_vocab_size, buckets, state_size, num_layers, embedding_size, max_gradient, batch_size, learning_rate, forward_only=False, dtype=tf.float32):

        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.state_size = state_size
        self.sum_dict = sum_dict
        self.doc_dict = doc_dict

        self.encoder_query_inputs = tf.placeholder(tf.int32, shape=[self.batch_size, None])
        self.encoder_doc_inputs = tf.placeholder(tf.int32, shape=[self.batch_size, None])
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[self.batch_size, None])
        self.decoder_targets = tf.placeholder(tf.int32, shape=[self.batch_size, None])
        self.encoder_query_len = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.encoder_doc_len = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.decoder_len = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.beam_tok = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.prev_att = tf.placeholder(tf.float32, shape=[self.batch_size, state_size * 2])

        encoder_fw_cell_doc = tf.contrib.rnn.GRUCell(state_size)
        encoder_bw_cell_doc = tf.contrib.rnn.GRUCell(state_size)
        encoder_fw_cell_query = tf.contrib.rnn.GRUCell(state_size)
        encoder_bw_cell_query = tf.contrib.rnn.GRUCell(state_size)
        decoder_cell = tf.contrib.rnn.GRUCell(state_size)

        if not forward_only:
            encoder_fw_cell_doc = tf.contrib.rnn.DropoutWrapper(encoder_fw_cell_doc, output_keep_prob=0.50)
            encoder_bw_cell_doc = tf.contrib.rnn.DropoutWrapper(encoder_bw_cell_doc, output_keep_prob=0.50)
            encoder_fw_cell_query = tf.contrib.rnn.DropoutWrapper(encoder_fw_cell_query, output_keep_prob=0.50)
            encoder_bw_cell_query = tf.contrib.rnn.DropoutWrapper(encoder_bw_cell_query, output_keep_prob=0.50)
            decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, output_keep_prob=0.50)


        with tf.variable_scope("seq2seq", dtype=dtype):
            with tf.name_scope("embedding_initializer"):
                if not forward_only and FLAGS.pretrained_embeddings is not None:
                    init_embeddings_doc = tf.constant(get_init_embedding(word_vectors, doc_dict, FLAGS.doc_vocab_size), dtype=tf.float32)
                    init_embeddings_sum = tf.constant(get_init_embedding(word_vectors, sum_dict, FLAGS.sum_vocab_size), dtype=tf.float32)
                else:
                    init_embeddings_doc = tf.random_uniform([self.source_vocab_size, self.embedding_size], -1.0, 1.0)
                    init_embeddings_sum = tf.random_uniform([self.target_vocab_size, self.embedding_size], -1.0, 1.0)
                    print ("FORWARD OR NO EMBEDDING PATH")
                
            with tf.variable_scope("encoder_query"):
                encoder_query_emb = tf.get_variable("embedding_query", initializer=init_embeddings_doc)
                encoder_query_inputs_emb = tf.nn.embedding_lookup(encoder_query_emb, self.encoder_query_inputs)
                encoder_query_outputs, encoder_query_states = tf.nn.bidirectional_dynamic_rnn(encoder_fw_cell_query, encoder_bw_cell_query, encoder_query_inputs_emb, sequence_length=self.encoder_query_len, dtype=dtype)
                    
            with tf.variable_scope("encoder_doc"):
                encoder_doc_emb = tf.get_variable("embedding_doc", initializer=init_embeddings_doc)
                encoder_doc_inputs_emb = tf.nn.embedding_lookup(encoder_doc_emb, self.encoder_doc_inputs)
                encoder_doc_outputs, encoder_doc_states = tf.nn.bidirectional_dynamic_rnn(encoder_fw_cell_doc, encoder_bw_cell_doc, encoder_doc_inputs_emb, sequence_length=self.encoder_doc_len, dtype=dtype)
                    
            with tf.variable_scope("init_state_query"):
                init_state_query = fc_layer(tf.concat(encoder_query_states, 1), state_size)
                self.init_state_query = init_state_query
                self.init_state_query.set_shape([self.batch_size, state_size])
                self.att_states_query = tf.concat(encoder_query_outputs, 2)
                self.att_states_query.set_shape([self.batch_size, None, state_size*2])
                
            with tf.variable_scope("init_state_doc"):
                init_state_doc = fc_layer( tf.concat(encoder_doc_states, 1), state_size)
                self.init_state_doc = init_state_doc
                self.init_state_doc.set_shape([self.batch_size, state_size])
                self.att_states_doc = tf.concat(encoder_doc_outputs, 2)
                self.att_states_doc.set_shape([self.batch_size, None, state_size*2])
                
            with tf.variable_scope("attention_query"):
                attention_mechanism_query = tf.contrib.seq2seq.BahdanauAttention(state_size, self.att_states_query, self.encoder_query_len)
                attention_state_query = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism_query, state_size * 2)              
                
            with tf.variable_scope("attention_doc"):
                attention_mechanism_doc = tf.contrib.seq2seq.BahdanauAttention(state_size, self.att_states_doc, self.encoder_doc_len)                
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(attention_state_query, attention_mechanism_doc, state_size * 2)               

            with tf.variable_scope("decoder") as scope:

                decoder_emb = tf.get_variable("embedding", initializer= init_embeddings_sum)
                decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, target_vocab_size)
                
                if not forward_only:
                    decoder_inputs_emb = tf.nn.embedding_lookup(decoder_emb, self.decoder_inputs)
                    helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs_emb, self.decoder_len)
                    decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=helper, initial_state=decoder_cell.zero_state(dtype=tf.float32, batch_size=batch_size))
                    outputs = tf.contrib.seq2seq.dynamic_decode(decoder)

                    outputs_logits = outputs[0].rnn_output
                    self.outputs = outputs_logits
                    #print ("SHAPE OF OUTPUTS: ", tf.shape(outputs_logits, out_type=tf.int32 ))
                    #print ("SHAPE OF TARGETS: ", tf.shape(self.decoder_targets, out_type=tf.int32))
                    weights = tf.sequence_mask(self.decoder_len, dtype=tf.float32)
                    loss_t = tf.contrib.seq2seq.sequence_loss(outputs_logits, self.decoder_targets, weights, average_across_timesteps=False, average_across_batch=False)
                    self.loss = tf.reduce_sum(loss_t)/FLAGS.batch_size                    
                    predictions = tf.cast(tf.argmax(outputs_logits, axis=2), tf.int32) 
                    self.accuracy = tf.contrib.metrics.accuracy(predictions, self.decoder_targets)

                    params = tf.trainable_variables()
                    opt = tf.train.AdadeltaOptimizer(self.learning_rate, epsilon=1e-4)
                    gradients = tf.gradients(self.loss, params)
                    clipped_gradients, norm = \
                        tf.clip_by_global_norm(gradients, max_gradient)
                    self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
                    tf.summary.scalar('loss', self.loss)
                    #tf.summary.scalar('accuracy', self.accuracy)
                else:
                    self.loss = tf.constant(0)
                    self.accuracy = tf.constant(0)
                    with tf.variable_scope("proj") as scope:
                        output_fn = lambda x: fc_layer(x, target_vocab_size, scope=scope)

                    st_toks = tf.convert_to_tensor([ID_GO]*batch_size, dtype=tf.int32)

                    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_emb, st_toks, ID_EOS)

                    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, wrapper_state)

                    outputs = tf.contrib.seq2seq.dynamic_decode(decoder)

                    self.outputs = outputs[0].rnn_output

                    # single step decode for beam search
                    with tf.variable_scope("decoder_beam"):
                        beam_emb = tf.nn.embedding_lookup(decoder_emb, self.beam_tok)
                        self.beam_outputs, self.beam_nxt_state, _, _ = decoder.step(0, beam_emb, wrapper_state)
                        self.beam_logsoftmax = tf.nn.log_softmax(self.beam_outputs[0])

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        self.summary_merge = tf.summary.merge_all()
    
    
    


    
    
    
    def step(self,session,encoder_doc_inputs,encoder_query_inputs,decoder_inputs,encoder_doc_len,encoder_query_len,decoder_len,forward_only,summary_writer=None):

        # dim fit is important for sequence_mask
        # TODO better way to use sequence_mask
        if encoder_query_inputs.shape[1] != max(encoder_query_len):
            raise ValueError("encoder_query_inputs and encoder_query_len does not fit")
        if encoder_doc_inputs.shape[1] != max(encoder_doc_len):
            raise ValueError("encoder_doc_inputs and encoder_doc_len does not fit")
        if not forward_only and \
            decoder_inputs.shape[1] != max(decoder_len) + 1:
            raise ValueError("decoder_inputs and decoder_len does not fit")
        input_feed = {}
        input_feed[self.encoder_query_inputs] = encoder_query_inputs
        input_feed[self.encoder_doc_inputs] = encoder_doc_inputs
        input_feed[self.decoder_inputs] = decoder_inputs[:, :-1]
        input_feed[self.decoder_targets] = decoder_inputs[:, 1:]
        input_feed[self.encoder_query_len] = encoder_query_len
        input_feed[self.encoder_doc_len] = encoder_doc_len
        input_feed[self.decoder_len] = decoder_len
        input_feed[self.prev_att] = np.zeros([self.batch_size, 2 * self.state_size])
        #print ("FINE TILL HERE")

        if forward_only:
            output_feed = [self.loss, self.accuracy, self.outputs]
        else:
            output_feed = [self.loss, self.accuracy, self.updates]

        if summary_writer:
            output_feed += [self.summary_merge, self.global_step]

        outputs = session.run(output_feed, input_feed)

        if summary_writer:
            summary_writer.add_summary(outputs[3], outputs[4])
        return outputs[:3]

    def step_beam(self,
                  session,
                  encoder_doc_inputs, encoder_query_inputs,
                  encoder_doc_len, encoder_query_len,
                  max_len=12,
                  geneos=True):
        print ("CAME HERE ERROR")
        ret = []
        return ret



    def add_pad(self, data, fixlen):
        data = map(lambda x: x + [ID_PAD] * (fixlen - len(x)), data)
        data = list(data)
        return np.asarray(data)
    
    def batchify (self, data_set, _buckets):
        batched_data_set = []
        encoder_query_inputs, encoder_doc_inputs, decoder_inputs = [], [], []
        encoder_query_len, encoder_doc_len, decoder_len = [], [], []
        num_data = 0
        counter = 0
        for bucket_id in range (len(_buckets)):
            if(len(data_set[bucket_id])==0):
                continue
            for j in range(len(data_set[bucket_id])):
                counter += 1
                encoder_doc_input, encoder_query_input, decoder_input = data_set[bucket_id][j]
                encoder_doc_inputs.append(encoder_doc_input)
                encoder_doc_len.append(len(encoder_doc_input))            
                encoder_query_inputs.append(encoder_query_input)
                encoder_query_len.append(len(encoder_query_input))
                decoder_inputs.append(decoder_input)
                decoder_len.append(len(decoder_input))
                num_data += 1
                
                if(num_data == FLAGS.batch_size):
                    num_data = 0
                    batch_enc_doc_len = max(encoder_doc_len)
                    batch_enc_query_len = max(encoder_query_len)
                    batch_dec_len = max(decoder_len)
                    encoder_doc_inputs = self.add_pad(encoder_doc_inputs, batch_enc_doc_len)
                    encoder_query_inputs = self.add_pad(encoder_query_inputs, batch_enc_query_len)
                    decoder_inputs = self.add_pad(decoder_inputs, batch_dec_len)
                    encoder_doc_len = np.asarray(encoder_doc_len)
                    encoder_query_len = np.asarray(encoder_query_len)
                    decoder_len = np.asarray(decoder_len) - 1
                    
                    batched_data_set.append([encoder_doc_inputs, encoder_query_inputs, decoder_inputs, encoder_doc_len, encoder_query_len, decoder_len])
                    
                    encoder_query_inputs, encoder_doc_inputs, decoder_inputs = [], [], []
                    encoder_query_len, encoder_doc_len, decoder_len = [], [], []
        print ("BATCHED COUNTER: ", counter)
        print ("BATCHED LENGTH: ", len(batched_data_set))
        return batched_data_set

print ("DONE")   

DONE


In [28]:
# We use a number of buckets for sampling
_buckets = [(300,5,15), (300,10,20), (300,15,25), (300,20,40), (300,40,50), (350,5,15), (350,10,20), (350,15,25), (350,20,40), (350,40,50), (450,5,15), (450,10,30), (450,15,50), (450,20,100), (450,40,150), (550,5,15), (550,10,30), (550,15,60), (550,20,100), (550,40,150), (650,5,15), (650,10,30), (650,15,60), (650,20,100), (650,40,150), (750,5,15), (750,10,30), (750,15,60), (750,20,100), (750,40,240), (850,5,15), (850,10,30), (850,15,60), (850,20,100), (850,40,240), (1050,5,15), (1050,10,30), (1050,15,60), (1050,20,100), (1050,40,240), (1500,5,15), (1500,10,30), (1500,15,60), (1500,20,100), (1500,40,300), ]


print ("DONE")


DONE


In [29]:
def create_bucket(source, query, target):
    totalDocs = len(source)
    print ("TOTAL DOCS BEFORE BUCKETS: ", totalDocs)
    data_set = [[] for _ in _buckets]
    for s, q, t in zip(source, query, target):
        t = [ID_GO] + t + [ID_EOS]
        found = False
        for bucket_id, (s_size, q_size, t_size) in enumerate(_buckets):
            if len(s) <= s_size and len(q) <= q_size and len(t) <= t_size:
                data_set[bucket_id].append([s, q, t])
                found = True
                break
        if(found != True):
            print ("Didn't find bucket for {}, {}, {}".format(len(s), len(q), len(t)))
    return data_set


def create_model(session, doc_dict, sum_dict, forward_only):
    """Create model and initialize or load parameters in session."""
    dtype = tf.float32
    model = BiGRUModel(doc_dict, sum_dict, FLAGS.doc_vocab_size, FLAGS.sum_vocab_size, _buckets, FLAGS.size,  FLAGS.num_layers, FLAGS.embsize, FLAGS.max_gradient,
        FLAGS.batch_size,     FLAGS.learning_rate,       forward_only=forward_only,        dtype=dtype)
    print ("Loading Checkpoint: ", FLAGS.load_checkpoint)
    if (FLAGS.load_checkpoint):        
        ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)
        if ckpt:
            #ckpt = ckpt.model_checkpoint_path
            if ckpt and tf.train.checkpoint_exists(ckpt):
                logging.info("Reading model parameters from %s" % ckpt)
                model.saver.restore(session, ckpt)
            else:
                logging.error("Don't have any checkpoints to load: %s" % ckpt)
    else:
        logging.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model


print ("DONE")

DONE


In [None]:
print ("In Train")
try:
    os.makedirs(FLAGS.train_dir)
except:
    pass

logging.info("Preparing summarization data.")

from sklearn.model_selection import train_test_split
train_docid, val_docid, train_queryid, val_queryid, train_sumid, val_sumid = train_test_split(docid, queryid, sumid, test_size=FLAGS.train_test_split, shuffle=False, random_state=42)

tf.reset_default_graph()
config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)    
# please do not use the totality of the GPU memory
config.gpu_options.per_process_gpu_memory_fraction = 0.90
config.gpu_options.allow_growth = True
config.gpu_options.allocator_type = 'BFC'
with tf.Graph().as_default(), tf.Session(config=config) as sess:
    # tensorflow seed must be inside graph
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(seed=FLAGS.seed)

    # Create model.
    logging.info("Creating %d layers of %d units." %
                 (FLAGS.num_layers, FLAGS.size))
    train_writer = tf.summary.FileWriter(FLAGS.tfboard+'/train', sess.graph)
    model = create_model(sess, doc_dict, sum_dict, False)

    # Read data into buckets and compute their sizes.
    logging.info("Create buckets.")
    dev_set = create_bucket(val_docid, val_queryid, val_sumid)
    train_set = create_bucket(train_docid, train_queryid, train_sumid)

    train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]

    for (s_size, q_size, t_size), nsample in zip(_buckets, train_bucket_sizes):
        logging.info("Train set bucket ({}, {}, {}) has {} samples.".format(
            s_size, q_size, t_size, nsample))
    batched_train_set = model.batchify(train_set, _buckets)
    batched_dev_set = model.batchify(dev_set, _buckets)
    # This is the training loop.
    step_time, train_acc, train_loss = 0.0, 0.0, 0.0
    step_start_time = 0
    num_epoch = 0
    step_time = 0
    while num_epoch <= FLAGS.max_epochs:
        epoch_train_loss = 0.0 
        epoch_train_acc = 0.0
        current_train_step = 0
        epoch_start_time = time.time()

        for batch_train in batched_train_set:
            
            
            step_start_time = time.time()                
            encoder_doc_inputs, encoder_query_inputs, decoder_inputs, encoder_doc_len, encoder_query_len, decoder_len = batch_train
            
            step_train_loss, step_train_acc, _ = model.step(sess, encoder_doc_inputs, encoder_query_inputs, decoder_inputs,
                encoder_doc_len, encoder_query_len, decoder_len, False, train_writer)
            
            step_time = time.time() - step_start_time
            #print ("CURRENT STEP: ", current_train_step, " STEP TIME: ", step_time)
    
            step_train_loss =  (step_train_loss * FLAGS.batch_size)/np.sum(decoder_len)
            epoch_train_loss += step_train_loss
            epoch_train_acc += step_train_acc      

            # Once in a while, we save checkpoint.
            if current_train_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                save_time_start = time.time()
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                time_taken_to_save = time.time() - save_time_start
                print("Time taken to save checkpoint: ", time_taken_to_save)

            # Once in a while, we print statistics and run evals.
            if current_train_step % FLAGS.steps_per_print == 0:
                # Print statistics for the previous epoch.
                print ("Epoch: %d, GlobalStep: %d, step-time %.2f, Acc: %.4f, Loss: %.4f, Perpxty: %.2f" % (num_epoch, model.global_step.eval(), 
                                               step_time, 
                                               step_train_acc, 
                                               step_train_loss, 
                                               np.exp(float(step_train_loss))))
                step_time, train_acc, train_loss = 0.0, 0.0, 0.0   
            
            current_train_step += 1

        #epoch_train_loss, epoch_train_acc, current_train_step = 1., 2., 15
        epoch_eval_loss, epoch_eval_acc = 0.0, 0.0
        current_eval_step = 0
        for batch_dev in batched_dev_set:
            
            encoder_doc_inputs, encoder_query_inputs, decoder_inputs, encoder_doc_len, encoder_query_len, decoder_len = batch_dev
            step_eval_loss, step_eval_acc, _ = model.step(sess, encoder_doc_inputs,encoder_query_inputs,
                                        decoder_inputs, encoder_doc_len, encoder_query_len,
                                        decoder_len, True)
            step_eval_loss = (step_eval_loss * FLAGS.batch_size) / np.sum(decoder_len)
            epoch_eval_loss += step_eval_loss
            epoch_eval_acc += step_eval_acc
            current_eval_step += 1
                
        print("at the end of epoch:", num_epoch)
        print("Average train loss = %6.8f, Average perplexity = %6.8f" % (epoch_train_loss/current_train_step, np.exp(epoch_train_loss/current_train_step)))
        print("Average train acc = %6.8f" % (epoch_train_acc/current_train_step))
        print("validation loss = %6.8f, perplexity = %6.8f" % (epoch_eval_loss/current_eval_step, np.exp(epoch_eval_loss/current_eval_step)))
        print("Average Validation acc = %6.8f" % (epoch_eval_acc/current_eval_step))

        # Save checkpoint and zero timer and loss.
        save_time_start = time.time()
        checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        time_taken_to_save = time.time() - save_time_start
        print("Time taken to save checkpoint: ", time_taken_to_save)
        num_epoch += 1
            
    sys.stdout.flush()

print ("DONE")

Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:7] INFO Preparing summarization data.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:25] INFO Creating 1 layers of 400 units.


In Train
SUCCES COUNT:  25251  FAILURE COUNT:  4749
SUCCES COUNT:  9505  FAILURE COUNT:  495


Sep 07 00:58 <ipython-input-29-78f466946508>[line:34] INFO Created model with fresh parameters.


Loading Checkpoint:  False


Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:30] INFO Create buckets.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (300, 5, 15) has 0 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (300, 10, 20) has 1 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (300, 15, 25) has 3 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (300, 20, 40) has 2 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (300, 40, 50) has 0 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (350, 5, 15) has 0 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (350, 10, 20) has 1 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (350, 15, 25) has 13 samples.
Sep 07 00:58 <ipython-input-30-4c64aabca3a0>[line:40] INFO Train set bucket (350, 20, 

TOTAL DOCS BEFORE BUCKETS:  308
TOTAL DOCS BEFORE BUCKETS:  15083
BATCHED COUNTER:  15083
BATCHED LENGTH:  3016
BATCHED COUNTER:  308
BATCHED LENGTH:  61
Time taken to save checkpoint:  3.6710524559020996
Epoch: 0, GlobalStep: 1, step-time 3.00, Acc: 0.0000, Loss: 9.2126, Perpxty: 10023.00
Epoch: 0, GlobalStep: 51, step-time 0.73, Acc: 0.0500, Loss: 6.4540, Perpxty: 635.22
Epoch: 0, GlobalStep: 101, step-time 0.78, Acc: 0.0593, Loss: 6.3018, Perpxty: 545.55
Epoch: 0, GlobalStep: 151, step-time 0.97, Acc: 0.0407, Loss: 6.9551, Perpxty: 1048.44
Epoch: 0, GlobalStep: 201, step-time 0.94, Acc: 0.0966, Loss: 6.3832, Perpxty: 591.85
Epoch: 0, GlobalStep: 251, step-time 0.95, Acc: 0.0897, Loss: 6.4147, Perpxty: 610.76
Epoch: 0, GlobalStep: 301, step-time 0.92, Acc: 0.1478, Loss: 6.0470, Perpxty: 422.83
Epoch: 0, GlobalStep: 351, step-time 0.92, Acc: 0.1200, Loss: 6.3606, Perpxty: 578.61
Epoch: 0, GlobalStep: 401, step-time 0.95, Acc: 0.1231, Loss: 6.3291, Perpxty: 560.64
Epoch: 0, GlobalStep:

Epoch: 1, GlobalStep: 4267, step-time 1.14, Acc: 0.2000, Loss: 5.2561, Perpxty: 191.74
Epoch: 1, GlobalStep: 4317, step-time 1.11, Acc: 0.2138, Loss: 5.2227, Perpxty: 185.44
Epoch: 1, GlobalStep: 4367, step-time 1.03, Acc: 0.1600, Loss: 5.7159, Perpxty: 303.66
Epoch: 1, GlobalStep: 4417, step-time 1.12, Acc: 0.2222, Loss: 5.3042, Perpxty: 201.18
Epoch: 1, GlobalStep: 4467, step-time 1.12, Acc: 0.1714, Loss: 5.4314, Perpxty: 228.48
Time taken to save checkpoint:  3.639807939529419
Epoch: 1, GlobalStep: 4517, step-time 1.09, Acc: 0.1778, Loss: 5.2941, Perpxty: 199.16
Epoch: 1, GlobalStep: 4567, step-time 1.00, Acc: 0.1440, Loss: 5.9217, Perpxty: 373.03
Epoch: 1, GlobalStep: 4617, step-time 1.14, Acc: 0.2214, Loss: 5.1667, Perpxty: 175.34
Epoch: 1, GlobalStep: 4667, step-time 1.12, Acc: 0.1786, Loss: 5.6152, Perpxty: 274.57
Epoch: 1, GlobalStep: 4717, step-time 1.16, Acc: 0.2917, Loss: 4.4330, Perpxty: 84.18
Epoch: 1, GlobalStep: 4767, step-time 1.09, Acc: 0.1643, Loss: 5.5643, Perpxty: 2

Epoch: 2, GlobalStep: 8633, step-time 1.31, Acc: 0.1714, Loss: 4.7756, Perpxty: 118.59
Epoch: 2, GlobalStep: 8683, step-time 1.45, Acc: 0.1255, Loss: 5.7033, Perpxty: 299.84
Epoch: 2, GlobalStep: 8733, step-time 1.33, Acc: 0.1702, Loss: 4.9526, Perpxty: 141.54
Epoch: 2, GlobalStep: 8783, step-time 1.37, Acc: 0.1891, Loss: 4.8946, Perpxty: 133.56
Epoch: 2, GlobalStep: 8833, step-time 1.66, Acc: 0.1326, Loss: 5.6576, Perpxty: 286.45
Epoch: 2, GlobalStep: 8883, step-time 1.83, Acc: 0.1054, Loss: 5.5768, Perpxty: 264.23
Epoch: 2, GlobalStep: 8933, step-time 1.81, Acc: 0.1956, Loss: 4.9340, Perpxty: 138.93
Epoch: 2, GlobalStep: 8983, step-time 2.16, Acc: 0.0957, Loss: 5.6462, Perpxty: 283.22
Time taken to save checkpoint:  3.483555793762207
Epoch: 2, GlobalStep: 9033, step-time 2.52, Acc: 0.1294, Loss: 5.3115, Perpxty: 202.65
at the end of epoch: 2
Average train loss = 5.27297594, Average perplexity = 194.99539451
Average train acc = 0.17369296
validation loss = 5.54251548, perplexity = 255

Epoch: 4, GlobalStep: 12765, step-time 1.03, Acc: 0.1931, Loss: 4.8818, Perpxty: 131.87
Time taken to save checkpoint:  3.5304408073425293
Epoch: 4, GlobalStep: 12815, step-time 1.08, Acc: 0.1241, Loss: 5.1952, Perpxty: 180.40
Epoch: 4, GlobalStep: 12865, step-time 1.08, Acc: 0.1867, Loss: 5.2036, Perpxty: 181.92
Epoch: 4, GlobalStep: 12915, step-time 1.09, Acc: 0.1346, Loss: 5.5062, Perpxty: 246.22
Epoch: 4, GlobalStep: 12965, step-time 1.09, Acc: 0.1422, Loss: 5.3526, Perpxty: 211.15
Epoch: 4, GlobalStep: 13015, step-time 1.08, Acc: 0.1689, Loss: 4.6939, Perpxty: 109.28
Epoch: 4, GlobalStep: 13065, step-time 1.14, Acc: 0.1259, Loss: 5.3540, Perpxty: 211.46
Epoch: 4, GlobalStep: 13115, step-time 1.09, Acc: 0.1702, Loss: 5.2171, Perpxty: 184.40
Epoch: 4, GlobalStep: 13165, step-time 1.03, Acc: 0.1333, Loss: 5.6639, Perpxty: 288.26
Epoch: 4, GlobalStep: 13215, step-time 1.02, Acc: 0.1619, Loss: 5.1308, Perpxty: 169.15
Epoch: 4, GlobalStep: 13265, step-time 1.12, Acc: 0.1107, Loss: 5.447

Epoch: 5, GlobalStep: 17081, step-time 1.22, Acc: 0.1957, Loss: 4.9125, Perpxty: 135.97
Epoch: 5, GlobalStep: 17131, step-time 1.28, Acc: 0.1607, Loss: 5.2678, Perpxty: 193.98
Epoch: 5, GlobalStep: 17181, step-time 1.22, Acc: 0.1660, Loss: 5.1789, Perpxty: 177.49
Epoch: 5, GlobalStep: 17231, step-time 1.19, Acc: 0.2154, Loss: 4.2761, Perpxty: 71.96
Epoch: 5, GlobalStep: 17281, step-time 1.20, Acc: 0.1867, Loss: 4.6090, Perpxty: 100.39
Time taken to save checkpoint:  3.3742470741271973
Epoch: 5, GlobalStep: 17331, step-time 1.28, Acc: 0.1529, Loss: 4.9865, Perpxty: 146.43
Epoch: 5, GlobalStep: 17381, step-time 1.27, Acc: 0.1552, Loss: 4.8055, Perpxty: 122.18
Epoch: 5, GlobalStep: 17431, step-time 1.20, Acc: 0.1714, Loss: 4.8577, Perpxty: 128.72
Epoch: 5, GlobalStep: 17481, step-time 1.11, Acc: 0.1850, Loss: 4.8831, Perpxty: 132.03
Epoch: 5, GlobalStep: 17531, step-time 1.42, Acc: 0.1455, Loss: 5.2976, Perpxty: 199.85
Epoch: 5, GlobalStep: 17581, step-time 1.16, Acc: 0.1929, Loss: 4.3859

Epoch: 7, GlobalStep: 21163, step-time 0.80, Acc: 0.2286, Loss: 4.2560, Perpxty: 70.53
Epoch: 7, GlobalStep: 21213, step-time 0.77, Acc: 0.2222, Loss: 4.5603, Perpxty: 95.61
Epoch: 7, GlobalStep: 21263, step-time 0.98, Acc: 0.1322, Loss: 5.2695, Perpxty: 194.32
Epoch: 7, GlobalStep: 21313, step-time 0.97, Acc: 0.2000, Loss: 4.5375, Perpxty: 93.45
Epoch: 7, GlobalStep: 21363, step-time 1.02, Acc: 0.2276, Loss: 4.8301, Perpxty: 125.22
Epoch: 7, GlobalStep: 21413, step-time 0.94, Acc: 0.2435, Loss: 4.3118, Perpxty: 74.57
Epoch: 7, GlobalStep: 21463, step-time 0.94, Acc: 0.2160, Loss: 4.5620, Perpxty: 95.77
Epoch: 7, GlobalStep: 21513, step-time 0.97, Acc: 0.2615, Loss: 4.4984, Perpxty: 89.87
Epoch: 7, GlobalStep: 21563, step-time 0.95, Acc: 0.1750, Loss: 4.5250, Perpxty: 92.30
Epoch: 7, GlobalStep: 21613, step-time 0.92, Acc: 0.2308, Loss: 4.1828, Perpxty: 65.55
Epoch: 7, GlobalStep: 21663, step-time 0.94, Acc: 0.2160, Loss: 4.3427, Perpxty: 76.92
Epoch: 7, GlobalStep: 21713, step-time 0.

Epoch: 8, GlobalStep: 25529, step-time 1.11, Acc: 0.3111, Loss: 4.0048, Perpxty: 54.86
Epoch: 8, GlobalStep: 25579, step-time 1.09, Acc: 0.3429, Loss: 3.8213, Perpxty: 45.66
Time taken to save checkpoint:  3.5773134231567383
Epoch: 8, GlobalStep: 25629, step-time 1.17, Acc: 0.2370, Loss: 4.2778, Perpxty: 72.08
Epoch: 8, GlobalStep: 25679, step-time 1.02, Acc: 0.2640, Loss: 5.0611, Perpxty: 157.76
Epoch: 8, GlobalStep: 25729, step-time 1.09, Acc: 0.2714, Loss: 4.1295, Perpxty: 62.15
Epoch: 8, GlobalStep: 25779, step-time 1.17, Acc: 0.2571, Loss: 4.5634, Perpxty: 95.91
Epoch: 8, GlobalStep: 25829, step-time 1.11, Acc: 0.4167, Loss: 3.1524, Perpxty: 23.39
Epoch: 8, GlobalStep: 25879, step-time 1.09, Acc: 0.2286, Loss: 4.3579, Perpxty: 78.09
Epoch: 8, GlobalStep: 25929, step-time 1.09, Acc: 0.3517, Loss: 3.6345, Perpxty: 37.88
Epoch: 8, GlobalStep: 25979, step-time 1.14, Acc: 0.3040, Loss: 4.0920, Perpxty: 59.86
Epoch: 8, GlobalStep: 26029, step-time 1.19, Acc: 0.1760, Loss: 4.4122, Perpxt

Epoch: 9, GlobalStep: 29845, step-time 1.39, Acc: 0.3702, Loss: 3.2579, Perpxty: 25.99
Epoch: 9, GlobalStep: 29895, step-time 1.37, Acc: 0.3455, Loss: 3.5240, Perpxty: 33.92
Epoch: 9, GlobalStep: 29945, step-time 1.66, Acc: 0.2391, Loss: 4.4656, Perpxty: 86.97
Epoch: 9, GlobalStep: 29995, step-time 1.86, Acc: 0.2065, Loss: 4.1819, Perpxty: 65.49
Epoch: 9, GlobalStep: 30045, step-time 1.84, Acc: 0.3867, Loss: 3.2386, Perpxty: 25.50
Epoch: 9, GlobalStep: 30095, step-time 2.14, Acc: 0.2128, Loss: 4.3964, Perpxty: 81.16
Time taken to save checkpoint:  3.608557939529419
Epoch: 9, GlobalStep: 30145, step-time 2.59, Acc: 0.2353, Loss: 3.8031, Perpxty: 44.84
at the end of epoch: 9
Average train loss = 3.86635033, Average perplexity = 47.76773107
Average train acc = 0.29992347
validation loss = 4.33362349, perplexity = 76.21996903
Average Validation acc = 0.27994202
Time taken to save checkpoint:  3.8272178173065186
Time taken to save checkpoint:  3.499207019805908
Epoch: 10, GlobalStep: 30161,

Epoch: 11, GlobalStep: 33977, step-time 1.12, Acc: 0.4311, Loss: 3.2121, Perpxty: 24.83
Epoch: 11, GlobalStep: 34027, step-time 1.11, Acc: 0.2385, Loss: 3.9851, Perpxty: 53.79
Epoch: 11, GlobalStep: 34077, step-time 1.09, Acc: 0.4222, Loss: 3.2195, Perpxty: 25.01
Epoch: 11, GlobalStep: 34127, step-time 1.09, Acc: 0.2800, Loss: 3.7070, Perpxty: 40.73
Epoch: 11, GlobalStep: 34177, step-time 1.12, Acc: 0.3741, Loss: 3.3390, Perpxty: 28.19
Epoch: 11, GlobalStep: 34227, step-time 1.09, Acc: 0.3915, Loss: 3.4643, Perpxty: 31.95
Epoch: 11, GlobalStep: 34277, step-time 1.03, Acc: 0.3022, Loss: 4.0396, Perpxty: 56.80
Epoch: 11, GlobalStep: 34327, step-time 1.03, Acc: 0.3810, Loss: 3.4409, Perpxty: 31.22
Epoch: 11, GlobalStep: 34377, step-time 1.16, Acc: 0.2357, Loss: 4.1402, Perpxty: 62.82
Epoch: 11, GlobalStep: 34427, step-time 1.05, Acc: 0.4154, Loss: 2.6299, Perpxty: 13.87
Epoch: 11, GlobalStep: 34477, step-time 1.14, Acc: 0.4483, Loss: 2.7383, Perpxty: 15.46
Epoch: 11, GlobalStep: 34527, st

Epoch: 12, GlobalStep: 38293, step-time 1.28, Acc: 0.3132, Loss: 3.2191, Perpxty: 25.00
Epoch: 12, GlobalStep: 38343, step-time 1.17, Acc: 0.4923, Loss: 2.5719, Perpxty: 13.09
Epoch: 12, GlobalStep: 38393, step-time 1.23, Acc: 0.3156, Loss: 3.3035, Perpxty: 27.21
Time taken to save checkpoint:  3.5616843700408936
Epoch: 12, GlobalStep: 38443, step-time 1.20, Acc: 0.2902, Loss: 3.4728, Perpxty: 32.23
Epoch: 12, GlobalStep: 38493, step-time 1.33, Acc: 0.3966, Loss: 2.7254, Perpxty: 15.26
Epoch: 12, GlobalStep: 38543, step-time 1.19, Acc: 0.4048, Loss: 3.0009, Perpxty: 20.10
Epoch: 12, GlobalStep: 38593, step-time 1.14, Acc: 0.3700, Loss: 3.4360, Perpxty: 31.06
Epoch: 12, GlobalStep: 38643, step-time 1.34, Acc: 0.3792, Loss: 3.2616, Perpxty: 26.09
Epoch: 12, GlobalStep: 38693, step-time 1.16, Acc: 0.4071, Loss: 2.9217, Perpxty: 18.57
Epoch: 12, GlobalStep: 38743, step-time 1.28, Acc: 0.4357, Loss: 3.0217, Perpxty: 20.53
Epoch: 12, GlobalStep: 38793, step-time 1.27, Acc: 0.4643, Loss: 2.53

Epoch: 14, GlobalStep: 42375, step-time 1.04, Acc: 0.3254, Loss: 3.8566, Perpxty: 47.30
Epoch: 14, GlobalStep: 42425, step-time 1.01, Acc: 0.5379, Loss: 2.1434, Perpxty: 8.53
Epoch: 14, GlobalStep: 42475, step-time 1.00, Acc: 0.4138, Loss: 2.7585, Perpxty: 15.78
Epoch: 14, GlobalStep: 42525, step-time 1.03, Acc: 0.5130, Loss: 2.1840, Perpxty: 8.88
Epoch: 14, GlobalStep: 42575, step-time 0.95, Acc: 0.4240, Loss: 3.2414, Perpxty: 25.57
Epoch: 14, GlobalStep: 42625, step-time 1.02, Acc: 0.4462, Loss: 2.7935, Perpxty: 16.34
Epoch: 14, GlobalStep: 42675, step-time 1.05, Acc: 0.3750, Loss: 3.2268, Perpxty: 25.20
Epoch: 14, GlobalStep: 42725, step-time 1.01, Acc: 0.4769, Loss: 2.7612, Perpxty: 15.82
Epoch: 14, GlobalStep: 42775, step-time 1.00, Acc: 0.5680, Loss: 2.2763, Perpxty: 9.74
Epoch: 14, GlobalStep: 42825, step-time 1.05, Acc: 0.6545, Loss: 2.0103, Perpxty: 7.47
Epoch: 14, GlobalStep: 42875, step-time 1.03, Acc: 0.4345, Loss: 3.3000, Perpxty: 27.11
Epoch: 14, GlobalStep: 42925, step-t

In [None]:
def decode():
    print ("In Decode")
    # Load vocabularies.
    doc_dict = load_dict(FLAGS.doc_dict_path)
    sum_dict = load_dict(FLAGS.sum_dict_path)
    if doc_dict is None or sum_dict is None:
        logging.warning("Dict not found.")
    data = load_test_data(FLAGS.test_file, doc_dict)

    with tf.Session() as sess:
        # Create model and load parameters.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        FLAGS.load_checkpoint = True
        model = create_model(sess, doc_dict, sum_dict, True)
        FLAGS.batch_size = 1
        result = []
        for idx, token_ids in enumerate(data):

            # Get a 1-element batch to feed the sentence to the model.
            encoder_doc_inputs, encoder_query_inputs, decoder_inputs, encoder_doc_len, encoder_query_len, decoder_len =\
                model.get_batch({0: [(token_ids, [ID_GO, ID_EOS])]}, 0)

            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess,
                    encoder_doc_inputs, encoder_query_inputs, decoder_inputs,
                    encoder_doc_len, encoder_query_len, decoder_len, True)

                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(
                    sess, encoder_doc_inputs, encoder_query_inputs, encoder_doc_len, encoder_query_len, geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if ID_EOS in outputs:
                outputs = outputs[:outputs.index(ID_EOS)]
            gen_sum = " ".join(sen_map2tok(outputs, sum_dict[1]))
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        with open(FLAGS.test_output, "w") as f:
            for item in result:
                print(item, file=f)

#decode(val_docid, val_queryid, val_sumid)
print ("DONE")

