In [26]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

from cytoolz import concatv

# NLTK for NLP utils and corpora
import nltk,pprint
from nltk import word_tokenize
nltk.download('treebank')
nltk.download('punkt')

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))


import pandas as pd
import pickle
import string
import copy

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/i812749/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /Users/i812749/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                                          1  \
0                                                                             
/1988/03/23/0129960.xml   For Health Survey, Many Offer More Excuses Tha...   
/1988/03/23/0129961.xml           McGreevey Seems Set To Exit On His Terms    
/1988/03/23/0129962.xml    The M Line and the Hemline: Miniskirt Protocols    

                                                                          2  \
0                                                                             
/1988/03/23/0129960.xml    New York City Dept of Health and Mental Hygie...   
/1988/03/23/0129961.xml    Gov James E McGreevey, whose insistence on st...   
/1988/03/23/0129962.xml    Women wearing miniskirts describe how to sit ...   

                                                                          3  \
0                                                                             
/1988/03/23/0129960.xml   New York City's ambitiou

In [157]:
def sent_tokenize_rm_punct (blob):
    bexp = ''    
    blob.replace('\n','')
    #blob.replace('</p>','  ')
    #print(type(blob))

    for i, char in (enumerate(blob)):
        #print(i,char)
        next_cap = False
        prev_lower = False
        next_space = False
        
        if char in string.punctuation :
            #print("found punctuation:", i, char, blob[i+1])
           
            if char in '.?!' :
                if i+1 >= len(blob):
                    next_cap = False
                else:
                    next_cap = blob[i+1].isupper()
                    next_space = blob[i+1].isspace()                    

                if i-1 <0:
                    prev_lower = False
                else:
                    prev_lower = blob[i-1].islower()

                if (next_cap and prev_lower) or next_space: 
                    # if the char before "." is lower case, but the one immediately follow the "." is Uppercase, then this is a paragraph end (caused by removing <p></p> from the xml file)
                    bexp = bexp + ' </s> '
                else:
                    #if this is not end of paragraph, then 
                    #bexp = bexp + char
                    pass
            else :
                bexp = bexp + ' '
        elif char == '\n':
            i += 1                    
        else :
            if char.isnumeric() : 
                bexp = bexp + 'N'
            else :
                bexp = bexp + char.lower()
    
    #return(sent_tokenize(bexp))
    return(bexp)




In [162]:

def create_lookup_tables(text):
    # make a list of unique words
    CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 , '<p>': 4}
    print("length of create_lookup_table input:", len(text))
    print("type of create_lookup_table input:", type(text))

    vocab = set(text.split())

    # (1)
    # starts with the special tokens
    vocab_to_int = copy.copy(CODES)

    # the index (v_i) will starts from 4 (the 2nd arg in enumerate() specifies the starting index)
    # since vocab_to_int already contains special tokens
    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i
        #print(v_i, v)
    # (2)
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab

In [163]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """
        1st, 2nd args: raw string text to be converted
        3rd, 4th args: lookup tables for 1st and 2nd args respectively
    
        return: A tuple of lists (source_id_text, target_id_text) converted
    """
    # empty list of converted sentences
    source_text_id = []
    target_text_id = []
    
    # make a list of sentences (extraction)
    #source_sentences = source_text.split("\n")
    #target_sentences = target_text.split("\n")
    source_sentences = source_text
    target_sentences = target_text

    
    max_source_sentence_length = max([len(sentence.split(" ")) for sentence in source_sentences])
    max_target_sentence_length = max([len(sentence.split(" ")) for sentence in target_sentences])
    
    # iterating through each sentences (# of sentences in source&target is the same)
    for i in range(len(source_sentences)):
        # extract sentences one by one
        source_sentence = source_sentences[i]
        target_sentence = target_sentences[i]
        
        # make a list of tokens/words (extraction) from the chosen sentence
        source_tokens = source_sentence.split(" ")
        target_tokens = target_sentence.split(" ")
        
        # empty list of converted words to index in the chosen sentence
        source_token_id = []
        target_token_id = []
        
        for index, token in enumerate(source_tokens):
            if (token != ""):
                source_token_id.append(source_vocab_to_int[token])
        
        for index, token in enumerate(target_tokens):
            if (token != ""):
                target_token_id.append(target_vocab_to_int[token])
                
        # put <EOS> token at the end of the chosen target sentence
        # this token suggests when to stop creating a sequence
        target_token_id.append(target_vocab_to_int['<EOS>'])
            
        # add each converted sentences in the final list
        source_text_id.append(source_token_id)
        target_text_id.append(target_token_id)
    
    return source_text_id, target_text_id

In [164]:
import pickle
def preprocess_and_save_data(source_text, target_text, text_to_ids):
    # Preprocess
    

    # create lookup tables for English and French data
    #source_text is a Pandas dataframe series, so is target_text
    #to create lookup_tables, we will join all the lines together and send as text. 
    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(''.join(list(source_text)))
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(''.join(list(target_text)))
    

    # create list of sentences whose words are represented in index
    source_text, target_text = text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int)
    
    print(type(source_text), len(source_text))
    print(source_text[10:12])
    # Save data for later use
    pickle.dump((
        (source_text, target_text),
        (source_vocab_to_int, target_vocab_to_int),
        (source_int_to_vocab, target_int_to_vocab)), open('preprocess.p', 'wb'))


def load_preprocess():
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)    

## checkpoint for procedures used to perform pre-processing

In [168]:
import pandas as pd
import pickle
import string
#parameter setting
work_dir = "working_dir/"
#read file into Pandas Dataframe.
#each line is one record, it contains "original file name(including y/m/d)","TITLE","ABSTRACT","LEAD_PARAGRAPH","FULL_TEXT"
nyt_data = pd.read_csv('nyt_dataset/first_1k_abstract_nyt_structured_data.csv', 
                       delimiter=',', index_col=0, header=None, quotechar='"', quoting=1, 
                       skipinitialspace=True, engine='c')

    
#We will use LEAD_PARAGRAPH as source for now
#And ABSTRACT as target. 
# Remove punctuation, change digits to NNN, and to lower case.     
source_text=nyt_data[3].apply(sent_tokenize_rm_punct)
target_text=nyt_data[2].apply(sent_tokenize_rm_punct)
print(type(source_text),source_text[0:5])
print(type(target_text),target_text[0:5])


<class 'pandas.core.series.Series'> 0
/2004/01/01/1547299.xml     lisa weimer  right  opened her home furnishing...
/2004/01/01/1547300.xml     tim carlander of vandeventer  amp  carlander  ...
/2004/01/01/1547301.xml       everything is very complicated    said massi...
/2004/01/01/1547302.xml     two years ago  the savannah college of art and...
/2004/01/01/1547303.xml     it s a conventional looking coffee table  but ...
Name: 3, dtype: object
<class 'pandas.core.series.Series'> 0
/2004/01/01/1547299.xml      abstract          p chairs of NNNN s and NN s...
/2004/01/01/1547300.xml      abstract          p tim carlander designs NNN...
/2004/01/01/1547301.xml      abstract          p massimiliano fuksas desig...
/2004/01/01/1547302.xml      abstract          p savannah college of art a...
/2004/01/01/1547303.xml      abstract          p julia west designs coffee...
Name: 2, dtype: object


In [187]:
print(len(nyt_data),len(source_text))
work_dir = "working_dir/"
save_path = work_dir


print(string.punctuation)

100 100
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [165]:
preprocess_and_save_data(source_text, target_text, text_to_ids)

length of create_lookup_table input: 53697
type of create_lookup_table input: <class 'str'>
length of create_lookup_table input: 28596
type of create_lookup_table input: <class 'str'>
<class 'list'> 100
[[942, 1350, 2883, 1130, 585, 2212, 1130, 1436, 758, 21, 2460, 452, 2552, 960, 371, 2146, 1594, 942, 2426, 2073, 2437, 2698, 1886, 2924, 714, 2401, 1823, 1208, 2929, 1594, 2401, 1823, 576, 147, 2702, 2702, 2702, 2841, 2212, 1286, 1714, 2883, 2865, 1072, 19, 2160, 1593, 763, 1710, 1594, 1506, 29, 1941, 2180, 1834, 260, 1794, 2147, 2929, 945, 147, 942, 2809, 2702, 2702, 2702, 2841, 2212, 1882, 97, 1559, 2370, 2212, 1594, 452, 1002, 2883, 1449, 2180, 1130, 987, 1177, 2883, 1828, 960, 251, 283, 960, 2146, 2809, 2231, 217, 1593, 793, 1179, 1130, 2795, 2881, 2212, 1705, 1871, 2883, 1130, 1821, 977, 1035, 452, 1234, 658, 2074, 2883, 2021, 155, 1135, 960, 255, 1316, 1722, 1433, 782, 2212, 1130, 607, 283, 960, 2146, 2040, 960, 1893, 1230, 1602, 1350, 2580, 1188, 2914, 579, 2883, 942, 960, 2340, 

In [172]:
def enc_dec_model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets') 
    
    target_sequence_length = tf.placeholder(tf.int32, [None], name='target_sequence_length')
    max_target_len = tf.reduce_max(target_sequence_length)    
    
    return inputs, targets, target_sequence_length, max_target_len


In [173]:
def process_decoder_input(target_data, target_vocab_to_int, batch_size):
    # get '<GO>' id
    go_id = target_vocab_to_int['<GO>']
    
    after_slice = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    after_concat = tf.concat( [tf.fill([batch_size, 1], go_id), after_slice], 1)
    
    return after_concat

In [174]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, 
                   source_vocab_size, 
                   encoding_embedding_size):
    """
    :return: tuple (RNN output, RNN state)
    """
    embed = tf.contrib.layers.embed_sequence(rnn_inputs, 
                                             vocab_size=source_vocab_size, 
                                             embed_dim=encoding_embedding_size)
    
    stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
    
    outputs, state = tf.nn.dynamic_rnn(stacked_cells, 
                                       embed, 
                                       dtype=tf.float32)
    return outputs, state

In [175]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, 
                         target_sequence_length, max_summary_length, 
                         output_layer, keep_prob):
    """
    Create a training process in decoding layer 
    :return: BasicDecoderOutput containing training logits and sample_id
    """
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, 
                                             output_keep_prob=keep_prob)
    
    # for only input layer
    helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, 
                                               target_sequence_length)
    
    decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, 
                                              helper, 
                                              encoder_state, 
                                              output_layer)

    # unrolling the decoder layer
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True, 
                                                      maximum_iterations=max_summary_length)
    return outputs

In [176]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id,
                         end_of_sequence_id, max_target_sequence_length,
                         vocab_size, output_layer, batch_size, keep_prob):
    """
    Create a inference process in decoding layer 
    :return: BasicDecoderOutput containing inference logits and sample_id
    """
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, 
                                             output_keep_prob=keep_prob)
    
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings, 
                                                      tf.fill([batch_size], start_of_sequence_id), 
                                                      end_of_sequence_id)
    
    decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, 
                                              helper, 
                                              encoder_state, 
                                              output_layer)
    
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                      impute_finished=True, 
                                                      maximum_iterations=max_target_sequence_length)
    return outputs

In [177]:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size):
    """
    Create decoding layer
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    target_vocab_size = len(target_vocab_to_int)
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
    
    with tf.variable_scope("decode"):
        output_layer = tf.layers.Dense(target_vocab_size)
        train_output = decoding_layer_train(encoder_state, 
                                            cells, 
                                            dec_embed_input, 
                                            target_sequence_length, 
                                            max_target_sequence_length, 
                                            output_layer, 
                                            keep_prob)

    with tf.variable_scope("decode", reuse=True):
        infer_output = decoding_layer_infer(encoder_state, 
                                            cells, 
                                            dec_embeddings, 
                                            target_vocab_to_int['<GO>'], 
                                            target_vocab_to_int['<EOS>'], 
                                            max_target_sequence_length, 
                                            target_vocab_size, 
                                            output_layer,
                                            batch_size,
                                            keep_prob)

    return (train_output, infer_output)

In [178]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  target_sequence_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, target_vocab_to_int):
    """
    Build the Sequence-to-Sequence model
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    enc_outputs, enc_states = encoding_layer(input_data, 
                                             rnn_size, 
                                             num_layers, 
                                             keep_prob, 
                                             source_vocab_size, 
                                             enc_embedding_size)
    
    dec_input = process_decoder_input(target_data, 
                                      target_vocab_to_int, 
                                      batch_size)
    
    train_output, infer_output = decoding_layer(dec_input,
                                               enc_states, 
                                               target_sequence_length, 
                                               max_target_sentence_length,
                                               rnn_size,
                                              num_layers,
                                              target_vocab_to_int,
                                              target_vocab_size,
                                              batch_size,
                                              keep_prob,
                                              dec_embedding_size)
    
    return train_output, infer_output

In [182]:
display_step = 300

epochs = 13
batch_size = 50

rnn_size = 100
num_layers = 3

encoding_embedding_size = 200
decoding_embedding_size = 200

learning_rate = 0.001
keep_probability = 0.5
save_path = work_dir

(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = load_preprocess()

max_target_sentence_length = max([len(sentence) for sentence in source_int_text])

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, target_sequence_length, max_target_sequence_length = enc_dec_model_inputs()
    #lr, keep_prob = hyperparam_inputs()
    

    lr = tf.placeholder(tf.float32, name='lr_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
  

    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                   targets,
                                                   keep_prob,
                                                   batch_size,
                                                   target_sequence_length,
                                                   max_target_sequence_length,
                                                   len(source_vocab_to_int),
                                                   len(target_vocab_to_int),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size,
                                                   num_layers,
                                                   target_vocab_to_int)
    
    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # https://www.tensorflow.org/api_docs/python/tf/sequence_mask
    # - Returns a mask tensor representing the first N positions of each cell.
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function - weighted softmax cross entropy
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

## Train

In [188]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    """Batch targets, sources, and the lengths of their sentences together"""
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        # Slice the right amount for the batch
        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        # Pad
        pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))

        # Need the lengths for the _lengths parameters
        pad_targets_lengths = []
        for target in pad_targets_batch:
            pad_targets_lengths.append(len(target))

        pad_source_lengths = []
        for source in pad_sources_batch:
            pad_source_lengths.append(len(source))

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths
        
        
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

# Split data to training and validation sets
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = source_int_text[:batch_size]
valid_target = target_int_text[:batch_size]
(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             source_vocab_to_int['<PAD>'],
                                                                                                             target_vocab_to_int['<PAD>']))                                                                                                  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_vocab_to_int['<PAD>'],
                            target_vocab_to_int['<PAD>'])):

            _, loss = sess.run(
                [train_op, cost],
                {input_data: source_batch,
                 targets: target_batch,
                 lr: learning_rate,
                 target_sequence_length: targets_lengths,
                 keep_prob: keep_probability})


            if batch_i % display_step == 0 and batch_i > 0:
                batch_train_logits = sess.run(
                    inference_logits,
                    {input_data: source_batch,
                     target_sequence_length: targets_lengths,
                     keep_prob: 1.0})

                batch_valid_logits = sess.run(
                    inference_logits,
                    {input_data: valid_sources_batch,
                     target_sequence_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits)
                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits)

                print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.4f}, Validation Accuracy: {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')        

Model Trained and Saved


## inference

In [191]:
def sentence_to_seq(sentence, vocab_to_int):
    results = []
    for word in sentence.split(" "):
        if word in vocab_to_int:
            results.append(vocab_to_int[word])
        else:
            results.append(vocab_to_int['<UNK>'])
            
    return results
_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = load_preprocess()

translate_sentence = 'he saw a old yellow truck .'

translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(save_path + '.meta')
    loader.restore(sess, save_path)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

    translate_logits = sess.run(logits, {input_data: [translate_sentence]*batch_size,
                                         target_sequence_length: [len(translate_sentence)*2]*batch_size,
                                         keep_prob: 1.0})[0]

print('Input')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in translate_logits]))
print('  French Words: {}'.format(" ".join([target_int_to_vocab[i] for i in translate_logits])))

INFO:tensorflow:Restoring parameters from working_dir/
Input
  Word Ids:      [766, 1644, 1594, 2310, 1046, 2, 2]
  English Words: ['he', 'saw', 'a', 'old', 'yellow', '<UNK>', '<UNK>']

Prediction
  Word Ids:      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  French Words: <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
