# Minimal setup for LSTM

Currently set with Spooky Data, but will be switched to the Kaggle deep nlp dataset.

In [1]:
# minimal
import data_tools as dt

In [2]:
# NLP Spooky Author Identification Dataset
filepath = 'data/spooky_author_identification/train.csv'
data_manager = dt.SpookyData(filepath, (0.8, 0.1, 0.1), one_hot_encode=False, output_numpy=False)
data_manager.init_dataset()
train_x, train_y = data_manager.prepare_train()

# Vocabulary
vocab = dt.Vocabulary('data/spooky_author_identification/tmp2', 20000)
vocab.build_vocabulary(train_x, train_y)

sents_vocab, rev_sents_vocab = vocab.get_sentence_vocabulary()
label_vocab, rev_label_vocab = vocab.get_label_vocabulary()

train_x_tok = vocab.data_to_token_ids(train_x, 'train')
train_y_tok = vocab.labels_to_token_ids(train_y, 'train')

train_set = list(zip(train_x_tok, train_y_tok))

Train/Valid/Test data found, loading...
Dataset prepared
Building vocabulary
  processing line 5000
  processing line 10000
  processing line 15000
Writing data/spooky_author_identification/tmp2/vocab_sentences.txt ...
Writing data/spooky_author_identification/tmp2/vocab_labels.txt ...
  tokenising line 5000
  tokenising line 10000
  tokenising line 15000
Writing data/spooky_author_identification/tmp2/train/sentences.txt ...
Writing data/spooky_author_identification/tmp2/train/ids_sentences.txt ...
Writing data/spooky_author_identification/tmp2/train/labels.txt ...
Writing data/spooky_author_identification/tmp2/train/ids_labels.txt ...


In [3]:
print('Translating token IDs back into words:\n')
vocab.translate_examples(train_set[:5])
print('\nActual form of training data:\n', train_set[0])

Translating token IDs back into words:

It occurred to me that I must be in a highly nervous state to let a few random creakings set me off speculating in this fashion but I regretted none the less that I was unarmed .
HPL


Hence there is less distinction between the several classes of its inhabitants ; and the lower orders , being neither so poor nor so despised , their manners are more refined and moral .
MWS


M . St . Eustache , the lover and intended husband of Marie , who boarded in her mother ' s house , deposes that he did not hear of the discovery of the body of his intended until the next morning , when M . Beauvais came into his chamber and told him of it .
EAP


I had sufficient leisure for these and many other reflections during my journey to Ingolstadt , which was long and fatiguing .
MWS


Monsieur Le Blanc was unable to account for her absence , and Madame Rogêt was distracted with anxiety and terror .
EAP



Actual form of training data:
 ([63, 772, 7, 26, 12, 8, 95, 

In [7]:
# Prepare Validation Set
valid_x, valid_y = data_manager.prepare_valid()

valid_x_tok = vocab.data_to_token_ids(valid_x, 'valid')
valid_y_tok = vocab.labels_to_token_ids(valid_y, 'valid')

valid_set = list(zip(valid_x_tok, valid_y_tok))

vocab.translate_examples(valid_set[:3])

Writing data/spooky_author_identification/tmp2/valid/sentences.txt ...
Writing data/spooky_author_identification/tmp2/valid/ids_sentences.txt ...
Writing data/spooky_author_identification/tmp2/valid/labels.txt ...
Writing data/spooky_author_identification/tmp2/valid/ids_labels.txt ...
From my experience I cannot doubt but that man , when lost to terrestrial consciousness , is indeed _UNK in another and uncorporeal life of far different nature from the life we know ; and of which only the slightest and most indistinct memories linger after waking .
HPL


The device is that of a single individual ; and this brings us to the fact that ' between the thicket and the river , the rails of the _UNK were found taken down , and the ground bore evident traces of some heavy burden having been dragged along it ' But would a number of men have put themselves to the superfluous trouble of taking down a fence , for the purpose of dragging through it a corpse which they might have lifted over any fence

In [4]:
import tensorflow as tf
import numpy as np

In [13]:
class RNNModel(object):
    def __init__(self, h_size, num_layers, vocab_size, n_classes, batch_size, dropout_keep=1.0, rnn_type='lstm'):

        # Input Placeholders
        self.x = x = tf.placeholder(tf.int32, [batch_size, None], name="inputs") # [batch_size, num_steps]
        self.seqlen = seqlen = tf.placeholder(tf.int32, [batch_size], name="sequence_lengths")
        self.y = y = tf.placeholder(tf.int32, [batch_size], name="classes_gt")
        keep_prob = tf.constant(dropout_keep)
        self.global_step = tf.Variable(0, trainable=False)
        
        def cell_gen():
            return tf.contrib.rnn.BasicLSTMCell(h_size, state_is_tuple=True)
        if rnn_type == 'gru':
            def cell_gen():
                return tf.contrib.rnn.GRUCell(h_size)
        
        if num_layers > 1:
            cells = []
            for _ in range(num_layers):
                cell = tf.contrib.rnn.DropoutWrapper(cell_gen(), output_keep_prob=keep_prob)
                cells.append(cell)
        
            cell = tf.contrib.rnn.MultiRNNCell(cells)
        else:
            cell = cell_gen()
        
        self.cell = cell
        
        # TODO: Prepare init state
#         # Initialise one hidden state
#         init_state = tf.get_variable('init_state', [1, h_size],
#                                  initializer=tf.constant_initializer(0.0))
#         # Tile to match batch_size
#         init_state = tf.tile(init_state, [batch_size, 1])
#         print(init_state)
        
        # Embedding layer
        embeddings = tf.get_variable('embedding_matrix', [vocab_size, h_size])
        rnn_inputs = tf.nn.embedding_lookup(embeddings, x)
        
#         rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
#                                                      initial_state=init_state)
        rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen, dtype=tf.float32)

        #idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
        #last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)        
        last_rnn_output = tf.gather_nd(rnn_outputs, tf.stack([tf.range(batch_size), seqlen-1], axis=1))

        # Softmax layer
        with tf.variable_scope('softmax'):
            W = tf.get_variable('W', [h_size, n_classes])
            b = tf.get_variable('b', [n_classes], initializer=tf.constant_initializer(0.0))
        logits = tf.matmul(last_rnn_output, W) + b
        preds = tf.nn.softmax(logits)
        correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)

        self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
        self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss, global_step=self.global_step)
        
        self.saver = tf.train.Saver(tf.global_variables())
        
        self._prepare_logs()
        
    def _prepare_logs(self):
        tf.summary.scalar('Loss', self.loss)
        tf.summary.scalar('Accuracy', self.accuracy)
        
        self.logs = tf.summary.merge_all()

def create_model(session, logdir, **parameters):
    with tf.variable_scope("model", reuse=None):
        print('\nCreating model with parameters:')
        for k,v in parameters.items():
            print('{:16s}: {}'.format(k, v))
        model_train = RNNModel(parameters['h_size'], parameters['rnn_layers'], FLAGS_in_vocab_size,
                               FLAGS_n_classes, parameters['batch_size'], dropout_keep=parameters['dropout_keep'])
        
    ckpt = tf.train.get_checkpoint_state(logdir)
    #print(ckpt.model_checkpoint_path)
    if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + '.index'):
        print("Loading model from parameters in {}.".format(ckpt.model_checkpoint_path))
        model_train.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Creating model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model_train

In [None]:
FLAGS_in_vocab_size = 20000
FLAGS_n_classes = 3
FLAGS_log_dir = 'logs/'

tf.reset_default_graph()

pars = {'h_size': 512, 'rnn_layers': 3, 'batch_size': 16, 'dropout_keep': 0.7}

with tf.Session() as sess:
    model = create_model(sess, FLAGS_log_dir, **pars)

In [17]:
from timeit import default_timer as timer

FLAGS_in_vocab_size = 20000
FLAGS_n_classes = 3
FLAGS_log_dir = 'logs/'

n_epochs = 1
batch_size = 10
n_steps_avg = 50

tf.reset_default_graph()

def valid_eval(sess, model, valid_set, batches):
    total_steps = 0
    val_accuracy = 0
    val_loss = 0
    for epoch in batches.gen_padded_batch_epochs(valid_set, 1):
        for step, (batch_x, batch_y, lengths) in enumerate(epoch):
            total_steps += 1
            feed = {model.x: batch_x, model.y: batch_y, model.seqlen: lengths}
            fetch = [model.accuracy, model.loss]
            
            val_accuracy_, val_loss_ = sess.run(fetch, feed_dict=feed)
            val_accuracy += val_accuracy_
            val_loss += val_loss_
    avg_val_accuracy = val_accuracy / total_steps
    avg_val_loss = val_loss / total_steps
    
    return avg_val_accuracy, avg_val_loss



def train_net(train_set, valid_set, n_epochs, run_name, **params):
    tf.reset_default_graph()
    with tf.Session() as sess:
        log_dir = os.path.join(FLAGS_log_dir, run_name)
        log_txt_dir = os.path.join(FLAGS_log_dir, 'txtlogs/')
        if not os.path.exists(log_txt_dir):
            os.makedirs(log_txt_dir)
        
        model = create_model(sess, log_dir, **params)
        
        batches = dt.Batches(params['batch_size'])
        
        train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
        valid_writer = tf.summary.FileWriter(log_dir + '/valid', sess.graph)
        
        quantities = ['Gstep', 'Accuracy', 'Loss', 'Time']
        train_logs = dt.Logger(*quantities)
        valid_logs = dt.Logger(*quantities)

        start_time = timer()

        for i, epoch in enumerate(batches.gen_padded_batch_epochs(train_set, n_epochs)):
            print('\nEpoch', i+1)
            accuracy = 0
            loss = 0
            for step, (batch_x, batch_y, lengths) in enumerate(epoch):

                feed = {model.x: batch_x, model.y: batch_y, model.seqlen: lengths}
                fetch = [model.accuracy, model.loss, model.logs, model.train_step]

                accuracy_, loss_, logs, _ = sess.run(fetch, feed_dict=feed)
                accuracy += accuracy_
                loss += loss_

                gstep = model.global_step.eval()
                elapsed = timer() - start_time

                train_writer.add_summary(logs, gstep)
                train_logs.log(Gstep=gstep, Accuracy=accuracy_, Loss=loss_, Time=elapsed)

                if step % n_steps_avg == 0 and step > 0:
                    avg_accuracy = accuracy/n_steps_avg
                    avg_loss = loss/n_steps_avg
                    print('Step {}, accuracy: {:7.3}, loss: {:7.3} ({} steps avg.)'.format(
                        gstep, avg_accuracy, avg_loss, n_steps_avg))
                    accuracy = 0
                    loss = 0                          

            valid_accuracy, valid_loss = valid_eval(sess, model, valid_set, batches)
            print('Global Step {}, valid accuracy: {:7.3}'.format(gstep, valid_accuracy))
            
            elapsed = timer() - start_time
            valid_logs.log(Gstep=gstep, Accuracy=valid_accuracy, Loss=valid_loss, Time=elapsed)

            summary = tf.Summary()
            summary.value.add(tag="model/Accuracy", simple_value=valid_accuracy)
            summary.value.add(tag="model/Loss", simple_value=valid_loss)
            valid_writer.add_summary(summary, gstep)
            valid_writer.flush()

            tf.logging.info('Step {} validation accuracy: {:7.3}'.format(gstep, valid_accuracy))

            checkpoint_path = os.path.join(log_dir, 'crm_lstm.ckpt')
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        print('Done Training')

        train_logs.write_csv(os.path.join(log_txt_dir, run_name + '_train.csv'))
        valid_logs.write_csv(os.path.join(log_txt_dir, run_name + '_valid.csv'))
                        


In [9]:
# Parameter sets
import itertools

class ParameterTuner(object):
    def __init__(self):
        self.h_sizes = None
        self.rnn_layers = None
        self.batch_sizes = None
        self.dropout_keep = None
    
    def n_sets(self):
        return len(self.h_sizes)*len(self.rnn_layers)*len(self.batch_sizes)*len(self.dropout_keep)
    
    def sets(self):
        parameters = [self.h_sizes, self.rnn_layers, self.batch_sizes, self.dropout_keep]
        for h, layers, batches, dropouts in itertools.product(*parameters):
            par_set = {}
            par_set['h_size'] = h
            par_set['rnn_layers'] = layers
            par_set['batch_size'] = batches
            par_set['dropout_keep'] = dropouts
            par_string = 'h{}_l{}_b{}_d{}'.format(h, layers, batches, dropouts)
            yield par_set, par_string

# h_sizes = [128, 256, 512, 1024]
# rnn_layers = [1, 2, 3, 4]
# batch_sizes = [16, 32, 64, 128]

# h_sizes = [128, 256]
# rnn_layers = [1, 2]
# batch_sizes = [16, 32]

h_sizes = [256]
rnn_layers = [1]
batch_sizes = [16]
dropout_keep = [0.7]


tuner = ParameterTuner()
tuner.h_sizes = h_sizes
tuner.rnn_layers = rnn_layers
tuner.batch_sizes = batch_sizes
tuner.dropout_keep = dropout_keep

In [18]:
import os
from data_tools import Logger
epochs = 1

tune_start = timer()
n_psets = tuner.n_sets()
for iset, (pset, pstring) in enumerate(tuner.sets()):
    tune_elapsed = timer() - tune_start
    print('\n\nRun {}/{}, {}s elapsed'.format(iset+1, n_psets, tune_elapsed))
    train_net(train_set[:1600], valid_set, epochs, pstring, **pset)



Run 1/1, 0.00013098897761665285s elapsed

Creating model with parameters:
batch_size      : 16
rnn_layers      : 1
dropout_keep    : 0.7
h_size          : 256
Loading model from parameters in logs/h256_l1_b16_d0.7/crm_lstm.ckpt-1010.
INFO:tensorflow:Restoring parameters from logs/h256_l1_b16_d0.7/crm_lstm.ckpt-1010

Epoch 1
Step 1061, accuracy:   0.631, loss:   0.886 (50 steps avg.)
Global Step 1110, valid accuracy:   0.654
INFO:tensorflow:Step 1110 validation accuracy:   0.654
Done Training


In [None]:
# Test Batches class

n_epochs = 2
for i, epoch in enumerate(batches.gen_padded_batch_epochs(train_set, n_epochs)):
    print('\n\nEpoch {}'.format(i))
    for step, (batch_x, batch_y, lengths) in enumerate(epoch):
        print('{:5d} {} {}\n\n'.format(step, batch_x[:2], batch_y[:2]))