### RNN char language model for LANL

This version is based on simple_lm.py file without all the generic params. We are just attempting to replicate the work in this notebook.

Simple RNN not bidirctional

In [1]:
import tensorflow as tf
import numpy as np
import time
import os
import sys

tf.set_random_seed(613)
tf.reset_default_graph()

np.random.seed(613)


In [2]:
def fan_scale(initrange, activation, tensor_in):
    """
    Creates a scaling factor for weight initialization according to best practices.

    :param initrange: Scaling in addition to fan_in scale.
    :param activation: A tensorflow non-linear activation function
    :param tensor_in: Input tensor to layer of network to scale weights for.
    :return: (float) scaling factor for weight initialization.
    """
    if activation == tf.nn.relu:
        initrange *= np.sqrt(2.0/float(tensor_in.get_shape().as_list()[1]))
    else:
        initrange *= (1.0/np.sqrt(float(tensor_in.get_shape().as_list()[1])))
    return initrange


def batch_softmax_dist_loss(truth, h, dimension, scale_range=1.0):
    """
    This function paired with a tensorflow optimizer is multinomial logistic regression.
    It is designed for cotegorical predictions.

    :param truth: (tf.Tensor) A tensorflow vector tensor of integer class labels.
    :param h: (tf.Tensor) A placeholder if doing simple multinomial logistic regression, or the output of some neural network.
    :param dimension: (int) Number of classes in output distribution.
    :param scale_range: (float) For scaling the weight matrices (by default weights are initialized two 1/sqrt(fan_in)) for tanh activation and sqrt(2/fan_in) for relu activation.
    :return: (tf.Tensor, shape = [MB, Sequence_length]) Cross-entropy of true distribution vs. predicted distribution.
    """
    fan_in = h[0].get_shape().as_list()[1]
    initializer = fan_scale(scale_range, tf.tanh, h[0]) * tf.truncated_normal([fan_in, dimension],
                                                                               dtype=tf.float32,
                                                                               name='W')
    U = tf.get_variable('softmax_weights', initializer=initializer)

    hidden_tensor = tf.stack(h) # sequence_length X batch_size X final_hidden_size
    tf.add_to_collection('logit_weights', U)
    b = tf.get_variable('softmax_bias', initializer=tf.zeros([dimension]))
    ustack = tf.stack([U]*len(h)) #sequence_length X final_hidden_size X dimension
    logits = tf.matmul(hidden_tensor, ustack) + b # sequence_length X batch_size X dimension
    logits = tf.transpose(logits, perm=[1, 0, 2]) # batch_size X sequence_length X dimension
    tf.add_to_collection("true_probabilities", tf.nn.softmax(logits)) # added to store probabilities of true logline
    loss_matrix = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=truth) # batch_size X sequence_length
    return loss_matrix


def get_mask(lens, num_tokens):
    """
    For masking output of lm_rnn for jagged sequences for correct gradient update.
    Sequence length of 0 will output nan for that row of mask so don't do this.

    :param lens: Numpy vector of sequence lengths
    :param num_tokens: (int) Number of predicted tokens in sentence.
    :return: A numpy array mask MB X num_tokens
             For each row there are: lens[i] values of 1/lens[i]
                                     followed by num_tokens - lens[i] zeros
    """
    mask_template = np.repeat(np.arange(num_tokens).reshape(1, -1), lens.shape[0], axis=0)
    return (mask_template < lens.reshape([-1, 1])).astype(float) / lens.reshape([-1, 1]).astype(float)


def write_results(datadict, pointloss, outfile, batch):
    """
    Writes loss for each datapoint, along with meta-data to file.

    :param datadict: Dictionary of data names (str) keys to numpy matrix values for this mini-batch.
    :param pointloss: MB X 1 numpy array
    :param outfile: Where to write results.
    :param batch: The mini-batch number for these events.
    :return:
    """

    for line, sec, day, usr, red, loss in zip(datadict['line'].flatten().tolist(),
                                              datadict['second'].flatten().tolist(),
                                              datadict['day'].flatten().tolist(),
                                              datadict['user'].flatten().tolist(),
                                              datadict['red'].flatten().tolist(),
                                              pointloss.flatten().tolist()):
        outfile.write('%s %s %s %s %s %s %r\n' % (batch, line, sec, day, usr, red, loss))


In [3]:
def lm_rnn(x, t, token_embed, layers, seq_len=None, context_vector=None, cell=tf.contrib.rnn.BasicLSTMCell):
    """
    Token level LSTM language model that uses a sentence level context vector.

    :param x: (tensor) Input to rnn
    :param t: (tensor) Targets for language model predictions (typically next token in sequence)
    :param token_embed: (tensor) MB X ALPHABET_SIZE.
    :param layers: A list of hidden layer sizes for stacked lstm
    :param seq_len: A 1D tensor of mini-batch size for variable length sequences
    :param context_vector: (tensor) MB X 2*CONTEXT_LSTM_OUTPUT_DIM. Optional context to append to each token embedding
    :param cell: (class) A tensorflow RNNCell sub-class
    :return: (tuple) token_losses (tensor), hidden_states (list of tensors), final_hidden (tensor)
    """

    token_set_size = token_embed.get_shape().as_list()[0]
    cells = [cell(num_units) for num_units in layers]
    cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
    # mb X sentence_length X embedding_size
    x_lookup = tf.nn.embedding_lookup(token_embed, x)

    # List of mb X embedding_size tensors
    input_features = tf.unstack(x_lookup, axis=1)

    # input_features: list max_length of sentence long tensors (mb X embedding_size+context_size)
    if context_vector is not None:
        input_features = [tf.concat([embedding, context_vector], 1) for embedding in input_features]

    scope = tf.VariableScope(name= 'language_model', reuse = None)
    
    # hidden_states: sentence length long list of tensors (mb X final_layer_size)
    # cell_state: data structure that contains the cell state for each hidden layer for a mini-batch (complicated)
    # Creates a recurrent neural network specified by RNNCell cell
    hidden_states, cell_state = tf.contrib.rnn.static_rnn(cell, input_features,
                                          initial_state=None,
                                          dtype=tf.float32,
                                          sequence_length=seq_len,
                                          scope=scope)

    # batch_size X sequence_length (see tf_ops for def)
    token_losses = batch_softmax_dist_loss(t, hidden_states, token_set_size)
    final_hidden = cell_state[-1].h
    return token_losses, hidden_states, final_hidden


In [4]:
class OnlineBatcher:
    """
    Gives batches from a csv file.
    For batching data too large to fit into memory. Written for one pass on data!!!
    """

    def __init__(self, datafile, batch_size,
                 skipheader=False, delimiter=',',
                 alpha=0.5, size_check=None,
                 datastart_index=3, norm=False):
        """

        :param datafile: (str) File to read lines from.
        :param batch_size: (int) Mini-batch size.
        :param skipheader: (bool) Whether or not to skip first line of file.
        :param delimiter: (str) Delimiter of csv file.
        :param alpha: (float)  For exponential running mean and variance.
                      Lower alpha discounts older observations faster.
                      The higher the alpha, the further you take into consideration the past.
        :param size_check: (int) Expected number of fields from csv file. Used to check for data corruption.
        :param datastart_index: (int) The csv field where real valued features to be normalized begins.
                                Assumed that all features beginnning at datastart_index till end of line
                                are real valued.
        :param norm: (bool) Whether or not to normalize the real valued data features.
        """

        self.alpha = alpha
        self.f = open(datafile, 'r')
        self.batch_size = batch_size
        self.index = 0
        self.delimiter = delimiter
        self.size_check = size_check
        if skipheader:
            self.header = self.f.readline()
        self.datastart_index = datastart_index
        self.norm = norm
        self.replay = False

    def next_batch(self):
        """
        :return: (np.array) until end of datafile, each time called,
                 returns mini-batch number of lines from csv file
                 as a numpy array. Returns shorter than mini-batch
                 end of contents as a smaller than batch size array.
                 Returns None when no more data is available(one pass batcher!!).
        """
        matlist = []
        l = self.f.readline()
        if l == '':
            return None
        rowtext = np.array([float(k) for k in l.strip().split(self.delimiter)])
        if self.size_check is not None:
            while len(rowtext) != self.size_check:
                l = self.f.readline()
                if l == '':
                    return None
                rowtext = np.array([float(k) for k in l.strip().split(self.delimiter)])
        matlist.append(rowtext)
        for i in range(self.batch_size - 1):
            l = self.f.readline()
            if l == '':
                break
            rowtext = np.array([float(k) for k in l.strip().split(self.delimiter)])
            if self.size_check is not None:
                while len(rowtext) != self.size_check:
                    l = self.f.readline()
                    if l == '':
                        return None
                    rowtext = np.array([float(k) for k in l.strip().split(self.delimiter)])
            matlist.append(rowtext)
        data = np.array(matlist)
        if self.norm:
            batchmean, batchvariance = data[:,self.datastart_index:].mean(axis=0), data[:, self.datastart_index:].var(axis=0)
            if self.index == 0:
                self.mean, self.variance = batchmean, batchvariance
            else:
                self.mean = self.alpha * self.mean + (1 - self.alpha) * batchmean
                self.variance = self.alpha * self.variance + (1 - self.alpha) * batchvariance
                data[:, self.datastart_index:] = (data[:, self.datastart_index:] - self.mean)/(self.variance + 1e-10)
        self.index += self.batch_size
        return data


In [5]:
from graph_training_utils import EarlyStop

def trainday(model, is_training, datafile_name, outfile, mb, bidir, x_shape, jagged = True, skipsos = True, verbose = False):
    
    jag = int(jagged) # Whether using sequences of variable length (Input should be zero-padded to max_sequence_length.'
    skipsos = int(skipsos) # 'Whether to skip a start of sentence token.

    batch_num = 0
    data = OnlineBatcher(datafile_name, mb, delimiter=' ')
    raw_batch = data.next_batch()
    print('raw_batch.shape:', raw_batch.shape)
    current_loss = sys.float_info.max
    not_early_stop = EarlyStop()
    endx = raw_batch.shape[1] - int(not bidir)
    endt = raw_batch.shape[1] - int(bidir)
    continue_training = not_early_stop(raw_batch, current_loss)
    while continue_training:  # mat is not None and self.badcount < self.badlimit and loss != inf, nan:
        datadict = {'line': raw_batch[:, 0],
                    'second': raw_batch[:, 1],
                    'day': raw_batch[:, 2],
                    'user': raw_batch[:, 3],
                    'red': raw_batch[:, 4],
                    'x': raw_batch[:, (5+jag+skipsos):endx],
                    't': raw_batch[:, (6+jag+skipsos):endt]}
        if jagged:
            datadict['lengths'] = raw_batch[:, 5]
            datadict['mask'] = get_mask(datadict['lengths']-2*bidir-skipsos, sentence_length-2*bidir)
#             print('datadict_length:', datadict['lengths'])
#             print('datadict_mask:', datadict['mask'])
            assert np.all(datadict['lengths'] <= x_shape.as_list()[1]), 'Sequence found greater than num_tokens_predicted'
            assert np.nonzero(datadict['lengths'])[0].shape[0] == datadict['lengths'].shape[0], \
                'Sequence lengths must be greater than zero.' \
                'Found zero length sequence in datadict["lengths"]: %s' % datadict['lengths']
        eval_tensors = [avgloss, line_losses]
        _, current_loss, pointloss = model.train_step(datadict, eval_tensors,
                                                          update=is_training)
        if not is_training:
            write_results(datadict, pointloss, outfile, batch_num)
        batch_num += 1
        if verbose or (batch_num %999)==0:
            print('%s %s %s %s %s %s %r' % (raw_batch.shape[0],
                                            datadict['line'][0],
                                            datadict['second'][0],
                                            ('fixed', 'update')[is_training],
                                            datafile_name,
                                            data.index,
                                            current_loss))
        raw_batch = data.next_batch()
        continue_training = not_early_stop(raw_batch, current_loss)
        if continue_training < 0:
            exit(0)

In [6]:
from graph_training_utils import ModelRunner

sentence_length = 114 # based on the max_line_lenght when doing preprocessing of the data
token_set_size  = 96  # taken from <safekit>/features/specs/lm/lanl_char_config.json
learnrate = 0.001
debug = False
jagged = False
bidir = False


em_size = 20 # Size of embeddings for categorical features. This is the default
token_embed = tf.Variable(tf.truncated_normal([token_set_size, em_size]))  # Initial embeddings vocab X embedding size

lm_layers = [10] # A list of hidden layer sizes. this is the default

x = tf.placeholder(tf.int32, [None, sentence_length])
t = tf.placeholder(tf.int32, [None, sentence_length])

# TBD... figure out what's the ident_ran_and ran are used for...
#        cell_type = 'lstm' # Can be either "lstm", "ident_ran", or "ran"'

ph_dict = {'x': x, 't': t}

# in the original code seq_len is only set if the args.jagged is true
# jagged is defined as "Whether using sequences of variable length 
#              (Input should be zero-padded to max_sequence_length.
# I believe originally they used jagged for the char language model.
if jagged:
    seq_len = tf.placeholder(tf.int32, [None])
    ph_dict['lengths'] = seq_len
else:
    seq_len = None
    

token_losses, hidden_states, final_hidden = lm_rnn(x, t, token_embed,
                                                               lm_layers, seq_len=seq_len,
                                                               cell=tf.contrib.rnn.BasicLSTMCell)

# 
if jagged:
    ph_dict['mask'] = tf.placeholder(tf.float32, [None, sentence_length])
    token_losses *= ph_dict['mask']
    line_losses = tf.reduce_sum(token_losses, axis=1)  # batch_size X 1
else:
    line_losses = tf.reduce_mean(token_losses, axis=1)  # batch_size X 1

avgloss = tf.reduce_mean(line_losses)  # scalar

model = ModelRunner(avgloss, ph_dict, learnrate=learnrate, debug=debug,
                    decay=True,
                    decay_rate=0.99, decay_steps=20)

# training loop
start_time = time.time()

data_dir = './data/char_feats/'
results_dir = './lanl_result/'
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

outfile_name = 'simple_' + time.ctime(time.time()).replace(' ', '-')

outfile = open(results_dir + outfile_name, 'w')

num_days = 7 # that's what we have in our current testing env.
mb_size = 1024 # 512

files = [data_dir + str(i) + '.txt' for i in range(num_days)]
#print('list of files:', files)
outfile.write("batch line second day user red loss\n")

for idx, fname in enumerate(files[:-1]):
    print('processing file', fname, ' - training == True')
    trainday(model, True, fname, outfile, mb_size, bidir, x.get_shape(), jagged = jagged)
    print('processing file', fname, ' - training == False')
    trainday(model, False, files[idx + 1], outfile, mb_size, bidir, x.get_shape(), jagged = jagged)
    
outfile.close()
total_time = time.time() - start_time
print('elapsed time: %s' % total_time)


processing file ./data/char_feats/0.txt  - training == True
raw_batch.shape: (1024, 121)
1024 3848637.0 29786.0 update ./data/char_feats/0.txt 1022976 0.77021384
1024 6535995.0 40117.0 update ./data/char_feats/0.txt 2045952 0.5266479
1024 9140719.0 51089.0 update ./data/char_feats/0.txt 3068928 0.45972463
1024 11939260.0 63246.0 update ./data/char_feats/0.txt 4091904 0.42087907
1024 15724098.0 86279.0 update ./data/char_feats/0.txt 5114880 0.39120644


Done Training. End of data stream.

processing file ./data/char_feats/0.txt  - training == False
raw_batch.shape: (1024, 121)
1024 19566204.0 110070.0 fixed ./data/char_feats/1.txt 1022976 0.40691057
1024 22299357.0 121382.0 fixed ./data/char_feats/1.txt 2045952 0.42088857
1024 24919790.0 131706.0 fixed ./data/char_feats/1.txt 3068928 0.42149878
1024 27604482.0 142062.0 fixed ./data/char_feats/1.txt 4091904 0.41650963
1024 30713235.0 156981.0 fixed ./data/char_feats/1.txt 5114880 0.39003956


Done Training. End of data stream.

processing file ./data/char_feats/1.txt  - training == True
raw_batch.shape: (1024, 121)
1024 19566204.0 110070.0 update ./data/char_feats/1.txt 1022976 0.39661333
1024 22299357.0 121382.0 update ./data/char_feats/1.txt 2045952 0.40255657
1024 24919790.0 131706.0 update ./data/char_feats/1.txt 3068928 0.39973414
1024 27604482.0 142062.0 update ./data/char_feats/1.txt 4091904 0.39186388
1024 30713235.0 156981.0 update ./data/char_feats/1.txt 5114880 0.36862835


Done Training. End of data stream.

processing file ./data/char_feats/1.txt  - training == False
raw_batch.shape: (1024, 121)
1024 37034163.0 196201.0 fixed ./data/char_feats/2.txt 1022976 0.38579068
1024 39963088.0 210078.0 fixed ./data/char_feats/2.txt 2045952 0.39187747
1024 42756960.0 223183.0 fixed ./data/char_feats/2.txt 3068928 0.39263672
1024 45857463.0 239684.0 fixed ./data/char_feats/2.txt 4091904 0.38157198


Done Training. End of data stream.

processing file ./data/char_feats/2.txt  - training == True
raw_batch.shape: (1024, 121)
1024 37034163.0 196201.0 update ./data/char_feats/2.txt 1022976 0.38498858
1024 39963088.0 210078.0 update ./data/char_feats/2.txt 2045952 0.39046043
1024 42756960.0 223183.0 update ./data/char_feats/2.txt 3068928 0.39088514
1024 45857463.0 239684.0 update ./data/char_feats/2.txt 4091904 0.38063815


Done Training. End of data stream.

processing file ./data/char_feats/2.txt  - training == False
raw_batch.shape: (1024, 121)
1024 52637064.0 283730.0 fixed ./data/char_feats/3.txt 1022976 0.37970373
1024 56172011.0 306028.0 fixed ./data/char_feats/3.txt 2045952 0.37221843
1024 59672552.0 328265.0 fixed ./data/char_feats/3.txt 3068928 0.37211424


Done Training. End of data stream.

processing file ./data/char_feats/3.txt  - training == True
raw_batch.shape: (1024, 121)
1024 52637064.0 283730.0 update ./data/char_feats/3.txt 1022976 0.37957034
1024 56172011.0 306028.0 update ./data/char_feats/3.txt 2045952 0.37181833
1024 59672552.0 328265.0 update ./data/char_feats/3.txt 3068928 0.37170434


Done Training. End of data stream.

processing file ./data/char_feats/3.txt  - training == False
raw_batch.shape: (1024, 121)
1024 65886461.0 368339.0 fixed ./data/char_feats/4.txt 1022976 0.37197787
1024 69451727.0 390989.0 fixed ./data/char_feats/4.txt 2045952 0.37134576
1024 73031187.0 413857.0 fixed ./data/char_feats/4.txt 3068928 0.37355188


Done Training. End of data stream.

processing file ./data/char_feats/4.txt  - training == True
raw_batch.shape: (1024, 121)
1024 65886461.0 368339.0 update ./data/char_feats/4.txt 1022976 0.3719589
1024 69451727.0 390989.0 update ./data/char_feats/4.txt 2045952 0.37132
1024 73031187.0 413857.0 update ./data/char_feats/4.txt 3068928 0.37354654


Done Training. End of data stream.

processing file ./data/char_feats/4.txt  - training == False
raw_batch.shape: (1024, 121)
1024 79479448.0 456052.0 fixed ./data/char_feats/5.txt 1022976 0.39141363
1024 82126356.0 465753.0 fixed ./data/char_feats/5.txt 2045952 0.39211586
1024 84547732.0 474578.0 fixed ./data/char_feats/5.txt 3068928 0.3869393
1024 86974174.0 483519.0 fixed ./data/char_feats/5.txt 4091904 0.39347082
1024 89438354.0 492534.0 fixed ./data/char_feats/5.txt 5114880 0.39215887
1024 92659342.0 509168.0 fixed ./data/char_feats/5.txt 6137856 0.37553963


Done Training. End of data stream.

processing file ./data/char_feats/5.txt  - training == True
raw_batch.shape: (1024, 121)
1024 79479448.0 456052.0 update ./data/char_feats/5.txt 1022976 0.39141214
1024 82126356.0 465753.0 update ./data/char_feats/5.txt 2045952 0.39209676
1024 84547732.0 474578.0 update ./data/char_feats/5.txt 3068928 0.3869223
1024 86974174.0 483519.0 update ./data/char_feats/5.txt 4091904 0.39344972
1024 89438354.0 492534.0 update ./data/char_feats/5.txt 5114880 0.39213938
1024 92659342.0 509168.0 update ./data/char_feats/5.txt 6137856 0.3755257


Done Training. End of data stream.

processing file ./data/char_feats/5.txt  - training == False
raw_batch.shape: (1024, 121)
1024 97720633.0 537099.0 fixed ./data/char_feats/6.txt 1022976 0.3711587
elapsed time: 14213.488190889359


Done Training. End of data stream.