### RNN Language model to process user events
Bidirectional LSTM model with dropout and the RMSProp optimizer (Adam and plain SGD are kept as commented-out alternatives)

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Switch TensorFlow to eager (imperative) mode; this is done right after the
# imports, before any other TensorFlow call in the notebook.
tf.enable_eager_execution()

# Report the runtime so recorded outputs can be tied to a TensorFlow version.
print("TensorFlow version: {}".format(tf.VERSION))
print("Eager execution: {}".format(tf.executing_eagerly()))

# Directory holding the numbered event files ('0.txt', '1.txt', ...) that the
# training cell reads.
data_dir = 'data/char_feats/U12/'


TensorFlow version: 1.7.0
Eager execution: True


In [2]:
# transform character-based input into equivalent numerical versions
def encode_data(text, num_chars, max_length):
    """
        One-hot encode sentences into (input, target) arrays for
        next-character prediction.

        Every sentence is wrapped with a leading tab (start marker) and a
        trailing newline (end marker). The target sequence is the input
        sequence shifted one timestep ahead, so it never contains the start
        marker.

        Args:
            text: sequence of strings, one per sample.
            num_chars: size of the one-hot axis; a character is encoded at
                index ord(c).
            max_length: number of timesteps in the output arrays.

        Returns:
            (X, y): two float32 arrays of shape
            (len(text), max_length, num_chars).

        Wrapped sentences longer than max_length are truncated instead of
        raising IndexError. Characters whose code is >= num_chars are left as
        all-zero vectors instead of raising IndexError.
    """
    # create empty vessels for one-hot encoded input
    X = np.zeros((len(text), max_length, num_chars), dtype=np.float32)
    y = np.zeros((len(text), max_length, num_chars), dtype=np.float32)

    # loop over inputs, transform them and store the result in X / y
    for i, sentence in enumerate(text):
        sentence = '\t' + sentence + '\n'
        # keep one character past max_length so that the last input position
        # of a truncated sentence still gets its "next character" target
        for j, c in enumerate(sentence[:max_length + 1]):
            code = ord(c)
            if code >= num_chars:
                # outside the vocabulary: leave this timestep all zeros
                continue
            if j < max_length:
                X[i, j, code] = 1
            if j > 0:
                # target_data will be ahead by one timestep
                # and will not include the start character.
                y[i, j - 1, code] = 1.

    return X, y

In [3]:
def process_file(fname, max_len, num_chars=128):
    """
        Process a file by extracting its sentences and encoding them,
        producing a set of input and target data for the model.

        'fname' contains comma separated fields, one event per line:
          - the last field is the sentence to be processed,
          - the second-to-last field is parsed as an integer and only used to
            report the largest value seen (presumably the sentence length --
            TODO confirm against the data generator),
          - the third field (index 2) equal to 1 marks a "red event".

        Args:
            fname: path of the file to read.
            max_len: number of timesteps passed to encode_data.
            num_chars: size of the one-hot axis passed to encode_data.

        Returns:
            (input_data, target_data, red_events), where red_events is a list
            of (row_index, fields) tuples and row_index is the position of the
            event in the encoded arrays.

        Blank lines are skipped.
    """
    text = []
    red_events = []
    max_text_len = 0
    # read from the 'fname' argument (not from a notebook-level global) and
    # stream the file line by line
    with open(fname, 'r') as infile:
        for raw_line in infile:
            fields = raw_line.strip().split(',')
            if fields == ['']:
                continue
            # index the event by its row in 'text' so red_events stay aligned
            # with the encoded arrays even when blank lines were skipped
            row_index = len(text)
            text.append(fields[-1])
            max_text_len = max(max_text_len, int(fields[-2]))
            if int(fields[2]) == 1:
                red_events.append((row_index, fields))

    print('max_input_lenght:', max_text_len)
    input_data, target_data = encode_data(text, num_chars, max_len)

    return input_data, target_data, red_events
    

### Create a model using Keras

The TensorFlow [tf.keras](https://www.tensorflow.org/api_docs/python/tf/keras) API is the preferred way to create models and layers. This makes it easy to build models and experiment while Keras handles the complexity of connecting everything together. See the [Keras documentation](https://keras.io/) for details.

The [tf.keras.Sequential](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential) model is a linear stack of layers. Its constructor takes a list of layer instances: in this case a Masking layer, a bidirectional [LSTM](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM) layer, a Dropout layer and a [Dense](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense) output layer with `num_chars` nodes. The first layer's `input_shape` parameter describes each input sequence (any number of timesteps, with `num_chars` features per timestep) and is required.

#### Define the loss and gradient function

Both training and evaluation stages need to calculate the model's *[loss](https://developers.google.com/machine-learning/crash-course/glossary#loss)*. This measures how off a model's predictions are from the desired output. We want to minimize, or optimize, this value.

Our model will calculate its loss using the [tf.keras.losses.categorical_crossentropy](https://www.tensorflow.org/api_docs/python/tf/keras/losses/categorical_crossentropy) function which takes the model's prediction and the desired output. The returned loss value is progressively larger as the prediction gets worse.

The `grad` function uses the `loss` function and the [tfe.GradientTape](https://www.tensorflow.org/api_docs/python/tf/contrib/eager/GradientTape) to record operations that compute the *[gradients](https://developers.google.com/machine-learning/crash-course/glossary#gradient)* used to optimize our model.

In [4]:
num_chars = 128 # vocabulary size: one one-hot slot per character code 0..127

def getModel(max_len):
    """
        Build the character-level language model.

        The stack is: Masking (mask value 0.) -> bidirectional LSTM returning
        the full sequence -> Dropout(0.5) -> Dense softmax over 'num_chars'.
        'max_len' is used as the number of units of the LSTM layer.
    """
    layers = tf.keras.layers
    stack = [
        # the first layer declares the input shape: any number of timesteps,
        # 'num_chars' features per timestep
        layers.Masking(mask_value=0., input_shape=(None, num_chars)),
        layers.Bidirectional(layers.LSTM(max_len, return_sequences=True)),
        # optional intermediate layer, currently disabled:
        # layers.Dense(240, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(num_chars, activation="softmax"),
    ]
    return tf.keras.Sequential(stack)

def loss(model, x, y):
    """
        Run 'model' on inputs 'x' and return the categorical cross-entropy
        between the targets 'y' and the resulting predictions.
    """
    predictions = model(x)
    return tf.keras.losses.categorical_crossentropy(y, predictions)

def grad(model, inputs, targets):
    """
        Return the gradients of the loss on (inputs, targets) with respect to
        'model.variables', in the same order as 'model.variables'.
    """
    # the forward pass has to run inside the tape so it gets recorded
    with tfe.GradientTape() as recorder:
        current_loss = loss(model, inputs, targets)
    gradients = recorder.gradient(current_loss, model.variables)
    return gradients

#### Define the optimizer
TensorFlow has many [optimization algorithms](https://www.tensorflow.org/api_guides/python/train) available for training. This model uses the [tf.train.RMSPropOptimizer](https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer), an adaptive-learning-rate variant of the *[stochastic gradient descent](https://developers.google.com/machine-learning/crash-course/glossary#gradient_descent)* (SGD) algorithm; the plain [tf.train.GradientDescentOptimizer](https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer) and the Adam optimizer are left in the cell as commented-out alternatives. The `learning_rate` sets the step size to take for each iteration down the hill.

In [5]:
# Active optimizer: RMSProp. The plain SGD and Adam lines are alternatives
# with the same learning rate; uncomment exactly one assignment to switch.
# optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, epsilon=1e-08)
# optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

### Training the model

In [6]:
import time

num_epochs = 2
batch_size = 200
num_train_files = 27 # files 0..26 are trained on; file k+1 is scored right after training on file k

# keep results for plotting
train_loss_results = []
train_accuracy_results = [] # accuracy is not computed at the moment; kept for the disabled accuracy plot

max_len = 120 # length of sentence

model = getModel(max_len)

# Walk through the numbered files in order: train on file k, then score every
# line of file k+1 with the freshly trained model and flag the lines whose
# loss exceeds the training average / training maximum.
for file_idx in range(num_train_files):
    # ---- training on file 'file_idx' ----
    dataset_fname = data_dir+'{0}.txt'.format(file_idx)
    input_data, target_data, red_events = process_file(dataset_fname, max_len)
    print('processing:', dataset_fname, " - num events:", len(input_data), " - red events:", len(red_events))

    training_dataset = tf.data.Dataset.from_tensor_slices((input_data, target_data))
    training_dataset = training_dataset.batch(batch_size)
    # mean loss of every training batch of this file, across all epochs
    train_losses = []

    for epoch in range(num_epochs):
        epoch_loss_avg = tfe.metrics.Mean()

        startTime = time.time()
        # training using batches of 'batch_size'
        for X, y in tfe.Iterator(training_dataset):
            grads = grad(model, X, y)
            optimizer.apply_gradients(zip(grads, model.variables),
                                      global_step=tf.train.get_or_create_global_step())
            # the recorded loss is measured after the weight update
            batch_loss = loss(model, X, y)
            epoch_loss_avg(batch_loss) # batch loss
            train_losses.append(tf.reduce_mean(batch_loss))

        train_loss_results.append(epoch_loss_avg.result())

        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: 0 - in: {:.3f} sec.".format(epoch,
                                                                        epoch_loss_avg.result(),
                                                                        (time.time()-startTime)))

    avg_loss = tf.reduce_mean(train_losses)
    max_loss = tf.reduce_max(train_losses)
    print('  avg_loss:', avg_loss, ' - max_loss:', max_loss)
    # plain Python floats for the per-line comparisons below
    avg_threshold = float(avg_loss.numpy())
    max_threshold = float(max_loss.numpy())

    # ---- evaluation on the next file ----
    dataset_fname = data_dir+'{0}.txt'.format(file_idx+1)
    input_data, target_data, red_events = process_file(dataset_fname, max_len)
    print('  evaluating:', dataset_fname, " - num events:", len(input_data), " - red events:", len(red_events))

    eval_dataset = tf.data.Dataset.from_tensor_slices((input_data, target_data))
    eval_dataset = eval_dataset.batch(batch_size)

    # eval using batches of 'batch_size': one score per line, the mean of its
    # per-timestep losses; collected per batch and joined once at the end
    batch_line_losses = [tf.reduce_mean(loss(model, X, y), axis=1).numpy()
                         for X, y in tfe.Iterator(eval_dataset)]
    if batch_line_losses:
        # float64 keeps the printed scores in the same format as before
        line_losses = np.concatenate(batch_line_losses).astype(np.float64)
    else:
        line_losses = np.array([])

    eval_max_loss = max(0, line_losses.max()) if line_losses.size else 0
    possible_anomalies_avg = [(row, score) for row, score in enumerate(line_losses)
                              if score > avg_threshold]
    possible_anomalies_max = [(row, score) for row, score in enumerate(line_losses)
                              if score > max_threshold]

    # red events carry their row index, so their score is a direct lookup
    for red_row, _fields in red_events:
        if 0 <= red_row < len(line_losses):
            print('... score for red event:', line_losses[red_row])

    possible_anomalies_avg.sort(key=lambda x: x[1], reverse=True)
    possible_anomalies_max.sort(key=lambda x: x[1], reverse=True)
    print('  possible anomalies using avg:', len(possible_anomalies_avg),
          '- using max:', len(possible_anomalies_max),
          '- eval_max_loss:', eval_max_loss)
    print('    max:', possible_anomalies_max[:10])
    print('    red events:', [red_row for red_row, _fields in red_events])


max_input_lenght: 60
processing: data/char_feats/U12/0.txt  - num events: 3128  - red events: 0
Epoch 000: Loss: 1.647, Accuracy: 0 - in: 99.731 sec.
Epoch 001: Loss: 1.227, Accuracy: 0 - in: 99.179 sec.
  avg_loss: tf.Tensor(1.431376, shape=(), dtype=float32)  - max_loss: tf.Tensor(2.184606, shape=(), dtype=float32)
max_input_lenght: 60
  evaluating: data/char_feats/U12/1.txt  - num events: 8355  - red events: 0
  possible anomalies using avg: 0 - using max: 0 - eval_max_loss: 1.3660649061203003
    max: []
    red events: []
max_input_lenght: 60
processing: data/char_feats/U12/1.txt  - num events: 8355  - red events: 0
Epoch 000: Loss: 1.024, Accuracy: 0 - in: 262.666 sec.
Epoch 001: Loss: 0.806, Accuracy: 0 - in: 264.824 sec.
  avg_loss: tf.Tensor(0.91456044, shape=(), dtype=float32)  - max_loss: tf.Tensor(1.2924652, shape=(), dtype=float32)
max_input_lenght: 60
  evaluating: data/char_feats/U12/2.txt  - num events: 3537  - red events: 0
  possible anomalies using avg: 614 - using m

processing: data/char_feats/U12/11.txt  - num events: 11563  - red events: 0
Epoch 000: Loss: 0.000, Accuracy: 0 - in: 371.040 sec.
Epoch 001: Loss: 0.000, Accuracy: 0 - in: 362.730 sec.
  avg_loss: tf.Tensor(0.00021402424, shape=(), dtype=float32)  - max_loss: tf.Tensor(0.0015449403, shape=(), dtype=float32)
max_input_lenght: 60
  evaluating: data/char_feats/U12/12.txt  - num events: 11422  - red events: 4
... score for red event: 0.10726003348827362
... score for red event: 0.10455214977264404
... score for red event: 0.10455214977264404
... score for red event: 0.10455214977264404
  possible anomalies using avg: 938 - using max: 142 - eval_max_loss: 0.37292182445526123
    max: [(3795, 0.37292182445526123), (4303, 0.2704625427722931), (3, 0.1404290795326233), (29, 0.1404290795326233), (3718, 0.10726003348827362), (3742, 0.10455214977264404), (3861, 0.10455214977264404), (4306, 0.10455214977264404), (1147, 0.08515266329050064), (3837, 0.08341553807258606)]
    red events: [3718, 3742

  possible anomalies using avg: 41 - using max: 14 - eval_max_loss: 0.47223135828971863
    max: [(911, 0.47223135828971863), (1138, 0.47223135828971863), (975, 0.26923179626464844), (1321, 0.26923179626464844), (1316, 0.017774051055312157), (1315, 0.012499010190367699), (1317, 0.012499010190367699), (232, 0.007134323939681053), (7, 0.006148217711597681), (903, 0.004667764995247126)]
    red events: []
max_input_lenght: 64
processing: data/char_feats/U12/22.txt  - num events: 2168  - red events: 0
Epoch 000: Loss: 0.001, Accuracy: 0 - in: 65.220 sec.
Epoch 001: Loss: 0.000, Accuracy: 0 - in: 64.650 sec.
  avg_loss: tf.Tensor(0.0004271939, shape=(), dtype=float32)  - max_loss: tf.Tensor(0.0028752035, shape=(), dtype=float32)
max_input_lenght: 64
  evaluating: data/char_feats/U12/23.txt  - num events: 2244  - red events: 0
  possible anomalies using avg: 22 - using max: 17 - eval_max_loss: 0.27868354320526123
    max: [(1114, 0.27868354320526123), (1285, 0.27868354320526123), (1369, 0.27

In [None]:
# Two stacked panels sharing the x axis; only the loss panel is populated.
fig, axes = plt.subplots(2, sharex=True, figsize=(12, 8))
loss_ax = axes[0]

fig.suptitle('Training Metrics')

# one point per (file, epoch) pair, in training order
loss_ax.plot(train_loss_results)
loss_ax.set_ylabel("Loss", fontsize=14)

# The second panel is reserved for accuracy, which is not computed right now:
# axes[1].set_ylabel("Accuracy", fontsize=14)
# axes[1].set_xlabel("Epoch", fontsize=14)
# axes[1].plot(train_accuracy_results)

plt.show()

#### Save model to a file

In [None]:
# NOTE(review): the '.hdfs' extension looks like a typo for '.hdf5' / '.h5';
# confirm before renaming, since other tooling may expect this exact path.
model_filepath = 'models/model_lm_bidir_v1.hdfs'

# Make sure the target directory exists so the save cannot fail on a missing
# 'models/' folder after a long training run.
model_dir = os.path.dirname(model_filepath)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Persist the trained model; an existing file at the same path is replaced.
tf.keras.models.save_model(
    model,
    model_filepath,
    overwrite=True,
    include_optimizer=True
)