In [None]:
# Get 25 GB Extra RAM
#[1] * 9 ** 10
# mem = []
# while True:
#     mem.append(' ' * 10 ** 6)

In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import datetime as datetime
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from keras.models import Sequential, load_model, Model
from keras.layers import Input, Dense, LSTM, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizer_v2 import nadam as nadam_v2
import os
import math
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error

In [None]:
#!pip install tensorflow-gpu

# 0. GPU Setting and Import CSV (https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=sXnDmXR7RDr2)

A Keras LSTM layer only runs on the fast cuDNN kernel when all of the following requirements are met:

activation == tanh

recurrent_activation == sigmoid

recurrent_dropout == 0

unroll is False

use_bias is True

Inputs, if masking is used, are strictly right-padded.

Eager execution is enabled in the outermost context.

In [None]:
# Colab line magic that selects TensorFlow 2.x (only meaningful inside Google Colab)
%tensorflow_version 2.x
import tensorflow as tf
# Name of the GPU device TensorFlow can see; an empty string when no GPU is available
device_name = tf.test.gpu_device_name()
# Stop early when the expected first GPU is missing: the training cell pins the model to `device_name`
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Column roles in the event log:
#   case:concept:name -> case ID
#   concept:name      -> event
#   time:timestamp    -> timestamp of the event
# Every split is read, sorted by case and then by timestamp, and re-indexed from 0
sort_columns = ['case:concept:name', 'time:timestamp']

df_train = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2017_train_filtered.csv", parse_dates = ['time:timestamp'])
df_train = df_train.sort_values(by = sort_columns).reset_index(drop = True)

df_val = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2017_val_filtered.csv", parse_dates = ['time:timestamp'])
df_val = df_val.sort_values(by = sort_columns).reset_index(drop = True)

df_test = pd.read_csv("/content/drive/MyDrive/DBL Process Mining/Data/Split/bpi2017_test_filtered.csv", parse_dates = ['time:timestamp'])
df_test = df_test.sort_values(by = sort_columns).reset_index(drop = True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Normalization on Time Difference

In [None]:
# Scale the target 'future_time_diff' with a min-max range learned from the training split only.
# Training values end up within [0, 1]; validation and test values reuse the training range,
# so they can fall outside [0, 1] when they exceed the training minimum/maximum.
min_max_scaler = MinMaxScaler()

train_time_diff = df_train['future_time_diff'].to_numpy().reshape(-1, 1)
val_time_diff = df_val['future_time_diff'].to_numpy().reshape(-1, 1)
test_time_diff = df_test['future_time_diff'].to_numpy().reshape(-1, 1)

df_train['nor_future_time_diff'] = min_max_scaler.fit_transform(train_time_diff)
df_val['nor_future_time_diff'] = min_max_scaler.transform(val_time_diff)
df_test['nor_future_time_diff'] = min_max_scaler.transform(test_time_diff)

# 2. One-hot Encoding

In [None]:
def onehot_now(df):
    """Build the model input features for the current event of every row in ``df``.

    The four categorical event attributes are one-hot encoded and the five
    numerical columns are appended unchanged.

    The encoder is fitted on the global ``df_train`` on every call, so all
    splits share the categories (and therefore the column layout) of the
    training data. Categories unseen in training become all-zero dummy columns
    (``handle_unknown = 'ignore'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Split to encode; must contain the categorical and numerical columns
        listed below.

    Returns
    -------
    pandas.DataFrame
        Dummy columns followed by the numerical columns, indexed like ``df``.
    """
    cat_cols = ['concept:name', 'lifecycle:transition', 'EventOrigin', 'Action']
    num_cols = ['nor_time_since_last_event', 'nor_time_since_case_starts', 'nor_time_since_midnight', 'nor_time_since_week_start', 'position']
    # NOTE(review): `sparse=` and `get_feature_names()` only exist in older scikit-learn releases;
    # newer ones use `sparse_output=` / `get_feature_names_out()` -- confirm the pinned version.
    enc = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
    enc.fit(df_train[cat_cols])
    transformed = enc.transform(df[cat_cols])
    # Reuse df's index: with a fresh RangeIndex the concat below would align on index labels
    # and silently misalign (NaN-pad) rows whenever df is not indexed 0..n-1
    df_ohe = pd.DataFrame(transformed, columns = enc.get_feature_names(), index = df.index)
    # Dummy columns first, numerical columns after
    return pd.concat([df_ohe, df[num_cols]], axis = 1)

# Input features (current event) for every split
df_train_now, df_val_now, df_test_now = (onehot_now(split) for split in (df_train, df_val, df_test))

# Targets: one-hot matrix of the next event. Categories are learned from the training split only;
# a next event unseen in training is encoded as an all-zero row (handle_unknown = 'ignore')
next_event_column = ['next:concept:name']
enc = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
df_train_next = enc.fit_transform(df_train[next_event_column])
df_val_next = enc.transform(df_val[next_event_column])
df_test_next = enc.transform(df_test[next_event_column])



In [None]:
# Source: https://towardsdatascience.com/how-to-reshape-data-and-do-regression-for-time-series-using-lstm-133dad96cd00

def lstm_data_transform(x_data, y_data_1, y_data_2, num_steps):
    """Cut the feature rows into overlapping windows for an LSTM.

    Window ``k`` consists of rows ``k .. k + num_steps - 1`` of ``x_data`` and
    is paired with row ``k + num_steps`` of each target array, i.e. the row
    directly after the window. Windows whose target row would lie past the end
    of ``x_data`` are not produced.

    Parameters
    ----------
    x_data : array of features, one row per step
    y_data_1, y_data_2 : target arrays indexed by the same rows as ``x_data``
    num_steps : number of rows per window

    Returns
    -------
    (x_array, y_array_1, y_array_2) : numpy arrays; ``x_array`` stacks the
    windows as (number of windows, num_steps, features per row).
    """
    n_rows = x_data.shape[0]
    # Keep a window start only while the row right after the window still exists
    starts = [start for start in range(n_rows) if start + num_steps < n_rows]
    x_array = np.array([x_data[start: start + num_steps] for start in starts])
    y_array_1 = np.array([y_data_1[start + num_steps] for start in starts])
    y_array_2 = np.array([y_data_2[start + num_steps] for start in starts])
    return x_array, y_array_1, y_array_2

def zero_row(df, df_now, df_next, time_step):
    """Turn one split into LSTM windows, one window per original row.

    ``time_step`` all-zero rows are put in front of the features and of both
    targets before windowing, so the number of windows equals the number of
    rows in the split and the first windows consist of zeros instead of
    later data.

    NOTE(review): after padding, the window paired with target row k holds the
    ``time_step`` rows *before* row k; row k's own features are not part of
    it. Windows also run across case boundaries. Confirm both are intended.

    Parameters
    ----------
    df : pandas.DataFrame providing the 'nor_future_time_diff' target column
    df_now : pandas.DataFrame of input features (same row order as ``df``)
    df_next : 2-D array of one-hot next-event targets
    time_step : rows per window (and number of zero rows prepended)

    Returns
    -------
    (x, y_1, y_2) : windows, next-event targets, time targets
    """
    def _prepend_zero_rows(values):
        # Float zeros block with as many columns as `values`, stacked on top of it
        zeros = np.zeros((time_step, values.shape[1]), dtype = float)
        return np.concatenate((zeros, values))

    features = _prepend_zero_rows(df_now.to_numpy())
    event_targets = _prepend_zero_rows(df_next)
    time_targets = _prepend_zero_rows(df[['nor_future_time_diff']].to_numpy())
    return lstm_data_transform(features, event_targets, time_targets, time_step)

In [None]:
# Number of rows in every LSTM input window (the training cell also reuses this value as its batch size)
time_step = 40

x_train, y_train_event, y_train_time = zero_row(df = df_train, df_now = df_train_now, df_next = df_train_next, time_step = time_step)
x_val, y_val_event, y_val_time = zero_row(df = df_val, df_now = df_val_now, df_next = df_val_next, time_step = time_step)
x_test, y_test_event, y_test_time = zero_row(df = df_test, df_now = df_test_now, df_next = df_test_next, time_step = time_step)

# 4. LSTM Model

In [None]:
%%time
with tf.device(device_name):

    # NOTE(review): this Sequential instance is never used -- `model` is rebound to the functional Model built below
    model = Sequential()

    # Two-headed network: one shared LSTM layer feeds two separate LSTM branches,
    # one for the next event (classification) and one for the normalized time difference (regression).
    # None of the LSTM layers sets `activation` or `recurrent_dropout`, so the Keras defaults apply.

    # Input: windows of `time_step` rows, each row holding every column of df_train_now
    main_input = Input(shape = (time_step, df_train_now.shape[1]), name = 'main_input')
    # return_sequences = True on the shared layer so both branches receive the full sequence
    l1 = LSTM(100, implementation = 2, kernel_initializer = 'glorot_uniform', return_sequences = True, dropout = 0.2)(main_input) # the shared layer
    b1 = BatchNormalization()(l1)
    # return_sequences = False on the branches: only the output of the last step is passed to the Dense heads
    l2_1 = LSTM(100, implementation = 2, kernel_initializer = 'glorot_uniform', return_sequences = False, dropout = 0.2)(b1) # the layer specialized in activity prediction
    b2_1 = BatchNormalization()(l2_1)
    l2_2 = LSTM(100, implementation = 2, kernel_initializer = 'glorot_uniform', return_sequences = False, dropout = 0.2)(b1) # the layer specialized in time prediction
    b2_2 = BatchNormalization()(l2_2)
    # Event head: softmax with one unit per distinct 'next:concept:name' value in the training data
    # (presumably equal to the number of columns of y_train_event -- TODO confirm)
    act_output = Dense(len(df_train['next:concept:name'].unique().tolist()), activation = 'softmax', kernel_initializer = 'glorot_uniform', name = 'act_output')(b2_1)
    # Time head: a single linear unit predicting the normalized time difference
    time_output = Dense(1, kernel_initializer = 'glorot_uniform', name = 'time_output')(b2_2)

    model = Model(inputs = [main_input], outputs = [act_output, time_output])

    # Nadam optimizer; gradients are clipped element-wise to [-3, 3] (clipvalue)
    opt = nadam_v2.Nadam(learning_rate = 0.002, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08, schedule_decay = 0.004, clipvalue = 3)

    # One loss per head: categorical cross-entropy for the event, mean absolute error for the time.
    # No loss weights are given, so the monitored total loss is their plain sum.
    model.compile(loss = {'act_output':'categorical_crossentropy', 'time_output': 'mae'}, optimizer = opt)

    # Write a checkpoint (epoch number in the file name) only after epochs whose val_loss improved
    checkpoint_filepath = '/content/drive/MyDrive/Process Mining RNN/model/weights.{epoch:02d}.h5'
    model_checkpoint_callback = ModelCheckpoint(filepath = checkpoint_filepath, monitor = 'val_loss', mode = 'min', save_best_only = True)
    # Halve the learning rate after 10 epochs without val_loss improvement
    # NOTE(review): with epochs = 1 below this callback can never trigger
    lr_reducer = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 10, verbose = 0, mode = 'auto', min_delta = 0.0001, cooldown = 0, min_lr = 0)

    # Train for 1 epoch; the validation split is evaluated at the end of the epoch.
    # NOTE(review): batch_size is set to time_step (40), i.e. the batch size is tied to the window length -- confirm this is intended
    model.fit(x_train, {'act_output': y_train_event, 'time_output': y_train_time}, validation_data = (x_val, {'act_output': y_val_event, 'time_output': y_val_time}), epochs = 1, batch_size = time_step, callbacks = [model_checkpoint_callback, lr_reducer])

CPU times: user 5min 22s, sys: 34.7 s, total: 5min 57s
Wall time: 6min 13s


In [None]:
# Collect every checkpoint (*.h5, the extension used by the ModelCheckpoint file name) below the model folder
directory = "/content/drive/MyDrive/Process Mining RNN/model/"
all_file_lst = []
for root, subdirectories, files in os.walk(directory):
    for file in files:
        # Skip anything that is not a checkpoint so load_model is never handed a stray file
        if file.endswith('.h5'):
            all_file_lst.append(os.path.join(root, file))
if not all_file_lst:
    raise FileNotFoundError('No .h5 checkpoint found in {}'.format(directory))
# os.walk returns file names in arbitrary order, so "the last element" is not a defined checkpoint.
# Order by modification time (path as tie-breaker): because checkpoints are only written when
# val_loss improves, the most recently written file is the best model of the latest run.
all_file_lst.sort(key = lambda path: (os.path.getmtime(path), path))
# Load the best model trained
model = load_model(all_file_lst[-1])

In [None]:
%%time
# make predictions
train_predict_event, train_predict_time = model.predict(x_train)
val_predict_event, val_predict_time = model.predict(x_val)
test_predict_event, test_predict_time = model.predict(x_test)

CPU times: user 2min 18s, sys: 8.95 s, total: 2min 27s
Wall time: 2min 7s


# 5. Create New Columns with RNN Event and Time Prediction

In [None]:
# Translate every row's most probable class index back into its event label.
# enc.categories_[0] lists the 'next:concept:name' labels in the same order as the one-hot target
# columns, so it can be indexed directly. This avoids rebuilding the label from the generated
# feature name ('x0_<label>'), where str.replace would also strip any 'x0_' inside the label itself,
# and it does not depend on get_feature_names(), which newer scikit-learn releases no longer provide.
next_event_labels = enc.categories_[0]

def decode_events(probabilities, labels = next_event_labels):
    """Return, for every row of `probabilities`, the label of the column with the highest value."""
    best_columns = np.argmax(probabilities, axis = 1)
    return [str(label) for label in labels[best_columns]]

train_pred_event_lst = decode_events(train_predict_event)
val_pred_event_lst = decode_events(val_predict_event)
test_pred_event_lst = decode_events(test_predict_event)

df_train['RNN_next_event'] = train_pred_event_lst
df_val['RNN_next_event'] = val_pred_event_lst
df_test['RNN_next_event'] = test_pred_event_lst



In [None]:
# Undo the min-max scaling so the predicted time differences are back on the scale of 'future_time_diff'.
# NOTE(review): the *_predict_time names are rebound to the unscaled values, so running this cell
# a second time without re-running the prediction cell applies the inverse transform twice.
train_predict_time = min_max_scaler.inverse_transform(train_predict_time)
val_predict_time = min_max_scaler.inverse_transform(val_predict_time)
test_predict_time = min_max_scaler.inverse_transform(test_predict_time)

# Take the single prediction column as a plain list and store it next to the original rows.
# The lists need no padding: zero_row prepends `time_step` zero rows, so there is one window per row.
df_train['RNN_time_diff'] = train_pred_time_lst = train_predict_time[: , 0].tolist()
df_val['RNN_time_diff'] = val_pred_time_lst = val_predict_time[: , 0].tolist()
df_test['RNN_time_diff'] = test_pred_time_lst = test_predict_time[: , 0].tolist()

# 6. Accuracy and Confusion Matrix

In [None]:
# True next event of every row, per split
train_true_event, val_true_event, test_true_event = (
    split['next:concept:name'].tolist() for split in (df_train, df_val, df_test)
)

# Next event predicted by the RNN for every row, per split
train_rnn_event, val_rnn_event, test_rnn_event = (
    split['RNN_next_event'].tolist() for split in (df_train, df_val, df_test)
)

In [None]:
# Share of rows whose predicted next event equals the true next event, shown as (train, val, test)
tuple(
    accuracy_score(true_events, rnn_events)
    for true_events, rnn_events in (
        (train_true_event, train_rnn_event),
        (val_true_event, val_rnn_event),
        (test_true_event, test_rnn_event),
    )
)

(0.8067047294549193, 0.8034728682170542, 0.7463596460670243)

In [None]:
# Without `labels`, confusion_matrix only uses the labels that occur in the two lists it is given,
# so matrices of different splits can differ in size and row/column i can mean different events.
# Passing one explicit order (training categories plus every true label of any split) makes
# the train, val and test matrices directly comparable.
cm_labels = sorted(set(map(str, enc.categories_[0])) | set(train_true_event) | set(val_true_event) | set(test_true_event))
# Rows are true events, columns are predicted events
confusion_matrix(train_true_event, train_rnn_event, labels = cm_labels)

array([[ 2122,     0,     0,    10,     0,     0,     0,     0,     0,
            0,     0,     0,  2807,     0,     0,     0,     0,     0,
            0,     0,     0,  1937,     0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            1,     0,    12,     0,     1,     0,     0,    25,     0,
            0,   845,   139,    11,     0,     0,    65],
       [    0,     0,  6821,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0],
       [    0,     0,     0,  6032,     0,     0,     0,   661,     0,
          109,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    74,     0,     0,     0],
       [    0,     0,     1,     0,     0,     0,     7,     0,     0,
            0,     0,     1,     0,     1,     0,     2,     9,     0,
           23,    49,    58,    11,     0,     0,   868],


In [None]:
# Without `labels`, confusion_matrix only uses the labels that occur in the two lists it is given,
# so matrices of different splits can differ in size and row/column i can mean different events.
# Passing one explicit order (training categories plus every true label of any split) makes
# the train, val and test matrices directly comparable.
cm_labels = sorted(set(map(str, enc.categories_[0])) | set(train_true_event) | set(val_true_event) | set(test_true_event))
# Rows are true events, columns are predicted events
confusion_matrix(val_true_event, val_rnn_event, labels = cm_labels)

array([[  527,     0,     0,     3,     0,     0,     0,     0,     0,
            0,     0,     0,   716,     0,     0,     0,     0,     0,
            0,     0,     0,   474,     0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     6,     0,     1,     0,     0,     9,     0,
            0,   240,    44,     4,     0,     0,    18],
       [    0,     0,  1705,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0],
       [    0,     0,     0,  1513,     0,     0,     0,   170,     0,
           23,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    14,     0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     1,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     1,     0,
            8,    15,     4,     2,     0,     0,   205],


In [None]:
# Without `labels`, confusion_matrix only uses the labels that occur in the two lists it is given,
# so matrices of different splits can differ in size and row/column i can mean different events
# (the test split previously produced fewer classes than the training split).
# Passing one explicit order (training categories plus every true label of any split) makes
# the train, val and test matrices directly comparable.
cm_labels = sorted(set(map(str, enc.categories_[0])) | set(train_true_event) | set(val_true_event) | set(test_true_event))
# Rows are true events, columns are predicted events
confusion_matrix(test_true_event, test_rnn_event, labels = cm_labels)

array([[ 231,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  355,    0,    0,    0,    0,    0,    0,    0,    0,  165,
           0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    2,    2,    0,    0,   35,   26,    0,
           0,  185],
       [   0,    0,  747,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   1,    0,    0,  663,    0,    0,    0,   70,    2,    8,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    5,
           0,    2],
       [   0,    0,    0,    0,    0,    0,    2,    0,    1,    0,    0,
           0,    0,    0,    0,    0,    1,    0,    2,    0,    6,    2,
           0,   79],
       [   0,    0,    0,    0,    0,  625,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       

# 7. R2 and RMSE

In [None]:
# True time difference of every row, per split (original scale, not normalized)
train_true, val_true, test_true = (
    split['future_time_diff'].tolist() for split in (df_train, df_val, df_test)
)

# Time difference predicted by the RNN for every row, per split (already scaled back)
train_rnn, val_rnn, test_rnn = (
    split['RNN_time_diff'].tolist() for split in (df_train, df_val, df_test)
)

In [None]:
# Root mean squared error of the predicted time difference, shown as (train, val, test)
tuple(
    math.sqrt(mean_squared_error(true_values, rnn_values))
    for true_values, rnn_values in (
        (train_true, train_rnn),
        (val_true, val_rnn),
        (test_true, test_rnn),
    )
)

(123222.10403613243, 128092.95153607572, 239359.80592733712)

In [None]:
# R2 score of the predicted time difference, shown as (train, val, test)
tuple(
    r2_score(true_values, rnn_values)
    for true_values, rnn_values in (
        (train_true, train_rnn),
        (val_true, val_rnn),
        (test_true, test_rnn),
    )
)

(0.2236992380153805, 0.21153693452992461, -0.026350591363622167)

In [None]:
# Write every split, now including the RNN prediction columns, to its own CSV file (row index left out)
for split_frame, output_path in (
    (df_train, '/content/drive/MyDrive/Process Mining RNN/output/bpi2017_train_output.csv'),
    (df_val, '/content/drive/MyDrive/Process Mining RNN/output/bpi2017_val_output.csv'),
    (df_test, '/content/drive/MyDrive/Process Mining RNN/output/bpi2017_test_output.csv'),
):
    split_frame.to_csv(output_path, index = False)