In [None]:
import os
from os.path import dirname
# Repository root: two directory levels above the current working directory.
root_path = dirname(dirname(os.getcwd()))
print(root_path)
import sys
# Put the project's script folder on the import path
# (presumably where Event_log_processing_utils, imported below, lives).
sys.path.append(root_path + '/RemainingCycleTimePrediction/2_Scripts/')
import pandas as pd
import numpy as np
import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error

from Event_log_processing_utils import Extract_trace_and_temporal_features, Extract_prefix
from sklearn.preprocessing import OneHotEncoder
import warnings
# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Folder holding the "<data_name>_processed_*.csv" inputs, and the project folder
# under which the evaluation results are written.
data_dir = root_path + '/RemainingCycleTimePrediction/1_Data/'
project_dir = root_path + '/RemainingCycleTimePrediction/'

# Report the TensorFlow version and the GPUs TensorFlow can see.
print(tf.__version__)
print(tf.config.experimental.list_physical_devices('GPU'))

## Load data

In [None]:
# Name of the event log to process. It selects the "<data_name>_processed_*.csv"
# input files and is embedded in the checkpoint and evaluation output paths.
data_name = 'BPIC20'
# Alternative dataset (uncomment to switch):
# data_name = 'Helpdesk'

In [None]:
# Read the three pre-processed tables of the selected dataset:
# the complete log plus its train and test partitions.
csv_prefix = data_dir + data_name
tab_all = pd.read_csv(csv_prefix + "_processed_all.csv")
tab_train = pd.read_csv(csv_prefix + "_processed_train.csv")
tab_test = pd.read_csv(csv_prefix + "_processed_test.csv")

In [None]:
# Statistics of the dataset
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
print("num_cases: {}".format(len(tab_all['Case_ID'].unique())))
print("num_activities: {}".format(len(tab_all["Activity"].unique())))
print("num_events: {}".format(len(tab_all)))

# Case lengths are computed once and reused for the average and the maximum.
case_lengths = [len(x) for x in lines]
avglen = round(np.mean(case_lengths), 2)
print("avg_case_len: {}".format(avglen))
maxlen = max(case_lengths) #find maximum line size
print("max_case_len: {}".format(maxlen))

# The last entry of each lines_t2 sublist is taken as the duration of the case;
# dividing by 86400 presumably converts seconds to days (TODO confirm the unit).
case_durations = [sublist[-1] for sublist in lines_t2]
print("avg_case_duration: {}".format(round(np.mean(case_durations)/86400, 2)))
print("max_case_duration: {}".format(round(max(case_durations)/86400, 2)))
print("min_case_duration: {}".format(round(min(case_durations)/86400, 2)))

# Count the distinct traces (variants). A set of hashable tuple keys makes the
# membership test O(1) instead of scanning the list of already-seen variants,
# while list_unique_line still keeps the variants in first-seen order.
seen_variants = set()
list_unique_line = []
for line in lines:
    variant_key = tuple(line)
    if variant_key not in seen_variants:
        seen_variants.add(variant_key)
        list_unique_line.append(line)
print("variants: {}".format(len(list_unique_line)))

## Prepare inputs and outputs for model training

In [None]:
def Prepare_X_Y_remaining_time(tab, list_activities, divisor, divisor2, divisor_rt, encoder, maxlen):
    """Vectorize all prefixes of an event log for remaining-time regression.

    The table is passed through Extract_trace_and_temporal_features and
    Extract_prefix; every resulting prefix becomes one left-padded sample.

    Args:
        tab: event-log table accepted by Extract_trace_and_temporal_features.
        list_activities: list of activity labels; its length fixes the width
            of the one-hot part of the feature vector.
        divisor: normalisation constant for the first temporal feature.
        divisor2: normalisation constant for the second temporal feature.
        divisor_rt: normalisation constant for the regression target.
        encoder: fitted one-hot encoder whose ``transform`` returns a sparse
            matrix with ``len(list_activities)`` columns.
        maxlen: number of timesteps of every sample.

    Returns:
        (X, Y) where X is a float32 array of shape
        (num_prefixes, maxlen, len(list_activities) + 5) and Y is a float32
        array of shape (num_prefixes,) holding ``outputs[2] / divisor_rt``
        (presumably the normalised remaining time -- confirm against
        Extract_prefix).

    Note:
        A prefix longer than ``maxlen`` keeps only its last ``maxlen`` events
        (with their original order index). Previously the negative padding
        offset wrapped around through negative indexing and silently
        overwrote rows of the sample.
    """
    lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab)
    # prefixes: [sentences, sentences_t, sentences_t2, sentences_t3, sentences_t4]
    # outputs:  [next_ope, next_ope_t, end_ope_t]
    prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
    num_samples = len(prefixes[0])
    print('Vectorization...')
    num_activities = len(list_activities)
    num_features = num_activities + 5  # 1 order feature + 4 temporal features
    print('num features: {}'.format(num_features))
    X = np.zeros((num_samples, maxlen, num_features), dtype=np.float32)
    Y = np.zeros(num_samples, dtype=np.float32)
    for i, sentence in enumerate(prefixes[0]):
        prefix_len = len(sentence)
        # Number of leading events that do not fit into maxlen timesteps (normally 0).
        skipped = max(prefix_len - maxlen, 0)
        # Samples are left-padded: the kept events occupy the last timesteps.
        leftpad = maxlen - (prefix_len - skipped)
        one_hot_act_matrix = encoder.transform(np.array(sentence).reshape((prefix_len, 1))).toarray()
        X[i, leftpad:, :num_activities] = one_hot_act_matrix[skipped:, :]
        # order of the activity in the sequence {1,...,prefix_len}
        X[i, leftpad:, num_activities] = np.arange(skipped + 1, prefix_len + 1)
        X[i, leftpad:, num_activities + 1] = np.asarray(prefixes[1][i][skipped:]) / divisor
        X[i, leftpad:, num_activities + 2] = np.asarray(prefixes[2][i][skipped:]) / divisor2
        # Fixed scales: 86400 (presumably seconds per day) and 7 (presumably days per week).
        X[i, leftpad:, num_activities + 3] = np.asarray(prefixes[3][i][skipped:]) / 86400
        X[i, leftpad:, num_activities + 4] = np.asarray(prefixes[4][i][skipped:]) / 7
        Y[i] = outputs[2][i] / divisor_rt
    return X, Y

In [None]:
# Activity vocabulary taken from the complete log; the one-hot encoder is fitted on it
# so that train and test share the same columns.
list_activities = list(tab_all["Activity"].unique())
activity_column = np.array(list_activities).reshape((-1, 1))
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(activity_column)

# The longest case of the complete log fixes the padded sequence length.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
maxlen = max(len(trace) for trace in lines)

# Normalisation constants are derived from the training partition only.
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_train)
# average time between events
divisor = np.mean([value for trace_times in lines_t for value in trace_times])
print('divisor: {}'.format(divisor))
# average time between current and first events
divisor2 = np.mean([value for trace_times in lines_t2 for value in trace_times])
print('divisor2: {}'.format(divisor2))
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
# mean of the third output of Extract_prefix, used to scale the regression target
divisor_rt = np.mean(outputs[2])
print('divisor_rt: {}'.format(divisor_rt))

# Vectorize both partitions with the same vocabulary, constants and sequence length.
shared_args = (list_activities, divisor, divisor2, divisor_rt, encoder, maxlen)
X_train, Y_train = Prepare_X_Y_remaining_time(tab_train, *shared_args)
X_test, Y_test = Prepare_X_Y_remaining_time(tab_test, *shared_args)

## Train LSTM model

In [None]:
# Build the network once so that its architecture can be inspected with summary().
print('Build model...')
main_input = Input(shape=X_train.shape[1:], name='main_input')

# Five stacked LSTM layers (150 units, input dropout 0.2), each followed by batch
# normalization. Only the last LSTM collapses the sequence into a single vector.
num_lstm_layers = 5
hidden = main_input
for depth in range(num_lstm_layers):
    is_last_lstm = depth == num_lstm_layers - 1
    hidden = LSTM(150, kernel_initializer='glorot_uniform',
                  return_sequences=not is_last_lstm, dropout=0.2)(hidden)
    hidden = BatchNormalization()(hidden)

# Single linear unit producing the time prediction.
time_output = Dense(1, kernel_initializer='glorot_uniform')(hidden)
model = Model(inputs=[main_input], outputs=[time_output])
model.summary()

In [None]:
#Train
# Trains `num_runs` independently initialised models; for each run the model with the
# lowest validation loss is checkpointed and the wall-clock time is recorded.
num_epochs = 100
early_stop_patience = 10
min_delta = 0
lr_value = 1e-03
num_runs = 5
num_lstm_layers = 5

# Make sure the checkpoint folder exists before the first model is saved.
checkpoint_dir = '../4_Outputs/Output_files/'+data_name+'_Tax_LSTM/'
os.makedirs(checkpoint_dir, exist_ok=True)

running_time = []
for run in range(num_runs):
    start=datetime.datetime.now()
    print("Run: {}".format(run+1))
    # Drop the graph/state of the previous run so every run starts from scratch.
    keras.backend.clear_session()

    # Five stacked LSTM layers (150 units, input dropout 0.2), each followed by batch
    # normalization; only the last LSTM collapses the sequence into a single vector.
    main_input = Input(shape=(X_train.shape[1], X_train.shape[2]), name='main_input')
    hidden = main_input
    for depth in range(num_lstm_layers):
        is_last_lstm = depth == num_lstm_layers - 1
        hidden = LSTM(150, kernel_initializer='glorot_uniform',
                      return_sequences=not is_last_lstm, dropout=0.2)(hidden)
        hidden = BatchNormalization()(hidden)
    # time prediction
    time_output = Dense(1, kernel_initializer='glorot_uniform')(hidden)
    model = Model(inputs=[main_input], outputs=[time_output])

    # `learning_rate` replaces the deprecated `lr` alias, which recent Keras versions reject.
    opt = keras.optimizers.Adam(learning_rate=lr_value)
    model.compile(optimizer=opt, loss='mae', metrics=['mae'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stop_patience, min_delta = min_delta)
    model_checkpoint = ModelCheckpoint(checkpoint_dir+'best_model_run_{}.h5'.format(run+1),
                                       monitor='val_loss', verbose=0,
                                       save_best_only=True, save_weights_only=False, mode='min')
    # validation_split holds out a 20% fraction of the training samples for val_loss.
    model.fit(X_train, Y_train, validation_split=0.2,
              callbacks=[early_stopping, model_checkpoint], batch_size=16, epochs=num_epochs)
    running_time.append((datetime.datetime.now()-start).total_seconds())

In [None]:
# Mean wall-clock duration of one training run, converted from seconds to minutes.
print("Running time in average of each run: {}".format(np.mean(running_time)/60))

## Evaluation

In [None]:
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error, in percent.

    Both inputs are flattened to 1-D before they are compared. Without this,
    a target vector of shape (n,) and predictions of shape (n, 1) (the shape
    returned by a model with a single output unit) broadcast to an (n, n)
    matrix and the error is averaged over all n*n pairs instead of the n
    matching pairs.

    Args:
        Y_actual: array-like of true values.
        Y_Predicted: array-like of predicted values with the same number of
            elements as ``Y_actual``.

    Returns:
        ``mean(|Y_actual - Y_Predicted| / |Y_actual|) * 100``. Entries where
        ``Y_actual`` is 0 yield inf/nan according to numpy's division rules.
    """
    actual = np.asarray(Y_actual, dtype=np.float64).ravel()
    predicted = np.asarray(Y_Predicted, dtype=np.float64).ravel()
    mape_mean = np.mean(np.abs((actual - predicted)/actual))*100
    return mape_mean

In [None]:
# Bucket the test samples by prefix length.
# The order feature (column len(list_activities)) of the last timestep equals the
# number of events in the prefix, because samples are left-padded.
order_column = len(list_activities)
X_by_prefix_size = {}
Y_by_prefix_size = {}
for prefix_size, x_sample, y_sample in zip(X_test[:, -1, order_column], X_test, Y_test):
    X_by_prefix_size.setdefault(prefix_size, []).append(x_sample)
    Y_by_prefix_size.setdefault(prefix_size, []).append(y_sample)

# Turn every bucket from a list of samples into one stacked array.
X_by_prefix_size = {key: np.array(samples) for key, samples in X_by_prefix_size.items()}
Y_by_prefix_size = {key: np.array(targets) for key, targets in Y_by_prefix_size.items()}

In [None]:
# Compute the MAPE and MAE errors on the test data for each run, per prefix length.
# err_total_dict maps a prefix length to one [MAPE, MAE] pair per run.
err_total_dict = {}
for run in range(num_runs):
    print("Run: {}".format(run+1))
    model = load_model('../4_Outputs/Output_files/'+data_name+'_Tax_LSTM/best_model_run_{}.h5'.format(run+1))
    for key in X_by_prefix_size.keys():
        y_true = Y_by_prefix_size[key]
        # The model has a single output unit, so predict() returns shape (n, 1).
        # Flatten it to (n,) so it lines up element-wise with y_true; otherwise
        # (n,) - (n, 1) broadcasts to an (n, n) matrix and distorts the MAPE.
        y_predict = model.predict(X_by_prefix_size[key]).reshape(-1)
        # MAPE is a ratio, so it is computed directly on the normalised values.
        mape_err = MAPE(y_true, y_predict)
        # Undo the target normalisation, then divide by 86400 (reported as days
        # in the result table).
        mae_err = mean_absolute_error(y_true, y_predict)*divisor_rt/86400
        err_total_dict.setdefault(key, []).append([mape_err, mae_err])

In [None]:
# Prediction errors by prefix length are aggregated over the runs and stored in a table
list_prefix_len = []
list_num_samples = []
list_mape_err = []
list_mape_std = []
list_mae_err = []
list_mae_std = []
for key, value in err_total_dict.items():
    # One row per run, columns [MAPE, MAE]; reduce over the runs once per prefix length.
    errors_per_run = np.array(value)
    err_mean = errors_per_run.mean(axis=0)
    err_std = errors_per_run.std(axis=0)
    list_prefix_len.append(key)
    list_num_samples.append(X_by_prefix_size[key].shape[0])
    list_mape_err.append(round(err_mean[0], 3))
    list_mape_std.append(round(err_std[0], 3))
    list_mae_err.append(round(err_mean[1], 3))
    list_mae_std.append(round(err_std[1], 3))
tab_result = pd.DataFrame({"Prefix length":list_prefix_len, "Num samples": list_num_samples, 
                           "MAPE(%)":list_mape_err, "MAPE std": list_mape_std,
                           "MAE(days)": list_mae_err, "MAE std": list_mae_std})

# makedirs also creates a missing parent folder and does not fail if the folder
# already exists, unlike the exists()+mkdir() pair it replaces.
os.makedirs(project_dir + '4_Outputs/Evaluation', exist_ok=True)
tab_result.to_csv(project_dir+"4_Outputs/Evaluation/"+data_name+"_Tax_LSTM_eval.csv", index = False)