# DBL Process Mining

## Imports and Class definitions

### Imports

In [1]:
import os
import time
from datetime import datetime
import time
import numpy as np
import numpy as np
import pandas as pd
from Utils.LogFile import LogFile
import tensorflow as tf
import multiprocessing as mp
import copy
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

print(tf.__version__)



2.8.0


## Loading in the data

### Two different methods: 
- One csv file, which still has to be split into training and test data
- Two csv files, which are already split into training and test data

In [2]:
# Define attribute columns here
case_attr = "case concept:name"
act_attr = "event concept:name"
time_attr = "event time:timestamp"
path = "Data/sub_dataset.csv"


#### One csv file

In [3]:
#path = "Data/sub_dataset.csv"
#baseline_log = LogFile(path, ",", 0, None, time_attr='event time:timestamp', trace_attr=case_attr,
#                    activity_attr=act_attr, convert=False, k=3)

#train_base_log, test_base_log = baseline_log.splitTrainTest(65, split_case=False, method="test-train")

In [4]:
#path = "Data/BPI_Challenge_2012-test.csv"

#baseline_log = LogFile(path, ",", 0, None, time_attr='event time:timestamp', trace_attr=case_attr,
#                    activity_attr=act_attr, convert=False, k=3)

#train_base_log, test_base_log = baseline_log.splitTrainTest(70, split_case=False, method="test-train")

#### Two csv files

In [5]:
path_train = "Data/sub_data_train.csv" 
path_test = "Data/sub_data_test.csv"

#path_train = 'Data/BPI_Challenge_2012-training.csv'
#path_test = 'Data\BPI_Challenge_2012-test.csv'

train_base_log = LogFile(path_train, ",", 0, None, time_attr=time_attr, trace_attr=case_attr,
                   activity_attr=act_attr, convert=False, k=3)
test_base_log = LogFile(path_test, ",", 0, None, time_attr=time_attr, trace_attr=case_attr,
                    activity_attr=act_attr, convert=False, k=3)

## Baseline prediction

### Preprocessing

In [6]:
def convert_time(dataset):
    """Adds a new column to a dataset with the converted timestamp to datetime"""

    date_list = []

    for time in dataset['event time:timestamp']:
        datex = time[:-4]
        date = datetime.strptime(datex, '%d-%m-%Y %H:%M:%S')

        date_list.append(date)

    dataset['time and date'] = date_list

### Add actual next event and time

In [7]:
def add_actual_next(df_case):
    """Adds the actual next activity and time to next event to the final dataframe"""


    # Create a list for all the actual next events for an case
    event_lst = [event for event in df_case['event concept:name']] # Gets a list of all events for a specific trace
    event_lst = event_lst[1:] # Erase the first activity from the list (thus the second activity becomes first in the list)
    event_lst.append('-') # Append a '-' to the end of the list (the last activity does not have a next activity)
    
    # Create a list for time of the next event
    nexttime_lst1 = [time for time in df_case['time and date']]
    nexttime_lst = nexttime_lst1[1:]
    nexttime_lst.append(nexttime_lst[-1])

    # Create the time difference list
    time_diff = []
    for i in range(len(nexttime_lst)):
        time_diff.append((nexttime_lst[i] - nexttime_lst1[i]).total_seconds())

    # Append columns to the case dataframe
    df_case['Next event'] = event_lst
    df_case['Time to next event'] = time_diff

    trace_len = len(df_case)

    return trace_len

### Predicted next event and time

In [8]:
def get_position_time(df_case, count_dict, time_dict):
    for index, row in df_case.iterrows():
        
        # Get the amount of times an action occured in a certain position {action : {position_1 : count_1, position_2: count_2}}
        if row['event concept:name'] in count_dict:
            if index in count_dict[row['event concept:name']]:
                count_dict[row['event concept:name']][index] += 1
            else:
                count_dict[row['event concept:name']].update({index: 1})
        else:
            count_dict[row['event concept:name']] = {index: 1}
        
        # Summation of the times to next action per position (index) {position: {"sum": summation_of_time, "count": amount_of_times_occured (to calculate mean)}}
        if index in time_dict:
            time_dict[index]['sum'] += row['Time to next event']
            time_dict[index]['count'] += 1
        else:
            time_dict[index] = {'sum': row['Time to next event'], 'count': 1}

def get_position_rank(max_trace_len, count_dict):
    pos_rank_dict = {}
    for i in range(max_trace_len):
        init = 0
        task = 0
        for key in count_dict.keys():
            try:
                new = count_dict[key][i]
            except:
                new = 0
            if new > init:
                init = new
                task = key

        pos_rank_dict.update({i: task})
    
    return pos_rank_dict

def get_mean_time(total_time_dict):
    mean_time_dict = {}
    for position in total_time_dict.keys():
        mean_time = total_time_dict[position]['sum'] / total_time_dict[position]['count']
        mean_time_dict[position] = mean_time
    
    return mean_time_dict

In [9]:
def create_event_pred(df_case, pos_rank_dict, mean_time_dict):
    
    # Prediction for the action
    pred_act_lst = [pos_rank_dict[i] for i in range(len(df_case))]
    pred_act_lst = pred_act_lst[1:]
    pred_act_lst.append('-')

    # Prediction for time
    pred_time_lst = [mean_time_dict[i] for i in range(len(df_case))]

    df_case['Event prediction'] = pred_act_lst 
    df_case['Time prediction'] = pred_time_lst

### Training and testing functions

In [10]:
def train_baseline(dataframe, maximum=None):
    """Returns the training dataset with predictions and 2 dictionaries which predict next action and nexttime based on position"""
    
    dataset = dataframe
    convert_time(dataset)

    df_actual = pd.DataFrame()


    # Creating a dataframe with the actual events

    cases = list(dataset['case concept:name'].unique())  
    max_trace_len = 0  
    pos_count_dict = {}
    time_dict = {}
    for case in cases[:maximum]:
        df_case = dataset[dataset['case concept:name'] == case].copy().reset_index(drop=True)
        trace_len = add_actual_next(df_case)
        get_position_time(df_case, pos_count_dict, time_dict)
        df_actual = pd.concat([df_actual, df_case])

        if trace_len > max_trace_len:
            max_trace_len = trace_len
    


    # Creating the predicitions
    df_predicted = pd.DataFrame()
    
    pos_rank_dict = get_position_rank(max_trace_len, pos_count_dict)
    mean_time_dict = get_mean_time(time_dict)

    for case in cases[:maximum]:
        df_case = df_actual[df_actual['case concept:name'] == case].copy().reset_index(drop=True)
        create_event_pred(df_case, pos_rank_dict, mean_time_dict)
        df_predicted = pd.concat([df_predicted,df_case])



    return df_predicted, pos_rank_dict, mean_time_dict

In [11]:
def test_baseline(dataframe, train_pos, train_time):
    """Creates the test dataset including the predictions based on the training dataset"""
    
    dataset = dataframe
    convert_time(dataset)

    df_predict = pd.DataFrame()
    cases = list(dataset['case concept:name'].unique())  
    for case in cases:
        df_case = dataset[dataset['case concept:name'] == case].copy().reset_index(drop=True)
        _ = add_actual_next(df_case)
        create_event_pred(df_case, train_pos, train_time)
        df_predict = pd.concat([df_predict, df_case])
    
    return df_predict

### Accuracy measurement

In [12]:
def get_accuracy(dataset):
    event_accuracy = np.mean(dataset['Next event'] ==  dataset['Event prediction'])
    time_accuracy = np.mean(abs(dataset['Time to next event'] - dataset['Time prediction'])) / 86400  # Mean Absolute Error in days
    
    return event_accuracy, time_accuracy

### Predictions

In [13]:
train_df = train_base_log.get_data()
test_df = test_base_log.get_data()

In [14]:
train_df, train_pos, train_time = train_baseline(train_df)
test_df = test_baseline(test_df, train_pos, train_time)

In [15]:
train_event_acc, train_time_acc = get_accuracy(train_df)
test_event_acc, test_time_acc = get_accuracy(test_df)

print(test_event_acc, test_time_acc)

0.47865303668069753 0.6874050509292106


# Decision Tree Classifier

In [16]:
def encode_target(df,
                  target_column, target_column2, target_column3,
                  target_result, target_result2, target_result3
                 ):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.

    Returns
    -------
    df_mod -- modified DataFrame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    
    targets2 = df_mod[target_column3].unique()
    map_to_int2 = {name: n for n, name in enumerate(targets2)}
    
    
    df_mod[f"{target_result}"] = df_mod[target_column].replace(map_to_int)
    df_mod[f"{target_result2}"] = df_mod[target_column2].replace(map_to_int)
    df_mod[f"{target_result3}"] = df_mod[target_column3].replace(map_to_int2)

    return (df_mod)

train_df = encode_target(train_df,
                                           "event concept:name", "Next event", "event lifecycle:transition",
                                           "current state", "next state", "lifecycle")
train_df['next state'].replace('-', None, inplace=True)
train_df = train_df.dropna()

test_df = encode_target(test_df,
                                           "event concept:name", "Next event", "event lifecycle:transition",
                                           "current state", "next state", "lifecycle")
test_df['next state'].replace('-', None, inplace=True)
test_df = test_df.dropna()

# Tree event prediction

In [18]:
train_sum = 0
test_sum = 0

for i in range(1):
    
    y = train_df['next state'].astype(int)
    X = train_df[['current state', 'lifecycle']].astype(int)
    clf = tree.DecisionTreeClassifier(splitter='best', criterion='entropy')
    clf = clf.fit(X, y)
    
    train_df['tree prediction'] = clf.predict(train_df[['current state', 'lifecycle']])
    test_df['tree prediction'] = clf.predict(test_df[['current state', 'lifecycle']])
    
    correct_event = 0 
    total = 0
    for index, row in test_df.iterrows():
        total += 1
        if row['next state'] == row['tree prediction']:
            correct_event += 1
        
    accuracy_event = correct_event/total 
    test_sum += accuracy_event

test_sum

0.24461815995189418

## Tree time prediction

In [27]:
train_sum2 = 0
test_sum2 = 0

for i in range(1):
    
    y2 = train_df['Time to next event']
    X2 = train_df[['current state', 'lifecycle']].astype(int)
    clf2 = tree.DecisionTreeClassifier(splitter='best', criterion='entropy')
    clf2 = clf2.fit(X2, y2)
    
    train_df['tree time prediction'] = clf2.predict(train_df[['current state', 'lifecycle']])
    test_df['tree time prediction'] = clf2.predict(test_df[['current state', 'lifecycle']])
    
    correct_event = 0 
    total = 0
    for index, row in test_df.iterrows():
        total += 1
        correct_event += abs(row['Time to next event'] - row['tree time prediction'])
        
    test_sum2 += correct_event/total/86400
    
test_sum2

0.4153229563372753

## LSTM Model

### Preprocessing

In [57]:
def transform_log(log):
    """
    
    """
    activities = np.unique(log.data[log.activity])
    X = np.zeros((len(log.contextdata), log.k, len(activities)+ 1 + 4), dtype=np.float32)
    y_a = np.zeros((len(log.contextdata), len(activities) + 2), dtype=np.float32)
    y_t = np.zeros((len(log.contextdata)), dtype=np.float32)
    j = 0
    df = log.contextdata
    time_diff = 0
    for row in log.contextdata.iterrows():
        
            act = getattr(row[1], log.activity)
            event_str = getattr(row[1], log.time)
            prev_str = getattr(row[1], "%s_Prev0" % (log.time))
            #prev_1_str = getattr(row[1], "%s_Prev1" % (log.time))
            event_time = time.strptime(event_str, "%d-%m-%Y %H:%M:%S.%f")
            if prev_str != 0:
                prev_time = time.strptime(prev_str, "%d-%m-%Y %H:%M:%S.%f")
                diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                          - datetime.fromtimestamp(time.mktime(prev_time))
                diff = diff_prev_event.total_seconds()
            else: 
                diff = 0
            y_a[j, act] = 1
            y_t[j] = diff            

            k = 0
            
            for i in range(log.k -1, -1, -1):
                
                if getattr(row[1], "%s_Prev%i" % (log.activity, i)) != 0: # 0 indicates no activity (first activity is encoded to 1)
                    X[j, log.k - i - 1, getattr(row[1], "%s_Prev%i" % (log.activity, i))] = 1
                X[j, log.k - i - 1, len(activities)+1] = k
                X[j, log.k - i - 1, len(activities) + 2] = time_diff # Diff in seconds

 
                str_time = getattr(row[1], "%s_Prev0" % (log.time))
                if str_time != 0:
                    event_time = time.strptime(str_time, "%d-%m-%Y %H:%M:%S.%f")
                    X[j, log.k - i - 1, len(activities) + 3] = event_time.tm_hour # Hour of day
                    X[j, log.k - i - 1, len(activities) + 4] = event_time.tm_wday  # Day of the week
    
                try:
                    prev_str = getattr(row[1], "%s_Prev1" % (log.time))
                    #print("First success!", prev_str)
                    if prev_str != 0:

                        prev_time = time.strptime(prev_str, "%d-%m-%Y %H:%M:%S.%f")
                        diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                                          - datetime.fromtimestamp(time.mktime(prev_time))
                        time_diff = diff_prev_event.total_seconds()
                        #print(time_diff)

                except:
                    pass
                

                #if str_time != 0: #No event
                #    try:
                #        event_time = time.strptime(str_time, "%d-%m-%Y %H:%M:%S.%f")
                #    except ValueError:
                #        event_time = time.strptime(str_time, "%d-%m-%Y %H:%M:%S.%f")


                #    if prev_str != 0:
                #        try:
                #           prev_time = time.strptime(prev_str, "%d-%m-%Y %H:%M:%S.%f")
                #        except ValueError:
    
                #            prev_time = time.strptime(prev_str, "%d-%m-%Y %H:%M:%S.%f")
                #        
                #        diff_prev_event = datetime.fromtimestamp(time.mktime(event_time)) \
                #                          - datetime.fromtimestamp(time.mktime(prev_time))
                #        diff = 86400 * diff_prev_event.days + diff_prev_event.seconds
                        #sum_duration += diff
                        #count_duration += 1
                        
                    X[j, log.k - i - 1, len(activities) + 3] = event_time.tm_hour # Hour of day
                    X[j, log.k - i - 1, len(activities) + 4] = event_time.tm_wday  # Day of the week

                k += 1

            j += 1

    return X, y_a, y_t

In [69]:
def train_LSTM(log, epochs=4, early_stop=42):
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
    from tensorflow.keras.layers import Input
    from tensorflow.keras.layers import Dense, BatchNormalization, LSTM
    from tensorflow.keras.models import Model
    from tensorflow.keras.optimizers import Nadam

    print("Transforming log...")
    X, y_a, y_t = transform_log(log)

    # build the model:
    print('Build model...')
    main_input = Input(shape=(log.k, len(np.unique(log.data[log.activity]))+5), name='main_input')
    # train a 2-layer LSTM with one shared layer
    l1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, dropout=0.2)(main_input) # the shared layer
    b1 = BatchNormalization()(l1)
    l2_1 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in activity prediction
    b2_1 = BatchNormalization()(l2_1)
    l2_2 = LSTM(100, implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, dropout=0.2)(b1) # the layer specialized in time prediction
    b2_2 = BatchNormalization()(l2_2)

    act_output = Dense(len(np.unique(log.data[log.activity])) + 2, activation='softmax', kernel_initializer='glorot_uniform', name='act_output')(b2_1)
    time_output = Dense(1, kernel_initializer='glorot_uniform', name='time_output')(b2_2)


    model = Model(inputs=[main_input], outputs=[act_output, time_output])

    opt = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)

    model.compile(loss={'act_output':'categorical_crossentropy', 'time_output': 'mae'}, optimizer=opt)
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stop)
    model_checkpoint = ModelCheckpoint(os.path.join("model", 'model_{epoch:03d}-{val_loss:.2f}.h5'), monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
    lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
    if len(y_a) > 10:
        split = 0.2
    else:
        split = 0

    model.fit(X, {'act_output': y_a, 'time_output': y_t}, validation_split=split, verbose=2, callbacks=[early_stopping, lr_reducer], batch_size=log.k, epochs=epochs)

    return model

In [62]:
def test(model, log):
    X, y_a, y_t = transform_log(log)
    pred_act, pred_time = model.predict(X)
    predict_vals = np.argmax(pred_act, axis=1)
    pred_time = pred_time.reshape(-1)
    #predict_probs = predictions[np.arange(predictions.shape[0]), predict_vals]
    expected_vals = np.argmax(y_a, axis=1)
    #expected_probs = predictions[np.arange(predictions.shape[0]), expected_vals]
    activity_acc = np.mean(expected_vals ==  predict_vals)
    mae_time = np.mean(abs(y_t - pred_time)) / 86400
    return predict_vals, pred_time, activity_acc, mae_time

In [65]:
LSTM_log_train = LogFile(path_train, ",", 0, None, 'event time:timestamp', case_attr,
                    activity_attr=act_attr, convert=False, k=2)
LSTM_log_test = LogFile(path_test, ",", 0, None, 'event time:timestamp', case_attr,
                    activity_attr=act_attr, convert=False, k=2)

LSTM_map_train = LSTM_log_train.int_convert()
LSTM_map_test = LSTM_log_test.int_convert()

LSTM_log_train.remove_attributes(['eventID', 'case REG_DATE', 'case AMOUNT_REQ', 'event lifecycle:transition'])
LSTM_log_test.remove_attributes(['eventID', 'case REG_DATE', 'case AMOUNT_REQ', 'event lifecycle:transition'])

LSTM_log_train.create_k_context()
LSTM_log_test.create_k_context()


print("Created k context")




Create k-context: 2
Create k-context: 2
Created k context


In [73]:
model = train_LSTM(LSTM_log_train, epochs=5, early_stop=2)

Transforming log...
Build model...
Epoch 1/5
5866/5866 - 275s - loss: 39024.9883 - act_output_loss: 2.5032 - time_output_loss: 39022.4297 - val_loss: 40667.0859 - val_act_output_loss: 2.3110 - val_time_output_loss: 40664.7656 - lr: 0.0020 - 275s/epoch - 47ms/step
Epoch 2/5
5866/5866 - 300s - loss: 39020.6367 - act_output_loss: 2.3324 - time_output_loss: 39018.2539 - val_loss: 40663.9492 - val_act_output_loss: 2.1426 - val_time_output_loss: 40661.8477 - lr: 0.0020 - 300s/epoch - 51ms/step
Epoch 3/5


In [71]:
pred_act, pred_time, acc_act, mae_time = test(model, LSTM_log_train)

In [72]:
print(acc_act, mae_time)


0.2744630071599045 0.45543533890335647
