In [1]:
# from pm4py.objects.log.importer.xes import importer as xes_importer
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn import preprocessing as pre
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter
import math
import tracemalloc
import psutil

In [2]:
def fix_time(time):
    return (datetime.datetime.fromisoformat(time))

def load_data(BPI = 'BPI.csv', BPI_attr = 'BPI_attr.csv',  data2012 = False, sample=False):
    df_BPI = pd.read_csv(BPI)
    df_BPI_attr = pd.read_csv(BPI_attr)
    
    if 'Unnamed: 0' in df_BPI.columns:
        df_BPI = df_BPI.rename(columns={'Unnamed: 0': 'case_id', 'Unnamed: 1': 'step_number'})
        df_BPI_attr = df_BPI_attr.rename(columns={'Unnamed: 0': 'case_id'})
    
    df_BPI['time:timestamp'] = df_BPI['time:timestamp'].apply(fix_time)

    df_BPI_attr['REG_DATE'] = df_BPI_attr['REG_DATE'].apply(fix_time)
    
    df_BPI['time:weekday'] = [x.weekday() for x in df_BPI['time:timestamp']]
    df_BPI['time:hour'] = [x.hour for x in df_BPI['time:timestamp']]
    df_BPI['time:day'] = [str(x.day) + '-' + str(x.month) + '-' + str(x.year) for x in df_BPI['time:timestamp']]
    df_BPI['time:event_count'] = df_BPI.groupby('time:day')['time:day'].transform('count')
    df_BPI['time:busy_day'] = df_BPI['time:event_count'] > 2525
    
    if(sample):
        df_BPI, df_BPI_attr = df_BPI[:50000], df_BPI_attr[:2359]
    
    return (df_BPI, df_BPI_attr)

def load_data_xes(data):
    BPI = xes_importer.apply(data)

In [3]:
df, df_attr = load_data(BPI = 'Datasets/BPI_2012.csv', BPI_attr = 'Datasets/BPI_attr_2012.csv', sample=False)

In [4]:
df_train, df_validation, df_test = pd.read_csv('Datasets/trainExample.csv'), pd.read_csv('Datasets/validationExample.csv'), pd.read_csv('Datasets/testExample.csv')

In [5]:
df['combined_names'] = df['lifecycle:transition'] + ' + ' + df['concept:name']
    
#Prepare the time between columns
if (type(df['time:timestamp'].iloc[0]) != datetime.datetime):
    df['time:timestamp'] = df['time:timestamp'].apply(fix_time)

df["time:time_for_next"] = df["time:timestamp"].diff()
df.loc[df['step_number'] == 0, 'time:time_for_next'] = pd.Timedelta(0) #Changed it a bit to always insert 0 into the first row
df["time:time_for_next"] = [int(x.total_seconds()) for x in df["time:time_for_next"]]
df["time:time_for_next"] = df["time:time_for_next"].shift(-1)

#Prepare the next event colum
df["next_event"] = df["combined_names"]
df.loc[df['step_number'] == 0, 'next_event'] = 'editor: close_case'
df["next_event"] = df["next_event"].shift(-1)
df.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

In [6]:
df

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,time:day,time:event_count,time:busy_day,combined_names,time:time_for_next,next_event
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_SUBMITTED,0.0,COMPLETE + A_PARTLYSUBMITTED
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_PARTLYSUBMITTED,53.0,COMPLETE + A_PREACCEPTED
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_PREACCEPTED,0.0,SCHEDULE + W_Completeren aanvraag
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0,1-10-2011,381,False,SCHEDULE + W_Completeren aanvraag,39427.0,START + W_Completeren aanvraag
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11,1-10-2011,381,False,START + W_Completeren aanvraag,356.0,COMPLETE + A_ACCEPTED
5,0,5,10862.0,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,5,11,1-10-2011,381,False,COMPLETE + A_ACCEPTED,145.0,COMPLETE + O_SELECTED
6,0,6,10862.0,COMPLETE,O_SELECTED,2011-10-01 11:45:09.243000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_SELECTED,0.0,COMPLETE + A_FINALIZED
7,0,7,10862.0,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,5,11,1-10-2011,381,False,COMPLETE + A_FINALIZED,1.0,COMPLETE + O_CREATED
8,0,8,10862.0,COMPLETE,O_CREATED,2011-10-01 11:45:11.197000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_CREATED,0.0,COMPLETE + O_SENT
9,0,9,10862.0,COMPLETE,O_SENT,2011-10-01 11:45:11.380000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_SENT,0.0,SCHEDULE + W_Nabellen offertes


In [10]:
def timeChain(df):
    
    df_chain = df.copy()
    shape = df_chain.shape[0]
    
    chain = {}
    concept_names = ['editor:first_event'] + list(df_chain['combined_names'])
    #Need to implement the time difference inside the algorithm instead of using the column
    times_list = [0] + list(df_chain['time:time_for_next'])
    id_list = ['editor:first_id'] + list(df_chain['case_id']) #Otherwise we check i+1-th position that does not exist

    for i in range(len(concept_names)):

        if shape > i + 2:
            key1 = concept_names[i]
            key2 = concept_names[i+1]
            event = concept_names[i+2]
            time1 = times_list[i]
            time2 = times_list[i+1]
            
            if (id_list[i] == id_list[i+2]): 

                if ((key1, key2) not in chain):
                    chain[(key1, key2)] = [time2] 
                else:
                    chain[(key1, key2)].append(time2)

            elif (event == 'COMPLETE + A_SUBMITTED'): 
                #Next event is editor:close_case --> so no time till next
                if ((key1, key2) not in chain):
                    chain[(key1, key2)] = [0]
                else:
                    chain[(key1, key2)].append(0)

            elif (key2 == 'COMPLETE + A_SUBMITTED'):
                
                if ((key2) not in chain):
                    chain[(key2)] = [time1]
                else:
                    chain[(key2)].append(time1)

    
    print('Chain size: {0} distinct event pairs.'.format(len(chain)))
    
    return chain

def getPredictions(df, chain):
    df_copy = df.copy()
    time_for_next = []

    for i in tqdm(range(-1, df_copy.shape[0] - 1)):
        
        event2 = df_copy['combined_names'][i+1]
        
        if (i == -1):
            time_for_next.append(random.choice(chain[(event2)]))
        else:
            
            event1 = df_copy['combined_names'][i]
        
            if ((event1, event2) in chain):
                time_for_next.append(random.choice(chain[(event1, event2)]))
            else:
                time_for_next.append(random.choice(chain[(event2)]))
        
        
        
    return time_for_next

In [19]:
chain = timeChain(df)
chain

Chain size: 189 distinct event pairs.


{'COMPLETE + A_SUBMITTED': [0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0

In [20]:
eventPredictions = getPredictions(df, chain)

df['predict:time_for_next'] = eventPredictions
df['predict:time_for_next'] = df['predict:time_for_next'].astype('float')

100%|███████████████████████████████████████████████████████████████████████| 262200/262200 [00:04<00:00, 54540.86it/s]


In [21]:
df

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,time:day,time:event_count,time:busy_day,combined_names,time:time_for_next,next_event,predict:time_for_next
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_SUBMITTED,0.0,COMPLETE + A_PARTLYSUBMITTED,0.0
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_PARTLYSUBMITTED,53.0,COMPLETE + A_PREACCEPTED,10.0
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0,1-10-2011,381,False,COMPLETE + A_PREACCEPTED,0.0,SCHEDULE + W_Completeren aanvraag,0.0
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0,1-10-2011,381,False,SCHEDULE + W_Completeren aanvraag,39427.0,START + W_Completeren aanvraag,0.0
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11,1-10-2011,381,False,START + W_Completeren aanvraag,356.0,COMPLETE + A_ACCEPTED,413.0
5,0,5,10862.0,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,5,11,1-10-2011,381,False,COMPLETE + A_ACCEPTED,145.0,COMPLETE + O_SELECTED,362.0
6,0,6,10862.0,COMPLETE,O_SELECTED,2011-10-01 11:45:09.243000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_SELECTED,0.0,COMPLETE + A_FINALIZED,0.0
7,0,7,10862.0,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,5,11,1-10-2011,381,False,COMPLETE + A_FINALIZED,1.0,COMPLETE + O_CREATED,47.0
8,0,8,10862.0,COMPLETE,O_CREATED,2011-10-01 11:45:11.197000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_CREATED,0.0,COMPLETE + O_SENT,0.0
9,0,9,10862.0,COMPLETE,O_SENT,2011-10-01 11:45:11.380000+02:00,5,11,1-10-2011,381,False,COMPLETE + O_SENT,0.0,SCHEDULE + W_Nabellen offertes,0.0


In [22]:
df_testing = df.copy()
df_testing['error:diff'] = abs(df_testing['time:time_for_next'] - df_testing['predict:time_for_next'])

In [23]:
df_testing['error:diff'].mean()

46258.57082597569

In [25]:
df_testing['error:diff'].median()

50.0