In [8]:
import pandas as pd
import numpy as np
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm, trange
import copy

In [9]:
def fix_time(time):
    return (datetime.datetime.fromisoformat(time))

def load_data(BPI = 'BPI.csv', BPI_attr = 'BPI_attr.csv',  data2012 = False, sample=False):
    df_BPI = pd.read_csv(BPI)
    df_BPI_attr = pd.read_csv(BPI_attr)
    
    if 'Unnamed: 0' in df_BPI.columns:
        df_BPI = df_BPI.rename(columns={'Unnamed: 0': 'case_id', 'Unnamed: 1': 'step_number'})
        df_BPI_attr = df_BPI_attr.rename(columns={'Unnamed: 0': 'case_id'})
    
    df_BPI['time:timestamp'] = df_BPI['time:timestamp'].apply(fix_time)

    df_BPI_attr['REG_DATE'] = df_BPI_attr['REG_DATE'].apply(fix_time)
    
    df_BPI['time:weekday'] = [x.weekday() for x in df_BPI['time:timestamp']]
    df_BPI['time:hour'] = [x.hour for x in df_BPI['time:timestamp']]
    
    if(sample):
        df_BPI, df_BPI_attr = df_BPI[:50000], df_BPI_attr[:2359]
    
    return (df_BPI, df_BPI_attr)
    return (df_BPI, df_BPI_attr)

def load_data_xes(data):
    BPI = xes_importer.apply(data)
    
df, df_attr = load_data(BPI = 'BPI_2012.csv', BPI_attr = 'BPI_attr_2012.csv', sample=False)

In [10]:
df_markov = copy.deepcopy(df)
df_markov

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11
...,...,...,...,...,...,...,...,...
262195,13086,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2,23
262196,13086,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+01:00,2,23
262197,13086,3,11169.0,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+01:00,3,9
262198,13086,4,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+01:00,3,9


In [16]:
chain = {}
shape = df_markov.shape[0]
conceptNames = ['editor: first event'] + list(df_markov['concept:name'])
ids = ['editor: first id'] + list(df_markov['case_id']) #Otherwise we check i+1-th position that does not exist


for i, key1 in enumerate(conceptNames):
    
    if shape > i + 2:
        key2 = conceptNames[i+1]
        event = conceptNames[i+2]

        
        if (ids[i] == ids[i+2]): 
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = [event] 
            else:
                chain[(key1, key2)].append(event)
                
        elif (event == 'A_SUBMITTED'): 
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = ['editor: close_case']
            else:
                chain[(key1, key2)].append('editor: close_case')
        
        elif (key2 == 'A_SUBMITTED'):
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = ['A_PARTLYSUBMITTED']
            else:
                chain[(key1, key2)].append('A_PARTLYSUBMITTED')
        

            
print('Chain size: {0} distinct event pairs.'.format(len(chain)))

Chain size: 137 distinct event pairs.


In [17]:
#creates a list of all concepts and a chain with only the percent chances
concepts = list(df_markov['concept:name'].unique())

chain2 = {}
for i in chain:
    lister = []
    for j in concepts:
        lister.append(chain[i].count(j)/len(chain[i]))
    
    chain2[i] = np.array(lister)

In [23]:
timer = time.perf_counter()

df_2markov = copy.deepcopy(df_markov)
df_2markov['concept:name S1'] = df_2markov['concept:name'].shift(1) 
df_2markov.loc[0, 'concept:name S1'] = 'editor: first event'

def apply_chain(var):
    chances = chain2[(var[0], var[1])]
    cumulative_chances = chances.cumsum()
    random_number = random.uniform(0, 1)
    result = np.expand_dims(cumulative_chances, 1) >= random_number
    result_number = result.argmax()   
    return np.asarray(concepts)[result_number]

df_2prediction = copy.deepcopy(df_markov)
df_2prediction['Prediction'] =  df_2markov[['concept:name S1','concept:name']].apply(apply_chain, axis=1)

print(timer-time.perf_counter())

-12.151010971000005


In [24]:
#Prepare the next event column
df_2prediction['next_event'] = df_2prediction['concept:name']
df_2prediction.loc[df_2prediction['step_number'] == 0, 'next_event'] = 'editor: close_case'
df_2prediction['next_event'] = df_2prediction['next_event'].shift(-1)
df_2prediction.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

accuracy = round((len(df_2prediction[df_2prediction['next_event'] == df_2prediction['Prediction']])/len(df_2prediction))*100, 1)
print('accuracy: ' + str(accuracy) + '%')

accuracy: 68.8%


In [15]:
#shows which predictions are given too much (>1) and which too little (<1)
predicted_correct_events = Counter(list(df_2prediction[df_2prediction['next_event'] == df_2prediction['Prediction']]['Prediction']))
predicted_events = Counter(list(df_2prediction['Prediction']))
true_events = Counter(list(df_2prediction['next_event']))

for i in predicted_events:
    if i != i:
        break
    print(i)
    per = round(predicted_events[i]/true_events[i]*100, 1)
    print(str(per) + '%')
    if per > 100:
        print('given ' + str(round(per-100, 1)) + '% too much')
    
    elif per < 100:
        print('given ' + str(round(100-per, 1)) + '% too little')
    
    else:
        print('given exactly the right amount')
   
    print('')

A_PARTLYSUBMITTED
100.0%
given exactly the right amount

A_PREACCEPTED
100.2%
given 0.2% too much

W_Completeren aanvraag
99.9%
given 0.1% too little

A_ACCEPTED
100.1%
given 0.1% too much

A_FINALIZED
100.9%
given 0.9% too much

O_CREATED
100.0%
given exactly the right amount

O_SENT
100.0%
given exactly the right amount

W_Nabellen offertes
100.3%
given 0.3% too much

O_SENT_BACK
98.8%
given 1.2% too little

A_CANCELLED
99.9%
given 0.1% too little

W_Valideren aanvraag
100.0%
given exactly the right amount

O_ACCEPTED
100.6%
given 0.6% too much

A_APPROVED
96.7%
given 3.3% too little

A_ACTIVATED
99.4%
given 0.6% too little

A_REGISTERED
98.2%
given 1.8% too little

A_SUBMITTED


ZeroDivisionError: division by zero