In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
def fix_time(time):
    return (datetime.datetime.fromisoformat(time))

def load_data(BPI = 'BPI.csv', BPI_attr = 'BPI_attr.csv',  data2012 = False):
    df_BPI = pd.read_csv(BPI)
    df_BPI_attr = pd.read_csv(BPI_attr)
    
    if 'Unnamed: 0' in df_BPI.columns:
        df_BPI = df_BPI.rename(columns={'Unnamed: 0': 'case_id', 'Unnamed: 1': 'step_number'})
        df_BPI_attr = df_BPI_attr.rename(columns={'Unnamed: 0': 'case_id'})
    
    df_BPI['time:timestamp'] = df_BPI['time:timestamp'].apply(fix_time)

    if data2012:
        df_BPI_attr['REG_DATE'] = df_BPI_attr['REG_DATE'].apply(fix_time)
    
    df_BPI['time:weekday'] = [x.weekday() for x in df_BPI['time:timestamp']]
    df_BPI['time:hour'] = [x.hour for x in df_BPI['time:timestamp']]
    return (df_BPI, df_BPI_attr)

def load_data_xes(data):
    BPI = xes_importer.apply(data)
    
df, df_attr = load_data(BPI = 'Datasets/BPI_2012.csv', BPI_attr = 'Datasets/BPI_attr_2012.csv', data2012 = True)

In [3]:
markov_df = df.copy()
markov_df

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11
...,...,...,...,...,...,...,...,...
262195,13086,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2,23
262196,13086,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+01:00,2,23
262197,13086,3,11169.0,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+01:00,3,9
262198,13086,4,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+01:00,3,9


In [4]:
markov_df['time:time_between'] = markov_df['time:timestamp'].diff()
markov_df.loc[markov_df['step_number'] == 0, 'time:time_between'] = pd.Timedelta(0)

In [None]:
i = 0
loopCount = markov_df.shape[0]-1
containsComplete = False
cases_to_drop = []

#Drop the cases which do not contain complete cycles
while i < loopCount:
    
    case_id = markov_df['case_id'].iloc[i] #Saving the case of i-th row
    next_row_case = markov_df['case_id'].iloc[i+1] #Saving the case of i+1-th row
    transition = markov_df['lifecycle:transition'].iloc[i] #Can have either 'complete' or sth else
    
    if (case_id == next_row_case): #If the cases are matching:
        if (transition == 'COMPLETE'): #If the i-th row had 'complete'
            containsComplete = True
            
    else: #Last instance of a case:
        if (transition == 'COMPLETE'): #If it is complete now
            containsComplete = True
            
        if (not containsComplete): #If was not completed at all:
            cases_to_drop.append(case_id) #Add the case to the list of cases to be dropped
            
    i = i + 1

#Explicitly checking the last position in the dataframe after the loop:
case_id = markov_df['case_id'].iloc[i]
transition = markov_df['lifecycle:transition'].iloc[i]

if(case_id == markov_df['case_id'].iloc[i-1] and containsComplete == False):
    if (transition != 'COMPLETE'):
        cases_to_drop.append(case_id)
        
#Actually dropping the cases:
for case in cases_to_drop:
    markov_df = markov_df[markov_df['case_id'] != case]
    
cases_to_drop

In [5]:
def creating_dict_for_next_step_stats (df : pd.DataFrame, concept_name : str) -> dict:
    '''For an input action checks for all the possible next actions and counts their occurence'''
    
    thisdf = df.reset_index().copy()
    dic_occurrence = {}
    dic_total_time = {}
    ids = list(thisdf['case_id']) + ['editor: last id'] #Otherwise we check i+1-th position that does not exist
    times = list(thisdf['time:time_between']) + [pd.Timedelta(0)] #Otherwise we check i+1-th position that does not exist
    names = thisdf['concept:name']
    df_concept = thisdf[names == concept_name]
    
    for i, row in df_concept.iterrows():
        
        if (ids[i] == ids[i+1]): #an instance of the same case
            
            if (names[i+1] not in dic_occurrence):
                dic_occurrence[names[i+1]] = 1
                dic_total_time[names[i+1]] = times[i+1]
            else:
                dic_occurrence[names[i+1]] += 1
                dic_total_time[names[i+1]] += times[i+1]
                
        else: #the last instance of the case
            
            if ('editor: close_case' not in dic_occurrence):
                dic_occurrence['editor: close_case'] = 1
                dic_total_time['editor: close_case'] = times[i+1]
            else:
                dic_occurrence['editor: close_case'] += 1
                dic_total_time['editor: close_case'] += times[i+1]
    
    #Compute average time
    dic_avg_time = {}
    
    for key in dic_total_time:
        dic_avg_time[key] = dic_total_time[key] / dic_occurrence[key]
        
    return(dic_occurrence, dic_avg_time)

In [None]:
testDic = creating_dict_for_next_step_stats(markov_df, 'A_PARTLYSUBMITTED')[0]

print(testDic)
for test in testDic:
    print(test, testDic[test])

In [42]:
names = markov_df['concept:name'].unique().tolist() + ['editor: close_case']
shape = len(names)

transitionMatrix = np.zeros((shape, shape))

for name in names:
    nextSteps = creating_dict_for_next_step_stats(markov_df, name)[0]
    sumOfSteps = sum(nextSteps.values())
    
    for step in nextSteps:
        x = names.index(name)
        y = names.index(step)
        
        probability = nextSteps[step] / sumOfSteps
        
        transitionMatrix[x, y] = probability

#transitionMatrix = np.delete(transitionMatrix, shape-1, 0) #Deleting the editor: close_case row
#transitionMatrix = np.delete(transitionMatrix, shape-1, 1) #Deleting the editor: close_case column

In [43]:
print('Names: ', names, '\n\nMatrix: \n', transitionMatrix)

Names:  ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_ACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_SENT_BACK', 'W_Valideren aanvraag', 'A_REGISTERED', 'A_APPROVED', 'O_ACCEPTED', 'A_ACTIVATED', 'O_CANCELLED', 'W_Wijzigen contractgegevens', 'A_DECLINED', 'A_CANCELLED', 'W_Afhandelen leads', 'O_DECLINED', 'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude', 'editor: close_case'] 

Matrix: 
 [[0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.37 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.26 0.   0.36 0.   0.   0.01 0.  ]
 [0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.69 0.09 0.   0.   0.   0.   0.09 0.   0.   0.   0.
  0.   0.   0.   0.   0.02 0.02 0.05 0.   0.   0.   0.04]
 [0.   0.  

In [44]:
columnSum = 0

for row in transitionMatrix:
    for cell in row:
        columnSum = cell + columnSum
    print(columnSum)
    columnSum = 0

1.0
1.0
1.0
1.0
0.99
1.0
1.0
1.0
1.0
0.99
1.0
1.0000000000000002
1.01
0.99
0.99
0.9999999999999999
1.0
1.0
1.01
1.0
0.98
0.9900000000000001
0.99
1.01
0.0


In [9]:
#Random Walk - Simulation

LONGEST_TRACE = max(markov_df['step_number'])

def walk(n = LONGEST_TRACE+1):
    
    walk = []
    current = names[0]

    walk.append(current)

    while (True):

        current = np.random.choice(a = names, p = transitionMatrix[names.index(current)])

        walk.append(current)
        
        if (current == 'editor: close_case' or len(walk) == n):
            break    
        
    return walk


#walk()

def predictNext(current):
    
    nextEvent = np.random.choice(a = names, p = transitionMatrix[names.index(current)])
    
    return nextEvent

predictNext('W_Completeren aanvraag')

'W_Completeren aanvraag'

In [10]:
markov_df.tail()

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,time:time_between
262195,13086,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2,23,00:00:00.624000
262196,13086,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+01:00,2,23,00:00:43.864000
262197,13086,3,11169.0,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+01:00,3,9,09:34:45.449000
262198,13086,4,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+01:00,3,9,00:00:50.382000
262199,13086,5,11169.0,COMPLETE,W_Afhandelen leads,2012-03-01 09:27:41.325000+01:00,3,9,00:00:04.207000


In [45]:
df_prediction = markov_df.copy()

nextEvents = []

for i in range(df_prediction.shape[0]):
    nextEvents.append(predictNext(df_prediction['concept:name'][i]))

df_prediction['Prediction'] = nextEvents

#Prepare the next event column
df_prediction['next_event'] = df_prediction['concept:name']
df_prediction.loc[df_prediction['step_number'] == 0, 'next_event'] = 'editor: close_case'
df_prediction['next_event'] = df_prediction['next_event'].shift(-1)
df_prediction.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

df_prediction

ValueError: probabilities do not sum to 1

In [12]:
accuracy = round((len(df_prediction[df_prediction['next_event'] == df_prediction['Prediction']])/len(df_prediction))*100, 1)
print('accuracy: ' + str(accuracy) + '%')

accuracy: 50.7%


In [41]:
#shows which predictions are given too much (>1) and which too little (<1)
predicted_correct_events = Counter(list(df_prediction[df_prediction['next_event'] == df_prediction['Prediction']]['Prediction']))
predicted_events = Counter(list(df_prediction['Prediction']))
true_events = Counter(list(df_prediction['next_event']))

for i in predicted_events:
    if i != i:
        break
    print(i)
    per = round(predicted_events[i]/true_events[i]*100, 1)
    print(str(per) + '%')
    if per > 100:
        print('given ' + str(round(per-100, 1)) + '% too much')
    
    elif per < 100:
        print('given ' + str(round(100-per, 1)) + '% too little')
    
    else:
        print('given exactly the right amount')
   
    print('')

A_PARTLYSUBMITTED
100.0%
given exactly the right amount

W_Afhandelen leads
100.6%
given 0.6% too much

W_Completeren aanvraag
99.8%
given 0.2% too little

W_Nabellen offertes
100.0%
given exactly the right amount

O_SELECTED
100.1%
given 0.1% too much

A_FINALIZED
99.7%
given 0.3% too little

O_SENT
100.0%
given exactly the right amount

W_Valideren aanvraag
99.6%
given 0.4% too little

A_ACTIVATED
99.6%
given 0.4% too little

A_REGISTERED
100.8%
given 0.8% too much

A_APPROVED
100.7%
given 0.7% too much

O_ACCEPTED
98.2%
given 1.8% too little

W_Nabellen incomplete dossiers
100.0%
given exactly the right amount

O_CREATED
99.3%
given 0.7% too little

O_SENT_BACK
100.0%
given exactly the right amount

editor: close_case
100.9%
given 0.9% too much

A_PREACCEPTED
99.9%
given 0.1% too little

O_CANCELLED
100.9%
given 0.9% too much

A_CANCELLED
100.7%
given 0.7% too much

W_Beoordelen fraude
98.0%
given 2.0% too little

A_ACCEPTED
99.9%
given 0.1% too little

A_DECLINED
100.7%
given 0.7% 

In [13]:
def monteCarlo(n):
    
    # Creates a list containing n lists (number of simulations), each of 25 (names) items, all set to 0
    distribution = [[0 for x in range(n)] for y in range(LONGEST_TRACE+1)] 
    
    for i in range(len(distribution)):
        distribution[i] = list(filter((0).__ne__, distribution[i])) #Deletes the 0s in the list
    
    for x in range(n):  
        chain = walk()
        
        for i in range(len(chain)):
            distribution[i].append(chain[i])
    
    return distribution

In [14]:
results = monteCarlo(10000)

In [15]:
Counter(results[3])

Counter({'W_Afhandelen leads': 2353,
         'W_Completeren aanvraag': 4540,
         'A_PREACCEPTED': 570,
         'editor: close_case': 1690,
         'W_Valideren aanvraag': 134,
         'W_Nabellen incomplete dossiers': 14,
         'A_DECLINED': 500,
         'W_Beoordelen fraude': 60,
         'O_DECLINED': 138,
         'W_Nabellen offertes': 1})

In [36]:
nextEvents2 = []
df_monteCarlo = markov_df.copy()

for i in range(df_monteCarlo.shape[0]-1):
    
    currentStep = df_monteCarlo['step_number'][i+1]
    
    counter = Counter(results[currentStep])
    counterKeys = list(counter.keys())
    counterValues = list(counter.values())
    
    probability = [float(x)/sum(counterValues) for x in counterValues]
    
    nextEvents2.append(np.random.choice(a = counterKeys, p = probability))

    
lastEventStep = df_monteCarlo['step_number'].iloc[-1]
counter = Counter(results[lastEventStep+1])
counterKeys, counterValues = list(counter.keys()), list(counter.values())
probs = [float(x)/sum(counterValues) for x in counterValues]
nextEvents2.append(np.random.choice(a = counterKeys, p = probs))

df_monteCarlo['Prediction'] = nextEvents2


#Prepare the next event column
df_monteCarlo['next_event'] = df_monteCarlo['concept:name']
df_monteCarlo.loc[df_monteCarlo['step_number'] == 0, 'next_event'] = 'editor: close_case'
df_monteCarlo['next_event'] = df_monteCarlo['next_event'].shift(-1)
df_monteCarlo.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

In [37]:
df_monteCarlo.tail(30)

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,time:time_between,Prediction,next_event
262170,13083,2,112.0,COMPLETE,A_DECLINED,2012-02-29 23:29:21.958000+01:00,2,23,0 days 00:00:40.760000,A_SUBMITTED,editor: close_case
262171,13084,0,112.0,COMPLETE,A_SUBMITTED,2012-02-29 23:28:55.349000+01:00,2,23,0 days 00:00:00,A_PARTLYSUBMITTED,A_PARTLYSUBMITTED
262172,13084,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:28:55.479000+01:00,2,23,0 days 00:00:00.130000,W_Afhandelen leads,W_Afhandelen leads
262173,13084,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:29:06.452000+01:00,2,23,0 days 00:00:10.973000,O_DECLINED,W_Afhandelen leads
262174,13084,3,11169.0,START,W_Afhandelen leads,2012-03-01 09:25:39.409000+01:00,3,9,0 days 09:56:32.957000,W_Completeren aanvraag,A_DECLINED
262175,13084,4,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:28:18.168000+01:00,3,9,0 days 00:02:38.759000,W_Nabellen offertes,W_Afhandelen leads
262176,13084,5,11169.0,COMPLETE,W_Afhandelen leads,2012-03-01 09:28:21.228000+01:00,3,9,0 days 00:00:03.060000,A_SUBMITTED,editor: close_case
262177,13085,0,112.0,COMPLETE,A_SUBMITTED,2012-02-29 23:43:09.766000+01:00,2,23,0 days 00:00:00,A_PARTLYSUBMITTED,A_PARTLYSUBMITTED
262178,13085,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:43:09.899000+01:00,2,23,0 days 00:00:00.133000,A_DECLINED,W_Afhandelen leads
262179,13085,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:43:54.276000+01:00,2,23,0 days 00:00:44.377000,W_Completeren aanvraag,W_Afhandelen leads


In [38]:
accuracy = round((len(df_monteCarlo[df_monteCarlo['next_event'] == df_monteCarlo['Prediction']])/len(df_monteCarlo))*100, 1)
print('accuracy: ' + str(accuracy) + '%')
#this turned out to be a failure

accuracy: 22.5%


In [None]:
#NOT USED ANYMORE

chain = []

for i in range(len(results)):
    counter = Counter(results[i])
    
    sumOfValues = sum(counter.values())
    
    maxKey = max(counter, key = counter.get)
    
    prob = round(counter[maxKey]/sumOfValues, 2)

    chain.append((i, maxKey, prob))
    
    
chain