In [149]:
from pm4py.objects.log.importer.xes import importer as xes_importer
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm, trange
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
def fix_time(time):
    '''Parse an ISO-8601 formatted string into a ``datetime.datetime`` object.'''
    parsed = datetime.datetime.fromisoformat(time)
    return parsed

def load_data(BPI = 'BPI.csv', BPI_attr = 'BPI_attr.csv',  data2012 = False):
    '''Read the event-log CSV and the case-attribute CSV into two DataFrames.

    BPI       : path of the event-log CSV; it must have a 'time:timestamp' column.
    BPI_attr  : path of the case-attribute CSV.
    data2012  : when true, the 'REG_DATE' column of the attribute file is parsed as well.

    Returns a tuple (events, attributes). The events frame gains the columns
    'time:weekday' and 'time:hour', derived from the parsed timestamps.
    '''
    events = pd.read_csv(BPI)
    attributes = pd.read_csv(BPI_attr)

    # Columns without a header are named 'Unnamed: N' by pandas; give them real names
    if 'Unnamed: 0' in events.columns:
        events = events.rename(columns={'Unnamed: 0': 'case_id', 'Unnamed: 1': 'step_number'})
        attributes = attributes.rename(columns={'Unnamed: 0': 'case_id'})

    events['time:timestamp'] = events['time:timestamp'].apply(fix_time)

    if data2012:
        attributes['REG_DATE'] = attributes['REG_DATE'].apply(fix_time)

    stamps = events['time:timestamp']
    events['time:weekday'] = [stamp.weekday() for stamp in stamps]
    events['time:hour'] = [stamp.hour for stamp in stamps]
    return (events, attributes)

def load_data_xes(data):
    '''Import an XES event log and return it.

    data : path of the .xes file, handed unchanged to ``xes_importer.apply``.

    Returns whatever ``xes_importer.apply`` produces for that file, so the
    imported log is available to the caller.
    '''
    BPI = xes_importer.apply(data)
    return BPI
    
# Load the 2012 event log together with its case attributes (REG_DATE is parsed too)
df, df_attr = load_data(
    BPI = 'Datasets/BPI_2012.csv',
    BPI_attr = 'Datasets/BPI_attr_2012.csv',
    data2012 = True,
)

In [3]:
# NOTE(review): this binds a second name to the same DataFrame object (no copy is made),
# so any column written through markov_df is also visible through df.
markov_df = df
markov_df

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11
...,...,...,...,...,...,...,...,...
262195,13086,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+01:00,2,23
262196,13086,2,112.0,SCHEDULE,W_Afhandelen leads,2012-02-29 23:52:01.287000+01:00,2,23
262197,13086,3,11169.0,START,W_Afhandelen leads,2012-03-01 09:26:46.736000+01:00,3,9
262198,13086,4,11169.0,COMPLETE,A_DECLINED,2012-03-01 09:27:37.118000+01:00,3,9


In [None]:
# Gap to the previous row's timestamp. Rows with step_number 0 are reset to zero; their
# diff would otherwise be taken against the row before them, which (assuming the rows are
# ordered by case and step) belongs to another case.
markov_df['time:time_between'] = markov_df['time:timestamp'].diff()
markov_df.loc[
    markov_df['step_number'].eq(0),
    'time:time_between',
] = pd.Timedelta(0)

In [6]:
#Drop the cases which do not contain complete cycles
#A case is kept when at least one of its rows has the 'COMPLETE' lifecycle transition.
#The check is evaluated per case with a groupby, so every case is judged only on its
#own rows (no state is carried over from one case to the next) and an empty frame
#simply yields an empty list.
is_complete = markov_df['lifecycle:transition'].eq('COMPLETE')
case_has_complete = is_complete.groupby(markov_df['case_id'], sort=False).any()

#Cases in order of first appearance for which no row was COMPLETE
cases_to_drop = case_has_complete[~case_has_complete].index.tolist()

#Actually dropping the cases (the frame is only rebuilt when there is something to remove):
if cases_to_drop:
    markov_df = markov_df[~markov_df['case_id'].isin(cases_to_drop)]

cases_to_drop

[]

In [9]:
def creating_dict_for_next_step_stats (df : pd.DataFrame, concept_name : str) -> tuple:
    '''For an input action, count every action that directly follows it and average the waiting time.

    Parameters
    ----------
    df : event log with the columns 'case_id', 'concept:name' and 'time:time_between'.
        Successors are found by comparing each row with the row right after it, so the
        rows of one case have to be contiguous and in execution order.
    concept_name : the action whose successors are collected.

    Returns
    -------
    (dic_occurrence, dic_avg_time)
        dic_occurrence maps each successor name to the number of times it followed
        concept_name inside the same case.
        dic_avg_time maps the same keys to the mean 'time:time_between' of those
        successor rows.
        When concept_name is the last row of its case, the successor is recorded under
        the key 'editor: close_case' with a waiting time of zero.
    '''
    # Plain lists are indexed by position, so the frame's own index does not matter
    case_ids = df['case_id'].tolist()
    step_names = df['concept:name'].tolist()
    gaps = df['time:time_between'].tolist()
    last_pos = len(step_names) - 1

    dic_occurrence = {}
    dic_total_time = {}

    for pos, step_name in enumerate(step_names):
        if step_name != concept_name:
            continue

        if pos < last_pos and case_ids[pos] == case_ids[pos + 1]:
            #an instance of the same case
            successor = step_names[pos + 1]
            waited = gaps[pos + 1]
        else:
            #the last instance of the case: closing takes no time; the gap stored on the
            #following row belongs to another case and must not be counted here
            successor = 'editor: close_case'
            waited = pd.Timedelta(0)

        if successor in dic_occurrence:
            dic_occurrence[successor] += 1
            dic_total_time[successor] += waited
        else:
            dic_occurrence[successor] = 1
            dic_total_time[successor] = waited

    #Compute average time
    dic_avg_time = {
        key: total / dic_occurrence[key] for key, total in dic_total_time.items()
    }

    return (dic_occurrence, dic_avg_time)

In [8]:
# Quick check: which actions follow 'A_PARTLYSUBMITTED', and how often
testDic = creating_dict_for_next_step_stats(markov_df, 'A_PARTLYSUBMITTED')[0]

print(testDic)
for successor, occurrences in testDic.items():
    print(successor, occurrences)

{'A_DECLINED': 3376, 'A_PREACCEPTED': 4063, 'W_Afhandelen leads': 4294, 'W_Beoordelen fraude': 62}
A_DECLINED 3376
A_PREACCEPTED 4063
W_Afhandelen leads 4294
W_Beoordelen fraude 62


In [64]:
# All observed actions plus the artificial end-of-case state
names = markov_df['concept:name'].unique().tolist() + ['editor: close_case']
shape = len(names)

# Row/column position of every state (the first occurrence wins, as with list.index)
state_position = {}
for position, state in enumerate(names):
    state_position.setdefault(state, position)

transitionMatrix = np.zeros((shape, shape))

# Each row holds the relative frequency of the successors observed for one state
for origin in names:
    successor_counts = creating_dict_for_next_step_stats(markov_df, origin)[0]
    total_successors = sum(successor_counts.values())

    for successor, count in successor_counts.items():
        transitionMatrix[state_position[origin], state_position[successor]] = count / total_successors

#transitionMatrix = np.delete(transitionMatrix, shape-1, 0) #Deleting the editor: close_case row
#transitionMatrix = np.delete(transitionMatrix, shape-1, 1) #Deleting the editor: close_case column

In [30]:
# Show the state labels (row/column order of the matrix) followed by the matrix itself
print('Names: ', names, '\n\nMatrix: \n', transitionMatrix)

Names:  ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_DECLINED', 'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_CANCELLED', 'W_Afhandelen leads', 'A_ACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_SENT_BACK', 'W_Valideren aanvraag', 'W_Nabellen incomplete dossiers', 'O_DECLINED', 'O_CANCELLED', 'O_ACCEPTED', 'A_APPROVED', 'A_REGISTERED', 'A_ACTIVATED', 'W_Beoordelen fraude', 'W_Wijzigen contractgegevens', 'editor: close_case'] 

Matrix: 
 [[0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.29 0.34 0.   0.   0.36 0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.  ]
 [0.   0.   0.   0.   0.14 0.   0.3  0.   0.   0.   0.   0.   0.   0.
  0.04 0.01 0.04 0.   0.   0.   0.   0.   0.01 0.   0.47]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.  

In [66]:
# Sanity check: print the total of every matrix row, accumulated cell by cell
for matrix_row in transitionMatrix:
    columnSum = 0
    for probability_cell in matrix_row:
        columnSum += probability_cell
    print(columnSum)

columnSum = 0

1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999998
1.0
1.0
0.9999999999999999
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0
1.0
0.0


In [171]:
#Random Walk - Simulation

# Largest step_number found in the event log.
# NOTE(review): step_number appears to start at 0, so this is presumably one less than the
# number of events in the longest trace - confirm that this is the intended value.
LONGEST_TRACE = max(markov_df['step_number'])

def walk(start = None, states = None, matrix = None, max_len = None):
    '''Simulate one random walk through the transition matrix.

    Parameters (all optional; the defaults reproduce the plain ``walk()`` call)
    ----------
    start   : state the walk begins in; defaults to the first entry of ``states``.
    states  : list of state labels in matrix row/column order; defaults to the
              notebook-level ``names``.
    matrix  : row-stochastic transition matrix; row i holds the probabilities of moving
              from ``states[i]`` to every state. Defaults to ``transitionMatrix``.
    max_len : the walk stops once it holds at least this many states; defaults to
              ``LONGEST_TRACE``.

    Returns
    -------
    list of str : the visited states, starting with ``start``. At least one transition is
    always sampled, and the walk ends as soon as 'editor: close_case' is drawn or the
    length cap is reached.
    '''
    if states is None:
        states = names
    if matrix is None:
        matrix = transitionMatrix
    if max_len is None:
        max_len = LONGEST_TRACE

    # Position of every label (first occurrence wins, as with list.index)
    position_of = {}
    for position, state in enumerate(states):
        position_of.setdefault(state, position)

    current = states[0] if start is None else start
    trace = [current]

    while True:
        # Draw the index of the next state and look the label up afterwards, so the
        # result holds the original labels instead of numpy string scalars
        next_position = np.random.choice(len(states), p = matrix[position_of[current]])
        current = states[next_position]
        trace.append(current)

        # '>=' (not '==') so that a cap below 2 still terminates the loop
        if current == 'editor: close_case' or len(trace) >= max_len:
            break

    return trace


# Example: one simulated trace
walk()

['A_SUBMITTED',
 'A_PARTLYSUBMITTED',
 'W_Afhandelen leads',
 'W_Afhandelen leads',
 'editor: close_case']

In [173]:
def monteCarlo(n, simulate = None, max_len = None):
    '''Run n random walks and group the visited states by their position in the walk.

    Parameters
    ----------
    n        : number of walks to simulate.
    simulate : optional zero-argument callable returning one walk (a list of states);
               defaults to the notebook-level ``walk``.
    max_len  : optional number of step positions to prepare up front; defaults to
               ``LONGEST_TRACE``.

    Returns
    -------
    list of lists : entry i holds the state found at position i of every walk that was
    at least i + 1 states long. Positions that no walk reached stay empty.
    '''
    if simulate is None:
        simulate = walk
    if max_len is None:
        max_len = LONGEST_TRACE

    # One (initially empty) bucket per step position
    distribution = [[] for _ in range(max_len)]

    for _ in range(n):
        chain = simulate()

        # A walk may be longer than the prepared number of positions; grow on demand
        while len(distribution) < len(chain):
            distribution.append([])

        for position, state in enumerate(chain):
            distribution[position].append(state)

    return distribution
            
            
# NOTE(review): the return value is not stored, so the notebook renders every simulated
# state of all 10000 walks as the cell output.
monteCarlo(10000)

[['A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED',
  'A_SUBMITTED

In [191]:
# Store the per-position states of 10000 simulated walks
results = monteCarlo(10000)

In [199]:
# For every step position: the most frequent simulated state and its share of the walks
# that reached that position, as (position, state, share) tuples
chain = []

for step, states_at_step in enumerate(results):
    # A position that no walk reached has nothing to summarise; it would otherwise
    # raise on the empty max() and divide by zero
    if not states_at_step:
        continue

    # most_common keeps the first-encountered state when counts are tied
    maxKey, maxCount = Counter(states_at_step).most_common(1)[0]

    prob = round(maxCount / len(states_at_step), 2)

    chain.append((step, maxKey, prob))


chain

[(0, 'A_SUBMITTED', 1.0),
 (1, 'A_PARTLYSUBMITTED', 1.0),
 (2, 'W_Afhandelen leads', 0.36),
 (3, 'W_Completeren aanvraag', 0.44),
 (4, 'W_Completeren aanvraag', 0.48),
 (5, 'W_Completeren aanvraag', 0.47),
 (6, 'W_Completeren aanvraag', 0.41),
 (7, 'W_Completeren aanvraag', 0.37),
 (8, 'W_Completeren aanvraag', 0.32),
 (9, 'W_Completeren aanvraag', 0.28),
 (10, 'W_Nabellen offertes', 0.26),
 (11, 'W_Nabellen offertes', 0.28),
 (12, 'W_Nabellen offertes', 0.29),
 (13, 'W_Nabellen offertes', 0.3),
 (14, 'W_Nabellen offertes', 0.31),
 (15, 'W_Nabellen offertes', 0.3),
 (16, 'W_Nabellen offertes', 0.28),
 (17, 'W_Nabellen offertes', 0.28),
 (18, 'W_Nabellen offertes', 0.27),
 (19, 'W_Nabellen offertes', 0.27),
 (20, 'W_Nabellen offertes', 0.28),
 (21, 'W_Nabellen offertes', 0.26),
 (22, 'W_Nabellen offertes', 0.26),
 (23, 'W_Nabellen offertes', 0.27),
 (24, 'W_Nabellen offertes', 0.27),
 (25, 'W_Nabellen offertes', 0.27),
 (26, 'W_Nabellen offertes', 0.25),
 (27, 'W_Nabellen offertes', 0.2