In [35]:
import pandas as pd
import numpy as np
import datetime
import random
import time
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm, trange
import copy

In [36]:
def fix_time(time):
    return (datetime.datetime.fromisoformat(time))

def load_data(BPI = 'BPI.csv', BPI_attr = 'BPI_attr.csv',  data2012 = False, sample=False):
    df_BPI = pd.read_csv(BPI)
    df_BPI_attr = pd.read_csv(BPI_attr)
    
    if 'Unnamed: 0' in df_BPI.columns:
        df_BPI = df_BPI.rename(columns={'Unnamed: 0': 'case_id', 'Unnamed: 1': 'step_number'})
        df_BPI_attr = df_BPI_attr.rename(columns={'Unnamed: 0': 'case_id'})
    
    df_BPI['time:timestamp'] = df_BPI['time:timestamp'].apply(fix_time)

    df_BPI_attr['REG_DATE'] = df_BPI_attr['REG_DATE'].apply(fix_time)
    
    df_BPI['time:weekday'] = [x.weekday() for x in df_BPI['time:timestamp']]
    df_BPI['time:hour'] = [x.hour for x in df_BPI['time:timestamp']]
    
    if(sample):
        df_BPI, df_BPI_attr = df_BPI[:50000], df_BPI_attr[:2359]
    
    return (df_BPI, df_BPI_attr)
    return (df_BPI, df_BPI_attr)

def load_data_xes(data):
    BPI = xes_importer.apply(data)
    
df, df_attr = load_data(BPI = 'Datasets\BPI_2012.csv', BPI_attr = 'Datasets\BPI_attr_2012.csv', sample=False)

In [49]:
df_markov = copy.deepcopy(df)
df_markov['combined_names'] = df_markov['lifecycle:transition'] + ' + ' + df_markov['concept:name']
df_markov

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,combined_names
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0,COMPLETE + A_SUBMITTED
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0,COMPLETE + A_PARTLYSUBMITTED
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0,COMPLETE + A_PREACCEPTED
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0,SCHEDULE + W_Completeren aanvraag
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11,START + W_Completeren aanvraag
5,0,5,10862.0,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,5,11,COMPLETE + A_ACCEPTED
6,0,6,10862.0,COMPLETE,O_SELECTED,2011-10-01 11:45:09.243000+02:00,5,11,COMPLETE + O_SELECTED
7,0,7,10862.0,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,5,11,COMPLETE + A_FINALIZED
8,0,8,10862.0,COMPLETE,O_CREATED,2011-10-01 11:45:11.197000+02:00,5,11,COMPLETE + O_CREATED
9,0,9,10862.0,COMPLETE,O_SENT,2011-10-01 11:45:11.380000+02:00,5,11,COMPLETE + O_SENT


In [54]:
chain = {}
shape = df_markov.shape[0]
conceptNames = ['editor: first event'] + list(df_markov['combined_names'])
ids = ['editor: first id'] + list(df_markov['case_id']) #Otherwise we check i+1-th position that does not exist


for i, key1 in enumerate(conceptNames):
    
    if shape > i + 2:
        key2 = conceptNames[i+1]
        event = conceptNames[i+2]

        
        if (ids[i] == ids[i+2]): 
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = [event] 
            else:
                chain[(key1, key2)].append(event)
                
        elif (event == 'COMPLETE + A_SUBMITTED'): 
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = ['editor: close_case']
            else:
                chain[(key1, key2)].append('editor: close_case')
        
        elif (key2 == 'COMPLETE + A_SUBMITTED'):
            
            if ((key1, key2) not in chain):
                chain[(key1, key2)] = ['COMPLETE + A_PARTLYSUBMITTED']
            else:
                chain[(key1, key2)].append('COMPLETE + A_PARTLYSUBMITTED')
        

            
print('Chain size: {0} distinct event pairs.'.format(len(chain)))
chain

Chain size: 202 distinct event pairs.


{('editor: first event',
  'COMPLETE + A_SUBMITTED'): ['COMPLETE + A_PARTLYSUBMITTED'],
 ('COMPLETE + A_SUBMITTED',
  'COMPLETE + A_PARTLYSUBMITTED'): ['COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_DECLINED',
  'COMPLETE + A_DECLINED',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_DECLINED',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'SCHEDULE + W_Afhandelen leads',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE + A_DECLINED',
  'COMPLETE + A_PREACCEPTED',
  'SCHEDULE + W_Afhandelen leads',
  'COMPLETE + A_PREACCEPTED',
  'COMPLETE 

In [56]:
#creates a list of all concepts and a chain with only the percent chances
concepts = list(df_markov['combined_names'].unique()) + ['editor: close_case']

chain2 = {}
for i in chain:
    lister = []
    for j in concepts:
        lister.append(chain[i].count(j)/len(chain[i]))
    
    chain2[i] = np.array(lister)

In [57]:
timer = time.perf_counter()

df_2markov = copy.deepcopy(df_markov)
df_2markov['combined_names S1'] = df_2markov['combined_names'].shift(1) 
df_2markov.loc[0, 'combined_names S1'] = 'editor: first event'

def apply_chain(var):
    chances = chain2[(var[0], var[1])]
    cumulative_chances = chances.cumsum()
    random_number = random.uniform(0, 1)
    result = np.expand_dims(cumulative_chances, 1) >= random_number
    result_number = result.argmax()   
    return np.asarray(concepts)[result_number]

df_2prediction = copy.deepcopy(df_markov)
df_2prediction['Prediction'] =  df_2markov[['combined_names S1','combined_names']].apply(apply_chain, axis=1)

print(timer-time.perf_counter())

-11.122231700000157


In [58]:
#Prepare the next event column
df_2prediction['next_event'] = df_2prediction['combined_names']
df_2prediction.loc[df_2prediction['step_number'] == 0, 'next_event'] = 'editor: close_case'
df_2prediction['next_event'] = df_2prediction['next_event'].shift(-1)
df_2prediction.loc[len(df) - 1, 'next_event'] = 'editor: close_case'

accuracy = round((len(df_2prediction[df_2prediction['next_event'] == df_2prediction['Prediction']])/len(df_2prediction))*100, 1)
print('accuracy: ' + str(accuracy) + '%')

accuracy: 76.2%


In [59]:
df_2prediction

Unnamed: 0,case_id,step_number,org:resource,lifecycle:transition,concept:name,time:timestamp,time:weekday,time:hour,combined_names,Prediction,next_event
0,0,0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,5,0,COMPLETE + A_SUBMITTED,COMPLETE + A_PARTLYSUBMITTED,COMPLETE + A_PARTLYSUBMITTED
1,0,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,5,0,COMPLETE + A_PARTLYSUBMITTED,COMPLETE + A_PREACCEPTED,COMPLETE + A_PREACCEPTED
2,0,2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,5,0,COMPLETE + A_PREACCEPTED,SCHEDULE + W_Completeren aanvraag,SCHEDULE + W_Completeren aanvraag
3,0,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,5,0,SCHEDULE + W_Completeren aanvraag,START + W_Completeren aanvraag,START + W_Completeren aanvraag
4,0,4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,5,11,START + W_Completeren aanvraag,COMPLETE + W_Completeren aanvraag,COMPLETE + A_ACCEPTED
5,0,5,10862.0,COMPLETE,A_ACCEPTED,2011-10-01 11:42:43.308000+02:00,5,11,COMPLETE + A_ACCEPTED,COMPLETE + A_FINALIZED,COMPLETE + O_SELECTED
6,0,6,10862.0,COMPLETE,O_SELECTED,2011-10-01 11:45:09.243000+02:00,5,11,COMPLETE + O_SELECTED,COMPLETE + A_FINALIZED,COMPLETE + A_FINALIZED
7,0,7,10862.0,COMPLETE,A_FINALIZED,2011-10-01 11:45:09.243000+02:00,5,11,COMPLETE + A_FINALIZED,COMPLETE + O_CREATED,COMPLETE + O_CREATED
8,0,8,10862.0,COMPLETE,O_CREATED,2011-10-01 11:45:11.197000+02:00,5,11,COMPLETE + O_CREATED,COMPLETE + O_SENT,COMPLETE + O_SENT
9,0,9,10862.0,COMPLETE,O_SENT,2011-10-01 11:45:11.380000+02:00,5,11,COMPLETE + O_SENT,SCHEDULE + W_Nabellen offertes,SCHEDULE + W_Nabellen offertes


In [60]:
#shows which predictions are given too much (>1) and which too little (<1)
predicted_correct_events = Counter(list(df_2prediction[df_2prediction['next_event'] == df_2prediction['Prediction']]['Prediction']))
predicted_events = Counter(list(df_2prediction['Prediction']))
true_events = Counter(list(df_2prediction['next_event']))

for i in predicted_events:
    if i != i:
        break
    print(i)
    per = round(predicted_events[i]/true_events[i]*100, 1)
    print(str(per) + '%')
    if per > 100:
        print('given ' + str(round(per-100, 1)) + '% too much')
    
    elif per < 100:
        print('given ' + str(round(100-per, 1)) + '% too little')
    
    else:
        print('given exactly the right amount')
   
    print('')

COMPLETE + A_PARTLYSUBMITTED
100.0%
given exactly the right amount

COMPLETE + A_PREACCEPTED
99.1%
given 0.9% too little

SCHEDULE + W_Completeren aanvraag
100.0%
given exactly the right amount

START + W_Completeren aanvraag
100.0%
given exactly the right amount

COMPLETE + W_Completeren aanvraag
100.1%
given 0.1% too much

COMPLETE + A_FINALIZED
99.4%
given 0.6% too little

COMPLETE + O_CREATED
100.0%
given exactly the right amount

COMPLETE + O_SENT
100.0%
given exactly the right amount

SCHEDULE + W_Nabellen offertes
100.3%
given 0.3% too much

START + W_Nabellen offertes
99.9%
given 0.1% too little

COMPLETE + O_CANCELLED
99.9%
given 0.1% too little

COMPLETE + W_Nabellen offertes
100.1%
given 0.1% too much

SCHEDULE + W_Valideren aanvraag
100.6%
given 0.6% too much

START + W_Valideren aanvraag
100.1%
given 0.1% too much

COMPLETE + W_Valideren aanvraag
100.2%
given 0.2% too much

COMPLETE + A_APPROVED
98.6%
given 1.4% too little

COMPLETE + A_ACTIVATED
98.5%
given 1.5% too littl