In [1]:
import json
import os
import pickle as pkl
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import utils

In [2]:
def filter_by_prefix(df,prefix):
    '''
    Keep the first `prefix` events of every case longer than `prefix`
    
    Cases with `prefix` events or fewer are dropped entirely, so every
    retained case contributes exactly `prefix` rows.
    
    Parameters
    ----------
    df : pandas dataframe
        Event log with at least the columns 'caseid' and 'ts'.
        The frame passed in is left untouched.
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases. 'ts' is converted to datetime and
    the index restarts at 0 inside every case. When no case is longer than
    `prefix`, an empty dataframe with the same columns is returned.
    '''
    # assign() builds a new frame, so the caller's 'ts' column is not overwritten.
    events = df.assign(ts=pd.to_datetime(df['ts']))
    sliced_cases = []
    for _, group in events.groupby('caseid'):
        if len(group) > prefix:
            group = group.reset_index(drop=True)
            # After the reset the labels are 0..n-1, so the inclusive label
            # slice below keeps exactly `prefix` rows.
            sliced_cases.append(group.loc[:prefix-1,:])
    if not sliced_cases:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame that still carries the expected columns instead.
        return events.iloc[0:0].copy()
    return pd.concat(sliced_cases)

In [24]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding
    
    Every case is reduced to a single row: the mean elapsed time of its
    prefix events plus one frequency column per activity and, when the log
    has a 'resource' column, one frequency column per resource.
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction. Needs the
        columns 'caseid', 'activity', 'ts' and 'outcome'; 'resource' is
        optional.
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in aggregation method, one row per case with a
    unique 0..n-1 index. Columns are 'caseid', 'ts' (mean number of seconds
    between the first event of the case and each prefix event), 'outcome',
    then one count column per activity / resource value in order of first
    appearance. Counts missing for a case are filled with 0.
    '''
    df = filter_by_prefix(df,prefix)
    df = df.assign(ts=pd.to_datetime(df['ts']))
    has_resource = 'resource' in df.columns

    records = []
    for case, group in df.groupby('caseid'):
        # Materialise the timestamps once instead of once per event.
        timestamps = list(group['ts'])
        start = timestamps[0]
        elapsed = [(stamp - start).total_seconds() for stamp in timestamps]

        # A case is expected to carry a single outcome value; if it carries
        # several, which one set.pop() returns is arbitrary.
        outcome = set(group['outcome']).pop()
        record = {'caseid':case, 'ts':np.mean(elapsed), 'outcome':outcome}

        # Counter keeps first-seen order, which makes the column order
        # deterministic (iterating a set of strings is not).
        # NOTE(review): counts are keyed by the raw value, so an activity and
        # a resource sharing a name (or named 'caseid'/'ts'/'outcome') would
        # overwrite each other here.
        record.update(Counter(list(group['activity'])))
        if has_resource:
            record.update(Counter(list(group['resource'])))
        records.append(record)

    if not records:
        return pd.DataFrame(columns=['caseid','ts','outcome'])
    # One DataFrame from all records avoids concatenating one-row frames.
    return pd.DataFrame(records).fillna(0)

In [29]:
def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Every case is reduced to a single row in which each prefix position
    gets its own columns: the elapsed time at that position and one-hot
    flags naming the activity (and resource, if present) at that position.
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction. Needs the
        columns 'caseid', 'activity', 'ts' and 'outcome'; 'resource' is
        optional.
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method, one row per case with a
    unique 0..n-1 index. Columns are 'caseid', 'outcome',
    'resource_<pos>_<value>' (only when the log has resources),
    'Cumduration_<pos>' (seconds since the first event of the case) and
    'activity_<pos>_<value>'; positions start at 1. Flags missing for a
    case are filled with 0.
    '''
    df = filter_by_prefix(df,prefix)
    df = df.assign(ts=pd.to_datetime(df['ts']))
    has_resource = 'resource' in df.columns

    records = []
    for case, group in df.groupby('caseid'):
        # Materialise the timestamps once instead of once per event.
        timestamps = list(group['ts'])
        start = timestamps[0]

        # A case is expected to carry a single outcome value; if it carries
        # several, which one set.pop() returns is arbitrary.
        record = {'caseid':case, 'outcome':set(group['outcome']).pop()}

        if has_resource:
            for pos, resource in enumerate(list(group['resource']), start=1):
                record['resource_'+str(pos)+'_'+str(resource)] = 1

        for pos, stamp in enumerate(timestamps, start=1):
            record['Cumduration_'+str(pos)] = (stamp - start).total_seconds()

        # str() mirrors the resource columns so non-string activity labels
        # (e.g. integer codes) do not break the concatenation.
        for pos, activity in enumerate(list(group['activity']), start=1):
            record['activity_'+str(pos)+'_'+str(activity)] = 1

        records.append(record)

    if not records:
        return pd.DataFrame(columns=['caseid','outcome'])
    # One DataFrame from all records avoids concatenating one-row frames.
    return pd.DataFrame(records).fillna(0)

In [35]:
# Pick the dataset and read its settings from the shared parameter file.
dataset_label = 'bpic17'
with open('./dataset_parameters.json','r') as json_file:
    parameters = json.load(json_file)[dataset_label]
# key_pair is handed to DataFrame.rename below, i.e. it maps the CSV's own
# column names onto the names used in this notebook.
key_pair = parameters['key_pair']
maximum_prefix = parameters['maximum_prefix']

dataset_loc = './data/' +dataset_label +'.csv'
df = pd.read_csv(dataset_loc)
df = df.rename(columns=key_pair)
# 'resource' is optional; keep only the columns the encoders work with.
if 'resource' in df.columns.values:
    df = df.loc[:,['caseid','activity','ts','resource','outcome']]
else:
    df = df.loc[:,['caseid','activity','ts','outcome']]

# exist_ok tolerates a directory left by an earlier run while still surfacing
# real failures (e.g. permissions) that a bare `except: pass` would hide.
os.makedirs('./result/%s'%(dataset_label), exist_ok=True)

In [36]:
# Label every case with the outcome of its last event and, for cases that
# contain a True outcome, cut the trace right after the first such event.
groups = df.groupby('caseid')
concating = []
for _, group in groups:
    outcomelist = list(group['outcome'])
    # Taken from the full trace, before any truncation below.
    outcome = outcomelist[-1]
    group = group.reset_index(drop=True)
    if True in outcomelist:
        # Label slice is inclusive, so the first True event itself is kept.
        group = group.loc[:outcomelist.index(True),:]
    # assign() returns a new frame instead of writing into the .loc slice,
    # which avoids chained-assignment warnings.
    group = group.assign(outcome=outcome)
    concating.append(group)

dfn = pd.concat(concating)

In [37]:
# Build one index-based encoding of the log per prefix length. The range's
# upper bound is exclusive, so lengths 2 .. maximum_prefix-1 are encoded.
prefix_length = maximum_prefix
idslist = []
for length in range(2, prefix_length):
    print('Progressing length: %s'%(length))
    encoded = indexbase_encoding(dfn, length)
    idslist.append(encoded)

Progressing length: 2
Progressing length: 3
Progressing length: 4
Progressing length: 5
Progressing length: 6
Progressing length: 7
Progressing length: 8
Progressing length: 9
Progressing length: 10
Progressing length: 11
Progressing length: 12
Progressing length: 13
Progressing length: 14


In [38]:
# Mean hold-out accuracy of a decision tree for every encoded prefix length.
prefixlist= list(range(2,prefix_length))
acc_dict= {}
print('Decision tree')

for pos,prefix in enumerate(idslist):
    y = prefix['outcome']
    x = prefix.drop(columns=['outcome','caseid'])
    acc_list = []
    for i in range(10):
        # Seeding split and model with the repetition index gives ten
        # different 70/30 splits that are identical on every re-run.
        x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3, random_state=i)

        # Decision tree result
        dt = DecisionTreeClassifier(criterion='entropy', random_state=i).fit(x_train,y_train)
        y_pred = dt.predict(x_test)
        acc_list.append(accuracy_score(y_test,y_pred))
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  np.mean(acc_list)

# NOTE(review): longer prefixes keep only the cases longer than the prefix,
# so check the sample sizes before trusting the very high accuracies there.
print(acc_dict)

# Persist [labels, accuracies]; x and y are reused for these two lists.
x = list(acc_dict.keys())
y = list(acc_dict.values())
with open('./result/%s/off_dt_acc.pkl'%(dataset_label),'wb') as f:
    pkl.dump([x,y],f)

Decision tree
{'prefix_2': 0.6890070921985816, 'prefix_3': 0.6132978723404255, 'prefix_4': 0.6387211367673178, 'prefix_5': 0.646797153024911, 'prefix_6': 0.632620320855615, 'prefix_7': 0.6417112299465241, 'prefix_8': 0.6501792114695341, 'prefix_9': 0.6363471971066909, 'prefix_10': 0.8155109489051096, 'prefix_11': 0.9706443914081146, 'prefix_12': 0.9537117903930131, 'prefix_13': 0.9578431372549019, 'prefix_14': 1.0}


In [39]:
# Mean hold-out accuracy of a random forest for every encoded prefix length.
prefixlist= list(range(2,prefix_length))
acc_dict= {}
print('Random forest')
for pos,prefix in enumerate(idslist):
    y = prefix['outcome']
    x = prefix.drop(columns=['outcome','caseid'])
    acc_list = []

    for i in range(10):
        # Seeding split and model with the repetition index gives ten
        # different 70/30 splits that are identical on every re-run.
        x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3, random_state=i)

        # Random forest result
        rf = RandomForestClassifier(criterion='entropy', random_state=i).fit(x_train,y_train)
        y_pred = rf.predict(x_test)
        acc_list.append(accuracy_score(y_test,y_pred))
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  np.mean(acc_list)
# NOTE(review): longer prefixes keep only the cases longer than the prefix,
# so check the sample sizes before trusting the very high accuracies there.
print(acc_dict)

# Persist [labels, accuracies]; x and y are reused for these two lists.
x = list(acc_dict.keys())
y = list(acc_dict.values())
with open('./result/%s/off_rf_acc.pkl'%(dataset_label),'wb') as f:
    pkl.dump([x,y],f)

Random forest
{'prefix_2': 0.6914893617021276, 'prefix_3': 0.6567375886524823, 'prefix_4': 0.6937833037300177, 'prefix_5': 0.7083629893238435, 'prefix_6': 0.7115864527629233, 'prefix_7': 0.7135472370766489, 'prefix_8': 0.7229390681003585, 'prefix_9': 0.7103074141048824, 'prefix_10': 0.8496350364963504, 'prefix_11': 0.9744630071599045, 'prefix_12': 0.9694323144104805, 'prefix_13': 0.969607843137255, 'prefix_14': 1.0}
