In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import utils
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt

In [2]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

In [3]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in aggregation method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        case_time_outcome = {'caseid':case, 'ts':np.mean(cumdurationlist),'outcome':outcome}
        activity_count = {x: list(group['activity']).count(x) for x in set(group['activity'])}
        resource_count = {x: list(group['resource']).count(x) for x in set(group['resource'])}

        case_time_outcome.update(activity_count)
        case_time_outcome.update(resource_count)
        dfk = pd.DataFrame.from_dict([case_time_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [4]:
def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        activitylist = list(group['activity'])
        resourcelist = list(group['resource'])
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}
        resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        case_outcome.update(resource_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [5]:
df = pd.read_csv('./data/bac_offline_small.csv')
df['START_DATE'] = pd.to_datetime(df['START_DATE'])
df = df.rename(columns={'REQUEST_ID':'caseid','ACTIVITY':'activity','START_DATE':'ts','CE_UO':'resource'})
df = df.loc[:,['caseid','activity','ts','resource','outcome']]
groups = df.groupby('caseid')
reconcatenate =[]
dfn = df

In [6]:
adf5 = aggregation_encoding(dfn,5)
idf5 = indexbase_encoding(dfn,5)

In [9]:
adf6 = aggregation_encoding(dfn,6)


In [10]:
adf7 = aggregation_encoding(dfn,7)


In [11]:
adf8 = aggregation_encoding(dfn,8)


In [12]:
adf9 = aggregation_encoding(dfn,9)


In [13]:
adf10 = aggregation_encoding(dfn,10)


In [21]:
adf2 = aggregation_encoding(dfn,2)
adf3 = aggregation_encoding(dfn,3)
adf4 = aggregation_encoding(dfn,4)
adf11 = aggregation_encoding(dfn,11)

In [23]:
adflist = [adf2,adf3,adf4,adf5,adf6,adf7,adf8,adf9,adf10,adf11]
prefixlist= list(range(2,12))
acc_dict= {}
for pos,prefix in enumerate(adflist):
    print('BAC aggregation encoding with prefix length %s'%(prefixlist[pos]))
    y = prefix['outcome']
    x =prefix.drop(columns=['outcome','caseid'],axis=1)
    x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

    # Deicision tree result
    dt = DecisionTreeClassifier(criterion='entropy').fit(x_train,y_train)
    y_pred = dt.predict(x_test)
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  accuracy_score(y_test,y_pred)

import pickle as pkl

x = list(acc_dict.keys())
y = [acc_dict[x] for x in acc_dict.keys()]
with open('./result/noneg_off_dt_acc.pkl','wb') as f:
    pkl.dump([x,y],f)

BAC aggregation encoding with prefix length 2
BAC aggregation encoding with prefix length 3
BAC aggregation encoding with prefix length 4
BAC aggregation encoding with prefix length 5
BAC aggregation encoding with prefix length 6
BAC aggregation encoding with prefix length 7
BAC aggregation encoding with prefix length 8
BAC aggregation encoding with prefix length 9
BAC aggregation encoding with prefix length 10
BAC aggregation encoding with prefix length 11


In [25]:
adflist = [adf2,adf3,adf4,adf5,adf6,adf7,adf8,adf9,adf10,adf11]
prefixlist= list(range(2,12))
acc_dict= {}
for pos,prefix in enumerate(adflist):
    print('BAC aggregation encoding with prefix length %s'%(prefixlist[pos]))
    y = prefix['outcome']
    x =prefix.drop(columns=['outcome','caseid'],axis=1)
    x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

    # Deicision tree result
    dt = RandomForestClassifier(criterion='entropy').fit(x_train,y_train)
    y_pred = dt.predict(x_test)
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  accuracy_score(y_test,y_pred)

import pickle as pkl

x = list(acc_dict.keys())
y = [acc_dict[x] for x in acc_dict.keys()]
print(acc_dict)
with open('./result/noneg_off_rf_acc.pkl','wb') as f:
    pkl.dump([x,y],f)

BAC aggregation encoding with prefix length 2
BAC aggregation encoding with prefix length 3
BAC aggregation encoding with prefix length 4
BAC aggregation encoding with prefix length 5
BAC aggregation encoding with prefix length 6
BAC aggregation encoding with prefix length 7
BAC aggregation encoding with prefix length 8
BAC aggregation encoding with prefix length 9
BAC aggregation encoding with prefix length 10
BAC aggregation encoding with prefix length 11
{'prefix_2': 0.9077123050259965, 'prefix_3': 0.9246187363834423, 'prefix_4': 0.9232446576537288, 'prefix_5': 0.9227251943301326, 'prefix_6': 0.9144905273937532, 'prefix_7': 0.86, 'prefix_8': 0.68, 'prefix_9': 0.9230769230769231, 'prefix_10': 0.6666666666666666, 'prefix_11': 1.0}
