In [42]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import utils
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt

In [43]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

In [44]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in aggregation method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        case_time_outcome = {'caseid':case, 'ts':np.mean(cumdurationlist),'outcome':outcome}
        activity_count = {x: list(group['activity']).count(x) for x in set(group['activity'])}
        resource_count = {x: list(group['resource']).count(x) for x in set(group['resource'])}

        case_time_outcome.update(activity_count)
        case_time_outcome.update(resource_count)
        dfk = pd.DataFrame.from_dict([case_time_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [45]:
def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        activitylist = list(group['activity'])
        resourcelist = list(group['resource'])
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}
        resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        case_outcome.update(resource_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [46]:
df = pd.read_csv('./data/bac_online_small.csv')
df['START_DATE'] = pd.to_datetime(df['START_DATE'])
df = df.rename(columns={'REQUEST_ID':'caseid','ACTIVITY':'activity','START_DATE':'ts','CE_UO':'resource'})
df = df.loc[:,['caseid','activity','ts','resource','outcome']]

In [47]:
groups = df.groupby('caseid')
concating = []
for _, group in groups:
    outcomelist = list(group['outcome'])
    outcome = outcomelist[-1]
    group = group.reset_index(drop=True)
    if True in outcomelist:
        group = group.loc[:outcomelist.index(True),:]
    group['outcome'] = outcome
    concating.append(group)

dfn = pd.concat(concating)

In [48]:
adf10 = aggregation_encoding(dfn,10)
idf10 = indexbase_encoding(dfn,10)

In [49]:
adf5 = aggregation_encoding(dfn,5)
idf5 = indexbase_encoding(dfn,5)

In [50]:
adf6 = aggregation_encoding(dfn,6)
idf6 = indexbase_encoding(dfn,6)

In [51]:
adf7 = aggregation_encoding(dfn,7)
idf7 = indexbase_encoding(dfn,7)

In [52]:
adf8 = aggregation_encoding(dfn,8)
idf8 = indexbase_encoding(dfn,8)

In [53]:
adf9 = aggregation_encoding(dfn,9)
idf9 = indexbase_encoding(dfn,9)

In [54]:
print('BAC aggregation encoding with prefix length 5')
y = adf5['outcome']
x =adf5.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 5')
y = idf5['outcome']
x =idf5.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 5
Decision Tree
              precision    recall  f1-score   support

       False       0.93      0.98      0.95      1078
        True       0.46      0.18      0.26       102

    accuracy                           0.91      1180
   macro avg       0.69      0.58      0.60      1180
weighted avg       0.89      0.91      0.89      1180

BAC index encoding with prefix length 5
Decision Tree
              precision    recall  f1-score   support

       False       0.94      0.98      0.96      1094
        True       0.50      0.21      0.30        86

    accuracy                           0.93      1180
   macro avg       0.72      0.60      0.63      1180
weighted avg       0.91      0.93      0.91      1180



In [55]:
print('BAC aggregation encoding with prefix length 6')
y = adf6['outcome']
x =adf6.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 6')
y = idf6['outcome']
x =idf6.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 6
Decision Tree
              precision    recall  f1-score   support

       False       0.93      0.96      0.95       958
        True       0.46      0.32      0.37        95

    accuracy                           0.91      1053
   macro avg       0.70      0.64      0.66      1053
weighted avg       0.89      0.91      0.90      1053

BAC index encoding with prefix length 6
Decision Tree
              precision    recall  f1-score   support

       False       0.92      0.98      0.95       950
        True       0.58      0.25      0.35       103

    accuracy                           0.91      1053
   macro avg       0.75      0.62      0.65      1053
weighted avg       0.89      0.91      0.89      1053



In [56]:
print('BAC aggregation encoding with prefix length 7')
y = adf7['outcome']
x =adf7.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 7')
y = idf7['outcome']
x =idf7.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 7
Decision Tree
              precision    recall  f1-score   support

       False       0.94      0.72      0.82        87
        True       0.57      0.89      0.70        36

    accuracy                           0.77       123
   macro avg       0.76      0.81      0.76       123
weighted avg       0.83      0.77      0.78       123

BAC index encoding with prefix length 7
Decision Tree
              precision    recall  f1-score   support

       False       0.94      0.78      0.85        94
        True       0.53      0.83      0.65        29

    accuracy                           0.79       123
   macro avg       0.73      0.80      0.75       123
weighted avg       0.84      0.79      0.80       123



In [57]:
print('BAC aggregation encoding with prefix length 8')
y = adf8['outcome']
x =adf8.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 8')
y = idf8['outcome']
x =idf8.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 8
Decision Tree
              precision    recall  f1-score   support

       False       0.74      0.46      0.57        37
        True       0.63      0.85      0.72        40

    accuracy                           0.66        77
   macro avg       0.68      0.65      0.65        77
weighted avg       0.68      0.66      0.65        77

BAC index encoding with prefix length 8
Decision Tree
              precision    recall  f1-score   support

       False       0.73      0.57      0.64        47
        True       0.50      0.67      0.57        30

    accuracy                           0.61        77
   macro avg       0.61      0.62      0.61        77
weighted avg       0.64      0.61      0.62        77



In [58]:
print('BAC aggregation encoding with prefix length 9')
y = adf9['outcome']
x =adf9.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 9')
y = idf9['outcome']
x =idf9.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 9
Decision Tree
              precision    recall  f1-score   support

       False       0.78      0.58      0.67        12
        True       0.38      0.60      0.46         5

    accuracy                           0.59        17
   macro avg       0.58      0.59      0.56        17
weighted avg       0.66      0.59      0.61        17

BAC index encoding with prefix length 9
Decision Tree
              precision    recall  f1-score   support

       False       0.50      0.88      0.64         8
        True       0.67      0.22      0.33         9

    accuracy                           0.53        17
   macro avg       0.58      0.55      0.48        17
weighted avg       0.59      0.53      0.48        17



In [59]:
print('BAC aggregation encoding with prefix length 10')
y = adf10['outcome']
x =adf10.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 10')
y = idf10['outcome']
x =idf10.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 10
Decision Tree
              precision    recall  f1-score   support

       False       0.77      1.00      0.87        10
        True       1.00      0.40      0.57         5

    accuracy                           0.80        15
   macro avg       0.88      0.70      0.72        15
weighted avg       0.85      0.80      0.77        15

BAC index encoding with prefix length 10
Decision Tree
              precision    recall  f1-score   support

       False       0.58      0.78      0.67         9
        True       0.33      0.17      0.22         6

    accuracy                           0.53        15
   macro avg       0.46      0.47      0.44        15
weighted avg       0.48      0.53      0.49        15

