In [26]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import utils
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt

In [27]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

In [28]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in aggregation method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        case_time_outcome = {'caseid':case, 'ts':np.mean(cumdurationlist),'outcome':outcome}
        activity_count = {x: list(group['activity']).count(x) for x in set(group['activity'])}
        resource_count = {x: list(group['resource']).count(x) for x in set(group['resource'])}

        case_time_outcome.update(activity_count)
        case_time_outcome.update(resource_count)
        dfk = pd.DataFrame.from_dict([case_time_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [29]:
def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        activitylist = list(group['activity'])
        resourcelist = list(group['resource'])
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}
        resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        case_outcome.update(resource_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [30]:
df = pd.read_csv('./data/bac_offline_small.csv')
df['START_DATE'] = pd.to_datetime(df['START_DATE'])
df = df.rename(columns={'REQUEST_ID':'caseid','ACTIVITY':'activity','START_DATE':'ts','CE_UO':'resource'})
df = df.loc[:,['caseid','activity','ts','resource','outcome']]
groups = df.groupby('caseid')
reconcatenate =[]
dfn = df

In [31]:
adf5 = aggregation_encoding(dfn,5)
idf5 = indexbase_encoding(dfn,5)

In [32]:
adf6 = aggregation_encoding(dfn,6)
idf6 = indexbase_encoding(dfn,6)

In [33]:
adf7 = aggregation_encoding(dfn,7)
idf7 = indexbase_encoding(dfn,7)

In [34]:
adf8 = aggregation_encoding(dfn,8)
idf8 = indexbase_encoding(dfn,8)

In [35]:
adf9 = aggregation_encoding(dfn,9)
idf9 = indexbase_encoding(dfn,9)

In [36]:
adf10 = aggregation_encoding(dfn,10)
idf10 = indexbase_encoding(dfn,10)

In [37]:
print('BAC aggregation encoding with prefix length 5')
y = adf5['outcome']
x =adf5.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 5')
y = idf5['outcome']
x =idf5.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 5
Decision Tree
              precision    recall  f1-score   support

       False       0.94      0.98      0.96      1081
        True       0.58      0.32      0.42        99

    accuracy                           0.92      1180
   macro avg       0.76      0.65      0.69      1180
weighted avg       0.91      0.92      0.91      1180

BAC index encoding with prefix length 5
Decision Tree
              precision    recall  f1-score   support

       False       0.94      0.98      0.96      1082
        True       0.62      0.34      0.44        98

    accuracy                           0.93      1180
   macro avg       0.78      0.66      0.70      1180
weighted avg       0.92      0.93      0.92      1180



In [38]:
print('BAC aggregation encoding with prefix length 6')
y = adf6['outcome']
x =adf6.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 6')
y = idf6['outcome']
x =idf6.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 6
Decision Tree
              precision    recall  f1-score   support

       False       0.92      0.99      0.95       961
        True       0.40      0.09      0.14        93

    accuracy                           0.91      1054
   macro avg       0.66      0.54      0.55      1054
weighted avg       0.87      0.91      0.88      1054

BAC index encoding with prefix length 6
Decision Tree
              precision    recall  f1-score   support

       False       0.92      0.98      0.95       947
        True       0.63      0.27      0.38       107

    accuracy                           0.91      1054
   macro avg       0.78      0.63      0.67      1054
weighted avg       0.89      0.91      0.89      1054



In [39]:
print('BAC aggregation encoding with prefix length 7')
y = adf7['outcome']
x =adf7.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 7')
y = idf7['outcome']
x =idf7.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 7
Decision Tree
              precision    recall  f1-score   support

       False       0.97      0.73      0.83        93
        True       0.78      0.98      0.87        92

    accuracy                           0.85       185
   macro avg       0.88      0.85      0.85       185
weighted avg       0.88      0.85      0.85       185

BAC index encoding with prefix length 7
Decision Tree
              precision    recall  f1-score   support

       False       0.97      0.79      0.87        87
        True       0.84      0.98      0.91        98

    accuracy                           0.89       185
   macro avg       0.91      0.89      0.89       185
weighted avg       0.90      0.89      0.89       185



In [40]:
print('BAC aggregation encoding with prefix length 8')
y = adf8['outcome']
x =adf8.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 8')
y = idf8['outcome']
x =idf8.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 8
Decision Tree
              precision    recall  f1-score   support

       False       0.81      0.27      0.41        48
        True       0.44      0.90      0.59        30

    accuracy                           0.51        78
   macro avg       0.62      0.59      0.50        78
weighted avg       0.67      0.51      0.48        78

BAC index encoding with prefix length 8
Decision Tree
              precision    recall  f1-score   support

       False       0.88      0.52      0.66        42
        True       0.62      0.92      0.74        36

    accuracy                           0.71        78
   macro avg       0.75      0.72      0.70        78
weighted avg       0.76      0.71      0.70        78



In [41]:
print('BAC aggregation encoding with prefix length 9')
y = adf9['outcome']
x =adf9.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 9')
y = idf9['outcome']
x =idf9.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 9
Decision Tree
              precision    recall  f1-score   support

       False       0.80      0.67      0.73        12
        True       0.89      0.94      0.91        33

    accuracy                           0.87        45
   macro avg       0.84      0.80      0.82        45
weighted avg       0.86      0.87      0.86        45

BAC index encoding with prefix length 9
Decision Tree
              precision    recall  f1-score   support

       False       0.75      0.75      0.75         8
        True       0.95      0.95      0.95        37

    accuracy                           0.91        45
   macro avg       0.85      0.85      0.85        45
weighted avg       0.91      0.91      0.91        45



In [42]:
print('BAC aggregation encoding with prefix length 10')
y = adf10['outcome']
x =adf10.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

print('BAC index encoding with prefix length 10')
y = idf10['outcome']
x =idf10.drop(columns=['outcome','caseid'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

# Deicision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(x_train,y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test,y_pred))

BAC aggregation encoding with prefix length 10
Decision Tree
              precision    recall  f1-score   support

       False       0.67      0.80      0.73        10
        True       0.33      0.20      0.25         5

    accuracy                           0.60        15
   macro avg       0.50      0.50      0.49        15
weighted avg       0.56      0.60      0.57        15

BAC index encoding with prefix length 10
Decision Tree
              precision    recall  f1-score   support

       False       0.71      0.56      0.63         9
        True       0.50      0.67      0.57         6

    accuracy                           0.60        15
   macro avg       0.61      0.61      0.60        15
weighted avg       0.63      0.60      0.60        15

