In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import practice

In [2]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length

    Cases with fewer than ``prefix`` rows are dropped; every remaining case
    is cut down to its first ``prefix`` rows.

    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length. Must contain the
        columns 'caseid' and 'ts'. The caller's dataframe is not modified.

    prefix : int
        Prefix length to slice to cases in fixed length

    Returns
    ----------
    Return dataframe with sliced cases. 'ts' is converted to datetime and the
    rows of each case are labelled 0..prefix-1. When no case has at least
    ``prefix`` rows, an empty dataframe with the same columns is returned.
    '''
    # assign() builds a new frame, so the caller's 'ts' column stays untouched.
    df = df.assign(ts=pd.to_datetime(df['ts']))
    sliced_cases = []
    for _, group in df.groupby('caseid'):
        if len(group) >= prefix:
            group = group.reset_index(drop=True)
            # .loc slicing is label based and inclusive, hence prefix-1.
            sliced_cases.append(group.loc[:prefix-1, :])
    if not sliced_cases:
        # pd.concat raises ValueError when given an empty list.
        return df.iloc[0:0]
    return pd.concat(sliced_cases)

In [3]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding

    Each case is first cut to its first ``prefix`` events with
    ``filter_by_prefix`` and then collapsed into a single feature row.

    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction. The columns
        'caseid', 'ts', 'activity', 'resource' and 'outcome' are read.

    prefix : int
        Prefix length to slice to cases in fixed length

    Returns
    ----------
    Return dataframe encoded in aggregation method, one row per case, rows
    labelled 0..n-1:
        'caseid'  : the case identifier
        'ts'      : mean of the seconds elapsed since the first event of the
                    prefix, taken over the events of the prefix
        'outcome' : the 'outcome' value of the first event of the prefix
        one column per activity value and per resource value holding how
        often it occurs in the prefix (0 when it does not occur).
    An empty dataframe with the three fixed columns is returned when no case
    is left after filtering.
    '''
    # filter_by_prefix already returns 'ts' as datetime.
    df = filter_by_prefix(df, prefix)
    records = []
    for case, group in df.groupby('caseid'):
        timestamps = group['ts']
        elapsed_seconds = (timestamps - timestamps.iloc[0]).dt.total_seconds()
        record = {
            'caseid': case,
            # to_numpy() keeps NaN propagation of a plain numpy mean.
            'ts': np.mean(elapsed_seconds.to_numpy()),
            'outcome': group['outcome'].iloc[0],
        }
        # NOTE(review): an activity and a resource sharing the same value, or a
        # value equal to 'caseid'/'ts'/'outcome', overwrite each other here.
        record.update(Counter(group['activity']))
        record.update(Counter(group['resource']))
        records.append(record)
    if not records:
        return pd.DataFrame(columns=['caseid', 'ts', 'outcome'])
    # One frame from all records: unique row labels, no per-case concat.
    return pd.DataFrame(records).fillna(0)

In [4]:
# Read the event log and parse the completion timestamps.
df = pd.read_csv('./data/bpic2017.csv')
df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'])
# Rename to the column names the encoding functions expect and keep only those.
df = df.rename(columns={'Case ID':'caseid','Activity':'activity','Complete Timestamp':'ts','Resource':'resource'})
df = df.loc[:,['caseid','activity','ts','resource']]
# Attach one outcome label per case, taken from the last row of the case.
groups = df.groupby('caseid')
reconcatenate =[]
for _,group in groups:
    group = group.reset_index(drop=True)
    # NOTE(review): practice.set_label is project-local; it receives the last
    # row of the case as a dict and its 'True label' entry is used as the
    # outcome. How the label is derived is not visible here.
    case_label = practice.set_label(group.iloc[-1,:].to_dict())['True label']
    group.loc[:,'outcome'] = case_label
    reconcatenate.append(group)
dfn = pd.concat(reconcatenate)
# Aggregation-encode the labelled log for prefix lengths 2 and 3.
df2 = aggregation_encoding(dfn,2)
df3 = aggregation_encoding(dfn,3)

In [5]:
# Show the prefix-length-3 aggregation encoding (one row per case).
print(df3)

              caseid         ts    outcome  O_Create Offer  \
0   Offer_1000681710   7.460667   Accepted               1   
0   Offer_1001553250   5.949333   Accepted               1   
0   Offer_1002136393   7.380667   Accepted               1   
0   Offer_1002236598   4.628667    Refused               1   
0   Offer_1002530118   7.483333  Cancelled               1   
..               ...        ...        ...             ...   
0    Offer_993689039   6.695667  Cancelled               1   
0    Offer_993800442   7.780333  Cancelled               1   
0     Offer_99473283  89.098333   Accepted               1   
0    Offer_995784215   7.004333   Accepted               1   
0    Offer_997411923  12.050667   Accepted               1   

    O_Sent (mail and online)  O_Created  User_20  User_2  User_85  User_49  \
0                        1.0          1      3.0     0.0      0.0      0.0   
0                        1.0          1      0.0     3.0      0.0      0.0   
0                    

In [6]:
print('BPIC2017 aggregation encoding with prefix length 3')
# Fixed seed so that the split and the fitted tree are the same on every run.
RANDOM_STATE = 42
y = df3['outcome']
# 'caseid' is an identifier, not a feature; 'outcome' is the target.
x = df3.drop(columns=['outcome', 'caseid'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=RANDOM_STATE
)

# Decision tree result
print('Decision Tree')
dt = DecisionTreeClassifier(
    criterion='entropy', max_depth=5, random_state=RANDOM_STATE
).fit(x_train, y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test, y_pred))

BPIC2017 aggregation encoding with prefix length 3
Decision Tree


NameError: name 'decision_tree_model' is not defined