In [25]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import utils
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score


In [26]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

In [27]:
def aggregation_encoding(df, prefix):
    '''
    Aggregation encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in aggregation method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        case_time_outcome = {'caseid':case, 'ts':np.mean(cumdurationlist),'outcome':outcome}
        activity_count = {x: list(group['activity']).count(x) for x in set(group['activity'])}
        resource_count = {x: list(group['resource']).count(x) for x in set(group['resource'])}

        case_time_outcome.update(activity_count)
        case_time_outcome.update(resource_count)
        dfk = pd.DataFrame.from_dict([case_time_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [28]:
def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        activitylist = list(group['activity'])
        resourcelist = list(group['resource'])
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}
        resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        case_outcome.update(resource_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [29]:
# df = pd.read_csv('./data/bac_online_back_small.csv')
# df['START_DATE'] = pd.to_datetime(df['START_DATE'])
# df = df.rename(columns={'REQUEST_ID':'caseid','ACTIVITY':'activity','START_DATE':'ts','CE_UO':'resource'})
# df = df.loc[:,['caseid','activity','ts','resource','outcome']]

# df = pd.read_csv('./data/IRO5k_labeled_sampled_newts.csv')
# key_pair = {
#         'Case ID':'caseid',
#         'Activity':'activity',
#         'Complete Timestamp':'ts',
# }
# df['Complete Timestamp'] = pd.to_datetime(df['Complete Timestamp'])
# df = df.rename(columns=key_pair)
# df = df.loc[:,['caseid','activity','ts','outcome']]

# df = pd.read_csv('./data/bpic15_streaming.csv')
# key_pair = {
# }
# df = df.rename(columns=key_pair)
# df = df.loc[:,['caseid','activity','ts','resource','outcome']]


# df = pd.read_csv('./data/BPI Challenge 2017_modified3.csv')
# key_pair = {
#     'Case ID':'caseid',
#     'Activity':'activity',
#     'Resource':'resource',
#     'Start Timestamp':'ts',
#     'Outcome':'outcome'
# }
# df = df.rename(columns=key_pair)
# df = df.loc[:,['caseid','activity','ts','resource','outcome']]

df = pd.read_csv('./data/road_traffic_fine_process.csv')
key_pair = {
    'Case ID':'caseid',
    'Activity':'activity',
    'Resource':'resource',
    'Complete Timestamp':'ts',
}
df = df.rename(columns=key_pair)
df = df.loc[:,['caseid','activity','ts','resource','outcome']]


save_dir = 'road_traffic_fine_process'

try:
    os.makedirs('./result/%s'%(save_dir))
except:
    pass

In [30]:
groups = df.groupby('caseid')
concating = []
for _, group in groups:
    outcomelist = list(group['outcome'])
    outcome = outcomelist[-1]
    group = group.reset_index(drop=True)
    if True in outcomelist:
        group = group.loc[:outcomelist.index(True),:]
    group['outcome'] = outcome
    concating.append(group)

dfn = pd.concat(concating)

In [34]:
idslist = []
prefix_length=6
for length in range(2,prefix_length):
    print('Progressing length: %s'%(length))
    idslist.append(indexbase_encoding(dfn,length))

Progressing length: 2
Progressing length: 3
Progressing length: 4
Progressing length: 5


In [35]:
prefixlist= list(range(2,prefix_length))
acc_dict= {}
for pos,prefix in enumerate(idslist):
    print('BAC aggregation encoding with prefix length %s'%(prefixlist[pos]))
    y = prefix['outcome']
    x =prefix.drop(columns=['outcome','caseid'],axis=1)
    acc_list = []
    for i in range(10):
        x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

        # Deicision tree result
        dt = DecisionTreeClassifier(criterion='entropy').fit(x_train,y_train)
        y_pred = dt.predict(x_test)
        acc_list.append(accuracy_score(y_test,y_pred))
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  np.mean(acc_list)

print(acc_dict)
import pickle as pkl

x = list(acc_dict.keys())
y = [acc_dict[x] for x in acc_dict.keys()]
with open('./result/%s/off_dt_acc.pkl'%(save_dir),'wb') as f:
    pkl.dump([x,y],f)

BAC aggregation encoding with prefix length 2
BAC aggregation encoding with prefix length 3
BAC aggregation encoding with prefix length 4
BAC aggregation encoding with prefix length 5
{'prefix_2': 0.5994557823129251, 'prefix_3': 0.7225600000000001, 'prefix_4': 0.8190016103059582, 'prefix_5': 0.7300813008130081}


In [36]:
prefixlist= list(range(2,prefix_length))
acc_dict= {}
for pos,prefix in enumerate(idslist):
    print('BAC index-base encoding with prefix length %s'%(prefixlist[pos]))
    y = prefix['outcome']
    x =prefix.drop(columns=['outcome','caseid'],axis=1)
    acc_list = []

    for i in range(10):
        x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3)

        # Deicision tree result
        rf = RandomForestClassifier(criterion='entropy').fit(x_train,y_train)
        y_pred = rf.predict(x_test)
        acc_list.append(accuracy_score(y_test,y_pred))
    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  np.mean(acc_list)
print(acc_dict)

import pickle as pkl

x = list(acc_dict.keys())
y = [acc_dict[x] for x in acc_dict.keys()]
with open('./result/%s/off_rf_acc.pkl'%(save_dir),'wb') as f:
    pkl.dump([x,y],f)

BAC index-base encoding with prefix length 2
BAC index-base encoding with prefix length 3
BAC index-base encoding with prefix length 4
BAC index-base encoding with prefix length 5
{'prefix_2': 0.6300680272108843, 'prefix_3': 0.78784, 'prefix_4': 0.8679549114331724, 'prefix_5': 0.8203252032520325}
