In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score, f1_score
import numpy as np

import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score
import json
import pickle as pkl

In [49]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    if 'resource' not in list(df.columns.values):
        noresource = True
    else:
        noresource = False
        
    for case,group in groups: 
        activitylist = list(group['activity'])
        
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}

        if noresource == False:
            resourcelist = list(group['resource'])
            resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
            case_outcome.update(resource_index)
        
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [80]:
df = pd.read_csv('./preprocessed_loan_baseline.pnml_noise_0.09999999999999999_iteration_1_seed_14329_sample.csv')
used_models = 'XGB'


key_pair = {'Case ID':'caseid', 'Activity':'activity', 'Complete Timestamp':'ts'}
df = df.rename(columns=key_pair)

if 'resource' in df.columns.values:
    df = df.loc[:,['caseid','activity','ts','resource','noise']]

else:
    df = df.loc[:,['caseid','activity','ts','noise']]

In [81]:
groups = df.groupby('caseid')
concating = []
max_case_len = max([len(group) for _, group in groups])
caseids = list(set(df['caseid']))

outcome = []
for _, group in groups:
    group = group.reset_index(drop=True)
    actlist = list(group['activity'])
    outcomelist = actlist[1:] + [np.nan]
    group['outcome'] = outcomelist
    concating.append(group)

dfn = pd.concat(concating)

max_case_len =15
idslist = []
for prefix in range(1, max_case_len):
    idslist.append(indexbase_encoding(dfn,prefix))

prefixlist= list(range(1, max_case_len))

acc_dict= {}

In [82]:
for pos, x in enumerate(idslist):
    print(pos+1, len(set(x['caseid'])))

1 500
2 500
3 500
4 500
5 500
6 500
7 500
8 446
9 402
10 373
11 318
12 241
13 201
14 171


In [83]:
print(used_models)
models = []
testdf_list = []


for pos,prefix in enumerate(idslist):  
    caseids = list(set(prefix['caseid']))
    trainids = np.random.choice(caseids, int(len(caseids)*0.7), replace=False)
    traindf = prefix[prefix['caseid'].isin(trainids)].reset_index(drop=True)
    testdf = prefix[~prefix['caseid'].isin(trainids)].reset_index(drop=True)
    testdf_list.append(testdf)

    y_train = traindf['outcome']
    x_train = traindf.drop(columns=['outcome','caseid'],axis=1)

    y_test = testdf['outcome']
    x_test = testdf.drop(columns=['outcome','caseid'],axis=1)

    # Random forest result    
    
    if used_models == 'RF':
        m = RandomForestClassifier(n_estimators=10, criterion='entropy').fit(x_train,y_train)
        y_pred = m.predict(x_test)

    elif used_models =='XGB':
        m = xgb.XGBClassifier(n_estimators = 20, learning_rate=0.01).fit(x_train, y_train)
        y_pred = m.predict(x_test)
        
    models.append(m)

    filename = './models/%s prefix %s.pkl'%(used_models, pos+1)
    with open(filename,'wb') as f:
        pkl.dump(m, f)

    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  accuracy_score(y_test,y_pred)
    
    testids = list(set(testdf['caseid']))
    test_file_name = './data/Prefix %s testdata.pkl'%(str(pos+1))
    with open(test_file_name,'wb') as f:
        pkl.dump(testids,f)

XGB






























In [84]:
for_confusion_matrix = {}

counting_normal = 0
for threshold in [0.01,0.05,0.1,0.15,0.2,0.25]:
    global_true =[]
    global_pred = []
    ad_predictions=[]
    ad_true = []

    for pos, prefix in enumerate(idslist):
        testing_case_ids = set(testdf_list[-1]['caseid'])

        prediction_list = []
        testing_case_ids = set(testdf_list[pos]['caseid'])
        for caseid in list(testing_case_ids):
            prediction_label = 'Normal'
            x_test = testdf_list[pos][testdf_list[pos]['caseid'] ==caseid]
            true_outcome = x_test['outcome'].values[0]
            
            x_test_features = list(x_test.columns.values)
            x_test_features.remove('caseid')
            x_test_features.remove('outcome')
            
            x_test = x_test.loc[:, x_test_features]
            x_test = np.array(x_test.values).reshape(1,-1)

            model_classes = models[pos].classes_
            predictions_proba = models[pos].predict_proba(x_test)[0]
            predicted_one = model_classes[np.argmax(predictions_proba)]
        
            if predicted_one  == 'Not Available':
                prediction_label = 'Not Available'
            else:
                if true_outcome in model_classes:
                    labelidx = list(model_classes).index(true_outcome)

                    if predictions_proba[labelidx] <threshold:
                        prediction_label = 'Anomalous'
                else:
                    prediction_label = 'Anomalous'
           
            noisedf = df[df['caseid'] == caseid].reset_index(drop=True)
            noiselabel = list(noisedf['noise'])[pos]
            if np.isnan(noiselabel):
                noiselabel= 'Normal'
            else:
                noiselabel= 'Anomalous'
            ad_predictions.append(prediction_label)
            ad_true.append(noiselabel)
        
    for_confusion_matrix[threshold]=[ad_predictions, ad_true]

    # saving_data = {'y_true':global_true, 'y_pred':global_pred}

In [85]:
for t in for_confusion_matrix.keys():
    print(t)
    predictions = for_confusion_matrix[t][0]
    trues = for_confusion_matrix[t][1]
    print(classification_report(y_pred = predictions, y_true = trues))
    print('Accuarcy: ',accuracy_score(y_pred = predictions, y_true = trues))
    print('F1 score: ',f1_score(y_pred = predictions, y_true = trues, average='binary', pos_label='Normal'))
    print(set(predictions), set(trues))

0.01
              precision    recall  f1-score   support

   Anomalous       0.31      0.03      0.05       176
      Normal       0.90      0.99      0.94      1523

    accuracy                           0.89      1699
   macro avg       0.61      0.51      0.50      1699
weighted avg       0.84      0.89      0.85      1699

Accuarcy:  0.8928781636256622
F1 score:  0.943231441048035
{'Normal', 'Anomalous'} {'Normal', 'Anomalous'}
0.05
              precision    recall  f1-score   support

   Anomalous       0.26      0.03      0.05       176
      Normal       0.90      0.99      0.94      1523

    accuracy                           0.89      1699
   macro avg       0.58      0.51      0.50      1699
weighted avg       0.83      0.89      0.85      1699

Accuarcy:  0.8911124190700412
F1 score:  0.9422416484545738
{'Normal', 'Anomalous'} {'Normal', 'Anomalous'}
0.1
              precision    recall  f1-score   support

   Anomalous       0.16      0.41      0.24       176
      No

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
