In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import numpy as np
from sklearn import tree
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score
import json
import pickle as pkl

In [3]:
def filter_by_prefix(df,prefix):
    '''
    Filter case by prefix length
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to slice by prefix length
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe with sliced cases
    '''
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    for case,group in groups: 
        group = group.reset_index(drop=True)
        if len(group)>prefix:
            group = group.loc[:prefix-1,:]
            encoded_df.append(group)
    return pd.concat(encoded_df)

def indexbase_encoding(df, prefix):
    '''
    Indexbase encoding
    
    Parameters
    ----------
    df : pandas dataframe
        Assigned dataframe to encode for outcome prediction
    
    prefix : int
        Prefix length to slice to cases in fixed length
    
    Returns
    ----------
    Return dataframe encoded in indexbase method
    '''
    df = filter_by_prefix(df,prefix)
    df['ts'] = pd.to_datetime(df['ts'])
    groups = df.groupby('caseid')
    encoded_df=[]
    if 'resource' not in list(df.columns.values):
        noresource = True
    else:
        noresource = False
        
    for case,group in groups: 
        activitylist = list(group['activity'])
        
        group = group.reset_index(drop=True)
        outcome = set(group['outcome']).pop()
        cumdurationlist = [(x - list(group['ts'])[0]).total_seconds() for x in list(group['ts'])]
        cumduration_index ={'Cumduration_'+str(x+1): cumdurationlist[x] for x in range(len(cumdurationlist))}
        
        case_outcome = {'caseid':case, 'outcome':outcome}
        activity_index = {'activity_'+str(x+1)+'_'+activitylist[x]: 1 for x in range(len(activitylist))}

        if noresource == False:
            resourcelist = list(group['resource'])
            resource_index = {'resource_'+str(x+1)+'_'+str(resourcelist[x]): 1 for x in range(len(resourcelist))}
            case_outcome.update(resource_index)
        
        case_outcome.update(cumduration_index)
        case_outcome.update(activity_index)
        dfk = pd.DataFrame.from_dict([case_outcome])
        encoded_df.append(dfk)
    concated_df = pd.concat(encoded_df)
    concated_df = concated_df.fillna(0)
    return concated_df

In [62]:
df = pd.read_csv('./preprocessed_loan_baseline.pnml_noise_0.049999999999999996_iteration_1_seed_42477_sample.csv')


key_pair = {'Case ID':'caseid', 'Activity':'activity', 'Complete Timestamp':'ts'}
df = df.rename(columns=key_pair)

if 'resource' in df.columns.values:
    df = df.loc[:,['caseid','activity','ts','resource','noise']]

else:
    df = df.loc[:,['caseid','activity','ts','noise']]

In [63]:
groups = df.groupby('caseid')
concating = []
max_case_len = max([len(group) for _, group in groups])
caseids = list(set(df['caseid']))

outcome = []
for _, group in groups:
    group = group.reset_index(drop=True)
    actlist = list(group['activity'])
    outcomelist = actlist[1:] + [np.nan]
    group['outcome'] = outcomelist
    concating.append(group)

dfn = pd.concat(concating)

max_case_len =10
idslist = []
for prefix in range(1, max_case_len):
    idslist.append(indexbase_encoding(dfn,prefix))

prefixlist= list(range(1, max_case_len))
acc_dict= {}

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
6625    NaN
6626    NaN
6627    NaN
6628    NaN
6629    NaN
Name: noise, Length: 6630, dtype: object


In [67]:
print('Random forest')
models = []
used_models = 'RF'
testdf_list = []

for pos,prefix in enumerate(idslist):
    np.random.seed(2022)
    trainids = np.random.choice(caseids, int(len(caseids)*0.7), replace=False)

    traindf = prefix[prefix['caseid'].isin(trainids)].reset_index(drop=True)
    testdf = prefix[~prefix['caseid'].isin(trainids)].reset_index(drop=True)
    testdf_list.append(testdf)

    y_train = traindf['outcome']
    x_train = traindf.drop(columns=['outcome','caseid'],axis=1)

    y_test = testdf['outcome']
    x_test = testdf.drop(columns=['outcome','caseid'],axis=1)

    # Random forest result    
    
    rf = RandomForestClassifier(criterion='entropy').fit(x_train,y_train)
    y_pred = rf.predict(x_test)

    filename = './models/%s prefix %s.pkl'%(used_models, pos+1)
    models.append(rf)
    with open(filename,'wb') as f:
        pkl.dump(rf, f)

    acc_dict['prefix_%s'%(str(prefixlist[pos]))] =  accuracy_score(y_test,y_pred)


Random forest


In [89]:
testing_case_ids = set(testdf_list[-1]['caseid'])
testdf = df[df['caseid'].isin(testing_case_ids)]

In [98]:
model = models[-1]
for_confusion_matrix = {}

counting_normal = 0
ad_predictions=[]
ad_true = []
for threshold in [0.01,0.05,0.1,0.15,0.2,0.25]:
    global_true =[]
    global_pred = []

    for pos, prefix in enumerate(idslist):

        for_confusion_matrix[int(caseid)] =[]
        prediction_list = []
        df = testdf
        
        for caseid in list(testing_case_ids):
            prediction_label = 'Normal'
            x_test = testdf_list[pos][testdf_list[pos]['caseid'] ==caseid]
            true_outcome = x_test['outcome'].values[0]
            
            x_test_features = list(x_test.columns.values)
            x_test_features.remove('caseid')
            x_test_features.remove('outcome')
            
            x_test = x_test.loc[:, x_test_features]
            x_test = np.array(x_test.values).reshape(1,-1)

            model_classes = models[pos].classes_
            predictions_proba = models[pos].predict_proba(x_test)[0]
            predicted_one = model_classes[np.argmax(predictions_proba)]
        
            if predicted_one  == 'Not Available':
                prediction_label = 'Not Available'
            else:
                if true_outcome in model_classes:
                    labelidx = list(model_classes).index(true_outcome)

                    if predictions_proba[labelidx] <threshold:
                        prediction_label = 'Anomalous'
                else:
                    prediction_label = 'Anomalous'
            
            noisedf = df[df['caseid'] == caseid].reset_index(drop=True)
            noiselabel = list(noisedf['noise'])[pos]
            if np.isnan(noiselabel):
                noiselabel= 'Normal'
            else:
                noiselabel= 'Anomalous'
            ad_predictions.append(prediction_label)
            ad_true.append(noiselabel)
        
    #     for pos, p in enumerate(prediction_list):
    #         global_pred.append(p)
    #         global_true.append(true_label_list[pos])


    # saving_data = {'y_true':global_true, 'y_pred':global_pred}

In [101]:
print(classification_report(ad_predictions, ad_true))
print(accuracy_score(ad_predictions, ad_true))

              precision    recall  f1-score   support

   Anomalous       0.22      0.13      0.16       521
      Normal       0.93      0.96      0.94      6175

    accuracy                           0.90      6696
   macro avg       0.57      0.55      0.55      6696
weighted avg       0.87      0.90      0.88      6696

0.8959080047789725
