In [1]:
import pickle
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
import matplotlib.dates as mdates
import pandas as pd
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support  as score

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [19]:
df = pd.read_csv('./data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv')

In [20]:
with open('result_rf.pkl', 'rb') as fp:
    data = pickle.load(fp)

In [21]:
def anomalous_or_not(result, caseid):
    '''
    Determine following activity is anomalous or not
    If following activity is in the prediction candidates, it is normal. Otherwise, potential anomalous
    ----------
    Parameters
    result: dict
        Next activity prediction result
    caseid: str
    ----------
    Return
    anomalous_list: list
        List with each event is anomaloy or not
    '''
    anomalous_list=[]
    for x in result[caseid]:
        event = result[caseid][x]
        true_label = event[1]
        predictions = event[0]
        event_anomalous = 'Not Available'
        for y in list(predictions.values()):
            candidate_list = y[0]
            detection_result = 'Not Available'

            if candidate_list != 'Not Available':
                candidates = y[0][1]
                detection_result = 'Not Available'
                if true_label in candidates:
                    detection_result = 'Normal'
                else:
                    detection_result = 'Potential anomalous'
        event_anomalous = detection_result
        anomalous_list.append(event_anomalous)
    
    return anomalous_list

In [22]:
def anomalous_or_not_detail(result, caseid):
    '''
    Determine following activity is anomalous or not
    If following activity is in the prediction candidates, it is normal. Otherwise, potential anomalous
    ----------
    Parameters
    result: dict
        Next activity prediction result
    caseid: str
    ----------
    Return
    anomalous_list: list
        List with each event is anomaloy or not
    '''
    anomalous_list=[]
    for x in result[caseid]:
        anomalous_detail = []
        event = result[caseid][x]
        true_label = event[1]
        predictions = event[0]
        event_anomalous = 'Not Available'
        for y in list(predictions.values()):
            candidate_list = y[0]
            detection_result = 'Not Available'

            if candidate_list != 'Not Available':
                candidates = y[0][1]
                detection_result = 'Not Available'
                if true_label in candidates:
                    detection_result = 'Normal'
                else:
                    detection_result = 'Potential anomalous'
            anomalous_detail.append((detection_result, y[1]))
        anomalous_list.append(anomalous_detail)
    
    return anomalous_list

In [23]:
caseidlist = list(data.keys())
anomalous_result ={}
for caseid in caseidlist:
    anomalous_result[caseid]= anomalous_or_not_detail(data, caseid)

In [24]:
print(df.head)
print(df['noise'])

<bound method NDFrame.head of       Case ID                                Activity       Complete Timestamp      Variant  Variant index lifecycle:transition  noise
0           0                                   Start  2017-09-15 14:46:25.000   Variant 17             17                Start  Start
1           0  start_event_Loan  application received  2017-09-15 14:46:25.000   Variant 17             17             complete    NaN
2           0   Check  application  form completeness  2017-09-15 15:14:23.349   Variant 17             17             complete    NaN
3           0                    Send acceptance pack  2017-09-15 15:14:23.349   Variant 17             17                  NaN   true
4           0                       Appraise property  2017-09-15 17:15:52.828   Variant 17             17             complete    NaN
...       ...                                     ...                      ...          ...            ...                  ...    ...
8609      525            

In [32]:
true_labels = []

for pos, x in enumerate(list(df['noise'])):
    if list(df['Activity'])[pos] != 'End':
        if x == 'Start':
            x = np.nan

        true_labels.append(x)
print(len(true_labels))

8088


In [33]:
case_event_result_dict = {}
predicted_labels = []
for x in anomalous_result:
    case_event_result_dict[x] = []
    for pos, t in enumerate(anomalous_result[x]):
        case_event_result_dict[x].append([pos+1, t[0][0]])
        predicted_labels.append(t[0][0])
print(len(predicted_labels))

8088


In [52]:
total_predictions  =0
correct_prediction =0
true_label =0

true_label2=[]
predict_label2=[]
for pos,t in enumerate(predicted_labels):
    if predicted_labels[pos] != 'Not Available':
        if true_labels[pos] == 'true':
            true_label = 'Potential anomalous'
        elif np.isnan(true_labels[pos]):
            true_label = 'Normal'
        
        true_label2.append(true_label)
        predict_label2.append(predicted_labels[pos])
        if true_label == predicted_labels[pos]:
            correct_prediction +=1
        total_predictions +=1
print(correct_prediction, total_predictions)
print(correct_prediction/total_predictions)

5448 7841
0.6948093355439358


In [54]:
print(classification_report(true_label2, predict_label2))

                     precision    recall  f1-score   support

             Normal       0.86      0.77      0.81      6810
Potential anomalous       0.12      0.21      0.15      1031

           accuracy                           0.69      7841
          macro avg       0.49      0.49      0.48      7841
       weighted avg       0.77      0.69      0.73      7841



In [13]:
import pickle

threshold = [0.01,0.05,0.1,0.15,0.2,0.25]
window_size = 100

dataset = ['loan_baseline.pnml_noise_0.15_iteration_1_seed_614_sample.pkl',
'loan_baseline.pnml_noise_0.125_iteration_1_seed_27126_sample.pkl',
'loan_baseline.pnml_noise_0.09999999999999999_iteration_1_seed_14329_sample.pkl',
'loan_baseline.pnml_noise_0.075_iteration_1_seed_73753_sample.pkl',
'loan_baseline.pnml_noise_0.049999999999999996_iteration_1_seed_42477_sample.pkl',
'loan_baseline.pnml_noise_0.024999999999999998_iteration_1_seed_68964_sample.pkl']


noiselist = [0.15,0.125,0.09,0.075,0.049,0.0249]

precisiondf = pd.DataFrame(columns=['Noise'])
precisiondf['Noise'] = noiselist
precisiondf = precisiondf.set_index(precisiondf['Noise'])

recalldf = pd.DataFrame(columns=['Noise'])
recalldf['Noise'] = noiselist
recalldf = recalldf.set_index(recalldf['Noise'])

fscoredf = pd.DataFrame(columns=['Noise'])
fscoredf['Noise'] = noiselist
fscoredf = fscoredf.set_index(fscoredf['Noise'])





for pos1, data in enumerate(dataset):

    for thr in threshold:
#         rf_name = './result/rf_thr%s_%s'%(thr, data)
#         iso_name = './result/iso_cont%s_%s'%(thr, data)
        sp_name = './result/lstm/lstm_thr%s_window%s_%s'%(thr, window_size, data)
        unsp_name = './result/iso/iso_cont%s_window%s_%s'%(thr, window_size, data)


        with open(sp_name, 'rb') as f:
            sp_data = pickle.load(f)

        with open(unsp_name, 'rb') as f:
            unsp_data = pickle.load(f)

        sp_y_pred_avail= []
        sp_y_true_avail = []
        unsp_y_pred_avail = []
        unsp_y_true_avail = []

        for pos, t in enumerate(sp_data['y_pred']):
            if t != 'Not Available' and unsp_data['y_pred'][pos] != 'Not Available':
                sp_y_pred_avail.append(t)
                sp_y_true_avail.append(sp_data['y_true'][pos])

                unsp_y_pred_avail.append(unsp_data['y_pred'][pos])
                unsp_y_true_avail.append(unsp_data['y_true'][pos])

        precision, recall, fscore, support = score(y_true = sp_y_true_avail, y_pred = sp_y_pred_avail)
#         precision, recall, fscore, support = score(y_true = unsp_y_true_avail, y_pred = unsp_y_pred_avail)
        precisiondf.loc[noiselist[pos1],thr] = precision[0]
        recalldf.loc[noiselist[pos1],thr] = recall[0]
        fscoredf.loc[noiselist[pos1],thr] = fscore[0]
        
precisiondf = precisiondf.drop(columns=['Noise'])
recalldf = recalldf.drop(columns=['Noise'])
fscoredf = fscoredf.drop(columns=['Noise'])
print(precisiondf)
print('\n')
print(recalldf)
print('\n')
print(fscoredf)
    #     print('----------')

            0.01      0.05      0.10      0.15      0.20      0.25
Noise                                                             
0.1500  0.321522  0.237933  0.201756  0.184512  0.176345  0.170940
0.1250  0.288722  0.208271  0.166499  0.152009  0.141827  0.136755
0.0900  0.278670  0.182059  0.156039  0.139021  0.133178  0.128666
0.0750  0.225340  0.138510  0.109966  0.098466  0.090364  0.086259
0.0490  0.180238  0.109865  0.089022  0.078067  0.072583  0.069555
0.0249  0.097561  0.055986  0.042288  0.036068  0.032366  0.029256


            0.01      0.05      0.10      0.15      0.20      0.25
Noise                                                             
0.1500  0.479452  0.752446  0.899217  0.944227  0.971624  0.978474
0.1250  0.525308  0.764706  0.904241  0.957592  0.968536  0.976744
0.0900  0.509434  0.719078  0.872117  0.922432  0.955975  0.974843
0.0750  0.561441  0.752119  0.881356  0.938559  0.957627  0.968220
0.0490  0.625397  0.777778  0.898413  0.933333  0.955556  0.