In [55]:
import pickle
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
import matplotlib.dates as mdates
import pandas as pd
from sklearn.metrics import classification_report


In [56]:
df = pd.read_csv('./data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv')

In [57]:
with open('result_rf.pkl', 'rb') as fp:
    data = pickle.load(fp)

In [58]:
def anomalous_or_not(result, caseid):
    '''
    Determine following activity is anomalous or not
    If following activity is in the prediction candidates, it is normal. Otherwise, potential anomalous
    ----------
    Parameters
    result: dict
        Next activity prediction result
    caseid: str
    ----------
    Return
    anomalous_list: list
        List with each event is anomaloy or not
    '''
    anomalous_list=[]
    for x in result[caseid]:
        event = result[caseid][x]
        true_label = event[1]
        predictions = event[0]
        event_anomalous = 'Not Available'
        for y in list(predictions.values()):
            candidate_list = y[0]
            detection_result = 'Not Available'

            if candidate_list != 'Not Available':
                candidates = y[0][1]
                detection_result = 'Not Available'
                if true_label in candidates:
                    detection_result = 'Normal'
                else:
                    detection_result = 'Potential anomalous'
        event_anomalous = detection_result
        anomalous_list.append(event_anomalous)
    
    return anomalous_list

In [59]:
def anomalous_or_not_detail(result, caseid):
    '''
    Determine following activity is anomalous or not
    If following activity is in the prediction candidates, it is normal. Otherwise, potential anomalous
    ----------
    Parameters
    result: dict
        Next activity prediction result
    caseid: str
    ----------
    Return
    anomalous_list: list
        List with each event is anomaloy or not
    '''
    anomalous_list=[]
    for x in result[caseid]:
        anomalous_detail = []
        event = result[caseid][x]
        true_label = event[1]
        predictions = event[0]
        event_anomalous = 'Not Available'
        for y in list(predictions.values()):
            candidate_list = y[0]
            detection_result = 'Not Available'

            if candidate_list != 'Not Available':
                candidates = y[0][1]
                detection_result = 'Not Available'
                if true_label in candidates:
                    detection_result = 'Normal'
                else:
                    detection_result = 'Potential anomalous'
            anomalous_detail.append((detection_result, y[1]))
        anomalous_list.append(anomalous_detail)
    
    return anomalous_list

In [62]:
print(anomalous_result)

{'0': [[('Not Available', datetime.datetime(2017, 9, 15, 14, 46, 25))], [('Not Available', datetime.datetime(2017, 9, 15, 14, 46, 25))], [('Not Available', datetime.datetime(2017, 9, 15, 15, 14, 23))], [('Not Available', datetime.datetime(2017, 9, 15, 15, 14, 23))], [('Not Available', datetime.datetime(2017, 9, 15, 17, 15, 52))], [('Not Available', datetime.datetime(2017, 9, 15, 20, 34, 38))], [('Not Available', datetime.datetime(2017, 9, 15, 23, 14, 12))], [('Not Available', datetime.datetime(2017, 9, 15, 23, 25, 33))], [('Not Available', datetime.datetime(2017, 9, 16, 0, 40, 31))], [('Not Available', datetime.datetime(2017, 9, 16, 0, 43, 53))]], '1': [[('Not Available', datetime.datetime(2017, 9, 16, 0, 43, 53))], [('Not Available', datetime.datetime(2017, 9, 16, 0, 43, 53))], [('Not Available', datetime.datetime(2017, 9, 16, 16, 55, 30))], [('Not Available', datetime.datetime(2017, 9, 16, 18, 6, 10))], [('Not Available', datetime.datetime(2017, 9, 16, 18, 38, 29))], [('Not Available

In [60]:
caseidlist = list(data.keys())
anomalous_result ={}
for caseid in caseidlist:
    anomalous_result[caseid]= anomalous_or_not_detail(data, caseid)

In [61]:
print(df.head)
print(df['noise'])

<bound method NDFrame.head of       Case ID                                Activity  \
0           0                                   Start   
1           0  start_event_Loan  application received   
2           0   Check  application  form completeness   
3           0                    Send acceptance pack   
4           0                       Appraise property   
...       ...                                     ...   
8609      525                        Assess loan risk   
8610      525                      Assess eligibility   
8611      525                      Reject application   
8612      525     end_event_Loan application rejected   
8613      525                                     End   

           Complete Timestamp      Variant  Variant index  \
0     2017-09-15 14:46:25.000   Variant 17             17   
1     2017-09-15 14:46:25.000   Variant 17             17   
2     2017-09-15 15:14:23.349   Variant 17             17   
3     2017-09-15 15:14:23.349   Variant 1

In [32]:
true_labels = []

for pos, x in enumerate(list(df['noise'])):
    if list(df['Activity'])[pos] != 'End':
        if x == 'Start':
            x = np.nan

        true_labels.append(x)
print(len(true_labels))

8088


In [33]:
case_event_result_dict = {}
predicted_labels = []
for x in anomalous_result:
    case_event_result_dict[x] = []
    for pos, t in enumerate(anomalous_result[x]):
        case_event_result_dict[x].append([pos+1, t[0][0]])
        predicted_labels.append(t[0][0])
print(len(predicted_labels))

8088


In [52]:
total_predictions  =0
correct_prediction =0
true_label =0

true_label2=[]
predict_label2=[]
for pos,t in enumerate(predicted_labels):
    if predicted_labels[pos] != 'Not Available':
        if true_labels[pos] == 'true':
            true_label = 'Potential anomalous'
        elif np.isnan(true_labels[pos]):
            true_label = 'Normal'
        
        true_label2.append(true_label)
        predict_label2.append(predicted_labels[pos])
        if true_label == predicted_labels[pos]:
            correct_prediction +=1
        total_predictions +=1
print(correct_prediction, total_predictions)
print(correct_prediction/total_predictions)

5448 7841
0.6948093355439358


In [54]:
print(classification_report(true_label2, predict_label2))

                     precision    recall  f1-score   support

             Normal       0.86      0.77      0.81      6810
Potential anomalous       0.12      0.21      0.15      1031

           accuracy                           0.69      7841
          macro avg       0.49      0.49      0.48      7841
       weighted avg       0.77      0.69      0.73      7841

