In [62]:
import pandas as pd
from river import stream,tree,metrics
import utils
from encoding import prefix_bin
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import os
from tqdm import tqdm
import sliding_window
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier
import datetime, time
import importlib
importlib.reload(sliding_window)

<module 'sliding_window' from 'C:\\Users\\suhwan\\Desktop\\Project\\coding\\streaming_anomaly_detect\\sliding_window.py'>

In [87]:
# First pass over the event log: count its rows so that progress can be
# reported as a percentage while the stream is replayed.
dataset = stream.iter_csv(
            './data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv'
            )

# Count lazily instead of materialising every row in a list. This pass
# exhausts `dataset`, so the stream has to be opened again before replaying.
totallength = sum(1 for _ in dataset)

In [88]:
# Open the event log for the actual replay, leaving out the columns that are
# not used as event attributes (this includes the 'noise' column).
columns_to_drop = ['noise', 'lifecycle:transition', 'Variant', 'Variant index']
dataset = stream.iter_csv(
    './data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv',
    drop=columns_to_drop,
)

# Encoding scheme name passed to update_encoded() for every event.
enctype = 'Index-base'

In [89]:
# --- Column mapping and encoding configuration ------------------------------
# CSV column name -> short key used inside each event dictionary.
key_pair = {
    'Case ID': 'caseid',
    'Activity': 'activity',
    # 'Resource': 'resource',
    'Complete Timestamp': 'ts',
}
# Categorical attributes handed to the encoder ('resource' is disabled).
catatars = ['activity']

# --- Streaming state ---------------------------------------------------------
case_dict = {}            # running cases: caseid -> list of events seen so far
resultdict = {}           # finished cases: caseid -> list of events
finishedcases = set()     # ids of cases that must not be processed again
training_models = {}      # 'detector_<window name>' -> [training count, classifier]
prefix_wise_window = {}   # latest prefix-wise view of the training window

# --- Counters ----------------------------------------------------------------
casecount = 0             # number of finished cases
rowcounter = 0            # number of rows read from the stream
graceperiod_finish = 0

# --- Containers that the cells shown here never read -------------------------
acc_dict = {}
prediction_result = {}

In [90]:
# Sliding window for training setting
# NOTE(review): presumably window_size is the number of finished cases kept
# for training and retraining_size the number of newly finished cases between
# two retrainings -- confirm against sliding_window.training_window.
window_size = 50
retraining_size = 10
# Finished cases are pushed into this window by the replay loop, and the
# detectors are retrained from it.
training_window = sliding_window.training_window(window_size,retraining_size)


In [91]:
def display_progress(row_counting, total_length, interval=500):
    '''
    Print a progress line every `interval` processed rows.

    Parameters
    ----------
    row_counting: int
        Number of rows processed so far.
    total_length: int
        Total number of rows in the stream (used for the percentage).
    interval: int
        A line is printed only when row_counting is a multiple of this value.

    The line also reports the module-level `casecount` (finished cases) and
    the size of `case_dict` (running cases).
    '''
    if row_counting % interval != 0:
        return
    # Use the arguments rather than the notebook globals, and avoid a
    # division by zero on an empty stream.
    percent = round(row_counting*100/total_length,2) if total_length else 0.0
    print(percent ,'%', 'Case finished: %s'%(casecount), 'Running cases: %s'%(len(case_dict)))

In [92]:
def training_stage(window, training_models):
    '''
    Retrain one detector per entry of the prefix-wise training window.

    Parameters
    ----------
    window: class training_window
        Sliding window with training data. Its prefix_wise_window() result is
        iterated by key; item 0 of each value is used as the feature data and
        item 1 as the labels.
    training_models: dict
        Maps 'detector_<key>' to [number of trainings, latest fitted
        RandomForestClassifier]. Updated in place.

    Return
    ----------
    training_models
        The same dict that was passed in.
    '''
    per_prefix = window.prefix_wise_window()
    for window_name in per_prefix:
        features = per_prefix[window_name][0]
        labels = per_prefix[window_name][1]

        detector = RandomForestClassifier()
        detector.fit(features, labels)

        # First slot counts how often this detector was trained, second
        # slot always holds the most recent model.
        entry = training_models.setdefault('detector_%s'%(window_name), [0,0])
        entry[0] += 1
        entry[1] = detector
    return training_models

In [93]:
def predict_activity_proba(last_event):
    '''
    Predict the next-activity probabilities for an event.

    Uses the module-level `prefix_wise_window` and `training_models` entries
    that belong to the prefix length of the event.

    Parameters
    ----------
    last_event: case_bin
        Event to predict for; its `prefix_length` and `encoded` attributes
        are read.

    Return
    ----------
    modelid, prediction
        modelid is the training counter stored with the detector;
        prediction is [predict_proba output, detector classes_].
    '''
    prefix_len = last_event.prefix_length

    # Align the encoded event with the columns of the training data.
    training_features = prefix_wise_window['window_%s'%(prefix_len)][0]
    aligned_event = utils.readjustment_training(last_event.encoded, training_features.columns.values)
    event_frame = pd.Series(aligned_event).to_frame().T

    model_entry = training_models['detector_window_%s'%(prefix_len)]
    detector = model_entry[1]
    prediction = [detector.predict_proba(event_frame), detector.classes_]
    modelid = model_entry[0]

    return modelid, prediction

In [94]:
def first_event(case_bin):
    '''
    Generate start event before first event

    Builds an artificial 'Start signal' event with prefix length 0 for the
    case of `case_bin`, carrying the same timestamp, and labels it with the
    activity of `case_bin` as its true outcome.

    Parameters
    ----------
    case_bin: prefix_bin
        First real event of a case; its `caseid` and `event` are read.
        event['ts'] must support strftime (e.g. a datetime).

    Return
    ----------
    start_event: prefix_bin
    '''
    # The debugging print of the timestamp was removed: it would emit one
    # line per case without adding information.
    timestamp = case_bin.event['ts'].strftime('%Y-%m-%d %H:%M:%S')
    empty_data ={'activity':'Start signal', 'ts':timestamp}
    start_event = prefix_bin(case_bin.caseid, empty_data)
    start_event.set_prefix_length(0)
    start_event.update_encoded(catattrs=catatars,enctype=enctype)
    start_event.update_truelabel(case_bin.event['activity'])
    return start_event

In [95]:
def _predict_if_possible(event):
    '''
    Return (modelid, prediction) for `event`.

    Falls back to the placeholders ('None', 'Not Available') when the
    training window is still empty or no detector has been trained for the
    prefix length of the event.
    '''
    if len(training_window.getAllitems()) != 0:
        window_key = 'window_%s'%(event.prefix_length)
        if window_key in prefix_wise_window.keys() and 'detector_%s'%(window_key) in training_models.keys():
            return predict_activity_proba(event)
    return 'None', 'Not Available'


start_time = time.time()

for x,y in dataset:
    display_progress(rowcounter, totallength)
    rowcounter +=1

    # Rename the CSV columns to the short event keys defined in key_pair.
    utils.dictkey_chg(x, key_pair)
    # Drop the last four characters of the timestamp.
    # NOTE(review): presumably a fractional-seconds suffix -- confirm against the data.
    x['ts'] = x['ts'][:-4]

    # Initialize case by prefix length
    caseid = x.pop('caseid')

    case_bin = prefix_bin(caseid, x)

    if caseid in finishedcases:
        # Late event of a case that has already ended. This has to be tested
        # first: a finished case is no longer in case_dict and would
        # otherwise be restarted as a new case, overwriting its result.
        continue

    elif caseid not in case_dict:
        case_dict[caseid] = []
        case_bin.set_prefix_length(1)

    else:
        case_bin.set_prefix_length(len(case_dict[caseid])+1)
        case_bin.set_prev_enc(case_dict[caseid][-1])

    # Encode event and cases and add to DB
    ts = case_bin.event['ts']
    case_bin.update_encoded(catattrs=catatars,enctype=enctype)

    # Set current activity as outcome of previous event
    if case_bin.prefix_length != 1:
        case_bin.prev_enc.update_truelabel(x['activity'])

    # First prediction for current event
    modelid, prediction = _predict_if_possible(case_bin)
    case_bin.update_prediction((modelid, (prediction,ts)))

    # Update training window and finish the case
    if x['activity'] == 'End':
        # The 'End' event itself is not stored with the case.
        training_window.update_window({caseid: case_dict[caseid]})
        if training_window.retraining == training_window.retraining_count:
            training_models = training_stage(training_window, training_models)
            prefix_wise_window = training_window.prefix_wise_window()

        resultdict[caseid] = case_dict.pop(caseid)
        finishedcases.add(caseid)
        casecount +=1

        # Refresh the prediction of the last event of every running case.
        for running_caseid in case_dict:
            last_event = case_dict[running_caseid][-1]

            if len(training_window.getAllitems()) !=0:
                # NOTE(review): this replaces the feature columns with those of
                # the current window even when no retraining happened above;
                # confirm they still match what the detectors were fitted on.
                prefix_wise_window = training_window.prefix_wise_window()
            modelid, prediction = _predict_if_possible(last_event)
            # The timestamp recorded here is that of the 'End' event that
            # triggered the refresh.
            last_event.update_prediction((modelid, (prediction,ts)))
        training_window.reset_retraining_count()
    else:
        case_dict[caseid].append(case_bin)


end_time = time.time()

0.0 % Case finished: 0 Running cases: 0
5.8 % Case finished: 29 Running cases: 1
11.61 % Case finished: 60 Running cases: 1
17.41 % Case finished: 92 Running cases: 1
23.22 % Case finished: 121 Running cases: 1
29.02 % Case finished: 148 Running cases: 1
34.83 % Case finished: 180 Running cases: 0
40.63 % Case finished: 211 Running cases: 1
46.44 % Case finished: 242 Running cases: 1
52.24 % Case finished: 273 Running cases: 0
58.05 % Case finished: 301 Running cases: 1
63.85 % Case finished: 334 Running cases: 0
69.65 % Case finished: 365 Running cases: 1
75.46 % Case finished: 395 Running cases: 1
81.26 % Case finished: 427 Running cases: 1
87.07 % Case finished: 457 Running cases: 1
92.87 % Case finished: 490 Running cases: 1
98.68 % Case finished: 518 Running cases: 1


In [96]:
# Wall-clock duration of the replay loop, in minutes.
elapsed_minutes = (end_time - start_time) / 60
print(elapsed_minutes)

3.6343265891075136


In [97]:
# Reload the complete event log, including the 'noise' column that was dropped
# from the stream; the evaluation below reads its ground-truth labels from it.
original_df = pd.read_csv('./data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv')

In [98]:
# An event is flagged as anomalous when the predicted probability of the
# activity that actually followed is below this value.
anomaly_threshold = 0.15

for_confusion_matrix = {}

global_true =[]
global_pred = []
# Number of events that had no prediction available (excluded from the report).
counting_normal = 0
for caseid in list(resultdict.keys()):

    for_confusion_matrix[int(caseid)] =[]

    prediction_list = []

    df = original_df[original_df['Case ID'] == int(caseid)].reset_index(drop=True)
    for t in resultdict[caseid]:
        # First stored prediction of the event: either the placeholder string
        # or [probability matrix, class labels].
        predictions = list(t.predicted.values())[0][0]

        # Test for the placeholder before indexing into the prediction; the
        # indexing only happened to work on the string.
        if isinstance(predictions, str) and predictions == 'Not Available':
            prediction_label = 'Not Available'
        else:
            predictions_proba = predictions[0][0]
            predictions_value = list(predictions[1])

            prediction_label = 'Normal'
            if t.true_label in predictions_value:
                labelidx = predictions_value.index(t.true_label)
                if predictions_proba[labelidx] < anomaly_threshold:
                    prediction_label = 'Anomalous'
            else:
                # The detector has never seen this activity at this prefix.
                prediction_label = 'Anomalous'

        # The prediction made at the last stored event targets 'End', which
        # has no ground-truth label below, so it is left out.
        if t.true_label != 'End':
            prediction_list.append(prediction_label)

    # Ground truth from the 'noise' column; the rows marked 'Start' / 'End'
    # are skipped.
    true_label_list = []
    labellist = list(df['noise'])
    for noise_flag in labellist:
        if noise_flag == 'Start' or noise_flag == 'End':
            continue
        true_label_list.append('Anomalous' if noise_flag == 'true' else 'Normal')

    # Pair predictions and ground truth by position.
    for pos, p in enumerate(prediction_list):
        if p =='Not Available':
            counting_normal +=1
            continue
        global_pred.append(p)
        global_true.append(true_label_list[pos])

    # Sanity check: both lengths should be equal for every case.
    print(caseid, len(true_label_list), len(prediction_list))

0 9 9
1 12 12
2 10 10
3 14 14
4 17 17
5 15 15
6 11 11
7 14 14
8 19 19
9 14 14
10 9 9
11 14 14
12 12 12
13 8 8
14 18 18
15 15 15
16 18 18
17 19 19
18 27 27
19 32 32
20 12 12
21 10 10
22 9 9
23 19 19
24 13 13
25 17 17
26 14 14
27 9 9
28 22 22
29 35 35
30 26 26
31 8 8
32 10 10
33 9 9
34 8 8
35 12 12
36 14 14
37 13 13
38 11 11
39 17 17
40 10 10
41 15 15
42 12 12
43 13 13
44 10 10
45 13 13
46 19 19
47 9 9
48 15 15
49 15 15
50 22 22
51 23 23
52 11 11
53 12 12
54 11 11
55 20 20
56 16 16
57 14 14
58 9 9
59 9 9
60 17 17
61 12 12
62 20 20
63 25 25
64 9 9
65 12 12
66 11 11
67 15 15
68 8 8
69 24 24
70 11 11
71 12 12
72 8 8
73 13 13
74 27 27
75 16 16
76 11 11
77 11 11
78 8 8
79 9 9
80 14 14
81 12 12
82 15 15
83 14 14
84 11 11
85 12 12
86 19 19
87 12 12
88 14 14
89 8 8
90 15 15
91 13 13
92 23 23
93 16 16
94 15 15
95 8 8
96 8 8
97 28 28
98 12 12
99 11 11
100 8 8
101 20 20
102 16 16
103 8 8
104 18 18
105 15 15
106 19 19
107 20 20
108 21 21
109 9 9
110 13 13
111 13 13
112 25 25
113 17 17
114 12 12
115 

In [99]:
# Per-class precision / recall / F1 over all events that received a prediction.
matrix = classification_report(global_true, global_pred)

# Both lists must have the same length.
print(len(global_true), len(global_pred))

# Number of events skipped because no prediction was available.
print(counting_normal)
print(matrix)

7376 7376
186
              precision    recall  f1-score   support

   Anomalous       0.27      0.94      0.42      1036
      Normal       0.98      0.59      0.74      6340

    accuracy                           0.64      7376
   macro avg       0.63      0.77      0.58      7376
weighted avg       0.88      0.64      0.70      7376



In [56]:
# Label every stored event of case '484' from its first stored prediction.
# NOTE(review): this cell uses a 0.01 probability cut-off whereas the global
# evaluation uses 0.15, and it does not treat the 'Not Available' placeholder
# separately -- confirm both are intended.
prediction_list = []
for event in resultdict['484']:
    first_prediction = list(event.predicted.values())[0][0]
    class_probabilities = first_prediction[0][0]
    known_activities = list(first_prediction[1])

    # Anomalous when the observed next activity is unknown to the detector
    # or was predicted with a probability below the cut-off.
    is_anomalous = (
        event.true_label not in known_activities
        or class_probabilities[known_activities.index(event.true_label)] < 0.01
    )
    prediction_list.append('Anomalous' if is_anomalous else 'Normal')

print(prediction_list)
print(len(prediction_list))

['Normal', 'Normal', 'Anomalous', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Anomalous', 'Anomalous', 'Normal']
13


In [57]:
# Compare the labels in prediction_list with the ground truth found in df.
# NOTE(review): df is not created in this cell; it is whatever case the
# evaluation loop processed last. Confirm that it is the same case that
# prediction_list was built from.

# Build the column lists once instead of on every loop iteration.
noise_flags = list(df['noise'])
activities = list(df['Activity'])

correct = 0
incorrect = 0
total =0
for pos, t in enumerate(prediction_list):
    # Prediction `pos` is compared with row pos+1 of df.
    # Every noise value other than 'true' (including 'End') counts as normal.
    true_label = 'Anomalous' if noise_flags[pos+1] == 'true' else 'Normal'

    if t == true_label:
        correct +=1
    else:
        print(activities[pos+1], t, true_label, pos)
        incorrect +=1
    total +=1
print(correct, incorrect, total)

Check credit history Anomalous Normal 2
Check  application  form completeness Normal Anomalous 5
Cancel application Anomalous Normal 10
end_event_Loan  application canceled Anomalous Normal 11
9 4 13


In [49]:
# Plain-data summary of the results: caseid -> {event name -> [predictions, true label]}.
resultdict2 = dict()

In [50]:
# For every finished case keep the stored predictions and the true label of
# each event, keyed by 'Event_<prefix length>'.
for finished_caseid in resultdict.keys():
    resultdict2[finished_caseid] = {
        'Event_%s'%(event.prefix_length): [event.predicted, event.true_label]
        for event in resultdict[finished_caseid]
    }


In [53]:
import pickle

# Persist the per-case summary to disk.
with open('result_rf.pkl', 'wb') as out_file:
    pickle.dump(resultdict2, out_file)