In [43]:
import pandas as pd
from river import stream,tree,metrics
import utils
from encoding import prefix_bin
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import os
from tqdm import tqdm
import sliding_window
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import datetime, time
import importlib
importlib.reload(sliding_window)

<module 'sliding_window' from 'C:\\Users\\suhwan\\Desktop\\Project\\coding\\streaming_anomaly_detect\\sliding_window.py'>

In [112]:
dataset = stream.iter_csv(
            './data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv',
            )

totallength = len(list(dataset))

In [113]:
dataset = stream.iter_csv(
            './data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv',
            drop=['noise', 'lifecycle:transition', 'Variant', 'Variant index'],
            )
enctype = 'Index-base'

In [114]:
key_pair = {
'Case ID':'caseid',
'Activity':'activity',
# 'Resource':'resource',
'Complete Timestamp':'ts',
}
catatars= ['activity']#,'resource']

case_dict ={}
training_models ={}

casecount = 0
rowcounter = 0
resultdict ={}
acc_dict ={}
prefix_wise_window = {}
prediction_result = {}
graceperiod_finish=0
finishedcases = set()

In [115]:
# Sliding window for training setting
window_size = 50
retraining_size = 10
training_window = sliding_window.training_window(window_size,retraining_size)


In [116]:
def display_progress(row_counting, total_length, interval=500):
    if rowcounter%interval == 0:
        print(round(rowcounter*100/totallength,2) ,'%', 'Case finished: %s'%(casecount), 'Running cases: %s'%(len(case_dict)))

In [117]:
def training_stage(window, training_models):
    '''
    Manage training stage of streaming anomaly detection
    ----------
    Parameters
    window: class training_window
        Sliding window with training data
    training_models: dict
        Trained detector by prefix stored in. Default is randomforest
    ----------
    Return
    training_models
    '''
    pw_window = window.prefix_wise_window()
    for x in pw_window:
        clf  = RandomForestClassifier(max_depth=10)
        training_x = pw_window[x][0]
        training_y = pw_window[x][1]
        
        clf.fit(pw_window[x][0],pw_window[x][1])
        if 'detector_%s'%(x) not in training_models:
            training_models['detector_%s'%(x)] =[0,0]
        training_models['detector_%s'%(x)][0] += 1
        training_models['detector_%s'%(x)][1] = clf
    return training_models

In [118]:
def predict_activity_proba(last_event):
    '''
    Predict next activity prediction 
    
    Parameters
    ----------
    last_event: case_bin
    
    Return
    ----------
    modelid, prediction
    
    '''
    feature_matrix = prefix_wise_window['window_%s'%(last_event.prefix_length)][0].columns.values
    current_event = utils.readjustment_training(last_event.encoded, feature_matrix)
    current_event = pd.Series(current_event).to_frame().T
    prediction = [training_models['detector_window_%s'%(last_event.prefix_length)][1].predict_proba(current_event), training_models['detector_window_%s'%(last_event.prefix_length)][1].classes_]
    modelid = training_models['detector_window_%s'%(last_event.prefix_length)][0]

    return modelid, prediction

In [119]:
def first_event(case_bin):
    '''
    Generate start event before first event
    '''
    print(case_bin.event['ts'])
    empty_data ={'activity':'Start signal', 'ts':datetime.datetime.strftime(case_bin.event['ts'], '%Y-%m-%d %H:%M:%S')}
    start_event = prefix_bin(case_bin.caseid, empty_data)
    start_event.set_prefix_length(0)
    start_event.update_encoded(catattrs=catatars,enctype=enctype)
    start_event.update_truelabel(case_bin.event['activity'])
    return start_event

In [120]:
print(case_bin)

<encoding.prefix_bin object at 0x000001F8CF2FD700>


In [121]:
start_time = time.time()

for x,y in dataset:
    display_progress(rowcounter, totallength)
    rowcounter +=1
    
    utils.dictkey_chg(x, key_pair)
    # Event stream change dictionary keys
    x['ts'] = x['ts'][:-4]
    
    # Check label possible
    
    # Initialize case by prefix length
    caseid = x['caseid']
    x.pop('caseid')
    
    case_bin = prefix_bin(caseid, x)
    
    if caseid not in list(case_dict.keys()):
        case_dict[caseid] = []
        case_bin.set_prefix_length(1)
        
    elif caseid in finishedcases:
        continue
    
    else:
        case_bin.set_prefix_length(len(case_dict[caseid])+1)
        case_bin.set_prev_enc(case_dict[caseid][-1])
    
    # Encode event and cases and add to DB
    ts = case_bin.event['ts']
    case_bin.update_encoded(catattrs=catatars,enctype=enctype)
    
    # Set current activity as outcome of previous event
    if case_bin.prefix_length != 1:
        case_bin.prev_enc.update_truelabel(x['activity'])

    # First prediction for current event
    
    last_event = case_bin
    modelid = 'None'
    prediction = 'Not Available'

    if len(training_window.getAllitems()) !=0:
        if 'window_%s'%(last_event.prefix_length) in list(prefix_wise_window.keys()) and 'detector_window_%s'%(last_event.prefix_length) in training_models.keys():
            modelid, prediction = predict_activity_proba(last_event)
#             feature_matrix = prefix_wise_window['window_%s'%(last_event.prefix_length)][0].columns.values
#             current_event = utils.readjustment_training(last_event.encoded, feature_matrix)
#             current_event = pd.Series(current_event).to_frame().T
#             prediction = [training_models['detector_window_%s'%(last_event.prefix_length)][1].predict_proba(current_event), training_models['detector_window_%s'%(last_event.prefix_length)][1].classes_]
#             modelid = training_models['detector_window_%s'%(last_event.prefix_length)][0]
    case_bin.update_prediction((modelid, (prediction,ts)))        
            
    # Update training window and finish the case
    if x['activity'] == 'End':
        training_window.update_window({caseid: case_dict[caseid]})        
        if training_window.retraining == training_window.retraining_count:            
            training_models = training_stage(training_window, training_models)
            prefix_wise_window = training_window.prefix_wise_window()
            
        resultdict[caseid] = case_dict[caseid]
        case_dict.pop(caseid)

        casecount +=1
        for x in case_dict:
            last_event = case_dict[x][-1]
            modelid = 'None'
            prediction = 'Not Available'

            if len(training_window.getAllitems()) !=0:
                prefix_wise_window = training_window.prefix_wise_window()
                if 'window_%s'%(last_event.prefix_length) in list(prefix_wise_window.keys()) and 'detector_window_%s'%(last_event.prefix_length) in training_models.keys():
                    modelid, prediction = predict_activity_proba(last_event)

#                     feature_matrix = prefix_wise_window['window_%s'%(last_event.prefix_length)][0].columns.values
#                     current_event = utils.readjustment_training(last_event.encoded, feature_matrix)
#                     current_event = pd.Series(current_event).to_frame().T
#                     prediction = [training_models['detector_window_%s'%(last_event.prefix_length)][1].predict_proba(current_event), training_models['detector_window_%s'%(last_event.prefix_length)][1].classes_]
#                     modelid = training_models['detector_window_%s'%(last_event.prefix_length)][0]
            case_dict[x][-1].update_prediction((modelid, (prediction,ts)))        
        training_window.reset_retraining_count()
    else:
        case_dict[caseid].append(case_bin)


end_time = time.time()

0.0 % Case finished: 0 Running cases: 0
5.8 % Case finished: 29 Running cases: 1
11.61 % Case finished: 60 Running cases: 1
17.41 % Case finished: 92 Running cases: 1
23.22 % Case finished: 121 Running cases: 1
29.02 % Case finished: 148 Running cases: 1
34.83 % Case finished: 180 Running cases: 0
40.63 % Case finished: 211 Running cases: 1
46.44 % Case finished: 242 Running cases: 1
52.24 % Case finished: 273 Running cases: 0
58.05 % Case finished: 301 Running cases: 1
63.85 % Case finished: 334 Running cases: 0
69.65 % Case finished: 365 Running cases: 1
75.46 % Case finished: 395 Running cases: 1
81.26 % Case finished: 427 Running cases: 1
87.07 % Case finished: 457 Running cases: 1
92.87 % Case finished: 490 Running cases: 1
98.68 % Case finished: 518 Running cases: 1


In [122]:
print((end_time-start_time)/60)

3.8872692465782164


In [123]:
df = pd.read_csv('./data/loan_baseline.pnml_noise_0.15_iteration_1_seed_614_simple.csv')
df = df[df['Case ID'] == 484]
print(df.head)

<bound method NDFrame.head of       Case ID                                    Activity  \
7900      484                                       Start   
7901      484      start_event_Loan  application received   
7902      484       Check  application  form completeness   
7903      484                        Check credit history   
7904      484                            Assess loan risk   
7905      484                           Appraise property   
7906      484       Check  application  form completeness   
7907      484                          Assess eligibility   
7908      484  Check if home insurance quote is requested   
7909      484                        Send acceptance pack   
7910      484                  Verify repayment agreement   
7911      484                          Cancel application   
7912      484        end_event_Loan  application canceled   
7913      484                                         End   

           Complete Timestamp      Variant  Variant in

In [150]:
# print(len(resultdict['484']))
prediction_list = []
for pos, t in enumerate(resultdict['484']):
    prediction_correct= 'Normal'

    predictions = list(t.predicted.values())[0][0]
    predictions_proba = predictions[0][0]
    predictions_label = list(predictions[1])
    print(t.event['activity'], list(df['Activity'])[pos+1], predictions_label)

    if t.true_label in predictions_label:
        labelidx = predictions_label.index(t.true_label)
        
        if predictions_proba[labelidx] <0.01:
            prediction_correct = 'Anomalous'
    else:
        prediction_correct = 'Anomalous'

    prediction_list.append(prediction_correct)

print(prediction_list)
list(df['Activity'])[pos+1]

Start start_event_Loan  application received ['Approve application', 'Check  application  form completeness', 'Return application back to applicant', 'Send acceptance pack', 'Send home insurance quote', 'end_event_Loan  application canceled', 'start_event_Loan  application received']
start_event_Loan  application received Check  application  form completeness ['Assess loan risk', 'Check  application  form completeness', 'end_event_Loan  application approved', 'start_event_Loan  application received']
Check  application  form completeness Check credit history ['Appraise property', 'Approve application', 'Check  application  form completeness', 'Check credit history', 'Reject application', 'Return application back to applicant', 'Send home insurance quote', 'end_event_Loan  application canceled', 'start_event_Loan  application received']
Check credit history Assess loan risk ['Appraise property', 'Approve application', 'Assess eligibility', 'Assess loan risk', 'Check  application  form c

'End'

In [149]:
correct = 0
incorrect = 0
total =0
for pos, t in enumerate(prediction_list):
    
    if list(df['noise'])[pos+1] == 'true':
        true_label = 'Anomalous'
        
    elif list(df['noise'])[pos+1] == 'End':
        true_label = 'Normal'
       
    else:
        true_label = 'Normal'
        
    if t == true_label:
        correct +=1
    else:
        print(list(df['Activity'])[pos+1], t, true_label)
        incorrect +=1
    total +=1
print(correct, incorrect, total)

Check credit history Anomalous Normal
Check  application  form completeness Normal Anomalous
end_event_Loan  application canceled Anomalous Normal
10 3 13


In [49]:
resultdict2 ={}

In [50]:
for t in resultdict.keys():
    resultdict2[t] ={}
    for x in resultdict[t]:
        resultdict2[t]['Event_%s'%(x.prefix_length)] =[x.predicted, x.true_label]


In [85]:
importlib.reload(sliding_window)
pw_window = training_window.prefix_wise_window()

27


In [53]:
import pickle

with open('result_rf.pkl', 'wb') as fp:
    pickle.dump(resultdict2, fp)

In [103]:
print(pw_window['window_2'][0].columns.values)

['duration_1' 'cumduration_1' 'activity_1 A_Create Application'
 'resource_1 User_5' 'duration_2' 'cumduration_2'
 'activity_2 W_Complete application' 'resource_2 User_5'
 'resource_1 User_1' 'activity_2 A_Submitted' 'resource_2 User_1'
 'resource_1 User_47' 'resource_2 User_47' 'resource_1 User_3'
 'activity_2 A_Concept' 'resource_2 User_3' 'resource_1 User_28'
 'resource_2 User_28' 'resource_1 User_23' 'resource_2 User_23'
 'resource_1 User_16' 'resource_2 User_16' 'resource_1 User_76'
 'resource_2 User_76' 'resource_1 User_45' 'resource_2 User_45'
 'resource_1 User_37' 'resource_2 User_37' 'resource_1 User_52'
 'resource_2 User_52' 'resource_1 User_4' 'resource_2 User_4'
 'resource_1 User_10' 'resource_2 User_10' 'resource_1 User_51'
 'resource_2 User_51']


In [99]:
training_models = training_stage(training_window, training_models)

In [68]:
print(clf.classes_)
print(clf.predict_proba(x_train.loc[19:]))
print(clf.predict(x_train.loc[19:]))

['A_Accepted' 'O_Create Offer' 'O_Created' 'W_Complete application']
[[0.06 0.   0.91 0.03]]
['O_Created']


In [62]:
print(clf.score(x_train,y_train))

1.0
