In [15]:
import pandas as pd
from river import stream,tree,metrics
import utils
from encoding import prefix_bin
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import os
from tqdm import tqdm
import sliding_window
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import time
# import importlib
# importlib.reload(sliding_window)

In [16]:
# First pass over the CSV: only the number of events is needed (it feeds the
# progress percentage), so count the rows lazily instead of materialising
# every row dict in a list. The iterator is exhausted afterwards and is
# re-created before the actual run.
dataset = stream.iter_csv(
            './bpic17_sampling3.csv',
            )

totallength = sum(1 for _ in dataset)

In [17]:
# Encoding scheme name handed to prefix_bin.update_encoded in the main loop.
enctype = 'Index-base'

# Fresh event iterator for the actual run (the counting pass consumed the
# previous one); the 'resource' column is dropped from every event.
dataset = stream.iter_csv('./bpic17_sampling3.csv', drop=['resource'])

In [18]:
# Key-renaming map passed to utils.dictkey_chg for every event. It is
# currently empty; the disabled entries show the raw-column -> internal-key
# mapping that was used for differently named CSV headers.
key_pair = {
    # 'Case ID':'caseid',
    # 'Activity':'activity',
    # 'Resource':'resource',
    # 'Complete Timestamp':'ts',
}

# Categorical attributes to encode ('resource' is currently excluded).
catatars = ['activity']

# case_dict: caseid -> list of encoded events of cases that are still open.
# training_models: 'detector_<window key>' -> [retrain counter, classifier].
case_dict, training_models = {}, {}

# resultdict: caseid -> encoded events of finished cases.
# acc_dict / prediction_result: reserved containers, not filled in this notebook section.
resultdict, acc_dict, prediction_result = {}, {}, {}

# Counters: finished cases, processed rows, and a grace-period marker.
casecount = rowcounter = graceperiod_finish = 0

# Case ids that the main loop must skip.
finishedcases = set()

In [19]:
# Sliding window used as the training set.
# window_size is the single argument given to sliding_window.training_window;
# presumably the number of finished cases kept in the window -- TODO confirm
# against the sliding_window module.
window_size = 30
training_window = sliding_window.training_window(window_size)


In [20]:
def display_progress(row_counting, total_length, interval=500, finished_cases=None):
    '''
    Print a progress line every `interval` processed rows.
    ----------
    Parameters
    row_counting: int
        Number of rows processed so far
    total_length: int
        Total number of rows in the stream (used for the percentage)
    interval: int
        A line is printed only when row_counting is a multiple of this value
    finished_cases: int or None
        Number of finished cases to report. When None, the notebook-level
        counter `casecount` is used
    ----------
    Return
    None
    '''
    if row_counting % interval != 0:
        return
    if finished_cases is None:
        # Fall back to the global counter maintained by the main loop.
        finished_cases = casecount
    # Guard against an empty stream so the percentage never divides by zero.
    percent = round(row_counting*100/total_length, 2) if total_length else 0.0
    print(percent, '%', 'Case finished: %s'%(finished_cases))

In [21]:
def training_stage(window, training_models):
    '''
    Retrain one classifier per prefix-wise window and register it.
    ----------
    Parameters
    window: class training_window
        Sliding window holding the training data; its prefix_wise_window()
        result is iterated, and for every key element [0] is used as the
        feature matrix and element [1] as the labels
    training_models: dict
        Registry updated in place. Each entry 'detector_<key>' is a list
        [number of times retrained, latest fitted RandomForestClassifier]
    ----------
    Return
    training_models
        The same dict object that was passed in
    '''
    for window_key, window_data in window.prefix_wise_window().items():
        # Fit first, so a failing fit leaves the registry untouched.
        detector = RandomForestClassifier(max_depth=10)
        detector.fit(window_data[0], window_data[1])

        entry = training_models.setdefault('detector_%s'%(window_key), [0, 0])
        entry[0] += 1
        entry[1] = detector
    return training_models

In [22]:
def predict_activity_proba():
    '''
    Placeholder with no implementation yet: takes no arguments, does nothing
    and returns None.
    '''
    return None

In [23]:
def _current_prefix_windows(window):
    '''Return the prefix-wise view of the training window, or {} while the window is still empty.'''
    if len(window.getAllitems()) == 0:
        return {}
    return window.prefix_wise_window()


def _predict_next_activity(event_bin, prefix_windows, models):
    '''
    Predict the next-activity distribution for one encoded event.
    ----------
    Parameters
    event_bin: prefix_bin
        Encoded event; its prefix_length selects window and detector
    prefix_windows: dict
        Result of _current_prefix_windows
    models: dict
        Detector registry filled by training_stage
    ----------
    Return
    (modelid, prediction)
        ('None', 'Not Available') when no window/detector exists for this
        prefix length, otherwise (retrain counter of the detector,
        [predict_proba output, classes_ of the detector])
    '''
    window_key = 'window_%s'%(event_bin.prefix_length)
    detector_key = 'detector_window_%s'%(event_bin.prefix_length)
    if window_key not in prefix_windows or detector_key not in models:
        return 'None', 'Not Available'

    # Align the event's encoded features with the training window's columns.
    feature_matrix = prefix_windows[window_key][0].columns.values
    current_event = utils.readjustment_training(event_bin.encoded, feature_matrix)
    current_event = pd.Series(current_event).to_frame().T

    detector = models[detector_key][1]
    prediction = [detector.predict_proba(current_event), detector.classes_]
    return models[detector_key][0], prediction


start_time = time.time()

for event, _ in dataset:
    display_progress(rowcounter, totallength)
    rowcounter +=1

    # Event stream change dictionary keys
    utils.dictkey_chg(event, key_pair)
    event['ts'] = event['ts'][:-4]

    caseid = event.pop('caseid')

    # Events of an already finished case are ignored. This check has to come
    # before the "new case" check: finished cases are removed from case_dict,
    # so they would otherwise be reopened as brand-new cases.
    if caseid in finishedcases:
        continue

    # Initialize case by prefix length
    case_bin = prefix_bin(caseid, event)
    if caseid not in case_dict:
        case_dict[caseid] = []
        case_bin.set_prefix_length(1)
    else:
        case_bin.set_prefix_length(len(case_dict[caseid])+1)
        case_bin.set_prev_enc(case_dict[caseid][-1])

    # Encode event and cases and add to DB
    ts = case_bin.event['ts']
    case_bin.update_encoded(catattrs=catatars,enctype=enctype)

    # Set current activity as outcome of previous event
    if case_bin.prefix_length != 1:
        case_bin.prev_enc.update_truelabel(event['activity'])

    # First prediction for current event
    prefix_windows = _current_prefix_windows(training_window)
    modelid, prediction = _predict_next_activity(case_bin, prefix_windows, training_models)
    case_bin.update_prediction((modelid, (prediction,ts)))

    # Update training window and finish the case
    if event['activity'] == 'End':
        training_window.update_window({caseid: case_dict[caseid]})
        training_models = training_stage(training_window, training_models)
        resultdict[caseid] = case_dict[caseid]
        case_dict.pop(caseid)
        finishedcases.add(caseid)

        casecount +=1
        print(len(case_dict))

        # The models were just retrained: refresh the prediction of the last
        # event of every still-open case. The window view is computed once
        # here because the window does not change inside this loop
        # (assumes prefix_wise_window() has no side effects -- TODO confirm).
        prefix_windows = _current_prefix_windows(training_window)
        for open_events in case_dict.values():
            last_event = open_events[-1]
            modelid, prediction = _predict_next_activity(last_event, prefix_windows, training_models)
            last_event.update_prediction((modelid, (prediction,ts)))
    else:
        case_dict[caseid].append(case_bin)


end_time = time.time()

0.0 % Case finished: 0
15.61 % Case finished: 0
31.21 % Case finished: 0
73
73
74
79
78
77
78
46.82 % Case finished: 7
79
81
80
80
80
79
78
78
77
62.42 % Case finished: 16
76
75
74
73
72
71
70
70
69
68
67
66
65
64
63
62
61
60
59
58
57
56
55
54
53
52
78.03 % Case finished: 42
51
50
49
48
47
46
45
44
43
42
41
40
39
38
38
37
36
35
34
33
32
31
30
29
28
27
26
25
24
23
22
21
93.63 % Case finished: 74
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
0


In [29]:
# Wall-clock duration of the streaming loop, converted from seconds to minutes.
elapsed_minutes = (end_time - start_time) / 60
print(elapsed_minutes)

33.27091280221939


In [300]:
# Dump the finished cases: resultdict maps each case id to the list of
# prefix_bin objects stored for that case by the main loop.
# NOTE(review): prefix_bin has no readable repr, so this prints only object
# addresses; inspect resultdict2 (built below) for the actual values.
print(resultdict)

{'Application_369105792': [<encoding.prefix_bin object at 0x000002DDCC5CB550>, <encoding.prefix_bin object at 0x000002DDCC5CB250>, <encoding.prefix_bin object at 0x000002DDCC5CBD30>, <encoding.prefix_bin object at 0x000002DDCC2A1310>, <encoding.prefix_bin object at 0x000002DDCC2A1190>, <encoding.prefix_bin object at 0x000002DDCC2A1340>, <encoding.prefix_bin object at 0x000002DDCC273EB0>, <encoding.prefix_bin object at 0x000002DDCC402970>, <encoding.prefix_bin object at 0x000002DDCC402DF0>, <encoding.prefix_bin object at 0x000002DDCC4029D0>, <encoding.prefix_bin object at 0x000002DDCC402520>, <encoding.prefix_bin object at 0x000002DDCC4023A0>, <encoding.prefix_bin object at 0x000002DDCC402790>, <encoding.prefix_bin object at 0x000002DDCC4029A0>, <encoding.prefix_bin object at 0x000002DDCC402220>, <encoding.prefix_bin object at 0x000002DDCE70CE50>, <encoding.prefix_bin object at 0x000002DDCEBD2100>, <encoding.prefix_bin object at 0x000002DDCEBD2070>], 'Application_1754943309': [<encoding

In [25]:
# Plain-data copy of the results (case id -> per-event predictions/labels),
# filled by the next cell.
resultdict2 = dict()

In [26]:
# Flatten every finished case into plain data:
# case id -> {'Event_<prefix length>': [predictions, true label]}.
for case_id, case_events in resultdict.items():
    resultdict2[case_id] = {
        'Event_%s'%(event_bin.prefix_length): [event_bin.predicted, event_bin.true_label]
        for event_bin in case_events
    }


In [27]:
    
# List the case ids for which flattened results were collected.
print(resultdict2.keys())


dict_keys(['Application_369105792', 'Application_1754943309', 'Application_2002564265', 'Application_1674281629', 'Application_1691306052', 'Application_1363980603', 'Application_74089970', 'Application_1931792282', 'Application_1111870538', 'Application_1786874274', 'Application_1892186837', 'Application_1096831814', 'Application_1059184025', 'Application_1251541081', 'Application_1299976284', 'Application_1266995739', 'Application_662386389', 'Application_1878239836', 'Application_546206358', 'Application_1120819670', 'Application_1962860870', 'Application_842511489', 'Application_1363165385', 'Application_1210637698', 'Application_1806387393', 'Application_2118101879', 'Application_167944225', 'Application_1696031128', 'Application_1413308979', 'Application_829714042', 'Application_1428307326', 'Application_821425679', 'Application_323012882', 'Application_1310063638', 'Application_275338795', 'Application_2055987002', 'Application_822724921', 'Application_1436618564', 'Application_

In [85]:
# The top-level `import importlib` is commented out in the import cell, so it
# is imported here to keep this cell runnable on a fresh kernel.
import importlib

# Pick up edits made to sliding_window.py without restarting the kernel.
# NOTE(review): training_window was created before the reload, so it is still
# an instance of the old class -- re-create it if the new code is needed.
importlib.reload(sliding_window)
pw_window = training_window.prefix_wise_window()

27


In [28]:
import pickle

# Persist the flattened results (resultdict2) as a binary pickle file.
with open('result_rf.pkl', 'wb') as result_file:
    pickle.dump(resultdict2, result_file)

In [103]:
# Show the column names of element [0] of the 'window_2' entry, i.e. the
# feature matrix that training_stage fits on for that window.
print(pw_window['window_2'][0].columns.values)

['duration_1' 'cumduration_1' 'activity_1 A_Create Application'
 'resource_1 User_5' 'duration_2' 'cumduration_2'
 'activity_2 W_Complete application' 'resource_2 User_5'
 'resource_1 User_1' 'activity_2 A_Submitted' 'resource_2 User_1'
 'resource_1 User_47' 'resource_2 User_47' 'resource_1 User_3'
 'activity_2 A_Concept' 'resource_2 User_3' 'resource_1 User_28'
 'resource_2 User_28' 'resource_1 User_23' 'resource_2 User_23'
 'resource_1 User_16' 'resource_2 User_16' 'resource_1 User_76'
 'resource_2 User_76' 'resource_1 User_45' 'resource_2 User_45'
 'resource_1 User_37' 'resource_2 User_37' 'resource_1 User_52'
 'resource_2 User_52' 'resource_1 User_4' 'resource_2 User_4'
 'resource_1 User_10' 'resource_2 User_10' 'resource_1 User_51'
 'resource_2 User_51']


In [99]:
# Manually retrain all detectors on the current training window; this also
# increments the retrain counter of every detector entry.
training_models = training_stage(training_window, training_models)

In [68]:
# Inspect a single classifier: its class labels, then the class probabilities
# and the predicted class for the rows of x_train from index label 19 onward.
# NOTE(review): neither `clf` nor `x_train` is defined in any visible cell
# (clf only exists as a local inside training_stage) -- this cell presumably
# relies on leftover kernel state and will fail on Restart & Run All; confirm.
print(clf.classes_)
print(clf.predict_proba(x_train.loc[19:]))
print(clf.predict(x_train.loc[19:]))

['A_Accepted' 'O_Create Offer' 'O_Created' 'W_Complete application']
[[0.06 0.   0.91 0.03]]
['O_Created']


In [62]:
# Score the classifier on x_train / y_train.
# NOTE(review): `clf`, `x_train` and `y_train` are not defined in any visible
# cell. If this is the data the model was fitted on, the score is a training
# accuracy and says nothing about generalisation -- confirm.
print(clf.score(x_train,y_train))

1.0
