In [1]:
import sys
sys.path.insert(1, '../')

from river import stream,tree,metrics
from encoding import prefix_bin
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter

from tqdm import tqdm
import pickle as pkl
import matplotlib.pyplot as plt
import os,json
import datetime
from collections import deque, Counter
from sklearn.model_selection import train_test_split
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

import datetime, time
import utils
import sliding_window
import psutil

import copy
import gzip
from tqdm import tqdm

import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn

import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Torch CUDA setup: release any cached GPU allocations, then choose the device
# string that the rest of the notebook reads through the global `device`.
with torch.no_grad():
    torch.cuda.empty_cache()

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
# device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
def display_progress(row_counting, total_length, interval=10000):
    '''
    Print a progress line every `interval` processed rows.
    ----------
    Parameters
    row_counting: int
        Number of rows (events) processed so far.
    total_length: int
        Total number of rows in the stream; used for the percentage.
    interval: int
        A line is printed only when row_counting is a multiple of this value.
    ----------
    Return
    None. The printed line also reports the notebook globals `casecount`
    (finished cases) and `streaming_db` (cases currently tracked).
    '''
    # Use the arguments instead of the globals `rowcounter` / `totallength`,
    # so the function reports what the caller actually passed in.
    if row_counting % interval == 0:
        print(round(row_counting*100/total_length,2) ,'%', 'Case finished: %s'%(casecount), 'Running cases: %s'%(len(streaming_db)))

In [4]:
class Customdataset():
    '''
    Thin wrapper that turns pandas feature tables into torch tensors.

    `cdataset` is stored untouched; which method to call depends on what it holds:
    - preprocessing():      cdataset is an (x, y) pair, x a DataFrame and y an
                            iterable of labels.
    - test_preprocessing(): cdataset is the feature DataFrame itself.
    '''
    def __init__(self, cdataset):
        self.cdataset = cdataset


    def preprocessing(self):
        '''
        Build training tensors.

        Returns
        x_tensor: float tensor of shape (n_rows, 1, n_columns)
        y_tensor: long tensor holding, for every label, its position in the
                  sorted list of distinct labels
        '''
        self.x_data = self.cdataset[0]
        self.y_data = self.cdataset[1]

        features = self.x_data.to_numpy()
        # Insert a singleton middle axis: (rows, cols) -> (rows, 1, cols).
        features = features.reshape((features.shape[0], 1, features.shape[1]))

        # Map each distinct label to its rank in sorted order.
        label_rank = {label: rank for rank, label in enumerate(sorted(set(self.y_data)))}
        encoded_labels = [label_rank[label] for label in self.y_data]

        x_tensor = torch.tensor(features, dtype=torch.float)
        y_tensor = torch.tensor(encoded_labels, dtype=torch.long)
        return x_tensor, y_tensor


    def test_preprocessing(self):
        '''
        Build the inference tensor.

        Returns
        x_tensor: float tensor of shape (n_rows, 1, n_columns)
        '''
        self.x_data = self.cdataset

        features = self.x_data.to_numpy()
        features = features.reshape((features.shape[0], 1, features.shape[1]))

        return torch.tensor(features, dtype=torch.float)
      

In [5]:
class LSTM_model(nn.Module): # nn.Module inherit
    '''
    Two-layer LSTM followed by small dense heads, scoring one event at a time.

    The network is sized from a sample input: `input_x` must be shaped
    (1, 1, n_features) and `raw_y` is an iterable whose number of distinct
    elements fixes the width of the output layer.

    forward() expects a (1, 1, n_features) tensor and returns a
    (num_case, n_outputs) tensor of sigmoid activations.
    '''

    def __init__(self, input_x, raw_y):
        super(LSTM_model, self).__init__()

        self.input_size = input_x.shape[2]
        self.hidden_size =2* input_x.shape[2]
        self.num_case = 1
        self.num_layers =2

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, num_layers=self.num_layers, dropout=0.25, batch_first =False, bidirectional = False)

        # Fixed random initial states. Registered as buffers (not plain tensors
        # pinned to a global device) so they follow the module through
        # .to()/.cuda()/.double() and always sit next to the LSTM weights.
        self.register_buffer('h0', torch.randn(self.num_layers, 1, self.hidden_size))
        self.register_buffer('c0', torch.randn(self.num_layers, 1, self.hidden_size))

        latent_vector_size =30 * 1
        # linear1 consumes the flattened LSTM output (seq=1, batch=1, hidden).
        self.linear1 = nn.Linear(1 *self.num_case *self.hidden_size, latent_vector_size)
        # linear_h is shared by the final hidden state and the final cell state.
        self.linear_h = nn.Linear(1 *self.num_layers *self.hidden_size, latent_vector_size)
        # linear_o consumes the concatenation of the three latent vectors.
        self.linear_o = nn.Linear(3 * latent_vector_size, 1 *self.num_case * len(set(raw_y)))

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()


    def forward(self, input_x):
        output, (hn,cn) = self.lstm(input_x, (self.h0,self.c0))

        # Flatten every tensor to 1-D before its dense projection.
        output = self.relu(self.linear1(output.reshape(-1)))
        uH = F.leaky_relu(self.linear_h(hn.reshape(-1)))
        uC = F.leaky_relu(self.linear_h(cn.reshape(-1)))

        output = torch.cat((uH ,uC ,output))
        output = self.sigmoid(self.linear_o(output))
        output =output.reshape(self.num_case,-1)

        return output

In [6]:
def training_stage(datafortraining):
    '''
    Train a fresh LSTM classifier on one training window.
    ----------
    Parameters
    datafortraining: tuple (x, y)
        x is the feature DataFrame and y the labels, as expected by
        Customdataset.preprocessing().
    ----------
    Return
    model: LSTM_model
        Newly trained model on the notebook-global `device`, in eval mode.
    ----------
    Side effects
    Appends the wall-clock training duration (seconds) to the global
    `training_time` list.
    '''
    start_time = time.time()

    train_x, train_y = Customdataset(datafortraining).preprocessing()

    # The network is sized from the first sample, shaped (1, 1, n_features).
    x_tensor = torch.unsqueeze(train_x[0], dim=0)
    y_tensor = torch.unsqueeze(train_y[0], dim=0)
    # Honour the notebook's `device` setting instead of forcing CUDA.
    model = LSTM_model(x_tensor, y_tensor).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0003)
    criterion = nn.BCELoss()
    losses = []

    for _ in range(5):
        running_loss = 0
        # One optimisation step per sample.
        for pos, sample in enumerate(train_x):
            x_tensor = torch.unsqueeze(sample, dim=0).to(device)
            y_tensor = torch.tensor([[float(train_y[pos])]], device=device)

            # Gradients must be cleared before every backward pass; otherwise
            # each step would use the sum of all gradients seen so far.
            optimizer.zero_grad()
            output = model(x_tensor)
            sample_loss = criterion(output, y_tensor)
            running_loss += sample_loss.item()

            sample_loss.backward()
            optimizer.step()

        epoch_loss = running_loss / len(train_x)
        losses.append(epoch_loss)

        # Stop early once an epoch is worse than the running mean epoch loss.
        if epoch_loss > np.mean(losses):
            break

    # The returned model is only used for prediction: switch off LSTM dropout.
    model.eval()

    end_time = time.time()
    training_time.append(end_time -start_time)

    return model

In [7]:
def first_event(case_bin):
    '''
    Build the artificial "Start signal" event placed before a case's first event.

    The synthetic event reuses the case id and timestamp of `case_bin`, has
    prefix length 0, and its true label is the activity of `case_bin`.
    Encoding relies on the notebook globals `catatars` and `enctype`.
    '''
    first_ts = case_bin.event['ts']
    print(first_ts)

    placeholder = {
        'activity': 'Start signal',
        'ts': datetime.datetime.strftime(first_ts, '%Y-%m-%d %H:%M:%S'),
    }

    start_event = prefix_bin(case_bin.caseid, placeholder)
    start_event.set_prefix_length(0)
    start_event.update_encoded(catattrs=catatars, enctype=enctype)
    # The label of the start signal is the first real activity.
    start_event.update_truelabel(case_bin.event['activity'])
    return start_event

In [8]:
def update_test_cases(new_case, test_cases, max_size=None):
    '''
    Push a finished case into the sliding test window.
    ----------
    Parameters
    new_case: object
        Case to append (the callers in this notebook pass a list of prefix bins).
    test_cases: collections.deque
        Test window, modified in place.
    max_size: int, optional
        Maximum window length. Defaults to the notebook global `test_size`.
    ----------
    Return
    None
    '''
    if max_size is None:
        # Backward-compatible default: the experiment-wide setting.
        max_size = test_size

    test_cases.append(new_case)
    # Drop the oldest case once the window exceeds its capacity.
    if len(test_cases) > max_size:
        test_cases.popleft()

In [9]:
def predict_proba(model, test_x):
    '''
    Score a single encoded event with a trained model.
    ----------
    Parameters
    model: torch.nn.Module
        Trained network; its first output element is used as the
        positive-class probability.
    test_x: torch.Tensor
        Input tensor for one event.
    ----------
    Return
    y_pred: list
        [[1 - p, p]] where p is the first element of the model output.
    '''
    # Run on whatever device the model lives on instead of forcing CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        test_x = test_x.to(first_param.device)

    # Dropout must be inactive while predicting; restore the mode afterwards
    # so the caller's model state is left untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            test_output = model(test_x)
            test_output = test_output.cpu().detach().reshape(-1).numpy()[0]
    finally:
        model.train(was_training)

    y_pred = [[1-test_output, test_output]]
    return y_pred

In [10]:
def variant_list(window_data):
    '''
    Count how often each activity variant occurs in a window.

    For every item, the keys of `item.encoded` whose value equals 1 and whose
    name contains 'activity' are sorted and stringified; that string is the
    variant identifier.

    Returns a dict {variant string: number of occurrences}.
    '''
    variant_window = {}
    for item in window_data:
        enc = item.encoded
        active_activities = [key for key in enc if enc[key] == 1 and 'activity' in key]
        variant = str(sorted(active_activities))
        variant_window[variant] = variant_window.get(variant, 0) + 1
    return variant_window

In [11]:
def variant_coverage(train_variant, test_variant):
    '''
    Sum the test-window counts of every variant that also occurs in training.

    Both arguments are {variant: count} dicts as produced by variant_list().
    The result is an absolute count, not a ratio; callers normalise it.
    '''
    return sum(
        count
        for variant, count in test_variant.items()
        if variant in train_variant
    )

In [12]:
def label_distribution(test_data, prefix_length=None, window_size=30):
    '''
    Share of 'True'-labelled cases in the test window.
    ----------
    Parameters
    test_data: iterable of cases
        Each case is indexable; the element at position prefix_length-1 must
        expose `true_label`, which must be 'True', 'False' or ''.
    prefix_length: int, optional
        Prefix whose label is inspected. Defaults to the notebook global
        `maximum_prefix`.
    window_size: int
        Denominator of the ratio. Defaults to 30, the value that used to be
        hard-coded here.
    ----------
    Return
    label_dist: float
        (number of 'True' labels) / window_size
    ----------
    Raises
    KeyError when a label other than 'True', 'False' or '' is encountered.
    '''
    if prefix_length is None:
        prefix_length = maximum_prefix

    testing_label = {'True':0, 'False':0, '':0}
    for case in test_data:
        testing_label[case[prefix_length-1].true_label] += 1

    return testing_label['True'] / window_size


In [13]:
def trigger_cd(dataset_label, cd_type):
    '''
    Load the pre-computed list of concept-drift triggers for one dataset.

    Reads '../triggered_cd/<cd_type>/<dataset_label>_CD_list.pkl' and returns
    the unpickled object.

    NOTE(review): pickle.load executes arbitrary code from the file; only use
    it on trigger files generated by this project.
    '''
    with open('../triggered_cd/%s/%s_CD_list.pkl'%(cd_type, dataset_label), 'rb') as handle:
        return pkl.load(handle)

In [15]:
# Experiment driver.
# For every dataset and every configured prefix length:
#   1. stream events until `gp` cases have received an outcome (grace period),
#   2. train an initial LSTM on the training window built from those cases,
#   3. keep streaming the remaining events, re-predicting the sliding test
#      window after every finished case and retraining when the selected
#      trigger fires,
#   4. dump test windows, predictions and training times as gzip pickles.
# for dataset_label in ['bpic2015_1', 'bpic2015_4', 'bpic2015_5', 'traffic_fines_1']:
for dataset_label in ['bpic2011_1', 'bpic2011_3', 'bpic2011_4', 'bpic2015_1', 'bpic2015_2', 'bpic2015_3', 'bpic2015_4', 'bpic2015_5']:
# for dataset_label in ['bpic2012_1', 'bpic2012_2', 'bpic2012_3', 'bpic2017_1', 'bpic2017_2', 'bpic2017_3']:
    # Experiment settings
    label_condition = False
    cd_type = 'prefixtreeCDD'
    # cd_type = 'prefixtreeCDD'
    print(dataset_label)
    
    # Invoke parameters for dataset
    # window_size / test_size: sizes passed to the training window and used for
    # the sliding test deque; gp: number of finished cases in the grace period.
    window_size = 200
    test_size = 30
    gp = 200
    # NOTE(review): training_rebalance is assigned but not read anywhere in this cell.
    training_rebalance = True
    retraining_condition = cd_type
    classifier = 'lstm'
    retraining_trigger = False
    
    # Per-dataset column mapping, categorical attributes and prefix lengths.
    with open('../dataset_parameters.json','r') as json_file:
        parameters = json.load(json_file)[dataset_label]
        print(parameters)
        key_pair = parameters['key_pair']
        catatars = parameters['categorical_attrs']
        maximum_prefixs = parameters['maximum_prefix']
        prefix_range = parameters['prefix']
        
    for maximum_prefix in prefix_range:
        # training_time is appended to by training_stage() through the global name.
        training_time = []
        print(maximum_prefix)
        train_window_dict = {}
        test_window_dict = {}
        cd_list = deque(trigger_cd(dataset_label, cd_type))

        dataset_loc = '../DATA/logs/'+ dataset_label +'.csv'
        # NOTE(review): the bare except hides every failure, not only "already exists".
        try:
            os.makedirs('../result/%s'%(dataset_label))
        except:
            pass
        # First open of the CSV is only used to count rows (consumes the iterator).
        dataset = stream.iter_csv(
                dataset_loc
                )
        totallength = len(list(dataset))

        # Re-open the stream for the actual run, yielding (features, outcome).
        dataset = stream.iter_csv(
                dataset_loc,
                # drop=['Complete Timestamp'],
                target='outcome'
                )
        enctype = 'Index-base'

        # Per-prefix run state.
        streaming_db ={}
        training_windows = sliding_window.training_window(window_size,test_size)
        training_models ={}
        test_cases = deque()
        feature_matrix ={}

        save_model = {}
        casecount = 0
        rowcounter = 0
        finished_db ={}
        running_case = 0
        window_acc_dict = {}
        prediction_result = dict()
        for i in range(1, maximum_prefix+1): prediction_result[i] = {}
        finished_caseid = set()
        usedingrace = set()

        # ---- Phase 1: grace period -------------------------------------
        # Consume events until the number of finished cases reaches gp.
        for x,y in dataset:
    #         display_progress(rowcounter, totallength)
            rowcounter +=1
            # Event stream change dictionary keys
            x = utils.dictkey_chg(x, key_pair)

            # if dataset_label !='bpic15':
            #      x['ts'] = x['ts'][:-4]

            x['outcome'] =y 
            # Initialize case by prefix length        
            caseid = x['caseid']
            outcome = x['outcome']
        #     progress = x['progress']

            x.pop('caseid')
            x.pop('outcome')

        #     x.pop('progress')

            case_bin = prefix_bin(caseid, x)
            case_bin.set_enctype(enctype)

            # New case -> prefix 1; known unfinished case -> next prefix, linked
            # to the previous event's bin. Events of already finished cases get
            # no prefix length assigned here.
            if caseid not in list(streaming_db.keys()):
                case_bin.set_prefix_length(1)    
                streaming_db[caseid] = []
            elif caseid in finished_caseid:
                pass
            else:
                case_bin.set_prefix_length(len(streaming_db[caseid])+1)
                case_bin.set_prev_enc(streaming_db[caseid][-1])

            # Encode event and cases and add to DB
            case_bin.update_truelabel(outcome)   
            case_bin.update_encoded(catattrs=catatars,enctype=enctype)
            ts = case_bin.event['ts']
            streaming_db[caseid].append(case_bin)

            # Set current activity as outcome of previous event
            if case_bin.prefix_length != 1 and case_bin.caseid not in finished_caseid:
                case_bin.prev_enc.update_truelabel(x['activity'])

            # First prediction for current event

            last_event = case_bin
            modelid = 'None'
            prediction = 'Not Available'

            # Detect label appeared case 
            if outcome != '' and caseid not in finished_caseid:
                usedingrace.add(caseid)
                # Propagate the case outcome to every event of the case.
                for i in streaming_db[caseid]:
                    i.update_truelabel(outcome)
                finished_caseid.add(caseid)
                # Adding newly finished case to training set.    
                casecount +=1
                # Grace period to collect feature matrix
                if casecount < gp:
                    case_length = len(streaming_db[caseid])
                    if case_length >= maximum_prefix:
                        if 'prefix_%s'%(maximum_prefix) not in list(feature_matrix.keys()):
                            feature_matrix['prefix_%s'%(maximum_prefix)]=set()
                            # Slot layout: [model, casecount at training time].
                            training_models['prefix_%s'%(maximum_prefix)] = [0,
                                                                       0]
                        feature_list = list(streaming_db[caseid][maximum_prefix-1].encoded.keys())
                        # NOTE(review): the loop variable x shadows the event dict x.
                        for x in feature_list: feature_matrix['prefix_%s'%(maximum_prefix)].add(x) 
                else:
                    # casecount has reached gp: the grace period is over.
                    break

        if casecount not in train_window_dict.keys(): train_window_dict[casecount] = []
        if casecount not in test_window_dict.keys(): test_window_dict[casecount] = []

        # Fill the training window and the test deque with the grace-period
        # cases that are at least maximum_prefix events long.
        for caseid in list(usedingrace):
            case_length = len(streaming_db[caseid])
            if case_length >= maximum_prefix:
                x = streaming_db[caseid][maximum_prefix-1]
                if x.prefix_length != 0:            
                    training_windows.update_window(x)        

                update_test_cases(streaming_db[caseid], test_cases)
        train_window_dict[casecount].append(copy.deepcopy(training_windows.container))

        # Align every training sample to the collected feature set.
        training_x = []
        training_y = []
        for pos, i in enumerate(training_windows.container):
            x_prefix_length = i.prefix_length 
            i.encoded = utils.readjustment_training(i.encoded, feature_matrix['prefix_%s'%(maximum_prefix)])
            training_x.append(i.encoded)
            training_y.append(i.true_label)

        le = LabelEncoder()
        training_y = le.fit_transform(training_y)
        training_x = pd.DataFrame.from_dict(training_x)

        ###
        #Oversampling
        ###
        # NOTE(review): training_y was label-encoded just above, so it presumably
        # holds integer codes and the key 'True' would count 0 -- in that case the
        # SMOTE branches below never run. Confirm and count on the raw labels.
        n_labels = Counter(training_y)['True']
        if n_labels <=2:
            pass
        elif n_labels>2 and n_labels <=5:
            smote = SMOTE(k_neighbors=n_labels-1)
            training_x, training_y = smote.fit_resample(training_x, training_y)
        else:
            smote = SMOTE()
            training_x, training_y = smote.fit_resample(training_x, training_y)

        # x_prefix_length is the prefix length of the last sample iterated above.
        training_models['prefix_%s'%(x_prefix_length)][0] = training_stage((training_x, training_y))
        training_models['prefix_%s'%(x_prefix_length)][1] = casecount

        # Initial predictions for the test window with the freshly trained model.
        prediction_result[maximum_prefix][casecount] = {}
        y_truelist = []
        y_predlist = []
        for case in test_cases:
            if len(case) >= maximum_prefix:
                x = case[maximum_prefix-1]
                if x.prefix_length != 0:            
                    model = training_models['prefix_%s'%(x_prefix_length)][0]
                    current_event = utils.readjustment_training(x.encoded, feature_matrix['prefix_%s'%(maximum_prefix)])
                    current_event = pd.Series(current_event).to_frame().T
                    current_event = Customdataset(current_event).test_preprocessing()
                    y_pred = predict_proba(model, current_event)            
                    y_truelist.append(x.true_label)
                    y_predlist.append(y_pred)
                    test_window_dict[casecount].append(x)

        prediction_result[maximum_prefix][casecount]['y_true'] = y_truelist
        prediction_result[maximum_prefix][casecount]['y_pred'] = y_predlist
        # NOTE(review): the literal 200 matches gp only while gp == 200.
        train_variant = variant_list(train_window_dict[200][0])

        '''
        Streaming event label prediction start.
        - Test and training steps are executed when case finished/ event arrived with label
        '''
        # Every case seen so far is excluded from the second phase.
        for i in streaming_db.keys(): usedingrace.add(i)
        streaming_db ={}
        cdhappend ={}
        for i in range(1, maximum_prefix+1): cdhappend[i] = 0

        ## Start streaming again
        # ---- Phase 2: continue on the same iterator where phase 1 stopped ----
        for x,y in dataset:
            display_progress(rowcounter, totallength)

            rowcounter +=1
            # Event stream change dictionary keys
            x = utils.dictkey_chg(x, key_pair)

            # if dataset_label !='bpic15':
            #     x['ts'] = x['ts'][:-4]

            # Check label possible
            # x = utils.set_label(x)
            x['outcome'] =y 
            # Initialize case by prefix length
            caseid = x['caseid']
            outcome = x['outcome']
            x.pop('caseid')
            x.pop('outcome')

            if caseid not in usedingrace:
                case_bin = prefix_bin(caseid, x)
                case_bin.set_enctype(enctype)

                if caseid not in list(streaming_db.keys()):
                    case_bin.set_prefix_length(1)    
                    streaming_db[caseid] = []
                    running_case +=1
                elif caseid in finished_caseid:
                    pass
                else:
                    case_bin.set_prefix_length(len(streaming_db[caseid])+1)
                    case_bin.set_prev_enc(streaming_db[caseid][-1])

                # Encode event and cases and add to DB
                case_bin.update_truelabel(outcome)   
                case_bin.update_encoded(catattrs=catatars,enctype=enctype)
                ts = case_bin.event['ts']

                # Only the event at the target prefix is aligned to the feature set.
                if case_bin.prefix_length == maximum_prefix:
                    case_bin.encoded = utils.readjustment_training(case_bin.encoded, feature_matrix['prefix_%s'%(case_bin.prefix_length)])
                streaming_db[caseid].append(case_bin)

                # Detect label appeared case 
                if outcome != '' and caseid not in finished_caseid:
                    finished_caseid.add(caseid)

                    # Adding newly finished case to training set.
                    casecount +=1    

                    # Modify encoded attributes of cases with feature matrix
                    case_length = len(streaming_db[caseid])
                    if case_length >= maximum_prefix:

                        streaming_db[caseid][maximum_prefix-1].update_truelabel(outcome)
                        update_test_cases(streaming_db[caseid], test_cases)
                        x = streaming_db[caseid][maximum_prefix-1].encoded
                        prefix_length =streaming_db[caseid][maximum_prefix-1].prefix_length                    

                        # Window statistics used by the 'variant' and 'label' conditions.
                        test_variant = variant_list([i[maximum_prefix-1] for i in test_cases])
                        variant_cover = variant_coverage(train_variant, test_variant)/test_size
                        label_dist = label_distribution(test_cases)
                        training_windows.update_window(streaming_db[caseid][maximum_prefix-1])
                        
                        # Drift-list trigger: retrain when the oldest case in the
                        # test window is the next case id in cd_list.
                        if retraining_condition == 'prodrift' or retraining_condition == 'prefixtreeCDD':
                            if len(cd_list) ==0:
                                retraining_trigger = False
                                pass
                            else:
                                if test_cases[0][0].caseid == cd_list[0]:
                                    retraining_trigger = True
                                    cd_list.popleft()
                                    print('Triggered1')
                                else:
                                    retraining_trigger = False
                                    
                        # NOTE(review): the two branches below only set label_condition,
                        # while retraining is gated on retraining_trigger alone, so the
                        # 'label' and 'variant' conditions appear to have no effect.
                        if retraining_condition == 'label':
                            if label_dist <= 0.1 or label_dist >=0.9:
                                label_condition = True
                                print(label_dist, 'Triggered1')
                            else:
                                label_condition = False

                        elif retraining_condition == 'variant':
                            if variant_cover <=0.5:
                                label_condition = True
                            else:
                                label_condition = False

                        # if label_condition == True and retraining_check == True:
                        if retraining_trigger == True:
                            print('Triggered2')
                            if casecount not in train_window_dict.keys(): train_window_dict[casecount] = []
                            train_window_dict[casecount].append(copy.deepcopy(training_windows.container))                       

                            # Rebuild the feature set from the current training window.
                            x_training = pd.DataFrame.from_dict([i.encoded for i in training_windows.container])
                            for i in x_training.columns.values: x_training[i] = x_training[i].fillna(0)
                            feature_matrix['prefix_%s'%(maximum_prefix)] = x_training.columns.values

                            training_x = []
                            training_y = []
                            for pos, i in enumerate(training_windows.container):
                                x_prefix_length = i.prefix_length 
                                i.encoded = utils.readjustment_training(i.encoded, feature_matrix['prefix_%s'%(maximum_prefix)])
                                training_x.append(i.encoded)
                                training_y.append(i.true_label)

                            training_y = le.fit_transform(training_y)
                            training_x = pd.DataFrame.from_dict(training_x)
                            ###
                            #Oversampling
                            ###
                            # NOTE(review): same concern as in the initial training --
                            # 'True' is looked up in label-encoded values.
                            n_labels = Counter(training_y)['True']

                            if n_labels <=2:
                                pass
                            elif n_labels>2 and n_labels <=5:
                                smote = SMOTE(k_neighbors=n_labels-1)
                                training_x, training_y = smote.fit_resample(training_x, training_y)
                            else:
                                smote = SMOTE()
                                training_x, training_y = smote.fit_resample(training_x, training_y)

                            ###
                            #Model retraining
                            ###
                            # Drop the local reference to the previous model before training anew.
                            del model

                            training_models['prefix_%s'%(x_prefix_length)][0] = training_stage((training_x, training_y))
                            training_models['prefix_%s'%(x_prefix_length)][1] = casecount
                            train_variant = variant_list(train_window_dict[casecount][0])
                            
                            if retraining_condition == 'label':
                                train_window_dict = dict()
                                
    #                         save_model[training_models['prefix_%s'%(x_prefix_length)][1]] = training_models['prefix_%s'%(x_prefix_length)][0]
                            # Only the retraining casecount is recorded; the value is a 0 placeholder.
                            save_model[training_models['prefix_%s'%(x_prefix_length)][1]] = 0

                    y_truelist = []
                    y_predlist = []
                    if casecount ==200:
                        break
                    if casecount not in test_window_dict.keys():
                        test_window_dict[casecount] = []

                    # Re-evaluate the whole test window after every finished case.
                    for case in test_cases:
                        if len(case) >= maximum_prefix:
                            x = case[maximum_prefix-1]
                            if x.prefix_length != 0:
                                length = x.prefix_length
                                current_event = utils.readjustment_training(x.encoded, feature_matrix['prefix_%s'%(maximum_prefix)])
                                current_event = pd.Series(current_event).to_frame().T
                                current_event = Customdataset(current_event).test_preprocessing()
                                model = training_models['prefix_%s'%(x_prefix_length)][0]
                                with torch.no_grad():
                #                     label_classes = training_models['detector_window_%s'%(last_event.prefix_length)][2]
                                    current_event = current_event.cuda()
                                    test_output = model(current_event)
                                    test_output = test_output.cpu().detach().reshape(-1).numpy()[0]
                                    y_pred = [[1-test_output, test_output]]

                                # NOTE(review): this case is appended to y_truelist,
                                # y_predlist and test_window_dict here AND again a few
                                # lines below, so every case seems to be recorded twice
                                # per evaluation -- confirm whether that is intended.
                                y_truelist.append(x.true_label)
                                y_predlist.append(y_pred)
                                test_window_dict[casecount].append(x)
                                # Case ids evaluated for the previous finished case;
                                # c_id is only (re)assigned when casecount != gp.
                                if casecount != gp:
                                    c_id = [x.caseid for x in test_window_dict[casecount-1]]

                                # Reuse the previous prediction when the case was already
                                # in the previous window; otherwise predict again.
                                if x.caseid not in c_id:
                                    model = training_models['prefix_%s'%(x_prefix_length)][0]
                                    y_pred = predict_proba(model, current_event)
                                else:
                                    y_pred = prediction_result[maximum_prefix][casecount-1]['y_pred'][c_id.index(x.caseid)]
                                y_truelist.append(x.true_label)
                                y_predlist.append(y_pred)

                                if casecount not in test_window_dict.keys():
                                    test_window_dict[casecount] = []
                                test_window_dict[casecount].append(x)
                    prediction_result[maximum_prefix][casecount] = {}
                    prediction_result[maximum_prefix][casecount]['y_true'] = y_truelist
                    prediction_result[maximum_prefix][casecount]['y_pred'] = y_predlist
    #                 if 'b1' not in caseid and cdhappend[maximum_prefix] == 0:
    #                     cdhappend[maximum_prefix] = model_update_count

        # ---- Persist results for this dataset / prefix length ----
        # NOTE(review): bare except also hides permission or path errors.
        try:
            os.makedirs('../result/%s/%s/Finished cases/Trigger %s'%(dataset_label, classifier, retraining_condition))
        except:
            pass    

#         with gzip.open('../result/%s/%s/Finished cases/Trigger %s/prefix_%s training window retrained.pkl'%(dataset_label, classifier, retraining_condition, maximum_prefix), 'wb') as f:
#             pkl.dump(train_window_dict, f)
        with gzip.open('../result/%s/%s/Finished cases/Trigger %s/prefix_%s test window retrained.pkl'%(dataset_label, classifier, retraining_condition, maximum_prefix), 'wb') as f:
            pkl.dump(test_window_dict, f)
        with gzip.open('../result/%s/%s/Finished cases/Trigger %s/prefix_%s update retrained.pkl'%(dataset_label, classifier, retraining_condition, maximum_prefix), 'wb') as f:
            pkl.dump(prediction_result, f)
        with gzip.open('../result/%s/%s/Finished cases/Trigger %s/prefix_%s model.pkl'%(dataset_label, classifier, retraining_condition, maximum_prefix), 'wb') as f:
            pkl.dump(save_model, f)
        # NOTE(review): '../result/time/' is not created in this cell -- presumably it must already exist.
        with gzip.open('../result/time/%s_%s_%s_%s_trainingtime.pkl'%(dataset_label, classifier, maximum_prefix, retraining_condition), 'wb') as f:
            pkl.dump(training_time, f)
            
        # training_time.append(end_time - start_time)

bpic2011_1
{'key_pair': {'Case ID': 'caseid', 'Activity': 'activity', 'Resource': 'resource', 'Complete Timestamp': 'ts'}, 'categorical_attrs': ['activity', 'resource'], 'maximum_prefix': 40, 'prefix': [10, 19, 27]}
10
26.62 % Case finished: 227 Running cases: 59
33.27 % Case finished: 257 Running cases: 151
39.92 % Case finished: 301 Running cases: 223
46.58 % Case finished: 358 Running cases: 286
53.23 % Case finished: 394 Running cases: 325
59.88 % Case finished: 438 Running cases: 370
66.54 % Case finished: 483 Running cases: 410
73.19 % Case finished: 545 Running cases: 455
79.85 % Case finished: 581 Running cases: 501
86.5 % Case finished: 612 Running cases: 535
93.15 % Case finished: 683 Running cases: 580
Triggered1
Triggered2
99.81 % Case finished: 817 Running cases: 629
19
26.62 % Case finished: 227 Running cases: 59
33.27 % Case finished: 257 Running cases: 151
39.92 % Case finished: 301 Running cases: 223
46.58 % Case finished: 358 Running cases: 286
53.23 % Case finished: 