# **Resolving Partially Ordered Traces Using Deep Learning** (Seq2Seq)


|                        | BPI 2012| BPI 2014 | Traffic |
|-----------------------:|--------:|---------:|--------:|
| \|A\|                  | 24      |  9       | 11      |
| #Traces                | 13087   | 41353    | 150370  |
| #Events                | 262200  | 369485   | 561470  |
| #Event Sets            | 248205  | 243186   | 549452  |
| #uncertain Seq's       | 14      | 24       | 25      |
| Trace Uncertainty      | 38%     | 93%      |  6%     |
| Event Uncertainty      |  5%     | 40%      |  2%     |
| max(len(unc.seq))      |  4      |  4       |  3      |
| avg(len(unc.seq))      |  2.4    |  2.6     |  2.0    |

### imports and PIP installs

In [None]:
from pm4py.objects.log.importer.xes import importer as xes_importer

In [None]:
import utils

In [None]:
from tqdm import tqdm
from itertools import combinations_with_replacement, product
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn import model_selection
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM

### Loading the logs

In [169]:
#b12_log = xes_importer.apply("./logs/BPI_Challenge_2012.xes")
#b14_log = xes_importer.apply("./logs/BPI_Challenge_2014.xes")
#traffic_log = xes_importer.apply("./logs/traffic_fines.xes")

HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=150370.0, style=P…




In [193]:
# artificial logs
# Only one generated log is loaded at a time; the commented-out lines are the
# alternative logs. The preprocessing cell reads `a_log75`, so switching logs
# means changing the name there as well.

#a_log0   = xes_importer.apply("./logs/generated_logs/1561989897361-4_0.xes")
#a_log25  = xes_importer.apply("./logs/generated_logs/1561989897361-4_25.xes")
#a_log50  = xes_importer.apply("./logs/generated_logs/1561989897361-4_50.xes")
a_log75  = xes_importer.apply("./logs/generated_logs/1561989897313-3_75.xes")
#a_log100 = xes_importer.apply("./logs/generated_logs/1561989906794-495_100.xes")

HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=1000.0, style=Pro…




# **Seq2Seq**
---

In [194]:
# Normalise the loaded log and split it in two parts; judging by the names
# these are the certain (c_log) and uncertain (u_log) traces — confirm in utils.
log = utils.remove_timezones(a_log75)
utils.abstract_time(log, utils.abstract_seconds) # AF
c_log, u_log = utils.split_log(log)

sparse_log = utils.get_sparse_log(log)
sparse_c_log, sparse_u_log = utils.get_sparse_log(c_log), utils.get_sparse_log(u_log)

# Activity alphabet: every distinct "concept:name" in the log.
# Sorted on purpose: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would assign different token
# indices on every run and make a saved model incompatible with rebuilt
# lookup tables.
A = sorted({event["concept:name"] for trace in log for event in trace})
A_set = [[activity] for activity in A]

#log_set   = utils.get_sparse_log_set(log)
#c_log_set = utils.get_sparse_log_set(c_log)
#u_log_set = utils.get_sparse_log_set(u_log)

# Set-based view of the logs (variant for the artificial logs).
log_set   = utils.get_sparse_log_set_artificial(log)    # AF
c_log_set = utils.get_sparse_log_set_artificial(c_log)
u_log_set = utils.get_sparse_log_set_artificial(u_log)

In [195]:
# Length statistics of the uncertain log. The exact definitions live in utils;
# the names suggest longest trace, longest trace in set form, and longest
# uncertain sequence — confirm there.
max_trace_len = utils.longest_trace(u_log)
max_unc_trace_len = utils.longest_trace(u_log_set)
max_seq_len = utils.longest_unc_seq(u_log_set)
k = max_seq_len # longest uncertain sequences
# Vocabularies built from alphabet A and bound k: `unc_seq` is later used as the
# encoder token set and `pos_res` as the decoder token set for Encoding 2 & 3.
unc_seq = utils.possible_uncertain_seq(A, k) 
pos_res = utils.possible_resolutions(A, k)

In [196]:
# Lookup indexed by an uncertain sequence (as a tuple) that yields its admissible
# resolutions; decode_seq uses it to reject predictions that are not possible.
pos_res_for_unc_seq = utils.pos_res_for_unc_seq(unc_seq)

In [197]:
# Begin-of-sequence and end-of-sequence symbols that are wrapped around every
# target trace before it is encoded for the decoder.
BOS = '<'
EOS = '>'

In [198]:
# Cap the uncertain log at 10000 randomly chosen traces: the smallest real log
# (traffic) has roughly that many, and Encoding 2 and 3 become very large
# otherwise.
# The same random selection is applied to `sparse_u_log`, because Encoding 1
# pairs `u_log_set` (inputs) with `sparse_u_log` (targets) position by position;
# shuffling only one of them would pair inputs with the wrong targets.
# Assumes both lists hold one entry per uncertain trace, in the same order.
if len(u_log_set) > 10000:
    keep = list(range(len(u_log_set)))
    shuffle(keep)
    keep = keep[:10000]
    u_log_set = [u_log_set[i] for i in keep]
    sparse_u_log = [sparse_u_log[i] for i in keep]

### Encoding 1

In [None]:
# Encoder inputs: every uncertain trace reversed.
rev_X = [trace[::-1] for trace in u_log_set] 
# Decoder targets: the sparse uncertain traces wrapped in BOS ... EOS.
y = [[BOS] + seq + [EOS] for seq in sparse_u_log]

In [None]:
# Encoding 1 vocabularies: plain activities on the input side, activities plus
# the BOS/EOS markers on the output side.
input_events = A
target_events = sorted(A + [BOS, EOS])
n_enc_tokens, n_dec_tokens = len(input_events), len(target_events)
max_enc_seq_len = max(map(len, rev_X))
max_dec_seq_len = max(map(len, y))

# Summary of the data set dimensions.
for label, value in (
    ('Number of samples:', len(rev_X)),
    ('Number of unique input tokens:', n_enc_tokens),
    ('Number of unique output tokens:', n_dec_tokens),
    ('Max sequence length for inputs:', max_enc_seq_len),
    ('Max sequence length for outputs:', max_dec_seq_len),
):
    print(label, value)

In [None]:
# lookup tables (token <-> index) for Encoding 1
# NOTE(review): here input_events is A, i.e. activity names; if those are plain
# strings, tuple(e_set) yields a tuple of characters — confirm that
# utils.seq2seq_encode builds its lookup keys the same way.
INactivity_to_idx = {tuple(e_set): i for i, e_set in enumerate(input_events)}
INidx_to_activity = {i: tuple(e_set) for i, e_set in enumerate(input_events)}
INactivity_to_idx2 = {e_set: i for i, e_set in enumerate(input_events)}  # for decoding
INidx_to_activity2 = dict(enumerate(input_events))  # for decoding

OUTactivity_to_idx = {tuple(e_set): i for i, e_set in enumerate(target_events)}
OUTidx_to_activity = {i: tuple(e_set) for i, e_set in enumerate(target_events)}
OUTactivity_to_idx2 = {e_set: i for i, e_set in enumerate(target_events)}  # for decoding
OUTidx_to_activity2 = dict(enumerate(target_events))  # for decoding

In [None]:
# Turn the (reversed) inputs and the targets into the three arrays the seq2seq
# model trains on. The last argument (1) selects the encoding variant;
# the tensor layout is defined in utils.seq2seq_encode (presumably one-hot
# along the token axis — confirm there).
encoder_input_data, decoder_input_data, decoder_target_data = utils.seq2seq_encode(rev_X, y, max_enc_seq_len, n_enc_tokens,
                                                                            max_dec_seq_len, n_dec_tokens, INactivity_to_idx,
                                                                            OUTactivity_to_idx, 1)

### Encoding 2 & 3

In [199]:
# Encoder inputs: every uncertain trace (set form) reversed.
rev_X = [trace_set[::-1] for trace_set in u_log_set]
# Decoder targets are built from the very same traces, wrapped in [BOS] ... [EOS]:
# the order found in the log is taken as the correct resolution.
y = [[[BOS]] + seq + [[EOS]] for seq in u_log_set]

In [200]:
# Encoding 2 & 3 vocabularies: uncertain sequences on the input side, possible
# resolutions plus the [BOS]/[EOS] markers on the output side.
input_events = unc_seq
target_events = sorted(pos_res + [[BOS], [EOS]])
n_enc_tokens, n_dec_tokens = len(input_events), len(target_events)
max_enc_seq_len = max(map(len, rev_X))  # TODO: original author marked this line "fix here"
max_dec_seq_len = max(map(len, y))

# Summary of the data set dimensions.
for label, value in (
    ('Number of samples:', len(rev_X)),
    ('Number of unique input tokens:', n_enc_tokens),
    ('Number of unique output tokens:', n_dec_tokens),
    ('Max sequence length for inputs:', max_enc_seq_len),
    ('Max sequence length for outputs:', max_dec_seq_len),
):
    print(label, value)

Number of samples: 494
Number of unique input tokens: 5984
Number of unique output tokens: 88742
Max sequence length for inputs: 11
Max sequence length for outputs: 13


In [201]:
# lookup tables (token <-> index) for Encoding 2 & 3; tokens are stored as
# tuples so they can serve as dictionary keys
unc_seq_to_idx = {tuple(e_set): i for i, e_set in enumerate(input_events)}
idx_to_unc_seq = {i: tuple(e_set) for i, e_set in enumerate(input_events)}

pos_res_to_idx = {tuple(e_set): i for i, e_set in enumerate(target_events)}
idx_to_pos_res = {i: tuple(e_set) for i, e_set in enumerate(target_events)}

Encoding 2

In [129]:
# Encoding 2: build the encoder/decoder arrays with encoding variant 2 (last argument).
# Run either this cell or the Encoding 3 cell — both assign the same three names.
encoder_input_data, decoder_input_data, decoder_target_data = utils.seq2seq_encode(rev_X, y, max_enc_seq_len, n_enc_tokens,
                                                                            max_dec_seq_len, n_dec_tokens, unc_seq_to_idx,
                                                                            pos_res_to_idx, 2)

Encoding 3

In [202]:
# Encoding 3: build the encoder/decoder arrays with encoding variant 3 (last argument).
# Assigns the same three names as the Encoding 2 cell, replacing its result.
encoder_input_data, decoder_input_data, decoder_target_data = utils.seq2seq_encode(rev_X, y, max_enc_seq_len, n_enc_tokens,
                                                                            max_dec_seq_len, n_dec_tokens, unc_seq_to_idx,
                                                                            pos_res_to_idx, 3)

split off the test set (20%)

In [203]:
# Positional 80/20 split: the first 80% of the samples train the model, the
# remainder is held out for testing.
n_samples = len(rev_X)
cut = int(n_samples*0.8)

train_encoder_input_data, test_encoder_input_data = (
    encoder_input_data[:cut], encoder_input_data[cut:])
train_decoder_input_data, test_decoder_input_data = (
    decoder_input_data[:cut], decoder_input_data[cut:])
train_decoder_target_data, test_decoder_target_data = (
    decoder_target_data[:cut], decoder_target_data[cut:])

In [204]:
# Sanity check: show the shapes of the training and test encoder inputs.
train_encoder_input_data.shape, test_encoder_input_data.shape

((395, 11, 5984), (99, 11, 5984))

## Model (Preprocessing, Training, Predictions, Evaluation)

In [205]:
# Training hyperparameters.
batch_size = 64  # batch size for training
epochs = 30 # 50  # number of epochs to train for, 100
latent_dim = 256  # latent dimensionality of the encoding space
# Stop training once the monitored quantity has not improved for 3 epochs.
# NOTE(review): this monitors the training loss, although model.fit is called
# with validation_split=0.2 — consider monitor='val_loss' if early stopping is
# meant to guard against overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [206]:
# --- Encoder: an LSTM over the input tokens; only its final hidden and cell
# state are kept (encoder_outputs is not used afterwards).
encoder_inputs = Input(shape=(None, n_enc_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# --- Decoder: an LSTM initialised with the encoder states, emitting one output
# per time step, followed by a softmax layer over the n_dec_tokens output tokens.
# decoder_lstm and decoder_dense are kept as named layers because the inference
# models built later reuse them.
decoder_inputs = Input(shape=(None, n_dec_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(n_dec_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> decoder output distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], 
              outputs=decoder_outputs)

In [207]:
# Categorical cross-entropy matches the softmax output layer of the decoder.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
#model.summary()

In [208]:
# Train on the training split; 20% of it is held back by Keras as validation
# data, and the early-stopping callback may end training before `epochs`.
history = model.fit([train_encoder_input_data, train_decoder_input_data],
                    train_decoder_target_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2,
                    callbacks=[callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Persist the trained model.
# NOTE(review): the file name mentions "bpi14log", but the log loaded in this
# notebook is the artificial a_log75 — verify the name before running.
model.save('./outputs/seq2seqsets_bpi14log_logorder_01022021.h5') 

### Make Predictions

In [209]:
# Use the trained model to make predictions via inference.
# For that we take the uncertain log as input and assume the order in the log
# to be the correct order.

# Inference mode breakdown:
# 1 encode the input sequence and return the corresponding internal states
# 2 start the decoder with the BOS symbol and the encoder's internal states as input
# 3 append the predicted activity (after looking it up in the lookup table) to the predicted sequence
# 4 repeat the process with the previously predicted activity and the updated internal states as input
# 5 end when EOS was predicted

# Encoder for inference: input sequence -> [hidden state, cell state].
encoder_model = Model(encoder_inputs, encoder_states)
# The decoder states are explicit inputs here so they can be fed back step by step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers (shared weights), now also exposing the
# updated states.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder for inference: (token, h, c) -> (token distribution, new h, new c).
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

In [210]:
# Map the encoded test inputs back to uncertain-sequence tokens
# (mode "enc3" presumably has to match the encoding variant used above — confirm in utils).
decoded_X_test = utils.decode_X(test_encoder_input_data, idx_to_unc_seq, mode="enc3")

In [211]:
# Map the encoded decoder inputs of the test split back to resolution tokens;
# later cells strip the first and last element of each trace before comparing.
decoded_y_test = utils.decode_y(test_decoder_input_data, idx_to_pos_res, mode='enc2+3')

In [None]:
# Show input, target and decoded trace for the first test samples.
# The range is capped at the test set size so that a small test split does not
# raise an IndexError (the original always looped over exactly 20 indices).
for idx in range(min(20, len(decoded_X_test))):
    enc_input_seq = test_encoder_input_data[idx:idx+1]
    # the encoder inputs were reversed for training; undo that for display/decoding
    input_trace = decoded_X_test[idx][::-1]
    # drop the first and last element (BOS / EOS markers)
    target_trace = decoded_y_test[idx][1:-1]
    decoded_trace, n_repredicts = decode_seq(enc_input_seq, input_trace, target_trace,
                                             pos_res_to_idx, idx_to_pos_res)
    print('-'*20)
    print('Input trace  :', input_trace) # log_set[idx]
    print('Target trace :', target_trace)
    print('Decoded trace:', decoded_trace[:])
    print(target_trace==decoded_trace)
    print()

In [213]:
# Evaluate the whole test split: fraction of traces decoded exactly right, and
# fraction of event sets whose most probable token had to be rejected.
total = len(decoded_X_test)
n_event_sets = sum(len(trace) for trace in decoded_X_test)
count = 0
count_highest_prob_is_non_pos_res = 0
# decode_seq appends to these two module-level dicts; reset them for this run
prediction_probabilities = {}
actual_resolution_probabilities = {}

for idx in tqdm(range(total)):
    enc_input_seq = test_encoder_input_data[idx:idx+1]
    target_trace = decoded_y_test[idx][1:-1]
    decoded_trace, n_repredicts = decode_seq(enc_input_seq,
                                             decoded_X_test[idx][::-1],
                                             target_trace,
                                             pos_res_to_idx, idx_to_pos_res)

    count_highest_prob_is_non_pos_res += n_repredicts
    if target_trace == decoded_trace:
        count += 1
print(count / total, count_highest_prob_is_non_pos_res / n_event_sets)

100%|██████████| 99/99 [02:45<00:00,  1.67s/it]

0.7878787878787878 0.6995827538247567





In [214]:
import statistics
# Mean probability the model assigned to each single-activity token it predicted.
for act in A:
    mean_prob = statistics.mean(prediction_probabilities[(act,)])
    print(round(mean_prob, 2))

0.23
0.17
0.15
0.16
0.13
0.12
0.14
0.15
0.2
0.15
0.17
0.18
0.42
0.19
0.43
0.2
0.11


In [215]:
# Mean probability the model assigned to each single-activity token when it was
# the true resolution.
for act in A:
    mean_prob = statistics.mean(actual_resolution_probabilities[(act,)])
    print(round(mean_prob, 2))

0.23
0.17
0.15
0.16
0.13
0.12
0.14
0.15
0.2
0.15
0.17
0.18
0.42
0.19
0.43
0.2
0.11


In [216]:
import pickle

# Persist the collected probabilities. `with` guarantees the files are closed
# even if pickling fails part-way.
with open("Seq2Seq_Enc3_ART1_pred_prob.pkl", "wb") as a_file:
    pickle.dump(prediction_probabilities, a_file)

with open("Seq2Seq_Enc3_ART1_target_prob.pkl", "wb") as a_file:
    pickle.dump(actual_resolution_probabilities, a_file)

In [212]:
# build function to decode predictions
def decode_seq(enc_input_seq, dec_input_seq, dec_y_test, OUTact_to_idx, OUTidx_to_act):
    """Greedily decode one trace, accepting only admissible resolutions.

    The input is encoded once; the decoder is then run one step at a time,
    starting from the BOS token. At every step the most probable output token
    is taken, unless it is not an admissible resolution of the uncertain
    sequence at that position, in which case the next most probable token is
    tried until an admissible one is found.

    Relies on module-level state: ``encoder_model``, ``decoder_model``,
    ``n_dec_tokens``, ``max_dec_seq_len``, ``BOS``, ``EOS`` and
    ``pos_res_for_unc_seq``. As a side effect it appends to the module-level
    dicts ``actual_resolution_probabilities`` (probability given to the true
    token at each step) and ``prediction_probabilities`` (probability of the
    token finally chosen).

    Args:
        enc_input_seq: encoded input for a single trace, as passed to
            ``encoder_model.predict``.
        dec_input_seq: the decoded (non-reversed) input trace; element ``i``
            is the uncertain sequence at step ``i``.
        dec_y_test: the true resolution for every step; must have at least as
            many elements as ``dec_input_seq``.
        OUTact_to_idx: mapping from output token (tuple) to index.
        OUTidx_to_act: mapping from index to output token (tuple).

    Returns:
        ``(decoded_trace, n_repredicts)``: the list of predicted tokens (each a
        list) and the number of steps at which the most probable token had to
        be rejected.

    Raises:
        ValueError: if no admissible token exists in the output vocabulary for
            some step (the original code looped forever in that case).
    """
    # encode the input sequence to get the internal state vectors
    states_value = encoder_model.predict(enc_input_seq)

    # generate empty target sequence of len 1 with only the start character
    target_seq = np.zeros((1, 1, n_dec_tokens))
    target_seq[0, 0, OUTact_to_idx[tuple([BOS])]] = 1.0

    # output seq loop
    stop_cond = False
    decoded_trace = []
    num_dec_events = 0
    n_repredicts = 0
    while not stop_cond:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # view on the distribution of the last time step; masking below edits it in place
        token_probs = output_tokens[0, -1, :]

        # record the probability assigned to the true resolution (before any masking)
        truth = dec_y_test[num_dec_events]
        truth_prob = token_probs[OUTact_to_idx[tuple(truth)]]
        actual_resolution_probabilities.setdefault(tuple(truth), []).append(truth_prob)

        # sample token and add corresponding activity to the decoded trace
        prob = np.amax(token_probs)
        sampled_token_index = np.argmax(token_probs)
        sampled_activity = list(OUTidx_to_act[sampled_token_index])

        # check if the sampled activity is actually a possible resolution for that case
        allowed = pos_res_for_unc_seq[tuple(dec_input_seq[num_dec_events])]
        if sampled_activity not in allowed:
            # counted once per step, however many candidates get rejected
            n_repredicts += 1
            while sampled_activity not in allowed:
                # Mask the rejected token with a negative value and take the next
                # best one. Probabilities are >= 0, so -1.0 (unlike 0.0) can never
                # be selected again, even when the remaining probabilities are 0.
                token_probs[sampled_token_index] = -1.0

                prob = np.amax(token_probs)
                if prob < 0.0:
                    raise ValueError(
                        "no admissible resolution in the output vocabulary for "
                        "position %d" % num_dec_events)
                sampled_token_index = np.argmax(token_probs)
                sampled_activity = list(OUTidx_to_act[sampled_token_index])

        prediction_probabilities.setdefault(tuple(sampled_activity), []).append(prob)
        decoded_trace.append(sampled_activity)
        num_dec_events += 1

        # check for stop condition: predicting EOS, hitting max length, or having
        # resolved every position of the input. sampled_activity is a list, so it
        # is compared with a list (comparing it with a tuple is always False).
        if (sampled_activity == [EOS] or len(decoded_trace) > max_dec_seq_len or
                num_dec_events >= len(dec_input_seq)):
            stop_cond = True

        # update the target sequence (len 1) to resemble the last predicted event
        target_seq = np.zeros((1, 1, n_dec_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # update states
        states_value = [h, c]

    return decoded_trace, n_repredicts