In [5]:
import keras
import pickle
import random
import numpy as np
import pandas as pd
import re
import multiprocessing

from tqdm import *
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input, Reshape, concatenate, Flatten, Activation, LSTM
from keras.utils  import multi_gpu_model

In [2]:
# Show the GPU devices the Keras TensorFlow backend reports as available.
# NOTE(review): `_get_available_gpus` is an underscore-prefixed (private) helper of the
# backend module and may not exist in other Keras versions — confirm before upgrading.
keras.backend.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:1']

In [6]:
##### CONFIGURATION SETUP ####

# Source event log; the two pickled artefacts are looked up next to it.
data_path = "../logs/bpic2011.xes"
traces_finalpath = data_path.replace(".xes", "_traces_encoded.pickled")
traces_dictionarypath = data_path.replace(".xes", "_dictionaries.pickled")
# Number of SP2 / PFS feature columns in every trace; used below to compute column offsets.
n_sp2_features = 624
n_pfs_features = 25

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load files that were produced by a trusted preprocessing run.
# The context managers make sure both file handles are closed again.
with open(traces_finalpath, "rb") as traces_file:
    traces = pickle.load(traces_file)
with open(traces_dictionarypath, "rb") as dictionary_file:
    feature_dict = pickle.load(dictionary_file)

### CONFIGURATION SETUP END ###

In [7]:
# Shuffle the complete traces and derive the 80/20 training/test separation index.
# A dedicated, seeded generator makes the split reproducible after "Restart & Run All"
# and leaves the state of the global `random` module untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(traces)
sep_idx = int(0.8*len(traces))

# Extract the feature indices.
# Data is organized like this: ordinal features | categorical features | SP2 features | PFS features | TARGET features
# The offsets are needed because each of these feature groups gets its own input layer.
feature_names  = traces[0].columns
trace_columns = [bool(re.match('^TARGET$', name)) for name in feature_names]
target_col_start_index = trace_columns.index(True)  # raises ValueError if there is no TARGET column

# Walk backwards from the TARGET column through the fixed-width groups.
categorical_feature_names = feature_dict.keys()
pfs_col_start_index = target_col_start_index - n_pfs_features
sp2_col_start_index = pfs_col_start_index - n_sp2_features
cat_col_start_index = sp2_col_start_index - len(categorical_feature_names)

# A negative offset would silently slice from the end of the column index instead of failing.
assert cat_col_start_index >= 0, "configured feature group sizes exceed the number of columns before TARGET"

ordinal_feature_names     = feature_names[0:cat_col_start_index]

In [9]:
def generate_input_name(var_name):
    """Build the input-layer name that belongs to a feature.

    Every character of ``var_name`` that is not alphanumeric is dropped and
    the remainder is prefixed with ``input_``.
    """
    alnum_only = ''.join(filter(str.isalnum, var_name))
    return "input_{0}".format(alnum_only)

In [6]:
# TODO: normalize
# X = X / float(n_vocab)
def wrapped__create_learning_dicts_from_trace(p):
    """Call ``create_learning_dicts_from_trace`` with the sequence ``p`` unpacked as its arguments.

    This adapter lets the conversion be driven by an API that passes one
    packed argument per work item.
    """
    converted = create_learning_dicts_from_trace(*p)
    return converted
# reshape X to be [samples, time steps, features]
# How to understand keras feature shape requirements: https://github.com/keras-team/keras/issues/2045
def create_learning_dicts_from_trace(t, sp2_col_start_index, n_sp2_features, pfs_col_start_index, n_pfs_features, target_col_start_index, feature_names, n_classes=None):
    """Convert one trace into per-row model inputs and one-hot encoded targets.

    Args:
        t: trace accessed through ``len``/``.iloc`` (one row per step).
        sp2_col_start_index: position of the first SP2 column; every column
            before it is turned into its own single-value input.
        n_sp2_features: number of SP2 columns (width of the SP2 input).
        pfs_col_start_index: position of the first PFS column.
        n_pfs_features: number of PFS columns (width of the PFS input).
        target_col_start_index: position of the first target column.
        feature_names: column names, used to derive the input names.
        n_classes: optional width of the one-hot target encoding. With the
            default ``None`` the width is inferred from the values of this
            trace alone (original behaviour), so it can differ between traces;
            pass the size of the target vocabulary to get a fixed width.

    Returns:
        dict with ``'x'``: a list holding one dict per row that maps input
        names to float32 arrays, and ``'y'``: the one-hot encoded targets.
    """
    # Input names do not depend on the row, so build them once instead of once per row.
    single_value_inputs = [
        (col_idx, generate_input_name(col))
        for col_idx, col in enumerate(feature_names[:sp2_col_start_index])
    ]
    sp2_input_name = generate_input_name("sp2")
    pfs_input_name = generate_input_name("pfs")

    t_dict = {'x': [], 'y': []}
    # map every single-step batch in a dictionary whose keys correspond to input layer names
    for i in range(len(t)):
        batch_dict = {}

        # all ordinal and categorical features: one (1, 1) array each
        for col_idx, input_name in single_value_inputs:
            batch_dict[input_name] = np.array(t.iloc[i, col_idx], dtype=np.float32).reshape([-1,1])

        # SP2 and PFS are handled separately: each group is passed as one vector per row
        batch_dict[sp2_input_name] = np.asarray(t.iloc[i, sp2_col_start_index:pfs_col_start_index], dtype=np.float32).reshape([-1,n_sp2_features])
        batch_dict[pfs_input_name] = np.asarray(t.iloc[i, pfs_col_start_index:target_col_start_index], dtype=np.float32).reshape([-1,n_pfs_features])

        t_dict['x'].append(batch_dict)

    targets = t.iloc[:, target_col_start_index:].values.reshape([-1,1,1])
    t_dict['y'] = keras.utils.np_utils.to_categorical(targets, num_classes=n_classes)
    return t_dict

# Convert all traces in parallel worker processes, one work item per trace.
ncores = multiprocessing.cpu_count()
# Each work item carries the trace plus the column layout that the conversion function needs.
traces_for_input_dicts = [ (t, sp2_col_start_index, n_sp2_features, pfs_col_start_index, n_pfs_features, target_col_start_index, feature_names) for t in traces ]
train_items = traces_for_input_dicts[:sep_idx]
test_items  = traces_for_input_dicts[sep_idx:]

# The context manager releases the worker processes once both conversions are done
# (or as soon as one of them fails). A single progress bar per conversion replaces the
# former nested bars, and results are no longer bound to `_`, which IPython uses for
# the last cell output.
with multiprocessing.Pool(ncores) as ppool:
    train_traces = list(tqdm(
        ppool.imap(wrapped__create_learning_dicts_from_trace, train_items),
        total=len(train_items),
        desc="Converting traces to Keras learning data",
        unit="traces",
    ))
    test_traces = list(tqdm(
        ppool.imap(wrapped__create_learning_dicts_from_trace, test_items),
        total=len(test_items),
        desc="Converting traces to Keras validation data",
        unit="traces",
    ))

Converting traces to Keras learning data:   0%|          | 0/914 [00:00<?, ?traces/s]
Converting traces to Keras learning data:   0%|          | 1/914 [00:00<02:22,  6.43traces/s]
Converting traces to Keras learning data:   0%|          | 2/914 [00:00<04:03,  3.74traces/s]
Converting traces to Keras learning data:   0%|          | 4/914 [00:01<04:46,  3.17traces/s]
Converting traces to Keras learning data:   1%|          | 6/914 [00:01<03:44,  4.04traces/s]
Converting traces to Keras learning data:   8%|▊         | 72/914 [00:01<02:26,  5.74traces/s]
Converting traces to Keras learning data:   9%|▉         | 86/914 [00:02<01:43,  7.97traces/s]
Converting traces to Keras learning data:  12%|█▏        | 112/914 [00:04<01:29,  9.00traces/s]
Converting traces to Keras learning data:  18%|█▊        | 163/914 [00:04<01:01, 12.22traces/s]
Converting traces to Keras learning data:  24%|██▍       | 219/914 [00:04<00:40, 17.24traces/s]
Converting traces to Keras learning data:  28%|██▊       | 2

## Model creation

In [11]:
# `models` collects the per-feature tensors that are concatenated afterwards,
# `model_inputs` the matching Input layers that are handed to the final Model.
models = []
model_inputs = []

# forward all ordinal features unchanged, only reshaped to (1, 1) per sample
for ord_var in ordinal_feature_names:
    il = Input(batch_shape=(1,1), name=generate_input_name(ord_var))
    model = Reshape(target_shape=(1,1,))(il)
    model_inputs.append(il)
    models.append(model)

# create an embedding layer for every categorical feature
for cat_var in categorical_feature_names :
    no_of_unique_cat  = len(feature_dict[cat_var]['to_int'])
    # half the number of categories (rounded up), capped at 50 dimensions
    embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
    # NOTE(review): the extra slot presumably covers an index outside 'to_int' — confirm against the encoder
    vocab  = no_of_unique_cat+1

    il = Input(batch_shape=(1,1), name=generate_input_name(cat_var))
    model = Embedding(vocab, embedding_size)(il)
    model = Reshape(target_shape=(1,embedding_size,))(model)

    model_inputs.append(il)
    models.append(model)

# Input for the SP2 features. They are not run through an Embedding layer
# (original note: not possible due to dimensionality; PFS is meant to be treated the
# same way). The plan is to mimic the embedding architecture with a Dense/Linear
# layer instead; for now the vector is only reshaped.
# No PFS input is created in this cell.
learn_sp2 = True
sequence_embedding = None

if learn_sp2:
    il = Input(batch_shape=(1,n_sp2_features), name=generate_input_name("sp2"))
    model_inputs.append(il)
    # TODO mimic embedding architecture
    sequence_embedding = Reshape(target_shape=(n_sp2_features,))(il)
else:
    # TODO
    pass
    
# merge the outputs of the embeddings and everything that belongs to the most recent activity execution
main_output = concatenate(models, axis=2)
# Two stacked stateful LSTMs; the first returns the full sequence so the second can consume it.
# Unit counts are kept at a multiple of 32 (original note: trains faster due to np.float32).
# NOTE(review): batch_input_shape=(1,) is kept from the original; the Input layers already
# fix the batch size to 1, so it is presumably redundant here — confirm before removing.
main_output = LSTM(25*32, batch_input_shape=(1,), return_sequences=True, stateful=True)(main_output)
main_output = LSTM(25*32, stateful=True)(main_output)

# after the LSTM has learned on the sequence, bring in the SP2/PFS features, like in Shibata's paper
main_output = concatenate([main_output, sequence_embedding], axis=1)
main_output = Dense(20*32, activation='relu', name='dense_join')(main_output)
# The targets are one-hot vectors trained with categorical cross-entropy, so the output
# has to be a probability distribution over the classes: softmax instead of sigmoid.
main_output = Dense(len(feature_dict["concept:name"]["to_int"]), activation='softmax', name='dense_final')(main_output)

full_model = Model(inputs=model_inputs, outputs=[main_output])
full_model.compile(loss='categorical_crossentropy', optimizer='adam')

# list every layer with its input and output shape for a quick sanity check
for l in full_model.layers:
    print(l.name, "input_shape={}".format(l.input_shape), "output_shape={}".format(l.output_shape))

input_conceptname input_shape=(1, 1) output_shape=(1, 1)
input_Specialismcode input_shape=(1, 1) output_shape=(1, 1)
input_orggroup input_shape=(1, 1) output_shape=(1, 1)
input_Numberofexecutions input_shape=(1, 1) output_shape=(1, 1)
input_timetimestamp input_shape=(1, 1) output_shape=(1, 1)
embedding_4 input_shape=(1, 1) output_shape=(1, 1, 50)
embedding_5 input_shape=(1, 1) output_shape=(1, 1, 13)
embedding_6 input_shape=(1, 1) output_shape=(1, 1, 22)
reshape_7 input_shape=(1, 1) output_shape=(1, 1, 1)
reshape_8 input_shape=(1, 1) output_shape=(1, 1, 1)
reshape_9 input_shape=(1, 1, 50) output_shape=(1, 1, 50)
reshape_10 input_shape=(1, 1, 13) output_shape=(1, 1, 13)
reshape_11 input_shape=(1, 1, 22) output_shape=(1, 1, 22)
concatenate_3 input_shape=[(1, 1, 1), (1, 1, 1), (1, 1, 50), (1, 1, 13), (1, 1, 22)] output_shape=(1, 1, 87)
lstm_3 input_shape=(1, 1, 87) output_shape=(1, 1, 800)
input_sp2 input_shape=(1, 624) output_shape=(1, 624)
lstm_4 input_shape=(1, 1, 800) output_shape=(1,

In [29]:
# Show the class name of the optimizer the model was compiled with.
optimizer_cls = type(full_model.optimizer)
print(optimizer_cls.__name__)

Adam


In [None]:
# # define the checkpoint
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# for t in train_traces:
#     full_model.fit(t['x'], t['y'], epochs=10, batch_size=50, callbacks=callbacks_list)
#     model.reset_states()

# Train step by step on a small subset of the traces (stateful LSTM, batch size 1).
training_subset = train_traces[0:5]
for epoch in range(1):
    mean_tr_acc = []
    mean_tr_loss = []
    for t_idx, t in enumerate(training_subset):
        # report progress relative to the traces that are actually trained on
        print("Training {0}/{1}".format(t_idx + 1, len(training_subset)))
        for x,y in zip(t['x'],t['y']):
            # train_on_batch returns a scalar loss, or a list of scalars when the model
            # was compiled with metrics; pair the values with their display names
            batch_result = dict(zip(full_model.metrics_names, np.atleast_1d(full_model.train_on_batch(x, y))))
            mean_tr_loss.append(batch_result['loss'])
            for acc_name in ('acc', 'accuracy'):
                if acc_name in batch_result:
                    mean_tr_acc.append(batch_result[acc_name])
                    break
        # the LSTM state must not leak from one trace into the next
        full_model.reset_states()

    # only report accuracy when it was collected; np.mean([]) would print nan with a warning
    if mean_tr_acc:
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
    print('loss training = {}'.format(np.mean(mean_tr_loss)))
    print('___________________________________')

In [9]:
# Replace `full_model` with a model restored from disk.
# NOTE(review): no visible cell of this notebook writes "my_first_model.h5", and this
# assignment overwrites the model built and trained above — confirm this is intended
# and where the file comes from, otherwise the cell fails on a fresh checkout.
full_model = keras.models.load_model("my_first_model.h5")

In [None]:
# Predict every step of the first test trace and keep the results for the evaluation
# cells below, which read `prediction` and `dataY` but which no cell defined so far.
prediction = []  # raw model outputs, one entry per step
dataY = []       # expected class index per step (position of the 1 in the one-hot target)
for t_idx, t in enumerate(test_traces[0:1]):
    for x,y in zip(t['x'],t['y']):
        pred_y = full_model.predict(x)
        print("Predicted: {0} | Actual: {1}".format(pred_y,y))
        prediction.append(pred_y)
        dataY.append(np.argmax(y))
    # the LSTM state must not leak from one trace into the next
    full_model.reset_states()

In [None]:
# Reduce every prediction to the index of its largest entry.
prediction_index = list(map(np.argmax, prediction))

In [None]:
# Share of positions at which the predicted index equals the expected one.
# Iterate over positions; the previous version used the predicted class values
# themselves as positions and therefore compared the wrong elements.
sum(prediction_index[i] == dataY[i] for i in range(len(prediction_index))) / len(prediction_index)