In [1]:
from __future__ import print_function

from hyperopt import Trials, STATUS_OK, tpe
from keras.datasets import mnist
from keras.layers import Dense, Embedding, Input, Reshape, concatenate, Flatten, Activation, LSTM
from keras.models import Model
from keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

import pickle
import random
import numpy as np
import pandas as pd
import re
import multiprocessing

from tqdm import *

Using TensorFlow backend.


In [2]:
##### CONFIGURATION SETUP ####

# Location of the event log. The encoded traces and the feature dictionaries
# are read from sibling files whose names are derived from the log file name.
data_path = "../logs/bpic2011.xes"
traces_finalpath = data_path.replace(".xes", "_traces_encoded.pickled")
traces_dictionarypath = data_path.replace(".xes", "_dictionaries.pickled")
# Number of columns occupied by the SP2 and the PFS feature groups; used
# further down to derive where each feature group starts.
n_sp2_features = 624
n_pfs_features = 25

# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load artifacts that were produced by a trusted preprocessing run.
# The context managers close the file handles deterministically instead of
# leaving them open until garbage collection.
with open(traces_finalpath, "rb") as traces_file:
    traces = pickle.load(traces_file)
with open(traces_dictionarypath, "rb") as dictionary_file:
    feature_dict = pickle.load(dictionary_file)

### CONFIGURATION SETUP END ###

In [3]:
def generate_input_name(var_name):
    """Derive the input-layer name that belongs to a feature name.

    All non-alphanumeric characters are dropped from `var_name` and the
    remainder is prefixed with "input_".
    """
    kept_chars = [ch for ch in var_name if ch.isalnum()]
    return "input_{0}".format(''.join(kept_chars))

In [4]:
# shuffle complete traces and create the training and test set
# A dedicated, seeded generator keeps the split reproducible after a kernel
# restart and leaves the global `random` state untouched. The shuffle happens
# in place, so re-running this cell without reloading `traces` permutes the
# list again.
split_seed = 42
random.Random(split_seed).shuffle(traces)
sep_idx = int(0.8*len(traces))

# extract the feature indices
# data is organized like this: ordinal features | categorical features | SP2 features | PFS features | TARGET features
# needed as each of these features will get its own layer
feature_names  = traces[0].columns
trace_columns = [bool(re.match('^TARGET$', name)) for name in feature_names]
if True not in trace_columns:
    # Same exception type list.index would raise, but with a usable message.
    raise ValueError("no TARGET column found in the trace columns")
target_col_start_index = trace_columns.index(True)

# The feature groups are located by walking backwards from the TARGET column.
categorical_feature_names = feature_dict.keys()
pfs_col_start_index = target_col_start_index - n_pfs_features
sp2_col_start_index = pfs_col_start_index - n_sp2_features
cat_col_start_index = sp2_col_start_index - len(categorical_feature_names)
# A negative start index would silently slice from the end of the columns.
assert cat_col_start_index >= 0, "feature group sizes exceed the number of columns before TARGET"

ordinal_feature_names = feature_names[0:cat_col_start_index]

In [None]:
# TODO: normalize
def wrapped__create_learning_dicts_from_trace(p):
    """Unpack one bundled argument sequence for create_learning_dicts_from_trace.

    The pool's imap passes every work item as a single object, so the
    positional arguments arrive packed in `p` and are spread out here.
    """
    learning_dicts = create_learning_dicts_from_trace(*p)
    return learning_dicts
# reshape X to be [samples, time steps, features]
# How to understand keras feature shape requirements: https://github.com/keras-team/keras/issues/2045
def create_learning_dicts_from_trace(t, sp2_col_start_index, n_sp2_features, pfs_col_start_index, n_pfs_features, target_col_start_index, feature_names, num_classes=None):
    """Convert one trace into per-step model inputs and one-hot targets.

    Parameters:
        t: the trace; accessed through `.iloc` and `len`, one row per step.
        sp2_col_start_index: position of the first SP2 column.
        n_sp2_features: number of SP2 columns.
        pfs_col_start_index: position of the first PFS column.
        n_pfs_features: number of PFS columns.
        target_col_start_index: position of the first TARGET column.
        feature_names: column names of `t`; everything before
            `sp2_col_start_index` gets an input of its own.
        num_classes: width of the one-hot targets, forwarded to
            `to_categorical`. With the default `None` Keras derives it from
            the largest target value found in *this* trace, so the width can
            differ between traces. NOTE(review): pass the size of the target
            vocabulary here if the targets must match a fixed-size output
            layer -- confirm the intended value with the encoding step.

    Result:
        A dict with
        'x': one dict per step, mapping input-layer names (see
             generate_input_name) to float32 arrays. Single features are
             reshaped to [-1, 1]; "sp2" and "pfs" hold their whole feature
             group, reshaped to [-1, n_sp2_features] / [-1, n_pfs_features].
        'y': the one-hot encoded TARGET columns of all steps.
    """
    # generate one input sequence for every type of variable
    # map every single-step batch in a dictionary that will correspond to input layer names!

    # The input names do not depend on the row, so they are computed once.
    single_feature_inputs = [
        (col_idx, generate_input_name(col))
        for col_idx, col in enumerate(feature_names[:sp2_col_start_index])
    ]
    sp2_input_name = generate_input_name("sp2")
    pfs_input_name = generate_input_name("pfs")

    step_batches = []
    for i in range(len(t)):
        batch_dict = {}

        # automatically run through all ordinal and categorical features
        for col_idx, input_name in single_feature_inputs:
            batch_dict[input_name] = np.array(t.iloc[i, col_idx], dtype=np.float32).reshape([-1,1])

        # create batches for sp2 and pfs2 separately because of their variable encodings
        batch_dict[sp2_input_name] = np.asarray(t.iloc[i, sp2_col_start_index:pfs_col_start_index], dtype=np.float32).reshape([-1,n_sp2_features])
        batch_dict[pfs_input_name] = np.asarray(t.iloc[i, pfs_col_start_index:target_col_start_index], dtype=np.float32).reshape([-1,n_pfs_features])

        step_batches.append(batch_dict)

    raw_targets = t.iloc[:, target_col_start_index:].values.reshape([-1,1,1])
    return {'x': step_batches, 'y': np_utils.to_categorical(raw_targets, num_classes=num_classes)}

ncores = multiprocessing.cpu_count()
traces_for_input_dicts = [ (t, sp2_col_start_index, n_sp2_features, pfs_col_start_index, n_pfs_features, target_col_start_index, feature_names) for t in traces ]


def _convert_traces(pool, trace_args, desc):
    """Convert the given argument tuples on `pool`, showing one progress bar.

    imap yields the results in input order, so the returned list lines up
    with `trace_args`.
    """
    converted = pool.imap(wrapped__create_learning_dicts_from_trace, trace_args)
    return list(tqdm(converted, total=len(trace_args), desc=desc, unit="traces"))


ppool = multiprocessing.Pool(ncores)
try:
    train_traces = _convert_traces(ppool, traces_for_input_dicts[:sep_idx], "Converting traces to Keras learning data")
    test_traces = _convert_traces(ppool, traces_for_input_dicts[sep_idx:], "Converting traces to Keras validation data")
finally:
    # All results have been collected at this point (or the conversion failed),
    # so the workers can be stopped right away instead of lingering for the
    # lifetime of the kernel.
    ppool.terminate()
    ppool.join()

## Model creation

In [10]:
def create_model(train_traces, test_traces):
    '''
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.

    The network gets one input per ordinal feature, one embedded input per
    categorical feature and one input for the SP2 features. The per-step
    features run through two stateful LSTM layers; the SP2 features are joined
    in afterwards. Training feeds every trace step by step (batch size 1) and
    resets the LSTM state after each trace.

    The reported loss is the negated share of evaluated steps whose most
    probable predicted class equals the target class.

    The function is kept in one piece (no helper functions) because the
    placeholder syntax means it is processed as source text.
    '''
    from keras.models import Model
    from keras.layers import Dense, Embedding, Input, Reshape, concatenate, LSTM

    model_inputs = []
    models = []

    # forward all ordinal features
    for ord_var in feature_names[:cat_col_start_index]:
        il = Input(batch_shape=(1,1), name=generate_input_name(ord_var))
        model_inputs.append(il)
        models.append(Reshape(target_shape=(1,1,))(il))

    # create embedding layers for every categorical feature
    for cat_var in categorical_feature_names:
        no_of_unique_cat  = len(feature_dict[cat_var]['to_int'])
        embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
        vocab  = no_of_unique_cat+1

        il = Input(batch_shape=(1,1), name=generate_input_name(cat_var))
        embedded = Embedding(vocab, embedding_size)(il)
        embedded = Reshape(target_shape=(1,embedding_size,))(embedded)

        model_inputs.append(il)
        models.append(embedded)

    # create input and embedding for sp2/pfs2 features
    learn_sp2 = True
    sequence_embedding = None

    # Can't embed SP2 due to dimensionality with embedding layer, be stringent and do the same for PFS features
    # instead, mimic the embedding internal architecture and use a Dense/Linear layer
    # NOTE(review): only an "sp2" input is created here; no input is declared for the PFS features.
    if learn_sp2:
        il = Input(batch_shape=(1,n_sp2_features), name=generate_input_name("sp2"))
        model_inputs.append(il)
        # TODO mimic embedding architecture
        sequence_embedding = Reshape(target_shape=(n_sp2_features,))(il)
    else:
        # TODO
        pass

    # merge the outputs of the embeddings, and everything that belongs to the most recent activity executions
    main_output = concatenate(models, axis=2)
    main_output = LSTM(25*32, batch_input_shape=(1,), return_sequences=True, stateful=True)(main_output) # should be multiple of 32 since it trains faster due to np.float32
    main_output = LSTM(25*32, stateful=True)(main_output) # should be multiple of 32 since it trains faster due to np.float32

    # after LSTM has learned on the sequence, bring in the SP2/PFS features, like in Shibatas paper
    main_output = concatenate([main_output, sequence_embedding], axis=1)
    main_output = Dense(20*32, activation='relu', name='dense_join')(main_output)
    # NOTE(review): 'sigmoid' is paired with categorical_crossentropy below; 'softmax' is the
    # usual choice for single-label targets -- left unchanged, confirm the intent.
    main_output = Dense(len(feature_dict["concept:name"]["to_int"]), activation='sigmoid', name='dense_final')(main_output)

    full_model = Model(inputs=model_inputs, outputs=[main_output])
    full_model.compile(loss='categorical_crossentropy', optimizer={{choice(['adadelta', 'adam', 'sgd'])}}, metrics=['categorical_accuracy', 'mae'])

    n_epochs = 40
    for epoch in range(n_epochs):
        epoch_losses = []
        epoch_accs = []
        epoch_maes = []

        for t in tqdm(train_traces, desc="Epoch {0}/{1}".format(epoch,n_epochs)):
            for x,y in zip(t['x'],t['y']):
                # With metrics compiled in, train_on_batch returns the loss first,
                # followed by the metrics in the order given to compile().
                tr_loss, tr_acc, tr_mae = full_model.train_on_batch(x, y)
                epoch_losses.append(tr_loss)
                epoch_accs.append(tr_acc)
                epoch_maes.append(tr_mae)
            # stateful LSTMs keep their state across batches: start every trace fresh
            full_model.reset_states()

        print('Epoch {0} -- loss = {1} -- categorical_acc = {2} -- mae = {3}'.format(
            epoch, np.mean(epoch_losses), np.mean(epoch_accs), np.mean(epoch_maes)))

        if epoch % 5 == 0:
            # NOTE(review): the first placeholder receives the optimizer object itself, so its
            # repr ends up in the file name -- left unchanged, confirm the intended naming.
            full_model.save("complete_model_{0}_{1}.h5".format(full_model.optimizer, type(full_model.optimizer).__name__))

    npreds = 0
    correct_preds = 0
    # NOTE(review): only the first test trace is evaluated -- confirm whether this is intended.
    for t in test_traces[0:1]:
        for x,y in zip(t['x'],t['y']):
            npreds += 1
            pred_y = full_model.predict(x)
            # A prediction counts as correct when its most probable class is the target class;
            # comparing the probability vector to the one-hot vector element-wise would
            # yield an array instead of a count.
            correct_preds += int(np.argmax(pred_y) == np.argmax(y))
        full_model.reset_states()

    # Float division on purpose; without any evaluated step the accuracy is reported as 0.
    accuracy = correct_preds / float(npreds) if npreds else 0.0
    return {'loss': -1 * accuracy, 'status': STATUS_OK, 'model': full_model}

In [9]:
def data():
    """Data providing function passed to optim.minimize next to create_model.

    Hands out the module-level `train_traces` and `test_traces`, in the order
    in which create_model declares its parameters.
    """
    return train_traces, test_traces

In [11]:
# hyperas has to read the source of the calling code. When called from Jupyter
# without `notebook_name` it looks for a file named after the cell
# ("<ipython-input-...>") and fails with FileNotFoundError.
# TODO: set this to the file name of this notebook without the ".ipynb"
# extension; None keeps hyperas' default lookup.
notebook_name = None

best_run, best_model = optim.minimize(model=create_model,
                                      data=data,
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials(),
                                      notebook_name=notebook_name)

FileNotFoundError: [Errno 2] No such file or directory: '/home/felix.wolff2/master-thesis-code/notebooks/<ipython-input-11-8ce78beebb39>'