In [140]:
import keras
import pickle
import random
import numpy as np
import pandas as pd
import re

from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input, Reshape, Concatenate, concatenate, Flatten, Activation

In [85]:
##### CONFIGURATION SETUP ####

# Path of the XES event log; the two pickle paths below are derived from it
# by swapping the ".xes" suffix.
data_path = "../logs/bpic2011.xes"
traces_finalpath = data_path.replace(".xes", "_traces_encoded.pickled")
traces_dictionarypath = data_path.replace(".xes", "_dictionaries.pickled")
# Widths of the SP2 and PFS column groups; used further down to compute the
# column offsets of each feature group.
n_sp2_features = 624
n_pfs_features = 25

# Load the pre-encoded traces and the feature dictionaries.
# Context managers make sure the file handles are closed again.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(traces_finalpath, "rb") as traces_file:
    traces = pickle.load(traces_file)
with open(traces_dictionarypath, "rb") as dictionary_file:
    feature_dict = pickle.load(dictionary_file)

### CONFIGURATION SETUP END ###

In [89]:
# shuffle complete traces and create test and training set
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the state of the global `random` module.
# NOTE: the shuffle happens in place, so re-running this cell without
# reloading `traces` shuffles an already shuffled list.
random.Random(42).shuffle(traces)
# Index separating the first 80% of the shuffled traces from the rest.
sep_idx = int(0.8*len(traces))

# train_traces = traces[:sep_idx]
# test_traces  = traces[sep_idx:]
# https://github.com/keras-team/keras/issues/3107
# Yes, your input should be in this format (sequences, timesteps, dimensions).
# So based on your example, your input should be in (None, 8, 2). Your input now is (8,2).

In [254]:
# extract the feature indices
# data is organized like this: normal features | SP2 features | PFS features | TARGET features
# Flag, for every column of the first trace, whether its name is exactly 'TARGET'.
trace_columns = [bool(re.match('^TARGET$', col)) for col in traces[0].columns.tolist()]
# Position of the first TARGET column (raises ValueError if there is none);
# the PFS and SP2 groups are located by walking backwards from it.
target_col_start_index = trace_columns.index(True)
pfs_col_start_index = target_col_start_index - n_pfs_features
sp2_col_start_index  = pfs_col_start_index - n_sp2_features

# reshape X to be [samples, time steps, features]
# X = np.reshape(dataX, (n_patterns, seq_length, 1))
# TODO: normalize
# X = X / float(n_vocab)

# TODO: cluster categorical features and numerical features...
# How to understand keras feature shape requirements: https://github.com/keras-team/keras/issues/2045
# Columns 1 .. sp2_col_start_index-2 are used as model inputs; the slice here
# and the range() in the loop below must stay in sync.
categorical_vars= traces[0].columns[1:sp2_col_start_index-1]

# Without an explicit num_classes, to_categorical derives the one-hot width
# from the largest label of each individual trace, so the width of 'y' would
# differ between traces. Pin it to the size of the "concept:name" dictionary.
# NOTE(review): assumes the TARGET labels are encoded with the "concept:name"
# dictionary and lie in [0, n_target_classes) -- confirm against the encoder.
n_target_classes = len(feature_dict["concept:name"]["to_int"])

train_traces = []
for trace in traces[:sep_idx]:
    train_traces.append({
        # one array of shape (n_events, 1, 1) per input column
        'x': [trace.iloc[:, i].values.reshape([-1,1,1]) for i in range(1,sp2_col_start_index-1)],
        # one-hot encoded target column(s), fixed width
        'y': keras.utils.np_utils.to_categorical(
            trace.iloc[:, target_col_start_index:].values,
            num_classes=n_target_classes,
        ),
    })

# test_x = []
# test_y = []
# for t in test_traces:
#     test_x.append(t.iloc[:, :sp2_col_start_index].values.tolist())
#     test_y.append(keras.utils.np_utils.to_categorical(t.iloc[:, target_col_start_index:].values))

In [247]:
# Sanity check: shape of the first input-feature array of training trace 3.
np.shape(train_traces[3]['x'][0])

(142, 1, 1)

In [242]:
# Sanity check: first input-feature array of training trace 0, viewed as (n, 1, 1).
np.reshape(train_traces[0]['x'][0], [-1, 1, 1])

array([[[2]],

       [[2]]])

## Model creation

In [256]:
models = []
model_inputs = []

# create one input + embedding branch for every categorical input feature
for cat_var in categorical_vars:
    no_of_unique_cat = len(feature_dict[cat_var]['to_int'])
    # embedding width: half the number of categories (rounded up), capped at 50
    embedding_size = int(min(np.ceil(no_of_unique_cat / 2), 50))
    # embedding table gets one more row than there are known categories
    vocab = no_of_unique_cat + 1

    # layer names keep only the alphanumeric characters of the feature name
    il = Input(shape=(None, 1), name="Input_{0}".format(''.join(c for c in cat_var if c.isalnum())))
    model_inputs.append(il)

    # The branch output is a layer output tensor, not a Keras model, so it is
    # deliberately not stored under the name `model`.
    embedded = Embedding(vocab, embedding_size)(il)
    embedded = Reshape(target_shape=(embedding_size,))(embedded)
    models.append(embedded)

# merge the outputs of the embeddings in a dense layer
main_output = concatenate(models)
# softmax (not sigmoid): the targets are one-hot vectors and the loss is
# categorical cross-entropy, which expects a probability distribution over
# the classes.
main_output = Dense(len(feature_dict["concept:name"]["to_int"]), activation='softmax', name='dense_final')(main_output)

full_model = Model(inputs=model_inputs, outputs=[main_output])
full_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# define the checkpoint (currently disabled)
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# Fit the model on one trace at a time: two epochs per trace, batch size 1.
for trace in train_traces:
    features, targets = trace['x'], trace['y']
    full_model.fit(features, targets, epochs=2, batch_size=1)

In [None]:
# Restore previously saved weights into the compiled model.
# `full_model` is the Keras Model built in the model-creation cell; the name
# `model` in that cell is bound to a layer output tensor, which has no
# load_weights method.
model_filename = "weights-improvement-02-3.3127.hdf5"
full_model.load_weights(model_filename)

In [None]:
# Predict with the compiled Keras model (`full_model`); `model` from the
# model-creation cell is a layer output tensor and cannot predict.
# NOTE(review): `X` is not defined in the visible part of this notebook --
# it has to be built from the held-out traces before this cell can run.
prediction = full_model.predict(X)

In [None]:
# Index of the highest-scoring entry for every element of `prediction`.
prediction_index = list(map(np.argmax, prediction))

In [None]:
# Fraction of predictions that equal the corresponding label.
# Iterate over positions, not over the predicted class ids, so that every
# prediction is compared with its own label exactly once.
# NOTE(review): `dataY` is not defined in the visible part of this notebook --
# confirm where the ground-truth labels come from.
sum(prediction_index[i] == dataY[i] for i in range(len(prediction_index))) / len(prediction_index)

In [None]:
# Experiment: run PrefixSpan on the integer-encoded event-name sequences
from prefixspan import PrefixSpan

# NOTE(review): `bpic2011_log` and `event_to_int` are not defined in the
# visible part of this notebook -- confirm they are created before this cell.
# One list of "concept:name" values per trace of the log.
event_sequences = [
    [event.get_attributes()["concept:name"].get_value() for event in log_trace]
    for log_trace in bpic2011_log
]
# Same sequences with every event name replaced by its integer code.
translated_sequences = [
    [event_to_int[name] for name in names]
    for names in event_sequences
]

ps = PrefixSpan(translated_sequences)

frequent_patterns = ps.frequent(3)
print(frequent_patterns)