In [1]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing, threading
import math
import scipy.stats as ss
import time
import pickle
import re

from opyenxes.model.XLog import XLog
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier

from prefixspan import PrefixSpan

from tqdm import *

In [2]:
### configuration
data_path = "../logs/bpic2011.xes"
traces_picklepath = data_path.replace(".xes", "_raw_traces.pickled")
traces_tmppath = data_path.replace(".xes", "_traces_tmp.pickled")
traces_finalpath = data_path.replace(".xes", "_traces_encoded.pickled")
traces_dictionarypath = data_path.replace(".xes", "_dictionaries.pickled")
target_column = "concept:name"
categorical_feature_names = ["concept:name", "Specialism code", "org:group"]
### configuration end

In [3]:
with open(data_path) as bpic_file:
    eventlog = XUniversalParser().parse(bpic_file)[0]

Unknown extension: http://www.xes-standard.org/meta_time.xesext
Unknown extension: http://www.xes-standard.org/meta_life.xesext
Unknown extension: http://www.xes-standard.org/meta_org.xesext
Unknown extension: http://www.xes-standard.org/meta_concept.xesext
Unknown extension: http://www.xes-standard.org/meta_3TU.xesext
Unknown extension: http://www.xes-standard.org/meta_general.xesext


In [4]:
ncores = multiprocessing.cpu_count()
ntraces = len(eventlog)

## Extract data trace-wise from XES format and enrich with BOS/EOS markers

In [5]:
# collect all attributes
column_names = []

for event in eventlog[0]:
    for attribute in event.get_attributes():
        column_names.append(attribute)
        
column_names = set(column_names) # remove duplicates
column_names = list(column_names)

def create_dataframe_from_trace(t):
    df = pd.DataFrame(columns=column_names, index=range(0,len(t)))
    for event_idx, event in enumerate(t):
        event_attributes = event.get_attributes()
        df.iloc[event_idx]["__case_id"] = 0
        
        for attribute in event_attributes:
            df[attribute].values[event_idx] = event_attributes[attribute].get_value()
    
    return df

ppool = multiprocessing.Pool(ncores)
traces = []
with tqdm(total=len(eventlog), desc="Converting XES traces to Pandas dataframes", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(create_dataframe_from_trace, eventlog))):
        pbar.update()
        traces.append(_)
        
del eventlog

Converting XES traces to Pandas dataframes:   0%|          | 0/1143 [00:00<?, ?traces/s]
Converting XES traces to Pandas dataframes:   0%|          | 1/1143 [00:00<02:15,  8.42traces/s]
Converting XES traces to Pandas dataframes:   0%|          | 2/1143 [00:00<02:48,  6.75traces/s]
Converting XES traces to Pandas dataframes:   1%|          | 8/1143 [00:00<02:07,  8.91traces/s]
Converting XES traces to Pandas dataframes:   1%|          | 13/1143 [00:01<02:19,  8.11traces/s]
Converting XES traces to Pandas dataframes:   1%|▏         | 15/1143 [00:01<01:59,  9.44traces/s]
Converting XES traces to Pandas dataframes:   7%|▋         | 75/1143 [00:02<01:22, 12.91traces/s]
Converting XES traces to Pandas dataframes:  12%|█▏        | 132/1143 [00:02<00:56, 17.74traces/s]
Converting XES traces to Pandas dataframes:  12%|█▏        | 142/1143 [00:02<00:42, 23.35traces/s]
Converting XES traces to Pandas dataframes:  26%|██▌       | 297/1143 [00:03<00:20, 41.14traces/s]
297it [00:03, 29.81it/s][A
C

In [6]:
pickle.dump(traces, open(traces_picklepath, "wb"))

In [5]:
traces = pickle.load(open(traces_picklepath, "rb"))

## Eliminate correlated or unimportant features

In [None]:
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
        https://stackoverflow.com/questions/46498455/categorical-features-correlation"""
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

for col_a,col_b in itertools.product(eventlog_df.columns, repeat=2): 
    candidate = pd.crosstab(eventlog_df[col_a], eventlog_df[col_b]).as_matrix()
    print("{: >30} {: >30} {: >20}".format(col_a, col_b, cramers_v(candidate)))

In [6]:
# lifecyle:transition is always "complete"
# Producer code correlates perfectly with org:group
# Activity code correlates perfectly with concept:name
for t in traces:
    t.drop(columns=["lifecycle:transition", "Producer code", "Activity code", "Section"], inplace=True)

## Create standard feature set

In [7]:
# Set data types for columns that were not autodetected
for i in range(0,len(traces)):
    traces[i]["Specialism code"] = pd.to_numeric(traces[i]["Specialism code"], errors="ignore")
    traces[i]["Number of executions"] = pd.to_numeric(traces[i]["Number of executions"], errors="ignore")

eventlog_df = pd.concat(traces, ignore_index=True)

## Create SP2 feature set

In [8]:
# https://stackoverflow.com/questions/42636765/how-to-set-all-the-values-of-an-existing-pandas-dataframe-to-zero
# This one-hot encodes all entries in concept:name column for later incrementation once it has been seen
# sp2_features = pd.get_dummies(eventlog_df["concept:name"], prefix="SP2") # can't use windowed representation here as it might skew distribution of values
# eventlog_sp2_df = process_results.copy(deep=True)
# sp2_features    = sp2_features.drop(sp2_features.index[sp2_features.index[len(eventlog_sp2_df):]])
# assert(len(sp2_features) == len(eventlog_sp2_df))

In [9]:
# loop through every trace and encode the presence of an activity
sp2_prefix = "SP2_"
activity_labels = [ "{0}{1}".format(sp2_prefix,a) for a in eventlog_df["concept:name"].unique() ]

def enrich_trace_with_sp2(t):
    sp2_df = pd.DataFrame(columns=activity_labels, index=range(0,len(t)), dtype=np.bool)
    for col in sp2_df.columns: sp2_df[col].values[:] = 0
    sp2_df["{0}{1}".format(sp2_prefix, t["concept:name"][0])].values[0]  = 1
    
    for i in range(1,len(t)):
        first_activity_name = t["concept:name"].iloc[i]
        col = "{0}{1}".format(sp2_prefix,first_activity_name)
        
        sp2_df.values[i] = sp2_df.values[i-1]
        sp2_df[col].values[i] = 1
        
    return sp2_df

ppool = multiprocessing.Pool(ncores)
sp2_traces = []
with tqdm(total=len(traces), desc="Enriching traces with SP2 features", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(enrich_trace_with_sp2, traces))):
        pbar.update()
        sp2_traces.append(_)

Enriching traces with SP2 features:   0%|          | 0/1143 [00:00<?, ?traces/s]
Enriching traces with SP2 features:   0%|          | 1/1143 [00:00<02:00,  9.47traces/s]
Enriching traces with SP2 features:   3%|▎         | 31/1143 [00:00<01:13, 15.18traces/s]
Enriching traces with SP2 features:   8%|▊         | 91/1143 [00:00<00:49, 21.45traces/s]
Enriching traces with SP2 features:  10%|█         | 120/1143 [00:00<00:34, 29.69traces/s]
Enriching traces with SP2 features:  14%|█▎        | 155/1143 [00:00<00:24, 40.84traces/s]
Enriching traces with SP2 features:  16%|█▌        | 183/1143 [00:00<00:17, 54.27traces/s]
Enriching traces with SP2 features:  20%|██        | 229/1143 [00:00<00:12, 73.33traces/s]
Enriching traces with SP2 features:  23%|██▎       | 259/1143 [00:01<00:09, 94.64traces/s]
Enriching traces with SP2 features:  26%|██▌       | 297/1143 [00:01<00:07, 119.92traces/s]
Enriching traces with SP2 features:  29%|██▉       | 333/1143 [00:01<00:05, 141.91traces/s]
Enriching t

## Enrich with PrefixSpan features

In [10]:
def print_patterns(pt):
    for p in pt:
        print("Support: {0}%".format(100*p[0]/len(traces)))
        for n in p[1]:
            print("    > ", int_to_event[n])
        print()

# since most patterns begin and end with the <eos> and <bos> markers, the features only become valuable towards the end...
events       = eventlog_df["concept:name"].unique()
event_to_int = dict((c, i) for i,c in enumerate(events) if c not in ["<bos>","<eos>"])
int_to_event = dict((i, c) for i,c in enumerate(events) if c not in ["<bos>","<eos>"])

In [11]:
# Prefixspan requires an array of arrays with one subarray for every trace
encoded_traces = [ t["concept:name"].map(event_to_int).tolist() for t in traces ]
prefixspan_traces = PrefixSpan(encoded_traces)
closed_sequences = prefixspan_traces.topk(25, closed=True) # support is how often the subsequence appears in total
# http://sequenceanalysis.github.io/slides/analyzing_sequential_user_behavior_part2.pdf, slide 5
# print_patterns(ps_topkc)

# only take subsequence which are at a certain level of support? like if ss[0]/len(traces) < .90
#ps_topkc = list(filter(lambda x: x[0]/len(traces) > .90, ps_topkc))
closed_sequences = [ p[1] for p in closed_sequences ]
pftrace_args = [ (t, closed_sequences[:], event_to_int) for t in traces ] # enrich traces with copy of mined subsequences

In [12]:
def wrapped__enrich_trace_with_subseq(args):
    return enrich_trace_with_subseq(*args)

def enrich_trace_with_subseq(t, ps, event_to_int):
    col_prefix = "PFS_"
    subseq_labels = [ "{0}{1}".format(col_prefix,ss_idx) for ss_idx, ss in enumerate(ps) ]
    subseq_df = pd.DataFrame(columns=subseq_labels, index=range(0,len(t)), dtype=np.bool)
    
    for col in subseq_df.columns: subseq_df[col].values[:] = 0
    for i in range(0,len(t)): # loop through sequence, prune items from mined sequences, and once a subsequence array is empty, this subsequence has occured :)
        activity_code = event_to_int.get(t["concept:name"].iloc[i], None)
        
        for subseq_idx in range(0,len(ps)):
            if ps[subseq_idx] == []:
                continue
            if ps[subseq_idx][0] == activity_code:
                ps[subseq_idx].pop(0)
                if ps[subseq_idx] == []:
                    subseq_df.values[i:,subseq_idx] = 1
        
    return subseq_df

ppool = multiprocessing.Pool(ncores)
pf_traces = []

with tqdm(total=len(pftrace_args), desc="Enriching traces with mined subsequence features", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(wrapped__enrich_trace_with_subseq, pftrace_args))):
        pbar.update()
        pf_traces.append(_)

Enriching traces with mined subsequence features:   0%|          | 0/1143 [00:00<?, ?traces/s]
Enriching traces with mined subsequence features:   0%|          | 3/1143 [00:00<00:46, 24.30traces/s]
Enriching traces with mined subsequence features:   6%|▋         | 73/1143 [00:00<00:31, 34.06traces/s]
Enriching traces with mined subsequence features:  12%|█▏        | 132/1143 [00:00<00:21, 47.15traces/s]
Enriching traces with mined subsequence features:  19%|█▉        | 219/1143 [00:00<00:14, 65.69traces/s]
Enriching traces with mined subsequence features:  25%|██▍       | 284/1143 [00:00<00:09, 89.51traces/s]
Enriching traces with mined subsequence features:  29%|██▉       | 333/1143 [00:00<00:06, 118.42traces/s]
Enriching traces with mined subsequence features:  35%|███▍      | 400/1143 [00:00<00:04, 156.99traces/s]
Enriching traces with mined subsequence features:  41%|████      | 469/1143 [00:00<00:03, 201.31traces/s]
Enriching traces with mined subsequence features:  46%|████▌     

## Dictionary encoding for categorical features

In [13]:
# Convert timestamp to total running time
bos_idx = 0
for i in range(0, len(traces)):
    tlen = len(traces[i])-1
    dfs = traces[i]["time:timestamp"] - traces[i]["time:timestamp"][bos_idx]
    traces[i]["time:timestamp"] = dfs.map(lambda d: int(d.total_seconds()/(60*60))) # convert to hours
    
# Create dictionaries here
feature_dict = {}
for cf in categorical_feature_names:
    cf_dict = { 'to_int': {}, 'to_cat': {} }
    events = eventlog_df[cf].unique().tolist()
    if cf == target_column: events.append("<EOS>")
    cf_dict['to_int'] = dict((c, i) for i, c in enumerate(events))
    cf_dict['to_cat'] = dict((i, c) for i, c in enumerate(events))
    feature_dict[cf] = cf_dict

## Dataframe creation and saving

In [19]:
ordinal_feature_names = traces[0].columns.difference(categorical_feature_names)

# Concatenate all features into one feature dataframe per trace
for i in range(0,len(traces)):
    targets = traces[i][target_column].shift(-1).to_frame("TARGET")
    targets.values[len(traces[i])-1] = "<EOS>"
    targets["TARGET"] = targets["TARGET"].map(feature_dict[target_column]['to_int'])
    
    # normalize values while encoding
    for cf in categorical_feature_names:
        traces[i][cf] = traces[i][cf].map(feature_dict[cf]['to_int'])
        traces[i][c] /= max(feature_dict[c]['to_int'].values())
    
    ordinal_trace = traces[i][ordinal_feature_names]
    categorical_trace = traces[i][categorical_feature_names]
    traces[i] = pd.concat([ordinal_trace, categorical_trace, sp2_traces[i], pf_traces[i], targets], ignore_index=False, axis=1)

ValueError: could not convert string to float: '<EOS>'

In [18]:
traces[0]

Unnamed: 0,Number of executions,time:timestamp,concept:name,Specialism code,org:group,SP2_1e consult poliklinisch,SP2_administratief tarief - eerste pol,SP2_verlosk.-gynaec. korte kaart kosten-out,SP2_echografie - genitalia interna,SP2_simulator - gebruik voor aanvang megavol,...,PFS_16,PFS_17,PFS_18,PFS_19,PFS_20,PFS_21,PFS_22,PFS_23,PFS_24,TARGET
0,1,0,0,0,0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,1,0,1,0,0,True,True,False,False,False,...,True,False,False,False,False,False,False,False,False,2
2,1,48,2,1,1,True,True,True,False,False,...,True,False,False,False,False,False,False,False,False,3
3,1,48,3,1,2,True,True,True,True,False,...,True,False,False,False,False,False,False,False,False,0
4,1,48,0,1,1,True,True,True,True,False,...,True,False,False,False,False,False,False,False,False,1
5,1,48,1,1,1,True,True,True,True,False,...,True,False,False,False,False,False,False,False,False,4
6,1,504,4,0,0,True,True,True,True,True,...,True,False,False,False,False,False,False,False,False,5
7,1,672,5,0,0,True,True,True,True,True,...,True,False,False,False,False,False,False,False,False,6
8,1,672,6,0,0,True,True,True,True,True,...,True,False,False,False,False,False,False,False,False,7
9,1,1032,7,2,3,True,True,True,True,True,...,True,False,False,False,False,False,False,False,False,7


In [17]:
assert(sum([len(t) for t in traces]) == len(eventlog_df))
pickle.dump(traces, open(traces_finalpath, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(feature_dict, open(traces_dictionarypath, "wb"), protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
used_bytes = sum([t.memory_usage(index=True, deep=True).sum() for t in traces])
print("Memory usage of encoded data: {0} KB".format(used_bytes / 1024))
print("Memory usage of encoded data: {0} MB".format(used_bytes / 1024**2))
print("Memory usage of encoded data: {0} GB".format(used_bytes / 1024**3))

In [None]:
trace_columns = traces[0].columns.tolist()
trace_columns = list(map(lambda e: bool(re.match('^TARGET$', e)), trace_columns))
trace_columns.index(True) # when do the target columns start?!