In [None]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing, threading
import math
import scipy.stats as ss
import time
import pickle

from opyenxes.model.XLog import XLog
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier

from prefixspan import PrefixSpan

from tqdm import *

# Location of the XES event log to load; the pickle cache path used later is derived from this string.
data_path = "../logs/bpic2011.xes"

In [None]:
# Parse the XES file. The parser result is indexed with [0], i.e. only its
# first entry is kept as the event log used by the rest of the notebook.
with open(data_path) as bpic_file:
    eventlog = XUniversalParser().parse(bpic_file)[0]

In [None]:
# Number of worker processes for the multiprocessing pools created below.
ncores = multiprocessing.cpu_count()
# Number of entries (traces) in the parsed event log.
ntraces = len(eventlog)

## Extract data trace-wise from XES format and enrich with BOS/EOS markers

In [None]:
# Collect the event attribute names that become the dataframe columns.
# Every trace is scanned (not only the first one) so an attribute that first
# shows up in a later trace still gets a column, and the names are sorted so
# the column order is the same on every run (a bare set has no stable order).
column_names = sorted({
    attribute
    for trace in eventlog
    for event in trace
    for attribute in event.get_attributes()
})

def create_dataframe_from_trace(t):
    """Convert one trace into a dataframe with one row per event.

    Parameters
    ----------
    t : sized iterable of events
        Every event must provide ``get_attributes()`` returning a mapping from
        attribute name to an object that has ``get_value()``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame indexed ``0 .. len(t)-1`` whose columns are the
        module-level ``column_names``. Attributes an event does not carry are
        left as NaN.

    Raises
    ------
    KeyError
        If an event carries an attribute that is not listed in ``column_names``.
    """
    column_position = {name: pos for pos, name in enumerate(column_names)}

    # Fill a plain NumPy object array and wrap it once at the end, instead of
    # writing into an existing frame through ``Series.values`` (not guaranteed
    # to be a writable view) or through chained indexing.
    cells = np.full((len(t), len(column_names)), np.nan, dtype=object)

    # The 0 placeholder for "__case_id" only takes effect when that column is
    # actually declared; it never adds a column of its own.
    if "__case_id" in column_position:
        cells[:, column_position["__case_id"]] = 0

    for event_idx, event in enumerate(t):
        event_attributes = event.get_attributes()
        for attribute in event_attributes:
            # Dict lookup raises KeyError for attributes without a column.
            cells[event_idx, column_position[attribute]] = event_attributes[attribute].get_value()

    return pd.DataFrame(cells, columns=column_names, index=range(0, len(t)))

# Convert all traces in a worker pool. The pool is used as a context manager so
# its worker processes are released as soon as every result has been collected,
# and a single progress bar with a known total follows the ordered imap results.
with multiprocessing.Pool(ncores) as ppool:
    traces = list(tqdm(ppool.imap(create_dataframe_from_trace, eventlog),
                       total=len(eventlog),
                       desc="Converting XES traces to Pandas dataframes",
                       unit="traces"))

In [None]:
# The parsed log is released here; from now on only `traces` is used.
del eventlog
# Cache the converted traces in a file whose name is derived from the log path.
traces_picklepath = data_path.replace(".xes", "_traces.pickled")
# Context manager so the file is flushed and closed even if pickling fails.
with open(traces_picklepath, "wb") as traces_file:
    pickle.dump(traces, traces_file)

In [None]:
# Reload the cached traces. Unpickling can execute arbitrary code, so this must
# only ever read a cache file that was written by this notebook.
with open(traces_picklepath, "rb") as traces_file:
    traces = pickle.load(traces_file)

## Eliminate correlated or unimportant features

In [None]:
def cramers_v(confusion_matrix):
    """Calculate the bias-corrected Cramér's V statistic for categorical-categorical association.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    https://stackoverflow.com/questions/46498455/categorical-features-correlation

    Parameters
    ----------
    confusion_matrix : 2-D array-like of counts
        Contingency table of the two variables (e.g. ``pd.crosstab(a, b).values``).

    Returns
    -------
    float
        The corrected Cramér's V, or NaN when the statistic is undefined
        because the corrected table has no degrees of freedom (a single row or
        column, or as many rows/columns as observations) or fewer than two
        observations.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.chi2_contingency`` for tables it rejects.
    """
    observed = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(observed)[0]
    n = observed.sum()
    r, k = observed.shape

    # (n - 1) is a divisor in every correction term below.
    if n <= 1:
        return np.nan

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)

    # Degenerate tables give a zero denominator; report "undefined" explicitly
    # instead of dividing by zero and emitting a RuntimeWarning.
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        return np.nan

    return np.sqrt(phi2corr / denominator)

# Print Cramér's V for every ordered pair of columns.
# NOTE: `eventlog_df` is only assigned in a later cell of this notebook (where
# the traces are concatenated), so that cell has to be executed before this one.
for col_a, col_b in itertools.product(eventlog_df.columns, repeat=2):
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    candidate = pd.crosstab(eventlog_df[col_a], eventlog_df[col_b]).values
    print("{: >30} {: >30} {: >20}".format(col_a, col_b, cramers_v(candidate)))

In [None]:
# lifecycle:transition is always "complete"
# Producer code correlates perfectly with org:group
# Activity code correlates perfectly with concept:name
# NOTE(review): "Section" is dropped as well, but no reason is recorded here - confirm.
# New frames are built instead of dropping in place, and columns that are
# already gone are ignored, so re-running this cell does not raise a KeyError.
traces = [
    t.drop(columns=["lifecycle:transition", "Producer code", "Activity code", "Section"],
           errors="ignore")
    for t in traces
]

## Create standard feature set

In [None]:
# Wrap every trace in a "<bos>" first row and an "<eos>" last row (the marker is
# written into every column), then stack all traces into one event-level frame.
# NOTE: `traces` is rebound to the wrapped frames, so executing this cell twice
# adds a second pair of marker rows to every trace.
nattr = len(traces[0].columns)
bos_df = pd.DataFrame([("<bos>",)*nattr], columns=traces[0].columns)
eos_df = pd.DataFrame([("<eos>",)*nattr], columns=traces[0].columns)

traces = [pd.concat([bos_df, trace, eos_df], ignore_index=True) for trace in traces]

eventlog_df = pd.concat(traces, ignore_index=True)
# TODO: one-hot encoding still remains

## Create SP2 feature set

In [None]:
# https://stackoverflow.com/questions/42636765/how-to-set-all-the-values-of-an-existing-pandas-dataframe-to-zero
# This one-hot encodes all entries in concept:name column for later incrementation once it has been seen
# sp2_features = pd.get_dummies(eventlog_df["concept:name"], prefix="SP2") # can't use windowed representation here as it might skew distribution of values
# eventlog_sp2_df = process_results.copy(deep=True)
# sp2_features    = sp2_features.drop(sp2_features.index[sp2_features.index[len(eventlog_sp2_df):]])
# assert(len(sp2_features) == len(eventlog_sp2_df))

In [None]:
# One "SP2_<activity>" column label per distinct activity name in the log;
# the traces are enriched with these indicator columns further down.
activity_labels = list(map("SP2_{0}".format, eventlog_df["concept:name"].unique()))
x = None

def enrich_trace_with_sp2(t):
    """Append cumulative "activity seen so far" indicator columns to a trace.

    Row 0 only has ``SP2_<bos>`` set. Every later row i repeats the flags of
    row i-1 and additionally sets the flag of the activity found in row i, so
    a flag stays 1 from the first occurrence of its activity onwards.

    Parameters
    ----------
    t : pandas.DataFrame
        One trace with a ``"concept:name"`` column. Assumed to carry the
        default 0..n-1 index so the column-wise concat lines up - the SP2
        block is indexed 0..n-1.

    Returns
    -------
    pandas.DataFrame
        ``t`` with one integer 0/1 column per entry of the module-level
        ``activity_labels`` appended on the right.

    Raises
    ------
    KeyError
        If an activity (or ``<bos>``) has no matching label in ``activity_labels``.
    """
    label_position = {label: pos for pos, label in enumerate(activity_labels)}
    n_events = len(t)

    # Integer matrix filled directly, instead of writing into an object-dtype
    # frame through `.values`, which is not guaranteed to be a writable view.
    seen = np.zeros((n_events, len(activity_labels)), dtype=int)

    # An empty trace simply yields no SP2 rows.
    if n_events > 0:
        seen[0, label_position["SP2_<bos>"]] = 1

    activity_names = t["concept:name"].tolist()
    for i in range(1, n_events):
        seen[i] = seen[i-1]  # carry over everything seen up to the previous event
        seen[i, label_position["SP2_{0}".format(activity_names[i])]] = 1

    sp2_df = pd.DataFrame(seen, columns=activity_labels, index=range(0, n_events))
    return pd.concat([t, sp2_df], axis=1)

# Enrich all traces in a worker pool. The pool is used as a context manager so
# its worker processes are released once every result has been collected, and a
# single progress bar with a known total follows the ordered imap results.
# NOTE: `traces` is rebound to the enriched frames, so executing this cell again
# appends the SP2 columns a second time.
with multiprocessing.Pool(ncores) as ppool:
    traces = list(tqdm(ppool.imap(enrich_trace_with_sp2, traces),
                       total=len(traces),
                       desc="Enriching traces with SP2 features",
                       unit="traces"))

## Enrich with PrefixSpan features

In [86]:
def print_patterns(pt, decoder=None):
    """Print mined patterns with their support and decoded event names.

    Parameters
    ----------
    pt : iterable of (support, pattern)
        ``support`` is printed as an absolute count, not a percentage;
        ``pattern`` is a sequence of integer event codes.
    decoder : mapping, optional
        Maps an integer event code to its label. Defaults to the module-level
        ``int_to_event``.
    """
    if decoder is None:
        decoder = int_to_event

    for p in pt:
        # The value is a count (the captured output shows e.g. 1143), so it is
        # labelled as a number of traces rather than suffixed with "%".
        print("Support: {0} traces".format(p[0]))
        for n in p[1]:
            print("    > ", decoder[n])
        print()
        
# Lookup tables between activity labels and dense integer codes, numbered in
# order of first appearance in the event log.
events       = eventlog_df["concept:name"].unique()
int_to_event = dict(enumerate(events))
event_to_int = {label: code for code, label in int_to_event.items()}

In [82]:
# PrefixSpan is fed a list of sequences: one list of integer event codes per trace.
encoded_traces = []
for trace in traces:
    event_codes = trace["concept:name"].map(event_to_int)
    encoded_traces.append(event_codes.tolist())

prefixspan_traces = PrefixSpan(encoded_traces)

In [87]:
# Request the top 15 patterns with closed=True and print them decoded back to event names.
ps_topkc = prefixspan_traces.topk(15, closed=True)
print_patterns(ps_topkc)
# TODO understand what support means
# NOTE(review): the first element of each result looks like an absolute count
# (the <bos>/<eos> pattern, which every wrapped trace contains, reports 1143),
# not a percentage - confirm against the prefixspan documentation.

Support: 1143%
    >  <bos>
    >  <eos>

Support: 1110%
    >  <bos>
    >  administratief tarief       - eerste pol
    >  <eos>

Support: 958%
    >  <bos>
    >  vervolgconsult poliklinisch
    >  <eos>

Support: 839%
    >  <bos>
    >  administratief tarief       - eerste pol
    >  vervolgconsult poliklinisch
    >  <eos>

Support: 804%
    >  <bos>
    >  aanname laboratoriumonderzoek
    >  ordertarief
    >  <eos>

Support: 765%
    >  <bos>
    >  aanname laboratoriumonderzoek
    >  aanname laboratoriumonderzoek
    >  ordertarief
    >  <eos>

Support: 760%
    >  <bos>
    >  vervolgconsult poliklinisch
    >  vervolgconsult poliklinisch
    >  <eos>

Support: 729%
    >  <bos>
    >  aanname laboratoriumonderzoek
    >  hemoglobine foto-elektrisch
    >  ordertarief
    >  <eos>

Support: 723%
    >  <bos>
    >  aanname laboratoriumonderzoek
    >  aanname laboratoriumonderzoek
    >  hemoglobine foto-elektrisch
    >  ordertarief
    >  <eos>

Support: 723%
    >  <bos