In [1]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing, threading
import math
import scipy.stats as ss
import time

from opyenxes.model.XLog import XLog
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier
from prefixspan import PrefixSpan

data_path = "../logs/bpic2011.xes"

In [2]:
with open(data_path) as bpic_file:
    eventlog = XUniversalParser().parse(bpic_file)[0]

Unknown extension: http://www.xes-standard.org/meta_time.xesext
Unknown extension: http://www.xes-standard.org/meta_life.xesext
Unknown extension: http://www.xes-standard.org/meta_org.xesext
Unknown extension: http://www.xes-standard.org/meta_concept.xesext
Unknown extension: http://www.xes-standard.org/meta_3TU.xesext
Unknown extension: http://www.xes-standard.org/meta_general.xesext


## Extract data from XES format and enrich with BOS/EOS markers

In [None]:
# collect all attributes
attribute_list = []

# extract column names from any trace, here the first is used
for event in eventlog[0]:
    event_attributes = event.get_attributes()
    
    for attribute in event_attributes:
        attribute_list.append(attribute)
        
attribute_list = set(attribute_list) # remove duplicates
column_names   = ["__case_id"] + list(attribute_list)

eventcount    = sum([2+len(t) for t in eventlog])
event_indices = range(0, eventcount) # total number of entries in log
eventlog_df   = pd.DataFrame(columns=column_names, index=event_indices)

def set_row_value(df, row, colnames, val):
    for column in colnames:
        df.iloc[row][column] = val
        
def process_log_chunk(trace_offset, row_offset, chunk):
    row_idx = row_offset
    for trace_idx, raw_trace in enumerate(chunk):
        # insert start-of-sequence marker
        set_row_value(eventlog_df, row_idx, column_names, "<bos>")
        row_idx += 1

        for event_idx, event in enumerate(raw_trace):
            event_attributes = event.get_attributes()
            eventlog_df.iloc[row_idx]["__case_id"] = trace_idx + trace_offset

            for attribute in event_attributes:
                eventlog_df.iloc[row_idx][attribute] = event_attributes[attribute].get_value()

            row_idx += 1
        # finalize trace by inserting end-of-sequence marker    
        set_row_value(eventlog_df, row_idx, column_names, "<eos>")
        row_idx += 1
        
    print("Finished processing rows ", row_offset, " through", row_idx)

threads  = []
chunk_sz = int(math.ceil(len(eventlog) / multiprocessing.cpu_count()))
row_offset = 0
trace_offset = 0
for core in range(0, multiprocessing.cpu_count()):
    trace_offset = core*chunk_sz
    chunk = eventlog[trace_offset : (core+1)*chunk_sz]
    t = threading.Thread(target=process_log_chunk, args=(trace_offset, row_offset, chunk))
    row_offset += sum([2+len(t) for t in chunk])
    
    threads.append(t)
    
[t.start() for t in threads]
[t.join()  for t in threads]
print("Finished parsing the event log!")

In [None]:
del eventlog
eventlog_df.to_csv(data_path.replace(".xes", "_parsed.csv"))

In [2]:
eventlog_df = pd.read_csv(data_path.replace(".xes", "_parsed.csv"), index_col=[0])
eventcount  = len(eventlog_df)
# eventlog_df = pd.DataFrame.from_csv(data_path.replace(".xes", "_parsed.csv"))

## Eliminate correlated or unimportant features

In [None]:
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
        https://stackoverflow.com/questions/46498455/categorical-features-correlation"""
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

for col_a,col_b in itertools.product(eventlog_df.columns, repeat=2): 
    candidate = pd.crosstab(eventlog_df[col_a], eventlog_df[col_b]).as_matrix()
    print("{: >30} {: >30} {: >20}".format(col_a, col_b, cramers_v(candidate)))

In [3]:
# lifecyle:transition is always "complete"
# Producer code correlates perfectly with org:group
# Activity code correlates perfectly with concept:name
eventlog_df = eventlog_df.drop(columns=["lifecycle:transition", "Producer code", "Activity code", "Section"])

## Create windowed featureset

In [5]:
def generate_windows(chunk, window_size, core):
    i = 0
    windows = []
    t_start = time.time()
        
    while i < len(chunk)-window_size+1:
        window = []
        for j in range(0, window_size):
            window_part = chunk.iloc[[i+j]].drop(columns=["__case_id"]).add_prefix("w{0}!!".format(j))
            window_part.reset_index(drop=True, inplace=True)
            window.append(window_part)
            
        window = pd.concat(window, axis=1)
        window_part_width = int(window.shape[1] / window_size)
        
        if((window.iloc[0,window_part_width:] == "<bos>").any()):
            i += window_size-1
            continue
            
        i += 1
        windows.append(window)
    
    print("Window generation: {0} windows on core {1} in {2}s".format(len(windows), core, time.time() - t_start))
    return pd.concat(windows).reset_index(drop=True)

# generating the windows takes incredibly long because of NUMA effects, put everything into processes to avoid contention
corecount = multiprocessing.cpu_count()
process_pool = multiprocessing.Pool(corecount)
chunk_sz = int(math.ceil(eventcount / corecount))
window_size = 5
process_args = []

for core in range(0, corecount):
    process_args.append((eventlog_df[core*chunk_sz:(core+1)*chunk_sz].reset_index(drop=True), window_size, core))

process_results = process_pool.starmap(generate_windows, process_args)
process_results = pd.concat(process_results, ignore_index = True)

Window generation: 3580 windows on core 39 in 35.15171241760254s
Window generation: 3699 windows on core 38 in 36.69233059883118s
Window generation: 3711 windows on core 35 in 36.87334680557251s
Window generation: 3731 windows on core 36 in 36.939133405685425s
Window generation: 3695 windows on core 33 in 37.3782684803009s
Window generation: 3699 windows on core 37 in 37.55412936210632s
Window generation: 3755 windows on core 34 in 37.71281957626343s
Window generation: 3755 windows on core 26 in 38.60425925254822s
Window generation: 3723 windows on core 31 in 38.47282314300537s
Window generation: 3739 windows on core 24 in 38.813809633255005s
Window generation: 3715 windows on core 29 in 39.097424268722534s
Window generation: 3611 windows on core 25 in 39.23093771934509s
Window generation: 3683 windows on core 20 in 39.34364461898804s
Window generation: 3723 windows on core 30 in 39.239952087402344s
Window generation: 3691 windows on core 32 in 39.28441596031189s
Window generation: 366

In [7]:
process_results.to_csv(data_path.replace(".xes", "_windowed.csv"))

## Enrich with SP2 features

In [82]:
# https://stackoverflow.com/questions/42636765/how-to-set-all-the-values-of-an-existing-pandas-dataframe-to-zero
# This one-hot encodes all entries in concept:name column for later incrementation once it has been seen
sp2_features = pd.get_dummies(eventlog_df["concept:name"], prefix="SP2") # can't use windowed representation here as it might skew distribution of values
for col in sp2_features.columns: sp2_features[col].values[:] = 0

eventlog_sp2_df = process_results.copy(deep=True)
sp2_features    = sp2_features.drop(sp2_features.index[sp2_features.index[len(eventlog_sp2_df):]])
assert(len(sp2_features) == len(eventlog_sp2_df))

In [50]:
t_start = time.time()
for i in range(0, len(sp2_features)):
    first_activity_name = eventlog_sp2_df["w0!!concept:name"].iloc[i]
    if (first_activity_name != "<bos>"):
        sp2_features.iloc[i] = sp2_features.iloc[i-1]
    else: # TODO implement stepping through entire window here to extract all appeared items
        pass
    
    sp2_features["SP2_{0}".format(activity_name)].iloc[i] = 1
    
print("Enriched {0} rows with SP2 features in {1}s".format(len(sp2_features), time.time()-t_start))
# TODO parallelize

Enriched 147849 rows with SP2 features in 613.7024304866791s


In [57]:
sp2_features["SP2_<eos>"].

## Enrich with PrefixSpan features

In [None]:
def print_patterns(pt):
    for p in pt:
        print("Support: {0}%".format(100*p[0]/len(event_traces)))
        for n in p[1]:
            print("    > ", int_to_event[n])
        print()
        
events       = list(set(eventlog_df["concept:name"]))
event_to_int = dict((c, i) for i,c in enumerate(events))
int_to_event = dict((i, c) for i,c in enumerate(events))

In [None]:
# Prefixspan requires an array of arrays with one subarray for every trace
indices = np.where(eventlog_df["concept:name"] == "<bos>")[0].tolist()
arr     = eventlog_df["concept:name"].map(event_to_int).tolist()
event_traces   = np.array_split(arr, indices)[1:] # remove randomly inserted array at the start of this list
encoded_traces = [ [e for e in t] for t in event_traces ]
prefixspan_traces = PrefixSpan(encoded_traces)

In [None]:
## TODO: create indices for sequence items here for dictionary encoding
ps_topkc = prefixspan_traces.topk(15, closed=True)