In [None]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing, threading
import math
import scipy.stats as ss

from opyenxes.model.XLog import XLog
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier
from prefixspan import PrefixSpan

# Relative path of the XES event log parsed below; the CSV cache file name
# used further down is derived from it by replacing the ".xes" suffix.
data_path = "../logs/bpic2011.xes"

In [None]:
# Parse the XES file. Only element 0 of the parser result is kept as the
# event log (presumably the file holds a single log -- TODO confirm).
with open(data_path) as bpic_file:
    eventlog = XUniversalParser().parse(bpic_file)[0]

## Extract data from XES format and enrich with BOS/EOS markers

In [None]:
# Collect the union of event attribute keys over *all* traces. Sampling only
# the first trace silently drops every attribute that first shows up in a
# later trace, and fails outright on an empty log.
attribute_list = set()
for trace in eventlog:
    for event in trace:
        # iterating the attribute map yields the attribute keys
        attribute_list.update(event.get_attributes())

# Sort the keys so the column order is identical on every run; the iteration
# order of a set of strings changes between interpreter sessions.
column_names   = ["__case_id"] + sorted(attribute_list)

# Each trace contributes one row per event plus a <bos> and an <eos> marker row.
eventcount    = sum(2 + len(trace) for trace in eventlog)
event_indices = range(eventcount) # total number of entries in log
eventlog_df   = pd.DataFrame(columns=column_names, index=event_indices)

def set_row_value(df, row, colnames, val):
    """Write ``val`` into the given columns of a single row of ``df``, in place.

    Args:
        df: DataFrame to modify.
        row: Positional (0-based) row number.
        colnames: Iterable of column labels whose cell in ``row`` is overwritten.
        val: Value stored in every addressed cell.

    Raises:
        KeyError: If a label in ``colnames`` is not a column of ``df``.
    """
    for column in colnames:
        # Assign in a single indexing step. The chained form
        # ``df.iloc[row][column] = val`` writes into an intermediate Series
        # and pandas does not guarantee that the write reaches ``df``.
        df.iloc[row, df.columns.get_loc(column)] = val
        
def process_log_chunk(trace_offset, row_offset, chunk):
    """Copy a slice of traces into the pre-allocated global ``eventlog_df``.

    Every trace is written as a ``<bos>`` marker row, one row per event and a
    closing ``<eos>`` marker row, starting at positional row ``row_offset``.

    Args:
        trace_offset: Position of the first trace of ``chunk`` within the full
            log; added to the chunk-local trace index to form ``__case_id``.
        row_offset: First positional row of ``eventlog_df`` this chunk fills.
        chunk: Sequence of traces; each trace is an iterable of events that
            expose ``get_attributes()``.
    """
    # Resolve label -> position once so each cell is written with a single
    # ``iloc[row, col]`` assignment. The chained ``iloc[row][label] = ...``
    # form writes into an intermediate Series and may never reach the frame.
    column_pos  = {name: pos for pos, name in enumerate(eventlog_df.columns)}
    case_id_pos = column_pos["__case_id"]

    row_idx = row_offset
    for trace_idx, raw_trace in enumerate(chunk):
        # insert start-of-sequence marker
        set_row_value(eventlog_df, row_idx, column_names, "<bos>")
        row_idx += 1

        for event in raw_trace:
            event_attributes = event.get_attributes()
            eventlog_df.iloc[row_idx, case_id_pos] = trace_idx + trace_offset

            for attribute in event_attributes:
                pos = column_pos.get(attribute)
                if pos is None:
                    # The frame has no column for this attribute; skip it, as
                    # the chained assignment never stored such values either.
                    continue
                eventlog_df.iloc[row_idx, pos] = event_attributes[attribute].get_value()

            row_idx += 1

        # finalize trace by inserting end-of-sequence marker
        set_row_value(eventlog_df, row_idx, column_names, "<eos>")
        row_idx += 1

    # report the half-open row range [row_offset, row_idx) filled by this chunk
    print(row_offset, row_idx)

# Split the log into one contiguous chunk of traces per CPU core; each thread
# fills its own, non-overlapping row range of eventlog_df.
# NOTE(review): these are threads, not processes -- confirm that this actually
# speeds up the pure-Python parsing work.
threads  = []
chunk_sz = int(math.ceil(len(eventlog) / multiprocessing.cpu_count()))
row_offset = 0
trace_offset = 0
for core in range(0, multiprocessing.cpu_count()):
    trace_offset = core*chunk_sz
    chunk = eventlog[trace_offset : (core+1)*chunk_sz]
    t = threading.Thread(target=process_log_chunk, args=(trace_offset, row_offset, chunk))
    threads.append(t)

    # every trace occupies its events plus one <bos> and one <eos> row
    row_offset += sum(2 + len(trace) for trace in chunk)

# Plain loops instead of list comprehensions: the comprehensions were used
# only for their side effects and echoed a list of None as cell output.
for t in threads:
    t.start()
for t in threads:
    t.join()

In [None]:
# Replace the frame built above with a previously exported CSV copy.
# Uncomment the next line to (re)write that export; its file name now matches
# the "_parsed.csv" file that is read below.
# eventlog_df.to_csv(data_path.replace(".xes", "_parsed.csv"))
# eventlog_df = pd.read_csv(data_path.replace(".xes", "_prepared.csv"), index_col=[0])

# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 is the replacement. from_csv additionally tried to parse the
# index as dates, which is not wanted for the integer row index exported above.
eventlog_df = pd.read_csv(data_path.replace(".xes", "_parsed.csv"), index_col=0)

## Eliminate correlated or unimportant features

In [None]:
# Summary statistics, transposed so that every column of the log becomes one row.
eventlog_df.describe().T

In [None]:
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Applies the bias correction from W. Bergsma, "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.
    Adapted from
    https://stackoverflow.com/questions/46498455/categorical-features-correlation

    Args:
        confusion_matrix: 2-D contingency table of observed counts
            (NumPy array or anything ``np.asarray`` accepts).

    Returns:
        The bias-corrected statistic, or NaN when it is undefined: fewer than
        two observations, or a corrected row/column count that leaves nothing
        to divide by (e.g. a table with a single row or column).
    """
    table = np.asarray(confusion_matrix)

    # correction=False: Cramér's V is defined on the plain Pearson chi-squared
    # statistic. SciPy applies Yates' continuity correction to 2x2 tables by
    # default, which makes even a perfect binary association score below 1.
    chi2 = ss.chi2_contingency(table, correction=False)[0]
    n = table.sum()
    r, k = table.shape

    if n <= 1:
        # the bias correction divides by (n - 1)
        return np.nan

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)

    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # avoid 0/0 (and its RuntimeWarning) for degenerate tables
        return np.nan

    return np.sqrt(phi2corr / denominator)

# Print the association for every ordered pair of columns, including each
# column paired with itself.
for col_a, col_b in itertools.product(eventlog_df.columns, repeat=2):
    # .values replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    candidate = pd.crosstab(eventlog_df[col_a], eventlog_df[col_b]).values
    print("{: >30} {: >30} {: >20}".format(col_a, col_b, cramers_v(candidate)))

In [None]:
# Drop features that carry no additional information:
#   - lifecycle:transition is always "complete"
#   - Producer code correlates perfectly with org:group
#   - Activity code correlates perfectly with concept:name
# NOTE(review): no reason is recorded for dropping "Section" -- please confirm.
#
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
eventlog_df = eventlog_df.drop(
    columns=["lifecycle:transition", "Producer code", "Activity code", "Section"],
    errors="ignore",
)

## Create windowed featureset

In [None]:
def generate_windows(soffset, eoffset, core):
    """Build sliding windows of ``window_size`` consecutive log rows.

    For every start row ``i`` in ``[soffset, eoffset)`` the feature columns of
    rows ``i .. i+window_size-1`` are placed side by side in a one-row frame,
    each part prefixed with ``w<j>!!``. A window is discarded when any part
    after the first contains a ``<bos>`` value.

    Reads the notebook globals ``eventlog_df`` and ``window_size`` and stores
    the list of window frames in ``thread_results[core]``.

    Args:
        soffset: First start row (positional) handled by this call.
        eoffset: One past the last start row handled by this call.
        core: Slot of ``thread_results`` that receives the result list.
    """
    # The case id is not a feature; drop it once here instead of once per row.
    features = eventlog_df.drop(columns=["__case_id"])
    part_width = features.shape[1]

    # A window starting at row i reads rows up to i + window_size - 1. Without
    # this limit the last start rows index past the end of the frame, the
    # thread dies with IndexError and thread_results[core] stays None.
    stop = min(eoffset, len(features) - window_size + 1)

    windows = []
    i = soffset
    while i < stop:
        parts = [
            features.iloc[[i + j]].add_prefix("w{0}!!".format(j)).reset_index(drop=True)
            for j in range(0, window_size)
        ]
        window = pd.concat(parts, axis=1)

        # columns from part_width onwards belong to window parts 1..window_size-1
        if (window.iloc[0, part_width:] == "<bos>").any():
            # NOTE(review): the skip distance is kept as written. When <bos>
            # first appears in the last part, jumping window_size-2 rows lands
            # one row before the marker, and the following skip then passes
            # the windows starting at <bos> itself -- confirm this is intended.
            i += window_size-2
            continue

        i += 1
        windows.append(window)

    thread_results[core] = windows

# generating the windows takes incredibly long, speed up via parallel processing here
corecount = multiprocessing.cpu_count()
threads  = [None] * corecount
thread_results = [None] * corecount
row_offset = 0
window_size = 5

# Chunk by the actual length of the frame instead of the eventcount computed in
# the parsing cell, so this also works when eventlog_df was loaded from CSV.
total_rows = len(eventlog_df)
chunk_sz = int(math.ceil(total_rows / corecount))

# A window starting at row i reads rows i .. i+window_size-1, so start rows
# beyond this limit would index past the end of the frame and crash the thread.
window_start_limit = max(0, total_rows - window_size + 1)

for core in range(0, corecount):
    start = core*chunk_sz
    stop  = min((core+1)*chunk_sz, window_start_limit)
    t = threading.Thread(target=generate_windows, args=(start, stop, core))
    threads[core] = t

# Plain loops: the list comprehensions were used only for their side effects.
for t in threads:
    t.start()
for t in threads:
    t.join()

# flatten the per-thread lists of one-row frames, then stack them into one frame
thread_results = list(itertools.chain.from_iterable(thread_results))
thread_results = pd.concat(thread_results, ignore_index = True)

In [None]:
# Scratch example of the window layout: rows 1 and 2 of the log, without the
# case id, prefixed per position and placed side by side in a single row.
w = eventlog_df.iloc[[1]].drop(columns=["__case_id"]).add_prefix("w1_").reset_index(drop=True)
y = eventlog_df.iloc[[2]].drop(columns=["__case_id"]).add_prefix("w2_").reset_index(drop=True)

pd.concat([w, y], axis=1)

## Enrich with SP2 features

In [None]:
# https://stackoverflow.com/questions/42636765/how-to-set-all-the-values-of-an-existing-pandas-dataframe-to-zero
# One column per distinct concept:name value (named like the one-hot encoding),
# initialised to zero for later incrementation once the activity has been seen.
# The frame is built directly with integer zeros: the one-hot frame itself has
# a bool/uint8 dtype depending on the pandas version, and zeroing it through
# `.values[:] = 0` only works while `.values` is a writable view of the column.
sp2_features = pd.DataFrame(
    0,
    index=eventlog_df.index,
    columns=pd.get_dummies(eventlog_df["concept:name"], prefix="SP2").columns,
)

## Enrich with PrefixSpan features

In [None]:
def print_patterns(pt):
    """Print every entry of ``pt`` as a support percentage plus decoded items.

    Element 0 of an entry is divided by the number of traces (the notebook
    global ``event_traces``) and shown as a percentage; element 1 is iterated
    and each item is translated through the notebook global ``int_to_event``.
    A blank line separates consecutive entries.
    """
    for entry in pt:
        relative_support = 100*entry[0]/len(event_traces)
        print("Support: {0}%".format(relative_support))
        for code in entry[1]:
            print("    > ", int_to_event[code])
        print()
        
# Integer encoding of the activity names (including the <bos>/<eos> markers).
# Series.unique() keeps the order of first appearance, so the codes are the
# same on every run; the order of list(set(...)) changes between kernel
# sessions, which made the encoding non-reproducible.
events       = eventlog_df["concept:name"].unique().tolist()
event_to_int = {name: code for code, name in enumerate(events)}
int_to_event = dict(enumerate(events))

In [None]:
# PrefixSpan is fed a list of lists: one inner list of event codes per trace.
# Positions of all <bos> rows mark where a new trace begins.
indices = np.where(eventlog_df["concept:name"] == "<bos>")[0].tolist()
arr     = eventlog_df["concept:name"].map(event_to_int).tolist()

# Splitting at every <bos> position also yields a leading piece holding the
# rows before the first marker (empty when the log starts with <bos>); drop it.
event_traces   = np.array_split(arr, indices)[1:]
encoded_traces = [list(trace) for trace in event_traces]
prefixspan_traces = PrefixSpan(encoded_traces)

In [None]:
## TODO: create indices for sequence items here for dictionary encoding
# Query the PrefixSpan model built above via topk(15, closed=True) --
# presumably the 15 best-supported closed patterns; verify against the
# prefixspan package documentation. The 15 is a hard-coded choice.
ps_topkc = prefixspan_traces.topk(15, closed=True)