In [None]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing, threading
import math
import scipy.stats as ss
import time
import pickle

from opyenxes.model.XLog import XLog
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier

from prefixspan import PrefixSpan

from tqdm import *

data_path = "../logs/bpic2011.xes"
traces_picklepath = data_path.replace(".xes", "_raw_traces.pickled")
traces_tmppath = data_path.replace(".xes", "_traces_tmp.pickled")
traces_finalpath = data_path.replace(".xes", "_traces_encoded.pickled")

In [None]:
with open(data_path) as bpic_file:
    eventlog = XUniversalParser().parse(bpic_file)[0]

In [None]:
ncores = multiprocessing.cpu_count()
ntraces = len(eventlog)

## Extract data trace-wise from XES format and enrich with BOS/EOS markers

In [None]:
# collect all attributes
column_names = []

for event in eventlog[0]:
    for attribute in event.get_attributes():
        column_names.append(attribute)
        
column_names = set(column_names) # remove duplicates
column_names = list(column_names)

def create_dataframe_from_trace(t):
    df = pd.DataFrame(columns=column_names, index=range(0,len(t)))
    for event_idx, event in enumerate(t):
        event_attributes = event.get_attributes()
        df.iloc[event_idx]["__case_id"] = 0

        for attribute in event_attributes:
            df[attribute].values[event_idx] = event_attributes[attribute].get_value()
    
    return df

ppool = multiprocessing.Pool(ncores)
traces = []
with tqdm(total=len(eventlog), desc="Converting XES traces to Pandas dataframes", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(create_dataframe_from_trace, eventlog))):
        pbar.update()
        traces.append(_)
        
del eventlog

In [None]:
pickle.dump(traces, open(traces_picklepath, "wb"))

In [43]:
traces = pickle.load(open(traces_picklepath, "rb"))

## Eliminate correlated or unimportant features

In [None]:
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
        https://stackoverflow.com/questions/46498455/categorical-features-correlation"""
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

for col_a,col_b in itertools.product(eventlog_df.columns, repeat=2): 
    candidate = pd.crosstab(eventlog_df[col_a], eventlog_df[col_b]).as_matrix()
    print("{: >30} {: >30} {: >20}".format(col_a, col_b, cramers_v(candidate)))

In [44]:
# lifecyle:transition is always "complete"
# Producer code correlates perfectly with org:group
# Activity code correlates perfectly with concept:name
for t in traces:
    t.drop(columns=["lifecycle:transition", "Producer code", "Activity code", "Section"], inplace=True)

## Create standard feature set

In [45]:
eventlog_df = [None] * len(traces)
nattr = len(traces[0].columns)
bos_df = pd.DataFrame([("<bos>",)*nattr], columns = traces[0].columns)
eos_df = pd.DataFrame([("<eos>",)*nattr], columns = traces[0].columns)

# Do not enrich with BOS,EOS features for now
# for i in range(0,len(traces)):
#     traces[i] = pd.concat([bos_df, traces[i], eos_df], ignore_index=True)

eventlog_df = pd.concat(traces, ignore_index=True)

## Create SP2 feature set

In [None]:
# https://stackoverflow.com/questions/42636765/how-to-set-all-the-values-of-an-existing-pandas-dataframe-to-zero
# This one-hot encodes all entries in concept:name column for later incrementation once it has been seen
# sp2_features = pd.get_dummies(eventlog_df["concept:name"], prefix="SP2") # can't use windowed representation here as it might skew distribution of values
# eventlog_sp2_df = process_results.copy(deep=True)
# sp2_features    = sp2_features.drop(sp2_features.index[sp2_features.index[len(eventlog_sp2_df):]])
# assert(len(sp2_features) == len(eventlog_sp2_df))

In [47]:
# loop through every trace and encode the presence of an activity
sp2_prefix = "SP2_"
activity_labels = [ "{0}{1}".format(sp2_prefix,a) for a in eventlog_df["concept:name"].unique() ]

def enrich_trace_with_sp2(t):
    sp2_df = pd.DataFrame(columns=activity_labels, index=range(0,len(t)), dtype=np.bool)
    for col in sp2_df.columns: sp2_df[col].values[:] = 0
    sp2_df["{0}{1}".format(sp2_prefix, t["concept:name"][0])].values[0]  = 1
    
    for i in range(1,len(t)):
        first_activity_name = t["concept:name"].iloc[i]
        col = "{0}{1}".format(sp2_prefix,first_activity_name)
        
        sp2_df.values[i] = sp2_df.values[i-1]
        sp2_df[col].values[i] = 1
        
    return pd.concat([t, sp2_df], axis=1)

ppool = multiprocessing.Pool(ncores)
ttraces = []
with tqdm(total=len(traces), desc="Enriching traces with SP2 features", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(enrich_trace_with_sp2, traces))):
        pbar.update()
        ttraces.append(_)
        
traces = ttraces
del ttraces

Enriching traces with SP2 features:   0%|          | 0/1143 [00:00<?, ?traces/s]
Enriching traces with SP2 features:   0%|          | 1/1143 [00:00<04:48,  3.96traces/s]
Enriching traces with SP2 features:   1%|          | 13/1143 [00:00<03:24,  5.54traces/s]
Enriching traces with SP2 features:   6%|▋         | 73/1143 [00:00<02:16,  7.84traces/s]
Enriching traces with SP2 features:  10%|█         | 117/1143 [00:00<01:32, 11.11traces/s]
Enriching traces with SP2 features:  14%|█▍        | 161/1143 [00:00<01:02, 15.60traces/s]
Enriching traces with SP2 features:  18%|█▊        | 210/1143 [00:01<00:42, 21.85traces/s]
Enriching traces with SP2 features:  21%|██        | 242/1143 [00:01<00:29, 30.09traces/s]
Enriching traces with SP2 features:  25%|██▌       | 286/1143 [00:01<00:20, 41.49traces/s]
Enriching traces with SP2 features:  29%|██▊       | 328/1143 [00:01<00:14, 56.76traces/s]
328it [00:01, 59.27it/s][A
Enriching traces with SP2 features:  34%|███▍      | 389/1143 [00:01<00:08, 

## Enrich with PrefixSpan features

In [49]:
def print_patterns(pt):
    for p in pt:
        print("Support: {0}%".format(100*p[0]/len(traces)))
        for n in p[1]:
            print("    > ", int_to_event[n])
        print()

# since most patterns begin and end with the <eos> and <bos> markers, the features only become valuable towards the end...
events       = eventlog_df["concept:name"].unique()
event_to_int = dict((c, i) for i,c in enumerate(events) if c not in ["<bos>","<eos>"])
int_to_event = dict((i, c) for i,c in enumerate(events) if c not in ["<bos>","<eos>"])

In [None]:
traces = save_traces[:]

In [50]:
save_traces = traces[:]

In [51]:
# Prefixspan requires an array of arrays with one subarray for every trace
encoded_traces = [ t["concept:name"].map(event_to_int).tolist() for t in traces ]
prefixspan_traces = PrefixSpan(encoded_traces)
closed_sequences = prefixspan_traces.topk(25, closed=True) # support is how often the subsequence appears in total
# http://sequenceanalysis.github.io/slides/analyzing_sequential_user_behavior_part2.pdf, slide 5
# print_patterns(ps_topkc)

# only take subsequence which are at a certain level of support? like if ss[0]/len(traces) < .90
#ps_topkc = list(filter(lambda x: x[0]/len(traces) > .90, ps_topkc))
closed_sequences = [ p[1] for p in closed_sequences ]
ptraces = [ (t, closed_sequences[:], event_to_int) for t in traces ] # enrich traces with copy of mined subsequences

In [52]:
def wrapped__enrich_trace_with_subseq(args):
    return enrich_trace_with_subseq(*args)

def enrich_trace_with_subseq(t, ps, event_to_int):
    col_prefix = "PFS_"
    subseq_labels = [ "{0}{1}".format(col_prefix,ss_idx) for ss_idx, ss in enumerate(ps) ]
    subseq_df = pd.DataFrame(columns=subseq_labels, index=range(0,len(t)), dtype=np.bool)
    
    for col in subseq_df.columns: subseq_df[col].values[:] = 0
    for i in range(0,len(t)): # loop through sequence, prune items from mined sequences, and once a subsequence array is empty, this subsequence has occured :)
        activity_code = event_to_int.get(t["concept:name"].iloc[i], None)
        
        for subseq_idx in range(0,len(ps)):
            if ps[subseq_idx] == []:
                continue
            if ps[subseq_idx][0] == activity_code:
                ps[subseq_idx].pop(0)
                if ps[subseq_idx] == []:
                    subseq_df.values[i:,subseq_idx] = 1
        
    return pd.concat([t, subseq_df], axis=1)

ppool = multiprocessing.Pool(ncores)
ttraces = []

with tqdm(total=len(traces), desc="Enriching traces with mined subsequence features", unit="traces") as pbar:
    for i, _ in tqdm(enumerate(ppool.imap(wrapped__enrich_trace_with_subseq, ptraces))):
        pbar.update()
        ttraces.append(_)
        
traces = ttraces
del ttraces

Enriching traces with mined subsequence features:   0%|          | 0/1143 [00:00<?, ?traces/s]
Enriching traces with mined subsequence features:   1%|          | 8/1143 [00:00<00:15, 74.14traces/s]
Enriching traces with mined subsequence features:   2%|▏         | 25/1143 [00:00<00:12, 88.85traces/s]
Enriching traces with mined subsequence features:   5%|▍         | 55/1143 [00:00<00:09, 111.22traces/s]
Enriching traces with mined subsequence features:   6%|▋         | 73/1143 [00:00<00:08, 121.65traces/s]
Enriching traces with mined subsequence features:   8%|▊         | 87/1143 [00:00<00:08, 126.59traces/s]
Enriching traces with mined subsequence features:  10%|█         | 118/1143 [00:00<00:06, 151.45traces/s]
Enriching traces with mined subsequence features:  14%|█▍        | 164/1143 [00:00<00:05, 186.28traces/s]
Enriching traces with mined subsequence features:  17%|█▋        | 192/1143 [00:00<00:04, 204.53traces/s]
Enriching traces with mined subsequence features:  20%|██        

In [53]:
pickle.dump(traces, open(traces_tmppath, "wb"))

In [65]:
traces = pickle.load(open(traces_tmppath, "rb"))

## Do basic encoding of remaining features

In [66]:
# Convert timestamp to total running time
bos_idx = 0
for i in range(0, len(traces)):
    tlen = len(traces[i])-1
    dfs = traces[i]["time:timestamp"] - traces[i]["time:timestamp"][bos_idx]
    traces[i]["time:timestamp"] = dfs.map(lambda d: d.total_seconds())

In [67]:
# Do one-hot encoding for concept:name and Specialism code
dummy_columns = ["concept:name", "org:group"]
eventlog_df_dummies = pd.get_dummies(eventlog_df[dummy_columns], dtype=np.bool)
trace_offset = 0

for i in range(0,len(traces)):
    traces[i].drop(columns=dummy_columns, inplace = True)
    traces[i] = pd.concat([eventlog_df_dummies[trace_offset:len(traces[i])], traces[i]], ignore_index=False, axis=1)
    trace_offset += len(traces[i])

In [98]:
for i in range(0,len(traces)):
    traces[i]["Specialism code"] = pd.to_numeric(traces[i]["Specialism code"], errors="ignore")
    traces[i]["Number of executions"] = pd.to_numeric(traces[i]["Number of executions"], errors="ignore")

In [102]:
pickle.dump(traces, open(traces_finalpath, "wb"))

In [109]:
used_bytes = sum(traces[0].memory_usage(index=True, deep=True)) * sum([len(t) for t in traces])
print("Memory usage of encoded data: {0} KB".format(used_bytes / 1024))
print("Memory usage of encoded data: {0} MB".format(used_bytes / 1024**2))
print("Memory usage of encoded data: {0} GB".format(used_bytes / 1024**3))

Memory usage of encoded data: 14750973.588867188 KB
Memory usage of encoded data: 14405.247645378113 MB
Memory usage of encoded data: 14.067624653689563 GB
