In [1]:
import numpy as np
import random
import itertools
import pandas as pd
import multiprocessing
import math
import scipy.stats as ss
import seaborn as sns
import pickle
import re
import copy
from keras.utils import np_utils

from prefixspan import PrefixSpan

from tqdm import tqdm_notebook

Using TensorFlow backend.


In [2]:
### configuration
# Input event log (CSV); the two pickle paths below are derived from it by
# swapping the ".csv" extension.
data_path = "../logs/helpdesk.csv"
traces_picklepath  = data_path.replace(".csv", "_raw_traces.pickled")
# NOTE(review): not referenced in the visible cells - confirm it is still needed.
traces_dictionarypath = data_path.replace(".csv", "_dictionaries.pickled")
# Column holding the activity label; the next-activity target is derived from it.
target_column = "concept:name"
# Columns that get a dictionary (label <-> integer) encoding.
categorical_feature_names = [target_column]
# Columns holding timestamps.
date_feature_names = ["time:timestamp"]
# Artificial label used as the prediction target after the last event of a trace.
eosmarker = "<EOS>"
# Number of worker processes for the multiprocessing pools.
ncores = multiprocessing.cpu_count()
### configuration end

## Extract data trace-wise from CSV format

In [3]:
# Read the raw event log and order it by case and completion timestamp so that
# the events of every case appear in sequence.
eventlog = pd.read_csv(data_path).sort_values(by=['CaseID', 'CompleteTimestamp'])

# One DataFrame per case, reduced to activity + timestamp, with a fresh 0..n-1
# index, string activity labels and XES-style column names.
traces = [
    case_events[['ActivityID', 'CompleteTimestamp']]
    .reset_index(drop=True)
    .astype({'ActivityID': str})
    .rename(columns={'ActivityID': 'concept:name', 'CompleteTimestamp': 'time:timestamp'})
    for _, case_events in eventlog.groupby('CaseID')
]

# Cache the raw traces; the context manager guarantees the file is flushed and
# closed even if pickling fails.
with open(traces_picklepath, "wb") as traces_file:
    pickle.dump(traces, traces_file)

In [22]:
# Reload the cached raw traces. Only unpickle files written by this notebook:
# pickle.load can execute arbitrary code from an untrusted file.
with open(traces_picklepath, "rb") as traces_file:
    traces = pickle.load(traces_file)

## Convert timestamps to relative scale in hours

In [6]:
# Convert every configured date column from absolute timestamps to the number
# of whole hours elapsed since the first event of the trace.
# NOTE: this overwrites the columns of `traces` in place, so the cell is not
# idempotent - reload the pickled traces before running it a second time.
bos_idx = 0  # position of the first event of a trace
for trace in traces:
    for c in date_feature_names:
        timestamps = pd.to_datetime(trace[c], utc=True)
        # Positional lookup, so this does not depend on the trace's index labels.
        elapsed = timestamps - timestamps.iloc[bos_idx]
        # Truncate towards zero (like int()) and store as float32.
        trace[c] = np.trunc(elapsed.dt.total_seconds() / (60 * 60)).astype(np.float32)

## Dictionary encoding for categorical features

In [7]:
# Build, for every categorical feature, a two-way mapping between its labels
# and consecutive integer codes (in order of first appearance in the log).
eventlog_df = pd.concat(traces, ignore_index=True)
feature_dict = {}
for cf in categorical_feature_names:
    events = eventlog_df[cf].unique().tolist()
    # The target column additionally needs a code for the end-of-sequence marker.
    if cf == target_column:
        events.append(eosmarker)
    cf_dict = {
        'to_int': {label: code for code, label in enumerate(events)},
        'to_cat': {code: label for code, label in enumerate(events)},
    }
    feature_dict[cf] = cf_dict

## Create SP2 feature set

In [8]:
# SP2 feature columns: one column per distinct activity in the log, named
# "<sp2_prefix><activity>".
sp2_prefix = "SP2_"
activity_labels = [sp2_prefix + str(a) for a in eventlog_df[target_column].unique()]

def enrich_trace_with_sp2(t):
    """Build the SP2 ("activity seen so far") feature frame for one trace.

    Row ``i`` of the result has ``True`` in column ``<sp2_prefix><activity>``
    if that activity occurred at any position ``<= i`` of the trace.

    Reads the module-level ``activity_labels``, ``sp2_prefix`` and
    ``target_column``.

    Parameters
    ----------
    t : pandas.DataFrame
        One trace; its ``target_column`` holds the activity labels in order.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with one row per event (index 0..len(t)-1) and one
        column per entry of ``activity_labels``.

    Raises
    ------
    KeyError
        If the trace contains an activity without a column in
        ``activity_labels``.
    """
    column_of = {label: position for position, label in enumerate(activity_labels)}

    # Fill a plain NumPy array and wrap it afterwards: writing through
    # DataFrame.values is not guaranteed to reach the frame.
    seen = np.zeros((len(t), len(activity_labels)), dtype=bool)
    for row, activity in enumerate(t[target_column].tolist()):
        if row > 0:
            # Everything seen up to the previous event is still seen.
            seen[row] = seen[row - 1]
        seen[row, column_of["{0}{1}".format(sp2_prefix, activity)]] = True

    return pd.DataFrame(seen, columns=activity_labels)

# Compute the SP2 features of all traces in parallel. imap yields results in
# input order, so sp2_traces[i] belongs to traces[i]. Using the pool as a
# context manager shuts the workers down even if one of them raises.
with multiprocessing.Pool(ncores) as ppool:
    sp2_traces = list(tqdm_notebook(ppool.imap(enrich_trace_with_sp2, traces),
                                    total=len(traces),
                                    unit="traces"))

HBox(children=(IntProgress(value=0, max=3804), HTML(value='')))




## Create PrefixSpan feature set

In [9]:
# PrefixSpan expects one list of integer activity codes per trace.
activity_to_int = feature_dict[target_column]['to_int']
encoded_traces = [t[target_column].map(activity_to_int).tolist() for t in traces]
prefixspan_traces = PrefixSpan(encoded_traces)

# Mine the top 25 closed patterns. Element 1 of every result is the pattern
# itself; element 0 is its support (how often the subsequence appears).
# Background: http://sequenceanalysis.github.io/slides/analyzing_sequential_user_behavior_part2.pdf, slide 5
# TODO: optionally keep only patterns above a relative support threshold,
# e.g. support / len(traces) > .90
topk_results = prefixspan_traces.topk(25, closed=True)
closed_sequences = [result[1] for result in topk_results]

# One argument tuple per trace: the trace, its own copy of the mined patterns
# and the activity encoding.
pftrace_args = [(t, list(closed_sequences), activity_to_int) for t in traces]

In [44]:
def wrapped__enrich_trace_with_subseq(args):
    """Unpack one argument tuple and forward it to ``enrich_trace_with_subseq``.

    ``Pool.imap`` hands a single object to the worker function, so the
    arguments are bundled into one tuple per trace (see ``pftrace_args``)
    and unpacked here.
    """
    return enrich_trace_with_subseq(*args)

def enrich_trace_with_subseq(t, ps, event_to_int):
    """Build the PrefixSpan pattern feature frame for one trace.

    Column ``PFS_<k>`` belongs to pattern ``ps[k]``. It becomes ``True`` at
    the row where the first contiguous occurrence of the pattern in the
    trace is completed and stays ``True`` for all later rows.

    Parameters
    ----------
    t : pandas.DataFrame
        One trace; column ``"concept:name"`` holds the activity labels.
    ps : sequence of sequences of int
        Mined patterns, given as integer activity codes.
    event_to_int : dict
        Mapping from activity label to integer code.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with one row per event (index 0..len(t)-1) and one
        ``PFS_<k>`` column per pattern.
    """
    col_prefix = "PFS_"
    subseq_labels = ["{0}{1}".format(col_prefix, ss_idx) for ss_idx in range(len(ps))]

    # Plain Python list: positional access independent of the trace's index.
    activity_codes = t["concept:name"].map(event_to_int).tolist()
    tlen = len(activity_codes)

    # Fill a plain NumPy array and wrap it afterwards: writing through
    # DataFrame.values is not guaranteed to reach the frame.
    found = np.zeros((tlen, len(ps)), dtype=bool)

    for subseq_idx, subseq in enumerate(ps):
        pattern = list(subseq)
        width = len(pattern)
        # The last admissible start is tlen - width, so a pattern that ends
        # exactly on the final event of the trace is detected as well.
        for start in range(tlen - width + 1):
            if activity_codes[start:start + width] == pattern:
                # Flag from the row that completes the pattern onwards; an
                # empty pattern counts as present from the first row.
                first_row = max(start + width - 1, 0)
                found[first_row:, subseq_idx] = True
                # Later occurrences cannot flag any additional rows.
                break

    return pd.DataFrame(found, columns=subseq_labels)

# Compute the PrefixSpan features of all traces in parallel. imap yields
# results in input order, so pf_traces[i] belongs to traces[i]. Using the pool
# as a context manager shuts the workers down even if one of them raises.
with multiprocessing.Pool(ncores) as ppool:
    pf_traces = list(tqdm_notebook(ppool.imap(wrapped__enrich_trace_with_subseq, pftrace_args),
                                   total=len(pftrace_args),
                                   unit="traces"))

HBox(children=(IntProgress(value=0, max=3804), HTML(value='')))




## Create and normalize ordinal and categorical feature sets

In [11]:
# Every column that is not categorical is treated as an ordinal feature.
ordinal_feature_names = traces[0].columns.difference(categorical_feature_names)
target_to_int = feature_dict[target_column]['to_int']
n_target_classes = max(target_to_int.values()) + 1
eos_code = target_to_int[eosmarker]

ordinal_traces = [None] * len(traces)
categorical_traces = [None] * len(traces)
target_traces = [None] * len(traces)

# Build three separate frames per trace: one-hot targets, normalized ordinal
# features and categorical features. `traces` itself is not modified; column
# selection and astype return new objects, so no deep copy is required.
for i, trace in enumerate(traces):

    # TARGET of an event is the code of the following activity; the last
    # event of a trace gets the end-of-sequence code.
    activity_codes = trace[target_column].map(target_to_int).tolist()
    next_codes = activity_codes[1:] + [eos_code]
    target_traces[i] = pd.DataFrame(
        np_utils.to_categorical(next_codes, num_classes=n_target_classes, dtype='bool')
    ).add_prefix("TARGET_")
    assert len(target_traces[i]) == len(trace), i

    # Separate frames for ordinal and categorical features
    x = trace[ordinal_feature_names].astype(np.float32)
    categorical_traces[i] = trace[categorical_feature_names].astype(str)

    # min-max-normalization of ordinal features PER TRACE
    assert len(x) == len(trace), i
    denominator = x.max(axis=0) - x.min(axis=0)
    # A constant column has range 0; divide by 1 instead so it maps to 0.
    denominator[denominator == 0] = 1

    ordinal_traces[i] = (x - x.min(axis=0)) / denominator
    assert len(ordinal_traces[i]) == len(trace), i

## Save data sets per variable type

In [12]:
# Split the trace positions into train (75 %) and test (25 %) indices with a
# fixed seed.
# NOTE(review): no `stratify=` argument is passed, so this is a plain random
# split; the trace lengths in `y` are split alongside X but both resulting
# halves are discarded. Confirm whether stratifying by length was intended.
from sklearn.model_selection import train_test_split
X = list(range(len(ordinal_traces)))
y = [len(t) for t in ordinal_traces]
train_indices, test_indices, _, _ = train_test_split(X,y, test_size=0.25, random_state=42)

# Output directory for the pickled data sets. File names are appended by plain
# string concatenation, so the value has to end with "/".
# NOTE(review): hard-coded absolute, user-specific path - consider deriving it
# from the configuration cell instead.
save_path = "/home/felix.wolff2/master-thesis-code/logs/helpdesk/"
def save_trace_dataset(dataset, settype, purpose):
    """Pickle ``dataset`` to ``<save_path><settype>_<purpose>.pickled``.

    Parameters
    ----------
    dataset : object
        Any picklable object (here: a list of DataFrames or the mapping dict).
    settype : str
        First part of the file name, e.g. ``'ordinal'`` or ``'sp2'``.
    purpose : str
        Second part of the file name, e.g. ``'train'`` or ``'test'``.

    The target path is the module-level ``save_path`` with the file name
    appended by string concatenation. An existing file is overwritten.
    """
    suffix = "{0}_{1}.pickled".format(settype, purpose)
    p = save_path + suffix
    # The context manager closes (and thereby flushes) the file even if
    # pickling fails halfway through.
    with open(p, "wb") as out_file:
        pickle.dump(dataset, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    
def filter_by_indices(a, a_idx):
    """Return the elements of ``a`` whose position is contained in ``a_idx``.

    The result keeps the order of ``a`` (not the order of ``a_idx``); every
    position is returned at most once, and positions outside ``a`` are
    ignored.

    Parameters
    ----------
    a : sequence
        Items to select from.
    a_idx : iterable of int
        Positions to keep.

    Returns
    -------
    list
    """
    # Set membership is O(1); testing against the original list would make
    # the selection quadratic.
    wanted = set(a_idx)
    return [item for position, item in enumerate(a) if position in wanted]

# Persist the label <-> integer mappings once ...
save_trace_dataset(feature_dict, 'mapping', 'dict')

# ... and every feature set once per split (train first, then test).
datasets_by_type = [
    ('ordinal', ordinal_traces),
    ('categorical', categorical_traces),
    ('sp2', sp2_traces),
    ('pfs', pf_traces),
    ('target', target_traces),
]
for split_name, split_indices in (('train', train_indices), ('test', test_indices)):
    for type_name, type_traces in datasets_by_type:
        save_trace_dataset(filter_by_indices(type_traces, split_indices), type_name, split_name)

# Data statistics

In [13]:
# Summary statistics of the event log, printed one per line.
lens = [len(t) for t in traces]
summary_rows = [
    ("Min trace length", min(lens)),
    ("Max trace length", max(lens)),
    ("Avg trace length", np.mean(lens)),
    ("Std trace length", np.std(lens)),
    ("# traces", len(traces)),
    ("Number of events", sum(lens)),
    ("Number of activities", len(eventlog_df[target_column].unique().tolist())),
]
for row_label, row_value in summary_rows:
    print(row_label, row_value)

Min trace length 1
Max trace length 14
Avg trace length 3.6041009463722395
Std trace length 1.1874492656422815
# traces 3804
Number of events 13710
Number of activities 9


# Thesis feature examples

In [20]:
# Example: activity labels of the trace at position 3.
traces[3]["concept:name"]

0    1
1    8
2    6
3    8
4    6
Name: concept:name, dtype: object

In [17]:
# Example: SP2 features of the same trace.
sp2_traces[3]

Unnamed: 0,SP2_1,SP2_8,SP2_6,SP2_3,SP2_9,SP2_2,SP2_4,SP2_5,SP2_7
0,True,False,False,False,False,False,False,False,False
1,True,True,False,False,False,False,False,False,False
2,True,True,True,False,False,False,False,False,False
3,True,True,True,False,False,False,False,False,False
4,True,True,True,False,False,False,False,False,False


In [45]:
# Example: PrefixSpan pattern features of the same trace.
pf_traces[3]

Unnamed: 0,PFS_0,PFS_1,PFS_2,PFS_3,PFS_4,PFS_5,PFS_6,PFS_7,PFS_8,PFS_9,...,PFS_15,PFS_16,PFS_17,PFS_18,PFS_19,PFS_20,PFS_21,PFS_22,PFS_23,PFS_24
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Decode the first five mined patterns back to their activity labels.
[
    [feature_dict["concept:name"]["to_cat"][code] for code in pattern]
    for pattern in closed_sequences[:5]
]

[['6'], ['1', '6'], ['8', '6'], ['1', '8', '6'], ['9', '6']]