<a href="https://colab.research.google.com/github/ejakupi13/Uni-Projects/blob/main/Trace_clustering_and_Context_Attribute_ranking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pm4py

In [None]:
import pm4py
print(pm4py.__version__)

2.2.12


In [None]:
#Mount the data from google drive, need to specify the path
from google.colab import drive
drive.mount('/content/drive')
!ls "/content/drive/My Drive/"
%cd /content/drive/My Drive/Colab Notebooks/



# Datasets

In [None]:
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

# Read the event log from a CSV file into a pandas DataFrame.
log = pd.read_csv('file.csv', sep=',')

# The string literal below is inert; it only preserves the alternative XES-based loading path for reference.
"""  read the event logs from xes files and convert to dataframe since the rest of the code works with dataframes
from pm4py.objects.log.importer.xes import importer as xes_importer
xes = xes_importer.apply('file.xes')
log = log_converter.apply(xes, variant=log_converter.Variants.TO_DATA_FRAME) """

# Run pm4py's timestamp-column conversion, then parse 'time:timestamp' with an explicit
# day-month-year format (NOTE(review): the format must match the CSV being loaded - confirm).
log = dataframe_utils.convert_timestamp_columns_in_df(log)
log['time:timestamp'] = pd.to_datetime(log['time:timestamp'], format = "%d-%m-%Y %H:%M:%S.%f" )

# Sort the dataframe by event timestamp.
log = log.sort_values('time:timestamp')

Preprocess event logs

In [None]:
#remove duplicate entries
def duplicate_removal(log):
    """Return ``log`` without fully duplicated rows, keeping the first occurrence of each."""
    deduplicated = log.drop_duplicates(keep='first')
    return deduplicated

#filter out traces with less than 3 events
def filter_short_traces(log):
    """Keep only the events of traces that contain at least 3 events.

    The original grouped by ``'case concept:name'`` (missing colon), which is
    not a column of the log and raised ``KeyError``; the case identifier used
    everywhere else in this file is ``'case:concept:name'``.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``'case:concept:name'`` and ``'concept:name'`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows of ``log`` (original order) whose case has more than 2 events
        with a non-null ``'concept:name'``.
    """
    # Broadcast the per-case event count back onto every row, then mask.
    events_per_case = log.groupby('case:concept:name')['concept:name'].transform('count')
    return log[events_per_case > 2]

#remove traces that do not have case, event or time identifier (mandatory attributes)
def filter_mandatory(log):
    """Drop every event (row) that lacks a case id, an activity name or a timestamp."""
    mandatory_columns = ['case:concept:name', 'concept:name', 'time:timestamp']
    return log.dropna(subset=mandatory_columns, axis=0)

# Feature Generation

Parts of this section's code are adapted from: https://github.com/p-decker/Event_Log_Assessment

Count the number of events per trace

In [None]:
def event_count(log):
    """Return the number of events of each trace, ordered by sorted case id."""
    activities_by_case = log.groupby('case:concept:name')['concept:name']
    counts = activities_by_case.count()
    return counts.tolist()

Count number of distinct events per trace

In [None]:
def event_count_unique(log):
    """Return the number of distinct activities of each trace, ordered by sorted case id."""
    activities_by_case = log.groupby('case:concept:name')['concept:name']
    distinct_counts = activities_by_case.nunique()
    return distinct_counts.tolist()

Return a variant number for each trace

In [None]:
#variant code
from sklearn.preprocessing import OrdinalEncoder

def variant(log):
    """Return one ordinal variant code per trace.

    Each trace is represented by the tuple of its activities (in log order);
    the tuples are then encoded with ``OrdinalEncoder``. The returned Series
    is indexed 0..n-1 in sorted case-id order.
    """
    traces = log.groupby('case:concept:name')['concept:name'].apply(tuple)
    variants = traces.reset_index(name='variants')
    encoder = OrdinalEncoder()
    variants["variant"] = encoder.fit_transform(variants[["variants"]])
    return variants["variant"]

Calculate size of self loops per trace

In [None]:
def self_loop_per_trace_overview(log):
    """Return the total self-loop size of every trace.

    A self-loop is a run of directly repeated activities; its size is the
    number of adjacent equal pairs inside the run (``a, a, a`` has size 2).
    The per-trace value is the sum of all self-loop sizes, i.e. the number of
    positions where an event equals its direct successor.

    The original scan never stopped at the end of a run (no ``break``), so a
    later self-loop was counted once while extending an earlier one and again
    when reached on its own: ``a, a, b, c, c`` yielded 3 instead of 2.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``'case:concept:name'`` and ``'concept:name'`` columns,
        events already ordered within each case.

    Returns
    -------
    list of int
        One value per trace, ordered by sorted case id.
    """
    traces = log.groupby('case:concept:name')['concept:name'].apply(list)
    return [
        sum(1 for current, following in zip(trace, trace[1:]) if current == following)
        for trace in traces
    ]

Calculate number of repetitions per trace

In [None]:
def repetition_per_trace_overview(log):
    """Return the number of (non-self-loop) repetitions found in each trace.

    Events are pushed onto a window. When an event is already in the window
    and its last occurrence is not the window's final element, the segment
    from that occurrence to the end is a repetition: it is counted and cut
    from the window. A direct repeat of the final element (a self-loop) is
    not counted.
    """
    traces = log.groupby('case:concept:name')['concept:name'].apply(list).values.tolist()
    repetition_counts = []
    for trace in traces:
        window = []
        repetitions = 0
        for event in trace:
            if event in window:
                # index of the most recent occurrence of this event in the window
                last_seen = len(window) - 1 - window[::-1].index(event)
                if last_seen != len(window) - 1:
                    # not a self-loop: count it and drop the repeated segment
                    repetitions += 1
                    del window[last_seen:]
            # the current event always ends up at the end of the window
            window.append(event)
        repetition_counts.append(repetitions)
    return repetition_counts

Convert trace representation to scalar

In [None]:
import numpy as np
import math
def cat2num(log):
    """Collapse each trace into a scalar: the sum of ``c ** c`` over its per-activity counts ``c``.

    Returns one value per trace, ordered by sorted case id.
    """
    def _self_power_sum(counts):
        return sum(counts ** counts)

    # Work on a new frame so the caller's log is not modified.
    counted = log.assign(counter=1)
    per_activity = counted.groupby(['case:concept:name', 'concept:name'])['counter'].count().reset_index()
    per_trace = per_activity.groupby('case:concept:name')['counter'].apply(_self_power_sum)
    return per_trace.values.tolist()

Methods to create a directed graph and calculate network connectivity for each trace

In [None]:
# Returns the names of events in the log (trace)
def event_names(log):
    """Return the distinct ``concept:name`` values of the log (or single trace) as a list."""
    from pm4py.algo.filtering.pandas.attributes import attributes_filter
    attribute_values = attributes_filter.get_attribute_values(log, "concept:name")
    # Iterating the returned collection yields the event names.
    return list(attribute_values)

# Returns a dictionary that assigns a distinct number to each event in log (trace)
def ranking_dict(log):
    """Map every event name of the log (or trace) to its rank in alphabetical order."""
    rank_dict = {}
    for rank, name in enumerate(sorted(event_names(log))):
        rank_dict[name] = rank
    return rank_dict

# Creates an adjacency matrix for building a directed graph
def adjacency_matrix_directed(log):
    """Build the directed adjacency matrix of the log's directly-follows relation.

    Rows and columns are indexed by the event ranks from ``ranking_dict``;
    entry ``[i][j]`` is incremented once for every directly-follows pair
    ``(i, j)`` reported by pm4py.
    """
    from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
    import numpy as np

    rank_of = ranking_dict(log)
    directly_follows = get_dfg_graph(log, measure="frequency", sort_caseid_required=False, sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1)

    # Square matrix with one row/column per distinct event
    size = len(event_names(log))
    matrix = np.zeros(shape=(size, size))

    # Iterating the DFG yields (source, target) activity pairs
    for source, target in directly_follows:
        matrix[rank_of[source]][rank_of[target]] += 1
    return np.asarray(matrix)

#graph creation
def create_directed_graph(log):
    """Create a networkx ``DiGraph`` with one edge per non-zero entry of the directed adjacency matrix."""
    import networkx as nx

    graph = nx.DiGraph()
    adjacency = adjacency_matrix_directed(log)
    # Invert the name -> rank mapping so matrix indices can be turned back into names
    name_of = {rank: name for name, rank in ranking_dict(log).items()}
    for source_rank, row in enumerate(adjacency):
        for target_rank, weight in enumerate(row):
            if weight > 0:
                graph.add_edge(name_of[source_rank], name_of[target_rank])
    return graph

def number_of_arcs(log):
    """Return the number of edges of the directed graph built from ``log``."""
    directed = create_directed_graph(log)
    return directed.number_of_edges()

def number_of_nodes(log):
    """Return the number of nodes of the directed graph built from ``log``."""
    directed = create_directed_graph(log)
    return directed.number_of_nodes()
    
def network_connectivity(log):
    """Return the arcs-per-node coefficient of every trace's directed graph.

    Changes from the original: the graph is built once per trace instead of
    twice (once for the arc count, once for the node count), and a trace whose
    graph has no nodes (e.g. a single-event trace, which has no
    directly-follows pair) yields 0.0 instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with a ``'case:concept:name'`` column.

    Returns
    -------
    list of float
        One coefficient per trace, ordered by sorted case id.
    """
    from pandas import DataFrame
    connectivity = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        graph = create_directed_graph(DataFrame(events))
        node_count = graph.number_of_nodes()
        if node_count == 0:
            connectivity.append(0.0)
        else:
            connectivity.append(graph.number_of_edges() / node_count)
    return connectivity

Create in degree matrix and convert to scalar

In [None]:
def in_degree(log):
    """Return, per trace, the Euclidean norm of the in-degrees of its directed graph's nodes."""
    from pandas import DataFrame
    import math
    norms = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        graph = create_directed_graph(DataFrame(events))
        # graph.in_degree() yields (node, degree) pairs
        squared_sum = sum(degree * degree for _, degree in graph.in_degree())
        norms.append(math.sqrt(squared_sum))
    return norms


Create out degree matrix and convert to **scalar**

In [None]:
def out_degree(log):
    """Return, per trace, the Euclidean norm of the out-degrees of its directed graph's nodes."""
    from pandas import DataFrame
    import math
    norms = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        graph = create_directed_graph(DataFrame(events))
        # graph.out_degree() yields (node, degree) pairs
        squared_sum = sum(degree * degree for _, degree in graph.out_degree())
        norms.append(math.sqrt(squared_sum))
    return norms

Calculate cyclicity as (number of nodes that lie on a cycle) / (number of nodes) for each trace

In [None]:
def cyclicity(log):
    """Return, per trace, the share of graph nodes that lie on a cycle of length > 1.

    Changes from the original: the inner loop no longer reuses the outer loop
    variable ``i``, the node count is read from the graph already built
    instead of rebuilding it via ``number_of_nodes``, and a trace whose graph
    has no nodes yields 0.0 instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with a ``'case:concept:name'`` column.

    Returns
    -------
    list of float
        One ratio per trace, ordered by sorted case id.
    """
    import networkx as nx
    from pandas import DataFrame
    cyclicity_list = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        graph = create_directed_graph(DataFrame(events))
        # collect the nodes of all simple cycles, ignoring self-loops (length 1)
        cycle_nodes = set()
        for cycle in nx.simple_cycles(graph):
            if len(cycle) > 1:
                cycle_nodes.update(cycle)
        node_count = graph.number_of_nodes()
        cyclicity_list.append(len(cycle_nodes) / node_count if node_count else 0.0)
    return cyclicity_list

Calculate graph density for each trace

In [None]:
def density(log):
    """Return the networkx density of every trace's directed graph, ordered by sorted case id."""
    import networkx as nx
    from pandas import DataFrame
    return [
        nx.density(create_directed_graph(DataFrame(events)))
        for _, events in log.groupby("case:concept:name", as_index=False)
    ]

Calculate syntactic node similarity

In [None]:
def syntactic_node_similarity(log):
    """Return, per trace, the share of ordered pairs of distinct events with similar names.

    Two names are similar when ``1 - editdistance / max(len)`` is at least 0.6.
    With ``n`` distinct events there are ``n * (n - 1)`` ordered pairs of
    different events, which is the normalisation used here.

    Fixes over the original:
    * ``len(events-1)`` subtracted an int from a list and raised ``TypeError``
      on every call;
    * the denominator is now ``n * (n - 1)``, matching the numerator, which
      counts ordered pairs and excludes the ``n`` self-pairs;
    * traces with fewer than two distinct events yield 0.0 instead of a
      division by zero (the original only printed ``'zero'`` as a debug hint);
    * the inner loop no longer shadows the outer loop variable.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``'case:concept:name'`` and ``'concept:name'`` columns.

    Returns
    -------
    list of float
        One ratio in [0, 1] per trace, ordered by sorted case id.
    """
    from editdistance import distance
    from pandas import DataFrame
    syntactic_node_sim_list = []
    for _, trace_events in log.groupby("case:concept:name", as_index=False):
        events = event_names(DataFrame(trace_events))
        event_total = len(events)
        if event_total < 2:
            # no pair of distinct events exists
            syntactic_node_sim_list.append(0.0)
            continue
        similar = 0
        for first in events:
            for second in events:
                longest = max(len(first), len(second))
                # two empty names are identical; avoid dividing by zero
                sim = 1.0 if longest == 0 else 1 - (distance(first, second) / longest)
                if sim >= 0.6:
                    similar += 1
        # every event matches itself, so remove the n self-pairs from the count
        syntactic_node_sim_list.append((similar - event_total) / (event_total * (event_total - 1)))
    return syntactic_node_sim_list

Methods for creating an undirected graph and count cut vertices for each trace

In [None]:
# Creates an adjacency matrix to then create an undirected graph
def adjacency_matrix_undirected(log):
    """Build the symmetric adjacency matrix of the log's directly-follows relation.

    Every directly-follows pair ``(i, j)`` increments both ``[i][j]`` and
    ``[j][i]`` (so a self-loop adds 2 to its diagonal entry).
    """
    from pm4py.objects.dfg.retrieval.pandas import get_dfg_graph
    import numpy as np

    rank_of = ranking_dict(log)
    directly_follows = get_dfg_graph(log, measure="frequency", sort_caseid_required=False, sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1)

    # Square matrix with one row/column per distinct event
    size = len(event_names(log))
    matrix = np.zeros(shape=(size, size))

    # Iterating the DFG yields (source, target) activity pairs
    for source, target in directly_follows:
        first, second = rank_of[source], rank_of[target]
        matrix[first][second] += 1
        matrix[second][first] += 1
    return np.asarray(matrix)

# Create undirected graph
def create_undirected_graph(log):
    """Create a networkx ``Graph`` with one edge per non-zero entry of the undirected adjacency matrix."""
    import networkx as nx

    graph = nx.Graph()
    adjacency = adjacency_matrix_undirected(log)
    # Invert the name -> rank mapping so matrix indices can be turned back into names
    name_of = {rank: name for name, rank in ranking_dict(log).items()}
    for row_rank, row in enumerate(adjacency):
        for col_rank, weight in enumerate(row):
            if weight > 0:
                # In an undirected graph (u, v) and (v, u) are the same edge,
                # so adding it once is equivalent to adding both directions.
                graph.add_edge(name_of[row_rank], name_of[col_rank])
    return graph

# Number of articulation points
def number_of_articulation_points(log):
    """Return the number of articulation points (cut vertices) of every trace's undirected graph."""
    import networkx as nx
    from pandas import DataFrame
    counts = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        undirected = create_undirected_graph(DataFrame(events))
        cut_vertices = list(nx.articulation_points(undirected))
        counts.append(len(cut_vertices))
    return counts

Triple abstraction evaluation

In [None]:
def triple_abstraction_evaluation(log):
    """Return, per trace, the share of frequent triples that share start and end with another triple.

    For each trace the keys of pm4py's ``get_freq_triples`` are compared
    pairwise; two different triples that agree on their first and last element
    are both marked. The result is ``marked / total`` triples.

    Changes from the original: a trace without any triple yields 0.0 instead
    of raising ``ZeroDivisionError``; unordered pairs are enumerated with
    ``itertools.combinations`` rather than comparing tuples by identity
    (``is not``); the inner loop no longer shadows the outer loop variable.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with a ``'case:concept:name'`` column.

    Returns
    -------
    list of float
        One ratio per trace, ordered by sorted case id.
    """
    from itertools import combinations
    from pm4py.objects.dfg.retrieval.pandas import get_freq_triples
    triple_abs_list = []
    for _, events in log.groupby("case:concept:name", as_index=False):
        trace = pd.DataFrame(events)
        triples = list(get_freq_triples(trace).keys())
        if not triples:
            triple_abs_list.append(0.0)
            continue
        target_triples = set()
        # the match condition is symmetric, so each unordered pair is checked once
        for left, right in combinations(triples, 2):
            if left[0] == right[0] and left[-1] == right[-1]:
                target_triples.add(left)
                target_triples.add(right)
        triple_abs_list.append(len(target_triples) / len(triples))
    return triple_abs_list

Create the dataset with all the features

In [None]:
def create_feature_representation(log):
    """Build the per-trace feature table used for clustering.

    Fixes over the original:
    * the case-id column is taken from the sorted ``groupby`` keys. Every
      feature function returns its values in ``groupby('case:concept:name')``
      order (sorted case ids), whereas ``unique()`` returns ids in order of
      appearance, so on a timestamp-sorted log features were attached to the
      wrong cases;
    * the cut-vertex feature calls ``number_of_articulation_points``; the
      previously referenced ``number_of_cut_vertices`` is not defined anywhere
      in this file. The column keeps its name ``'number_of_cut_vertices'``
      because the clustering code selects it by that name.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log containing at least ``'case:concept:name'``,
        ``'concept:name'`` and ``'time:timestamp'``.

    Returns
    -------
    pandas.DataFrame
        One row per trace: the case id followed by 14 control-flow features.
    """
    # take into account only control-flow
    log = log[['case:concept:name', 'concept:name', 'time:timestamp']]
    # case ids in the same (sorted) order the feature functions use
    case_ids = log.groupby('case:concept:name').size().index.values
    df = pd.DataFrame(data=case_ids, columns=["case:concept:name"])
    df['event_count'] = event_count(log)
    df['event_count_unique'] = event_count_unique(log)
    df['variant_code'] = variant(log)
    df['self_loop_per_trace_overview'] = self_loop_per_trace_overview(log)
    df['repetition_per_trace_overview'] = repetition_per_trace_overview(log)
    df['cat2num'] = cat2num(log)
    df['network_connectivity'] = network_connectivity(log)
    df['density'] = density(log)
    df['syntactic_node_similarity'] = syntactic_node_similarity(log)
    df['number_of_cut_vertices'] = number_of_articulation_points(log)
    df['triple_abstraction_evaluation'] = triple_abstraction_evaluation(log)
    df['in_degree'] = in_degree(log)
    df['out_degree'] = out_degree(log)
    df['cyclicity'] = cyclicity(log)
    return df




# Feature Transformation (optional)

In [None]:
#transform feature via PCA, n is number of components (transformed features)
def transform_via_pca(log,n):
    """Fit a PCA with ``n`` components on ``log`` and return the fitted estimator.

    NOTE(review): the fitted PCA object is returned, not the transformed
    features; callers presumably need to call ``.transform`` on it - confirm.
    """
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n)
    return pca.fit(log)

In [None]:
#transform feature via LSA (truncated-SVD), n is number of components (transformed features)
def transform_via_lsa(log,n):
    """Fit an LSA (truncated SVD) with ``n`` components on ``log`` and return the fitted estimator.

    This function was previously also named ``transform_via_pca``, which
    silently replaced the PCA helper defined just before it; it is renamed so
    that both transformations stay available.

    After fitting, ``np.cumsum(svd.explained_variance_ratio_)`` gives the
    cumulative variance explained by the components.

    NOTE(review): as with the PCA helper, the fitted estimator is returned,
    not the transformed features.
    """
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=n)
    return svd.fit(log)

In [None]:
# import preprocessing
from sklearn import preprocessing
import numpy as np

# Create correlation matrix
def corr_matrix(df):
    """Return the absolute pairwise correlation matrix of the numeric columns of ``df``.

    Non-numeric columns (such as the string case identifier) are excluded
    explicitly; recent pandas versions no longer drop them silently in
    ``DataFrame.corr`` and raise instead.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    return numeric.corr().abs()

# Select upper triangle of correlation matrix
def upper(df):
    """Return the correlation matrix with everything except the strict upper triangle set to NaN.

    Uses the builtin ``bool`` for the mask (the ``np.bool`` alias no longer
    exists in current NumPy) and computes the correlation matrix only once.
    """
    correlations = corr_matrix(df)
    keep = np.triu(np.ones(correlations.shape, dtype=bool), k=1)
    return correlations.where(keep)

# Find features with correlation greater than 0.9
def remove_correlation(df):
    """Return ``df`` without the columns that correlate above 0.90 with an earlier column.

    The original indexed the *function* ``upper`` (``upper[column]``) instead
    of its result, which raised ``TypeError``.
    """
    upper_triangle = upper(df)
    to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > 0.90).any()]
    return df.drop(to_drop, axis=1)


# normalize features to a (0,1) range
def normalization(df):
    """Min-max scale the 14 feature columns of ``df`` in place and return ``df``."""
    from sklearn import preprocessing
    feature_columns = [
        "event_count", "event_count_unique", "variant_code",
        "self_loop_per_trace_overview", "repetition_per_trace_overview",
        "cat2num", "network_connectivity", "density",
        "syntactic_node_similarity", "number_of_cut_vertices",
        "triple_abstraction_evaluation", "in_degree", "out_degree", "cyclicity",
    ]
    scaler = preprocessing.MinMaxScaler()
    # overwrite the feature columns with their scaled values
    df[feature_columns] = scaler.fit_transform(df[feature_columns])
    return df

# Stratified Sampling

In [None]:
import numpy as np
# sample the log using stratified random sampling; size is the percentage of the log to be represented in the sample, e.g., 0.1 (10%)
# output: a dataframe with columns: case:concept:name, concept:name, time:timestamp
def sampling(log, size, random_state=None):
    """Draw a stratified random sample of traces, using the trace variant as stratum.

    Each variant contributes ``rint(N * variant_size / total_cases)`` cases,
    where ``N = round(size * total_cases)``. Because of this rounding, rare
    variants may contribute no case and the total can differ slightly from
    ``N``.

    Changes from the original: case ids are sampled per variant directly,
    rather than through ``GroupBy.apply`` on the whole frame (which depends on
    the grouping column being passed to the applied function, a behaviour
    pandas has deprecated), and an optional ``random_state`` makes the sample
    reproducible.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log with ``'case:concept:name'`` and ``'concept:name'`` columns.
    size : float
        Fraction of the cases to sample, e.g. 0.1 for 10%.
    random_state : int, optional
        Seed for reproducible sampling; ``None`` (default) draws a fresh sample.

    Returns
    -------
    pandas.DataFrame
        All events of the sampled cases, with ``'case:concept:name'`` as the
        first column followed by the remaining columns of ``log``; cases
        appear in shuffled order.
    """
    variants = log.groupby('case:concept:name')['concept:name'].apply(tuple).reset_index(name='variants')
    total_cases = len(variants)

    # define total sample size desired
    N = round(size * log['case:concept:name'].nunique())

    # one generator for all draws, so a given seed fixes the whole sample
    rng = np.random.RandomState(random_state)

    # perform stratified random sampling: draw the proportional share from each variant
    picked = []
    for _, case_ids in variants.groupby('variants')['case:concept:name']:
        share = int(np.rint(N * len(case_ids) / total_cases))
        picked.append(case_ids.sample(share, random_state=rng))
    if not picked:
        # empty log: nothing to sample
        return log.iloc[0:0].copy()

    # shuffle the selected cases, then attach their events
    shuffled = pd.concat(picked).sample(frac=1, random_state=rng).reset_index(drop=True)
    return shuffled.to_frame().merge(log, on='case:concept:name')

# Clustering

Evaluation of several clustering techniques in terms of precision, replay-fitness and F1-score.
Process models are extracted from each sublog using Inductive Miner.

In [None]:
# Get average of a list
def Average(list):
    """Return the arithmetic mean of the values in ``list``."""
    total = sum(list)
    count = len(list)
    return total / count

## K-means

In [None]:
def K_means_evaluation(sample,df):
    """Evaluate K-means for k = 1..10 by the quality of the process models mined per cluster.

    For every k the traces in ``df`` are clustered on the 14 feature columns.
    For each cluster the matching events of ``sample`` are converted to an
    event log, a Petri net is discovered with the Inductive Miner (IMf), and
    alignment-based precision and replay fitness are computed. k = 1 is the
    baseline (the whole sample as one cluster).

    Returns three lists (F1, precision, fitness), each holding the
    per-cluster average for k = 1..10.
    """
    from sklearn.cluster import KMeans
    from pm4py.algo.discovery.inductive import algorithm as inductive_miner
    from pm4py.objects.conversion.log import converter as log_converter
    from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
    from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
    from pm4py.algo.discovery.inductive.algorithm import Variants

    feature_columns = ["event_count","event_count_unique","variant_code","self_loop_per_trace_overview","repetition_per_trace_overview","cat2num","network_connectivity","density","syntactic_node_similarity","number_of_cut_vertices","triple_abstraction_evaluation", "in_degree", "out_degree", "cyclicity"]

    # one entry per parameter setting (k = 1, 2, ..., 10)
    f1_per_k = []
    precision_per_k = []
    fitness_per_k = []

    for n_clusters in range(1, 11):
        model = KMeans(n_clusters=n_clusters)
        model.fit(df[feature_columns])
        labelled = df.assign(cluster=model.labels_)

        # one entry per sublog (cluster)
        f1_scores = []
        precisions = []
        fitnesses = []
        for cluster_id in range(n_clusters):
            members = labelled[labelled['cluster'] == cluster_id]
            sublog = pd.merge(sample, members, how="inner", on="case:concept:name", left_index=False, right_index=False, indicator=False, copy=True)
            event_log = log_converter.apply(sublog)
            # discover a Petri net from the sublog (Variants.IMd is the alternative miner)
            net, im, fm = inductive_miner.apply(event_log, variant=Variants.IMf)
            # precision of the sublog (alternative: .ETCONFORMANCE_TOKEN)
            prec = precision_evaluator.apply(event_log, net, im, fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
            # replay fitness of the sublog (alternative: .TOKEN_BASED)
            fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, variant=replay_fitness_evaluator.Variants.ALIGNMENT_BASED)

            # with the IMd miner, read fitness.get('log_fitness') instead
            average_fitness = fitness.get('averageFitness')
            f1_scores.append((2*prec*average_fitness)/(prec+average_fitness))
            precisions.append(prec)
            fitnesses.append(average_fitness)

        f1_per_k.append(Average(f1_scores))
        precision_per_k.append(Average(precisions))
        fitness_per_k.append(Average(fitnesses))

    return f1_per_k, precision_per_k, fitness_per_k

## Hierarchical Agglomerative Clustering




In [None]:
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive.algorithm import Variants
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from scipy.cluster.hierarchy import dendrogram, linkage


def HAC_evaluation(sample, df):
    """Evaluate hierarchical agglomerative clustering (median linkage) for 2..10 clusters.

    The linkage is computed once on the 14 feature columns and cut with
    ``fcluster(..., criterion='maxclust')`` for t = 2..10. For each cluster
    the matching events of ``sample`` are mined with the Inductive Miner (IMf)
    and scored with alignment-based precision and replay fitness.

    Fix over the original: ``fcluster`` numbers clusters from 1, but the loop
    iterated over ``range(0, t)``, so it evaluated a non-existent cluster 0
    (an empty sublog) and never evaluated cluster ``t``. The loop now runs
    over the labels actually present. A zero ``precision + fitness`` yields
    an F1 of 0.0 instead of a division by zero.

    Returns
    -------
    tuple of three lists
        Per-cluster averages of F1, precision and fitness for t = 2..10.
    """
    # import fcluster to add clusterIDs
    from scipy.cluster.hierarchy import fcluster

    feature_columns = ["event_count","event_count_unique","variant_code","self_loop_per_trace_overview","repetition_per_trace_overview","cat2num","network_connectivity","density","syntactic_node_similarity","number_of_cut_vertices","triple_abstraction_evaluation", "in_degree", "out_degree", "cyclicity"]

    f1_final_list = []
    precision_final_list = []
    fitness_final_list = []

    # create the clustering
    Z = linkage(df[feature_columns], 'median')

    # plot the (truncated) dendrogram
    dendrogram(Z, truncate_mode='lastp', p=20)

    for t in range(2, 11):
        score = []
        precisionlist = []
        fitnesslist = []

        clusters = fcluster(Z, t=t, criterion='maxclust')
        df_with_cluster = df.copy()
        df_with_cluster['cluster'] = clusters

        # iterate over the labels that exist (1-based, possibly fewer than t)
        for cluster_id in sorted(set(clusters)):
            df1 = df_with_cluster[df_with_cluster['cluster'] == cluster_id]
            df1 = pd.merge(sample, df1, how="inner", on="case:concept:name", left_index=False, right_index=False, indicator=False, copy=True)
            event_log = log_converter.apply(df1)
            net, im, fm = inductive_miner.apply(event_log, variant=Variants.IMf)

            prec = precision_evaluator.apply(event_log, net, im, fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
            fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, variant=replay_fitness_evaluator.Variants.ALIGNMENT_BASED)
            average_fitness = fitness.get('averageFitness')

            denominator = prec + average_fitness
            f_score = (2 * prec * average_fitness) / denominator if denominator else 0.0

            score.append(f_score)
            precisionlist.append(prec)
            fitnesslist.append(average_fitness)

        f1_final_list.append(Average(score))
        precision_final_list.append(Average(precisionlist))
        fitness_final_list.append(Average(fitnesslist))

    return f1_final_list, precision_final_list, fitness_final_list

## Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive.algorithm import Variants
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
import pandas as pd

def SC_evaluation(sample, df):
    """Evaluate spectral clustering (RBF affinity) for 2..10 clusters.

    For each cluster the matching events of ``sample`` are mined with the
    Inductive Miner (IMf) and scored with alignment-based precision and
    replay fitness.

    Fixes over the original:
    * ``precisionlist`` and ``fitnesslist`` were appended to without ever
      being created (``NameError``); they are now reset for every setting;
    * sublogs are built from the ``sample`` argument rather than from the
      global ``log``, consistent with the K-means and HAC evaluations;
    * a zero ``precision + fitness`` yields an F1 of 0.0 instead of a
      division by zero.

    Returns
    -------
    tuple of three lists
        Per-cluster averages of F1, precision and fitness for 2..10 clusters.
    """
    feature_columns = ['event_count','event_count_unique', 'variant_code','self_loop_per_trace_overview', 'repetition_per_trace_overview','cat2num', 'network_connectivity','density','syntactic_node_similarity','number_of_cut_vertices', 'triple_abstraction_evaluation','in_degree', 'out_degree', 'cyclicity']

    f1_final_list = []
    precision_final_list = []
    fitness_final_list = []

    for i in range(2, 11):
        estimator = SpectralClustering(n_clusters=i, affinity='rbf').fit(df[feature_columns])
        df_with_cluster = df.assign(cluster=estimator.labels_)
        score = []
        precisionlist = []
        fitnesslist = []
        # iterate over the labels that were actually assigned
        for cluster_id in sorted(set(estimator.labels_)):
            df1 = df_with_cluster[df_with_cluster['cluster'] == cluster_id]
            df1 = pd.merge(sample, df1, how="inner", on="case:concept:name", left_index=False, right_index=False, indicator=False, copy=True)
            event_log = log_converter.apply(df1)
            net, im, fm = inductive_miner.apply(event_log, variant=Variants.IMf)

            prec = precision_evaluator.apply(event_log, net, im, fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
            fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, variant=replay_fitness_evaluator.Variants.ALIGNMENT_BASED)
            average_fitness = fitness.get('averageFitness')

            denominator = prec + average_fitness
            f_score = (2 * prec * average_fitness) / denominator if denominator else 0.0

            score.append(f_score)
            precisionlist.append(prec)
            fitnesslist.append(average_fitness)

        f1_final_list.append(Average(score))
        precision_final_list.append(Average(precisionlist))
        fitness_final_list.append(Average(fitnesslist))

    return f1_final_list, precision_final_list, fitness_final_list

## DBSCAN

In [None]:
#select value for eps
import numpy as np
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
import pandas as pd

#plot distances to k-nearest neighbor, df-feature representation, k-parameter
def plot_distances(df,k):
    """Plot sorted nearest-neighbour distances of the feature rows (used to pick DBSCAN's eps).

    NOTE(review): a ``k``-neighbour model is fitted, but column 1 of the
    sorted distance matrix is plotted regardless of ``k`` - confirm that this
    is the intended column rather than the k-th neighbour.
    """
    features = df.drop(columns='case:concept:name')
    fitted = NearestNeighbors(n_neighbors=k).fit(features)
    distances, _ = fitted.kneighbors(features)
    # sort every column independently, then plot the second one
    plt.plot(np.sort(distances, axis=0)[:, 1])

In [None]:
from sklearn.cluster import DBSCAN
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive.algorithm import Variants
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator

def DBSCAN_evaluation(sample,df):
    """Evaluate DBSCAN over a grid of eps (1.0..1.3) and min_samples (22, 44, 66).

    For each cluster of each setting the matching events of ``sample`` are
    mined with the Inductive Miner (IMf) and scored with alignment-based
    precision and replay fitness.

    Fixes over the original:
    * ``precisionlist`` and ``fitnesslist`` were never initialised
      (``NameError``); they are now reset for every setting;
    * ``n_clusters = n_clusters.append(...)`` rebound the list to ``None``
      (``list.append`` returns ``None``), failing on the second setting;
    * DBSCAN labels noise as -1; the loop ``range(0, len(set(labels)))``
      therefore counted noise as a cluster and evaluated a non-existent label.
      Only real clusters (label >= 0) are evaluated and counted now;
    * a setting that finds no cluster records NaN for the three metrics
      instead of dividing by zero, and a zero ``precision + fitness`` yields
      an F1 of 0.0.

    Returns
    -------
    tuple of four lists
        Number of clusters (noise excluded), average F1, average precision and
        average fitness, one entry per (eps, min_samples) setting in grid order.
    """
    feature_columns = ["event_count","event_count_unique","variant_code","self_loop_per_trace_overview","repetition_per_trace_overview","cat2num","network_connectivity","density","syntactic_node_similarity","number_of_cut_vertices","triple_abstraction_evaluation","in_degree", "out_degree", "cyclicity"]

    f1_final_list = []
    precision_final_list = []
    fitness_final_list = []
    n_clusters = []

    for eps in range(10, 14, 1):
        for min_samples in range(22, 88, 22):
            estimator = DBSCAN(eps=eps/10.0, min_samples=min_samples).fit(df[feature_columns])
            df_with_cluster = df.assign(cluster=estimator.labels_)
            # real clusters only; -1 marks noise points
            cluster_ids = sorted(set(estimator.labels_) - {-1})

            score = []
            precisionlist = []
            fitnesslist = []
            for cluster_id in cluster_ids:
                df1 = df_with_cluster[df_with_cluster['cluster'] == cluster_id]
                df1 = pd.merge(sample, df1, how="inner", on="case:concept:name", left_index=False, right_index=False, indicator=False, copy=True)
                event_log = log_converter.apply(df1)
                net, im, fm = inductive_miner.apply(event_log, variant=Variants.IMf)
                prec = precision_evaluator.apply(event_log, net, im, fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
                fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, variant=replay_fitness_evaluator.Variants.ALIGNMENT_BASED)
                average_fitness = fitness.get('averageFitness')

                denominator = prec + average_fitness
                f_score = (2 * prec * average_fitness) / denominator if denominator else 0.0

                score.append(f_score)
                precisionlist.append(prec)
                fitnesslist.append(average_fitness)

            n_clusters.append(len(cluster_ids))
            if cluster_ids:
                f1_final_list.append(Average(score))
                precision_final_list.append(Average(precisionlist))
                fitness_final_list.append(Average(fitnesslist))
            else:
                # everything was labelled noise: keep the result lists aligned
                f1_final_list.append(float('nan'))
                precision_final_list.append(float('nan'))
                fitness_final_list.append(float('nan'))
    return n_clusters, f1_final_list, precision_final_list, fitness_final_list


## OPTICS

In [None]:
from sklearn.cluster import OPTICS
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.inductive.algorithm import Variants
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator

def OPTICS_evaluation(sample,df):
    """Evaluate OPTICS over a grid of eps (1.0..1.3) and min_samples (22, 44, 66).

    For each cluster of each setting the matching events of ``sample`` are
    mined with the Inductive Miner (IMf) and scored with alignment-based
    precision and replay fitness.

    Fixes over the original:
    * ``precisionlist`` and ``fitnesslist`` were never initialised
      (``NameError``); they are now reset for every setting;
    * ``n_clusters = n_clusters.append(...)`` rebound the list to ``None``;
    * sublogs are built from the ``sample`` argument rather than from the
      global ``log``, consistent with the other evaluations;
    * noise points (label -1) are no longer counted or evaluated as a cluster;
    * a setting that finds no cluster records NaN for the three metrics
      instead of dividing by zero, and a zero ``precision + fitness`` yields
      an F1 of 0.0.

    NOTE(review): scikit-learn documents ``eps`` as used by OPTICS only with
    ``cluster_method='dbscan'``; with the default method the eps grid may have
    no effect - confirm the intended configuration.

    Returns
    -------
    tuple of four lists
        Number of clusters (noise excluded), average F1, average precision and
        average fitness, one entry per (eps, min_samples) setting in grid order.
    """
    feature_columns = ["event_count","event_count_unique","variant_code","self_loop_per_trace_overview","repetition_per_trace_overview","cat2num","network_connectivity","density","syntactic_node_similarity","number_of_cut_vertices","triple_abstraction_evaluation","in_degree", "out_degree", "cyclicity"]

    f1_final_list = []
    precision_final_list = []
    fitness_final_list = []
    n_clusters = []

    for eps in range(10, 14, 1):
        for min_samples in range(22, 88, 22):
            estimator = OPTICS(eps=eps/10.0, min_samples=min_samples).fit(df[feature_columns])
            df_with_cluster = df.assign(cluster=estimator.labels_)
            # real clusters only; -1 marks noise points
            cluster_ids = sorted(set(estimator.labels_) - {-1})

            score = []
            precisionlist = []
            fitnesslist = []
            for cluster_id in cluster_ids:
                df1 = df_with_cluster[df_with_cluster['cluster'] == cluster_id]
                df1 = pd.merge(sample, df1, how="inner", on="case:concept:name", left_index=False, right_index=False, indicator=False, copy=True)
                event_log = log_converter.apply(df1)
                net, im, fm = inductive_miner.apply(event_log, variant=Variants.IMf)
                prec = precision_evaluator.apply(event_log, net, im, fm, variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE)
                fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, variant=replay_fitness_evaluator.Variants.ALIGNMENT_BASED)
                average_fitness = fitness.get('averageFitness')

                denominator = prec + average_fitness
                f_score = (2 * prec * average_fitness) / denominator if denominator else 0.0

                score.append(f_score)
                precisionlist.append(prec)
                fitnesslist.append(average_fitness)

            n_clusters.append(len(cluster_ids))
            if cluster_ids:
                f1_final_list.append(Average(score))
                precision_final_list.append(Average(precisionlist))
                fitness_final_list.append(Average(fitnesslist))
            else:
                # everything was labelled noise: keep the result lists aligned
                f1_final_list.append(float('nan'))
                precision_final_list.append(float('nan'))
                fitness_final_list.append(float('nan'))
    return n_clusters, f1_final_list, precision_final_list, fitness_final_list

# Ranking Context Attributes

In [None]:
#replace the missing values of a specific numeric attribute with the column mean (.median() can be used instead)
def replace_with_mean(log, attribute):
    """Return a copy of ``log`` in which missing values of ``attribute`` are replaced by that column's mean."""
    replacements = {attribute: log[attribute].mean()}
    return log.fillna(value=replacements)

In [None]:
#sum up event-level attributes to convert to trace-level attribute, e.g. sum up cost for each event, to get the total cost for the trace
def sum_event_level_attr(log, attribute):
    """Sum an event-level ``attribute`` per case, returning one row per case id with the total."""
    totals = log.groupby('case:concept:name')[attribute].sum()
    return totals.reset_index()
    

In [None]:
#Apply best performing clustering setting (e.g. K-means with k=3) to the whole log and append the context attributes

def context_representation(log, df, n_clusters=3, features=None):
    """Cluster the traces with K-means and attach each case's context attributes.

    The number of clusters and the feature columns were hard-coded; both are
    now optional parameters whose defaults reproduce the original behaviour.

    NOTE(review): the default feature list omits ``"event_count"``, which is
    part of the feature list used by every clustering evaluation in this file.
    If the best-performing setting was found with all 14 features, pass the
    full list via ``features`` - confirm whether the omission is intentional.

    Parameters
    ----------
    log : pandas.DataFrame
        Event log; the first event of every case supplies its context attributes.
    df : pandas.DataFrame
        Feature representation with a ``'case:concept:name'`` column.
    n_clusters : int, optional
        Number of K-means clusters (default 3).
    features : list of str, optional
        Columns of ``df`` to cluster on; defaults to the original 13 columns.

    Returns
    -------
    pandas.DataFrame
        One row per case: case id, cluster label and the case's attributes
        (without ``'concept:name'``).
    """
    from sklearn.cluster import KMeans
    if features is None:
        features = ["event_count_unique","variant_code","self_loop_per_trace_overview","repetition_per_trace_overview","cat2num","network_connectivity","density","syntactic_node_similarity","number_of_cut_vertices","triple_abstraction_evaluation","in_degree", "out_degree", "cyclicity"]
    estimator = KMeans(n_clusters=n_clusters)
    estimator.fit(df[features])
    df_with_cluster = df.assign(cluster=estimator.labels_)
    # keep the first event of every case as the carrier of its attributes
    first_events = log.groupby('case:concept:name').first().reset_index()
    context = df_with_cluster[['case:concept:name','cluster']].merge(first_events.drop(columns=['concept:name']), on="case:concept:name", how="left")
    return context

In [None]:
# convert attributes that are numerical to categorical (discretization)
# k - number of bins, user-defined
def discretize(attribute, k, labels=None):
    """Discretize a numerical attribute into ``k`` equal-width bins.

    Fixes over the original: the body referenced an undefined name ``column``
    instead of the ``attribute`` parameter, and always passed three labels,
    which makes ``pandas.cut`` fail for any ``k`` other than 3.

    Parameters
    ----------
    attribute : array-like
        Numerical values to bin (e.g. a DataFrame column).
    k : int
        Number of bins.
    labels : list of str, optional
        Bin labels, one per bin. When omitted, ``k == 3`` uses
        ``["low", "medium", "high"]`` (the original labels) and any other
        ``k`` uses pandas' default interval labels.

    Returns
    -------
    Same kind of object ``pandas.cut`` returns for ``attribute``.
    """
    import pandas as pd
    if labels is None and k == 3:
        labels = ["low", "medium", "high"]
    return pd.cut(attribute, k, labels=labels)

# filter out attributes with very low variance (one value)
def low_variance_attr(context):
    """Remove, in place, every column of ``context`` that holds a single distinct value.

    The original called ``contex.drop`` (misspelled name), raising
    ``NameError`` as soon as a constant column was found.

    Parameters
    ----------
    context : pandas.DataFrame
        Context attribute table; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``context`` object, without its constant columns.
    """
    constant_columns = [column for column in context.columns if context[column].nunique() == 1]
    context.drop(columns=constant_columns, inplace=True)
    return context


# filter out attributes with very high variance (IDs) 
# n is the user-defined threshold (equal to number of features)
def high_variance_attr(context, n):
    """Remove, in place, every column of ``context`` with more than ``n`` distinct values and return it."""
    too_varied = []
    for name in context.columns:
        distinct_values = context[name].nunique()
        if distinct_values > n:
            too_varied.append(name)
    # drop all offending columns in one go; ``context`` itself is modified
    if too_varied:
        context.drop(columns=too_varied, inplace=True)
    return context


# filter out attributes which have missing values that cannot be inferred from the log
# first the missing values which can be replaced are filled, then the rest is filtered out
def filter_missing(context):
    """Return ``context`` without the columns that still contain missing values.

    The parameter was misspelled ``cotext`` while the body used ``context``,
    so the function ignored its argument and read a global (or raised
    ``NameError``). The parameter is now spelled ``context``; positional
    callers are unaffected.

    Parameters
    ----------
    context : pandas.DataFrame
        Context attribute table, after inferable missing values were filled.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns without any missing value.
    """
    return context.dropna(axis=1)



In [None]:
# Get average of a list
def Average(list):
    """Return the arithmetic mean of the values in ``list``."""
    total = sum(list)
    count = len(list)
    return total / count
# Rank the context attributes
# Output: list of (attribute name, score) pairs, sorted by ascending score.
def ranking(context):
    """Score every context attribute by how differently its values are distributed across clusters.

    For each attribute a cluster-by-value count table is built, weighted with
    TF-IDF, and the pairwise cosine dissimilarity between clusters is
    averaged.

    Changes from the original:
    * the helper counting column is added to a copy (``assign``), so the
      caller's ``context`` no longer gains a ``'helper'`` column;
    * the no-op ``context.head()`` call is removed;
    * ``aggfunc='sum'`` replaces ``aggfunc=np.sum``, which pandas deprecates;
    * with fewer than two clusters there is no cluster pair to compare, so
      the score is 0.0 instead of a division by zero.

    Parameters
    ----------
    context : pandas.DataFrame
        One row per case with a ``'cluster'`` column and the context attributes.

    Returns
    -------
    list of (str, float)
        ``(attribute name, score)`` pairs sorted by ascending score.
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    rank = {}
    tfidf = TfidfTransformer()
    counted = context.assign(helper=1)
    # index pairs (i < j) of the cluster-by-cluster dissimilarity matrix
    cluster_pairs = np.triu_indices(counted['cluster'].nunique(), k=1)
    columns = list(counted.drop(columns=['cluster', 'helper']).columns)
    for column in columns:
        # rows: clusters, columns: attribute values, cells: number of cases
        df_pivot = counted.pivot_table(index='cluster', columns=str(column), values='helper', fill_value=0, aggfunc='sum')
        df_pivot = df_pivot.reset_index(drop=True)
        tfidf_matrix = tfidf.fit(df_pivot, y=None).transform(df_pivot, copy=True)
        disimm = 1 - cosine_similarity(tfidf_matrix)
        pair_scores = disimm[cluster_pairs]
        rank[column] = Average(pair_scores) if len(pair_scores) else 0.0
    return sorted(rank.items(), key=lambda item: item[1])
  




# Visualize Decision Tree

Create a DT using the ground truth context attribute as the label and visualize it.

In [None]:
#decision tree
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn import tree
import graphviz 


# attribute - the target attribute(label)
# n - maximum depth of the tree to display
def visualize_DT(context, attribute, n):
    """Fit a decision tree that predicts ``attribute`` from the other context attributes and display it.

    The ``def`` line was missing its trailing colon, which made the whole
    cell a ``SyntaxError``. Unused copies were removed, and the feature names
    are passed as a plain list.

    Parameters
    ----------
    context : pandas.DataFrame
        Context table containing ``'cluster'``, ``attribute`` and the
        remaining attributes used as predictors.
    attribute : str
        Name of the target attribute (label).
    n : int
        Maximum depth of the tree.
    """
    predictors = context.drop(columns=['cluster', attribute]).astype(str)
    feature_names = list(predictors.columns)
    target = context[attribute].astype(str)

    # ordinal-encode the (stringified) categorical predictors
    encoded = pd.DataFrame(data=OrdinalEncoder().fit_transform(predictors))

    estimator = DecisionTreeClassifier(max_depth=n)
    # apply the DT classifier
    estimator.fit(encoded, target)

    dot_data = tree.export_graphviz(estimator, out_file=None, feature_names=feature_names, filled=True, rounded=True, special_characters=True)
    graph = graphviz.Source(dot_data)
    # ``display`` is the notebook's built-in rich display function
    display(graph)

# BPMN models

In [None]:
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.conversion.process_tree import converter
from pm4py.visualization.bpmn import visualizer

def bpmn_visualizer(log):
    """Discover a process tree with the inductive miner, convert it to BPMN and show the diagram."""
    event_log = log_converter.apply(log)
    process_tree = pm4py.discover_process_tree_inductive(event_log)
    bpmn_graph = converter.apply(process_tree, variant=converter.Variants.TO_BPMN)
    rendering = visualizer.apply(bpmn_graph)
    visualizer.view(rendering)