In [None]:
# Process Mining
## Trace Clustering 

In [None]:
import pm4py
import pandas as pd

In [None]:
# Read the XES event log from disk.
# NOTE(review): `sep` looks like a CSV-reader option; confirm that `read_xes` actually uses it.
event_log = pm4py.read_xes('logs/sintetico_completo.xes', sep=',')
# Column list, presumably of the tabular export this log was generated from (kept for reference):
# lawsuit_id,date,movement,order,status,area,subject_matter,control,digital,class,distribution_date,court_department,claim_amount,judge
# Declare which columns hold the case id, the activity and the timestamp.
# NOTE(review): this expects an 'Activity' column in the imported log; later cells read the activity from "concept:name" — confirm.
event_log = pm4py.format_dataframe(event_log, case_id='case:concept:name', activity_key='Activity', timestamp_key='time:timestamp')
event_log['time:timestamp']=pd.to_datetime(event_log['time:timestamp'], utc=True) # parse the timestamp column as timezone-aware (UTC) datetimes

In [None]:
# Preview the first rows of the formatted event log.
event_log.head()

In [None]:
# Pull out every event of case '2778', keep a CSV copy on disk and show the slice.
_sint1 = event_log.loc[event_log['case:concept:name'].eq('2778')]
_sint1.to_csv('_sint1.csv')
_sint1

In [None]:
# Pull out every event of case '1956', keep a CSV copy on disk and show the slice.
_sint2 = event_log.loc[event_log['case:concept:name'].eq('1956')]
_sint2.to_csv('_sint2.csv')
_sint2

In [None]:
# Pull out every event of case '6686', keep a CSV copy on disk and show the slice.
_6686 = event_log.loc[event_log['case:concept:name'].eq('6686')]
_6686.to_csv('_6686.csv')
_6686

In [None]:
### Exploiting Pandas

In [None]:
#### Number of Cases

In [None]:
# Count the distinct values of the `elementId` column.
# NOTE(review): the log was formatted with 'case:concept:name' as the case id;
# confirm that `elementId` is also unique per case, otherwise this is not a case count.
event_log['elementId'].nunique()

In [None]:
#### Activity Occurrences

In [None]:
# Number of events recorded for each activity label ("concept:name"), most frequent first.
event_log["concept:name"].value_counts()

In [None]:
#### Activities in Cases

In [None]:
# Keep one row per (elementId, activity) pair, then count in how many
# groups each activity shows up at least once.
(
    event_log
    .groupby(["elementId", "concept:name"])
    .first()
    .reset_index()["concept:name"]
    .value_counts()
)

In [None]:
#### Start Activities

In [None]:
# First activity label within each `elementId` group, tallied across groups.
event_log.groupby("elementId")["concept:name"].first().value_counts()

In [None]:
#### End Activities

In [None]:
# Last activity label within each `elementId` group, tallied across groups.
event_log.groupby("elementId")["concept:name"].last().value_counts()

In [None]:
## Data Exploration

In [None]:
# Distribution of events over the days of the week.
pm4py.view_events_distribution_graph(event_log, distr_type="days_week") # rendered with Matplotlib

In [None]:
#### Events per Hour

In [None]:
# Distribution of events over the hours of the day.
pm4py.view_events_distribution_graph(event_log, distr_type="hours") # rendered with Matplotlib

In [None]:
#### Event Distribution Plot

In [None]:
# Plot how the events of the log are distributed over time.
pm4py.view_events_per_time_graph(event_log) # rendered with Matplotlib

In [None]:
#### Case Duration Distribution

In [None]:
# Plot the distribution of case durations.
pm4py.view_case_duration_graph(event_log) # rendered with Matplotlib

In [None]:
## Fun with Process Maps (and Graph Theory)

In [None]:
# Discover the directly-follows graph. The result is indexed below as
# dfg[0] (graph structure), dfg[1] (start activities) and dfg[2] (end activities).
dfg = pm4py.discover_dfg(event_log)
print(dfg[0]) # graph structure

In [None]:
#### Start Activities

In [None]:
# Second element of the discovery result.
print(dfg[1]) # start activities

In [None]:
#### End Activities

In [None]:
# Third element of the discovery result.
print(dfg[2]) # end activities

In [None]:
#### Visualizing the Process Map

In [None]:
# Render the process map; `dfg` unpacks to (graph, start activities, end activities).
pm4py.view_dfg(*dfg)

In [None]:
### Graph Theory for Process Maps (using NetworkX)


In [None]:
#### Converting the Process Map

In [None]:
%matplotlib inline
import networkx as nx
import itertools as it
G = nx.DiGraph()
alphabet = set(list(it.chain.from_iterable([[a,b] for (a,b) in dfg[0]]))).union(dfg[1].keys()).union(dfg[2].keys())
G.add_nodes_from(alphabet) # adding nodes
nx.draw(G, with_labels = True)

In [None]:
#### Adding Arcs

In [None]:
# Each key of the directly-follows mapping is a (source, target) pair, i.e. one arc.
G.add_edges_from(dfg[0])
nx.draw(G, with_labels=True)

In [None]:
### Degree of Vertices

In [None]:
# (node, in-degree) pairs, highest in-degree first.
sorted(G.in_degree(), key=lambda node_degree: node_degree[1], reverse=True)

In [None]:
### Dominator of a Graph

In [None]:
# Immediate dominators of the graph, computed from the start node 'Decisão'.
# NOTE(review): the start node is hard-coded; confirm it is the intended entry activity of this log.
nx.immediate_dominators(G, 'Decisão')

In [None]:
### Centrality of a Node

In [None]:
# (node, degree centrality) pairs, most central node first.
sorted(
    nx.degree_centrality(G).items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)

In [None]:
## Discovering Process Models

In [None]:
### Discovering a BPMN Model (Raw Data)

In [None]:
### Discovering a BPMN Model (Internal Filtering)

In [None]:
# Discover a BPMN model with the inductive miner, using a noise threshold of 0.1, and display it.
bpmn_2 = pm4py.discover_bpmn_inductive(event_log, noise_threshold=0.1)
pm4py.view_bpmn(bpmn_2)

In [None]:
from pm4py.objects.bpmn.exporter import exporter
# Write the discovered BPMN model to "juris.bpmn" (relative to the working directory).
exporter.apply(bpmn_2, "juris.bpmn")

In [None]:
## Checking Model Quality

In [None]:
### Converting the Model

In [None]:
# Convert the BPMN model into a Petri net; `im` and `fm` are passed to every
# conformance call below together with `pn` (presumably initial and final marking).
pn, im, fm = pm4py.convert_to_petri_net(bpmn_2)
pm4py.view_petri_net(pn, im, fm)

In [None]:
### Checking Behavioral Model Properties

In [None]:
# Check the soundness of the converted Petri net.
pm4py.check_soundness(pn, im, fm)

In [None]:
### Computing Quality Statistics

In [None]:
#### Fitness

In [None]:
# Fitness of the log against the model via token-based replay.
pm4py.fitness_token_based_replay(event_log, pn, im, fm) # fast results; less accurate/explainable

In [None]:
# Fitness of the log against the model via alignments.
pm4py.fitness_alignments(event_log, pn, im, fm)

In [None]:
#### Precision

In [None]:
# Precision of the model with respect to the log, computed via alignments.
pm4py.precision_alignments(event_log, pn, im, fm)

In [None]:
#### Diagnostics

In [None]:
# Alignment diagnostics of the log against the model; show the first entry as an example.
alignments = pm4py.conformance_diagnostics_alignments(event_log, pn, im, fm)
alignments[0]

In [None]:
# Keep only the diagnostics whose fitness is below 1.0 and show the first one.
non_fitting = [diagnostic for diagnostic in alignments if diagnostic['fitness'] < 1.0]
non_fitting[0]

In [None]:
#### Reconstructing the Model Behavior

In [None]:
# `alignment` was never bound anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel. Use the move sequence of the first non-fitting trace,
# falling back to the first alignment when every trace fits the model.
alignment = (non_fitting[0] if non_fitting else alignments[0])['alignment']
# Keep the second component of every move (the model side, as documented by pm4py)
# and drop skipped ('>>') and unlabeled (None) steps to recover the model's behaviour.
trace = [move[1] for move in alignment if move[1] != '>>' and move[1] is not None]
trace

In [None]:
### Comparing Footprints

In [None]:
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
from pm4py.visualization.footprints import visualizer as fp_visualizer

# Model side: a process tree discovered with the same noise threshold (0.1) as the BPMN model.
process_tree = pm4py.discover_process_tree_inductive(event_log, noise_threshold=0.1)

# Footprints of the whole log and of the process tree.
fp_log = footprints_discovery.apply(
    event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG
)
fp_model = footprints_discovery.apply(
    process_tree, variant=footprints_discovery.Variants.PROCESS_TREE
)

# Render both footprints in one comparison view.
gviz = fp_visualizer.apply(fp_log, fp_model)
fp_visualizer.view(gviz)

In [None]:
### Machine Learning at the Case Level

In [None]:
#### Attribute Inspection

In [None]:
# List the event-level attributes available in the log.
pm4py.get_event_attributes(event_log)

In [None]:
# List the trace-level (case-level) attributes available in the log.
pm4py.get_trace_attributes(event_log)

In [None]:
#### Creating a Feature Table

In [None]:
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
# Build the feature table from the selected attributes: string event attribute
# "lawsuit_id" (also as succession attribute), string trace attribute "concept:name",
# and no numeric attributes.
# NOTE(review): both selected attributes look like identifiers; confirm they are
# meaningful features rather than one distinct value per case.
ft, feature_names = log_to_features.apply(event_log, parameters={"str_ev_attr": ["lawsuit_id"],
                                                "str_tr_attr": ["concept:name"],
                                                "num_ev_attr": [], "num_tr_attr": [],
                                                "str_evsucc_attr": ["lawsuit_id"]})
# Show the features as a labeled table.
pd.DataFrame(ft, columns=feature_names)

In [None]:
#### Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Project the feature table onto its first five principal components.
pca = PCA(n_components=5)
pca_df = pd.DataFrame(pca.fit_transform(ft))
# `rename` returns a new frame instead of modifying `pca_df`, so the result must be
# assigned; previously the column labels silently stayed integers. String labels
# avoid mixing int and str column names once further columns are added.
pca_df = pca_df.rename(columns=str)
pca_df

In [None]:
#### Outlier Detection

In [None]:
from sklearn.ensemble import IsolationForest

# Fit and score on the component columns only. Dropping a pre-existing "scores"
# column keeps the cell idempotent: on a re-run the previous scores would otherwise
# be fed back into the model as an extra feature.
pca_components = pca_df.drop(columns="scores", errors="ignore")

# Fixed seed so the anomaly ranking is reproducible across runs.
model = IsolationForest(random_state=42)
model.fit(pca_components)
pca_df["scores"] = model.decision_function(pca_components)

# Lower decision-function values are more anomalous, so the top rows are the outliers.
pca_df.sort_values("scores")

In [None]:
#### Concept Drift Detection

In [None]:
from pm4py.algo.transformation.log_to_features.util import locally_linear_embedding
from pm4py.visualization.graphs import visualizer

# Compute the locally linear embedding of the log and plot it with the dates-based
# graph variant (x is presumably the time axis, y the embedding value — confirm).
x, y = locally_linear_embedding.apply(event_log)
gviz = visualizer.apply(x, y, variant=visualizer.Variants.DATES, parameters={"title": "Locally Linear Embedding", "y_axis": "Intensity"})
visualizer.view(gviz)

In [None]:
#### Clustering

In [None]:
from sklearn.cluster import KMeans

# Partition the cases into four clusters. K-means starts from random centroids, so
# the seed is fixed to keep cluster ids (and the plot built from them) reproducible.
clustering = KMeans(n_clusters=4, random_state=42)
clustering.fit_predict(ft)

In [None]:
labels = clustering.labels_
centroids = clustering.cluster_centers_

l = clustering.fit_predict(ft)
pca = PCA(n_components=2).fit(ft)
datapoint = pca.transform(ft)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure
label1 = ["#FFFF00", "#008000", "#0000FF", "#800080", "#800d00"]
color = [label1[i] for i in labels]
plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
plt.show()

In [None]:
#### Decision Point Mining

In [None]:
from pm4py.algo.decision_mining import algorithm as dem
# Extract the decision-mining data for one decision point of the Petri net.
# NOTE(review): this rebinds `ft`, which earlier cells use for the case feature table;
# re-running those cells after this one will operate on the decision-mining result instead.
# NOTE(review): the decision point id is hard-coded; confirm it still exists in the
# currently discovered `pn` (generated ids may change between discovery runs).
ft = dem.apply(event_log, pn, im, fm, decision_point='exi_id6046d342-19d8-46a9-a7f1-af0f14e43653')
# First element of the result; used as the feature matrix by the classifier cell.
ft[0]

In [None]:
# Second element of the decision-mining result; used as the target vector by the classifier cell.
ft[1]

In [None]:
from sklearn import tree

# Learn which branch is taken at the decision point from the case features.
clf = tree.DecisionTreeClassifier()
clf.fit(ft[0], ft[1])

# The class labels used to be two hard-coded activity names from an unrelated log
# ('T13 ...', 'T11 ...'): wrong for this data and an IndexError as soon as the decision
# point has more than two outcomes. Decision mining returns the target names as the
# third element of its result, in the order of the encoded classes.
tree.plot_tree(clf,
               feature_names=list(ft[0].columns.values),
               class_names=[str(name) for name in ft[2]],
               filled=True)

In [None]:
#### Case Duration Prediction

In [None]:
# Convert once to pm4py's trace-based log so the feature rows and the throughput
# times below are derived from the same sequence of cases and stay aligned.
case_log = pm4py.convert_to_event_log(event_log)

data, feature_names = log_to_features.apply(
    case_log,
    parameters={
        "str_ev_attr": ["lawsuit_id"],
        "str_tr_attr": ["concept:name"],
        "num_ev_attr": [],
        "num_tr_attr": [],
        "str_evsucc_attr": ["lawsuit_id"],
    },
)

# Throughput time of each case in seconds: last event timestamp minus first event
# timestamp. This was only present as commented-out code (reading a "date" key that the
# formatted log does not use), which left `throughput_time` undefined for later cells.
throughput_time = [
    trace[-1]["time:timestamp"].timestamp() - trace[0]["time:timestamp"].timestamp()
    for trace in case_log
]
data

In [None]:
import random

# `data` holds the case-level feature rows, whereas len(event_log) counts the events of
# the dataframe; indexing `data` with event positions overruns it. Iterate over the
# feature rows instead.
n_cases = len(data)

# Draw at most 500 training cases (fewer when the log is smaller) with a dedicated,
# seeded generator so the split is reproducible.
training_cases = set(random.Random(42).sample(range(n_cases), min(500, n_cases)))
data_training = [data[i] for i in range(n_cases) if i in training_cases]
throughput_time_training = [throughput_time[i] for i in range(n_cases) if i in training_cases]

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Fit a 3-nearest-neighbours regressor that maps the training features to the training throughput times.
regressor = KNeighborsRegressor(n_neighbors=3)
regressor.fit(data_training, throughput_time_training)

In [None]:
# Everything that was not sampled for training is used for validation. Iterate over the
# feature rows in `data`, not over len(event_log), which counts the events of the
# dataframe and overruns `data`.
n_cases = len(data)
data_validation = [data[i] for i in range(n_cases) if i not in training_cases]
throughput_time_validation = [throughput_time[i] for i in range(n_cases) if i not in training_cases]

In [None]:
# Predict the throughput time of every validation case with the fitted regressor.
predicted_throughput_time = regressor.predict(data_validation)

In [None]:
# Compare actual and predicted values for up to five validation cases; slicing avoids
# the IndexError the fixed range(5) raised when fewer than five cases are available.
for actual, predicted in zip(throughput_time_validation[:5], predicted_throughput_time[:5]):
    print("case actual throughput=", actual, " predicted throughput = ", predicted)