# Dataset Exploration - BPIC 17 

## 0. Prerequisites - Installing Packages and Loading Files

In [None]:
import pandas as pd
import toml
import os
import matplotlib.pyplot as plt

import pm4py
import os
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.objects.conversion.process_tree import converter
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

from pm4py.visualization.petri_net import visualizer as pn_visualizer

from pm4py.objects.conversion.log import converter as log_converter

import pm4py

import pylab as pl

# Load the project configuration from config.toml.
# NOTE(review): presumably config.toml sits one directory above this notebook
# -- confirm against the repository layout.
# Only step up when config.toml is not already reachable from the current
# working directory; an unconditional chdir("..") climbs one more level every
# time this cell is re-run and breaks all relative paths used further down.
if not os.path.isfile("config.toml"):
    os.chdir("..")
config = toml.load("config.toml")

In [None]:
# Read the semicolon-separated CSV whose path is configured under
# config["data"]["bpi17_pa"]. Both names are bound to the same DataFrame.
bpi17_csv_path = config["data"]["bpi17_pa"]
df_bpi17 = df_a = pd.read_csv(bpi17_csv_path, sep=";")

## 2. First Investigation

In [None]:
# Preview the first ten rows of the loaded data.
df_bpi17.head(n=10)

## 3. Average Case Duration (Number of Activities and Time Spent) and Histogram of Case Lengths

In [None]:
# Average Case Duration (Time)
#
# For every case, take the timestamp of its first and of its last row, and
# average the difference over all cases.
# NOTE(review): 'first'/'last' follow row order, not time order -- this assumes
# the rows of each case are stored chronologically; confirm for this CSV.
df_bpi17_grouped = df_bpi17.groupby(by='Case ID')['time:timestamp'].agg(['first', 'last'])

# Parse both columns once so that the subtraction below yields Timedeltas.
df_bpi17_grouped[["first", "last"]] = df_bpi17_grouped[["first", "last"]].apply(pd.to_datetime)
df_bpi17_grouped['Days_gap'] = df_bpi17_grouped['last'] - df_bpi17_grouped['first']

# The frame already holds exactly one row per 'Case ID', so the mean can be
# taken directly without grouping a second time.
df_bpi17_grouped['Days_gap'].mean()

In [None]:
# Median number of rows (events) recorded per case.
events_per_case = df_bpi17.groupby('Case ID').size()
events_per_case.median()

In [None]:
# Show the column labels of the loaded data frame.
df_bpi17.columns

In [None]:
# Histogram of case lengths (number of rows per 'Case ID'), saved as a PNG.

# Global matplotlib defaults; the histogram below passes its own figsize.
plt.rcParams["figure.figsize"] = (7,6)
plt.rcParams["figure.dpi"] = 300

histo = df_bpi17.groupby(['Case ID']).size().hist(bins=50, grid=False, figsize=(12,8) , color='#2077B4', zorder=2, rwidth=0.9)
pl.title("Histogram of Case Lengths", fontsize=22)
pl.xlabel("Case Length", fontsize=18)
# The bars are plain per-bin frequencies (hist() is not called with
# cumulative=True), so the axis must not be labelled as cumulative.
pl.ylabel("Number of Cases", fontsize=18)
pl.xticks(fontsize=14)
pl.yticks(fontsize=14)
# Vertical marker at a case length of 35.
# NOTE(review): presumably a cut-off/truncation length -- confirm its meaning.
pl.axvline(x=35, ymin=0, ymax=1, linewidth=3, color="#000000")

# savefig() does not create missing directories; make sure the target exists.
os.makedirs("charts", exist_ok=True)
pl.savefig('charts/hist_case_lengths.png', dpi=150)

## 4. Process Outcome Statistics

In [None]:
# Outcome distribution: take the last "label" value of every case and count
# how often each value occurs.
last_label_per_case = df_bpi17.groupby('Case ID')["label"].last()
last_label_per_case.value_counts()

In [None]:
# Number of distinct activity names, followed by the (rows, columns) shape.
distinct_activities = df_bpi17["Activity"].nunique()
print(distinct_activities)
print(df_bpi17.shape)

## 5. First DFG and BPMN Graphs

In [None]:
# Discover a performance-annotated directly-follows graph (DFG) from the XES
# log configured under config["data"]["bpi17_xes"] and save it as an SVG.
event_log = pm4py.read_xes(config["data"]["bpi17_xes"])

dfg = dfg_discovery.apply(event_log, variant=dfg_discovery.Variants.PERFORMANCE)
parameters = {dfg_visualization.Variants.PERFORMANCE.value.Parameters.FORMAT: "svg"}
gviz = dfg_visualization.apply(dfg, log=event_log, variant=dfg_visualization.Variants.PERFORMANCE, parameters=parameters)

# Make sure the output directory exists before writing the rendered graph, in
# case the save step does not create it.
os.makedirs("charts", exist_ok=True)
dfg_visualization.save(gviz, "charts/dfg.svg")

In [None]:
# Discover a process tree with the inductive miner, convert it to BPMN and
# display the resulting diagram.
xes_path = config["data"]["bpi17_xes"]
log = pm4py.read_xes(xes_path)
tree = pm4py.discover_process_tree_inductive(log)

to_bpmn = converter.Variants.TO_BPMN
bpmn_graph = converter.apply(tree, variant=to_bpmn)
pm4py.view_bpmn(bpmn_graph)

In [None]:
# Report the start and end activities that pm4py finds in the XES log.
xes_path = config["data"]["bpi17_xes"]
event_log = pm4py.read_xes(xes_path)
start_activities = pm4py.get_start_activities(event_log)
end_activities = pm4py.get_end_activities(event_log)
print(f"Start activities: {start_activities}\nEnd activities: {end_activities}")

In [None]:
# Mine a heuristics net from the event log, using a dependency threshold of 0.99.
dependency_thresh_key = heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH
heu_net = heuristics_miner.apply_heu(event_log, parameters={dependency_thresh_key: 0.99})

In [None]:
# Run the heuristics miner (dependency threshold 0.99) to obtain the three
# objects that the Petri net visualizer takes, then render and display them.
heuristics_parameters = {
    heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: 0.99,
}
net, im, fm = heuristics_miner.apply(event_log, parameters=heuristics_parameters)
gviz = pn_visualizer.apply(net, im, fm)
pn_visualizer.view(gviz)