In [1]:
import pandas as pd
from src.eventlog_utils import convert_log
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance

ModuleNotFoundError: No module named 'log_distance_measures'

In [607]:
case_studies = {
    1: 'Consulta',
    2: 'Production',
    3: 'Purchasing',
    4: 'bpi12',
    5: 'bpi17'
}

In [608]:
# chose case study
case_study = case_studies[5]

# Choose if our approach or Sota
our_approach = True
Sota = False
approach = 'SIMOD'
print(f'Case study: {case_study}')

Case study: bpi17


In [609]:
log_real = xes_importer.apply(f'data/{case_study}/logTest.xes')
log_real = pm4py.convert_to_dataframe(log_real) 

if our_approach:
    log_sim = pd.read_csv(f'simulations/{case_study}/sim.csv')
    print('imported log from our approach')

elif Sota:
    log_sim = pd.read_csv(f'Sota/{case_study}/{approach}/sim.csv')
    
    #Rename the columns named 'end_timestamp' and 'start_timestamp' with 'time:timestamp' and 'start:timestamp'
    log_sim = log_sim.rename(columns={'end_timestamp': 'time:timestamp', 
                                      'start_timestamp': 'start:timestamp', 
                                      'task':'concept:name',
                                      'caseid': 'case:concept:name'})

    print('imported log from Sota, approach:', approach, ',columns renamed')

parsing log, completed traces :: 100%|██████████| 6056/6056 [00:03<00:00, 1908.11it/s]


imported log from our approach


In [610]:
print(min(log_real['time:timestamp']))
print(max(log_real['time:timestamp']))
print(min(log_sim['time:timestamp']))
print(max(log_sim['time:timestamp']))

2016-01-02 10:45:22+00:00
2017-02-01 14:01:24+00:00
2016-06-14 13:04:37
3000-08-04 14:25:43.594431


In [611]:
# convert event log format lifecycles
if 'lifecycle:transition' in log_real.columns:
    log_real = convert_log(log_real)
if 'lifecycle:transition' in log_sim.columns:
    log_sim = convert_log(log_sim)

  0%|          | 0/6056 [00:00<?, ?it/s]

100%|██████████| 6056/6056 [02:02<00:00, 49.29it/s]
100%|██████████| 6056/6056 [00:50<00:00, 120.50it/s]


In [612]:
print(min(log_real['time:timestamp']))
print(max(log_real['time:timestamp']))
print(min(log_sim['time:timestamp']))
print(max(log_sim['time:timestamp']))

2016-01-02 10:49:28+00:00
2017-02-01 14:01:24+00:00
2016-06-14 13:04:37
3000-08-04 14:25:43.594431


In [None]:
# Set event log column ID mapping
event_log_ids = EventLogIDs(
    case="case:concept:name",
    activity="concept:name",
    start_time="start:timestamp",
    end_time="time:timestamp"
)

In [None]:
log_real[event_log_ids.start_time] = pd.to_datetime(log_real[event_log_ids.start_time], utc=True)
log_real[event_log_ids.end_time] = pd.to_datetime(log_real[event_log_ids.end_time], utc=True)

log_sim[event_log_ids.start_time] = pd.to_datetime(log_sim[event_log_ids.start_time], utc=True)
log_sim[event_log_ids.end_time] = pd.to_datetime(log_sim[event_log_ids.end_time], utc=True)

In [None]:
#Initialize distances dictionary
distances = {}

### Control-flow Log Distance

<b>WARNING: It may take a long time</b>

In [None]:
# Call passing the event logs, and its column ID mappings
distance = control_flow_log_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
)

In [None]:
print('CF Log distance: ', distance)

# Fill the distances dictionary
distances['control_flow_log_distance'] = distance


CF Log distance:  0.053923808562558316


### N-Gram Distribution Distance

In [None]:
n_gram = 5

In [None]:
# Call passing the event logs, and its column ID mappings
distance = n_gram_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    n=n_gram
)

In [None]:
print('N-Gram distr. distance: ', distance)
# Fill the distances dictionary
distances['n_gram_distribution_distance'] = distance

N-Gram distr. distance:  0.028848404731413778


### Absolute Event Distribution Distance

In [None]:
# EMD of the (END) timestamps distribution where each bin represents a minute
distance = absolute_event_distribution_distance(
    log_real, event_log_ids,
    log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.END,
    discretize_event=discretize_to_hour
)

TypeError: '<' not supported between instances of 'str' and 'Timestamp'

In [None]:
print('Absolute Event Distribution Distance: ', distance)
# Fill the distances dictionary
distances['absolute_event_distribution_distance'] = distance

Absolute Event Distribution Distance:  1551.0053650963555


### Case Arrival Distribution Distance

In [None]:
distance = case_arrival_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_event=discretize_to_hour  # Function to discretize each timestamp (default by hour)
)

In [None]:
print('Case Arrival distr distance: ', distance)
# Fill the distances dictionary
distances['case_arrival_distribution_distance'] = distance

Case Arrival distr distance:  1798.3923210744326


### Circadian Event Distribution Distance

In [None]:
distance = circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [None]:
print('Circadian Event distr distance: ', distance)
# Fill the distances dictionary 
distances['circadian_event_distribution_distance'] = distance

Circadian Event distr distance:  7.3283751971662


### Relative Event Distribution Distance

In [None]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
distance = relative_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH,  # Type of timestamp distribution (consider start times and/or end times)
    discretize_event=discretize_to_hour  # Function to discretize the absolute seconds of each timestamp (default by hour)
)

In [None]:
print('Relative Event distr distance: ', distance)
# Fill the distances dictionary
distances['relative_event_distribution_distance'] = distance

Relative Event distr distance:  238.92399980070414


### Work in Progress Distance

<b>WARNING: It may take a long time</b>

In [None]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
# distance = work_in_progress_distance(
#     log_real, event_log_ids,  # First event log and its column id mappings
#     log_sim, event_log_ids,  # Second event log and its column id mappings
#     window_size=pd.Timedelta(hours=1)  # Bins of 1 hour
# )

In [None]:
print('Work in Progress distance: ', distance)
# Fill the distances dictionary
distances['work_in_progress_distance'] = 'NaN'

Work in Progress distance:  238.92399980070414


### Cycle Time Distribution Distance

In [None]:
distance = cycle_time_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    bin_size=pd.Timedelta(hours=1)  # Bins of 1 minute
)

In [None]:
print('Cycle Time distr distance: ', distance)
# Fill the distances dictionary
distances['cycle_time_distribution_distance'] = distance

Cycle Time distr distance:  207.8780629505428


In [None]:
print(f'case study is {case_study} for approach {approach}, with our approach {our_approach} and Sota {Sota}')
distances

case study is bpi12 for approach SIMOD, with our approach False and Sota True


{'control_flow_log_distance': 0.6185034539309048,
 'n_gram_distribution_distance': 0.9149807405351008,
 'absolute_event_distribution_distance': 1551.0053650963555,
 'case_arrival_distribution_distance': 1798.3923210744326,
 'circadian_event_distribution_distance': 7.3283751971662,
 'relative_event_distribution_distance': 238.92399980070414,
 'work_in_progress_distance': 'NaN',
 'cycle_time_distribution_distance': 207.8780629505428}

| Distance Measure                       | ProbBased | DSIM    | LSTM    | LSTM(GAN) | SIMOD   |
|----------------------------------------|-----------|---------|---------|-----------|---------|
| Control-flow Log Distance              | 0.096     | 0.359   | 0.185   | 0.643     | 0.619   |
| N-Gram Distribution Distance           | 0.074     | 0.708   | 0.533   | 0.898     | 0.915   |
| Absolute Event Distribution Distance   | 594841.618| 1630.453| 1491.270| 80229.565 | 1551.005|
| Case Arrival Distribution Distance     | 587953.934| 1827.333| 1600.404| 80475.746 | 1798.392|
| Circadian Event Distribution Distance  | 6.665     | 5.695   | 7.733   | 17.956    | 7.328   |
| Relative Event Distribution Distance   | 20.980    | 191.228 | 107.818 | 237.195   | 238.924 |
| Work in Progress Distance              | NaN       | NaN     | NaN     | NaN       | NaN     |
| Cycle Time Distribution Distance       | 17.746    | 172.563 | 106.482 | 206.750   | 207.878 |

Here is a markdown table summarizing the obtained distance values for the case study 'Production':

| Distance Measure                       | ProbBased | DSIM    | LSTM    | LSTM(GAN) | SIMOD   |
|----------------------------------------|-----------|---------|---------|-----------|---------|
| Control-flow Log Distance              | 0.305     | 0.161   | 0.486   | 0.857     | 0.523   |
| N-Gram Distribution Distance           | 0.243     | 0.363   | 0.828   | 0.991     | 0.655   |
| Absolute Event Distribution Distance   | 967.259   | 1134.681| 1300.409| 83375.183 | 787.306 |
| Circadian Event Distribution Distance  | 0.938     | 0.997   | 2.526   | 3.561     | 4.708   |
| Relative Event Distribution Distance   | 523.096   | 722.356 | 826.602 | 782.089   | 787.647 |
| Work in Progress Distance              | 0.248     | 0.340   | 0.386   | 0.979     | 0.211   |
| Cycle Time Distribution Distance       | 403.787   | 596.947 | 698.343 | 638.106   | 588.961 |

Here is a markdown table summarizing the obtained distance values for the case study 'Production':

| Distance Measure                       | ProbBased| DSIM    |LSTM     | LSTM(GAN) | SIMOD   |
|----------------------------------------|----------|---------|---------|-----------|---------|
| Control-flow Log Distance              | 0.470    | 0.788   | 0.869   | 0.908     | 0.658   |
| N-Gram Distribution Distance           | 0.541    | 0.998   | 1.0     | 1.0       | 0.893   |
| Absolute Event Distribution Distance   | 3614.320 | 378.030 | 389.020 | 78504.732 | 1962.000|
| Case Arrival Distribution Distance     | 3645.666 | 443.177 | 441.555 | 78575.956 | 1884.0  |
| Circadian Event Distribution Distance  | 2.498    | 2.516   | 18.023  | 6.723     | 3.721   |
| Relative Event Distribution Distance   | 51.523   | 227.642 | 275.048 | 232.318   | 263.868 |
| Work in Progress Distance              | 0.367    | 0.224   | 0.225   | 0.988     | 0.480   |
| Cycle Time Distribution Distance       | 148.222  | 301.755 | 360.555 | 313.156   | 348.311 |


Here is a markdown table summarizing the obtained distance values for the case study 'Consulta':

| Distance Measure                       | ProbBased| DSIM    |LSTM   | LSTM(GAN) | SIMOD |
|----------------------------------------|----------|---------|-------|-----------|-------|
| Control-flow Log Distance              | 0.161    | 0.4258  |0.442  |0.856      |0.224  |
| N-Gram Distribution Distance           | 0.244    | 1.0     |0.998  |1.0        |0.538  |
| Absolute Event Distribution Distance   | 135.874  | 251.667 |543.827|41162.510  |624.803|
| Case Arrival Distribution Distance     | 173.586  | 231.196 |503.821|41186.346  |645.419|
| Circadian Event Distribution Distance  | 3.457    | 3.07532 |21.095 |14.355     |2.677  |
| Relative Event Distribution Distance   | 155.895  | 28.7120 |40.013 |37.952     |28.185 |
| Work in Progress Distance              | 0.107    | 0.1728  |0.451  |0.984      |0.312  |
| Cycle Time Distribution Distance       | 298.005  | 69.587  |106.155| 85.229    |85.162 |
