In [1]:
import pandas as pd
from src.eventlog_utils import convert_log
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance
from src.res_based_ced import resource_based_circadian_event_distribution_distance

In [2]:
# Numbered catalogue of the available case studies; the number is the key used to pick one.
case_studies = dict(enumerate(
    ['Consulta', 'Production', 'Purchasing', 'bpi12', 'bpi17'],
    start=1,
))

In [3]:
# Choose the case study to evaluate (key into `case_studies`)
case_study = case_studies[5]

# Choose which simulated log is evaluated: the one from our approach or one from a
# Sota baseline. The two switches are mutually exclusive.
our_approach = True
Sota = not our_approach
if Sota:
    approach = 'SIMOD'
else:
    approach = None
print(f'Case study: {case_study}')

Case study: bpi17


In [4]:
# Real (test) log: parse the XES file and flatten it into a DataFrame
log_real = pm4py.convert_to_dataframe(
    xes_importer.apply(f'data/{case_study}/logTest.xes')
)

if our_approach:
    log_sim = pd.read_csv(f'simulations/{case_study}/sim.csv')
    print('imported log from our approach')

elif Sota:
    # Map the baseline's column names onto the names used for the real log.
    # errors='ignore' is set because the log from Francesca Meneghello et al.
    # already has some of its columns named correctly.
    sota_column_names = {
        'end_timestamp': 'time:timestamp',
        'start_timestamp': 'start:timestamp',
        'task': 'concept:name',
        'caseid': 'case:concept:name',
    }
    log_sim = pd.read_csv(f'Sota/{case_study}/{approach}/sim.csv')
    log_sim = log_sim.rename(columns=sota_column_names, errors='ignore')

    print('imported log from Sota, approach:', approach, ',columns renamed')

parsing log, completed traces :: 100%|██████████| 6056/6056 [00:02<00:00, 2020.46it/s]


imported log from our approach


In [5]:
# Logs that carry a 'lifecycle:transition' column are passed through convert_log.
# For the real log, any resulting 'START'/'END' columns are then renamed to the
# start/end timestamp column names.
if 'lifecycle:transition' in log_real.columns:
    log_real = convert_log(log_real).rename(
        columns={'START': 'start:timestamp', 'END': 'time:timestamp'},
        errors='ignore',
    )
# NOTE(review): the simulated log is converted but not renamed afterwards; confirm that
# convert_log's output for it already uses 'start:timestamp' / 'time:timestamp'.
if 'lifecycle:transition' in log_sim.columns:
    log_sim = convert_log(log_sim)

 49%|████▉     | 2970/6056 [00:56<00:57, 53.40it/s]

In [None]:
# Column names shared by both event logs, keyed by the EventLogIDs field they fill
column_ids = {
    'case': "case:concept:name",
    'activity': "concept:name",
    'start_time': "start:timestamp",
    'end_time': "time:timestamp",
}
event_log_ids = EventLogIDs(**column_ids)

In [None]:
# Parse the start and end timestamp columns of both logs as timezone-aware (UTC) datetimes
for log in (log_real, log_sim):
    for timestamp_column in (event_log_ids.start_time, event_log_ids.end_time):
        log[timestamp_column] = pd.to_datetime(log[timestamp_column], utc=True)

In [None]:
# Display the column names of the simulated log
log_sim.columns

Index(['case:concept:name', 'concept:name', 'org:resource', 'start:timestamp',
       'time:timestamp'],
      dtype='object')

In [None]:
# Empty container that collects each computed distance under its metric name
distances = dict()

### Control-flow Log Distance

<b>WARNING: It may take a long time</b>

In [None]:
# Control-flow log distance between the real and the simulated log;
# each log is passed together with its column ID mapping.
distance = control_flow_log_distance(log_real, event_log_ids, log_sim, event_log_ids)

In [None]:
# Store the result in the distances dictionary, then report it
distances['control_flow_log_distance'] = distance
print('CF Log distance: ', distance)


CF Log distance:  0.19880362511252564


### N-Gram Distribution Distance

In [None]:
# Value passed as `n` to n_gram_distribution_distance
n_gram = 5

In [None]:
# N-gram distribution distance between the real and the simulated log
# (each log is passed with its column ID mapping), using n = n_gram.
distance = n_gram_distribution_distance(
    log_real, event_log_ids, log_sim, event_log_ids, n=n_gram
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['n_gram_distribution_distance'] = distance
print('N-Gram distr. distance: ', distance)

N-Gram distr. distance:  0.29995114802149486


### Absolute Event Distribution Distance

In [None]:
# EMD of the (END) timestamps distribution where each bin represents an hour
# (timestamps are discretized with discretize_to_hour)
distance = absolute_event_distribution_distance(
    log_real, event_log_ids,
    log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.END,
    discretize_event=discretize_to_hour
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['absolute_event_distribution_distance'] = distance
print('Absolute Event Distribution Distance: ', distance)

Absolute Event Distribution Distance:  105.20370926026486


### Case Arrival Distribution Distance

In [None]:
# Case arrival distribution distance between the real and the simulated log
# (each passed with its column ID mapping); timestamps are discretized by hour.
distance = case_arrival_distribution_distance(
    log_real, event_log_ids, log_sim, event_log_ids,
    discretize_event=discretize_to_hour,
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['case_arrival_distribution_distance'] = distance
print('Case Arrival distr distance: ', distance)

Case Arrival distr distance:  228.50785340314135


### Circadian Event Distribution Distance

In [None]:
# Circadian event distribution distance between the real and the simulated log
# (each passed with its column ID mapping), considering both the start and the
# end timestamp of each activity instance.
distance = circadian_event_distribution_distance(
    log_real, event_log_ids, log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.BOTH,
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['circadian_event_distribution_distance'] = distance
print('Circadian Event distr distance: ', distance)

Circadian Event distr distance:  2.5029006098187008


### Resource-Based Circadian Event Distribution Distance


In [None]:
# Resource-based circadian event distribution distance between the real and the
# simulated log (each passed with its column ID mapping), considering both the
# start and the end timestamp of each activity instance.
distance = resource_based_circadian_event_distribution_distance(
    log_real, event_log_ids, log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.BOTH,
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['resource_based_circadian_event_distribution_distance'] = distance
print('Resource-Based Circadian Event distr distance: ', distance)

Resource-Based Circadian Event distr distance:  6.319850961714782


### Relative Event Distribution Distance

In [None]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
distance = relative_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH,  # Type of timestamp distribution (consider start times and/or end times)
    discretize_event=discretize_to_hour  # Function to discretize the absolute seconds of each timestamp (default by hour)
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['relative_event_distribution_distance'] = distance
print('Relative Event distr distance: ', distance)

Relative Event distr distance:  210.90742250898165


### Work in Progress Distance

<b>WARNING: It may take a long time</b>

In [None]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
# distance = work_in_progress_distance(
#     log_real, event_log_ids,  # First event log and its column id mappings
#     log_sim, event_log_ids,  # Second event log and its column id mappings
#     window_size=pd.Timedelta(hours=1)  # Bins of 1 hour
# )

In [None]:
# The work_in_progress_distance call in the previous cell is commented out, so at this
# point `distance` still holds the Relative Event Distribution Distance. Printing it
# under the 'Work in Progress' label would report a wrong number, so the report is
# guarded by an explicit flag.
# Set the flag to True only after uncommenting the work_in_progress_distance call.
compute_work_in_progress = False

if compute_work_in_progress:
    print('Work in Progress distance: ', distance)
    # Fill the distances dictionary
    distances['work_in_progress_distance'] = distance
else:
    print('Work in Progress distance: not computed (computation disabled)')

Work in Progress distance:  210.90742250898165


### Cycle Time Distribution Distance

In [None]:
# Cycle time distribution distance between the real and the simulated log
distance = cycle_time_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    bin_size=pd.Timedelta(hours=1)  # Bins of 1 hour
)

In [None]:
# Store the result in the distances dictionary, then report it
distances['cycle_time_distribution_distance'] = distance
print('Cycle Time distr distance: ', distance)

Cycle Time distr distance:  398.91099476439786


In [None]:
# Recap of the run configuration, followed by every distance collected so far
run_summary = f'case study is {case_study} for approach {approach}, with our approach {our_approach} and Sota {Sota}'
print(run_summary)
distances

case study is Consulta for approach None, with our approach True and Sota False


{'control_flow_log_distance': 0.19880362511252564,
 'n_gram_distribution_distance': 0.29995114802149486,
 'absolute_event_distribution_distance': 105.20370926026486,
 'case_arrival_distribution_distance': 228.50785340314135,
 'circadian_event_distribution_distance': 2.5029006098187008,
 'resource_based_circadian_event_distribution_distance': 6.319850961714782,
 'relative_event_distribution_distance': 210.90742250898165,
 'cycle_time_distribution_distance': 398.91099476439786}

## Consulta
| Model         | Control Flow Log Distance | N-Gram Distribution Distance | Absolute Event Distribution Distance | Case Arrival Distribution Distance | Circadian Event Distribution Distance | Resource-Based Circadian Event Distribution Distance | Relative Event Distribution Distance | Work in Progress Distance | Cycle Time Distribution Distance |
|---------------|---------------------------|-------------------------------|------------------------------------|-----------------------------------|----------------------------------------|----------------------------------------------------|-----------------------------------|--------------------------|----------------------------------|
| Our Approach  | 0.233 | 0.331 | 114.57 | 135.71 | 2.722 | 6.285  | 241.98                           | NaN                      | 444.29                         |
| DSIM          | 0.4258                    | 1.0                           | 251.6673                           | 231.1963                          | 3.0753                                     | NaN                                                | 28.712                            | NaN                      | 69.5873                          |
| LSTM          | 0.4426                    | 0.9989                        | 543.8277                           | 503.822                           | 21.0959                                    | NaN                                                | 40.0139                           | NaN                      | 106.1552                         |
| LSTM (GAN)    | 0.8560                    | 1.0                           | 41162.5108                         | 41186.3461                        | 14.3556                                    | NaN                                                | 37.9526                           | NaN                      | 85.2295                          |
| SIMOD         | 0.2250                    | 0.5382                        | 624.8035                           | 645.4194                          | 2.6774                                     | NaN                                                | 28.1857                           | NaN                      | 85.1621                          |
| RIMS          | 0.4624                    | 1.0                           | 266.0798                           | 243.6736                          | 3.1837                                     | NaN                                                | 27.8764                           | NaN                      | 73.1648                          |


## Purchasing

| Model         | Control Flow Log Distance | N-Gram Distribution Distance | Absolute Event Distribution Distance | Case Arrival Distribution Distance | Circadian Event Distribution Distance | Resource-Based Circadian Event Distribution Distance | Relative Event Distribution Distance | Cycle Time Distribution Distance |
|---------------|---------------------------|-------------------------------|------------------------------------|-----------------------------------|----------------------------------------|----------------------------------------------------|-----------------------------------|----------------------------------|
| Our Approach  | 0.338                     | 0.260                        | 710.081                            | 555.319                           | 0.961                                      | 5.144                                               | 480.151                            | 377.442                           |
| LSTM          | 0.4864                    | 0.8282                        | 1300.409                           | 847.525                           | 2.5268                                     | NaN                                             | 826.602                           | 698.3427                         |
| RIMS          | 0.4794                    | 0.4953                        | 1079.587                           | 769.022                           | 0.9369                                     | NaN                                                | 688.982                           | 631.5424                         |
| SIMOD         | 0.5226                    | 0.6549                        | 787.306                            | 1265.784                          | 4.7083                                     | NaN                                                | 787.647                           | 588.9605                         |
| DSIM          | 0.1616                    | 0.3632                        | 1134.681                           | 774.761                           | 0.9966                                     | NaN                                                | 722.356                           | 596.9468                         |
| LSTM (GAN)    | 0.8572                    | 0.9908                        | 83375.183                          | 83832.262                         | 3.5611                                     | NaN                                                | 782.089                           | 638.1056                         |


## Production

| Model         | Control Flow Log Distance | N-Gram Distribution Distance | Absolute Event Distribution Distance | Case Arrival Distribution Distance | Circadian Event Distribution Distance | Resource-Based Circadian Event Distribution Distance | Relative Event Distribution Distance | Cycle Time Distribution Distance |
|---------------|---------------------------|-------------------------------|------------------------------------|-----------------------------------|----------------------------------------|----------------------------------------------------|-----------------------------------|----------------------------------|
| Our Approach  | 0.453                     | 0.557                         | 354.118                            | 27.4                              | 2.68                                       | 9.843                                              | 288.631                           | 210.643                          |
| LSTM          | 0.8696                    | 1.0                           | 389.0209                           | 441.5556                          | 18.0232                                   | NaN                                                | 275.048                           | 360.5556                         |
| DSIM          | 0.7882                    | 0.9990                        | 378.0306                           | 443.1778                          | 2.5162                                    | NaN                                                | 227.643                           | 301.7556                         |
| LSTM (GAN)    | 0.9078                    | 1.0                           | 78504.7324                         | 78575.9556                        | 6.7231                                    | NaN                                                | 232.318                           | 313.1556                         |
| SIMOD         | 0.6588                    | 0.8939                        | 1962.0009                          | 1884.0000                         | 3.7217                                    | NaN                                                | 263.869                           | 348.3111                         |
| RIMS          | 0.7700                    | 0.9989                        | 371.8265                           | 446.3111                          | 2.5407                                    | NaN                                                | 216.239                           | 285.1333                         |



## BPI 12

| Model         | Control Flow Log Distance | N-Gram Distribution Distance | Absolute Event Distribution Distance | Case Arrival Distribution Distance | Circadian Event Distribution Distance | Relative Event Distribution Distance | Cycle Time Distribution Distance |
|---------------|---------------------------|-------------------------------|------------------------------------|-----------------------------------|----------------------------------------|-----------------------------------|----------------------------------|
| Our Approach  | 0.1332                    | 0.1480                        | 1666.828                           | 1553.55                           | 2.556                                  | 150.789                           | 75.57                            |
| DSIM          | 0.3591                    | 0.7081                        | 1630.4531                          | 1827.3333                         | 5.6947                                 | 191.2276                          | 172.5630                         |
| LSTM          | 0.1849                    | 0.5327                        | 1491.2698                          | 1600.4043                         | 7.7336                                 | 107.8183                          | 106.4822                         |
| LSTM (GAN)    | 0.6433                    | 0.8978                        | 80229.5653                         | 80475.7456                        | 17.9561                                | 237.1947                          | 206.7501                         |
| SIMOD         | 0.6185                    | 0.9150                        | 1551.0054                          | 1798.3923                         | 7.3284                                 | 238.9240                          | 207.8781                         |
| RIMS          | 0.3823                    | 0.6923                        | 1586.3750                          | 1814.2196                         | 5.5128                                 | 215.2365                          | 183.5921                         |



## BPI 17

| Model         | Control Flow Log Distance | N-Gram Distribution Distance | Absolute Event Distribution Distance | Case Arrival Distribution Distance | Circadian Event Distribution Distance | Relative Event Distribution Distance | Cycle Time Distribution Distance |
|---------------|---------------------------|-------------------------------|------------------------------------|-----------------------------------|----------------------------------------|-----------------------------------|----------------------------------|
| Our Approach  | 0.147                     | 0.226                         | 2213.156                           | 2165.504                          | 2.727                                  | 41.271                            | 76.2703                          |
| DSIM          | 0.3949                    | 0.7117                        | 3401.9366                          | 3358.229                          | 2.4363                                 | 110.0635                          | 142.1077                         |
| LSTM          | 0.3899                    | 0.8388                        | 8023.4654                          | 7934.8956                         | 3.5194                                 | 126.9128                          | 172.2947                         |
| LSTM (GAN)    | 0.6411                    | 0.9268                        | 40332.2253                         | 40365.2728                        | 2.9865                                 | 192.6855                          | 248.3603                         |
| SIMOD         | 0.6084                    | 0.9982                        | 8216.3653                          | 8252.9624                         | 1.6320                                 | 220.4520                          | 292.1077                         |
| RIMS          | 0.4821                    | 0.8230                        | 3429.7243                          | 3379.5636                         | 2.2990                                 | 109.7954                          | 115.2111                         |
