In [1]:
import pandas as pd
from src.eventlog_utils import convert_log
from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Lookup table: numeric id (starting at 1) -> case study name.
case_studies = dict(enumerate(
    ['Consulta', 'Production', 'Purchasing', 'bpi12', 'bpi17'],
    start=1,
))

In [None]:
# Choose the case study to evaluate (the number is a key of `case_studies`)

case_study = case_studies[1]

In [2]:
# Read the real log (logTest.csv) and the simulated log (sim.csv) of the selected
# case study; both paths are relative to the current working directory.
log_real = pd.read_csv(f'data/{case_study}/logTest.csv')
log_sim = pd.read_csv(f'simulations/{case_study}/sim.csv')

In [None]:
# Convert the lifecycle format of both event logs with convert_log.
# NOTE(review): presumably this yields one row per activity instance with start/end
# timestamps — confirm against src.eventlog_utils.convert_log.
# The results overwrite log_real / log_sim, so re-running this cell without
# reloading the CSVs applies convert_log a second time.

log_real = convert_log(log_real)
log_sim = convert_log(log_sim)

In [3]:
# Set the event log column ID mapping (case, activity, start and end timestamps).
# The same mapping is passed for both logs in every distance call below.
event_log_ids = EventLogIDs(
    case="case:concept:name",
    activity="concept:name",
    start_time="start:timestamp",
    end_time="time:timestamp"
)

In [4]:
# Parse the start and end timestamp columns of both logs as UTC datetimes.
for log in (log_real, log_sim):
    for column in (event_log_ids.start_time, event_log_ids.end_time):
        log[column] = pd.to_datetime(log[column], utc=True)

### Control-flow Log Distance

<b>WARNING: Computing this distance may take a long time.</b>

In [None]:
# Compute the control-flow log distance between the real and the simulated log.
# Call passing the event logs and their column ID mappings.
# The result is stored in `distance`, which every metric cell in this notebook overwrites.
distance = control_flow_log_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
)

In [None]:
# `distance` is overwritten by every metric cell: this only shows the control-flow
# distance if the control-flow cell above was the last metric cell to finish.
print('CF Log distance: ', distance)

### N-Gram Distribution Distance

In [5]:
# Value passed as `n` to n_gram_distribution_distance in the next cell
n_gram = 5

In [6]:
# Compute the n-gram distribution distance between the real and the simulated log.
# Call passing the event logs, their column ID mappings, and the n-gram size.
distance = n_gram_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    n=n_gram
)

In [7]:
# Show the value computed by the n-gram cell above (`distance` is shared by all metric cells)
print('N-Gram distr. distance: ', distance)

N-Gram distr. distance:  0.3513343363316009


### Absolute Event Distribution Distance

In [8]:
# EMD of the (END) timestamps distribution where each bin represents an hour
# (timestamps are discretized with discretize_to_hour)
distance = absolute_event_distribution_distance(
    log_real, event_log_ids,
    log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.END,
    discretize_event=discretize_to_hour
)

  ).floor(freq='H')


In [9]:
# Show the value computed by the absolute event distribution cell above
# (`distance` is shared by all metric cells)
print('Absolute Event Distribution Distance: ', distance)

Absolute Event Distribution Distance:  157.50357358490396


### Case Arrival Distribution Distance

In [10]:
# Compute the case arrival distribution distance between the real and the simulated log
distance = case_arrival_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_event=discretize_to_hour  # Function to discretize each timestamp (default by hour)
)

  ).floor(freq='H')


In [11]:
# Show the value computed by the case arrival cell above (`distance` is shared by all metric cells)
print('Case Arrival distr distance: ', distance)

Case Arrival distr distance:  150.8545526419818


### Circadian Event Distribution Distance

In [12]:
# Compute the circadian event distribution distance between the real and the simulated log
distance = circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [13]:
# Show the value computed by the circadian cell above (`distance` is shared by all metric cells)
print('Circadian Event distr distance: ', distance)

Circadian Event distr distance:  2.0155024844958973


### Relative Event Distribution Distance

In [14]:
# Compute the relative event distribution distance between the real and the simulated log.
# Call passing the event logs, their column ID mappings, the timestamp type, and the discretize function.
distance = relative_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH,  # Type of timestamp distribution (consider start times and/or end times)
    discretize_event=discretize_to_hour  # Function to discretize the absolute seconds of each timestamp (default by hour)
)

In [15]:
# Show the value computed by the relative event cell above (`distance` is shared by all metric cells)
print('Relative Event distr distance: ', distance)

Relative Event distr distance:  1.2574550651146357


### Work in Progress Distance

<b>WARNING: Computing this distance may take a long time.</b>

In [18]:
# Compute the work-in-progress distance between the real and the simulated log.
# Call passing the event logs, their column ID mappings, and the window size.
distance = work_in_progress_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    window_size=pd.Timedelta(hours=1)  # Bins of 1 hour
)

  ).floor(freq='24H')
  ).ceil(freq='24H')


KeyboardInterrupt: 

In [None]:
# `distance` is overwritten by every metric cell: if the work-in-progress cell above
# did not finish (e.g. it was interrupted), this prints another metric's value.
print('Work in Progress distance: ', distance)

### Cycle Time Distribution Distance

In [16]:
# Compute the cycle time distribution distance between the real and the simulated log
distance = cycle_time_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    bin_size=pd.Timedelta(hours=1)  # Bins of 1 hour
)

In [17]:
# Show the value computed by the cycle time cell above (`distance` is shared by all metric cells)
print('Cycle Time distr distance: ', distance)

Cycle Time distr distance:  5.211885623929367
