In [7]:
import pandas as pd
from src.eventlog_utils import convert_log_from_lc_to_se
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance
from log_distance_measures import earth_movers_distance
from src.res_based_ced import resource_based_circadian_event_distribution_distance
from distance_utils import emd_attributes

In [8]:
# Numeric id -> case study (dataset folder) name.
# Keys are listed in ascending order so that iteration order matches the ids and
# agrees with the second definition of this mapping further down the notebook.
case_studies = {
    1: 'Purchasing',
    2: 'Production',
    3: 'Consulta',
    4: 'bpi12',
    5: 'bpi17',
    6: 'sepsis',
    7: 'rtf',
    8: 'bpi19'
}

In [9]:
# Choose the case study to evaluate (index into `case_studies`)
case_study = case_studies[5]

# Choose which simulator produced the log: our approach or a state-of-the-art (Sota) one.
# The two flags are always complementary.
our_approach = False
Sota = not our_approach

# Name of the Sota approach to load; left as None when our approach is evaluated
approach = None
if Sota:
    approach = 'SIMOD'

print(f'Case study: {case_study}, our approach = {our_approach}')

Case study: bpi17, our approach = False


In [10]:
# Real (test) log: parse the XES file and flatten it into a DataFrame
log_real = pm4py.convert_to_dataframe(
    xes_importer.apply(f'data/{case_study}/logTest.xes')
)

if our_approach:
    log_sim = pd.read_csv(f'simulations/{case_study}/sim_0.csv')
    print('imported log from our approach')

elif Sota:
    # Location of the simulated log for each approach that has a dedicated folder layout
    known_sim_paths = {
        'RIMS': f'RIMS/{case_study}/results/rims/sim.csv',
        'AgentSimulator': f'AgentSimulator/{case_study}/main_results/sim.csv',
        'SIMOD': f'SIMOD/{case_study}/simulated_log_0.csv',
        'DSIM': f'DSIM/results/generated_logs/{case_study}/DSIM/sim.csv',
    }
    is_known_approach = approach in known_sim_paths

    if is_known_approach:
        log_sim = pd.read_csv(known_sim_paths[approach])
    else:
        # Any other approach is read from the generic Sota folder
        log_sim = pd.read_csv(f'Sota/{case_study}/{approach}/sim.csv')

    if approach == 'RIMS':
        # Drop the extra columns of the RIMS output
        for extra_column in ('st_wip', 'queue', 'st_tsk_wip'):
            del log_sim[extra_column]

    if is_known_approach:
        print('imported log from Sota, approach:', approach)

    # Map the column names used by the different simulators onto the XES-style names
    # used in the rest of the notebook. errors='ignore' is kept because some logs
    # (e.g. the one from Francesca Meneghello et al.) already have correctly named columns.
    column_aliases = {
        'end_timestamp': 'time:timestamp',
        'start_timestamp': 'start:timestamp',
        'task': 'concept:name',
        'caseid': 'case:concept:name',
        'end_time': 'time:timestamp',
        'start_time': 'start:timestamp',
        'activity': 'concept:name',
        'case_id': 'case:concept:name',
        'resource': 'org:resource',
    }
    log_sim = log_sim.rename(columns=column_aliases, errors='ignore')

    print('imported log from Sota, approach:', approach, ',columns renamed')

parsing log, completed traces :: 100%|██████████| 5771/5771 [00:02<00:00, 2512.21it/s]


imported log from Sota, approach: SIMOD
imported log from Sota, approach: SIMOD ,columns renamed


In [11]:
# Logs that still carry a 'lifecycle:transition' column are passed through
# convert_log_from_lc_to_se. The converted frame is then renamed from 'START'/'END'
# to the 'start:timestamp'/'time:timestamp' columns that the rest of the notebook
# reads; errors='ignore' turns the rename into a no-op when those columns are absent.
lifecycle_renames = {'START': 'start:timestamp', 'END': 'time:timestamp'}

if 'lifecycle:transition' in log_real.columns:
    log_real = (
        convert_log_from_lc_to_se(log_real)
        .rename(columns=lifecycle_renames, errors='ignore')
        .reset_index()
    )
if 'lifecycle:transition' in log_sim.columns:
    # Apply the same renaming as for the real log: without it a converted simulated
    # log would not expose the 'start:timestamp'/'time:timestamp' columns used below.
    log_sim = (
        convert_log_from_lc_to_se(log_sim)
        .rename(columns=lifecycle_renames, errors='ignore')
        .reset_index()
    )

100%|██████████| 5771/5771 [01:28<00:00, 65.30it/s]


In [15]:
# Set event log column ID mapping.
# The same mapping object is passed for both the real and the simulated log in every
# distance call below, so both frames must expose these column names.
event_log_ids = EventLogIDs(
    case="case:concept:name",  # case identifier column
    activity="concept:name",  # activity label column
    start_time="start:timestamp",  # activity start timestamp column
    end_time="time:timestamp",  # activity end timestamp column
    resource="org:resource",  # resource column
)

In [16]:
# Parse the start/end timestamp columns of both logs with pd.to_datetime(..., utc=True).
# A previous variant truncated the simulated timestamps to their first 19 characters
# before parsing; it is currently disabled.
for event_log in (log_real, log_sim):
    for timestamp_column in (event_log_ids.start_time, event_log_ids.end_time):
        event_log[timestamp_column] = pd.to_datetime(event_log[timestamp_column], utc=True)

In [17]:
# Empty container that collects one entry per distance measure (measure name -> value)
distances = dict()

In [9]:
from log_distance_measures.circadian_workforce_distribution import circadian_workforce_distribution_distance

# Circadian workforce distribution (CWD) distance between the real and simulated logs.
# The value is only printed here; it is not added to the `distances` dictionary.
cwd_distance = circadian_workforce_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
)
print('case study is: ', case_study, 'CWD is: ', cwd_distance)

case study is:  bpi17 CWD is:  1.9388987368576223


### Generated Attributes Distribution Distance

In [10]:
# Attribute distance between the real and simulated logs, computed by emd_attributes.
# NOTE(review): attr_names is an empty list here and the recorded output of this cell
# is nan — presumably no attribute is being compared; confirm which names should be passed.
distance = emd_attributes(log_real, log_sim, attr_names=[])
print('Attr distance: ', distance)
# Fill the distances dictionary (note: a nan value is stored as-is)
distances['emd_attributes'] = distance

Attr distance:  nan


<b>WARNING: It may take a long time</b>

### Control-flow Log Distance

In [None]:
# Control-flow log distance between the real and the simulated log.
# Call passing the event logs, and their column ID mappings.
# The result is kept in the shared `distance` variable, which the next cell prints and stores.
distance = control_flow_log_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
)

In [None]:
# `distance` is reused by every measure: run this right after the control-flow compute cell
print('CF Log distance: ', distance)
# Fill the distances dictionary
distances['control_flow_log_distance'] = distance


CF Log distance:  0.5878994838548787


### N-Gram Distribution Distance

In [None]:
# n-gram size, passed as `n` to n_gram_distribution_distance in the next cell
n_gram = 3

In [None]:
# N-gram distribution distance between the real and the simulated log.
# Call passing the event logs, their column ID mappings, and the n-gram size.
distance = n_gram_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    n=n_gram  # n-gram size set in the previous cell
)

In [None]:
# `distance` is reused by every measure: run this right after the n-gram compute cell
print('N-Gram distr. distance: ', distance)
# Fill the distances dictionary
distances['n_gram_distribution_distance'] = distance

N-Gram distr. distance:  0.9762775141946022


### Absolute Event Distribution Distance

In [120]:
# EMD of the (END) timestamps distribution. Timestamps are discretized with
# discretize_to_hour, i.e. the bins are per hour (an earlier note said per minute,
# which does not match the argument passed below).
distance = absolute_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.END,  # Only the end timestamps are considered
    discretize_event=discretize_to_hour  # Function used to discretize each timestamp
)

In [121]:
# `distance` is reused by every measure: run this right after the absolute-event compute cell
print('Absolute Event Distribution Distance: ', distance)
# Fill the distances dictionary
distances['absolute_event_distribution_distance'] = distance

Absolute Event Distribution Distance:  79222.95631580324


### Case Arrival Distribution Distance

In [122]:
# Case arrival distribution distance between the real and the simulated log
distance = case_arrival_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_event=discretize_to_hour  # Function to discretize each timestamp (default by hour)
)

In [123]:
# `distance` is reused by every measure: run this right after the case-arrival compute cell
print('Case Arrival distr distance: ', distance)
# Fill the distances dictionary
distances['case_arrival_distribution_distance'] = distance

Case Arrival distr distance:  227.30366492146598


### Circadian Event Distribution Distance

In [124]:
# Circadian event distribution distance between the real and the simulated log
distance = circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [125]:
# `distance` is reused by every measure: run this right after the circadian-event compute cell
print('Circadian Event distr distance: ', distance)
# Fill the distances dictionary
distances['circadian_event_distribution_distance'] = distance

Circadian Event distr distance:  4.130469247980372


### Resource-Based Circadian Event Distribution Distance


In [126]:
# Resource-based circadian event distribution distance (project helper from src.res_based_ced),
# called with the same argument layout as circadian_event_distribution_distance
distance = resource_based_circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [127]:
# `distance` is reused by every measure: run this right after the resource-based compute cell
print('Resource-Based Circadian Event distr distance: ', distance)
# Fill the distances dictionary
distances['resource_based_circadian_event_distribution_distance'] = distance

Resource-Based Circadian Event distr distance:  8.168591247366653


### Relative Event Distribution Distance

In [128]:
# Relative event distribution distance between the real and the simulated log.
# Call passing the event logs, their column ID mappings, timestamp type, and discretize function.
distance = relative_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH,  # Type of timestamp distribution (consider start times and/or end times)
    discretize_event=discretize_to_hour  # Function to discretize the absolute seconds of each timestamp (default by hour)
)

In [129]:
# `distance` is reused by every measure: run this right after the relative-event compute cell
print('Relative Event distr distance: ', distance)
# Fill the distances dictionary
distances['relative_event_distribution_distance'] = distance

Relative Event distr distance:  79435.52090004024


### Work in Progress Distance

<b>WARNING: It may take a long time</b>

In [130]:
# # Call passing the event logs, its column ID mappings, timestamp type, and discretize function
# distance = work_in_progress_distance(
#     log_real, event_log_ids,  # First event log and its column id mappings
#     log_sim, event_log_ids,  # Second event log and its column id mappings
#     window_size=pd.Timedelta(hours=1)  # Bins of 1 hour
# )

In [131]:
# print('Work in Progress distance: ', distance)
# Fill the distances dictionary
# distances['work_in_progress_distance'] = distance

### Cycle Time Distribution Distance

In [132]:
# Cycle time distribution distance between the real and the simulated log
distance = cycle_time_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    bin_size=pd.Timedelta(hours=1)  # Bins of 1 hour
)

In [133]:
# `distance` is reused by every measure: run this right after the cycle-time compute cell
print('Cycle Time distr distance: ', distance)
# Fill the distances dictionary
distances['cycle_time_distribution_distance'] = distance

Cycle Time distr distance:  62797.13089005235


### Save them

In [134]:
print(f'case study is {case_study} for approach {approach}, with our approach {our_approach} and Sota {Sota}')

# Persist the distances dictionary as a pickle file.
# NOTE(review): the target path depends only on `case_study`, not on `approach` or
# `our_approach`, so runs for different approaches write to the same file — confirm
# this is intended.
pd.to_pickle(distances, f'simulations/{case_study}/metrics.pkl')

# Print one "<measure name> <value>" line per collected distance
for measure_name in distances:
    print(measure_name, distances[measure_name])



case study is Consulta for approach SIMOD, with our approach False and Sota True
emd_attributes nan
control_flow_log_distance 0.5878994838548787
n_gram_distribution_distance 0.9762775141946022
absolute_event_distribution_distance 79222.95631580324
case_arrival_distribution_distance 227.30366492146598
circadian_event_distribution_distance 4.130469247980372
resource_based_circadian_event_distribution_distance 8.168591247366653
relative_event_distribution_distance 79435.52090004024
cycle_time_distribution_distance 62797.13089005235


### Convert the .xes logs to .csv for evaluating entropy

In [None]:
import os 
import glob
from src.eventlog_utils import convert_log_from_lc_to_se
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer

# Resolve the data folder from the current working directory instead of a
# machine-specific absolute path, and do not os.chdir into it: changing the working
# directory made this cell fail when run a second time and shifted every relative
# path used by cells executed afterwards.
directory = os.path.abspath('data')

print('Folder s in directory:', directory)
# Every sub-folder of the data directory is scanned for .xes files
for folder_path in glob.glob(os.path.join(glob.escape(directory), '*')):
    if not os.path.isdir(folder_path):
        continue
    folder = os.path.basename(folder_path)
    print(folder)
    for file in glob.glob(os.path.join(glob.escape(folder_path), '*.xes')):
        log = pm4py.convert_to_dataframe(xes_importer.apply(file))
        if 'lifecycle:transition' in log.columns:
            # Same conversion and column renaming as applied to the real log above
            log = (
                convert_log_from_lc_to_se(log)
                .rename(columns={'START': 'start:timestamp', 'END': 'time:timestamp'}, errors='ignore')
                .reset_index()
            )
        # Save the log next to the source file, swapping only the extension for .csv
        csv_path = os.path.splitext(file)[0] + '.csv'
        log.to_csv(csv_path, index=False)
        print(f'Saved {file} to {csv_path}')

In [3]:
import time
import subprocess

# Case studies to run, numbered from 1 in this order
case_studies = dict(enumerate(
    ['Purchasing', 'Production', 'Consulta', 'bpi12', 'bpi17', 'sepsis', 'rtf', 'bpi19'],
    start=1,
))

# Folder holding one "<case study>.yml" SIMOD configuration per case study
base_path = "/home/padela/Scrivania/ProbabilityBasedEventLogGenerator/simod_new"

for case_name in case_studies.values():
    config_path = f"{base_path}/{case_name}.yml"

    print(f"\nRunning case study: {case_name}")

    # Wall-clock timing around the `simod` command line call; a non-zero exit status
    # is reported and the loop moves on to the next case study
    started_at = time.time()
    try:
        subprocess.run(["simod", "--configuration", config_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error while running {case_name}: {e}")
    duration = time.time() - started_at

    print(f"Execution time for {case_name}: {duration:.2f} seconds")


Running case study: Purchasing


Traceback (most recent call last):
  File "/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/.conda/bin/simod", line 5, in <module>
    from simod.cli import main
  File "/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/.conda/lib/python3.11/site-packages/simod/cli.py", line 12, in <module>
    from simod.simod import Simod
  File "/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/.conda/lib/python3.11/site-packages/simod/simod.py", line 18, in <module>
    from simod.branch_rules.discovery import discover_branch_rules, map_branch_rules_to_flows
  File "/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/.conda/lib/python3.11/site-packages/simod/branch_rules/discovery.py", line 8, in <module>
    from pix_framework.discovery.gateway_conditions.gateway_conditions import discover_gateway_conditions
  File "/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/.conda/lib/python3.11/site-packages/pix_framework/discovery/gateway_co

KeyboardInterrupt: 

In [5]:
from src.entropies import cf_entropy, compute_etd_entropy, compute_ctd_entropy
from src.eventlog_utils import convert_log_from_lc_to_se
import glob
import pandas as pd

# Approach whose simulated logs are evaluated; also the last component of `main_dir`
approach = 'SIMOD'
# NOTE(review): machine-specific absolute path — adjust when running on another machine
main_dir = f"/home/padela/Desktop/DataAug/ProbabilityBasedEventLogGenerator/{approach}"

def separate_lifecycle_transition(df):
    """
    Split a trailing '_start' / '_complete' suffix off the activity name.

    For every row whose 'concept:name' ends with '_start' or '_complete', the suffix
    is written to 'lifecycle:transition' and removed from 'concept:name'. Rows without
    such a suffix are left untouched. The frame is modified in place and also returned.
    """
    # Column 0 holds the bare activity name, column 1 the lifecycle suffix;
    # both are missing for rows that do not match the pattern
    parts = df['concept:name'].str.extract(r'^(.*)_(start|complete)$')
    has_suffix = parts[1].notnull()

    # Only rows that matched are rewritten
    df.loc[has_suffix, 'concept:name'] = parts[0]
    df.loc[has_suffix, 'lifecycle:transition'] = parts[1]

    return df

import os
import glob

# Work from the approach's results folder so each entry found below is a case-study folder
os.chdir(main_dir)

# Simulated-log column names mapped onto the XES-style names
sim_column_names = {
    'end_time': 'time:timestamp',
    'start_time': 'start:timestamp',
    'activity': 'concept:name',
    'case_id': 'case:concept:name',
    'resource': 'org:resource',
}

# The control-flow entropies (cf_entropy) and the lifecycle splitting/conversion steps
# are currently disabled; only the two time entropies are reported.
for folder in glob.glob('*'):
    try:
        log = pd.read_csv(f'{folder}/simulated_log_0.csv', sep=',').rename(
            columns=sim_column_names, errors='ignore'
        )
        print(f'{folder} Time Cycle: ', compute_ctd_entropy(log))
        print(f'{folder} Time Act: ', compute_etd_entropy(log))
    except Exception as e:
        # Best effort: report the failing folder (e.g. a missing simulated log) and go on
        print(f'Error processing {folder}: {e}')
    print('\n')



Purchasing Time Cycle:  2.029741090810826
Purchasing Time Act:  0.7476869516284058


Consulta Time Cycle:  1.0876320807353892
Consulta Time Act:  0.619089921860491


sepsis Time Cycle:  2.129605053558443
sepsis Time Act:  0.0


Error processing bpi19: [Errno 2] No such file or directory: 'bpi19/simulated_log_0.csv'


Error processing Production: [Errno 2] No such file or directory: 'Production/simulated_log_0.csv'


bpi12 Time Cycle:  2.900298553067816
bpi12 Time Act:  1.7172184450619847


bpi17 Time Cycle:  3.3953641762340796
bpi17 Time Act:  1.9501820935452605


Error processing rtf: [Errno 2] No such file or directory: 'rtf/simulated_log_0.csv'


