In [66]:
import pandas as pd
from src.eventlog_utils import convert_log
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from log_distance_measures.config import EventLogIDs, AbsoluteTimestampType, discretize_to_hour
from log_distance_measures.control_flow_log_distance import control_flow_log_distance
from log_distance_measures.n_gram_distribution import n_gram_distribution_distance
from log_distance_measures.absolute_event_distribution import absolute_event_distribution_distance
from log_distance_measures.case_arrival_distribution import case_arrival_distribution_distance
from log_distance_measures.circadian_event_distribution import circadian_event_distribution_distance
from log_distance_measures.relative_event_distribution import relative_event_distribution_distance
from log_distance_measures.work_in_progress import work_in_progress_distance
from log_distance_measures.cycle_time_distribution import cycle_time_distribution_distance
from src.res_based_ced import resource_based_circadian_event_distribution_distance
from distance_utils import emd_attributes

In [67]:
case_studies = {
    1: 'Purchasing',
    2: 'Production',
    3: 'Consulta',
    4: 'bpi12',
    5: 'bpi17',
    6: 'sepsis',
    7: 'rtf',
    8: 'bpi19'
}

In [68]:
# chose case study
case_study = case_studies[2]

# Choose if our approach or Sota
our_approach = True
Sota = not(our_approach)
approach = 'SIMOD' if Sota else None
print(f'Case study: {case_study}')

Case study: Production


In [69]:
log_real = xes_importer.apply(f'data/{case_study}/logTest.xes')
log_real = pm4py.convert_to_dataframe(log_real)

if our_approach:
    log_sim = pd.read_csv(f'simulations/{case_study}/sim_0.csv')
    print('imported log from our approach')

elif Sota:
    log_sim = pd.read_csv(f'Sota/{case_study}/{approach}/sim.csv')
    
    #Rename the columns named 'end_timestamp' and 'start_timestamp' with 'time:timestamp' and 'start:timestamp'
    log_sim = log_sim.rename(columns={'end_timestamp': 'time:timestamp', 
                                      'start_timestamp': 'start:timestamp', 
                                      'task':'concept:name',
                                      'caseid': 'case:concept:name'}, errors='ignore')
    
    # Errors='ignore' has been set because the log from francesca meneghello et al has some columns already correctly

    print('imported log from Sota, approach:', approach, ',columns renamed')

parsing log, completed traces :: 100%|██████████| 45/45 [00:00<00:00, 1878.35it/s]

imported log from our approach





In [70]:
# convert event log format lifecycles
if 'lifecycle:transition' in log_real.columns:
    log_real = convert_log(log_real)
    log_real.rename(columns={'START': 'start:timestamp', 'END': 'time:timestamp'}, errors='ignore', inplace=True)
    log_real.reset_index(inplace=True)
if 'lifecycle:transition' in log_sim.columns:
    log_sim = convert_log(log_sim)
    log_sim.reset_index(inplace=True)

100%|██████████| 45/45 [00:00<00:00, 113.14it/s]
100%|██████████| 45/45 [00:00<00:00, 60.41it/s]


In [71]:
# Set event log column ID mapping
event_log_ids = EventLogIDs(
    case="case:concept:name",
    activity="concept:name",
    start_time="start:timestamp",
    end_time="time:timestamp"
)

In [72]:
log_real[event_log_ids.start_time] = pd.to_datetime(log_real[event_log_ids.start_time], utc=True)
log_real[event_log_ids.end_time] = pd.to_datetime(log_real[event_log_ids.end_time], utc=True)

log_sim[event_log_ids.start_time] = [i[:19] for i in log_sim[event_log_ids.start_time].values]
log_sim[event_log_ids.end_time] = [i[:19] for i in log_sim[event_log_ids.end_time].values]
log_sim[event_log_ids.start_time] = pd.to_datetime(log_sim[event_log_ids.start_time], utc=True)
log_sim[event_log_ids.end_time] = pd.to_datetime(log_sim[event_log_ids.end_time], utc=True)

In [73]:
#Initialize distances dictionary
distances = {}

### Control-flow Log Distance

<b>WARNING: It may take a long time</b>

In [74]:
# Call passing the event logs, and its column ID mappings
distance = control_flow_log_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
)

In [75]:
print('CF Log distance: ', distance)

# Fill the distances dictionary
distances['control_flow_log_distance'] = distance


CF Log distance:  0.6157317421428391


### N-Gram Distribution Distance

In [76]:
n_gram = 3

In [77]:
# Call passing the event logs, and its column ID mappings
distance = n_gram_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    n=n_gram
)

In [78]:
print('N-Gram distr. distance: ', distance)
# Fill the distances dictionary
distances['n_gram_distribution_distance'] = distance

N-Gram distr. distance:  0.5501193317422435


### Absolute Event Distribution Distance

In [79]:
# EMD of the (END) timestamps distribution where each bin represents a minute
distance = absolute_event_distribution_distance(
    log_real, event_log_ids,
    log_sim, event_log_ids,
    discretize_type=AbsoluteTimestampType.END,
    discretize_event=discretize_to_hour
)

In [80]:
print('Absolute Event Distribution Distance: ', distance)
# Fill the distances dictionary
distances['absolute_event_distribution_distance'] = distance

Absolute Event Distribution Distance:  427.6620904558404


### Case Arrival Distribution Distance

In [81]:
distance = case_arrival_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_event=discretize_to_hour  # Function to discretize each timestamp (default by hour)
)

In [82]:
print('Case Arrival distr distance: ', distance)
# Fill the distances dictionary
distances['case_arrival_distribution_distance'] = distance

Case Arrival distr distance:  46.866666666666674


### Circadian Event Distribution Distance

In [83]:
distance = circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [84]:
print('Circadian Event distr distance: ', distance)
# Fill the distances dictionary 
distances['circadian_event_distribution_distance'] = distance

Circadian Event distr distance:  1.0571307579949614


### Resource-Based Circadian Event Distribution Distance


In [85]:
distance = resource_based_circadian_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH  # Consider both start/end timestamps of each activity instance
)

In [86]:
print('Resource-Based Circadian Event distr distance: ', distance)
# Fill the distances dictionary 
distances['resource_based_circadian_event_distribution_distance'] = distance

Resource-Based Circadian Event distr distance:  6.8550158615266685


### Relative Event Distribution Distance

In [87]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
distance = relative_event_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    discretize_type=AbsoluteTimestampType.BOTH,  # Type of timestamp distribution (consider start times and/or end times)
    discretize_event=discretize_to_hour  # Function to discretize the absolute seconds of each timestamp (default by hour)
)

In [88]:
print('Relative Event distr distance: ', distance)
# Fill the distances dictionary
distances['relative_event_distribution_distance'] = distance

Relative Event distr distance:  398.7212695868946


### Work in Progress Distance

<b>WARNING: It may take a long time</b>

In [89]:
# Call passing the event logs, its column ID mappings, timestamp type, and discretize function
# distance = work_in_progress_distance(
#     log_real, event_log_ids,  # First event log and its column id mappings
#     log_sim, event_log_ids,  # Second event log and its column id mappings
#     window_size=pd.Timedelta(hours=1)  # Bins of 1 hour
# )

In [90]:
print('Work in Progress distance: ', distance)
# Fill the distances dictionary
# distances['work_in_progress_distance'] = distance

Work in Progress distance:  398.7212695868946


### Cycle Time Distribution Distance

In [91]:
distance = cycle_time_distribution_distance(
    log_real, event_log_ids,  # First event log and its column id mappings
    log_sim, event_log_ids,  # Second event log and its column id mappings
    bin_size=pd.Timedelta(hours=1)  # Bins of 1 minute
)

In [92]:
print('Cycle Time distr distance: ', distance)
# Fill the distances dictionary
distances['cycle_time_distribution_distance'] = distance

Cycle Time distr distance:  441.71111111111105


### Generateed Attributes Distribution Distance

In [93]:
distance = emd_attributes(log_real, log_sim, attr_names=['Work Order  Qty', 'Part Desc.', 'Report Type', 'Qty Completed', 'Qty Rejected', 'Qty for MRB', 'Rework'], fillna=True)
print('Attr distance: ', distance)
# Fill the distances dictionary
distances['emd_attributes'] = distance

Attr distance:  2.9926358363858365


### Save them

In [94]:
print(f'case study is {case_study} for approach {approach}, with our approach {our_approach} and Sota {Sota}')
# Save the distances dictionary to a file with pkl
pd.to_pickle(distances, f'simulations/{case_study}/metrics.pkl')

case study is Production for approach None, with our approach True and Sota False


In [1]:
from src.preprocess_utils import add_start_end_times

log_real = add_start_end_times(log_path='/home/padela/Scrivania/ProbabilityBasedEventLogGenerator/data/sepsis/sepsis.xes')

parsing log, completed traces ::   0%|          | 0/1050 [00:00<?, ?it/s]

In [2]:
log_real

Unnamed: 0,time:timestamp,concept:name,org:resource,case:concept:name
0,2014-10-22 11:15:41+00:00,ER Registration,A,A
1,2014-10-22 11:27:00+00:00,Leucocytes,B,A
2,2014-10-22 11:27:00+00:00,CRP,B,A
3,2014-10-22 11:27:00+00:00,LacticAcid,B,A
4,2014-10-22 11:33:37+00:00,ER Triage,C,A
...,...,...,...,...
15209,2014-12-16 07:00:00+00:00,CRP,B,KNA
15210,2014-12-16 17:00:00+00:00,Release A,E,KNA
15211,2014-12-03 10:50:28+00:00,ER Registration,L,LNA
15212,2014-12-03 10:54:19+00:00,ER Triage,C,LNA


In [3]:
log_real.to_csv('/home/padela/Scrivania/ProbabilityBasedEventLogGenerator/AgentSimulator-main/raw_data/sepsis.csv', index=True)