In [60]:
import pm4py
import statistics as stat
from datetime import datetime

# Preliminary Analysis

### Loading the Data

In [61]:
# Loading log 1
log_1 = pm4py.read_xes(str(os.getcwd()) + "/input-logs/L1.xes.gz")
event_log_1 = pm4py.convert_to_event_log(log_1)

# Loading log 1
log_2 = pm4py.read_xes(str(os.getcwd()) + "/input-logs/L2.xes.gz")
event_log_2 = pm4py.convert_to_event_log(log_2)

parsing log, completed traces ::   0%|          | 0/14159 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

### Creating procedure for displaying statistics

In [62]:
"""
    Computes and prints log statistics for a given branch.
    
    Parameters:
    - branch_name: Name of the branch (e.g., "Branch 1", "Branch 2").
    - current_log: The event log for the branch.
    - current_event_log: The event log for the branch.
"""
def print_log_statistics(branch_name, current_log, current_event_log):
    
    # Calculate case durations
    all_case_durations = pm4py.get_all_case_durations(current_log)
    
    # General statistics
    num_cases = len(current_event_log)
    num_events = len(current_log)
    num_variants = len(pm4py.get_variants(current_log))
    
    # Calculate total events
    total_event_count = 0
    for order in current_event_log:
        total_event_count += len(order)
    
    # Calculate average number of events per case
    average_events_per_case = total_event_count / num_variants
    
    # Activity and resource statistics
    unique_activities = len(set(current_log["concept:name"]))
    unique_resources = len(set(current_log["org:resource"]))

    # Event timestamp information
    earliest_event = min(current_log["time:timestamp"]).strftime("%d.%m.%Y %H:%M:%S")
    latest_event = max(current_log["time:timestamp"]).strftime("%d.%m.%Y %H:%M:%S")

    # Trace duration statistics
    longest_trace = max(all_case_durations)
    shortest_trace = min(all_case_durations)
    median_trace_duration = stat.median(all_case_durations)
    mean_trace_duration = stat.mean(all_case_durations)

    # Print results
    print(f"--- Statistics for {branch_name} ---")
    print(f"There are {num_cases} cases")
    print(f"There are {num_events} events")
    print(f"There are {num_variants} variants")
    print(f"The average number of events per case is {average_events_per_case:.2f}\n")
    print(f"There are {unique_activities} unique activities")
    print(f"There are {unique_resources} unique resources")
    print(f"The earliest event happened on {earliest_event}")
    print(f"The latest event happened on {latest_event}\n")
    print(f"Longest trace duration: {longest_trace:.2f}")
    print(f"Shortest trace duration: {shortest_trace:.2f}")
    print(f"Median trace duration: {median_trace_duration:.2f}")
    print(f"Mean trace duration: {mean_trace_duration:.2f}")

### Statistics for Branch One

In [63]:
print_log_statistics("Branch 1", log_1, event_log_1)

--- Statistics for Branch 1 ---
There are 14159 cases
There are 527123 events
There are 7697 variants
The average number of events per case is 68.48

There are 25 unique activities
There are 124 unique resources
The earliest event happened on 01.01.2022 02:38:41
The latest event happened on 15.06.2022 20:49:46

Longest trace duration: 9934804.29
Shortest trace duration: 281.70
Median trace duration: 1628876.10
Mean trace duration: 1858220.64


### Statistics for Branch Two


In [64]:
print_log_statistics("Branch 2", log_2, event_log_2)

--- Statistics for Branch 2 ---
There are 13087 cases
There are 262200 events
There are 4366 variants
The average number of events per case is 60.05

There are 24 unique activities
There are 69 unique resources
The earliest event happened on 01.01.2022 01:38:44
The latest event happened on 15.06.2022 18:04:54

Longest trace duration: 11855936.01
Shortest trace duration: 1.85
Median trace duration: 69857.43
Mean trace duration: 745078.36


### Unique Activities on both Branches

In [65]:
activities_branch_1 = set(log_1["concept:name"])
activities_branch_2 = set(log_2["concept:name"])

print("There are " + str(len(activities_branch_1.difference(activities_branch_2))) + " unique activites on branch 1")
print("\nThere are " + str(len(activities_branch_2.difference(activities_branch_1))) + " unique activites on branch 2")

There are 8 unique activites on branch 1

There are 7 unique activites on branch 2
