In [1]:
import pandas as pd
import pm4py
import os
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.objects.conversion.process_tree import converter as pt_converter

import matplotlib.pyplot as plt

In [2]:
# def prepare_dataset(file_path):
#     with open(file_path, "r") as file:
#         lines = file.readlines()

#     df = pd.DataFrame([line.split() for line in lines], columns=["Date", "Time", "org:resource", "lifecycle:transition", "concept:name"])

#     df["time:timestamp"] = pd.to_datetime(df["Date"] + " " + df["Time"], format='ISO8601')
#     df['case:concept:name'] = df['time:timestamp'].dt.date.astype(str)

#     df = df.drop(columns=["Date", "Time"])

#     return df[["case:concept:name", "time:timestamp", "concept:name", "org:resource", "lifecycle:transition"]]

In [3]:
# def prepare_dataset(file_path):
#     with open(file_path, "r") as file:
#         lines = file.readlines()

#     df = pd.DataFrame([line.split() for line in lines], columns=["Date", "Time", "org:resource", "lifecycle:transition", "concept:name"])
    
#     # Convert to datetime
#     df["time:timestamp"] = pd.to_datetime(df["Date"] + " " + df["Time"], format='ISO8601')
    
#     # Sort by timestamp
#     df = df.sort_values('time:timestamp')
    
#     # Calculate time differences between consecutive events
#     time_diff = df['time:timestamp'].diff()
    
#     # Create new session when:
#     # 1. Gap between events exceeds threshold (e.g., 4 hours)
#     # 2. A new Sleep ON event starts
#     new_session = (
#         (time_diff > pd.Timedelta(hours=4)) | 
#         ((df['concept:name'] == 'Sleep') & (df['lifecycle:transition'] == 'ON'))
#     )
    
#     # Create case IDs based on sessions
#     df['case:concept:name'] = new_session.cumsum().astype(str)
    
#     # Keep required columns in correct order
#     df = df[[
#         'case:concept:name',
#         'time:timestamp',
#         'concept:name',
#         'org:resource',
#         'lifecycle:transition'
#     ]]

#     print(df['concept:name'].unique())
#     print(df['org:resource'].unique())
    
#     # Verify dtypes
#     print("DataFrame dtypes after preparation:", df.dtypes)
#     print("\nFirst few rows of prepared data:")
#     print(df.head())
    
#     return df

In [4]:
def prepare_dataset(file_path):
    with open(file_path, "r") as file:
        lines = file.readlines()

    df = pd.DataFrame([line.split() for line in lines], 
                     columns=["Date", "Time", "org:resource", 
                             "lifecycle:transition", "concept:name"])
    
    # Convert to datetime and sort chronologically
    df["time:timestamp"] = pd.to_datetime(df["Date"] + " " + df["Time"], 
                                        format='ISO8601')
    df = df.sort_values('time:timestamp')
    
    # Calculate time differences between consecutive events
    time_diff = df['time:timestamp'].diff()
    
    # Start new session when:
    # 1. There's a gap of more than 4 hours between activities
    # 2. A Sleep activity starts
    new_session = (
        (time_diff > pd.Timedelta(hours=4)) | 
        ((df['concept:name'] == 'Sleep') & (df['lifecycle:transition'] == 'ON'))
    )
    
    # Create session IDs
    df['case:concept:name'] = new_session.cumsum().astype(str)
    
    # Drop unused columns and reorder
    df = df.drop(columns=["Date", "Time"])
    
    return df[["case:concept:name", "time:timestamp", "concept:name", 
               "org:resource", "lifecycle:transition"]]

In [5]:
def apply_process_mining(log):
    # Apply different miners

    alpha_net, alpha_initial_marking, alpha_final_marking = alpha_miner.apply(log)
    heuristic_net, heu_initial_marking, heu_final_marking = heuristics_miner.apply(log)
    inductive_tree = inductive_miner.apply(log)
    inductive_net, ind_initial_marking, ind_final_marking = pt_converter.apply(inductive_tree)

    models = {
        'Alpha': (alpha_net, alpha_initial_marking, alpha_final_marking),
        'Heuristic': (heuristic_net, heu_initial_marking, heu_final_marking),
        'Inductive': (inductive_net, ind_initial_marking, ind_final_marking)
    }
    
    return models

In [6]:
def calculate_metrics(log, models):
    metrics = {}
    
    for name, (net, initial_marking, final_marking) in models.items():
        fitness = replay_fitness_evaluator.apply(log, net, initial_marking, final_marking)
        precision = precision_evaluator.apply(log, net, initial_marking, final_marking)
        generalization = generalization_evaluator.apply(log, net, initial_marking, final_marking)
        simplicity = simplicity_evaluator.apply(net)
        
        metrics[name] = {
            'Fitness': fitness['average_trace_fitness'],
            'Precision': precision,
            'Generalization': generalization,
            'Simplicity': simplicity
        }
    
    return metrics

In [7]:
# def plot_metrics_comparison(metrics):
#     metrics_df = pd.DataFrame(metrics).T
    
#     plt.figure(figsize=(12, 6))
#     metrics_df.plot(kind='bar', width=0.8)
#     plt.title('Process Mining Metrics Comparison')
#     plt.xlabel('Miners')
#     plt.ylabel('Score')
#     plt.legend(title='Metrics')
#     plt.tight_layout()
#     plt.savefig('metrics_comparison.png')
#     plt.close()

def plot_metrics_comparison(metrics, figures_folder):
    # Convert metrics to DataFrame
    metrics_df = pd.DataFrame(metrics).T
    
    # Create figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))
    
    # Plot bars
    metrics_df.plot(kind='bar', width=0.8, ax=ax)
    
    # Add value labels on top of each bar
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', padding=3)
    
    # Customize plot
    plt.title('Process Mining Metrics Comparison')
    plt.xlabel('Miners')
    plt.ylabel('Score')
    plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(right=0.85, bottom=0.15)
    
    # Save plot
    output_path = os.path.join(figures_folder, f"metrics_comparison.png")
    plt.savefig(output_path, bbox_inches='tight', dpi=300)
    plt.close()
    
    # Print numerical values in console
    print("\nNumerical Metrics:")
    print(metrics_df.round(3).to_string())
    return metrics_df

In [8]:
def visualize_petri_nets(models, figures_folder):
    os.makedirs(figures_folder, exist_ok=True)
    for name, (net, initial_marking, final_marking) in models.items():
        gviz = pn_visualizer.apply(net, initial_marking, final_marking)
        output_path = os.path.join(figures_folder, f"petri_net_{name.lower()}.png")
        pn_visualizer.save(gviz, output_path)

In [9]:
# def analyze_user_habits(log):
#     # Extract activity patterns
#     activities_by_day = {}
#     for trace in log:
#         day = trace.attributes['concept:name']
#         activities = [event['concept:name'] for event in trace]
#         activities_by_day[day] = activities
    
#     # Analyze common patterns
#     common_sequences = {}
#     for day, activities in activities_by_day.items():
#         sequence = ' -> '.join(activities)
#         common_sequences[sequence] = common_sequences.get(sequence, 0) + 1
    
#     return common_sequences

In [10]:
# def analyze_user_habits(log):
#     # Extract activity patterns by session
#     activities_by_session = {}
#     for trace in log:
#         session_id = trace.attributes['concept:name']
#         # Create list of (activity, state) pairs in temporal order
#         activities = [(event['concept:name'], event['lifecycle:transition']) 
#                      for event in sorted(trace, key=lambda x: x['time:timestamp'])]
#         activities_by_session[session_id] = activities
    
#     # Analyze common patterns
#     common_sequences = {}
#     for session_id, activities in activities_by_session.items():
#         # Create meaningful sequence string with state transitions
#         sequence_parts = []
#         current_activity = None
        
#         for activity, state in activities:
#             # Only add to sequence when activity changes or when same activity has different state
#             if current_activity != activity:
#                 sequence_parts.append(f"{activity}({state})")
#                 current_activity = activity
#             else:
#                 sequence_parts.append(f"→{state}")
        
#         sequence = ' -> '.join(sequence_parts)
#         common_sequences[sequence] = common_sequences.get(sequence, 0) + 1
    
#     # Sort by frequency
#     sorted_sequences = dict(sorted(common_sequences.items(), 
#                                  key=lambda x: x[1], 
#                                  reverse=True))
    
#         # Save habits analysis
#     with open('user_habits_analysis.txt', 'w') as f:
#         for sequence, count in sorted(sorted_sequences.items(), key=lambda x: x[1], reverse=True):
#             f.write(f'Frequency: {count}\nSequence: {sequence}\n\n')
    
#     return sorted_sequences

In [11]:
def analyze_user_habits(log, processed_folder):
    # Extract activity patterns by session
    activities_by_session = {}
    for trace in log:
        session_id = trace.attributes['concept:name']
        # Create list of (activity, state) pairs in temporal order
        activities = [(event['concept:name'], event['lifecycle:transition']) 
                     for event in sorted(trace, key=lambda x: x['time:timestamp'])]
        activities_by_session[session_id] = activities
    
    # Analyze common patterns
    common_sequences = {}
    for session_id, activities in activities_by_session.items():
        # Create meaningful sequence string with state transitions
        sequence_parts = []
        current_activity = None
        
        for activity, state in activities:
            # Only add new activities when they start (ON state)
            if current_activity != activity and state == 'ON':
                sequence_parts.append(activity)
                current_activity = activity
        
        # Only create sequence if there are activities
        if sequence_parts:
            sequence = ' -> '.join(sequence_parts)
            common_sequences[sequence] = common_sequences.get(sequence, 0) + 1
    
    # Sort by frequency
    sorted_sequences = dict(sorted(common_sequences.items(), 
                                 key=lambda x: x[1], 
                                 reverse=True))
    
    output_file = os.path.join(processed_folder, "user_habits_analysis.txt")

    with open(output_file, 'w', encoding='utf-8') as f:
        for sequence, count in sorted(sorted_sequences.items(), key=lambda x: x[1], reverse=True):
            f.write(f'Frequency: {count}\nSequence: {sequence}\n\n')
    
    return sorted_sequences

In [12]:
# Load dataset

input_file = os.path.join('data', 'raw', 'tm001.txt')

df = prepare_dataset(input_file)
df_stratified = df.groupby('concept:name').apply(lambda x: x.sample(2)).reset_index(drop=True)

# Convert to event log
log = pm4py.convert_to_event_log(df_stratified)

  df_stratified = df.groupby('concept:name').apply(lambda x: x.sample(2)).reset_index(drop=True)


In [13]:
# Apply process mining
models = apply_process_mining(log)

In [14]:
# Calculate metrics
metrics = calculate_metrics(log, models)

aligning log, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/29 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/29 [00:00<?, ?it/s]

aligning log, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

computing precision with alignments, completed variants ::   0%|          | 0/29 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/29 [00:00<?, ?it/s]

In [15]:
figures_folder = os.path.join('resources', 'figures')

# Plot comparisons
plot_metrics_comparison(metrics, figures_folder)

# Visualize Petri nets
visualize_petri_nets(models, figures_folder)


Numerical Metrics:
           Fitness  Precision  Generalization  Simplicity
Alpha        0.317      0.685           0.293       0.417
Heuristic    0.758      0.989           0.092       0.628
Inductive    1.000      0.773           0.283       0.600


In [16]:
# Analyze user habits
user_habits_path = os.path.join('data', 'processed')
habits = analyze_user_habits(log, user_habits_path)