In [33]:
import pandas as pd
import numpy as np 


In [34]:
# Load the synthetic triage event log from a CSV file (path is relative to the
# notebook's working directory).
df_event_log = pd.read_csv('synthetic_triage_eventlog.csv')
# Print a plain-text preview of the first 10 events.
print(df_event_log.head(10))

     Case ID Activity                    Activity Label       time:timestamp  \
0  202501011        A                Initial Assessment  2025-01-01T10:26:00   
1  202501011        B                     Immediate CPR  2025-01-01T10:44:00   
2  202501011        C      Stabilization and Monitoring  2025-01-01T11:11:00   
3  202501011        E   Further Diagnostics and Imaging  2025-01-01T12:07:00   
4  202501011        G  Treatment and Monitor Conditions  2025-01-01T12:19:00   
5  202501011        H               Transfer to the ICU  2025-01-01T13:12:00   
6  202501012        A                Initial Assessment  2025-01-01T08:49:00   
7  202501012        B                     Immediate CPR  2025-01-01T09:47:00   
8  202501012        C      Stabilization and Monitoring  2025-01-01T10:17:00   
9  202501012        E   Further Diagnostics and Imaging  2025-01-01T10:51:00   

   Heart_Rate  Oxygen_Saturation Blood_Pressure  Age Consciousness_Level  \
0          63               87.7         12

In [35]:
# List the column names to see which attributes the event log provides.
print(df_event_log.columns)

Index(['Case ID', 'Activity', 'Activity Label', 'time:timestamp', 'Heart_Rate',
       'Oxygen_Saturation', 'Blood_Pressure', 'Age', 'Consciousness_Level',
       'PHI', 'Imaging_Results'],
      dtype='object')


In [36]:
# ------------------------------
# --- Decision Point Labeling (DP1–DP4)
# ------------------------------

# Step 1: Parse the timestamp strings into datetimes, then order the events
# chronologically within each case so that "next activity" lookups are valid.
df_event_log = (
    df_event_log
    .assign(**{'time:timestamp': pd.to_datetime(df_event_log['time:timestamp'])})
    .sort_values(by=['Case ID', 'time:timestamp'])
)

# Step 2: Define a function to get the first activity that follows a given target activity
def get_next_after(activity_list, target):
    """Return the activity that directly follows the first `target` in a trace.

    Parameters
    ----------
    activity_list : sequence
        Activities of one case, in the order they occurred.
    target : object
        Activity to look for.

    Returns
    -------
    object or None
        The element right after the first occurrence of `target`, or ``None``
        when `target` is absent or is the final element of the trace.
    """
    # Walk over (current, following) pairs; the last element has no successor,
    # so it can never produce a match.
    successors = (
        following
        for current, following in zip(activity_list, activity_list[1:])
        if current == target
    )
    return next(successors, None)

# Step 3: Initialize dictionaries to store DP labels
dp1_labels = {}            # Case ID -> first activity after A
dp2_label_dict = {}        # Row index -> DP2 label derived from the C/D pattern before E
dp3_labels = {}            # Case ID -> first activity after E
dp4_labels = {}            # Case ID -> first activity after G


def _dp2_labels_for_trace(activities):
    """Build the per-row DP2 labels for a single case.

    Parameters
    ----------
    activities : list
        Activity codes of one case in chronological order.

    Returns
    -------
    list of str or None
        One label per event ('' where no label applies), or ``None`` when the
        trace contains no activity 'E'.
    """
    if 'E' not in activities:
        return None

    # Only the part of the trace before the first E matters for DP2.
    prefix = activities[:activities.index('E')]

    # Exactly one C or D before E -> every row of the case is labelled 'E'.
    if prefix.count('C') + prefix.count('D') == 1:
        return ['E'] * len(activities)

    labels = [''] * len(activities)
    last_cd_pos = -1
    for pos, activity in enumerate(prefix):
        if activity == 'C':
            # A C is labelled with its successor only when that successor is
            # B, C or D and still lies before E.
            follower = prefix[pos + 1] if pos + 1 < len(prefix) else None
            if follower in ('B', 'C', 'D'):
                labels[pos] = follower
            last_cd_pos = pos
        elif activity == 'D':
            labels[pos] = 'E'
            last_cd_pos = pos

    # The row right after the last C/D gets 'E'. That row always exists because
    # the last C/D sits before E.
    # NOTE(review): a final C is therefore not labelled on its own row and will
    # inherit the previous label through the later forward-fill -- confirm this
    # is intended.
    if last_cd_pos >= 0:
        labels[last_cd_pos + 1] = 'E'
    return labels


# Step 4: Process each case in the event log
grouped = df_event_log.groupby('Case ID')

for case_id, group in grouped:
    trace = group['Activity'].tolist()

    # --- DP1 / DP3 / DP4: first activity after A / E / G ---
    for anchor, labels_by_case in (('A', dp1_labels), ('E', dp3_labels), ('G', dp4_labels)):
        labels_by_case[case_id] = get_next_after(trace, anchor)

    # --- DP2: labels derived from the C/D pattern before E ---
    row_labels = _dp2_labels_for_trace(trace)
    if row_labels is None:
        # No activity E in this case: it gets no DP2 entries at all.
        continue

    # Map the generated DP2 labels to their original row indices
    dp2_label_dict.update(zip(group.index.tolist(), row_labels))

# Step 5: Add all decision point labels back into the dataframe.
# DP1/DP3/DP4 are keyed by case, so every row of a case receives the same
# value; DP2 is keyed by row index.
case_ids = df_event_log['Case ID']
df_event_log['DP1_label'] = case_ids.map(dp1_labels)
df_event_log['DP2_label'] = df_event_log.index.map(dp2_label_dict)
df_event_log['DP3_label'] = case_ids.map(dp3_labels)
df_event_log['DP4_label'] = case_ids.map(dp4_labels)

# Step 6: Turn the '' placeholders into missing values, then forward-fill
# DP2_label within each case.
df_event_log['DP2_label'] = (
    df_event_log['DP2_label']
    .replace('', np.nan)
    .groupby(case_ids)
    .ffill()
)

# Preview the patient attributes next to the four decision-point labels
dp_preview_columns = [
    'Heart_Rate', 'Oxygen_Saturation', 'Blood_Pressure', 'Age',
    'Consciousness_Level', 'PHI', 'Imaging_Results',
    'DP1_label', 'DP2_label', 'DP3_label', 'DP4_label',
]
df_event_log[dp_preview_columns].head(20)

Unnamed: 0,Heart_Rate,Oxygen_Saturation,Blood_Pressure,Age,Consciousness_Level,PHI,Imaging_Results,DP1_label,DP2_label,DP3_label,DP4_label
0,63,87.7,128/52,51,Drowsy,Critical,,B,E,G,H
1,63,87.7,128/52,51,Drowsy,Critical,,B,E,G,H
2,63,87.7,128/52,51,Drowsy,Critical,,B,E,G,H
3,63,87.7,128/52,51,Drowsy,Critical,No Critical Injury Found,B,E,G,H
4,63,87.7,128/52,51,Drowsy,Critical,No Critical Injury Found,B,E,G,H
5,63,87.7,128/52,51,Drowsy,Critical,No Critical Injury Found,B,E,G,H
6,133,89.9,103/51,47,Drowsy,Critical,,B,E,F,
7,133,89.9,103/51,47,Drowsy,Critical,,B,E,F,
8,133,89.9,103/51,47,Drowsy,Critical,,B,E,F,
9,133,89.9,103/51,47,Drowsy,Critical,Confirmed Critical Injury,B,E,F,


In [37]:
# ------------------------------
# --- Convert categorical event attributes to numeric codes for the decision tree
# ------------------------------


# --- Split Blood_Pressure ---
# The part before '/' goes to bp_sys and the part after it to bp_dia, both
# stored as nullable integers.
bp_parts = df_event_log['Blood_Pressure'].str.split('/', expand=True)
df_event_log[['bp_sys', 'bp_dia']] = bp_parts.astype('Int64')

# --- Code tables for the categorical attributes ---
consciousness_map = {
    'Unconscious': 0,
    'Drowsy': 1,
    'Lethargic': 2,
    'Alert': 3,
}
phi_map = {
    'Life-Threatening': 0,
    'Critical': 1,
    'Deteriorating': 2,
    'Stable': 3,
}
imaging_map = {
    'Confirmed Critical Injury': 1,
    'No Critical Injury Found': 0,
}

# --- Map Consciousness_Level, PHI and Imaging_Results ---
# Values that are missing or not present in a code table become <NA>.
for source_col, target_col, code_map in (
    ('Consciousness_Level', 'cons_num', consciousness_map),
    ('PHI', 'PHI_num', phi_map),
    ('Imaging_Results', 'Img_res_num', imaging_map),
):
    df_event_log[target_col] = df_event_log[source_col].map(code_map).astype('Int64')

# Preview each original attribute next to its numeric encoding
encoding_preview_columns = [
    'Blood_Pressure', 'bp_sys', 'bp_dia',
    'Consciousness_Level', 'cons_num',
    'PHI', 'PHI_num',
    'Imaging_Results', 'Img_res_num',
]
df_event_log[encoding_preview_columns].head(20)


Unnamed: 0,Blood_Pressure,bp_sys,bp_dia,Consciousness_Level,cons_num,PHI,PHI_num,Imaging_Results,Img_res_num
0,128/52,128,52,Drowsy,1,Critical,1,,
1,128/52,128,52,Drowsy,1,Critical,1,,
2,128/52,128,52,Drowsy,1,Critical,1,,
3,128/52,128,52,Drowsy,1,Critical,1,No Critical Injury Found,0.0
4,128/52,128,52,Drowsy,1,Critical,1,No Critical Injury Found,0.0
5,128/52,128,52,Drowsy,1,Critical,1,No Critical Injury Found,0.0
6,103/51,103,51,Drowsy,1,Critical,1,,
7,103/51,103,51,Drowsy,1,Critical,1,,
8,103/51,103,51,Drowsy,1,Critical,1,,
9,103/51,103,51,Drowsy,1,Critical,1,Confirmed Critical Injury,1.0


In [38]:
# ------------------------------
# --- Map decision-point (DP) label columns to numeric codes for the decision tree
# ------------------------------


# --- Code tables for the decision-point labels ---
dp1_map = {'B': 0, 'D': 1}                    # DP1_label: B → 0, D → 1
dp2_map = {'B': 0, 'C': 1, 'D': 2, 'E': 3}    # DP2_label: B → 0, C → 1, D → 2, E → 3
dp3_map = {'F': 0, 'G': 1}                    # DP3_label: F → 0, G → 1
dp4_map = {'H': 0, 'I': 1}                    # DP4_label: H → 0, I → 1

# Encode every label column into a sibling '<label>_num' nullable-integer
# column; labels that are missing or not in the code table become <NA>.
dp_label_preview_columns = []
for label_col, code_map in (
    ('DP1_label', dp1_map),
    ('DP2_label', dp2_map),
    ('DP3_label', dp3_map),
    ('DP4_label', dp4_map),
):
    numeric_col = f'{label_col}_num'
    df_event_log[numeric_col] = df_event_log[label_col].map(code_map).astype('Int64')
    dp_label_preview_columns += [label_col, numeric_col]

# Preview each label next to its numeric code
df_event_log[dp_label_preview_columns].head(20)


Unnamed: 0,DP1_label,DP1_label_num,DP2_label,DP2_label_num,DP3_label,DP3_label_num,DP4_label,DP4_label_num
0,B,0,E,3.0,G,1,H,0.0
1,B,0,E,3.0,G,1,H,0.0
2,B,0,E,3.0,G,1,H,0.0
3,B,0,E,3.0,G,1,H,0.0
4,B,0,E,3.0,G,1,H,0.0
5,B,0,E,3.0,G,1,H,0.0
6,B,0,E,3.0,F,0,,
7,B,0,E,3.0,F,0,,
8,B,0,E,3.0,F,0,,
9,B,0,E,3.0,F,0,,


In [39]:
from IPython.display import FileLink

# Save to CSV (the dataframe index is not written)
labeled_csv_path = "labeled_triage_eventlog.csv"
df_event_log.to_csv(labeled_csv_path, index=False)

# Show a link to the written file as the cell output
FileLink(labeled_csv_path)