In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

data = pd.read_csv("data/data_health/trace_activities.csv")

In [2]:
data = data.drop(columns=["index", "EVENTID"])
data["start"] = pd.to_datetime(data["start"])
data["end"] = pd.to_datetime(data["end"])

attributes = [
    attr
    for attr in data.select_dtypes(include=["object", "bool", "number"]).columns
    if attr not in ["traceId", "activity", "start", "end"]
]


def is_trace_level(attribute):
    return data.groupby("traceId")[attribute].nunique().max() == 1


selected_attributes = [attr for attr in attributes if is_trace_level(attr)]

In [3]:
data["activity_duration"] = (data["end"] - data["start"]).dt.total_seconds().astype(int)

data["activity_durations"] = data.groupby("traceId")["activity_duration"].transform(
    lambda x: [x.tolist()] * len(x)
)

data["transition_duration"] = (
    (data.groupby("traceId")["start"].shift(-1) - data["end"])
    .dt.total_seconds()
    .fillna(0)
    .astype(int)
)


data["transition_durations"] = data.groupby("traceId")["transition_duration"].transform(
    lambda x: [x.tolist()] * len(x)
)

trace_total_duration = (
    data.groupby("traceId")
    .apply(lambda x: (x["end"].max() - x["start"].min()).total_seconds())
    .reset_index(name="trace_total_duration")
)


data = pd.merge(data, trace_total_duration, on="traceId")

In [4]:
activities = data["activity"].unique().tolist()
activity_to_index = {activity: i for i, activity in enumerate(activities)}
data["activity"] = data["activity"].map(activity_to_index)


def decode_activities(indices, index_to_activity):
    return [index_to_activity[index] for index in indices]


data["trace_activity_list"] = data.groupby("traceId")["activity"].transform(
    lambda x: [x.tolist()] * len(x)
)

In [5]:
data = data.drop_duplicates(subset="traceId")[
    [
        "traceId",
        "trace_activity_list",
        "activity_durations",
        "transition_durations",
        "trace_total_duration",
    ]
    + selected_attributes
]

In [6]:
def assert_duration_consistency(row):
    activity_sum = sum(row["activity_durations"])
    transition_sum = sum(row["transition_durations"])
    total_duration = row["trace_total_duration"]
    assert (
        activity_sum + transition_sum == total_duration
    ), f"Inconsistency found in trace {row['traceId']}: {activity_sum} (activities) + {transition_sum} (transitions) != {total_duration} (total)"


data.apply(assert_duration_consistency, axis=1)
print("All durations are consistent.")

All durations are consistent.


In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in (
    data[selected_attributes].select_dtypes(include=["object", "category"]).columns
):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

for col in data[selected_attributes].select_dtypes(include=["bool"]).columns:
    data[col] = data[col].astype(int)

In [8]:
from sklearn.ensemble import RandomForestRegressor

X = data[selected_attributes]
y = data["trace_total_duration"]

model = RandomForestRegressor(random_state=42)
model.fit(X, y)

feature_importances = pd.DataFrame(
    {"feature": X.columns, "importance": model.feature_importances_}
).sort_values(by="importance", ascending=False)

feature_importances["cumulative_importance"] = feature_importances[
    "importance"
].cumsum()

important_attributes = feature_importances[
    feature_importances["cumulative_importance"] <= 0.95
]["feature"].tolist()

attributes_to_exclude = list(set(selected_attributes) - set(important_attributes))
data = data.drop(columns=attributes_to_exclude)

In [9]:
data

Unnamed: 0,traceId,trace_activity_list,activity_durations,transition_durations,trace_total_duration,ENDDT,MANDT,NOTKZ,STATU,STORN
0,049032b2-7422-4e24-af73-3a2d3e4e5b54,"[0, 1, 2, 3, 4, 5]","[120, 60, 0, 840, 540, 0]","[120, 181680, 120, 120, 120, 0]",183720.0,1,0,0,0,0
6,064bbd7b-825d-4614-ba4c-17b8100ec562,"[2, 3, 4, 5]","[180, 2280, 420, 0]","[180, 120, 120, 0]",3300.0,1,9,1,3,0
10,0833d1f0-75bf-4d97-b2a5-9f9176f9f8e7,"[0, 1, 2, 3, 4, 5]","[120, 60, 60, 960, 360, 0]","[180, 268560, 120, 120, 180, 0]",270720.0,1,8,0,0,0
16,08edba5c-6735-4d55-aec2-2f73e0a1f68d,"[0, 1, 2, 3, 4, 5]","[120, 60, 60, 1140, 420, 60]","[120, 293640, 120, 180, 60, 0]",295980.0,1,3,0,2,0
22,08f090ac-1c84-4f7c-8b4c-21c40fff6eb2,"[0, 1, 2, 3, 4, 5]","[120, 240, 180, 1200, 600, 60]","[120, 315360, 60, 120, 120, 0]",318180.0,1,9,0,1,0
...,...,...,...,...,...,...,...,...,...,...
78455,e94bb8bf-ba56-46b7-a404-c4c14403551c,"[0, 1, 1, 2, 3, 4, 5]","[120, 60, 60, 60, 1020, 540, 60]","[180, 105120, 342720, 60, 120, 120, 0]",450240.0,1,10,0,0,0
78462,eaf4be2d-cc0f-4186-b916-2f767ee1a895,"[0, 1, 2, 3, 4, 5]","[60, 240, 180, 540, 480, 60]","[120, 279540, 120, 60, 60, 0]",281460.0,1,6,0,0,0
78468,f10c33f0-aa15-4d33-a594-f1bed1f9bf20,"[0, 1, 2, 3, 4, 5]","[120, 180, 120, 720, 360, 60]","[120, 191340, 180, 120, 120, 0]",193440.0,1,2,0,1,0
78474,fac1adc5-5f6d-4762-a78e-9c565d40fc7f,"[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]","[60, 0, 60, 960, 3180, 60, 60, 60, 900, 3120]","[120, 254400, 120, 120, 120, 180, 243780, 180,...",507600.0,0,8,0,0,0


In [11]:
import numpy as np

def truncate_sequence(seq):
    if len(seq) > 1:
        trunc_point = np.random.randint(1, len(seq))
        truncated = seq[:trunc_point]
        remaining = seq[trunc_point:]
    else:
        truncated = seq
        remaining = []
        trunc_point = len(seq)
    return truncated, remaining, trunc_point


def truncate_list(lst, trunc_points, offset=0):
    truncated = [item[: tp - offset] for item, tp in zip(lst, trunc_points)]
    remaining = [item[tp - offset :] for item, tp in zip(lst, trunc_points)]
    return truncated, remaining


data[["truncated_tokenized", "remaining_tokenized", "trunc_point"]] = (
    data["trace_activity_list"].apply(truncate_sequence).apply(pd.Series)
)

data["truncated_durations"], data["remaining_durations"] = truncate_list(
    data["activity_durations"], data["trunc_point"]
)
data["truncated_transitions"], data["remaining_transitions"] = truncate_list(
    data["transition_durations"], data["trunc_point"], 1
)

data["truncated_total_duration"] = data["truncated_durations"].apply(sum) + data[
    "truncated_transitions"
].apply(sum)
data["remaining_total_duration"] = data["remaining_durations"].apply(sum) + data[
    "remaining_transitions"
].apply(sum)

assert all(
    data["truncated_total_duration"] + data["remaining_total_duration"]
    == data["trace_total_duration"]
)