In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

data = pd.read_csv("data/data_health/trace_activities.csv")

In [2]:
# data = data.drop(columns=["index", "EVENTID"])
data["start"] = pd.to_datetime(data["start"])
data["end"] = pd.to_datetime(data["end"])

attributes = [
    attr
    for attr in data.select_dtypes(include=["object", "bool", "number"]).columns
    if attr not in ["traceId", "activity", "start", "end"]
]


def is_trace_level(attribute):
    return data.groupby("traceId")[attribute].nunique().max() == 1


selected_attributes = [attr for attr in attributes if is_trace_level(attr)]

In [3]:
data["activity_duration"] = (data["end"] - data["start"]).dt.total_seconds().astype(int)

data["activity_durations"] = data.groupby("traceId")["activity_duration"].transform(
    lambda x: [x.tolist()] * len(x)
)

data["transition_duration"] = (
    (data.groupby("traceId")["start"].shift(-1) - data["end"])
    .dt.total_seconds()
    .fillna(0)
    .astype(int)
)


data["transition_durations"] = data.groupby("traceId")["transition_duration"].transform(
    lambda x: [x.tolist()] * len(x)
)

trace_total_duration = (
    data.groupby("traceId")
    .apply(lambda x: (x["end"].max() - x["start"].min()).total_seconds())
    .reset_index(name="trace_total_duration")
)


data = pd.merge(data, trace_total_duration, on="traceId")

In [4]:
activities = data["activity"].unique().tolist()
activity_to_index = {activity: i for i, activity in enumerate(activities)}
data["activity"] = data["activity"].map(activity_to_index)


def decode_activities(indices, index_to_activity):
    return [index_to_activity[index] for index in indices]


data["trace_activity_list"] = data.groupby("traceId")["activity"].transform(
    lambda x: [x.tolist()] * len(x)
)

In [5]:
data = data.drop_duplicates(subset="traceId")[
    [
        "traceId",
        "trace_activity_list",
        "activity_durations",
        "transition_durations",
        "trace_total_duration",
    ]
    + selected_attributes
]

In [6]:
def assert_duration_consistency(row):
    activity_sum = sum(row["activity_durations"])
    transition_sum = sum(row["transition_durations"])
    total_duration = row["trace_total_duration"]
    assert (
        activity_sum + transition_sum == total_duration
    ), f"Inconsistency found in trace {row['traceId']}: {activity_sum} (activities) + {transition_sum} (transitions) != {total_duration} (total)"


data.apply(assert_duration_consistency, axis=1)
print("All durations are consistent.")

All durations are consistent.


In [7]:
for col in data[selected_attributes].select_dtypes(include=["bool"]).columns:
    data[col] = data[col].astype(int)

data = pd.get_dummies(
    data,
    columns=data[selected_attributes]
    .select_dtypes(include=["object", "category"])
    .columns,
)

transformed_attributes = data.columns

Index(['traceId', 'trace_activity_list', 'activity_durations',
       'transition_durations', 'trace_total_duration', 'CORDERID', 'NOTKZ',
       'EINRI_Badajoz', 'EINRI_Castellon', 'EINRI_Teruel', 'ENDDT_Abierto',
       'ENDDT_Cerrado', 'LSSTAE_DI', 'LSSTAE_ER', 'LSSTAE_QU', 'LSSTAE_UA',
       'MANDT_Doctor_1', 'MANDT_Doctor_10', 'MANDT_Doctor_11',
       'MANDT_Doctor_2', 'MANDT_Doctor_3', 'MANDT_Doctor_4', 'MANDT_Doctor_5',
       'MANDT_Doctor_6', 'MANDT_Doctor_7', 'MANDT_Doctor_8', 'MANDT_Doctor_9',
       'STATU_50.0', 'STATU_60.0', 'STATU_70.0', 'STATU_True', 'STORN_Empty',
       'STORN_x'],
      dtype='object')

In [8]:
##TODO Update attributes here!!

from sklearn.ensemble import RandomForestRegressor

X = data[selected_attributes]
y = data["trace_total_duration"]

model = RandomForestRegressor(random_state=42)
model.fit(X, y)

feature_importances = pd.DataFrame(
    {"feature": X.columns, "importance": model.feature_importances_}
).sort_values(by="importance", ascending=False)

feature_importances["cumulative_importance"] = feature_importances[
    "importance"
].cumsum()

important_attributes = feature_importances[
    feature_importances["cumulative_importance"] <= 0.95
]["feature"].tolist()

attributes_to_exclude = list(set(selected_attributes) - set(important_attributes))
data = data.drop(columns=attributes_to_exclude)

KeyError: "['EINRI', 'ENDDT', 'LSSTAE', 'MANDT', 'STATU', 'STORN'] not in index"

In [None]:
import numpy as np

np.random.seed(42)


def truncate_sequence_random(seq):
    if len(seq) > 1:
        trunc_point = np.random.randint(1, len(seq))
        truncated = seq[:trunc_point]
        remaining = seq[trunc_point:]
    else:
        truncated = seq
        remaining = []
        trunc_point = len(seq)
    return truncated, remaining, trunc_point


def truncate_list(lst, trunc_points, offset=0):
    truncated = [
        item[: truncation_point - offset]
        for item, truncation_point in zip(lst, trunc_points)
    ]
    remaining = [
        item[truncation_point - offset :]
        for item, truncation_point in zip(lst, trunc_points)
    ]
    return truncated, remaining


data[["truncated_activity_list", "remaining_activity_list", "trunc_point"]] = (
    data["trace_activity_list"].apply(truncate_sequence_random).apply(pd.Series)
)

data["truncated_durations"], data["remaining_durations"] = truncate_list(
    data["activity_durations"], data["trunc_point"]
)
data["truncated_transitions"], data["remaining_transitions"] = truncate_list(
    data["transition_durations"], data["trunc_point"], offset=1
)

data["truncated_total_duration"] = data["truncated_durations"].apply(sum) + data[
    "truncated_transitions"
].apply(sum)
data["remaining_total_duration"] = data["remaining_durations"].apply(sum) + data[
    "remaining_transitions"
].apply(sum)

assert all(
    data["truncated_total_duration"] + data["remaining_total_duration"]
    == data["trace_total_duration"]
)

This part is probably going to be removed, I leave it here just in case

In [None]:
# # Calculate the lengths of each trace_activity_list
# data["sequence_length"] = data["trace_activity_list"].apply(len)

# fig, ax = plt.subplots()
# ax.hist(
#     data["sequence_length"],
#     bins=range(1, data["sequence_length"].max() + 1),
#     color="blue",
#     alpha=0.5,
#     label="Sequence Lengths",
# )
# ax.hist(
#     data["trunc_point"],
#     bins=range(1, data["sequence_length"].max() + 1),
#     color="red",
#     alpha=0.5,
#     label="Truncation Points",
# )
# ax.set_title("Comparison of Sequence Lengths and Truncation Points")
# ax.set_xlabel("Point in Sequence")
# ax.set_ylabel("Frequency")
# ax.legend()
# plt.show()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = max(data["trace_activity_list"].apply(len))

data["truncated_activity_list"] = pad_sequences(
    data["truncated_activity_list"], maxlen=max_sequence_length, padding="post"
).tolist()

data["remaining_activity_list"] = pad_sequences(
    data["remaining_activity_list"], maxlen=max_sequence_length, padding="post"
).tolist()