In [1]:
import pandas as pd

# Load the per-activity trace log and discard the NUMEPISODES column in a
# single chained expression, so no frame is mutated in place.
data = pd.read_csv("data/data_health/trace_activities.csv").drop(
    columns="NUMEPISODES"
)

In [2]:
# Count of distinct values in `activity` (NaN counts as a value), plus one.
# NOTE(review): the extra +1 presumably reserves a slot for a padding/end
# token used later in the notebook -- confirm.
number_of_unique_activities = data["activity"].nunique(dropna=False) + 1

# Parse both timestamp columns into datetimes.
for timestamp_column in ("start", "end"):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

# Columns that must not be treated as trace-level attributes (identifiers,
# per-event fields and timing information).
exclude_columns = [
    "traceId",
    "index",
    "activity",
    "start",
    "end",
    "duration",
    "EVENTID",
    "VariantId",
]

# Candidate attributes: every text, categorical, boolean or numeric column.
attributes = data.select_dtypes(
    include=["object", "category", "bool", "number"]
).columns.tolist()

# Keep the candidates that are not explicitly excluded, preserving column order.
_excluded_lookup = set(exclude_columns)
selected_attributes = [attr for attr in attributes if attr not in _excluded_lookup]
del _excluded_lookup



In [3]:
# Total duration per trace: sum `duration` over all rows sharing a traceId.
# The result has exactly two columns: `traceId` and `total_duration`.
trace_durations = (
    data.groupby("traceId")["duration"]
    .sum()
    .rename("total_duration")
    .reset_index()
)

# One row of attributes per trace. `first()` picks, per column, the first
# non-null value inside each group.
# NOTE(review): this assumes the selected attributes are constant within a
# trace -- confirm against the data.
trace_attributes = data.groupby("traceId")[selected_attributes].first().reset_index()

# Combine durations and attributes into a single row per trace and preview it.
trace_data = trace_durations.merge(trace_attributes, on="traceId")
trace_data.head()


Unnamed: 0,traceId,total_duration,CORDERID,EINRI,ENDDT,LSSTAE,MANDT,NOTKZ,STATU,STORN
0,00035269-393b-47ca-9e77-6f9626433e4f,2312,False,Badajoz,Cerrado,ER,Doctor_10,False,70.0,Empty
1,00042ec6-0f5c-469f-bcf4-32ea9de63d5f,2988,False,Teruel,Cerrado,DI,Doctor_4,True,True,Empty
2,000811dc-2bdb-4497-96bc-0688b4bb381a,2836,False,Castellon,Cerrado,UA,Doctor_1,True,True,Empty
3,00132b96-1cac-4c27-80b2-ad3b59290d43,3289,True,Badajoz,Cerrado,ER,Doctor_10,False,60.0,Empty
4,0014f473-8b72-4ecd-95d2-b33762db32c2,5760,True,Badajoz,Cerrado,UA,Doctor_9,False,50.0,Empty


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Split the data into features and target.
# The features are an explicit copy so that the encoding below never touches
# `trace_data`: its `traceId` key and categorical values stay intact and this
# cell can be re-run without changing its own input.
X = trace_data.drop(columns=["traceId", "total_duration"]).copy()
y = trace_data["total_duration"]

# Integer-encode every text/categorical feature column. The fitted encoder is
# kept per column so codes can be mapped back with `inverse_transform`.
label_encoders = {}
for col in X.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Fit a random forest on the trace attributes; the fixed seed keeps the
# reported importances reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Rank the features by the forest's importance scores, highest first.
feature_importances = pd.DataFrame(
    {"feature": X.columns, "importance": model.feature_importances_}
).sort_values(by="importance", ascending=False)

feature_importances