In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Load the trace activity table from CSV.
TRACE_CSV = "data/data_health/trace_activities.csv"
data = pd.read_csv(TRACE_CSV)

#### Initial Considerations

In [2]:
# data = data.drop(columns=["index", "EVENTID"])

# Parse the event boundaries into pandas datetimes.
for timestamp_column in ("start", "end"):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

# Number of distinct activities plus one extra class.
# NOTE(review): the extra class presumably stands for the 0 value that
# pad_sequences inserts later in the notebook -- confirm.
n_unique_activities = 1 + len(data["activity"].unique())

# Attribute candidates: every object / bool / numeric column that is not one
# of the structural event-log columns.
excluded_columns = {"traceId", "activity", "start", "end"}
candidate_columns = data.select_dtypes(include=["object", "bool", "number"]).columns
attributes = [attr for attr in candidate_columns if attr not in excluded_columns]


def is_trace_level(attribute, frame=None):
    """Tell whether ``attribute`` takes a single value inside every trace.

    Parameters
    ----------
    attribute : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Event table with a ``traceId`` column. When omitted, the module-level
        ``data`` frame is looked up at call time. Pass the frame explicitly
        whenever ``data`` may have been rebound by a later cell.

    Returns
    -------
    bool-like
        Truthy when the largest per-trace count of distinct values is
        exactly 1. ``nunique`` does not count missing values.
    """
    if frame is None:
        # Backward-compatible default: fall back to the notebook-global frame.
        frame = data
    distinct_per_trace = frame.groupby("traceId")[attribute].nunique()
    return distinct_per_trace.max() == 1


# Keep only the attributes that are constant within every trace.
selected_attributes = list(filter(is_trace_level, attributes))

#### Compute activity, transition and total trace durations

In [3]:
# Duration of every event in whole seconds (the int cast drops any fraction).
event_seconds = (data["end"] - data["start"]).dt.total_seconds()
data["activity_duration"] = event_seconds.astype(int)


def _repeat_group_values(group):
    """Return the group's values as one list, repeated once per row of the group."""
    return [group.tolist()] * len(group)


# Every row carries the full list of activity durations of its trace.
data["activity_durations"] = data.groupby("traceId")["activity_duration"].transform(
    _repeat_group_values
)

# Gap between the end of an event and the start of the next row of the same
# trace; the last row of a trace has no successor and gets 0.
next_start = data.groupby("traceId")["start"].shift(-1)
gap_seconds = (next_start - data["end"]).dt.total_seconds()
data["transition_duration"] = gap_seconds.fillna(0).astype(int)

# Every row carries the full list of transition durations of its trace.
data["transition_durations"] = data.groupby("traceId")["transition_duration"].transform(
    _repeat_group_values
)

# Wall-clock span of each trace: latest end minus earliest start, in seconds.
trace_bounds = data.groupby("traceId").agg(
    first_start=("start", "min"), last_end=("end", "max")
)
trace_total_duration = (
    (trace_bounds["last_end"] - trace_bounds["first_start"])
    .dt.total_seconds()
    .reset_index(name="trace_total_duration")
)

data = data.merge(trace_total_duration, on="traceId")

#### Encode the activities and get the activities list / Take only the first row and assert durations

In [4]:
# Integer-encode the activities, starting at 1.
# Index 0 is left free on purpose: pad_sequences pads with 0 and mask_acc
# ignores label 0, so an activity encoded as 0 would be indistinguishable from
# padding. n_unique_activities already counts one class more than the number
# of distinct activities, so indices 1..N fit the embedding and one-hot sizes.
activities = data["activity"].unique().tolist()
activity_to_index = {
    activity: index for index, activity in enumerate(activities, start=1)
}
data["activity"] = data["activity"].map(activity_to_index)


def decode_activities(indices, index_to_activity):
    """Translate encoded activity indices back through ``index_to_activity``.

    Each entry of ``indices`` is looked up with ``index_to_activity[index]``;
    the results are returned as a list in the same order.
    """
    decoded = []
    for position in indices:
        decoded.append(index_to_activity[position])
    return decoded


# Every row carries the full encoded activity sequence of its trace.
data["trace_activity_list"] = data.groupby("traceId")["activity"].transform(
    lambda codes: [codes.tolist()] * len(codes)
)

# Keep one row per trace (the first one) and only the trace-level columns.
trace_level_columns = [
    "traceId",
    "trace_activity_list",
    "activity_durations",
    "transition_durations",
    "trace_total_duration",
] + selected_attributes
data = data.drop_duplicates(subset="traceId")[trace_level_columns]


def assert_duration_consistency(row):
    """Assert that a trace's activity and transition durations add up to its total.

    ``row`` must provide ``activity_durations`` and ``transition_durations``
    (iterables of numbers), ``trace_total_duration`` and ``traceId``. An
    ``AssertionError`` naming the trace is raised when the sums do not match.
    """
    in_activities = sum(row["activity_durations"])
    in_transitions = sum(row["transition_durations"])
    expected_total = row["trace_total_duration"]
    assert in_activities + in_transitions == expected_total, (
        f"Inconsistency found in trace {row['traceId']}: "
        f"{in_activities} (activities) + {in_transitions} (transitions) "
        f"!= {expected_total} (total)"
    )


# Check every trace; the first inconsistent one aborts with an AssertionError.
for _, trace_row in data.iterrows():
    assert_duration_consistency(trace_row)
print("All durations are consistent.")

All durations are consistent.


#### Encode attributes

In [5]:
# Boolean trace attributes become 0/1 integers.
boolean_columns = data[selected_attributes].select_dtypes(include=["bool"]).columns
for col in boolean_columns:
    data[col] = data[col].astype(int)

initial_columns = data.columns.tolist()

# One-hot encode the object / category trace attributes.
data = pd.get_dummies(
    data,
    columns=data[selected_attributes]
    .select_dtypes(include=["object", "category"])
    .columns,
)

# Columns created by get_dummies, in the order they appear in the frame.
# A set difference would give a hash-dependent order, so the layout of the
# feature matrix could change from one kernel session to the next.
known_columns = set(initial_columns)
new_dummy_columns = [col for col in data.columns if col not in known_columns]
transformed_columns = list(boolean_columns) + new_dummy_columns

#### Truncate and pad sequences

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed NumPy's global RNG so the cut points drawn with np.random.randint in
# truncate_sequence_random are reproducible.
np.random.seed(42)


def truncate_sequence_random(seq):
    """Split ``seq`` into a prefix and a suffix at a random position.

    For sequences longer than one element the cut is drawn with
    ``np.random.randint(1, len(seq))``, so the prefix is never empty and the
    suffix is never empty. Shorter sequences are returned whole with an empty
    list as suffix, and no random number is drawn.

    Returns
    -------
    tuple
        ``(prefix, suffix, cut)`` where ``cut`` equals ``len(prefix)``.
    """
    length = len(seq)
    if length <= 1:
        return seq, [], length
    cut = np.random.randint(1, length)
    return seq[:cut], seq[cut:], cut


def truncate_list(lst, trunc_points, offset=0):
    """Cut every sequence in ``lst`` at its own truncation point.

    Parameters
    ----------
    lst : iterable of sliceable sequences
        Sequences to split, paired with ``trunc_points`` by position.
    trunc_points : iterable of int
        Cut position for each sequence.
    offset : int, default 0
        Value subtracted from each cut position before slicing.

    Returns
    -------
    tuple of (list, list)
        ``(truncated, remaining)`` where, for each pair, ``truncated`` holds
        ``item[:point - offset]`` and ``remaining`` holds
        ``item[point - offset:]``.

    Both outputs are built in one pass, so one-shot iterables (generators,
    ``zip`` objects) work as well as lists.
    """
    truncated, remaining = [], []
    for item, truncation_point in zip(lst, trunc_points):
        cut = truncation_point - offset
        truncated.append(item[:cut])
        remaining.append(item[cut:])
    return truncated, remaining


# Draw one random cut per trace and split its activity sequence there.
split_columns = ["truncated_activity_list", "remaining_activity_list", "trunc_point"]
data[split_columns] = (
    data["trace_activity_list"].apply(truncate_sequence_random).apply(pd.Series)
)

# Split the activity durations at the same cut.
truncated_durations, remaining_durations = truncate_list(
    data["activity_durations"], data["trunc_point"]
)
data["truncated_durations"] = truncated_durations
data["remaining_durations"] = remaining_durations

# Split the transition durations one position earlier than the activities.
truncated_transitions, remaining_transitions = truncate_list(
    data["transition_durations"], data["trunc_point"], offset=1
)
data["truncated_transitions"] = truncated_transitions
data["remaining_transitions"] = remaining_transitions

# Total time on each side of the cut: activities plus transitions.
truncated_activity_time = data["truncated_durations"].apply(sum)
truncated_transition_time = data["truncated_transitions"].apply(sum)
data["truncated_total_duration"] = truncated_activity_time + truncated_transition_time

remaining_activity_time = data["remaining_durations"].apply(sum)
remaining_transition_time = data["remaining_transitions"].apply(sum)
data["remaining_total_duration"] = remaining_activity_time + remaining_transition_time

# The two sides must add up to the duration of the whole trace.
split_total = data["truncated_total_duration"] + data["remaining_total_duration"]
assert (split_total == data["trace_total_duration"]).all()

# Pad both halves at the end up to the length of the longest full trace.
max_sequence_length = max(len(sequence) for sequence in data["trace_activity_list"])

for padded_column in ("truncated_activity_list", "remaining_activity_list"):
    data[padded_column] = pad_sequences(
        data[padded_column], maxlen=max_sequence_length, padding="post"
    ).tolist()

#### Data Splitting, reshaping and one-hot encoding

In [12]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Hold out 20% of the traces; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

train_df = train_df.sort_values(by="traceId")
test_df = test_df.sort_values(by="traceId")

# Encoded trace-level attributes, one row per trace.
X_train_features = train_df[transformed_columns].values
X_test_features = test_df[transformed_columns].values

# Padded prefixes are the inputs, padded suffixes are the targets.
X_train = np.array(train_df["truncated_activity_list"].tolist())
Y_train = np.array(train_df["remaining_activity_list"].tolist())

X_test = np.array(test_df["truncated_activity_list"].tolist())
Y_test = np.array(test_df["remaining_activity_list"].tolist())

# Add a trailing axis of size 1: (n_traces, seq_len) -> (n_traces, seq_len, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
Y_train = Y_train.reshape(Y_train.shape[0], Y_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
Y_test = Y_test.reshape(Y_test.shape[0], Y_test.shape[1], 1)

X_train = X_train.astype(np.float32)
X_train_features = X_train_features.astype(np.float32)
X_test = X_test.astype(np.float32)
X_test_features = X_test_features.astype(np.float32)

# Drop only the trailing axis before one-hot encoding. A bare squeeze() would
# also remove the batch or time axis whenever one of them has size 1, which
# would give the one-hot targets the wrong rank.
Y_train_onehot = to_categorical(
    Y_train.squeeze(axis=-1), num_classes=n_unique_activities
)
Y_test_onehot = to_categorical(
    Y_test.squeeze(axis=-1), num_classes=n_unique_activities
)

#### The Model

In [13]:
from tensorflow.keras import backend as K


def mask_acc(y_true, y_pred):
    """Per-position accuracy over targets whose true class is not 0.

    Parameters
    ----------
    y_true : tensor
        Targets; the class of a position is the argmax over the last axis.
    y_pred : tensor
        Predicted scores with the same layout as ``y_true``.

    Returns
    -------
    tensor
        Correct predictions divided by the number of counted positions. When
        no position is counted the result is 0 instead of NaN.
    """
    # 1 wherever the target vector has a 1 on its last axis.
    mask = K.cast(K.max(y_true, axis=-1), K.floatx())

    y_true_labels = K.cast(K.argmax(y_true, axis=-1), K.floatx())
    y_pred_labels = K.cast(K.argmax(y_pred, axis=-1), K.floatx())

    # Positions whose true label is 0 are left out of the metric.
    non_zero_mask = K.cast(K.greater(y_true_labels, 0), K.floatx())

    is_correct = (
        K.cast(K.equal(y_true_labels, y_pred_labels), K.floatx()) * mask * non_zero_mask
    )
    total_correct = K.sum(is_correct)
    total_values = K.sum(mask * non_zero_mask)

    # total_values is a count, so clamping it to at least 1 changes nothing
    # when positions were counted and avoids 0/0 (NaN) when none were.
    return total_correct / K.maximum(total_values, 1.0)


def seq_acc(y_true, y_pred):
    """Share of sequences predicted correctly at every position.

    Classes are taken as the argmax over the last axis of ``y_true`` and
    ``y_pred``; a sequence counts as correct only when all of its positions
    agree. The mean over sequences is returned.
    """
    position_matches = K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1))
    whole_sequence_matches = K.all(position_matches, axis=-1)
    return K.mean(whole_sequence_matches)

In [14]:
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import (
    Input,
    Embedding,
    LSTM,
    Dense,
    Concatenate,
    Bidirectional,
    RepeatVector,
    Dropout,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Sizes shared by several layers.
sequence_length = X_train.shape[1]
n_trace_features = X_train_features.shape[1]
hidden_units = 64
dropout_rate = 0.15

# Sequence branch: embedding -> bidirectional LSTM (one output per step) -> dropout.
sequence_input = Input(shape=(sequence_length,))
embedded_sequences = Embedding(
    input_dim=n_unique_activities, output_dim=hidden_units
)(sequence_input)
lstm_out = Bidirectional(LSTM(hidden_units, return_sequences=True))(embedded_sequences)
lstm_out = Dropout(dropout_rate)(lstm_out)

# Attribute branch: two dense layers with dropout in between, then the
# result is repeated once per time step so it can be joined to the LSTM output.
feature_input = Input(shape=(n_trace_features,))
dense_feature = Dense(hidden_units, activation="relu")(feature_input)
dense_feature = Dropout(dropout_rate)(dense_feature)
dense_feature = Dense(hidden_units, activation="relu")(dense_feature)
repeated_feature = RepeatVector(sequence_length)(dense_feature)

# Join both branches on the feature axis and classify every time step.
concatenated = Concatenate(axis=-1)([lstm_out, repeated_feature])
combined_dense = Dense(hidden_units, activation="relu")(concatenated)

output = Dense(n_unique_activities, activation="softmax")(combined_dense)

model = Model(inputs=[sequence_input, feature_input], outputs=output)
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.005),
    metrics=[mask_acc, seq_acc],
)

In [15]:
# NOTE(review): no model.fit call is visible between the compile cell and this
# one, so these look like predictions of an untrained model -- confirm.
predicted_sequences1 = model.predict([X_train, X_train_features])
predicted_sequences2 = model.predict([X_test, X_test_features])

# Most probable class at every time step, one array per trace.
predicted_activity_indices1 = [
    step_scores.argmax(axis=-1) for step_scores in predicted_sequences1
]
predicted_activity_indices2 = [
    step_scores.argmax(axis=-1) for step_scores in predicted_sequences2
]

train_df["predicted_sequence"] = predicted_activity_indices1
test_df["predicted_sequence"] = predicted_activity_indices2

# Stack train and test rows and keep only the trace id and its prediction.
combined_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
combined_df = combined_df[["traceId", "predicted_sequence"]].copy()

[1m287/287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
