# Setup

## Installation

In [9]:
%load_ext autoreload
%autoreload 2

import logging
import os
import sys
import subprocess
import pkg_resources

def install_missing_requirements(requirements_path):
    if not os.path.exists(requirements_path):
        print(f"Requirements file '{requirements_path}' not found.")
        return

    with open(requirements_path) as f:
        required = [line.strip() for line in f if line.strip() and not line.startswith("#")]

    installed = {pkg.key for pkg in pkg_resources.working_set}
    
    missing = []
    for req in required:
        pkg_name = req.split("==")[0].lower()
        if pkg_name not in installed:
            missing.append(req)

    if not missing:
        print("All required packages are already installed.")
        return

    print(f"Installing missing packages: {missing}")

    # Check if running inside conda
    conda_prefix = os.environ.get("CONDA_PREFIX")
    if conda_prefix:
        print("Detected conda environment. Trying to use conda first...")
        for pkg in missing:
            pkg_name = pkg.split("==")[0]
            try:
                subprocess.check_call(["conda", "install", "-y", pkg_name])
            except subprocess.CalledProcessError:
                print(f"Package '{pkg_name}' not found in conda. Falling back to pip.")
                subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    else:
        # Not in conda, use pip directly
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])


def update_repository():
    print("Updating the git repository...")
    try:
        result = subprocess.run(["git", "pull"], capture_output=True, text=True, check=True)
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        print("Git pull failed:")
        print(e.stderr)

# install missing requirements
install_missing_requirements("/home/jovyan/requirements_reddit.txt")

# set cwd to the project root
cwd = os.getcwd()
wd = '/home/jovyan/reddit-mining/'
if cwd != wd:
    os.chdir(wd)

# update the git repository
update_repository()

# set PYTHONPATH to the src directory
sys.path.append('src')

# set up logging
from logger import setup_logger
#logger = setup_logger(level=logging.INFO)
logger = setup_logger(level=logging.DEBUG)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Installing missing packages: ['sentence_transformers']
Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Updating the git repository...
Already up to date.



## Initialisation

In [None]:
update_repository()
# from pm4py.objects.log.importer.xes import importer as xes_importer
from tapp.tapp_model import _get_event_labels, TappModel
from pm4py.objects.log.importer.xes import importer as xes_importer

folder_name = "mimicel"

if folder_name == "werk":
    path = "./data/werk.xes"
    data_attributes = ["age"]
    text_attribute = "question"
elif folder_name == "mimicel":
    path = "./data/mimicel_mini_2000.xes"
    data_attributes = ["case:acuity"]
    text_attribute = "case:chiefcomplaint"

variant = xes_importer.Variants.ITERPARSE
parameters = {
    variant.value.Parameters.TIMESTAMP_SORT: True,
    variant.value.Parameters.REVERSE_SORT: False,
}
log = xes_importer.apply(path, variant=variant, parameters=parameters)
activities = _get_event_labels(log, "concept:name")
class_names = _get_event_labels(log, "concept:name")
class_names.append("END")
split = len(log) // 5 * 4
train_log = log[:split]
test_log = log[split:]

Updating the git repository...
Already up to date.



parsing log, completed traces ::   0%|          | 0/2000 [00:00<?, ?it/s]

# Prepare Data

## Shorten Event Log

In [18]:
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.obj import EventLog  # <-- needed to wrap the sliced log

max_cases = 2000

mini_log = EventLog(log[:max_cases])  # wrap slice as EventLog

# --- BUILD OUTPUT PATH ---
folder, filename = os.path.split(path)
name, ext = os.path.splitext(filename)
out_path = os.path.join(folder, f"{name}_mini_{len(mini_log)}{ext}")

# --- EXPORT MINI LOG ---
xes_exporter.apply(mini_log, out_path)

print(f"Mini log saved at: {out_path}")


exporting log, completed traces ::   0%|          | 0/2000 [00:00<?, ?it/s]

Mini log saved at: ./data/mimicel_mini_20000_mini_2000.xes


## Get Distilled Labels

In [24]:
update_repository()
from tapp.log_encoder import LogEncoder
from tapp.tapp_model import TappModel, _get_event_labels
from tapp.text_encoder import BoWTextEncoder
from tapp.text_encoder import BoNGTextEncoder
from tapp.text_encoder import LDATextEncoder
from tapp.text_encoder import BERTbaseTextEncoder
from tapp.text_encoder import BERTbaseFineTunedNextActivityTextEncoder
from distillation import get_distillation_paths
import os
import numpy as np

text_models = [
    #None,
    #BoWTextEncoder(encoding_length=50, language="english"),
    #BoNGTextEncoder(n=2, encoding_length=50, language="english"),
    #LDATextEncoder(encoding_length=100, language="english"),
    BERTbaseTextEncoder(encoding_length=768, language="english"),
    #BERTbaseFineTunedNextActivityTextEncoder(encoding_length=768, language="english", epochs=16, lr=5e-5),
]

for text_model in text_models:

    # initialize and fit the log encoder
    log_encoder = LogEncoder(
        text_encoder=text_model,
        advanced_time_attributes=True,
        text_base_for_training="event",
    )
    log_encoder.fit(
        log,
        activities=activities,
        data_attributes=data_attributes,
        text_attribute=text_attribute,
    )

    X_train, y_train, _ = log_encoder.transform(train_log, for_training=True)
    X_test, y_test, _ = log_encoder.transform(test_log, for_training=True)
    y_train = y_train.argmax(axis=1)
    y_test = y_test.argmax(axis=1)

    # prepare distillation path
    distillation_path_train, distillation_path_test = get_distillation_paths(folder_name, text_model)

    # tappbert hyperparameters
    shared_layer = 1
    special_layer = 1
    neuron = 100

    # tappbert training and evaluation
    print("Training and evaluating TAPPBERT model...")
    tapp_model = TappModel(
        log_encoder=log_encoder,
        num_shared_layer=shared_layer,
        num_specialized_layer=special_layer,
        neurons_per_layer=neuron,
        dropout=0.2,
        learning_rate=0.001,
    )
    tapp_model.activities = activities
    tapp_model.fit(
        train_log, data_attributes=data_attributes, text_attribute=text_attribute, epochs=25
    )
    y_train_distilled = tapp_model.model.predict(X_train)
    y_train_distilled = y_train_distilled[0]
    np.save(distillation_path_train, y_train_distilled)
    print("Saved distilled training data to:", distillation_path_train)
    y_test_distilled = tapp_model.model.predict(X_test)
    y_test_distilled = y_test_distilled[0]
    np.save(distillation_path_test, y_test_distilled)
    print("Saved distilled test data to:", distillation_path_test)
    tapp_model.evaluate(test_log, "results.csv", num_prefixes=8)

Updating the git repository...
Already up to date.



[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'case:acuity'

# Evaluation

In [16]:
update_repository()
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from tapp.text_encoder import BoWTextEncoder, BoNGTextEncoder, LDATextEncoder, BERTbaseTextEncoder
from tapp.log_encoder import LogEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from distillation import (
    get_distillation_paths,
    get_evaluation_paths,
    evaluate_distillation,
    analyze_text_splits,
    prepare_text_feature_datasets,
    concatenate_text_feature_datasets,
    get_feature_datasets,
    save_evaluation_results,
    tree_to_str,
)
import numpy as np
import sys
import pickle
import itertools

# ----------------------------
# Factory for student models
# ----------------------------
def get_student_model(model_type="dt", random_state=0, **kwargs):
    model_type = model_type.lower()
    if model_type == "dt":
        return DecisionTreeClassifier(random_state=random_state, **kwargs)
    elif model_type == "ebm":
        return ExplainableBoostingClassifier(random_state=random_state, **kwargs)
    elif model_type == "lasso":
        return LogisticRegression(
            penalty="l1",
            solver="saga",
            max_iter=5000,
            multi_class="multinomial",
            random_state=random_state,
            **kwargs
        )
    else:
        raise ValueError(f"Unknown student model type: {model_type}")
    
def resample_training_data(X, y, method="smote"):
    if method == "smote":
        sampler = SMOTE(sampling_strategy='not majority', random_state=42)
    elif method == "random":
        sampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
    else:
        return X, y
    return sampler.fit_resample(X, y)

# ----------------------------
# Params
# ----------------------------
student_model_types = ["dt", "ebm", "lasso"]   # choose which students to try
student_model_types = ["dt"]   # choose which students to try
model_names = ["BoW", "BoNG", "LDA"]
text_model_tapp = BERTbaseTextEncoder(encoding_length=768, language="english")
text_models = [
    BoWTextEncoder(encoding_length=50, language="english"),
    BoNGTextEncoder(n=2, encoding_length=50, language="english"),
    LDATextEncoder(encoding_length=100, language="english"),
]
k_values = [5]
ccp_alpha = 0.0001
sampling_method = None 

# ----------------------------
# Load distilled labels
# ----------------------------
distillation_path_train, distillation_path_test = get_distillation_paths(folder_name, text_model_tapp)
y_train_distilled = np.load(distillation_path_train)
y_test_distilled = np.load(distillation_path_test)
y_train_distilled = y_train_distilled.argmax(axis=1)
y_test_distilled = y_test_distilled.argmax(axis=1)

# ----------------------------
# Evaluation loop
# ----------------------------
model_names_powerset = [list(subset) 
    for r in range(len(model_names)+1) 
    for subset in itertools.combinations(model_names, r)]

for model_names_subset in model_names_powerset:
    print(f"Using text models: {model_names_subset}")
    for k in k_values:
        print(f"Training and evaluating for k={k}...")
        train_dataset, test_dataset = get_feature_datasets(
            folder_name,
            text_models,
            log,
            train_log,
            test_log,
            k=k,
        )
        y_train = train_dataset["y"].argmax(axis=1)
        y_test = test_dataset["y"].argmax(axis=1)
        X_train, features = concatenate_text_feature_datasets(train_dataset, model_names_subset)
        X_test, _ = concatenate_text_feature_datasets(test_dataset, model_names_subset)
        print("Transformed training data shape:", X_train.shape)
        print("Transformed test data shape:", X_test.shape)

        for model_type in student_model_types:
                for version, train_labels in [
                    ("original", y_train),
                    ("distilled", y_train_distilled),
                ]:
                    desc = f"{model_type.upper()} ({version})"
                    model_path, model_str_path, model_features_path, model_y_path = get_evaluation_paths(
                        folder_name,
                        version,
                        model_type=model_type,
                        model_names_subset=model_names_subset,
                        k=k,
                        ccp_alpha=ccp_alpha,
                    )

                    # Create student model
                    if model_type == "dt":
                        student = get_student_model(model_type, ccp_alpha=ccp_alpha)
                    else:
                        student = get_student_model(model_type)

                    # sampling
                    print(X_train.shape, train_labels.shape)
                    X_train_sampled, train_labels_sampled = resample_training_data(X_train, train_labels, method=sampling_method)
                    # Train + evaluate
                    acc, f1, con_acc, con_f1 = evaluate_distillation(
                        student,
                        X_train_sampled,
                        X_test,
                        train_labels_sampled,
                        y_test,
                        y_test_distilled,
                        description=desc
                    )

                    # Save results
                    meta = dict(
                        description=desc,
                        model_type=model_type,
                        model_names=model_names_subset,
                        k=k,
                        acc=acc,
                        f1=f1,
                        con_acc=con_acc,
                        con_f1=con_f1,
                        num_nodes="",
                        max_depth="",
                        ccp=""
                    )

                    if model_type == "dt":
                        meta.update(
                            num_nodes=student.tree_.node_count,
                            max_depth=student.tree_.max_depth,
                            ccp=ccp_alpha,
                        )

                    save_evaluation_results(**meta)

                    # Save model + features
                    with open(model_path, "wb") as f:
                        pickle.dump(student, f)
                    with open(model_features_path, "wb") as f:
                        pickle.dump(features, f)

                    # Export string representation if DT
                    if model_type == "dt":
                        tree_str = tree_to_str(student, feature_names=features, class_names=class_names)
                        with open(model_str_path, "w") as f:
                            f.write(tree_str)

                    # Save predictions
                    y_pred = student.predict(X_test)
                    np.save(model_y_path, y_pred)

print("Analysis complete.")
acc_tp = accuracy_score(y_test_distilled, y_test)
f1_score_tp = f1_score(y_test_distilled, y_test, average="weighted")
print(f"Tappbert baseline: acc - {acc_tp:.4f}, f1 - {f1_score_tp:.4f}")
print("Done and dusted!")

Updating the git repository...
Already up to date.

Using text models: []
Training and evaluating for k=5...


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Transformed training data shape: (44240, 125)
Transformed test data shape: (10980, 125)
(44240, 125) (44240,)
DT (original): acc - 0.4708, f1 - 0.3974, con_acc - 0.8499, con_f1 - 0.8329
Number of nodes: 253
Max depth: 20
(44240, 125) (44240,)
DT (distilled): acc - 0.4606, f1 - 0.3879, con_acc - 0.9096, con_f1 - 0.8912
Number of nodes: 457
Max depth: 21
Using text models: ['BoW']
Training and evaluating for k=5...
Transformed training data shape: (44240, 375)
Transformed test data shape: (10980, 375)
(44240, 375) (44240,)
DT (original): acc - 0.4888, f1 - 0.4227, con_acc - 0.8885, con_f1 - 0.8847
Number of nodes: 335
Max depth: 22
(44240, 375) (44240,)
DT (distilled): acc - 0.4802, f1 - 0.4229, con_acc - 0.9714, con_f1 - 0.9711
Number of nodes: 541
Max depth: 26
Using text models: ['BoNG']
Training and evaluating for k=5...
Transformed training data shape: (44240, 375)
Transformed test data shape: (10980, 375)
(44240, 375) (44240,)


KeyboardInterrupt: 

# Interpretation

### Load Tree and Data

In [None]:
update_repository()
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tapp.text_encoder import BoWTextEncoder
from tapp.text_encoder import BoNGTextEncoder
from tapp.text_encoder import LDATextEncoder
from tapp.text_encoder import BERTbaseTextEncoder
from tapp.log_encoder import LogEncoder
from distillation import (
    get_distillation_paths,
    get_evaluation_paths,
    evaluate_distillation,
    analyze_text_splits,
    prepare_text_feature_datasets,
    concatenate_text_feature_datasets,
    get_feature_datasets,
    save_evaluation_results,
    tree_to_str,
    explain_sample
)
import numpy as np
import sys
import pickle
import itertools

text_models = [
    BoWTextEncoder(encoding_length=50, language="english"),
    BoNGTextEncoder(n=2, encoding_length=50, language="english"),
    LDATextEncoder(encoding_length=100, language="english"),
]
version = "distilled"
model_names = ["BoW", "BoNG"]
model_names = ["BoNG"]
model_type = "dt"
k = 5
ccp_alpha = 0.001

tree_model_path, tree_str_path, tree_features_path, tree_y_path = get_evaluation_paths(
    version,
    model_type=model_type,
    model_names_subset=model_names,
    k=k,
    ccp_alpha=ccp_alpha,
)
#text_model_tapp = BERTbaseFineTunedNextActivityTextEncoder(encoding_length=768, language="english", epochs=16, lr=5e-5)
text_model_tapp = BERTbaseTextEncoder(encoding_length=768, language="english")

# get the x_test samples
train_dataset, test_dataset = get_feature_datasets(
    text_models,
    log,
    train_log,
    test_log,
    k=k,
)
X_test, features = concatenate_text_feature_datasets(test_dataset, model_names)

# load the distilled labels
_, distillation_path_test = get_distillation_paths(
    text_model_tapp
)
y_test_distilled_raw = np.load(distillation_path_test)
y_test_distilled = y_test_distilled_raw.argmax(axis=1)

# load the tree model
with open(tree_model_path, "rb") as f:
    dt = pickle.load(f)
# load the tree predictions
y_tree = np.load(tree_y_path)

tree_str = tree_to_str(dt, feature_names=features, class_names=class_names)
print("Decision Tree Structure:")
print(tree_str)



Updating the git repository...
Already up to date.



[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Decision Tree Structure:
0: event -1 activity question > 0.5000 T: 1, F: 72
1: event -1 activity werkmap > 0.5000 T: 2, F: 63
2: event -1 activity mijn sollicitaties > 0.5000 T: 3, F: 62
3: event -1 activity home > 0.5000 T: 4, F: 45
4: event -2 activity question > 0.5000 T: 5, F: 40
5: event -3 age=18-29 > 0.0208 T: 6, F: 7
6: predict question
7: event -1 activity taken > 0.5000 T: 8, F: 23
8: event -1 activity mijn documenten > 0.5000 T: 9, F: 20
9: event -1 activity mijn berichten > 0.5000 T: 10, F: 13
10: event -1 activity aanvragen-ww > 0.5000 T: 11, F: 12
11: predict mijn_sollicitaties
12: predict home
13: event -1 age=50-65 > 0.5000 T: 14, F: 17
14: event -4 time since midnight scaled > 0.0000 T: 15, F: 16
15: predict mijn_sollicitaties
16: predict mijn_sollicitaties
17: event -2 activity home > 0.5000 T: 18, F: 19
18: predict mijn_sollicitaties
19: predict home
20: event -4 age=18-29 > 0.0238 T: 21, F: 22
21: predict mijn_werkmap
22: predict mijn_sollicitaties
23: event -3 age=

### Analyze Sample

In [10]:
update_repository()
from distillation import explain_sample, get_decision_path

k_highest_classes = 5

class_names = np.array(class_names)
conforming_indices = np.where(y_tree == y_test_distilled)[0]
random_index = np.random.choice(conforming_indices, 1)[0]
#print(f"Randomly selected index: {random_index}")
random_X = X_test[random_index]
random_y = y_test_distilled[random_index]
random_y_raw = y_test_distilled_raw[random_index]
# get the k highest probabilities
topk_idx = np.argsort(random_y_raw)[-k_highest_classes:][::-1]
topk_values = random_y_raw[topk_idx]
topk_classes = class_names[topk_idx]
topk_classes_str = ", ".join(f"{cls}: {val:.3f}" for cls, val in zip(topk_classes, topk_values))

correct_activity = class_names[random_y]
#print("Randomly selected sample:")
#print(explain_sample(random_X, features))
#print("Correct activity:", correct_activity)
decision_path = get_decision_path(dt, random_X, feature_names=features, class_names=class_names, show_distribution=True)
print("Sample:")
print(explain_sample(random_X, features))
print("Correct activity:", correct_activity)
print("Top 3 predicted activities:", topk_classes_str)
print("Decision path:")
print(decision_path)

print("explain first few samples:")
print(explain_sample(X_test[0], features))
print(class_names[y_test_distilled[0]])
print(explain_sample(X_test[1], features))
print(class_names[y_test_distilled[1]])
print(explain_sample(X_test[2], features))
print(class_names[y_test_distilled[2]])
print(explain_sample(X_test[3], features))
print(class_names[y_test_distilled[3]])
print(explain_sample(X_test[4], features))
print(class_names[y_test_distilled[4]])





Updating the git repository...
Already up to date.

Sample:
event -2 activity question = 1, event -2 time since monday scaled = 0.5006, event -2 age=18-29 = 0.3572, event -2 age=50-65 = 1, event -1 activity question = 1, event -1 time since midnight scaled = 0.0007, event -1 time since monday scaled = 0.5006, event -1 age=18-29 = 0.3572, event -1 age=50-65 = 1
Correct activity: question
Top 3 predicted activities: question: 0.303, home: 0.164, taken: 0.150, werkmap: 0.114, mijn_berichten: 0.111
Decision path:
Node 0: (event_-1_activity_question = 1.000 > 0.500) | class distribution = {question: 15748, aanvragen-ww: 101, home: 3432, taken: 5587, vacatures_bij_mijn_cv: 2, mijn_berichten: 2902, wijziging_doorgeven: 12, mijn_cv: 21, vacatures_zoeken: 2, END: 16433}
Node 72: (event_-1_BoNG_want cancel = 0.000 <= 0.124) | class distribution = {question: 14882, aanvragen-ww: 26, home: 321, taken: 281, vacatures_bij_mijn_cv: 0, mijn_berichten: 1933, wijziging_doorgeven: 0, mijn_cv: 21, vacatur

### Prompting

In [None]:
max_sentences = 8

role_description = "You are a helpful assistant that provides explanations for machine learning model predictions. " \
"Your task is to explain the decision-making process of a machine learning model, " \
"that has been distilled into a decision tree. \n"

task_description = f"Your explanation should be clear, concise, " \
"and focused on how the features contribute to the prediction. " \
"Make sure not to ground your explanation on the decision tree architecture itself, " \
"since the tree is only a proxy for the actual model, " \
"and provide higher level reasoning for the prediction instead. " \
"Be very precise in your explanation and leave no ambiguities. \n"

sentence_limit = "Limit the description to {max_sentences} sentences.\n"

input_description = "You will receive a sample represented by its features (assume unprovided features are 0) " \
"and the corresponding predicted class label, as well as the path the sample took in the decision tree itself. " \
"For every node in the path, you will see the class distribution of samples belonging to that node. " \
"Use this path to explain the decision process step by step how certain classes have been ruled out. \n"

topk_description = f"Additionally, you will receive the {k} classes with the highest prediction probability. " \
"Provide an explanation of why the chosen class was predicted over the other top classes.\n"

self_correction = "Additionally, for the purpose of supervision, also provide the nodes that were traversed " \
"in the decision tree to reach the prediction. \n\n"


sample_description = f"Sample features: \n {explain_sample(random_X, features)} \n"
prediction_description = f"Predicted class label: {correct_activity} \n"
tree_description = f"Decision tree structure: {tree_str} \n"
decision_path_str = f"Decision path taken: \n {decision_path} \n"
topk_classes_str = f"Top {k_highest_classes} predicted classes: {topk_classes_str} \n"

#TODO: perhaps instruct to provide reasoning step by step
#prompt = role_description + self_correction + sample_description + prediction_description + tree_description
#prompt = role_description + sample_description + prediction_description + tree_description
#prompt = f"{role_description}{task_description}{input_description}{sample_description}{prediction_description}{tree_description}"
prompt = f"{role_description}{task_description}{input_description}{topk_description}{sample_description}{prediction_description}{topk_classes_str}{decision_path_str}"
print(prompt)

You are a helpful assistant that provides explanations for decision tree predictions. Your task is to explain the decision-making process of a machine learning model, that has been distilled into a decision tree. 
Your explanation should be clear, concise, and focused on how the features contribute to the prediction. Make sure not to ground your explanation on the decision tree architecture itself, since the tree is only a proxy for the actual model, and provide higher level reasoning for the prediction instead. Be very precise in your explanation and leave no ambiguities. 
You will receive a sample represented by its features (assume unprovided features are 0) and the corresponding predicted class label, as well as the path the sample took in the decision tree itself. For every node in the path, you will see the class distribution of samples belonging to that node. Use this path to explain the decision process step by step how certain classes have been ruled out. 
Additionally, you wi

# Debugging

In [30]:
print(f"activities: {activities}")
print(f"Number of activities: {len(activities)}")
print(dt.classes_)
print(len(dt.classes_))

unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))
print("Class counts in training data:", class_counts)
unique, counts = np.unique(y_test, return_counts=True)
class_counts = dict(zip(unique, counts))
print("Class counts in test data:", class_counts)

activities: ['taken' 'question' 'mijn_werkmap' 'home' 'mijn_berichten'
 'wijziging_doorgeven' 'werkmap' 'mijn_documenten' 'vacatures_bij_mijn_cv'
 'mijn_sollicitaties' 'wdo' 'mijn_cv' 'aanvragen-ww' 'vacatures_zoeken'
 'inschrijven' 'mijn_tips' 'vacatures' 'vragenlijst-uwv']
Number of activities: 18
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
19
Class counts in training data: {0: 6252, 1: 242, 2: 6992, 3: 6408, 4: 1088, 5: 767, 6: 4790, 7: 2103, 8: 690, 9: 1104, 10: 600, 11: 558, 12: 82, 13: 319, 14: 34, 15: 80, 16: 31, 17: 100, 18: 12000}
Class counts in test data: {0: 1587, 1: 32, 2: 1759, 3: 1577, 4: 269, 5: 146, 6: 1260, 7: 542, 8: 175, 9: 282, 10: 146, 11: 91, 12: 12, 13: 59, 14: 9, 15: 16, 16: 4, 17: 13, 18: 3001}


In [15]:
# load the distilled labels
text_model_tapp = BERTbaseTextEncoder(encoding_length=768, language="english")
distillation_path_train, distillation_path_test = get_distillation_paths(
    text_model_tapp
)
y_train_distilled = np.load(distillation_path_train)
y_test_distilled = np.load(distillation_path_test)
y_train_distilled_base = y_train_distilled.argmax(axis=1)
y_test_distilled_base = y_test_distilled.argmax(axis=1)

text_model_tapp = BERTbaseFineTunedNextActivityTextEncoder(encoding_length=768, language="english", epochs=16, lr=5e-5)
distillation_path_train, distillation_path_test = get_distillation_paths(
    text_model_tapp
)
y_train_distilled = np.load(distillation_path_train)
y_test_distilled = np.load(distillation_path_test)
y_train_distilled_tuned = y_train_distilled.argmax(axis=1)
y_test_distilled_tuned = y_test_distilled.argmax(axis=1)

conformance = accuracy_score(y_test_distilled_base, y_test_distilled_tuned)
print(f"Conformance between base and fine-tuned distilled labels: {conformance:.4f}")
acc_base = accuracy_score(y_test_distilled_base, y_test)
f1_base = f1_score(y_test_distilled_base, y_test, average="weighted")
acc_tuned = accuracy_score(y_test_distilled_tuned, y_test)
f1_tuned = f1_score(y_test_distilled_tuned, y_test, average="weighted")
print(f"Base distilled labels: acc - {acc_base:.4f}, f1 - {f1_base:.4f}")
print(f"Fine-tuned distilled labels: acc - {acc_tuned:.4f}, f1 - {f1_tuned:.4f}")

Conformance between base and fine-tuned distilled labels: 0.8415
Base distilled labels: acc - 0.4857, f1 - 0.5491
Fine-tuned distilled labels: acc - 0.4848, f1 - 0.5341


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
k=3
text_encoder = BoWTextEncoder(encoding_length=50, language="english")
log_encoder = LogEncoder(
    text_encoder=text_encoder,
    advanced_time_attributes=True,
    text_base_for_training="event",
)
log_encoder.fit(
    log,
    activities=activities,
    data_attributes=["age"],
    text_attribute="question",
)
X_train_indie, y_train_indie, _ = log_encoder.transform(train_log, for_training=True)
X_test_indie, y_test_indie, _ = log_encoder.transform(test_log, for_training=True)
X_train_indie = log_encoder.transform_tree(X_train_indie, k=k)
X_test_indie = log_encoder.transform_tree(X_test_indie, k=k)

train_dataset, test_dataset = get_feature_datasets(
    text_models,
    log,
    train_log,
    test_log,
    k=k,
)
y_train_combo = train_dataset["y"]
y_test_combo = test_dataset["y"]
X_train_combo, features = concatenate_text_feature_datasets(train_dataset, [text_encoder.name])
X_test_combo, _ = concatenate_text_feature_datasets(test_dataset, [text_encoder.name])
X_train_combo = train_dataset["None"]["X"]
X_test_combo = test_dataset["None"]["X"]


def _mismatch_report(A, B, rtol=1e-5, atol=1e-8, max_show=25):
    mask = ~(np.isclose(A, B, rtol=rtol, atol=atol) | (np.isnan(A) & np.isnan(B)))
    idxs = np.argwhere(mask)
    n_diffs = idxs.shape[0]
    cols = np.unique(idxs[:, 1]) if idxs.size else np.array([], dtype=int)
    rows = np.unique(idxs[:, 0]) if idxs.size else np.array([], dtype=int)
    sample = []
    for r, c in idxs[:max_show]:
        sample.append((int(r), int(c), A[r, c], B[r, c]))
    return n_diffs, rows, cols, sample

print("Shapes:")
print("  X_train_indie:", X_train_indie.shape, "X_train_combo:", X_train_combo.shape)
print("  X_test_indie :", X_test_indie.shape,  "X_test_combo :", X_test_combo.shape)
print("  y_train_indie:", y_train_indie.shape, "y_train_combo:", y_train_combo.shape)
print("  y_test_indie :", y_test_indie.shape,  "y_test_combo :", y_test_combo.shape)

print("\nTraining X differences:")
n_diffs_tr, rows_tr, cols_tr, sample_tr = _mismatch_report(X_train_indie, X_train_combo)
print(f"  total differing elements: {n_diffs_tr}")
print(f"  unique differing rows: {len(rows_tr)} -> {rows_tr[:20]}")
print(f"  unique differing cols: {len(cols_tr)} -> {cols_tr[:20]}")
if sample_tr:
    print("  first mismatches (row, col, indie_val, combo_val):")
    for tup in sample_tr[:10]:
        print("   ", tup)

print("\nTest X differences:")
n_diffs_te, rows_te, cols_te, sample_te = _mismatch_report(X_test_indie, X_test_combo)
print(f"  total differing elements: {n_diffs_te}")
print(f"  unique differing rows: {len(rows_te)} -> {rows_te[:20]}")
print(f"  unique differing cols: {len(cols_te)} -> {cols_te[:20]}")
if sample_te:
    print("  first mismatches (row, col, indie_val, combo_val):")
    for tup in sample_te[:10]:
        print("   ", tup)

print("\nTraining y differences:")
y_train_mask = ~(np.isclose(y_train_indie, y_train_combo) | (np.isnan(y_train_indie) & np.isnan(y_train_combo)))
y_train_diffs = np.argwhere(y_train_mask)
print(f"  total differing elements: {y_train_diffs.shape[0]}")
if y_train_diffs.size:
    rows = np.unique(y_train_diffs[:,0])
    print(f"  unique differing rows: {len(rows)} -> {rows[:20]}")

print("\nTest y differences:")
y_test_mask = ~(np.isclose(y_test_indie, y_test_combo) | (np.isnan(y_test_indie) & np.isnan(y_test_combo)))
y_test_diffs = np.argwhere(y_test_mask)
print(f"  total differing elements: {y_test_diffs.shape[0]}")
if y_test_diffs.size:
    rows = np.unique(y_test_diffs[:,0])
    print(f"  unique differing rows: {len(rows)} -> {rows[:20]}")

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyboardInterrupt: 

In [41]:
import numpy as np

def _extract_y(ds):
    return ds["y"] if isinstance(ds, dict) else getattr(ds, "y")

def _assert_same_shape(A, B, nameA="A", nameB="B"):
    if A.shape != B.shape:
        raise ValueError(f"Shape mismatch: {nameA}.shape={A.shape} vs {nameB}.shape={B.shape}")

def _mismatch_report(A, B, rtol=1e-5, atol=1e-8, max_show=25):
    mask = ~(np.isclose(A, B, rtol=rtol, atol=atol) | (np.isnan(A) & np.isnan(B)))
    idxs = np.argwhere(mask)
    n_diffs = idxs.shape[0]
    cols = np.unique(idxs[:, 1]) if idxs.size else np.array([], dtype=int)
    rows = np.unique(idxs[:, 0]) if idxs.size else np.array([], dtype=int)
    sample = []
    for r, c in idxs[:max_show]:
        sample.append((int(r), int(c), A[r, c], B[r, c]))
    return n_diffs, rows, cols, sample

def compare_indie_vs_combo(
    log_encoder,
    train_log,
    test_log,
    train_dataset,
    test_dataset,
    concatenate_text_feature_datasets,
    k,
):
    X_train_3d, y_train_indie, _ = log_encoder.transform(train_log, for_training=True)
    X_test_3d,  y_test_indie,  _ = log_encoder.transform(test_log,  for_training=True)

    # IMPORTANT: bring the "indie" path into the same flattened, k-limited space
    X_train_indie = log_encoder.transform_tree(X_train_3d, k=k)
    X_test_indie  = log_encoder.transform_tree(X_test_3d,  k=k)

    X_train_combo, feat_train = concatenate_text_feature_datasets(train_dataset, [])
    X_test_combo,  feat_test  = concatenate_text_feature_datasets(test_dataset,  [])

    y_train_combo = _extract_y(train_dataset)
    y_test_combo  = _extract_y(test_dataset)

    print("Shapes:")
    print("  X_train_indie:", X_train_indie.shape, "X_train_combo:", X_train_combo.shape)
    print("  X_test_indie :", X_test_indie.shape,  "X_test_combo :", X_test_combo.shape)
    print("  y_train_indie:", y_train_indie.shape, "y_train_combo:", y_train_combo.shape)
    print("  y_test_indie :", y_test_indie.shape,  "y_test_combo :", y_test_combo.shape)

    _assert_same_shape(X_train_indie, X_train_combo, "X_train_indie", "X_train_combo")
    _assert_same_shape(X_test_indie,  X_test_combo,  "X_test_indie",  "X_test_combo")
    _assert_same_shape(y_train_indie, y_train_combo, "y_train_indie", "y_train_combo")
    _assert_same_shape(y_test_indie,  y_test_combo,  "y_test_indie",  "y_test_combo")

    print("\nTraining X differences:")
    n_diffs_tr, rows_tr, cols_tr, sample_tr = _mismatch_report(X_train_indie, X_train_combo)
    print(f"  total differing elements: {n_diffs_tr}")
    print(f"  unique differing rows: {len(rows_tr)} -> {rows_tr[:20]}")
    print(f"  unique differing cols: {len(cols_tr)} -> {cols_tr[:20]}")
    if sample_tr:
        print("  first mismatches (row, col, indie_val, combo_val):")
        for tup in sample_tr[:10]:
            print("   ", tup)

    print("\nTest X differences:")
    n_diffs_te, rows_te, cols_te, sample_te = _mismatch_report(X_test_indie, X_test_combo)
    print(f"  total differing elements: {n_diffs_te}")
    print(f"  unique differing rows: {len(rows_te)} -> {rows_te[:20]}")
    print(f"  unique differing cols: {len(cols_te)} -> {cols_te[:20]}")
    if sample_te:
        print("  first mismatches (row, col, indie_val, combo_val):")
        for tup in sample_te[:10]:
            print("   ", tup)

    print("\nTraining y differences:")
    y_train_mask = ~(np.isclose(y_train_indie, y_train_combo) | (np.isnan(y_train_indie) & np.isnan(y_train_combo)))
    y_train_diffs = np.argwhere(y_train_mask)
    print(f"  total differing elements: {y_train_diffs.shape[0]}")
    if y_train_diffs.size:
        rows = np.unique(y_train_diffs[:,0])
        print(f"  unique differing rows: {len(rows)} -> {rows[:20]}")

    print("\nTest y differences:")
    y_test_mask = ~(np.isclose(y_test_indie, y_test_combo) | (np.isnan(y_test_indie) & np.isnan(y_test_combo)))
    y_test_diffs = np.argwhere(y_test_mask)
    print(f"  total differing elements: {y_test_diffs.shape[0]}")
    if y_test_diffs.size:
        rows = np.unique(y_test_diffs[:,0])
        print(f"  unique differing rows: {len(rows)} -> {rows[:20]}")

    return {
        "train": {
            "n_diffs": n_diffs_tr,
            "rows": rows_tr,
            "cols": cols_tr,
            "sample": sample_tr,
            "features": feat_train,
        },
        "test": {
            "n_diffs": n_diffs_te,
            "rows": rows_te,
            "cols": cols_te,
            "sample": sample_te,
            "features": feat_test,
        },
    }

res = compare_indie_vs_combo(
    log_encoder=log_encoder,
    train_log=train_log,
    test_log=test_log,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    concatenate_text_feature_datasets=concatenate_text_feature_datasets,
    k=k,
)



Shapes:
  X_train_indie: (44240, 25) X_train_combo: (44240, 25)
  X_test_indie : (10980, 25) X_test_combo : (10980, 25)
  y_train_indie: (44240, 19) y_train_combo: (44240, 19)
  y_test_indie : (10980, 19) y_test_combo : (10980, 19)

Training X differences:
  total differing elements: 0
  unique differing rows: 0 -> []
  unique differing cols: 0 -> []

Test X differences:
  total differing elements: 8972
  unique differing rows: 4486 -> [ 1  3  5  6  9 10 15 19 20 21 23 25 26 27 32 33 36 37 42 46]
  unique differing cols: 15 -> [ 1  2  4  5  6  8  9 10 11 12 13 14 15 16 17]
  first mismatches (row, col, indie_val, combo_val):
    (1, 1, 0.0, 1.0)
    (1, 8, 1.0, 0.0)
    (3, 2, 0.0, 1.0)
    (3, 6, 1.0, 0.0)
    (5, 4, 0.0, 1.0)
    (5, 9, 1.0, 0.0)
    (6, 2, 1.0, 0.0)
    (6, 5, 0.0, 1.0)
    (9, 2, 0.0, 1.0)
    (9, 6, 1.0, 0.0)

Training y differences:
  total differing elements: 0

Test y differences:
  total differing elements: 8546
  unique differing rows: 4273 -> [ 0  2  4  5  8

# Log Analysis

In [22]:
import pm4py
from log_processing import convert_to_datetime
import pandas as pd

log_dir = "data"
folder_name = "mimicel_mini_2000"

log_file = f"{folder_name}.xes"
log_df = pm4py.read_xes(os.path.join(log_dir, log_file))
log_df = convert_to_datetime(log_df, "time:timestamp")


pd.set_option('display.max_rows', None)
print(f"Columns: {log_df.columns.tolist()}")
activities = log_df['concept:name'].unique()
print(f"Activities: {sorted(activities)}")
print(f"Number of unique activities: {len(activities)}")
print(f"Cases: {len(log_df['case:concept:name'].unique())}")
print(f"Total number of events: {len(log_df)}")
print(f"Average number of events per case: {log_df.groupby('case:concept:name').size().mean()}")
print(f"Average event duration: {log_df['time:timestamp'].diff().mean()}")


log_df = log_df[["case:concept:name", "concept:name", "time:timestamp"]]
log_df_sorted = log_df.sort_values(by=['case:concept:name', "time:timestamp"])
print("Event log head:")
print(log_df_sorted.head(100))

# group by case and print first few cases
grouped = log_df_sorted.groupby('case:concept:name')
for case_id, group in list(grouped)[:10]:
    print(f"\nCase ID: {case_id}")
    print(group)

Columns: ['gender', 'name', '@@case_index', 'disposition', 'ndc', '@@index', 'timestamps', 'rhythm', 'resprate', 'etc_rn', 'sbp', 'icd_title', 'med_rn', 'dbp', 'etcdescription', 'case:acuity', 'icd_code', 'gsn', 'time:timestamp', 'icd_version', 'etccode', 'case:concept:name', 'arrival_transport', 'pain', 'case:subject_id', 'case:chiefcomplaint', 'o2sat', 'stay_id', 'race', 'case:hadm_id', 'seq_num', 'gsn_rn', 'heartrate', 'temperature', 'activity', 'concept:name']
Activities: ['Discharge from the ED', 'Enter the ED', 'Medicine dispensations', 'Medicine reconciliation', 'Triage in the ED', 'Vital sign check']
Number of unique activities: 6
Cases: 2000
Total number of events: 27718
Average number of events per case: 13.859
Average event duration: 0 days 00:20:04.299166576
Event log head:
      case:concept:name             concept:name      time:timestamp
11318          30005370             Enter the ED 2110-06-25 05:10:00
11319          30005370         Triage in the ED 2110-06-25 05:10



# Old Code

In [None]:
import os
import argparse

def extract_df(log):
    # Make predictions
    prefix_log = [case[0:prefix_length] for case in log for prefix_length in range(1, len(case) + 1)]
    print("First 10 prefixes:")
    print(prefix_log[:1])
    caseIDs = []
    prefix_lengths = []
    true_next_activities = []
    true_next_times = []
    for case in log:
        caseID = case.attributes["concept:name"]
        for prefix_length in range(1, len(case) + 1):
            caseIDs.append(caseID)
            prefix_lengths.append(prefix_length)
            true_next_activities.append("END" if prefix_length == len(case) else case[prefix_length]["concept:name"])
            true_next_times.append(0 if prefix_length == len(case) else (case[prefix_length]["time:timestamp"].timestamp() -
                                                                            case[prefix_length - 1]["time:timestamp"].timestamp()) / 86400)

    # Generate DataFrame
    column_data = {"caseID": caseIDs, "prefix-length": prefix_lengths, "true-next-activity": true_next_activities,
        "true-next-time": true_next_times,}
    columns = ["caseID", "prefix-length", "true-next-activity", "true-next-time"]
    return pd.DataFrame(column_data, columns=columns)

print("Prepare...")
from pm4py.objects.log.importer.xes import importer as xes_importer
from tapp.tapp_model import TappModel, _get_event_labels
from tapp.log_encoder import LogEncoder
from tapp.text_encoder import BoWTextEncoder
from tapp.text_encoder import BoNGTextEncoder
from tapp.text_encoder import PVTextEncoder
from tapp.text_encoder import LDATextEncoder
from tapp.text_encoder import BERTbaseTextEncoder
from tapp.text_encoder import BERTbaseFineTunedNextActivityTextEncoder
from tapp.text_encoder import BERTbaseFineTunedNextTimeTextEncoder
from tapp.text_encoder import BERTbaseFineTunedNextActivityAndTimeTextEncoder
from tapp.text_encoder import BERTfromScratchTextEncoder
from tapp.text_encoder import BERTAndTokenizerFromScratchTextEncoder
from nltk.tokenize import word_tokenize
from pm4py.algo.filtering.log.variants import variants_filter
from pm4py.objects.conversion.log import converter as log_converter
import datetime
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf

# Workstation
from tensorflow.core.protobuf.config_pb2 import ConfigProto
from tensorflow.python.client.session import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Download text preprocessing resources from nltk
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

if not os.path.exists('./results/'):
    os.makedirs('./results/')

print("Done.")

# Load event data
print("Load event log...")
path = "./data/werk.xes"
variant = xes_importer.Variants.ITERPARSE
parameters = {variant.value.Parameters.TIMESTAMP_SORT: True, variant.value.Parameters.REVERSE_SORT: False}
log = xes_importer.apply(path, variant=variant, parameters=parameters)
print("Done.")


# Analyse log
print("Create log statistics...")
language = "english" 
text_attribute = "question"
traces = len(log)
events = sum(len(case) for case in log)
durations = [(case[-1]["time:timestamp"].timestamp() - case[0]["time:timestamp"].timestamp()) / 86400 for case in log]
docs = [event[text_attribute] for case in log for event in case if text_attribute in event]
words = [word for doc in docs for word in word_tokenize(doc, language=language)]
docs_filtered = BoWTextEncoder().preprocess_docs(docs, as_list=False)
words_filtered = [word for doc in docs_filtered for word in word_tokenize(doc, language=language)]

log_info = pd.DataFrame(
    [[path,
      traces,
      len(variants_filter.get_variants(log)),
      events,
      events / traces,
      np.median(durations),
      np.mean(durations),
      len(list(dict.fromkeys([event["concept:name"] for case in log for event in case])) if log else []),
      len(words),
      len(words_filtered),
      len(set(words)),
      len(set(words_filtered))]],
    columns=["log", "cases", "trace variants", "events", "events per trace", "median case duration",
             "mean case duration", "activities", "words pre filtering", "words post filtering",
             "vocabulary pre filtering", "vocabulary post filtering"]
)

log_info.to_csv("./results/log_info.csv", index=False, sep=";")
print("Done.")

# Split data in train and test log
split = len(log) // 5 * 4
train_log = log[:split]
test_log = log[split:]

# Configure and build model variants
language = "english"

# configure text base for training:
# 'event' -> treat text attributes as event attributes
# 'prefix' -> use concatenation of text attributes from events
text_base_for_training = 'event'

text_models = [
    # --- Baselines from Pegoraro et al. ---
    None,
    BoWTextEncoder(encoding_length=50, language=language),
    BoWTextEncoder(encoding_length=100, language=language),
    BoWTextEncoder(encoding_length=500, language=language),
    BoNGTextEncoder(n=2, encoding_length=50, language=language),
    BoNGTextEncoder(n=2, encoding_length=100, language=language),
    BoNGTextEncoder(n=2, encoding_length=500, language=language),
    PVTextEncoder(encoding_length=10, language=language),
    PVTextEncoder(encoding_length=20, language=language),
    PVTextEncoder(encoding_length=100, language=language),
    LDATextEncoder(encoding_length=10, language=language),
    LDATextEncoder(encoding_length=20, language=language),
    LDATextEncoder(encoding_length=100, language=language),

    # --- TAPPBERT ---
    # Pre-trained BERT
    BERTbaseTextEncoder(encoding_length=768, language=language),
    # Pre-trained + fine-tuned BERT
    # (1) fine-tuned toward next activity prediction
    BERTbaseFineTunedNextActivityTextEncoder(encoding_length=768, language=language, epochs=16, lr=5e-5),
    # (2) fine-tuned toward next timestamp prediction
    BERTbaseFineTunedNextTimeTextEncoder(encoding_length=768, language=language, epochs=16, lr=5e-5),
    # (3) concat. embeddings of BERT fine-tuned toward next activity + next timestamp prediction
    BERTbaseFineTunedNextActivityAndTimeTextEncoder(encoding_length=768, language=language, epochs=16, lr=5e-5),
    # BERT trained from scratch
    # (1) tokenizer is pre-trained
    BERTfromScratchTextEncoder(encoding_length=36, language=language),
    BERTfromScratchTextEncoder(encoding_length=768, language=language),
    # (2) tokenizer is trained from scratch
    BERTAndTokenizerFromScratchTextEncoder(encoding_length=36, language=language, vocab_size=1000),
    BERTAndTokenizerFromScratchTextEncoder(encoding_length=768, language=language, vocab_size=1000),
]

text_models = [
    BERTbaseFineTunedNextActivityTextEncoder(encoding_length=768, language=language, epochs=16, lr=5e-5),
]

if BERTfromScratchTextEncoder in text_models or BERTAndTokenizerFromScratchTextEncoder in text_models:
    text_data_path = "../../datasets/questions.txt"
    if not os.path.exists(text_data_path):
        # extract text and store in separate file to be used during pretraining BERT from scratch
        df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
        txt = df["question"].dropna().values.tolist()
        with open(text_data_path, "w", encoding="utf-8") as output:
            for doc in txt:
                sentences = nltk.tokenize.sent_tokenize(doc)
                for sentence in sentences:
                    output.write(sentence)
                    output.write('\n')
                output.write('\n')


shared_layers = [1]
special_layers = [1]
neurons = [100]
data_attributes_list = [[]]
iterations = 1

print("Extract DataFrame for evaluation...")
df = extract_df(log)
print(df.head(20))

print("Evaluate prediction models...")
print("This might take a while...")
for text_model in text_models:
    for shared_layer in shared_layers:
        for special_layer in special_layers:
            for neuron in neurons:
                for data_attributes in data_attributes_list:
                    if shared_layer + special_layer == 0:
                        pass
                    else:
                        log_encoder = LogEncoder(text_encoder=text_model, advanced_time_attributes=True,
                                                 text_base_for_training=text_base_for_training)
                        model = TappModel(log_encoder=log_encoder, num_shared_layer=shared_layer,
                                          num_specialized_layer=special_layer, neurons_per_layer=neuron, dropout=0.2,
                                          learning_rate=0.001)
                        model.activities = _get_event_labels(log, "concept:name")
                        log_encoder.fit(log, activities=model.activities, data_attributes=data_attributes,
                                        text_attribute=text_attribute)
                        for iteration in range(iterations):
                            model.fit(train_log, data_attributes=data_attributes, text_attribute=text_attribute, epochs=25)
                            model.evaluate(test_log, "results.csv", num_prefixes=8)
print("Done. Evaluation completed.")

Prepare...
Done.
Load event log...


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


parsing log, completed traces ::   0%|          | 0/15001 [00:00<?, ?it/s]

Done.
Create log statistics...


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Done.
Extract DataFrame for evaluation...
First 10 prefixes:
[[{'lifecycle:transition': 'complete', 'time:timestamp': datetime.datetime(2015, 7, 1, 8, 30, 31, tzinfo=datetime.timezone.utc), '(case)_creator': 'Fluxicon Disco', '(case)_variant': 'Variant 261', '(case)_variant-index': '261', 'age': '18-29', 'concept:name': 'question', 'customer_id': '2046012', 'gender': 'V', 'office_u': '296', 'office_w': '269', 'question': 'Application/Registration WW'}]]
      caseID  prefix-length     true-next-activity  true-next-time
0   18684438              1               question        0.002662
1   18684438              2           aanvragen-ww        0.060613
2   18684438              3                   home        0.030463
3   18684438              4           aanvragen-ww        0.000243
4   18684438              5                   home        0.008553
5   18684438              6                    END        0.000000
6    9706911              1                  taken        0.570220
7    9

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading 

IndexError: index 21 is out of bounds for axis 0 with size 21