# Approximation through Distillation

## Initialisation

In [2]:
%load_ext autoreload
%autoreload 2

import logging
import os
import sys
import subprocess
import pkg_resources

def install_missing_requirements(requirements_path):
    """Install every package listed in a requirements file that is not yet installed.

    Blank lines and comment lines (also indented ones) are ignored, and trailing
    ``# ...`` comments are stripped. A requirement counts as installed when its
    project name, normalised as described in PEP 503 (case-insensitive, with
    ``-``, ``_`` and ``.`` treated as equal), matches an installed distribution.
    Version constraints are not verified; only presence is checked.

    When ``CONDA_PREFIX`` is set, each missing package is first installed with
    conda and, if that fails, with pip. Otherwise all missing packages are
    installed with a single pip call.

    Args:
        requirements_path: Path to a pip-style requirements file.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        subprocess.CalledProcessError: If a pip installation fails.
    """
    # Imported locally so the function does not rely on extra module-level
    # imports; importlib.metadata replaces the deprecated pkg_resources API.
    import re
    from importlib import metadata

    if not os.path.exists(requirements_path):
        print(f"Requirements file '{requirements_path}' not found.")
        return

    def requirement_name(requirement):
        # Cut the project name off before any specifier, extra, marker or URL
        # part, e.g. "foo>=1.0", "foo[bar]", "foo; python_version<'3.9'".
        return re.split(r"[\s<>=!~;\[@]", requirement, maxsplit=1)[0]

    def normalize(name):
        # PEP 503 normalisation: "Sentence_Transformers" and
        # "sentence-transformers" must compare equal.
        return re.sub(r"[-_.]+", "-", name).lower()

    required = []
    with open(requirements_path, encoding="utf-8") as f:
        for raw_line in f:
            # Strip first so indented comments are recognised, then drop
            # trailing inline comments.
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            line = line.split(" #", 1)[0].strip()
            if line:
                required.append(line)

    installed = set()
    for dist in metadata.distributions():
        dist_metadata = dist.metadata
        dist_name = dist_metadata.get("Name") if dist_metadata is not None else None
        if dist_name:  # skip distributions with broken metadata
            installed.add(normalize(dist_name))

    missing = [
        req for req in required
        if normalize(requirement_name(req)) not in installed
    ]

    if not missing:
        print("All required packages are already installed.")
        return

    print(f"Installing missing packages: {missing}")

    # Check if running inside conda
    conda_prefix = os.environ.get("CONDA_PREFIX")
    if conda_prefix:
        print("Detected conda environment. Trying to use conda first...")
        for pkg in missing:
            pkg_name = requirement_name(pkg)
            try:
                subprocess.check_call(["conda", "install", "-y", pkg_name])
            except (subprocess.CalledProcessError, FileNotFoundError):
                # FileNotFoundError: CONDA_PREFIX is set but the conda
                # executable is not on PATH; pip is still a valid fallback.
                print(f"Package '{pkg_name}' not found in conda. Falling back to pip.")
                subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    else:
        # Not in conda, use pip directly
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])


def update_repository():
    """Run ``git pull`` in the current working directory and report the outcome.

    The captured stdout of a successful pull is printed. If git exits with a
    non-zero status, the failure is reported together with git's stderr
    instead of raising.
    """
    print("Updating the git repository...")
    pull_command = ["git", "pull"]
    try:
        completed = subprocess.run(
            pull_command,
            check=True,
            text=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as pull_error:
        print("Git pull failed:")
        print(pull_error.stderr)
    else:
        print(completed.stdout)

# Notebook bootstrap. The statement order matters: the working directory must
# be the project root before 'src' is added to sys.path and before the
# project-local imports below are resolved.

# install missing requirements
install_missing_requirements("/home/jovyan/requirements_reddit.txt")

# set cwd to the project root
# NOTE(review): os.getcwd() usually returns no trailing slash while `wd` has
# one, so this comparison is presumably always true; the chdir is harmless.
cwd = os.getcwd()
wd = '/home/jovyan/reddit-mining/'
if cwd != wd:
    os.chdir(wd)

# update the git repository
update_repository()

# make the project's src directory importable (relative to the cwd set above;
# this extends sys.path of this kernel only, not the PYTHONPATH variable)
sys.path.append('src')

# set up logging
from logger import setup_logger
from pm4py.objects.log.importer.xes import importer as xes_importer
#logger = setup_logger(level=logging.INFO)
logger = setup_logger(level=logging.DEBUG)
# presumably silences TensorFlow's native INFO/WARNING output -- TODO confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


# XES import settings shared by all later cells (`variant` and `parameters`
# are passed to every xes_importer.apply call in this notebook).
variant = xes_importer.Variants.ITERPARSE
parameters = {
    #variant.value.Parameters.TIMESTAMP_SORT: True,
    #variant.value.Parameters.REVERSE_SORT: False,
}


  import pkg_resources


Installing missing packages: ['sentence_transformers']
Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Updating the git repository...
Already up to date.



## Create Log Statistics

In [None]:
from tapp.text_encoder import BoWTextEncoder, BERTbaseTextEncoder
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np

# Collect descriptive statistics (size, durations, vocabulary) for every event
# log in `folder_names` and write them to ./results/log_info.csv, one row per log.
folder_names = ["bpi_2012_enriched_filtered_A", "werk"]

log_info_columns = [
    "log", "cases", "events", "events per trace", "median case duration",
    "mean case duration", "activities", "words pre filtering", "words post filtering",
    "vocabulary pre filtering", "vocabulary post filtering",
]
# One entry per log. The CSV is written once after the loop; writing inside
# the loop would overwrite the file and keep only the last log's statistics.
log_info_rows = []

print("Load event log...")

print("Create log statistics...")
for folder_name in folder_names:
    if folder_name == "werk":
        path = "./data/werk.xes"
        data_attributes = ["age", "gender"]
        text_attribute = "question"
        text_models = [BERTbaseTextEncoder(encoding_length=768, language="english")]
    elif "bpi" in folder_name:
        path = f"./data/{folder_name}.xes"
        data_attributes = []
        if "special" in folder_name:
            text_attribute = "binary_flag"
        else:
            text_attribute = "text"
        text_models = [BERTbaseTextEncoder(encoding_length=768, language="english")]
    else:
        # Fail loudly instead of silently reusing the previous log's settings.
        raise ValueError(f"Unknown folder name: {folder_name!r}")
    log = xes_importer.apply(path, variant=variant, parameters=parameters)

    language = "english"
    traces = len(log)
    events = sum(len(case) for case in log)
    # Case duration in days: last minus first event timestamp (seconds / 86400).
    durations = [(case[-1]["time:timestamp"].timestamp() - case[0]["time:timestamp"].timestamp()) / 86400 for case in log]
    # Only events that actually carry the text attribute contribute documents.
    docs = [event[text_attribute] for case in log for event in case if text_attribute in event]
    words = [word for doc in docs for word in word_tokenize(doc, language=language)]
    docs_filtered = BoWTextEncoder().preprocess_docs(docs, as_list=False)
    words_filtered = [word for doc in docs_filtered for word in word_tokenize(doc, language=language)]

    log_info_rows.append([
        path,
        traces,
        events,
        events / traces if traces else float("nan"),  # empty log: no average
        np.median(durations),
        np.mean(durations),
        len({event["concept:name"] for case in log for event in case}),
        len(words),
        len(words_filtered),
        len(set(words)),
        len(set(words_filtered)),
    ])

os.makedirs("./results", exist_ok=True)
log_info = pd.DataFrame(log_info_rows, columns=log_info_columns)
log_info.to_csv("./results/log_info.csv", index=False, sep=";")
print("Done.")

Load event log...
Create log statistics...


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: 12 columns passed, passed data had 11 columns

## Create the Training Data

In [10]:
update_repository()
# from pm4py.objects.log.importer.xes import importer as xes_importer
from tapp.tapp_model import _get_event_labels, TappModel
from tapp.log_encoder import LogEncoder
from tapp.tapp_model import TappModel, _get_event_labels
from tapp.text_encoder import BoWTextEncoder
from tapp.text_encoder import BoNGTextEncoder
from tapp.text_encoder import LDATextEncoder
from tapp.text_encoder import BERTbaseTextEncoder
from tapp.text_encoder import BERTbaseFineTunedNextActivityTextEncoder
from distillation import get_distillation_paths
import os
import numpy as np
import sys
from pm4py.objects.log.importer.xes import importer as xes_importer

# Train the teacher model `runs` times per event log and store its predicted
# next-activity outputs for the train and test split as .npy files. These
# files are the distillation targets loaded by the evaluation cell.
runs = 3
# Logs to process. Other logs used with this notebook:
# "bpi_2012_enriched_special_filtered_A", "bpi_2012_enriched_event",
# "bpi_2012_enriched_filtered_A".
folder_names = ["werk"]
# When True, existing distillation files are recomputed and overwritten.
force_recompute = True

# Teacher hyperparameters currently under test. The original TAPPBERT setup
# used 1 shared layer, 1 specialized layer, 100 neurons and 25 epochs.
shared_layer = 2
special_layer = 1
neuron = 150
epochs = 50


for folder_name in folder_names:
    print("Processing folder:", folder_name)

    if folder_name == "werk":
        path = "./data/werk.xes"
        data_attributes = ["age", "gender"]
        text_attribute = "question"
        text_models = [BERTbaseTextEncoder(encoding_length=768, language="english")]
    elif "bpi" in folder_name:
        path = f"./data/{folder_name}.xes"
        data_attributes = []
        if "special" in folder_name:
            text_attribute = "binary_flag"
        else:
            text_attribute = "text"
        text_models = [BERTbaseTextEncoder(encoding_length=768, language="english")]
    else:
        # Fail loudly instead of silently reusing settings left over from a
        # previous iteration or an earlier cell.
        raise ValueError(f"Unknown folder name: {folder_name!r}")

    print("usingtextattribute:", text_attribute)
    log = xes_importer.apply(path, variant=variant, parameters=parameters)
    activities = _get_event_labels(log, "concept:name")
    class_names = _get_event_labels(log, "concept:name")
    class_names.append("END")
    # 80/20 split by position in the log: first four fifths train, rest test.
    split = len(log) // 5 * 4
    train_log = log[:split]
    test_log = log[split:]

    for text_model in text_models:
        print(f"Using text model: {text_model.name} with encoding length {text_model.encoding_length}")

        # initialize and fit the log encoder
        log_encoder = LogEncoder(
            text_encoder=text_model,
            advanced_time_attributes=True,
            text_base_for_training="event",
        )
        print("Fitting log encoder...")
        # NOTE(review): the encoder is fitted on the full log, i.e. including
        # the cases that later form the test split -- confirm this is intended.
        log_encoder.fit(
            log,
            activities=activities,
            data_attributes=data_attributes,
            text_attribute=text_attribute,
        )

        print("Transforming training data...")
        X_train, y_train_act, y_train_time = log_encoder.transform(train_log, for_training=True)
        print("X_train shape:", X_train.shape)
        print("Transforming test data...")
        X_test, y_test, _ = log_encoder.transform(test_log, for_training=True)
        print("X_test shape:", X_test.shape)

        for run_id in range(runs):
            distillation_path_train, distillation_path_test = get_distillation_paths(folder_name, text_model, data_attributes, text_attribute, run_id)
            if os.path.exists(distillation_path_train) and os.path.exists(distillation_path_test) and not force_recompute:
                print("Skipping run", run_id, "as distilled training data already exists at:", distillation_path_train)
                continue

            # Make sure the target directories exist *before* the expensive
            # training, so a missing folder cannot make np.save fail afterwards.
            for target_path in (distillation_path_train, distillation_path_test):
                os.makedirs(os.path.dirname(target_path) or ".", exist_ok=True)

            print(f"Starting run {run_id}...")
            # tappbert training and evaluation
            tapp_model = TappModel(
                log_encoder=log_encoder,
                num_shared_layer=shared_layer,
                num_specialized_layer=special_layer,
                neurons_per_layer=neuron,
                dropout=0.2,
                learning_rate=0.001,
                use_lr_reduction=True,
            )
            tapp_model.activities = activities
            print("Training TAPPBERT model...")
            tapp_model.fit_with_ready_data(X_train, y_train_act, y_train_time, epochs=epochs)

            # Only the first model output is stored as the distillation target
            # (the model is fitted on an activity and a time target above).
            y_train_distilled = tapp_model.model.predict(X_train)[0]
            np.save(distillation_path_train, y_train_distilled)
            print("Saved distilled training data to:", distillation_path_train)
            y_test_distilled = tapp_model.model.predict(X_test)[0]
            np.save(distillation_path_test, y_test_distilled)
            print("Saved distilled test data to:", distillation_path_test)

Updating the git repository...
Already up to date.

Processing folder: werk
usingtextattribute: question


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


parsing log, completed traces ::   0%|          | 0/15001 [00:00<?, ?it/s]

Using text model: BERTbase with encoding length 768
Fitting log encoder...
Event encoding length: 795
feature_dim components: activity_encoding_length = 18 , time_encoding_length = 3 , categorical_attributes_encoding_length = 6 , numerical_attributes_encoding_length = 0 , text_encoding_length = 768
categorical_attributes_values: [['50-65', '30-39', '40-49', '18-29'], ['M', 'V']]
Number of documents for text encoder training: 21750
Number of unique documents: 764
Sample documents: ['How can I add a document/share with my consultant work through the workbook?', 'Filling: What should I do if I made a mistake when filling out the Income Problem?', 'When is/are transferred my unemployment benefits?', 'General: Can you answer my question (UWV colleague)?', 'What are the consequences if I want to stop my unemployment benefit themselves?']
self.text_encoder type: <class 'tapp.text_encoder.BERTbaseTextEncoder'>
Transforming training data...
Encoding log with 12000 cases...
vecs shape: (709, 768

Encoding log: 100%|██████████| 12000/12000 [00:01<00:00, 7032.47case/s]


X_train shape: (44241, 10, 795)
Transforming test data...
Encoding log with 3001 cases...
vecs shape: (460, 768)
target_dim: 768
vecs sample: [[-9.064173  -7.279898  -8.656439  ... -5.796433  -7.662382   6.61687  ]
 [ 2.1863418 -2.5088878 -2.763081  ... -2.3167014  0.4521589  4.0149517]
 [ 0.2531448 -7.193407  -3.79432   ... -1.5682118 -0.5932338  4.415711 ]]


Encoding log: 100%|██████████| 3001/3001 [00:00<00:00, 7608.67case/s]


X_test shape: (10979, 10, 795)
Starting run 0...
Training TAPPBERT model...
building model with timesteps = 10 and feature_dim = 795
start fitting...
Epoch 1/50
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 18ms/step - loss: 1.9136 - next_activity_output_categorical_accuracy: 0.3924 - next_activity_output_loss: 1.7959 - next_timestamp_output_loss: 0.1177 - next_timestamp_output_mean_absolute_error: 0.1177 - val_loss: 1.8501 - val_next_activity_output_categorical_accuracy: 0.4111 - val_next_activity_output_loss: 1.7816 - val_next_timestamp_output_loss: 0.0691 - val_next_timestamp_output_mean_absolute_error: 0.0692 - learning_rate: 0.0010
Epoch 2/50
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 17ms/step - loss: 1.8267 - next_activity_output_categorical_accuracy: 0.4057 - next_activity_output_loss: 1.7490 - next_timestamp_output_loss: 0.0777 - next_timestamp_output_mean_absolute_error: 0.0777 - val_loss: 1.8274 - val_next_activity_output_cate

## Evaluation

In [None]:
update_repository()
from pm4py.objects.log.importer.xes import importer as xes_importer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from tapp.text_encoder import BoWTextEncoder, BoNGTextEncoder, LDATextEncoder, BERTbaseTextEncoder
from tapp.log_encoder import LogEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from tapp.tapp_model import _get_event_labels
from distillation import (
    get_distillation_paths,
    get_evaluation_paths,
    evaluate_distillation,
    analyze_text_splits,
    prepare_text_feature_datasets,
    concatenate_text_feature_datasets,
    get_feature_datasets,
    save_evaluation_results,
    tree_to_str,
)
import numpy as np
import sys
import pickle
import itertools

def get_student_model(target_type="original", ccp_alpha=0.00001, random_state=0, **kwargs):
    """Build the decision-tree student that matches the kind of training target.

    Args:
        target_type: ``"soft"`` selects a ``DecisionTreeRegressor``; every
            other value selects a ``DecisionTreeClassifier``.
        ccp_alpha: Cost-complexity pruning parameter passed to the tree.
        random_state: Seed passed to the tree.
        **kwargs: Additional keyword arguments forwarded to the tree constructor.

    Returns:
        An unfitted tree estimator of the selected type.
    """
    tree_cls = DecisionTreeRegressor if target_type == "soft" else DecisionTreeClassifier
    tree_params = dict(random_state=random_state, ccp_alpha=ccp_alpha)
    return tree_cls(**tree_params, **kwargs)

# ----------------------------
# Experiment configuration
# ----------------------------
# Text encoders whose features are offered to the student trees.
text_models = [
    BoWTextEncoder(encoding_length=50, language="english"),
    BoNGTextEncoder(n=2, encoding_length=50, language="english"),
    LDATextEncoder(encoding_length=10, language="english"),
]
# Each entry is one feature configuration: the subset of text encoders whose
# features are concatenated for the student ([] = no text-encoder features).
model_names = [[], ["BoW"], ["BoNG"], ["LDA"], ["BoW", "BoNG", "LDA"]]
runs = 3
start_alpha = 0.00001
end_alpha = 0.001
skip_pruning = False
# Logs to evaluate. Another log used with this notebook:
# "bpi_2012_enriched_special_filtered_A".
folder_names = ["bpi_2012_enriched_filtered_A", "werk"]

# ----------------------------
# Load distilled labels
# ----------------------------
for folder_name in folder_names:
    print("Evaluating folder:", folder_name)
    if folder_name == "werk":
        path = "./data/werk.xes"
        data_attributes = ["age", "gender"]
        text_attribute = "question"
        k = 10
    elif "bpi" in folder_name:
        path = f"./data/{folder_name}.xes"
        data_attributes = []
        text_attribute = "binary_flag" if "special" in folder_name else "text"
        k = 8
    else:
        # Fail loudly instead of silently reusing settings left over from a
        # previous iteration or an earlier cell.
        raise ValueError(f"Unknown folder name: {folder_name!r}")
    # Teacher text encoder; used here only to locate the distillation files.
    text_model_tapp = BERTbaseTextEncoder(encoding_length=768, language="english")

    log = xes_importer.apply(path, variant=variant, parameters=parameters)
    activities = _get_event_labels(log, "concept:name")
    class_names = _get_event_labels(log, "concept:name")
    class_names.append("END")
    split = len(log) // 5 * 4
    print("Log split at index:", split)
    train_log = log[:split]
    test_log = log[split:]

    # The feature datasets depend only on the folder, not on the run or on the
    # selected text-model subset, so they are built once per folder instead of
    # once per (run, subset) combination.
    train_dataset, test_dataset = get_feature_datasets(
        folder_name,
        text_models,
        log,
        train_log,
        test_log,
        k=k,
        data_attributes=data_attributes,
        text_attribute=text_attribute,
    )
    # Ground-truth class indices; "y" is reduced with argmax over axis 1.
    y_train = train_dataset["y"].argmax(axis=1)
    y_test = test_dataset["y"].argmax(axis=1)

    for run_id in range(runs):
        print(f"Loading distilled labels for run {run_id}...")
        distillation_path_train, distillation_path_test = get_distillation_paths(folder_name, text_model_tapp, data_attributes, text_attribute, run_id)
        print("Distillation paths:", distillation_path_train, distillation_path_test)
        y_train_distilled = np.load(distillation_path_train)
        y_test_distilled = np.load(distillation_path_test)
        print(f"shape of y_train_distilled: {y_train_distilled.shape}, shape of y_test_distilled: {y_test_distilled.shape}")
        # Keep the raw teacher outputs as "soft" targets; the "distilled"
        # targets are the teacher's hard decisions (argmax per sample).
        y_train_soft = y_train_distilled.copy()
        y_test_soft = y_test_distilled.copy()
        y_train_distilled = y_train_distilled.argmax(axis=1)
        y_test_distilled = y_test_distilled.argmax(axis=1)

        # ----------------------------
        # Evaluation loop
        # ----------------------------

        for model_names_subset in model_names:
            print(f"Using text models: {model_names_subset}")
            X_train, features = concatenate_text_feature_datasets(train_dataset, model_names_subset)
            X_test, _ = concatenate_text_feature_datasets(test_dataset, model_names_subset)
            print("Transformed training data shape:", X_train.shape)
            print("Transformed test data shape:", X_test.shape)

            # evaluate baseline: the teacher's own test predictions against
            # the ground truth (consistency with itself is 1.0 by definition)
            meta = dict(
                folder_name=folder_name,
                description="baseline",
                data_attributes=data_attributes,
                text_attribute=text_attribute,
                model_names=model_names_subset,
                k=k,
                run_id=run_id,
            )

            acc_tp = accuracy_score(y_test, y_test_distilled)
            f1_score_tp = f1_score(y_test, y_test_distilled, average="weighted")
            print(f"Tappbert baseline: acc - {acc_tp:.4f}, f1 - {f1_score_tp:.4f}")
            results = {"accuracy": acc_tp, "f1_score": f1_score_tp, "con_accuracy": 1.0, "con_f1_score": 1.0, "num_nodes": 0, "max_depth": 0, "ccp_alpha": 0.0, "avg_path_length": 0.0}

            meta.update(results)

            save_evaluation_results(**meta)

            for version, train_labels in [
                    #("original", y_train),
                    ("soft", y_train_soft),
                    ("distilled", y_train_distilled),
                ]:
                print(f"Evaluating version: {version}")
                desc = version
                model_path, model_str_path, model_features_path, model_y_path = get_evaluation_paths(
                    folder_name,
                    version,
                    model_names_subset=model_names_subset,
                    data_attributes=data_attributes,
                    text_attribute=text_attribute,
                    run_id=run_id,
                )

                # "soft" targets get a regression tree, all others a classifier.
                student = get_student_model(target_type=version)

                print(X_train.shape, train_labels.shape)
                # Train + evaluate
                results = evaluate_distillation(
                    student,
                    X_train,
                    X_test,
                    train_labels,
                    y_test,
                    y_test_distilled,
                    description=desc,
                    y_save_path=model_y_path,
                    model_save_path=model_path,
                    start_alpha=start_alpha,
                    end_alpha=end_alpha,
                    skip_pruning=skip_pruning,
                )

                # Save results (key order kept identical to the baseline row)
                meta = dict(
                    folder_name=folder_name,
                    description=desc,
                    data_attributes=data_attributes,
                    text_attribute=text_attribute,
                    model_names=model_names_subset,
                    k=k,
                    run_id=run_id,
                )

                meta.update(results)

                save_evaluation_results(**meta)


print("Analysis complete.")
print("Done and dusted!")

Updating the git repository...
Already up to date.

Evaluating folder: bpi_2012_enriched_filtered_A


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

Log split at index: 10468
Loading distilled labels for run 0...
Distillation paths: data/distillation/bpi_2012_enriched_filtered_A/y_BERTbase_768_None_text_0_train.npy data/distillation/bpi_2012_enriched_filtered_A/y_BERTbase_768_None_text_0_test.npy
shape of y_train_distilled: (49126, 11), shape of y_test_distilled: (11723, 11)
Using text models: []
Transformed training data shape: (49126, 104)
Transformed test data shape: (11723, 104)
Tappbert baseline: acc - 0.7896, f1 - 0.7868
Evaluating version: soft
(49126, 104) (49126, 11)
skip_pruning False
Ccp alpha paths: 25960
Ccp alpha paths, after filtering: 16


CCP pruning search:   0%|          | 0/16 [00:00<?, ?it/s]

soft: acc - 0.7677, f1 - 0.7303, con_acc - 0.8734, con_f1 - 0.8374
Selected ccp_alpha: 2.4948467645463673e-05
Number of nodes: 33
Max depth: 10
Evaluating version: distilled
(49126, 104) (49126,)
skip_pruning False
Ccp alpha paths: 1541
Ccp alpha paths, after filtering: 1539


CCP pruning search:   0%|          | 0/1539 [00:00<?, ?it/s]

distilled: acc - 0.7676, f1 - 0.7303, con_acc - 0.8734, con_f1 - 0.8374
Selected ccp_alpha: 0.000139066857697742
Number of nodes: 31
Max depth: 10
Using text models: ['BoW']
Transformed training data shape: (49126, 504)
Transformed test data shape: (11723, 504)
Tappbert baseline: acc - 0.7896, f1 - 0.7868
Evaluating version: soft
(49126, 504) (49126, 11)
skip_pruning False
Ccp alpha paths: 28583
Ccp alpha paths, after filtering: 26


CCP pruning search:   0%|          | 0/26 [00:00<?, ?it/s]

soft: acc - 0.7737, f1 - 0.7308, con_acc - 0.8889, con_f1 - 0.8563
Selected ccp_alpha: 1.054222327139895e-05
Number of nodes: 55
Max depth: 11
Evaluating version: distilled
(49126, 504) (49126,)
skip_pruning False
Ccp alpha paths: 859
Ccp alpha paths, after filtering: 858


CCP pruning search:   0%|          | 0/858 [00:00<?, ?it/s]

distilled: acc - 0.7940, f1 - 0.7901, con_acc - 0.9282, con_f1 - 0.9278
Selected ccp_alpha: 0.00011698855132231559
Number of nodes: 283
Max depth: 24
Using text models: ['BoNG']
Transformed training data shape: (49126, 504)
Transformed test data shape: (11723, 504)
Tappbert baseline: acc - 0.7896, f1 - 0.7868
Evaluating version: soft
(49126, 504) (49126, 11)
skip_pruning False
Ccp alpha paths: 27362
Ccp alpha paths, after filtering: 26


CCP pruning search:   0%|          | 0/26 [00:00<?, ?it/s]

soft: acc - 0.7801, f1 - 0.7654, con_acc - 0.9029, con_f1 - 0.8946
Selected ccp_alpha: 1.0596483121131335e-05
Number of nodes: 53
Max depth: 12
Evaluating version: distilled
(49126, 504) (49126,)
skip_pruning False
Ccp alpha paths: 1116
Ccp alpha paths, after filtering: 1115


CCP pruning search:   0%|          | 0/1115 [00:00<?, ?it/s]

distilled: acc - 0.7919, f1 - 0.7913, con_acc - 0.9186, con_f1 - 0.9191
Selected ccp_alpha: 0.00017119872736414432
Number of nodes: 121
Max depth: 17
Using text models: ['LDA']
Transformed training data shape: (49126, 184)
Transformed test data shape: (11723, 184)
Tappbert baseline: acc - 0.7896, f1 - 0.7868
Evaluating version: soft
(49126, 184) (49126, 11)
skip_pruning False
Ccp alpha paths: 28369
Ccp alpha paths, after filtering: 26


CCP pruning search:   0%|          | 0/26 [00:00<?, ?it/s]

soft: acc - 0.7721, f1 - 0.7297, con_acc - 0.8839, con_f1 - 0.8512
Selected ccp_alpha: 1.0767096483898792e-05
Number of nodes: 53
Max depth: 11
Evaluating version: distilled
(49126, 184) (49126,)
skip_pruning False
Ccp alpha paths: 1140
Ccp alpha paths, after filtering: 1139


CCP pruning search:   0%|          | 0/1139 [00:00<?, ?it/s]

distilled: acc - 0.7673, f1 - 0.7465, con_acc - 0.8674, con_f1 - 0.8531
Selected ccp_alpha: 0.00035000649973367364
Number of nodes: 77
Max depth: 12
Using text models: ['BoW', 'BoNG', 'LDA']
Transformed training data shape: (49126, 984)
Transformed test data shape: (11723, 984)
Tappbert baseline: acc - 0.7896, f1 - 0.7868
Evaluating version: soft
(49126, 984) (49126, 11)
skip_pruning False
Ccp alpha paths: 28971
Ccp alpha paths, after filtering: 26


CCP pruning search:   0%|          | 0/26 [00:00<?, ?it/s]

soft: acc - 0.7737, f1 - 0.7308, con_acc - 0.8889, con_f1 - 0.8563
Selected ccp_alpha: 1.0542223271399045e-05
Number of nodes: 55
Max depth: 11
Evaluating version: distilled
(49126, 984) (49126,)
skip_pruning False
Ccp alpha paths: 805
Ccp alpha paths, after filtering: 804


CCP pruning search:   0%|          | 0/804 [00:00<?, ?it/s]

distilled: acc - 0.7894, f1 - 0.7857, con_acc - 0.9151, con_f1 - 0.9144
Selected ccp_alpha: 7.313959198250084e-05
Number of nodes: 1083
Max depth: 26
Loading distilled labels for run 1...
Distillation paths: data/distillation/bpi_2012_enriched_filtered_A/y_BERTbase_768_None_text_1_train.npy data/distillation/bpi_2012_enriched_filtered_A/y_BERTbase_768_None_text_1_test.npy
shape of y_train_distilled: (49126, 11), shape of y_test_distilled: (11723, 11)
Using text models: []
Transformed training data shape: (49126, 104)
Transformed test data shape: (11723, 104)
Tappbert baseline: acc - 0.7894, f1 - 0.7930
Evaluating version: soft
(49126, 104) (49126, 11)
skip_pruning False
Ccp alpha paths: 24025
Ccp alpha paths, after filtering: 19


CCP pruning search:   0%|          | 0/19 [00:00<?, ?it/s]

soft: acc - 0.7769, f1 - 0.7408, con_acc - 0.8398, con_f1 - 0.7978
Selected ccp_alpha: 3.4410822100691786e-05
Number of nodes: 39
Max depth: 13
Evaluating version: distilled
(49126, 104) (49126,)
skip_pruning False
Ccp alpha paths: 1522
Ccp alpha paths, after filtering: 1519


CCP pruning search:   0%|          | 0/1519 [00:00<?, ?it/s]

distilled: acc - 0.7478, f1 - 0.7200, con_acc - 0.8754, con_f1 - 0.8369
Selected ccp_alpha: 0.0001798674382041199
Number of nodes: 29
Max depth: 10
Using text models: ['BoW']
Transformed training data shape: (49126, 504)
Transformed test data shape: (11723, 504)
Tappbert baseline: acc - 0.7894, f1 - 0.7930
Evaluating version: soft
(49126, 504) (49126, 11)
skip_pruning False
Ccp alpha paths: 26151
Ccp alpha paths, after filtering: 44


CCP pruning search:   0%|          | 0/44 [00:00<?, ?it/s]

soft: acc - 0.7971, f1 - 0.8006, con_acc - 0.9237, con_f1 - 0.9232
Selected ccp_alpha: 1.0003964602014569e-05
Number of nodes: 93
Max depth: 15
Evaluating version: distilled
(49126, 504) (49126,)
skip_pruning False
Ccp alpha paths: 879
Ccp alpha paths, after filtering: 878


CCP pruning search:   0%|          | 0/878 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Results

In [4]:
import csv
import os
import sys

# Copy the evaluation results to a second CSV, keeping the header but dropping
# the first K data rows.
folder_path = "./data/distillation"
input_name = "evaluation_results.csv"
output_name = "evaluation_results_bpi.csv"
in_path, out_path = (os.path.join(folder_path, file_name) for file_name in (input_name, output_name))

K = 64  # how many data rows (counted after the header) are dropped

with open(in_path, "r", newline="") as infile, open(out_path, "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # The header line is always carried over unchanged.
    header = next(reader, None)
    writer.writerow(header)

    # Rows with an index below K are skipped; everything after is copied.
    for row_index, row in enumerate(reader):
        if row_index < K:
            continue
        print("Writing row:", row)
        writer.writerow(row)



Writing row: ['bpi_2012_enriched_special_filtered_A', 'baseline', '', 'binary_flag', '', '8', '0', '0.6287639682675084', '0.5445995597274994', '1.0', '1.0']
Writing row: ['bpi_2012_enriched_special_filtered_A', 'soft', '', 'binary_flag', '', '8', '0', '0.6285933634735136', '0.5445184859041607', '0.9996587904120106', '0.9996590345983271', '15', '6', '0.0008562450573939325', '3.313486308965282']
Writing row: ['bpi_2012_enriched_special_filtered_A', 'distilled', '', 'binary_flag', '', '8', '0', '0.6287639682675084', '0.5445985870293423', '0.9999146976030027', '0.9999146999561759', '11', '4', '5.088452034061167e-05', '2.7787255821888595']
Writing row: ['bpi_2012_enriched_special_filtered_A', 'baseline', '', 'binary_flag', 'BoW', '8', '0', '0.6287639682675084', '0.5445995597274994', '1.0', '1.0']
Writing row: ['bpi_2012_enriched_special_filtered_A', 'soft', '', 'binary_flag', 'BoW', '8', '0', '0.6115328840740425', '0.5256469325526913', '0.9676703915380023', '0.9542277282793291', '13', '5', 

In [6]:
import pandas as pd

# Location of the trimmed evaluation results
csv_folder = "./data/distillation"
csv_file = "evaluation_results_bpi.csv"
csv_path = os.path.join(csv_folder, csv_file)

# Load all result rows
df = pd.read_csv(csv_path)

# Columns summarised per group
metric_cols = ["accuracy", "f1_score", "con_accuracy", "con_f1_score"]
complexity_cols = ["num_nodes", "max_depth", "avg_path_length"]

# Rows without text models get the label "None" so they form a group of their own
df["model_names"] = df["model_names"].replace("", "None").fillna("None")
grouped = df.groupby(["folder_name", "description", "model_names"])


def _print_group_stats(columns, skip_baseline):
    """Print mean ± std (3 decimals) of `columns` for every result group."""
    for (folder, desc, model), group_df in grouped:
        if skip_baseline and desc == "baseline":
            continue
        print(f"\n=== Group: folder={folder}, description={desc}, model={model} ===")
        for col in columns:
            values = group_df[col]
            print(f"{col}: {values.mean():.3f} ± {values.std():.3f}")


print("=== Performance Results ===")
_print_group_stats(metric_cols, skip_baseline=False)

# Baseline rows are left out of the complexity summary.
print("\n=== Complexity Results ===")
_print_group_stats(complexity_cols, skip_baseline=True)


=== Performance Results ===

=== Group: folder=bpi_2012_enriched_special_filtered_A, description=baseline, model=BoNG ===
accuracy: 0.656 ± 0.047
f1_score: 0.597 ± 0.091
con_accuracy: 1.000 ± 0.000
con_f1_score: 1.000 ± 0.000

=== Group: folder=bpi_2012_enriched_special_filtered_A, description=baseline, model=BoW ===
accuracy: 0.656 ± 0.047
f1_score: 0.597 ± 0.091
con_accuracy: 1.000 ± 0.000
con_f1_score: 1.000 ± 0.000

=== Group: folder=bpi_2012_enriched_special_filtered_A, description=baseline, model=BoW,BoNG,LDA ===
accuracy: 0.656 ± 0.047
f1_score: 0.597 ± 0.091
con_accuracy: 1.000 ± 0.000
con_f1_score: 1.000 ± 0.000

=== Group: folder=bpi_2012_enriched_special_filtered_A, description=baseline, model=LDA ===
accuracy: 0.656 ± 0.047
f1_score: 0.597 ± 0.091
con_accuracy: 1.000 ± 0.000
con_f1_score: 1.000 ± 0.000

=== Group: folder=bpi_2012_enriched_special_filtered_A, description=baseline, model=None ===
accuracy: 0.656 ± 0.047
f1_score: 0.597 ± 0.091
con_accuracy: 1.000 ± 0.000
con_

## Plot Average Path Length

In [None]:
from distillation import (
    get_distillation_paths,
    get_evaluation_paths,
    get_feature_datasets,
    concatenate_text_feature_datasets,
)
# Collect the decision paths of the stored student trees on the test features,
# for every pairing of a "soft" and a "hard" feature configuration and every run.
# NOTE(review): this cell relies on names from earlier cells (the text encoder
# classes, `pickle`, `log`) and may continue beyond the part reviewed here.
text_models = [
    BoWTextEncoder(encoding_length=50, language="english"),
    BoNGTextEncoder(n=2, encoding_length=50, language="english"),
    LDATextEncoder(encoding_length=10, language="english"),
]
model_names_soft = [[], ["BoW"], ["BoNG"], ["LDA"], ["BoW", "BoNG", "LDA"]]
model_names_hard = [[], ["BoW"], ["BoNG"], ["LDA"], ["BoW", "BoNG", "LDA"]]
folder_names = ["bpi_2012_enriched_filtered_A", "werk"]
runs = 3

for folder_name in folder_names:
    print("Evaluating folder:", folder_name)
    if folder_name == "werk":
        path = "./data/werk.xes"
        data_attributes = ["age", "gender"]
        text_attribute = "question"
        k = 10
    elif "bpi" in folder_name:
        path = f"./data/{folder_name}.xes"
        data_attributes = []
        text_attribute = "binary_flag" if "special" in folder_name else "text"
        k = 8

    # NOTE(review): `path` is set above but no log is loaded in this cell, so
    # `log` is whatever an earlier cell left behind and may belong to a
    # different folder. Train/test logs are passed as None; presumably
    # get_feature_datasets then returns previously cached features -- confirm.
    train_dataset, test_dataset = get_feature_datasets(
        folder_name,
        text_models,
        log,
        None,
        None,
        k=k,
        data_attributes=data_attributes,
        text_attribute=text_attribute,
    )
    
    for soft_model_names_subset in model_names_soft:
        for hard_model_names_subset in model_names_hard:
            # decision paths of all runs for this soft/hard pairing
            all_soft_decisions = []
            all_hard_decisions = []
            for run_id in range(runs):
                print(f"Using text models: soft - {soft_model_names_subset}, hard - {hard_model_names_subset}")
                # NOTE(review): both feature matrices are independent of
                # run_id and could be built once outside this loop.
                X_test_soft, _ = concatenate_text_feature_datasets(test_dataset, soft_model_names_subset)
                X_test_hard, _ = concatenate_text_feature_datasets(test_dataset, hard_model_names_subset)

                # NOTE(review): the evaluation cell stores the versions "soft"
                # and "distilled"; verify that a "hard" model file exists.
                hard_model_path, _, _, _ = get_evaluation_paths(
                    folder_name,
                    "hard",
                    model_names_subset=hard_model_names_subset,
                    data_attributes=data_attributes,
                    text_attribute=text_attribute,
                    run_id=run_id,
                )

                soft_model_path, _, _, _ = get_evaluation_paths(
                    folder_name,
                    "soft",
                    model_names_subset=soft_model_names_subset,
                    data_attributes=data_attributes,
                    text_attribute=text_attribute,
                    run_id=run_id,
                )

                # pickle.load must only be used on files written by this
                # project; unpickling untrusted data can execute arbitrary code.
                with open(soft_model_path, "rb") as f:
                    soft_model = pickle.load(f)
                with open(hard_model_path, "rb") as f:
                    hard_model = pickle.load(f)

                # decision_path is called on the test features of each model's
                # own feature configuration; results are collected per run
                soft_decisions = soft_model.decision_path(X_test_soft)
                all_soft_decisions.append(soft_decisions)
                hard_decisions = hard_model.decision_path(X_test_hard)
                all_hard_decisions.append(hard_decisions)