# Demo of Classification

This notebook demonstrates the code necessary to train different classifiers and use them to predict whether new MS/MS spectra are of "interest" or "other". 

## Setup

In [1]:
from AnnoMe.Classification import (
    generate_ms2deepscore_embeddings,
    add_mzmine_metainfos,
    add_sirius_fingerprints,
    add_sirius_canopus,
    add_sirius_predictions,
    add_mzmine_quant,
    remove_invalid_CEs,
    show_dataset_overview,
    generate_embedding_plots,
    train_and_classify,
    generate_prediction_overview,
    generate_ml_metrics_overview,
    set_random_seeds,
)

import pandas as pd

from collections import OrderedDict
import os

# Fix the seed once, up front, so repeated runs of the notebook are comparable.
# NOTE(review): presumably seeds every RNG used by AnnoMe — confirm which libraries are covered.
random_seed = 42
set_random_seeds(random_seed)

## Parameters

In [2]:
# fmt: off
# parameters

# Main folder for input and output files.
# Deliberately written WITHOUT a trailing slash: every path derived from it is
# built as f"{base_folder}/...", so a trailing slash here produces paths that
# contain "//" (e.g. "../../..//models/ms2deepscore_model.pt").
base_folder = "../../.."

# Path to the MS2DeepScore model file
model_file_name = f"{base_folder}/models/ms2deepscore_model.pt"

# Output directory.
# The trailing slash is kept on purpose — downstream code may append file names
# to this string directly (TODO confirm before removing it).
output_dir = f"{base_folder}/output_PrenylatedCompounds_BOKUDBs/"

# Main datasets to process for classification.
# Keyed by a unique dataset id; entries are added one by one so a dataset can be
# switched on/off by (un)commenting its assignment. Insertion order is preserved.
# Keys used by the entries below:
#   name                 - human-readable dataset name (may be shared by related datasets)
#   type                 - "train - relevant", "train - other" or "inference"
#   file                 - path to the .mgf file with the MS/MS spectra
#   fragmentation_method - presumably the spectrum metadata field that holds the
#                          fragmentation method — TODO confirm against AnnoMe
#   colour               - hex colour; collected into `colors` at the end of this cell
#   optional extras      - canopus_file, fingerprintFile, quant_file, sirius_file,
#                          mzmine_meta_table
datasets = OrderedDict()

## MS/MS of reference prenylated flavones
#datasets["prenylated_flavones_CIDpos_relevant"] = {
#    "name": "prenylated flavonoids",
#    "type": "train - relevant",
#    "file": f"{base_folder}/../results/CID_pos__sirius.mgf",
#    "fragmentation_method": "fragmentation_method",
#    "colour": "#D41F11",
#    "canopus_file": f"{base_folder}/../results/CID_pos__sirius/canopus_formula_summary.tsv",
#}
#datasets["prenylated_flavones_CIDneg_relevant"] = {
#    "name": "prenylated flavonoids",
#    "type": "train - relevant",
#    "file": f"{base_folder}/../results/CID_neg__sirius.mgf",
#    "fragmentation_method": "fragmentation_method",
#    "colour": "#F37A00",
#    "canopus_file": f"{base_folder}/../results/CID_neg__sirius/canopus_formula_summary.tsv",
#}
datasets["prenylated_flavones_HCDpos_relevant"] = {
    "name": "prenylated flavonoids",
    "type": "train - relevant",
    "file": f"{base_folder}/../results/HCD_pos__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#10ADC2",
    "canopus_file": f"{base_folder}/../results/HCD_pos__sirius/canopus_formula_summary.tsv",
}
datasets["prenylated_flavones_HCDneg_relevant"] = {
    "name": "prenylated flavonoids",
    "type": "train - relevant",
    "file": f"{base_folder}/../results/HCD_neg__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#017192",
    "canopus_file": f"{base_folder}/../results/HCD_neg__sirius/canopus_formula_summary.tsv",
}

## MS/MS of wheat samples
datasets["Wheat_HCDpos_other"] = {
    "name": "wheat metabolites",
    "type": "train - other",
    "file": f"{base_folder}/../results/Wheat_pos__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#F37A00",
}
datasets["Wheat_HCDneg_other"] = {
    "name": "wheat metabolites",
    "type": "train - other",
    "file": f"{base_folder}/../results/Wheat_neg__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#F37A00",
}

## MS/MS of BOKU MassBank
datasets["MBBOKU_relevant"] = {
    "name": "MassBank BOKU relevant",
    "type": "train - relevant",
    "file": f"{base_folder}/data/derived/BOKU_iBAM_MB___StructureOfInterest__MatchingSmiles.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#FF0000",
    "fingerprintFile": "::SIRIUS",
}
datasets["MBBOKU_other"] = {
    "name": "MassBank BOKU other",
    "type": "train - other",
    "file": f"{base_folder}/data/derived/BOKU_iBAM_MB___StructureOfInterest__NonMatchingSmiles.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#8B8989",
    "fingerprintFile": "::SIRIUS",
}

## MS/MS for inference
#datasets["Samp_PaulowinaTomentosa_neg_inference"] = {
#    "name": "Samp_PaulowinaTomentosa_neg",
#    "type": "inference",
#    "file": f"{base_folder}/../results/Samp1_neg__sirius.mgf",
#    "fragmentation_method": "fragmentation_method",
#    "colour": "#80BF02",
#    "fingerprintFile": f"{base_folder}/../results/Samp1_neg__sirius_fingerprints.json",
#    "canopus_file": f"{base_folder}/../results/Samp1_neg__sirius/canopus_formula_summary.tsv",
#    "quant_file": f"{base_folder}/../results/Samp1_neg__full_feature_table.csv",
#    "sirius_file": f"{base_folder}/../results/Samp1_neg__sirius/structure_identifications_top-15.tsv",
#    "mzmine_meta_table": f"{base_folder}/../results/Samp1_neg__full_feature_table.csv",
#}
datasets["PT22CH_pos_inference"] = {
    "name": "PT22CH_pos",
    "type": "inference",
    "file": f"{base_folder}/../results/PT22CH_pos__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#80BF02",
}
datasets["PT22CH_neg_inference"] = {
    "name": "PT22CH_neg",
    "type": "inference",
    "file": f"{base_folder}/../results/PT22CH_neg__sirius.mgf",
    "fragmentation_method": "fragmentation_method",
    "colour": "#80BF02",
}

# Metadata to add to the output from the MS/MS spectra.
# Each key is paired with a list of spectrum field names.
# NOTE(review): presumably the listed fields are alternative spellings that are
# tried in order until one is present — confirm in AnnoMe.
data_to_add = OrderedDict(
    {
        "name": ["feature_id", "name", "title", "compound_name"],
        "formula": ["formula"],
        "smiles": ["smiles"],
        "adduct": ["adduct", "precursor_type"],
        "ionMode": ["ionmode"],
        "RTINSECONDS": ["rtinseconds", "retention_time"],
        "precursor_mz": ["pepmass", "precursor_mz"],
        "fragmentation_method": ["fragmentation_method", "fragmentation_mode"],
        "CE": ["collision_energy"],
    }
)

def _hcd_step45_subset(ion_mode):
    """Return a predicate for one spectrum record.

    The predicate is true when the record's "fragmentation_method" is "hcd", its
    "ionMode" equals `ion_mode`, and its "CE" is either "45.0" or
    "stepped20,45,70ev(absolute)". The record only needs to support lookup by
    these three keys. All three conditions are combined with `&`, so each one is
    always evaluated.
    """
    def _matches(x):
        return (x["fragmentation_method"] == "hcd") & (x["ionMode"] == ion_mode) & (x["CE"] in ["45.0", "stepped20,45,70ev(absolute)"])

    return _matches


# Subsets of the training data, as {subset name: predicate}; passed to
# train_and_classify via its `subsets` argument. Commented-out entries are
# alternative subsets that can be re-enabled by uncommenting them.
training_subsets = {
    ## "all"           : lambda x: True,
    ## "CE30"          : lambda x: x["CE"] == "30.0",
    ## "CE50"          : lambda x: x["CE"] == "50.0",
    ## "CE70"          : lambda x: x["CE"] == "70.0",
    ## "pos"           : lambda x: x["ionMode"] == "positive",
    ## "neg"           : lambda x: x["ionMode"] == "negative",
    ## "cid"           : lambda x: x["fragmentation_method"] == "cid",
    ## "hcd"           : lambda x: x["fragmentation_method"] == "hcd",
    ## "pos_CE30"      : lambda x: (x["ionMode"] == "positive") & (x["CE"] == "30.0"),
    ## "pos_CE50"      : lambda x: (x["ionMode"] == "positive") & (x["CE"] == "50.0"),
    ## "pos_CE70"      : lambda x: (x["ionMode"] == "positive") & (x["CE"] == "70.0"),
    ## "neg_CE30"      : lambda x: (x["ionMode"] == "negative") & (x["CE"] == "30.0"),
    ## "neg_CE50"      : lambda x: (x["ionMode"] == "negative") & (x["CE"] == "50.0"),
    ## "neg_CE70"      : lambda x: (x["ionMode"] == "negative") & (x["CE"] == "70.0"),
    ## "cid_CE30"      : lambda x: (x["fragmentation_method"] == "cid") & (x["CE"] == "30.0"),
    ## "cid_CE50"      : lambda x: (x["fragmentation_method"] == "cid") & (x["CE"] == "50.0"),
    ## "cid_CE70"      : lambda x: (x["fragmentation_method"] == "cid") & (x["CE"] == "70.0"),
    ## "hcd_CE30"      : lambda x: (x["fragmentation_method"] == "hcd") & (x["CE"] == "30.0"),
    ## "hcd_CE50"      : lambda x: (x["fragmentation_method"] == "hcd") & (x["CE"] == "50.0"),
    ## "hcd_CE70"      : lambda x: (x["fragmentation_method"] == "hcd") & (x["CE"] == "70.0"),
    ## "cid_pos"       : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "positive"),
    ## "cid_neg"       : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "negative"),
    ## "hcd_pos"       : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "positive"),
    ## "hcd_neg"       : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative"),
    ## "cid_pos_CE30"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "positive") & (x["CE"] == "30.0"),
    ## "cid_pos_CE50"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "positive") & (x["CE"] == "50.0"),
    ## "cid_pos_CE70"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "positive") & (x["CE"] == "70.0"),
    ## "cid_neg_CE30"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "negative") & (x["CE"] == "30.0"),
    ## "cid_neg_CE50"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "negative") & (x["CE"] == "50.0"),
    ## "cid_neg_CE70"  : lambda x: (x["fragmentation_method"] == "cid") & (x["ionMode"] == "negative") & (x["CE"] == "70.0"),
    ## "hcd_pos_CE30"  : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "positive") & (x["CE"] == "30.0"),
    ## "hcd_pos_CE50"  : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "positive") & (x["CE"] == "50.0"),
    ## "hcd_pos_CE70"  : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "positive") & (x["CE"] == "70.0"),
    "hcd_neg_step45.0" : _hcd_step45_subset("negative"),
    "hcd_pos_step45.0" : _hcd_step45_subset("positive"),
    #"hcd_neg_CE20"     : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative") & (x["CE"] == "20.0"),
    #"hcd_neg_CE30"     : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative") & (x["CE"] == "30.0"),
    #"hcd_neg_CE40"     : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative") & (x["CE"] == "40.0"),
    ## "hcd_neg_CE50"  : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative") & (x["CE"] == "50.0"),
    ## "hcd_neg_CE70"  : lambda x: (x["fragmentation_method"] == "hcd") & (x["ionMode"] == "negative") & (x["CE"] == "70.0"),
}


# derived, do not change: dataset key -> colour, later handed to generate_embedding_plots
colors = {dataset_key: settings["colour"] for dataset_key, settings in datasets.items()}
# fmt: on

## Execute pipeline

In [3]:
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Location of the embedding cache. Defined once and used for both loading and
# saving so the two can never point at different files.
pickle_file = os.path.join(output_dir, "df_embeddings.pkl")

# Threshold handed to both prediction overviews; kept in one place so the
# training and inference reports always use the same value.
# NOTE(review): presumably the minimum number of classifiers that must predict
# "relevant" — confirm against generate_prediction_overview.
min_prediction_threshold = 13

if os.path.exists(pickle_file):
    # Re-use the cached dataframe. The cache is keyed only on the file's
    # existence: delete it after changing `datasets`, `data_to_add` or the model
    # file, otherwise stale embeddings are used. The dataset overview and the
    # embedding plots are only produced when the cache is (re)built.
    # NOTE: unpickling can execute arbitrary code — only load cache files that
    # were written by this notebook.
    df = pd.read_pickle(pickle_file)

else:
    # Import the spectra and process MS2DeepScore embeddings
    df = generate_ms2deepscore_embeddings(model_file_name, datasets, data_to_add)

    # add associated metadata
    df = add_mzmine_metainfos(datasets, df)
    # df = add_sirius_fingerprints(datasets, df)
    df = add_sirius_canopus(datasets, df)
    df = add_sirius_predictions(datasets, df)
    df = add_mzmine_quant(datasets, df)

    # show overview and plot
    show_dataset_overview(df)
    generate_embedding_plots(df, output_dir, colors)

    # export dataframe for re-use
    df.to_pickle(pickle_file)

# train and predict new datasets
df_train, df_validation, df_inference, df_metrics = train_and_classify(df, subsets=training_subsets)
generate_prediction_overview(df, df_train, output_dir, "training", min_prediction_threshold=min_prediction_threshold)
generate_prediction_overview(df, df_inference, output_dir, "inference", min_prediction_threshold=min_prediction_threshold)

# Generate an overview of the machine learning metrics
generate_ml_metrics_overview(df_metrics, output_dir)



Training models
#######################################################
15 different models will be trained
Size of training dataset: [33m12602[0m
Size of validation dataset: [33m0[0m
Size of inference dataset: [33m1188[0m



********************************************************************************
Subset: hcd_neg_step45.0
Number of spectra in subset (train): 317, these are 2.52% of the total spectra.
Overview of trainSubset_y_gt (ground-truth labels):
   - other: 264
   - relevant: 53

--------------------------------------------------------------------------------
Classifier: Nearest Neighbors n=3

   [Fold 1] Score: [33m0.938[0m, Duration: [33m0.00 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (6/32) -> relevant (6): 100.000%[0m
   - [Test subset] [33mConfusion matrix relevant (6/32) -> other (0): 0.000%[0m
   - [Test subset] [33mConfusion matrix other (26/32) -> relevant (2): 7.692%[0m
   - [Test subset] [32mConfusion matrix other (26/32) -> 



   [Fold 1] Score: [33m0.688[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (6/32) -> relevant (0): 0.000%[0m
   - [Test subset] [33mConfusion matrix relevant (6/32) -> other (6): 100.000%[0m
   - [Test subset] [33mConfusion matrix other (26/32) -> relevant (4): 15.385%[0m
   - [Test subset] [32mConfusion matrix other (26/32) -> other (22): 84.615%[0m
   - [Test subset] [32mBalanced accuracy: 0.423[0m
   - [Test subset] [32mPrecision: 0.638[0m
   - [Test subset] [32mF1 Score: 0.662[0m
   - [Test subset] [32mRecall: 0.688[0m
   - [Test subset] [32mROC AUC: 0.423[0m

   [Fold 2] Score: [33m0.719[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (6/32) -> relevant (1): 16.667%[0m
   - [Test subset] [33mConfusion matrix relevant (6/32) -> other (5): 83.333%[0m
   - [Test subset] [33mConfusion matrix other (26/32) -> relevant (4): 15.385%[0m
   - [Test subset] [32mConfusion matrix other (26



   [Fold 3] Score: [33m0.781[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (6/32) -> relevant (2): 33.333%[0m
   - [Test subset] [33mConfusion matrix relevant (6/32) -> other (4): 66.667%[0m
   - [Test subset] [33mConfusion matrix other (26/32) -> relevant (3): 11.538%[0m
   - [Test subset] [32mConfusion matrix other (26/32) -> other (23): 88.462%[0m
   - [Test subset] [32mBalanced accuracy: 0.609[0m
   - [Test subset] [32mPrecision: 0.767[0m
   - [Test subset] [32mF1 Score: 0.773[0m
   - [Test subset] [32mRecall: 0.781[0m
   - [Test subset] [32mROC AUC: 0.609[0m

   [Fold 4] Score: [33m0.844[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/32) -> relevant (1): 20.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/32) -> other (4): 80.000%[0m
   - [Test subset] [33mConfusion matrix other (27/32) -> relevant (1): 3.704%[0m
   - [Test subset] [32mConfusion matrix other (27/



   [Fold 5] Score: [33m0.719[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/32) -> relevant (0): 0.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/32) -> other (5): 100.000%[0m
   - [Test subset] [33mConfusion matrix other (27/32) -> relevant (4): 14.815%[0m
   - [Test subset] [32mConfusion matrix other (27/32) -> other (23): 85.185%[0m
   - [Test subset] [32mBalanced accuracy: 0.426[0m
   - [Test subset] [32mPrecision: 0.693[0m
   - [Test subset] [32mF1 Score: 0.706[0m
   - [Test subset] [32mRecall: 0.719[0m
   - [Test subset] [32mROC AUC: 0.426[0m

   [Fold 6] Score: [33m0.781[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/32) -> relevant (2): 40.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/32) -> other (3): 60.000%[0m
   - [Test subset] [33mConfusion matrix other (27/32) -> relevant (4): 14.815%[0m
   - [Test subset] [32mConfusion matrix other (27



   [Fold 7] Score: [33m0.719[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/32) -> relevant (3): 60.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/32) -> other (2): 40.000%[0m
   - [Test subset] [33mConfusion matrix other (27/32) -> relevant (7): 25.926%[0m
   - [Test subset] [32mConfusion matrix other (27/32) -> other (20): 74.074%[0m
   - [Test subset] [32mBalanced accuracy: 0.670[0m
   - [Test subset] [32mPrecision: 0.814[0m
   - [Test subset] [32mF1 Score: 0.751[0m
   - [Test subset] [32mRecall: 0.719[0m
   - [Test subset] [32mROC AUC: 0.670[0m

   [Fold 8] Score: [33m0.677[0m, Duration: [33m0.13 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/31) -> relevant (1): 20.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/31) -> other (4): 80.000%[0m
   - [Test subset] [33mConfusion matrix other (26/31) -> relevant (6): 23.077%[0m
   - [Test subset] [32mConfusion matrix other (26



   [Fold 9] Score: [33m0.613[0m, Duration: [33m0.14 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/31) -> relevant (2): 40.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/31) -> other (3): 60.000%[0m
   - [Test subset] [33mConfusion matrix other (26/31) -> relevant (9): 34.615%[0m
   - [Test subset] [32mConfusion matrix other (26/31) -> other (17): 65.385%[0m
   - [Test subset] [32mBalanced accuracy: 0.527[0m
   - [Test subset] [32mPrecision: 0.742[0m
   - [Test subset] [32mF1 Score: 0.660[0m
   - [Test subset] [32mRecall: 0.613[0m
   - [Test subset] [32mROC AUC: 0.527[0m

   [Fold 10] Score: [33m0.871[0m, Duration: [33m0.14 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (5/31) -> relevant (2): 40.000%[0m
   - [Test subset] [33mConfusion matrix relevant (5/31) -> other (3): 60.000%[0m
   - [Test subset] [33mConfusion matrix other (26/31) -> relevant (1): 3.846%[0m
   - [Test subset] [32mConfusion matrix other (26



   [Fold 2] Score: [33m0.759[0m, Duration: [33m0.19 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (6): 50.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (6): 50.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (8): 17.391%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (38): 82.609%[0m
   - [Test subset] [32mBalanced accuracy: 0.663[0m
   - [Test subset] [32mPrecision: 0.774[0m
   - [Test subset] [32mF1 Score: 0.765[0m
   - [Test subset] [32mRecall: 0.759[0m
   - [Test subset] [32mROC AUC: 0.663[0m
   * [Inference] Number of 'relevant': [33m202[0m
   * [Inference] Number of 'other'   : [33m796[0m

   [Fold 3] Score: [33m0.810[0m, Duration: [33m0.18 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (5): 41.667%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (7): 58.333%[0m
   - [Test subset] [33mCo



   [Fold 4] Score: [33m0.897[0m, Duration: [33m0.17 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (7): 58.333%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (5): 41.667%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (1): 2.174%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (45): 97.826%[0m
   - [Test subset] [32mBalanced accuracy: 0.781[0m
   - [Test subset] [32mPrecision: 0.895[0m
   - [Test subset] [32mF1 Score: 0.888[0m
   - [Test subset] [32mRecall: 0.897[0m
   - [Test subset] [32mROC AUC: 0.781[0m
   * [Inference] Number of 'relevant': [33m172[0m
   * [Inference] Number of 'other'   : [33m826[0m





   [Fold 5] Score: [33m0.707[0m, Duration: [33m0.18 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (3): 25.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (9): 75.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (8): 17.391%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (38): 82.609%[0m
   - [Test subset] [32mBalanced accuracy: 0.538[0m
   - [Test subset] [32mPrecision: 0.698[0m
   - [Test subset] [32mF1 Score: 0.702[0m
   - [Test subset] [32mRecall: 0.707[0m
   - [Test subset] [32mROC AUC: 0.538[0m
   * [Inference] Number of 'relevant': [33m214[0m
   * [Inference] Number of 'other'   : [33m784[0m





   [Fold 6] Score: [33m0.776[0m, Duration: [33m0.19 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (3): 25.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (9): 75.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (4): 8.696%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (42): 91.304%[0m
   - [Test subset] [32mBalanced accuracy: 0.582[0m
   - [Test subset] [32mPrecision: 0.742[0m
   - [Test subset] [32mF1 Score: 0.752[0m
   - [Test subset] [32mRecall: 0.776[0m
   - [Test subset] [32mROC AUC: 0.582[0m
   * [Inference] Number of 'relevant': [33m208[0m
   * [Inference] Number of 'other'   : [33m790[0m





   [Fold 7] Score: [33m0.793[0m, Duration: [33m0.19 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (3): 25.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (9): 75.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (3): 6.522%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (43): 93.478%[0m
   - [Test subset] [32mBalanced accuracy: 0.592[0m
   - [Test subset] [32mPrecision: 0.759[0m
   - [Test subset] [32mF1 Score: 0.765[0m
   - [Test subset] [32mRecall: 0.793[0m
   - [Test subset] [32mROC AUC: 0.592[0m
   * [Inference] Number of 'relevant': [33m160[0m
   * [Inference] Number of 'other'   : [33m838[0m





   [Fold 8] Score: [33m0.776[0m, Duration: [33m0.18 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (3): 25.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (9): 75.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (4): 8.696%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (42): 91.304%[0m
   - [Test subset] [32mBalanced accuracy: 0.582[0m
   - [Test subset] [32mPrecision: 0.742[0m
   - [Test subset] [32mF1 Score: 0.752[0m
   - [Test subset] [32mRecall: 0.776[0m
   - [Test subset] [32mROC AUC: 0.582[0m
   * [Inference] Number of 'relevant': [33m171[0m
   * [Inference] Number of 'other'   : [33m827[0m





   [Fold 9] Score: [33m0.776[0m, Duration: [33m0.19 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (12/58) -> relevant (6): 50.000%[0m
   - [Test subset] [33mConfusion matrix relevant (12/58) -> other (6): 50.000%[0m
   - [Test subset] [33mConfusion matrix other (46/58) -> relevant (7): 15.217%[0m
   - [Test subset] [32mConfusion matrix other (46/58) -> other (39): 84.783%[0m
   - [Test subset] [32mBalanced accuracy: 0.674[0m
   - [Test subset] [32mPrecision: 0.783[0m
   - [Test subset] [32mF1 Score: 0.779[0m
   - [Test subset] [32mRecall: 0.776[0m
   - [Test subset] [32mROC AUC: 0.674[0m
   * [Inference] Number of 'relevant': [33m222[0m
   * [Inference] Number of 'other'   : [33m776[0m





   [Fold 10] Score: [33m0.724[0m, Duration: [33m0.19 seconds[0m
   - [Test subset] [32mConfusion matrix relevant (11/58) -> relevant (5): 45.455%[0m
   - [Test subset] [33mConfusion matrix relevant (11/58) -> other (6): 54.545%[0m
   - [Test subset] [33mConfusion matrix other (47/58) -> relevant (10): 21.277%[0m
   - [Test subset] [32mConfusion matrix other (47/58) -> other (37): 78.723%[0m
   - [Test subset] [32mBalanced accuracy: 0.621[0m
   - [Test subset] [32mPrecision: 0.760[0m
   - [Test subset] [32mF1 Score: 0.739[0m
   - [Test subset] [32mRecall: 0.724[0m
   - [Test subset] [32mROC AUC: 0.621[0m
   * [Inference] Number of 'relevant': [33m238[0m
   * [Inference] Number of 'other'   : [33m760[0m

Average score: 0.772 ± 0.053 (min: 0.707, max: 0.897)
Average duration: 0.18 seconds
Average Confusion Matrix (Percentages, rows: ground-truth, columns: predictions):
            other  relevant
other     38.7122   61.2878
relevant  12.7799   87.2201
[Classifier



plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/training_relevant_predictions_classificationChart.pdf




Umap plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/training_relevant_predictions_umap.pdf




Feature map plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/training_relevant_predictions_feature_map.pdf




Saved table to ../../..//output_PrenylatedCompounds_BOKUDBs/training_data.xlsx


Generating Prediction Overview
#######################################################
plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/inference_relevant_predictions_classificationChart.pdf
Umap plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/inference_relevant_predictions_umap.pdf
Feature map plot saved as ../../..//output_PrenylatedCompounds_BOKUDBs/inference_relevant_predictions_feature_map.pdf
Saved table to ../../..//output_PrenylatedCompounds_BOKUDBs/inference_data.xlsx


Generating Machine Learning Metrics Overview
#######################################################
Machine learning metrics overview saved to ../../..//output_PrenylatedCompounds_BOKUDBs/ML_metrics_overview.tsv

Machine Learning Metrics and Sets Overview:
['Confusion matrix count: relevant -> relevant'
 'Confusion matrix percent: relevant -> relevant'
 'Confusion matrix count: relevant -> other'
 'Confusion m