# Decision Trees generation on Diva using CFS

This notebook generates the decision trees on Diva with CFS.


In [1]:
from pathlib import Path
import sys
from numba import njit

# Folders added to the import search path, relative to the notebook's working
# directory (presumably where `utils`, `mlem` and `lime` live -- confirm).
UTILS_RELATIVE_PATH = "../../../../"
MLEM_RELATIVE_PATH = "../../../../.."
LIME_RELATIVE_PATH = "../../../../../lime/"

# Register the three folders in one call; the original append order is kept.
sys.path.extend([UTILS_RELATIVE_PATH, MLEM_RELATIVE_PATH, LIME_RELATIVE_PATH])

# Root folder for everything this notebook writes; created on first run.
OUTPUT_FOLDER = Path("experiment_output")
OUTPUT_FOLDER.mkdir(exist_ok=True)

import logging
# Silence log records of severity DEBUG and below for the whole process.
logging.disable('DEBUG')


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the plots drawn in this notebook.
sns.set_theme()
import numpy as np
import scipy.spatial.distance as distance
import multiprocessing

# Seed NumPy's global random generator so NumPy-based randomness is repeatable across runs.
np.random.seed(4321)
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from lime.lime_tabular import LimeTabularExplainer # type: ignore
from mlem.utilities import generate_balanced_dataset, save_pickle_bz2, load_pickle_bz2, save_txt

# Loading the Diva data

Load the Diva RandomForest and the dictionary with all the useful data.

In [3]:
from utils.dataloading.diva import load_diva_data, load_diva_randomforest # type: ignore

# Black-box model and the dictionary of datasets that goes with it.
BB = load_diva_randomforest()
BB_DATA = load_diva_data('diva-blackbox-data2.npz')

# Score the black box on its test split and print the per-class metrics.
print(classification_report(y_true=BB_DATA['y_test'], y_pred=BB.predict(BB_DATA['X_test'])))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94      2094
           1       0.85      0.77      0.81       745

    accuracy                           0.90      2839
   macro avg       0.89      0.86      0.87      2839
weighted avg       0.90      0.90      0.90      2839



In [4]:
# List the entries available in BB_DATA (print already separates arguments with a space).
print(*BB_DATA.keys())

X_train y_train X_test y_test X_validation y_validation X_validation_noisy y_validation_noisy X_attack_5_per_quantile y_attack_5_per_quantile categorical_features numerical_features categorical_features_mask centroids X_distance_separated y_distance_separated


Loading the CFS data

In [5]:
import json
import numpy as np

# The file holds one JSON document per line. The `with` block guarantees the
# handle is closed once parsing finishes (the bare open() call leaked it).
with open("divaRF_RF_cfs_novembre.json", 'r') as cfs_file:
    # Skip blank lines (e.g. a trailing newline) instead of crashing json.loads.
    cfs_data = [json.loads(line) for line in cfs_file if line.strip()]
print(f"{cfs_data[0].keys()=}")

cfs_data[0].keys()=dict_keys(['x', 'Z_list'])


In [6]:
assert len(BB_DATA['X_distance_separated']) == 100
assert len(cfs_data) == 100
# Check that each CFS record's `x` is the corresponding selected instance.
# The deviation must be compared in absolute value: a signed comparison lets
# any negative difference through, however large.
for i in range(100):
    deviation = np.abs(np.array(cfs_data[i]['x']) - BB_DATA['X_distance_separated'][i])
    assert (deviation <= 1e-4).all(), f"cfs_data[{i}]['x'] does not match X_distance_separated[{i}]"

In [7]:
def extract_zlist_points(zlist):
    """
    Merge every group of points of a Z_list into a single array.

    Each element of ``zlist`` is converted to a NumPy array and the arrays
    are joined along the first axis, in the order they appear.

    Parameters
    ----------
    zlist : sequence of array-like
        Groups of points; they must be compatible with ``np.concatenate``.

    Returns
    -------
    numpy.ndarray
        All the points of all the groups, one after the other.

    Raises
    ------
    ValueError
        If ``zlist`` is empty (``np.concatenate`` needs at least one array).
    """
    return np.concatenate(list(map(np.array, zlist)))

# Creating the Decision Trees

Creating the decision trees <span style="color:red"> if they don't already exist </span>.

In [10]:
from mlem.utilities import create_decision_tree


# Build one decision tree per CFS record, writing everything under
# OUTPUT_FOLDER/<index>/.
for i, cfs in enumerate(cfs_data):
    FOLDER_PATH = OUTPUT_FOLDER / f"{i}"
    # "dt.bz2" doubles as the "this instance is complete" marker: when it
    # exists the instance is skipped, so it must be the LAST file written.
    if not (FOLDER_PATH / "dt.bz2").exists():
        FOLDER_PATH.mkdir(exist_ok=True)
        # The tree is trained on the points of the Z_list, labelled by the
        # black box, so it learns to reproduce the black box's predictions.
        dt_data_x = extract_zlist_points(cfs['Z_list'])
        dt_data_y = BB.predict(dt_data_x)
        # Fixed random_state keeps the stratified split identical across runs.
        X_train, X_test, y_train, y_test = train_test_split(dt_data_x, dt_data_y, test_size=0.33, random_state=42, stratify=dt_data_y)
        dt = create_decision_tree(X_train, y_train, use_halving=True)
        save_txt(FOLDER_PATH / "dt_classification_report.txt", classification_report(y_test, dt.predict(X_test)))
        np.savez(FOLDER_PATH / "dt-data.npz", X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
        # Written last: if the run is interrupted before this line, the next
        # run regenerates the whole folder instead of leaving it without its
        # report or data.
        save_pickle_bz2(FOLDER_PATH / "dt.bz2", dt)