# Decision Trees generation on Adult using CFS

This notebook generates the decision trees on the Adult dataset using CFS.


In [1]:
from pathlib import Path
import sys
from numba import njit

# Relative locations added to the module search path. Presumably these are
# where the project-local `utils`, `mlem` and `lime` packages imported by the
# later cells live -- verify against the repository layout.
UTILS_RELATIVE_PATH = "../../../../"
MLEM_RELATIVE_PATH = "../../../../.."
LIME_RELATIVE_PATH = "../../../../../lime/"

# Only append entries that are not already present, so that re-running this
# cell does not keep growing sys.path with duplicates.
for _extra_path in (UTILS_RELATIVE_PATH, MLEM_RELATIVE_PATH, LIME_RELATIVE_PATH):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# Root folder for everything this notebook writes; one sub-folder per
# processed record is created inside it further down.
OUTPUT_FOLDER = Path("experiment_output")
OUTPUT_FOLDER.mkdir(exist_ok=True)

import logging
# logging.disable() is documented as taking a numeric level, so pass the
# logging.DEBUG constant instead of relying on the string 'DEBUG' being
# coerced. Effect: records of severity DEBUG and below are suppressed.
logging.disable(logging.DEBUG)


In [2]:
# Third-party and project imports used by the rest of the notebook.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme to the plots produced afterwards.
sns.set_theme()
import numpy as np
import scipy.spatial.distance as distance
import multiprocessing
import json
# NOTE(review): numpy is already imported a few lines above; this second
# import is redundant.
import numpy as np
# Seed NumPy's global random generator with a fixed value.
np.random.seed(4321)
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from lime.lime_tabular import LimeTabularExplainer # type: ignore
from mlem.utilities import generate_balanced_dataset, save_pickle_bz2, load_pickle_bz2, save_txt

# Loading the Adult data

Load the Adult RandomForest model and the dictionary containing all the useful data.

In [3]:
from utils.dataloading.adult import load_adult_data, load_adult_randomforest # type: ignore

# BB is the model whose predictions are used throughout the notebook;
# BB_DATA is the mapping loaded from 'adult-blackbox-data2.npz'.
BB = load_adult_randomforest()
BB_DATA = load_adult_data('adult-blackbox-data2.npz')

# Compare BB's predictions on the test split with the stored test labels.
bb_test_labels = BB_DATA['y_test']
bb_test_predictions = BB.predict(BB_DATA['X_test'])
print(classification_report(bb_test_labels, bb_test_predictions))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      8157
           1       0.79      0.61      0.69      2690

    accuracy                           0.86     10847
   macro avg       0.84      0.78      0.80     10847
weighted avg       0.86      0.86      0.86     10847



In [4]:
# List every entry available in BB_DATA on a single, space-separated line.
print(" ".join(str(key) for key in BB_DATA.keys()))

X_train y_train X_test y_test X_validation y_validation X_validation_noisy y_validation_noisy X_attack_2_per_quantile y_attack_2_per_quantile X_attack_3_per_quantile y_attack_3_per_quantile categorical_features numerical_features categorical_features_mask centroids X_distance_separated y_distance_separated


In [5]:
def extract_zlist_points(zlist):
    """
    Stack the points of every element of a Z_list into one array.

    Each element of `zlist` is converted to a NumPy array and the resulting
    arrays are concatenated along the first axis, in their original order.
    """
    point_groups = []
    for group in zlist:
        point_groups.append(np.array(group))
    return np.concatenate(point_groups)

Load the CFS data and create the decision trees (DT).

In [6]:
from mlem.utilities import create_decision_tree

# Every line of "adultRF_CFS.json" is parsed as a stand-alone JSON record; the
# line index `i` names the sub-folder of OUTPUT_FOLDER that receives the
# results for that record.
with open("adultRF_CFS.json", 'r') as file:
    for i, line in enumerate(file):
        path = OUTPUT_FOLDER / f"{i}"
        # "dt-data.npz" is the last artifact written for a record, so its
        # presence marks the record as fully processed. Testing only for the
        # folder (which is created before any work is done) would make a run
        # that crashed or was interrupted midway look finished and be skipped
        # on every later run.
        if (path / "dt-data.npz").exists():
            continue
        path.mkdir(exist_ok=True)
        print(f"Loading and splitting #{i}")
        data = json.loads(line)
        # assert (np.array(data['x']) - BB_DATA['X_distance_separated'][i] <= 1e-4).all()

        # Pool every point stored under 'Z_list' and label it with BB's
        # prediction, so the tree is fitted to BB's outputs.
        dt_data_x = extract_zlist_points(data['Z_list'])
        dt_data_y = BB.predict(dt_data_x)

        # Stratified split (33% held out) with a fixed random_state.
        X_train, X_test, y_train, y_test = train_test_split(dt_data_x, dt_data_y, test_size=0.33, random_state=42, stratify=dt_data_y)

        # NOTE(review): `use_halving=True` presumably selects a halving
        # hyper-parameter search inside the helper -- confirm in mlem.utilities.
        dt = create_decision_tree(X_train, y_train, use_halving=True)
        save_pickle_bz2(path / "dt.bz2", dt)
        save_txt(path / "dt_classification_report.txt", classification_report(y_test, dt.predict(X_test)))
        # Keep this write last: the file doubles as the completion marker
        # checked at the top of the loop.
        np.savez(path / "dt-data.npz", X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Loading and splitting #1
Loading and splitting #2
Loading and splitting #3
Loading and splitting #4
Loading and splitting #5
Loading and splitting #6
Loading and splitting #7
Loading and splitting #8
Loading and splitting #9
Loading and splitting #10
Loading and splitting #11
Loading and splitting #12
Loading and splitting #13
Loading and splitting #14
Loading and splitting #15
Loading and splitting #16
Loading and splitting #17
Loading and splitting #18
Loading and splitting #19
Loading and splitting #20
Loading and splitting #21
Loading and splitting #22
Loading and splitting #23
Loading and splitting #24
Loading and splitting #25
Loading and splitting #26
Loading and splitting #27
Loading and splitting #28
Loading and splitting #29
Loading and splitting #30
Loading and splitting #31
Loading and splitting #32
Loading and splitting #33
Loading and splitting #34
Loading and splitting #35
Loading and splitting #36
Loading and splitting #37
Loading and splitting #38
Loading and splitting