# Claim 2: HS is better than other regularization methods for tree-based models (TBM)

In [1]:
# Move one directory up (the recorded output shows the kernel ending in .../MLDS/notebooks).
# Presumably needed so that the `utils.*` imports and the relative `results/` paths resolve — confirm.
# NOTE: not idempotent — re-running this cell without restarting the kernel moves up one more level.
%cd ..

/home/bro/Documents/FRI/MLDS/repro/MLDS/notebooks


In [2]:
%load_ext autoreload
%autoreload 2

###IMPORTS

# system path manipulations
import os
import sys

# standard data science toolbox
import numpy as np
import pandas as pd

# train test splitting
from sklearn.model_selection import train_test_split

# standard DT and RF
from sklearn.tree import export_text, DecisionTreeClassifier, DecisionTreeRegressor

# sklearn baseline random forest 
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# authors implementations of HS
import imodels

# this was used to get the datasets
from imodels.util.data_util import get_clean_dataset 

# making deep copies of trees for improvement comparison
from copy import deepcopy

# cross-validation of models
from sklearn.model_selection import StratifiedKFold, KFold

# scoring
from sklearn.metrics import roc_auc_score, r2_score, make_scorer

# hyperparameter search
from sklearn.model_selection import GridSearchCV, cross_val_score

# timing algorithm execution
import time

# import datasets
from utils.experiment_functions import get_datasets

# count number of leaves
from utils.experiment_functions import leaf_count

# calculate best alpha
from utils.experiment_functions import pick_alpha, pick_alpha_best

# hyperparameter tunnning
from skopt import gp_minimize

# hierarchical shrinkage
from imodels import HSTreeClassifier, HSTreeClassifierCV 

# bayesian-additive regression models (BART)
from bartpy.sklearnmodel import SklearnModel

###CONSTANTS

# Datasets used in the experiments (as listed in the paper authors' repo: config/shrinkage/models.py)
CLASSIFICATION_DATASET_NAMES = [
    "heart",
    "breast-cancer",
    "haberman",
    "ionosphere",
    "diabetes",
    "german-credit",
    "juvenile",
    "recidivism",
]
REGRESSION_DATASET_NAMES = [
    "red-wine",
    "california-housing",
]

# leaf budgets evaluated for the decision trees
num_of_leaves = [
    2, 4, 8, 12, 15,
    20, 24, 28, 30, 32,
]

# candidate regularization strengths for hierarchical shrinkage (HS)
reg_hs = [
    0.1, 1.0, 10.0,
    25.0, 50.0, 100.0,
]

In [3]:
# Load the benchmark datasets: classification tasks first, then regression tasks.
tasks_classification, tasks_regression = (
    get_datasets(dataset_names)
    for dataset_names in (CLASSIFICATION_DATASET_NAMES, REGRESSION_DATASET_NAMES)
)

# A) Classification

In [None]:
# Number of repetitions per dataset. Each repetition draws a fresh shuffled split and
# produces one result row per (dataset, max_leaves, algorithm) combination.
NUM_OF_BOOTSTRAP_SAMPS = 30

def dt_regularization_comparison(dataset_names, task_type, save_to):
    """Compare a CCP-pruned CART tree with the same tree after hierarchical shrinkage (HS).

    For every dataset, ``NUM_OF_BOOTSTRAP_SAMPS`` shuffled stratified 3-fold splits are
    drawn, and only the first fold of each split is evaluated (the fold loop ends with
    ``break``). For every leaf budget in the module-level ``num_of_leaves``:

    1. ``pick_alpha`` selects ``ccp_alpha`` and a ``DecisionTreeClassifier`` is fitted
       ("CCP" rows).
    2. The HS strength is chosen from the module-level ``reg_hs`` by an inner stratified
       3-fold CV that uses the training split only, and is applied to a deep copy of the
       CCP tree ("HS (CART-CCP)" rows).

    Train/test AUC plus wall-clock and CPU timings are recorded for both models.

    Parameters
    ----------
    dataset_names : iterable of str
        Names passed to ``get_datasets``; every loaded frame must have a ``"label"`` column.
    task_type : str
        Currently unused. The function always fits classifiers, scores with AUC and
        writes ``"classification"`` into the ``task`` column.
    save_to : str
        CSV path. The file is rewritten after every leaf budget so that partial results
        survive an interrupted run.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, repetition, leaf budget, algorithm).
    """
    columns = [
        "task", "dataset", "boot_iter", "algorithm", "scoring", "n_leaves", "max_leaves",
        "regularization", "train_score", "test_score",
        "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time",
        "tunning_wall_time", "tunning_cpu_time",
    ]

    def now():
        """Return the current (wall-clock, CPU) timestamps."""
        return time.time(), time.process_time()

    def since(start):
        """Return (wall, cpu) seconds elapsed since ``start`` (a pair returned by ``now``)."""
        wall, cpu = now()
        return wall - start[0], cpu - start[1]

    def make_row(dataset, boot_iter, algorithm, n_leaves, max_leaves, regularization,
                 train_score, test_score, tunning, train, test):
        """Assemble one result record; ``tunning``/``train``/``test`` are (wall, cpu) pairs."""
        return {
            "task": "classification",
            "dataset": dataset,
            "boot_iter": boot_iter,
            "algorithm": algorithm,
            "scoring": "AUC",
            "n_leaves": n_leaves,
            "max_leaves": max_leaves,
            "regularization": regularization,
            "train_score": train_score,
            "test_score": test_score,
            "train_wall_time": train[0],
            "test_wall_time": test[0],
            "train_cpu_time": train[1],
            "test_cpu_time": test[1],
            "tunning_wall_time": tunning[0],
            "tunning_cpu_time": tunning[1],
        }

    tasks = get_datasets(dataset_names)

    # Records are collected in a list and turned into a frame at each checkpoint;
    # this avoids repeatedly concatenating onto an initially empty DataFrame.
    rows = []
    results = pd.DataFrame(columns=columns)

    for task in dataset_names:
        for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
            skf = StratifiedKFold(n_splits=3, shuffle=True)
            X, y = np.array(tasks[task].drop("label", axis=1)), np.array(tasks[task]["label"])
            for i, (train_index, test_index) in enumerate(skf.split(tasks[task], tasks[task]["label"])):
                print(f"Dataset: {task}, Sample: {samp}, Fold {i}", end="\r")

                X_train, y_train = X[train_index, :], y[train_index]
                X_test, y_test = X[test_index, :], y[test_index]

                for m in num_of_leaves:

                    ### CART with CCP ###

                    # tuning: choose ccp_alpha for the current leaf budget
                    start = now()
                    best_alpha = pick_alpha(X_train, y_train, m, DecisionTreeClassifier)
                    tunning_time = since(start)

                    # training
                    start = now()
                    mccp = DecisionTreeClassifier(ccp_alpha=best_alpha).fit(X_train, y_train)
                    train_time = since(start)

                    # prediction (train and test together, as one timed step)
                    start = now()
                    y_train_pred_ccp = mccp.predict_proba(X_train)[:, 1]
                    y_test_pred_ccp = mccp.predict_proba(X_test)[:, 1]
                    test_time = since(start)

                    rows.append(make_row(
                        task, samp, "CCP", leaf_count(mccp), m, best_alpha,
                        roc_auc_score(y_train, y_train_pred_ccp),
                        roc_auc_score(y_test, y_test_pred_ccp),
                        tunning_time, train_time, test_time,
                    ))

                    ### Hierarchical shrinkage (CCP) ###

                    # tuning: inner CV over reg_hs, restricted to the training split
                    start = now()
                    cv_scores = {}
                    for reg_param in reg_hs:
                        hs_skf = StratifiedKFold(n_splits=3, shuffle=True)
                        fold_scores = []
                        for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):
                            # The indices are positions within X_train / y_train. Indexing the
                            # full X / y here would select unrelated rows and leak test samples
                            # into hyperparameter selection.
                            X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]
                            X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]
                            hs_cv_ccp = DecisionTreeClassifier(
                                max_leaf_nodes=m,
                                ccp_alpha=pick_alpha(X_cv_train, y_cv_train, m, DecisionTreeClassifier),
                            )
                            hs_cv_ccp.fit(X_cv_train, y_cv_train)
                            # NOTE(review): relies on HSTreeClassifier shrinking an already
                            # fitted estimator at construction time — confirm for the installed
                            # imodels version.
                            hs_cv_ccp = imodels.HSTreeClassifier(hs_cv_ccp, reg_param=reg_param)
                            y_val_pred = hs_cv_ccp.predict_proba(X_cv_val)[:, 1]
                            fold_scores.append(roc_auc_score(y_cv_val, y_val_pred))
                        cv_scores[reg_param] = np.mean(fold_scores)
                    # first candidate (in reg_hs order) with the highest mean validation AUC
                    hs_reg_param = max(cv_scores, key=cv_scores.get)
                    tunning_time = since(start)

                    # "training": apply HS to a copy so the CCP tree itself stays untouched
                    start = now()
                    mshrunk = imodels.HSTreeClassifier(deepcopy(mccp), reg_param=hs_reg_param)
                    train_time = since(start)

                    # prediction
                    start = now()
                    y_train_pred_shrunk = mshrunk.predict_proba(X_train)[:, 1]
                    y_test_pred_shrunk = mshrunk.predict_proba(X_test)[:, 1]
                    test_time = since(start)

                    rows.append(make_row(
                        task, samp, "HS (CART-CCP)", leaf_count(mshrunk.estimator_), m, hs_reg_param,
                        roc_auc_score(y_train, y_train_pred_shrunk),
                        roc_auc_score(y_test, y_test_pred_shrunk),
                        tunning_time, train_time, test_time,
                    ))

                    # checkpoint after every leaf budget
                    results = pd.DataFrame(rows, columns=columns)
                    results.to_csv(save_to, index=False)

                # only the first fold of each shuffled split is evaluated
                break

    return results
                
# Run the CCP vs. HS comparison on all classification datasets. The returned frame is
# also written (and repeatedly overwritten during the run) to the CSV path passed last.
dt_classification = dt_regularization_comparison(CLASSIFICATION_DATASET_NAMES, "classification", "results/claim_1_1_ccp_comparison_classification.csv")

# B) Regression

In [None]:
# leaf budgets used in the paper
num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]
# candidate regularization strengths for hierarchical shrinkage (HS)
reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]

# The HS strength is chosen by an inner 3-fold CV that must only see the training split;
# the outer test split is used exclusively for the reported test score.
NUM_OF_BOOTSTRAP_SAMPS = 10

regression_columns = [
    "task", "dataset", "boot_iter", "algorithm", "scoring", "n_leaves", "max_leaves",
    "regularization", "train_score", "test_score",
    "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time",
    "tunning_wall_time", "tunning_cpu_time",
]


def _stamp():
    """Return the current (wall-clock, CPU) timestamps."""
    return time.time(), time.process_time()


def _since(start):
    """Return (wall, cpu) seconds elapsed since ``start`` (a pair returned by ``_stamp``)."""
    wall, cpu = _stamp()
    return wall - start[0], cpu - start[1]


def _regression_row(dataset, boot_iter, algorithm, n_leaves, max_leaves, regularization,
                    train_score, test_score, tunning, train, test):
    """Assemble one result record; ``tunning``/``train``/``test`` are (wall, cpu) pairs."""
    return {
        "task": "regression",
        "dataset": dataset,
        "boot_iter": boot_iter,
        "algorithm": algorithm,
        "scoring": "R2",
        "n_leaves": n_leaves,
        "max_leaves": max_leaves,
        "regularization": regularization,
        "train_score": train_score,
        "test_score": test_score,
        "train_wall_time": train[0],
        "test_wall_time": test[0],
        "train_cpu_time": train[1],
        "test_cpu_time": test[1],
        "tunning_wall_time": tunning[0],
        "tunning_cpu_time": tunning[1],
    }


# Records are collected in a list and converted to a frame at each checkpoint; this
# avoids repeatedly concatenating onto an initially empty DataFrame.
regression_rows = []
regression_results = pd.DataFrame(columns=regression_columns)

for task in REGRESSION_DATASET_NAMES:
    # The "music" dataset (two targets: label1 / label2) is skipped, so only the
    # single-target path is needed below.
    if task == "music":
        continue

    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
        skf = KFold(n_splits=3, shuffle=True)

        X, y = np.array(tasks_regression[task].drop("label", axis=1)), np.array(tasks_regression[task]["label"])

        for i, (train_index, test_index) in enumerate(skf.split(tasks_regression[task])):
            print(f"Dataset: {task}, Sample: {samp}, Fold {i}", end="\r")

            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            for m in num_of_leaves:
                ### CART with CCP ###

                # tuning: choose ccp_alpha for the current leaf budget
                start = _stamp()
                best_alpha = pick_alpha(X_train, y_train, m, DecisionTreeRegressor)
                tunning_time = _since(start)

                # training
                start = _stamp()
                mccp = DecisionTreeRegressor(ccp_alpha=best_alpha).fit(X_train, y_train)
                train_time = _since(start)

                # prediction (train and test together, as one timed step)
                start = _stamp()
                y_train_pred_ccp = mccp.predict(X_train)
                y_test_pred_ccp = mccp.predict(X_test)
                test_time = _since(start)

                regression_rows.append(_regression_row(
                    task, samp, "CCP", leaf_count(mccp), m, best_alpha,
                    r2_score(y_train, y_train_pred_ccp),
                    r2_score(y_test, y_test_pred_ccp),
                    tunning_time, train_time, test_time,
                ))

                # TODO: GOSDT ###

                ### Hierarchical shrinkage (CCP) ###

                # tuning: inner CV over reg_hs, restricted to the training split
                start = _stamp()
                cv_scores = {}
                for reg_param in reg_hs:
                    hs_skf = KFold(n_splits=3, shuffle=True)
                    fold_scores = []
                    for cv_train_index, cv_val_index in hs_skf.split(X_train):
                        # The indices are positions within X_train / y_train. Indexing the
                        # full X / y here would select unrelated rows and leak test samples
                        # into hyperparameter selection.
                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]
                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]
                        hs_cv_ccp = DecisionTreeRegressor(
                            max_leaf_nodes=m,
                            ccp_alpha=pick_alpha(X_cv_train, y_cv_train, m, DecisionTreeRegressor),
                        )
                        hs_cv_ccp.fit(X_cv_train, y_cv_train)
                        # NOTE(review): relies on HSTreeRegressor shrinking an already fitted
                        # estimator at construction time — confirm for the installed imodels version.
                        hs_cv_ccp = imodels.HSTreeRegressor(hs_cv_ccp, reg_param=reg_param)
                        y_val_pred = hs_cv_ccp.predict(X_cv_val)
                        fold_scores.append(r2_score(y_cv_val, y_val_pred))
                    cv_scores[reg_param] = np.mean(fold_scores)
                # first candidate (in reg_hs order) with the highest mean validation R2
                hs_reg_param = max(cv_scores, key=cv_scores.get)
                tunning_time = _since(start)

                # "training": apply HS to a copy so the CCP tree itself stays untouched
                start = _stamp()
                mshrunk = imodels.HSTreeRegressor(deepcopy(mccp), reg_param=hs_reg_param)
                train_time = _since(start)

                # prediction
                start = _stamp()
                y_train_pred_shrunk = mshrunk.predict(X_train)
                y_test_pred_shrunk = mshrunk.predict(X_test)
                test_time = _since(start)

                regression_rows.append(_regression_row(
                    task, samp, "HS (CART-CCP)", leaf_count(mshrunk.estimator_), m, hs_reg_param,
                    r2_score(y_train, y_train_pred_shrunk),
                    r2_score(y_test, y_test_pred_shrunk),
                    tunning_time, train_time, test_time,
                ))

                # checkpoint after every leaf budget
                regression_results = pd.DataFrame(regression_rows, columns=regression_columns)
                regression_results.to_csv("results/claim_1_1_ccp_comparison_regression.csv", index=False)

            # only the first fold of each shuffled split is evaluated
            break

Dataset: california-housing, Sample: 0, Fold 0

# Classification for RF

In [None]:
# Datasets used in the paper
# (location in author repo: github.com/Yu-Group/imodels-experiments/config/shrinkage/models.py).
# Each entry is (name used in this notebook, dataset identifier, data source).
DATASETS_CLASSIFICATION = [
    # classification datasets from original random forests paper
    # page 9: https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf
    ("heart", "heart", 'imodels'),
    ("breast-cancer", "breast_cancer", 'imodels'),
    ("haberman", "haberman", 'imodels'),
    ("ionosphere", "ionosphere", 'pmlb'),
    ("diabetes", "diabetes", "pmlb"),
    ("german-credit", "german", "pmlb"),
    ("juvenile", "juvenile_clean", 'imodels'),
    ("recidivism", "compas_two_year_clean", 'imodels'),
]

# Load each dataset through the authors' loader and keep it as a DataFrame whose
# target is stored in a "label" column, keyed by the notebook-level dataset name.
tasks = {}

for task in DATASETS_CLASSIFICATION:
    task_name, dataset_id, source = task
    X, y, feature_names = get_clean_dataset(dataset_id, data_source=source)
    df = pd.DataFrame(X, columns=feature_names).assign(label=y)
    tasks[task_name] = df

In [None]:
# empty frame that accumulates the performance of every evaluated model
classification_results = pd.DataFrame(
    columns=[
        "task", "dataset", "boot_iter", "algorithm", "scoring",
        "n_trees", "regularization", "train_score", "test_score",
        "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time",
        "tunning_wall_time", "tunning_cpu_time",
    ]
)

# forest sizes (number of trees) to evaluate
num_of_trees = [10, 25, 50, 75, 100, 300, 500]

# candidate regularization strengths for hierarchical shrinkage (HS)
reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]

# number of times the evaluation is repeated with fresh random splits
# NOTE(review): an earlier comment described "5 repeats with 3-fold cross-validation = 15 repeats",
# which does not match the value 10 used here — confirm which setting is intended.
NUM_OF_BOOTSTRAP_SAMPS = 10

# for each dataset that was used in paper
for task in DATASETS_CLASSIFICATION:
    # repeat NUM_OF_BOOTSTRAP_SAMPS
    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
        # use statified splitting (we tried both stratified and un-stratified => no significant differences)
        skf = StratifiedKFold(n_splits=3, shuffle=True)
         
        X, y = np.array(tasks[task[0]].drop("label", axis = 1)), np.array(tasks[task[0]]["label"])
        
        # cross-validation loop
        for i, (train_index, test_index) in enumerate(skf.split(tasks[task[0]], tasks[task[0]]["label"])):
            print(f"Dataset: {task[0]}, Sample: {samp}, Fold {i}", end = "\r")

            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            # for each tree (as deduced from fig. 4D)
            for m in num_of_trees:
                
                ### Random Forest (RF) ###
                
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()
                
                rf = RandomForestClassifier(n_estimators=m, max_features = "sqrt").fit(X_train, y_train)
                
                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()
                
                # measure test time
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()
                
                y_train_pred_rf = rf.predict_proba(X_train)[:, 1]
                y_test_pred_rf = rf.predict_proba(X_test)[:, 1]
                
                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                classification_results = pd.concat([classification_results, pd.DataFrame({"task": ["classification"], 
                                                                            "dataset": [task[0]],
                                                                            "boot_iter": [samp],
                                                                            "algorithm": ["RF"],
                                                                            "scoring": ["AUC"],
                                                                            "n_trees": [m],
                                                                            "regularization": ["None"],
                                                                            "train_score": [roc_auc_score(y_train, y_train_pred_rf)],
                                                                            "test_score": [roc_auc_score(y_test, y_test_pred_rf)],
                                                                            "train_wall_time": [end_wall_time_train - start_wall_time_train],
                                                                            "test_wall_time": [end_wall_time_test - start_wall_time_test],
                                                                            "train_cpu_time": [end_cpu_time_train - start_cpu_time_train],
                                                                            "test_cpu_time": [end_cpu_time_test - start_cpu_time_test],
                                                                            "tunning_wall_time": [None], 
                                                                            "tunning_cpu_time": [None]})])
                
                ### RF-CV (max_features (mtry)) ###                
                
                # tunning function to use in gp_minimize
                def rf_mtry(mtry):
                    """Objective for ``gp_minimize``: negated mean 3-fold CV AUC on the training split.

                    ``mtry`` is the one-element parameter list supplied by the optimizer;
                    ``mtry[0]`` is used as ``max_features`` of a forest with ``m`` trees.
                    The score is negated because ``gp_minimize`` minimizes.
                    """
                    forest = RandomForestClassifier(n_estimators=m, max_features=mtry[0])
                    # The built-in "roc_auc" scorer replaces make_scorer(..., needs_proba=True),
                    # whose `needs_proba` argument is deprecated in recent scikit-learn releases.
                    scores = cross_val_score(forest, X_train, y_train, cv=3, scoring="roc_auc")
                    return -np.mean(scores)
                
                # measure tunning time
                start_wall_time_tunning = time.time()
                start_cpu_time_tunning = time.process_time()
                
                mtry_best = gp_minimize(rf_mtry,
                            [(0.1, 1.0)],
                            acq_func="EI",
                            n_calls = 15,
                            n_initial_points = 5,
                            noise = 0.1**2).x[0]
                
                end_wall_time_tunning = time.time()
                end_cpu_time_tunning = time.process_time()
                
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()
                
                rf_mtry = RandomForestClassifier(n_estimators=m, max_features = mtry_best).fit(X_train, y_train)
                
                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()
                
                # measure test time
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()
                
                y_train_pred_rf_mtry = rf_mtry.predict_proba(X_train)[:, 1]
                y_test_pred_rf_mtry = rf_mtry.predict_proba(X_test)[:, 1]
                
                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                # Append the RF-MTRY result row. The "regularization" column holds the tuned
                # max_features value (`mtry_best`), not the fitted estimator bound to `rf_mtry`.
                classification_results = pd.concat([classification_results, pd.DataFrame({
                    "task": ["classification"],
                    "dataset": [task[0]],
                    "boot_iter": [samp],
                    "algorithm": ["RF-MTRY"],
                    "scoring": ["AUC"],
                    "n_trees": [m],
                    "regularization": [mtry_best],
                    "train_score": [roc_auc_score(y_train, y_train_pred_rf_mtry)],
                    "test_score": [roc_auc_score(y_test, y_test_pred_rf_mtry)],
                    "train_wall_time": [end_wall_time_train - start_wall_time_train],
                    "test_wall_time": [end_wall_time_test - start_wall_time_test],
                    "train_cpu_time": [end_cpu_time_train - start_cpu_time_train],
                    "test_cpu_time": [end_cpu_time_test - start_cpu_time_test],
                    "tunning_wall_time": [end_wall_time_tunning - start_wall_time_tunning],
                    "tunning_cpu_time": [end_cpu_time_tunning - start_cpu_time_tunning],
                })])
                
                ### RF-CV (max_depth (depth)) ###
                def rf_depth(depth):
                    """Objective for ``gp_minimize``: negated mean 3-fold CV AUC on the training split.

                    ``depth`` is the one-element parameter list supplied by the optimizer;
                    ``depth[0]`` is rounded to the nearest integer and used as ``max_depth``
                    of a forest with ``m`` trees. The score is negated because
                    ``gp_minimize`` minimizes.
                    """
                    forest = RandomForestClassifier(n_estimators=m, max_depth=int(np.round(depth[0])))
                    # The built-in "roc_auc" scorer replaces make_scorer(..., needs_proba=True),
                    # whose `needs_proba` argument is deprecated in recent scikit-learn releases.
                    scores = cross_val_score(forest, X_train, y_train, cv=3, scoring="roc_auc")
                    return -np.mean(scores)
                
                # measure tunning time
                start_wall_time_tunning = time.time()
                start_cpu_time_tunning = time.process_time()
                
                depth_best = int(np.round(gp_minimize(rf_depth,
                            [(1.0, 30.0)],
                            acq_func="EI",
                            n_calls = 15,
                            n_initial_points = 5,
                            noise = 0.1**2).x[0]))
                
                end_wall_time_tunning = time.time()
                end_cpu_time_tunning = time.process_time()
                
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()
                
                rf_depth = RandomForestClassifier(n_estimators=m, max_depth = depth_best).fit(X_train, y_train)
                
                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()
                
                # measure test time
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()
                
                y_train_pred_rf_depth = rf_depth.predict_proba(X_train)[:, 1]                
                y_test_pred_rf_depth = rf_depth.predict_proba(X_test)[:, 1]
                
                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                classification_results = pd.concat([classification_results, pd.DataFrame({"task": ["classification"], 
                                                                            "dataset": [task[0]],
                                                                            "boot_iter": [samp],
                                                                            "algorithm": ["RF-DEPTH"],
                                                                            "scoring": ["AUC"],
                                                                            "n_trees": [m],
                                                                            "regularization": [rf_depth], # we store best depth parameter in regularization
                                                                            "train_score": [roc_auc_score(y_train, y_train_pred_rf_depth)],
                                                                            "test_score": [roc_auc_score(y_test, y_test_pred_rf_depth)],
                                                                            "train_wall_time": [end_wall_time_train - start_wall_time_train],
                                                                            "test_wall_time": [end_wall_time_test - start_wall_time_test],
                                                                            "train_cpu_time": [end_cpu_time_train - start_cpu_time_train],
                                                                            "test_cpu_time": [end_cpu_time_test - start_cpu_time_test],
                                                                            "tunning_wall_time": [end_wall_time_tunning - start_wall_time_tunning], 
                                                                            "tunning_cpu_time": [end_cpu_time_tunning - start_cpu_time_tunning]})])
                
                ### HS-RF (hierarchical shrinkage) ###
                # Picks the shrinkage parameter with HSTreeClassifierCV (candidates: reg_hs, cv = 3),
                # refits an HSTreeClassifier around a fresh random forest with the chosen value,
                # then records AUC and wall/CPU timings as one row of classification_results.
                # reg_hs is defined outside this excerpt.
                
                # measure tuning time (the interval also covers building the scorer and the forest)
                start_wall_time_tunning = time.time()
                start_cpu_time_tunning = time.process_time()
                
                # NOTE(review): this scorer is built without needs_proba=True, unlike the scorer
                # used by the RF-DEPTH objective; whether that changes the selected value depends
                # on how HSTreeClassifierCV uses `scoring` - confirm against the installed imodels.
                roc_spec = make_scorer(roc_auc_score)
                rf = RandomForestClassifier(n_estimators=m, max_features = "sqrt")
                hs_rf_cv = HSTreeClassifierCV(estimator_=rf, reg_param_list = reg_hs, cv = 3, scoring = roc_spec)
                hs_rf_cv.fit(X_train, y_train)
                
                # value selected by the CV wrapper, read back after fitting
                best_hs_reg = hs_rf_cv.reg_param
                
                end_wall_time_tunning = time.time()
                end_cpu_time_tunning = time.process_time()
                
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()
                
                # a new, unfitted forest is wrapped so the timed fit does not reuse the tuned estimator
                rf = RandomForestClassifier(n_estimators=m, max_features = "sqrt")
                hs_rf = HSTreeClassifier(estimator_= rf, reg_param = best_hs_reg) 
                hs_rf.fit(X_train, y_train)
                
                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()
                
                # measure test time (covers prediction on both the train and the test split)
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()
                
                # column 1 of predict_proba is used as the score for AUC
                y_train_pred_hs_rf = hs_rf.predict_proba(X_train)[:, 1]
                y_test_pred_hs_rf = hs_rf.predict_proba(X_test)[:, 1]
                
                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                # append one result row for this (dataset, repeat, n_trees) combination
                classification_results = pd.concat([classification_results, pd.DataFrame({"task": ["classification"], 
                                                                            "dataset": [task[0]],
                                                                            "boot_iter": [samp],
                                                                            "algorithm": ["HS-RF"],
                                                                            "scoring": ["AUC"],
                                                                            "n_trees": [m],
                                                                            "regularization": [best_hs_reg], # HS regularization parameter (lambda)
                                                                            "train_score": [roc_auc_score(y_train, y_train_pred_hs_rf)],
                                                                            "test_score": [roc_auc_score(y_test, y_test_pred_hs_rf)],
                                                                            "train_wall_time": [end_wall_time_train - start_wall_time_train],
                                                                            "test_wall_time": [end_wall_time_test - start_wall_time_test],
                                                                            "train_cpu_time": [end_cpu_time_train - start_cpu_time_train],
                                                                            "test_cpu_time": [end_cpu_time_test - start_cpu_time_test],
                                                                            "tunning_wall_time": [end_wall_time_tunning - start_wall_time_tunning], 
                                                                            "tunning_cpu_time": [end_cpu_time_tunning - start_cpu_time_tunning]})])

                # checkpoint: rewrite the whole results file after every n_trees setting
                classification_results.to_csv("results/rf_comparison_classification.csv", index = False)

            # presumably leaves the enclosing cross-validation loop after its first fold
            # (the loop header is outside this excerpt) - TODO confirm this is intended
            break

In [None]:
# BART baseline for the classification datasets.
# For every dataset, repeat and ensemble size, a BART model is fitted (no hyperparameter
# tuning), scored with AUC on the train and test split, and one row with the scores and
# wall/CPU timings is appended to classification_results. The results file is rewritten
# after every model so partial results survive an interrupted run.

# dataframe to save performance of models
classification_results = pd.DataFrame(columns = ["task", "dataset", "boot_iter", "algorithm", "scoring", "n_trees", "regularization", "train_score", "test_score",
                                                 "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time", "tunning_wall_time", "tunning_cpu_time"])

# numbers of trees per ensemble to evaluate
num_of_trees = [10, 25, 50, 75, 100, 300, 500]

# number of independent shuffled 3-fold splits per dataset; because of the `break` at the
# end of the fold loop only the first fold of each split is evaluated, so every
# (dataset, n_trees) pair is evaluated NUM_OF_BOOTSTRAP_SAMPS times
NUM_OF_BOOTSTRAP_SAMPS = 10

# for each dataset that was used in paper
for task in DATASETS_CLASSIFICATION:
    # features and labels do not change between repeats, so extract them once per dataset
    X, y = np.array(tasks[task[0]].drop("label", axis = 1)), np.array(tasks[task[0]]["label"])

    # repeat NUM_OF_BOOTSTRAP_SAMPS
    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
        # use stratified splitting (we tried both stratified and un-stratified => no significant differences)
        skf = StratifiedKFold(n_splits=3, shuffle=True)

        # cross-validation loop
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(f"Dataset: {task[0]}, Sample: {samp}, Fold {i}", end = "\r")

            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            # for each tree (as deduced from fig. 4D)
            for m in num_of_trees:

                ### BART (doesn't use tuning - takes too long/performs the best anyway) ###

                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()

                bart = SklearnModel(n_trees = m)
                bart.fit(X_train, y_train)

                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()

                # measure test time (covers prediction on both the train and the test split)
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()

                # keep the raw predictions: AUC is a ranking metric, and rounding them to hard
                # labels would throw away the ordering that the other models' probability
                # scores retain, biasing the comparison
                bart_train_pred = bart.predict(X_train)
                bart_test_pred = bart.predict(X_test)

                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                bart_row = pd.DataFrame({"task": ["classification"],
                                         "dataset": [task[0]],
                                         "boot_iter": [samp],
                                         "algorithm": ["BART"],
                                         "scoring": ["AUC"],
                                         "n_trees": [m],
                                         "regularization": ["None"],
                                         "train_score": [roc_auc_score(y_train, bart_train_pred)],
                                         "test_score": [roc_auc_score(y_test, bart_test_pred)],
                                         "train_wall_time": [end_wall_time_train - start_wall_time_train],
                                         "test_wall_time": [end_wall_time_test - start_wall_time_test],
                                         "train_cpu_time": [end_cpu_time_train - start_cpu_time_train],
                                         "test_cpu_time": [end_cpu_time_test - start_cpu_time_test],
                                         # BART is not tuned, so there is no tuning time to report
                                         "tunning_wall_time": [None],
                                         "tunning_cpu_time": [None]})

                # ignore_index gives every row a unique label instead of a repeated 0
                classification_results = pd.concat([classification_results, bart_row], ignore_index=True)

                # checkpoint after every model; index = False matches the other results files
                # and avoids writing a meaningless index column
                classification_results.to_csv("results/rf_bart_classification.csv", index = False)

            # only the first fold of each shuffled split is evaluated
            break