In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the local `datasets` module) are re-imported before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

import imodels
#from imodels import HSTreeRegressorCV, HSTreeClassifierCV
#from imodels import HSTreeRegressor, HSTreeClassifier

from tqdm import tqdm

from datasets import DATASETS_REGRESSION, DATASETS_CLASSIFICATION

In [None]:
def _held_out_score(model, X_test, y_test, type_):
    """Score a fitted model on the held-out split with the metric for ``type_``.

    Regression uses R2 on ``model.predict``. Classification uses ROC AUC on the
    predicted class probabilities rather than on hard labels.
    """
    if type_ == "regression":
        return r2_score(y_test, model.predict(X_test))

    # ROC AUC is a ranking metric: it needs continuous scores. Feeding it the
    # hard labels from `predict` reduces the ROC curve to a single threshold
    # and hides differences between models that only change leaf probabilities.
    proba = model.predict_proba(X_test)
    if proba.shape[1] == 2:
        # Binary case: score with the probability of the second (greater) class.
        return roc_auc_score(y_test, proba[:, 1])
    # More than two classes: one-vs-rest AUC over the full probability matrix.
    return roc_auc_score(y_test, proba, multi_class="ovr")


def experiment(X, y, max_leaf_nodes, type_="classification", N=10, random_state=None):
    """Compare a plain CART tree with two shrinkage wrappers over repeated splits.

    For each of ``N`` repetitions the data is split 67/33 into train/test, a
    decision tree with at most ``max_leaf_nodes`` leaves is fit on the training
    part, and three models are scored on the test part:

    * the plain tree ("CART"),
    * the tree wrapped in ``imodels.HSTree{Classifier,Regressor}`` with
      ``shrinkage_scheme_="leaf_based"`` ("lbsCART"),
    * the tree wrapped in ``imodels.HSTree{Classifier,Regressor}CV`` and fit on
      the training part ("hsCART").

    Parameters
    ----------
    X, y
        Features and targets, passed directly to ``train_test_split``.
    max_leaf_nodes : int
        Forwarded to the sklearn decision tree constructor.
    type_ : {"classification", "regression"}
        Selects the tree/wrapper classes and the metric (ROC AUC or R2).
    N : int
        Number of random train/test repetitions.
    random_state : int or None
        Seed for reproducible splits and trees. ``None`` (default) keeps the
        original unseeded behaviour.

    Returns
    -------
    tuple of three lists
        ``(score_CART, score_lbsCART, score_hsCART)``, each with ``N`` scores.

    Raises
    ------
    ValueError
        If ``type_`` is neither ``"classification"`` nor ``"regression"``.
    """
    if type_ == "classification":
        tree_model = DecisionTreeClassifier
        lbstree_model = imodels.HSTreeClassifier
        hstree_model = imodels.HSTreeClassifierCV
    elif type_ == "regression":
        tree_model = DecisionTreeRegressor
        lbstree_model = imodels.HSTreeRegressor
        hstree_model = imodels.HSTreeRegressorCV
    else:
        raise ValueError(
            f"type_ must be 'classification' or 'regression', got {type_!r}"
        )

    score_CART = list()
    score_lbsCART = list()
    score_hsCART = list()

    # One generator drives all repetitions so a single seed reproduces the run.
    rng = None if random_state is None else np.random.RandomState(random_state)

    for _ in range(N):
        # An integer seed per repetition (rather than the generator itself) so
        # that copies of the tree refit with the same seed as the original.
        seed = None if rng is None else int(rng.randint(2**31 - 1))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=seed
        )

        CART = tree_model(max_leaf_nodes=max_leaf_nodes, random_state=seed)
        CART.fit(X_train, y_train)

        # Each wrapper receives its own deep copy of the fitted tree, so the
        # plain CART baseline scored below is the untouched original.
        # NOTE(review): lbsCART is never fit explicitly; this presumably relies
        # on the HSTree constructor applying shrinkage to an already-fitted
        # estimator -- confirm against the installed imodels version.
        lbsCART = lbstree_model(deepcopy(CART), shrinkage_scheme_="leaf_based")
        hsCART = hstree_model(deepcopy(CART), reg_params=[0.1, 1, 10, 25, 50, 100])
        hsCART.fit(X_train, y_train)

        score_CART.append(_held_out_score(CART, X_test, y_test, type_))
        score_lbsCART.append(_held_out_score(lbsCART, X_test, y_test, type_))
        score_hsCART.append(_held_out_score(hsCART, X_test, y_test, type_))

    return score_CART, score_lbsCART, score_hsCART

In [None]:
def lbs_vs_hs(dataset_name, database_name, type_="classification", save=False, save_name=None, leaf_nodes=None):
    """Plot test score vs. number of leaves for CART, CART (LBS) and hsCART.

    Loads the dataset through ``imodels.util.data_util.get_clean_dataset``,
    runs ``experiment`` once per leaf budget, and draws the mean score of each
    model with its standard deviation as error bars.

    Parameters
    ----------
    dataset_name, database_name
        Forwarded unchanged to ``get_clean_dataset``.
    type_ : {"classification", "regression"}
        Forwarded to ``experiment``; also selects the y-axis label
        ("AUC" or "R2").
    save : bool
        If true, the figure is written to
        ``../figures/HS_vs_LBS/{type_}_{save_name}``; the directory is created
        if it does not exist.
    save_name : str or None
        File name suffix for the saved figure. Required when ``save`` is true.
    leaf_nodes : sequence of int or None
        Values of ``max_leaf_nodes`` to evaluate. ``None`` (default) uses
        ``[2, 4, 8, 12, 16, 20, 24, 28, 30, 32]``.

    Raises
    ------
    ValueError
        If ``type_`` is not recognised, or ``save`` is true without a
        ``save_name``. Both are checked before any computation so a bad call
        fails fast instead of after the full experiment.
    """
    from pathlib import Path

    if type_ not in ("classification", "regression"):
        raise ValueError(
            f"type_ must be 'classification' or 'regression', got {type_!r}"
        )
    if save and save_name is None:
        raise ValueError("save_name must be provided when save=True")
    if leaf_nodes is None:
        leaf_nodes = [2, 4, 8, 12, 16, 20, 24, 28, 30, 32]

    # The third return value (column names) is not needed for this comparison.
    X, y, _ = imodels.util.data_util.get_clean_dataset(dataset_name, database_name)

    # One (label, colour) pair per model, in the order `experiment` returns them.
    series = (
        ("CART", "lightsalmon"),
        ("CART (LBS)", "goldenrod"),
        ("hsCART", "firebrick"),
    )
    means = [list() for _ in series]
    stds = [list() for _ in series]

    for max_leaf_nodes in leaf_nodes:
        scores = experiment(X, y, max_leaf_nodes, type_)
        for k, model_scores in enumerate(scores):
            means[k].append(np.mean(model_scores))
            stds[k].append(np.std(model_scores))

    # Draw on the axes returned by subplots instead of clearing the figure and
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots()
    for (label, color), mean, std in zip(series, means, stds):
        ax.errorbar(leaf_nodes, mean, yerr=std, color=color, label=label)
    ax.legend()
    ax.set_xlabel("Number of Leaves")
    ax.set_ylabel("AUC" if type_ == "classification" else "R2")

    if save:
        out_path = f"../figures/HS_vs_LBS/{type_}_{save_name}"
        # Make sure the target directory exists so the save cannot fail after
        # the (slow) experiment has already finished.
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, bbox_inches="tight", facecolor="white", edgecolor="auto")

In [None]:
# Run the CART / CART (LBS) / hsCART comparison for every entry of
# DATASETS_CLASSIFICATION; figures are displayed but not written to disk.
for save_name, (dataset_name, database_name) in tqdm(DATASETS_CLASSIFICATION.items()):
    lbs_vs_hs(
        dataset_name,
        database_name,
        type_="classification",
        save=False,
        save_name=save_name,
    )

In [None]:
# Run the CART / CART (LBS) / hsCART comparison for every entry of
# DATASETS_REGRESSION; figures are displayed but not written to disk.
for save_name, (dataset_name, database_name) in tqdm(DATASETS_REGRESSION.items()):
    lbs_vs_hs(
        dataset_name,
        database_name,
        type_="regression",
        save=False,
        save_name=save_name,
    )