In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from copy import deepcopy
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

import imodels
#from imodels import HSTreeRegressorCV, HSTreeClassifierCV
#from imodels import HSTreeRegressor, HSTreeClassifier

from tqdm import tqdm

from datasets import DATASETS_REGRESSION, DATASETS_CLASSIFICATION

In [None]:
def experiment(X, y, max_leaf_nodes, type_="classification", N=10, random_state=None):
    """Score CART, leaf-based-shrinkage CART and CV-tuned HS CART on repeated random splits.

    Every repetition draws a fresh train/test split (33% held out), fits a CART
    tree on the training part, wraps deep copies of that fitted tree in the two
    imodels shrinkage estimators and scores all three models on the held-out part.

    Parameters
    ----------
    X, y : array-like
        Features and targets; passed unchanged to ``train_test_split``.
    max_leaf_nodes : int
        ``max_leaf_nodes`` handed to the CART constructor.
    type_ : {"classification", "regression"}
        Selects the estimator classes and the metric: ROC AUC on column 1 of
        ``predict_proba`` for classification, R^2 on ``predict`` for regression.
    N : int
        Number of repetitions.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) shared by the train/test split and the CART fit so
        that a run can be reproduced. ``None`` (default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple of three lists
        ``(score_CART, score_lbsCART, score_hsCART)``, each holding ``N`` scores.

    Raises
    ------
    ValueError
        If ``type_`` is neither ``"classification"`` nor ``"regression"``.
    """
    # Validate first: previously an unknown type_ only surfaced later as an
    # UnboundLocalError inside the loop.
    if type_ == "classification":
        metric = roc_auc_score
        tree_model = DecisionTreeClassifier
        lbstree_model = imodels.HSTreeClassifier
        hstree_model = imodels.HSTreeClassifierCV
    elif type_ == "regression":
        metric = r2_score
        tree_model = DecisionTreeRegressor
        lbstree_model = imodels.HSTreeRegressor
        hstree_model = imodels.HSTreeRegressorCV
    else:
        raise ValueError(
            f'type_ must be "classification" or "regression", got {type_!r}'
        )

    # A single generator is threaded through all repetitions so that every
    # repetition gets a different split while the whole sequence stays reproducible.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    score_CART = []
    score_lbsCART = []
    score_hsCART = []

    for _ in range(N):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=rng
        )

        CART = tree_model(max_leaf_nodes=max_leaf_nodes, random_state=rng)
        CART.fit(X_train, y_train)

        # lbsCART is never fitted explicitly; presumably the imodels constructor
        # applies shrinkage to an already-fitted estimator -- TODO confirm
        # against the installed imodels version.
        lbsCART = lbstree_model(deepcopy(CART), shrinkage_scheme_="leaf_based")
        # NOTE(review): the keyword here is `reg_params`; verify that the
        # installed imodels CV class accepts this name, otherwise this grid may
        # be silently ignored.
        hsCART = hstree_model(deepcopy(CART), reg_params=[0.1, 1, 10, 25, 50, 100])
        hsCART.fit(X_train, y_train)

        if type_ == "classification":
            # Column 1 of predict_proba is used as the score for roc_auc_score.
            y_pred_CART = CART.predict_proba(X_test)[:, 1]
            y_pred_lbsCART = lbsCART.predict_proba(X_test)[:, 1]
            y_pred_hsCART = hsCART.predict_proba(X_test)[:, 1]
        else:
            y_pred_CART = CART.predict(X_test)
            y_pred_lbsCART = lbsCART.predict(X_test)
            y_pred_hsCART = hsCART.predict(X_test)

        score_CART.append(metric(y_test, y_pred_CART))
        score_lbsCART.append(metric(y_test, y_pred_lbsCART))
        score_hsCART.append(metric(y_test, y_pred_hsCART))

    return score_CART, score_lbsCART, score_hsCART

In [None]:
def better_same_worse(dataset_name, database_name, type_="classification", N=100, ROPE=0.005):
    """Count how often CV-tuned HS beats, ties or loses to leaf-based shrinkage on one dataset.

    For each leaf budget in a fixed grid, ``experiment`` is run with ``N``
    repetitions. The LBS and HS score vectors are then resampled independently
    with replacement (``N`` draws each) and compared pairwise; a difference
    within ``ROPE`` counts as "same".

    Parameters
    ----------
    dataset_name, database_name
        Forwarded to ``imodels.util.data_util.get_clean_dataset``.
    type_ : str
        Forwarded to ``experiment``.
    N : int
        Repetitions per leaf budget and number of bootstrap draws.
    ROPE : float
        Half-width of the region of practical equivalence on the score difference.

    Returns
    -------
    tuple
        ``(better, same, worse, df)``: the three counts summed over all leaf
        budgets, and a DataFrame with columns ``dataset``, ``n_leaves`` and
        ``score``. The ``score`` column holds the LBS scores from ``experiment``.
    """
    X, y, _ = imodels.util.data_util.get_clean_dataset(dataset_name, database_name)

    leaf_nodes = [2, 4, 8, 12, 16, 20, 24, 28, 30, 32]

    frames = []
    BETTER, SAME, WORSE = [], [], []

    for max_leaf_nodes in tqdm(leaf_nodes):
        # The plain-CART scores are not needed for this comparison.
        _, score_lbsCART, score_hsCART = experiment(X, y, max_leaf_nodes, type_, N=N)
        score_lbsCART = np.asarray(score_lbsCART)
        score_hsCART = np.asarray(score_hsCART)

        frames.append(pd.DataFrame({
            "dataset": [dataset_name] * N,
            "n_leaves": [max_leaf_nodes] * N,
            "score": score_lbsCART,
        }))

        # Independent bootstrap resamples of the two score vectors.
        idx_lbsCART = np.random.choice(N, size=N, replace=True)
        idx_hsCART = np.random.choice(N, size=N, replace=True)
        diff = score_hsCART[idx_hsCART] - score_lbsCART[idx_lbsCART]

        BETTER.append((diff > ROPE).sum())
        SAME.append((np.abs(diff) <= ROPE).sum())
        WORSE.append((diff < -ROPE).sum())

    # Concatenate once instead of growing a frame seeded with an empty
    # DataFrame, which is deprecated in recent pandas and yields object-dtype columns.
    df = pd.concat(frames, ignore_index=True)

    return np.sum(BETTER), np.sum(SAME), np.sum(WORSE), df

In [None]:
def plot_better_same_worse(DATASETS, type_="classification", save=False):
    """Draw a stacked bar chart of HS-vs-LBS better/same/worse fractions per dataset.

    Parameters
    ----------
    DATASETS : dict
        Maps a display name to a ``(dataset_name, database_name)`` pair; each
        pair is forwarded to ``better_same_worse``.
    type_ : str
        Forwarded to ``better_same_worse``; also used as the file name of the
        saved figure.
    save : bool
        When true, the figure is written under ``../../figures/HS_vs_LBS/``.

    Returns
    -------
    pandas.DataFrame
        The per-dataset frames returned by ``better_same_worse``, concatenated.
        The running result is also written to ``tmp.csv`` after every dataset.
    """
    BETTER, SAME, WORSE, names = [], [], [], []
    frames = []
    df = pd.DataFrame(columns=["dataset", "n_leaves", "score"])

    for save_name, (dataset_name, database_name) in DATASETS.items():
        better, same, worse, df_dataset = better_same_worse(dataset_name, database_name, type_)
        BETTER.append(better)
        SAME.append(same)
        WORSE.append(worse)
        names.append(save_name)

        frames.append(df_dataset)
        # Rebuild from the collected frames rather than concatenating onto an
        # initially empty DataFrame (deprecated in recent pandas).
        df = pd.concat(frames, ignore_index=True)
        # Checkpoint after every dataset so finished results survive a later failure.
        df.to_csv("tmp.csv", index=False)

    SAME = np.array(SAME)
    BETTER = np.array(BETTER)
    WORSE = np.array(WORSE)
    names = np.array(names)

    # Turn counts into per-dataset fractions that sum to one.
    N = SAME + BETTER + WORSE
    SAME = np.divide(SAME, N)
    BETTER = np.divide(BETTER, N)
    WORSE = np.divide(WORSE, N)

    # Use a dedicated figure so repeated calls never draw on top of each other.
    fig, ax = plt.subplots()
    ax.bar(names, BETTER, color="#56B4E9", label="better")
    ax.bar(names, SAME, bottom=BETTER, color="#009E73", label="same")
    ax.bar(names, WORSE, bottom=BETTER + SAME, color="#E69F00", label="worse")
    ax.axhline(0.5, color="red", linestyle="--")
    ax.tick_params(axis="x", labelrotation=45)
    ax.set_ylabel("probability of HS being better than LBS")
    ax.legend()
    if save:
        fig.savefig("../../figures/HS_vs_LBS/" + type_, bbox_inches="tight", facecolor="white", edgecolor="auto")

    return df

In [None]:
# Aggregate HS-vs-LBS comparison over all classification datasets (figure is shown, not saved).
df_classification = plot_better_same_worse(
    DATASETS_CLASSIFICATION,
    type_="classification",
    save=False,
)

In [None]:
# Aggregate HS-vs-LBS comparison over all regression datasets (figure is also written to disk).
df_regression = plot_better_same_worse(
    DATASETS_REGRESSION,
    type_="regression",
    save=True,
)

In [None]:
def lbs_vs_hs(dataset_name, database_name, type_="classification", save=False, save_name=None):
    """Plot mean +/- std test score against leaf budget for CART, CART (LBS) and hsCART.

    Parameters
    ----------
    dataset_name, database_name
        Forwarded to ``imodels.util.data_util.get_clean_dataset``.
    type_ : str
        Forwarded to ``experiment``; ``"classification"`` labels the y axis
        "AUC", anything else "R2".
    save : bool
        When true, the figure is written under ``../graphs/claim_2/HS_vs_LBS/``.
    save_name : str or None
        Suffix of the saved file name. Falls back to ``dataset_name`` when
        omitted, so the file is never literally named ``..._None``.

    Returns
    -------
    None
    """
    X, y, _ = imodels.util.data_util.get_clean_dataset(dataset_name, database_name)

    leaf_nodes = [2, 4, 8, 12, 16, 20, 24, 28, 30, 32]

    # (legend label, colour), in the order experiment() returns its score lists.
    curves = (
        ("CART", "lightsalmon"),
        ("CART (LBS)", "goldenrod"),
        ("hsCART", "firebrick"),
    )
    means = [[] for _ in curves]
    stds = [[] for _ in curves]

    for max_leaf_nodes in leaf_nodes:
        for i, scores in enumerate(experiment(X, y, max_leaf_nodes, type_)):
            means[i].append(np.mean(scores))
            stds[i].append(np.std(scores))

    # Plot on the explicit axes; the original created them and then discarded
    # them with plt.clf() before plotting through the pyplot state machine.
    fig, ax = plt.subplots()
    for (label, color), curve_mean, curve_std in zip(curves, means, stds):
        ax.errorbar(leaf_nodes, curve_mean, yerr=curve_std, color=color, label=label)
    ax.legend()
    ax.set_xlabel("Number of Leaves")
    ax.set_ylabel("AUC" if type_ == "classification" else "R2")
    if save:
        if save_name is None:
            save_name = dataset_name
        fig.savefig(f"../graphs/claim_2/HS_vs_LBS/{type_}_{save_name}", bbox_inches="tight", facecolor="white", edgecolor="auto")

In [None]:
# One LBS-vs-HS curve figure per classification dataset, each written to disk.
for save_name, (dataset_name, database_name) in tqdm(DATASETS_CLASSIFICATION.items()):
    lbs_vs_hs(
        dataset_name,
        database_name,
        type_="classification",
        save=True,
        save_name=save_name,
    )

In [None]:
# One LBS-vs-HS curve figure per regression dataset, each written to disk.
for save_name, (dataset_name, database_name) in tqdm(DATASETS_REGRESSION.items()):
    lbs_vs_hs(
        dataset_name,
        database_name,
        type_="regression",
        save=True,
        save_name=save_name,
    )