# Claim 1: HS improves the predictive performance of tree-based models (TBM)

In [1]:
# move to notebooks directory
# Steps one level up from the directory the kernel was started in; the
# recorded output shows this landing in the project's `notebooks` directory.
# The `utils.*` imports and the relative `results/` paths used later
# presumably rely on this location — TODO confirm.
# NOTE(review): this is not idempotent. Re-running the cell without restarting
# the kernel moves up one more level each time.
%cd ..

/home/bro/Documents/FRI/MLDS/repro/MLDS/notebooks


In [4]:
%load_ext autoreload
%autoreload 2

###IMPORTS

# system path manipulations
import os
import sys

# standard data science toolbox
import numpy as np
import pandas as pd

# train test splitting
from sklearn.model_selection import train_test_split

# standard DT and RF
from sklearn.tree import export_text, DecisionTreeClassifier, DecisionTreeRegressor

# authors implementations of HS
import imodels

# making deep copies of trees for improvement comparison
from copy import deepcopy

# cross-validation of models
from sklearn.model_selection import StratifiedKFold, KFold

# scoring
from sklearn.metrics import roc_auc_score, r2_score

# hyperparameter search
from sklearn.model_selection import GridSearchCV

# timing algorithm execution
import time

# import datasets
from utils.experiment_functions import get_datasets

# count number of leaves
from utils.experiment_functions import leaf_count

# calculate best alpha
from utils.experiment_functions import pick_alpha, pick_alpha_best

###CONSTANTS

# Datasets used in the experiment, passed by name to `get_datasets`
# (source noted by the notebook author: paper_autors_repo/config/shrinkage/models.py).
CLASSIFICATION_DATASET_NAMES = [
    "heart",
    "breast-cancer",
    "haberman",
    "ionosphere",
    "diabetes",
    "german-credit",
    "juvenile",
    "recidivism",
]
REGRESSION_DATASET_NAMES = [
    "friedman1",
    "friedman3",
    "diabetes-regr",
    "abalone",
    "red-wine",
    "satellite-image",
    "california-housing",
    "music",
]

# Leaf budgets (`max_leaf_nodes`) tried for every decision tree.
num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]

# Candidate regularization strengths for hierarchical shrinkage (HS).
reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# load classification tasks
# `get_datasets` is a project helper (utils.experiment_functions). Later cells
# index its result by dataset name and treat each entry as a DataFrame with a
# "label" column — presumably it returns a dict of DataFrames; TODO confirm.
tasks_classification = get_datasets(CLASSIFICATION_DATASET_NAMES)

# load regression tasks
# Same helper; later cells expect "label" as the target column, except for
# "music", where they read two target columns ("label1", "label2").
tasks_regression = get_datasets(REGRESSION_DATASET_NAMES)

## 4.2 A) Classification

In [12]:
# Claim 1 (classification): compare a plain CART tree with the same tree after
# hierarchical shrinkage (HS), scored by AUC.
#
# Protocol, for every dataset and every repetition:
#   1. draw a fresh shuffled, stratified 3-fold split and keep only its first
#      fold (i.e. one random ~2/3 train, ~1/3 test split per repetition);
#   2. for every leaf budget `m`, fit CART on the training part and score it;
#   3. pick the HS regularization strength by an inner 3-fold CV that only
#      ever touches the training part;
#   4. apply HS with the chosen strength to a copy of the tree from step 2 and
#      score it on the same train/test rows.
#
# NOTE(review): in the paper the HS strength also appears to be chosen via CV;
# whether the authors split off the test data beforehand is unverified.
NUM_OF_BOOTSTRAP_SAMPS = 100

# Seed for the splitters and the trees so that Restart & Run All reproduces
# the same numbers. One shared generator still gives every repetition a
# different split.
RANDOM_SEED = 0
rng = np.random.RandomState(RANDOM_SEED)

CLASSIFICATION_RESULTS_PATH = "results/claim_1_1_dt_comparison_classification.csv"
os.makedirs(os.path.dirname(CLASSIFICATION_RESULTS_PATH), exist_ok=True)

RESULT_COLUMNS = ["task", "dataset", "boot_iter", "algorithm", "scoring", "n_leaves", "max_leaves", "regularization", "train_score", "test_score",
                  "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time", "tunning_wall_time", "tunning_cpu_time"]

# Rows are collected in a plain list and turned into a DataFrame at each
# checkpoint; growing a DataFrame with pd.concat inside the loop is quadratic.
result_rows = []
classification_results = pd.DataFrame(columns=RESULT_COLUMNS)

# Iterate over the same names that were passed to `get_datasets`; this assumes
# `tasks_classification` is keyed by those names.
for dataset_name in CLASSIFICATION_DATASET_NAMES:
    dataset = tasks_classification[dataset_name]
    X, y = np.array(dataset.drop("label", axis=1)), np.array(dataset["label"])

    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            print(f"Dataset: {dataset_name}, Sample: {samp}, Fold {i}", end = "\r")

            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            for m in num_of_leaves:
                # Fields shared by the DT row and the HS row of this configuration.
                row_common = {"task": "classification",
                              "dataset": dataset_name,
                              "boot_iter": samp,
                              "scoring": "AUC",
                              "max_leaves": m}

                ### CART ###

                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()

                m1 = DecisionTreeClassifier(max_leaf_nodes=m, random_state=rng).fit(X_train, y_train)

                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()

                # measure test time (covers prediction on both train and test rows)
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()

                y_train_pred_dt = m1.predict_proba(X_train)[:, 1]
                y_test_pred_dt = m1.predict_proba(X_test)[:, 1]

                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                result_rows.append({**row_common,
                                    "algorithm": "DT",
                                    "n_leaves": leaf_count(m1),
                                    "regularization": "None",
                                    "train_score": roc_auc_score(y_train, y_train_pred_dt),
                                    "test_score": roc_auc_score(y_test, y_test_pred_dt),
                                    "train_wall_time": end_wall_time_train - start_wall_time_train,
                                    "test_wall_time": end_wall_time_test - start_wall_time_test,
                                    "train_cpu_time": end_cpu_time_train - start_cpu_time_train,
                                    "test_cpu_time": end_cpu_time_test - start_cpu_time_test,
                                    "tunning_wall_time": None,
                                    "tunning_cpu_time": None})

                ### Hierarchical shrinkage ###

                # measure tuning time
                start_wall_time_tunning = time.time()
                start_cpu_time_tunning = time.process_time()

                cv_scores = {}
                for reg_param in reg_hs:
                    hs_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
                    fold_scores = []
                    for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):
                        # The inner-fold indices are positions within X_train, so they
                        # must index X_train / y_train. Indexing the full X / y would
                        # select unrelated rows and leak test rows into tuning.
                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]
                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]

                        cv_tree = DecisionTreeClassifier(max_leaf_nodes=m, random_state=rng)
                        cv_tree.fit(X_cv_train, y_cv_train)
                        cv_hs_tree = imodels.HSTreeClassifier(cv_tree, reg_param=reg_param)
                        y_val_pred = cv_hs_tree.predict_proba(X_cv_val)[:, 1]
                        fold_scores.append(roc_auc_score(y_cv_val, y_val_pred))
                    cv_scores[reg_param] = np.mean(fold_scores)

                # Highest mean validation AUC wins; ties go to the first (smallest)
                # candidate in `reg_hs`.
                hs_reg_param = max(cv_scores, key=cv_scores.get)

                end_wall_time_tunning = time.time()
                end_cpu_time_tunning = time.process_time()

                # evaluation of improvements offered by hierarchical shrinkage model
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()

                # A deep copy keeps `m1` untouched. No fit call follows, so the wrapper
                # presumably shrinks the already-fitted tree at construction time —
                # TODO confirm against the installed imodels version.
                mshrunk = imodels.HSTreeClassifier(deepcopy(m1), reg_param=hs_reg_param)

                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()

                # measure test time
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()

                y_train_pred_shrunk = mshrunk.predict_proba(X_train)[:, 1]
                y_test_pred_shrunk = mshrunk.predict_proba(X_test)[:, 1]

                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                result_rows.append({**row_common,
                                    "algorithm": "HS (CART)",
                                    "n_leaves": leaf_count(mshrunk.estimator_),
                                    "regularization": hs_reg_param,
                                    "train_score": roc_auc_score(y_train, y_train_pred_shrunk),
                                    "test_score": roc_auc_score(y_test, y_test_pred_shrunk),
                                    "train_wall_time": end_wall_time_train - start_wall_time_train,
                                    "test_wall_time": end_wall_time_test - start_wall_time_test,
                                    "train_cpu_time": end_cpu_time_train - start_cpu_time_train,
                                    "test_cpu_time": end_cpu_time_test - start_cpu_time_test,
                                    "tunning_wall_time": end_wall_time_tunning - start_wall_time_tunning,
                                    "tunning_cpu_time": end_cpu_time_tunning - start_cpu_time_tunning})

            # Only the first fold of each split is evaluated: every repetition is a
            # single random train/test split, not a full 3-fold cross-validation.
            break

        # Checkpoint after every repetition so an interrupted run keeps its results.
        classification_results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
        classification_results.to_csv(CLASSIFICATION_RESULTS_PATH, index = False)

Dataset: recidivism, Sample: 99, Fold 0d 0

## 4.2 B) Regression

In [14]:
# Claim 1 (regression): compare a plain CART regressor with the same tree after
# hierarchical shrinkage (HS), scored by R2.
#
# Protocol, for every dataset and every repetition:
#   1. draw a fresh shuffled 3-fold split and keep only its first fold
#      (i.e. one random ~2/3 train, ~1/3 test split per repetition);
#   2. for every leaf budget `m`, fit CART on the training part and score it;
#   3. pick the HS regularization strength by an inner 3-fold CV that only
#      ever touches the training part;
#   4. apply HS with the chosen strength to a copy of the tree from step 2 and
#      score it on the same train/test rows.

# leaf budgets (`max_leaf_nodes`) tried for every tree
num_of_leaves = [2, 4, 8, 12, 15, 20, 24, 28, 30, 32]
# candidate regularization strengths for HS
reg_hs = [0.1, 1.0, 10.0, 25.0, 50.0, 100.0]

# NOTE(review): in the paper the HS strength also appears to be chosen via CV;
# whether the authors split off the test data beforehand is unverified.
NUM_OF_BOOTSTRAP_SAMPS = 100

# Seed for the splitters and the trees so that Restart & Run All reproduces
# the same numbers. One shared generator still gives every repetition a
# different split.
RANDOM_SEED = 0
rng = np.random.RandomState(RANDOM_SEED)

REGRESSION_RESULTS_PATH = "results/claim_1_1_dt_comparison_regression.csv"
os.makedirs(os.path.dirname(REGRESSION_RESULTS_PATH), exist_ok=True)

RESULT_COLUMNS = ["task", "dataset", "boot_iter", "algorithm", "scoring", "n_leaves", "max_leaves", "regularization", "train_score", "test_score",
                  "train_wall_time", "test_wall_time", "train_cpu_time", "test_cpu_time", "tunning_wall_time", "tunning_cpu_time"]

# Rows are collected in a plain list and turned into a DataFrame at each
# checkpoint; growing a DataFrame with pd.concat inside the loop is quadratic.
result_rows = []
regression_results = pd.DataFrame(columns=RESULT_COLUMNS)

# Iterate over the same names that were passed to `get_datasets`; this assumes
# `tasks_regression` is keyed by those names.
for dataset_name in REGRESSION_DATASET_NAMES:
    dataset = tasks_regression[dataset_name]
    # "music" has two target columns, so its y is 2-D; every other dataset has
    # a single "label" column and a 1-D y.
    label_columns = ["label1", "label2"] if dataset_name == "music" else "label"
    X = np.array(dataset.drop(label_columns, axis = 1))
    y = np.array(dataset[label_columns])

    for samp in range(NUM_OF_BOOTSTRAP_SAMPS):
        skf = KFold(n_splits=3, shuffle = True, random_state=rng)

        for i, (train_index, test_index) in enumerate(skf.split(X)):
            print(f"Dataset: {dataset_name}, Sample: {samp}, Fold {i}", end = "\r")

            # Row indexing along the first axis works for both 1-D and 2-D targets.
            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]

            for m in num_of_leaves:
                # Fields shared by the DT row and the HS row of this configuration.
                row_common = {"task": "regression",
                              "dataset": dataset_name,
                              "boot_iter": samp,
                              "scoring": "R2",
                              "max_leaves": m}

                ### CART ###

                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()

                m1 = DecisionTreeRegressor(max_leaf_nodes=m, random_state=rng).fit(X_train, y_train)

                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()

                # measure test time (covers prediction on both train and test rows)
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()

                y_train_pred_dt = m1.predict(X_train)
                y_test_pred_dt = m1.predict(X_test)

                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                result_rows.append({**row_common,
                                    "algorithm": "DT",
                                    "n_leaves": leaf_count(m1),
                                    "regularization": "None",
                                    "train_score": r2_score(y_train, y_train_pred_dt),
                                    "test_score": r2_score(y_test, y_test_pred_dt),
                                    "train_wall_time": end_wall_time_train - start_wall_time_train,
                                    "test_wall_time": end_wall_time_test - start_wall_time_test,
                                    "train_cpu_time": end_cpu_time_train - start_cpu_time_train,
                                    "test_cpu_time": end_cpu_time_test - start_cpu_time_test,
                                    "tunning_wall_time": None,
                                    "tunning_cpu_time": None})

                ### Hierarchical shrinkage ###

                # measure tuning time
                start_wall_time_tunning = time.time()
                start_cpu_time_tunning = time.process_time()

                cv_scores = {}
                for reg_param in reg_hs:
                    hs_skf = KFold(n_splits=3, shuffle = True, random_state=rng)
                    fold_scores = []
                    for cv_train_index, cv_val_index in hs_skf.split(X_train, y_train):
                        # The inner-fold indices are positions within X_train, so they
                        # must index X_train / y_train. Indexing the full X / y would
                        # select unrelated rows and leak test rows into tuning.
                        X_cv_train, y_cv_train = X_train[cv_train_index, :], y_train[cv_train_index]
                        X_cv_val, y_cv_val = X_train[cv_val_index, :], y_train[cv_val_index]

                        cv_tree = DecisionTreeRegressor(max_leaf_nodes=m, random_state=rng)
                        cv_tree.fit(X_cv_train, y_cv_train)
                        cv_hs_tree = imodels.HSTreeRegressor(cv_tree, reg_param=reg_param)
                        y_val_pred = cv_hs_tree.predict(X_cv_val)
                        fold_scores.append(r2_score(y_cv_val, y_val_pred))
                    cv_scores[reg_param] = np.mean(fold_scores)

                # Highest mean validation R2 wins; ties go to the first (smallest)
                # candidate in `reg_hs`.
                hs_reg_param = max(cv_scores, key=cv_scores.get)

                end_wall_time_tunning = time.time()
                end_cpu_time_tunning = time.process_time()

                # evaluation of improvements offered by hierarchical shrinkage model
                # measure train time
                start_wall_time_train = time.time()
                start_cpu_time_train = time.process_time()

                # A deep copy keeps `m1` untouched. No fit call follows, so the wrapper
                # presumably shrinks the already-fitted tree at construction time —
                # TODO confirm against the installed imodels version.
                mshrunk = imodels.HSTreeRegressor(deepcopy(m1), reg_param=hs_reg_param)

                end_wall_time_train = time.time()
                end_cpu_time_train = time.process_time()

                # measure test time
                start_wall_time_test = time.time()
                start_cpu_time_test = time.process_time()

                y_train_pred_shrunk = mshrunk.predict(X_train)
                y_test_pred_shrunk = mshrunk.predict(X_test)

                end_wall_time_test = time.time()
                end_cpu_time_test = time.process_time()

                result_rows.append({**row_common,
                                    "algorithm": "HS (CART)",
                                    "n_leaves": leaf_count(mshrunk.estimator_),
                                    "regularization": hs_reg_param,
                                    "train_score": r2_score(y_train, y_train_pred_shrunk),
                                    "test_score": r2_score(y_test, y_test_pred_shrunk),
                                    "train_wall_time": end_wall_time_train - start_wall_time_train,
                                    "test_wall_time": end_wall_time_test - start_wall_time_test,
                                    "train_cpu_time": end_cpu_time_train - start_cpu_time_train,
                                    "test_cpu_time": end_cpu_time_test - start_cpu_time_test,
                                    "tunning_wall_time": end_wall_time_tunning - start_wall_time_tunning,
                                    "tunning_cpu_time": end_cpu_time_tunning - start_cpu_time_tunning})

            # Only the first fold of each split is evaluated: every repetition is a
            # single random train/test split, not a full 3-fold cross-validation.
            break

        # Checkpoint after every repetition so an interrupted run keeps its results.
        regression_results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
        regression_results.to_csv(REGRESSION_RESULTS_PATH, index = False)

Dataset: red-wine, Sample: 99, Fold 099, Fold 0