In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statistics import mean 
from scipy.stats import sem
import xgboost as xgb
import os
import sys

# Directory the notebook was started from; all project paths are built from it.
cwd = os.getcwd()
# Make the local AutoML sources importable from this notebook.
sys.path.append(f"{cwd}/src_autoML/")
print(cwd)

/Users/liug28/Downloads/AutoML/AutoML_package


# chronic kidney disease dataset

In [2]:
# Name of the label column in the chronic kidney disease CSV.
kidney_target = 'classification'
# Root folder of the stored AutoML results that the later sections read.
kidney_result_path = f"{cwd}/src_autoML/examples/kidney_result"
# Path of the raw CSV file (a file path, despite the `_dir` suffix).
kidney_data_dir = f"{cwd}/src_autoML/examples/data/chronic_kidney_disease_dataset.csv"
# Disabled label clean-up, kept for reference:
# kidney_data[kidney_target].replace({'ckd\t':'ckd'}, inplace=True)

## 1 default setting xgboost

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin

# Load the raw dataset, then separate the predictors from the label column.
kidney_data = pd.read_csv(kidney_data_dir)
x_kidney = kidney_data.drop(columns=[kidney_target])
y_kidney = kidney_data[kidney_target]
# NOTE(review): only the label column is removed here, so the `id` column
# visible in the frame preview further down remains a feature - confirm that
# this is intended.

class DataFrameImputer(TransformerMixin):
    """Impute missing values of a DataFrame column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.
    Columns of other types are imputed with the mean of the column.
    """

    def __init__(self):
        """Create an unfitted imputer; the fill values are learned in ``fit``."""

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill the gaps of a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() ignores NaN, so a column with no observed value
            # gives an empty result; indexing it would raise IndexError.
            # Such a column has nothing to impute with and is left as NaN.
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self`` so that calls can be chained.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries replaced by the fitted fill values."""
        return X.fillna(self.fill)

# Fill the missing entries first, then expand the categorical columns into
# indicator (dummy) columns.
x_kidney = pd.get_dummies(
    DataFrameImputer().fit_transform(pd.DataFrame(x_kidney))
)


In [4]:
def model_perform_validation(clf, prepro, x, y, n_validation, test_size=0.2):
    """Estimate test ROC AUC over repeated stratified train/test splits.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``; it is re-fitted on
        every split.
    prepro : transformer with ``fit_transform`` and ``transform``.
    x, y : features and labels handed to ``train_test_split`` (``y`` is also
        used for stratification).
    n_validation : number of splits; split ``i`` is seeded with ``i``.
    test_size : forwarded to ``train_test_split``.

    Returns
    -------
    list
        One ROC AUC value per split, in split order.
    """
    aucs = []
    for i in range(n_validation):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, random_state=np.random.RandomState(i),
            test_size=test_size, stratify=y)
        X_train = prepro.fit_transform(X_train)
        model = clf.fit(X_train, y_train)
        # Apply the preprocessing learned on the training fold. Calling
        # fit_transform here would re-fit the transformer on the test fold,
        # so the test data would be scaled with its own statistics.
        X_test = prepro.transform(X_test)
        # Column 1 holds the probability of the second entry of classes_.
        y_pred = model.predict_proba(X_test)[:, 1]
        aucs.append(roc_auc_score(y_test, y_pred))
    return aucs

In [6]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression scored over 100 random splits.
# with_mean=False and with_std=False switch off both centring and scaling,
# so the scaler hands the features through unchanged.
logistic_regression_model = LogisticRegression(random_state=0, solver='liblinear')
logistic_aucs = model_perform_validation(
    logistic_regression_model,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_kidney,
    y_kidney,
    100,
)
print(mean(logistic_aucs))
print(sem(logistic_aucs))

0.99834
0.00020067115222566965


In [7]:
# XGBoost classifier built with no arguments, i.e. the library defaults.
default_clf = xgb.XGBClassifier()


In [8]:
# Score the default XGBoost model over 100 random splits (scaler is a pass-through).
default_aucs = model_perform_validation(
    default_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_kidney,
    y_kidney,
    100,
)


In [11]:
# Mean test ROC AUC over the repeated splits, followed by its standard error.
print (mean(default_aucs))
print (sem(default_aucs))

0.9984433333333333
0.0003579482309314243


## 2 allow missing autoML

In [19]:
# Stored AutoML results for the "allow missing" setting, preprocessing option '1'.
allowMissing_prepro = '1'
allowMissing_dir = f"{kidney_result_path}/resultAllowMissing/{allowMissing_prepro}"
am_x_train_dir = f"{allowMissing_dir}/X_train_important.csv"
am_x_test_dir = f"{allowMissing_dir}/X_test_important.csv"
am_y_train_dir = f"{allowMissing_dir}/y_train.csv"
am_y_test_dir = f"{allowMissing_dir}/y_test.csv"

# The first CSV column is used as the row index of each frame.
am_x_train, am_x_test, am_y_train, am_y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (am_x_train_dir, am_x_test_dir, am_y_train_dir, am_y_test_dir)
)

# Re-join the stored train/test parts; the validation helper draws its own splits.
x_kidney_am = pd.concat([am_x_train, am_x_test])
y_kidney_am = pd.concat([am_y_train, am_y_test])

# XGBoost configuration for the allow-missing feature set.
# NOTE(review): the values look copied from an AutoML run - confirm the source.
# Every keyword of the original call is kept; only the layout is regrouped.
kidney_am_clf = xgb.XGBClassifier(
    # objective and boosting schedule
    objective='binary:logistic', base_score=0.5, booster=None,
    n_estimators=1000, learning_rate=0.09547791922258886,
    # tree construction
    max_depth=1, min_child_weight=1, max_delta_step=0,
    gamma=0.10931758182961511, tree_method=None, num_parallel_tree=None,
    # row / column subsampling
    subsample=0.9427950482564263, colsample_bytree=0.9618275026279567,
    colsample_bylevel=0.9381242698734285, colsample_bynode=None,
    # regularisation and class balance
    reg_alpha=0.12551300960344314, reg_lambda=1.1469407983901347,
    scale_pos_weight=1,
    # constraints and missing-value marker
    interaction_constraints=None, monotone_constraints=None, missing=np.nan,
    # runtime and bookkeeping
    seed=0, random_state=None, n_jobs=None, gpu_id=None,
    importance_type='gain', validate_parameters=False, verbosity=None,
)


In [20]:
# Score the allow-missing model over 100 random splits; labels come from the 'ckd' column.
kidney_am_aucs = model_perform_validation(
    kidney_am_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_kidney_am,
    y_kidney_am['ckd'],
    100,
)


In [21]:
# Mean test ROC AUC over the repeated splits, followed by its standard error.
print (mean(kidney_am_aucs))
print (sem(kidney_am_aucs))


0.99996
1.851986527089379e-05


## 3 not allow missing autoML

Extra-trees classifier, using preprocessing method number 1.

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

In [16]:
# Stored AutoML results for the "no missing allowed" setting, preprocessing option '1'.
notAllowMissing_prepro = '1'
notAllowMissing_dir = f"{kidney_result_path}/resultNoMissingAllow/{notAllowMissing_prepro}"
nam_x_train_dir = f"{notAllowMissing_dir}/X_train_important.csv"
nam_x_test_dir = f"{notAllowMissing_dir}/X_test_important.csv"
nam_y_train_dir = f"{notAllowMissing_dir}/y_train.csv"
nam_y_test_dir = f"{notAllowMissing_dir}/y_test.csv"

# The first CSV column is used as the row index of each frame.
nam_x_train, nam_x_test, nam_y_train, nam_y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (nam_x_train_dir, nam_x_test_dir, nam_y_train_dir, nam_y_test_dir)
)

# Re-join the stored train/test parts; the validation helper draws its own splits.
x_kidney_nam = pd.concat([nam_x_train, nam_x_test])
y_kidney_nam = pd.concat([nam_y_train, nam_y_test])

# Extra-trees configuration for the no-missing feature set.
# `min_impurity_split=None` is no longer passed: None is that argument's
# default, and scikit-learn 1.0 removed the parameter, so passing it makes the
# constructor raise TypeError on current releases.
kidney_nam_clf = ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                     max_depth=None, max_features=0.7503423342390505,
                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                     min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=48, n_jobs=1, oob_score=False, random_state=0,
                     verbose=False, warm_start=False)


In [17]:
# Score the extra-trees model over 100 random splits; labels come from the 'ckd' column.
kidney_nam_aucs = model_perform_validation(
    kidney_nam_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_kidney_nam,
    y_kidney_nam['ckd'],
    100,
)


In [18]:
# Mean test ROC AUC over the repeated splits, followed by its standard error.
print (mean(kidney_nam_aucs))
print (sem(kidney_nam_aucs))


0.9917733333333334
0.0011453010425711116


## 4 not allow missing autoML from all classifiers

In [5]:
# Stored AutoML results for the "no missing allowed" setting, preprocessing option '0'.
# This rebinds the nam_* names that the preprocessing-option-'1' section also uses.
notAllowMissing_prepro = '0'
notAllowMissing_dir = f"{kidney_result_path}/resultNoMissingAllow/{notAllowMissing_prepro}"
nam_x_train_dir = f"{notAllowMissing_dir}/X_train_important.csv"
nam_x_test_dir = f"{notAllowMissing_dir}/X_test_important.csv"
nam_y_train_dir = f"{notAllowMissing_dir}/y_train.csv"
nam_y_test_dir = f"{notAllowMissing_dir}/y_test.csv"

# The first CSV column is used as the row index of each frame.
nam_x_train, nam_x_test, nam_y_train, nam_y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (nam_x_train_dir, nam_x_test_dir, nam_y_train_dir, nam_y_test_dir)
)

# Re-join the stored train/test parts; the validation helper draws its own splits.
x_kidney_nam = pd.concat([nam_x_train, nam_x_test])
y_kidney_nam = pd.concat([nam_y_train, nam_y_test])

In [6]:
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees configuration for preprocessing option '0'.
# `min_impurity_split=None` is no longer passed: None is that argument's
# default, and scikit-learn 1.0 removed the parameter, so passing it makes the
# constructor raise TypeError on current releases.
et_clf = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=36, n_jobs=1,
                     oob_score=False, random_state=0, verbose=False,
                     warm_start=False)

# lr_clf = LogisticRegression(C=5.251280863890487, class_weight=None, dual=False,
#                    fit_intercept=True, intercept_scaling=9.735618231724368,
#                    l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
#                    penalty='l2', random_state=0, solver='liblinear',
#                    tol=0.004302760799758684, verbose=0, warm_start=False)

In [7]:
# Score the extra-trees model over 100 random splits; labels come from the 'ckd' column.
kidney_nam_aucs = model_perform_validation(
    et_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_kidney_nam,
    y_kidney_nam['ckd'],
    100,
)

In [8]:
# Mean test ROC AUC over the repeated splits, followed by its standard error.
print (mean(kidney_nam_aucs))
print (sem(kidney_nam_aucs))

0.9999766666666666
1.2761298112549434e-05


## 5 H2O classifier

In [46]:
from h2o.automl import H2OAutoML
import h2o


In [47]:
# Work on a copy so the H2O experiments do not touch the frame loaded earlier.
kidney_h2o = kidney_data.copy()
# Disabled label recoding, kept for reference:
# kidney_h2o[kidney_target].replace({1:'Yes', 0:'No'}, inplace=True)


In [48]:
# Preview the frame that is handed to H2O in the following cells.
kidney_h2o

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [49]:
# Start, or attach to, a local H2O cluster.
# NOTE(review): the captured output shows init() already reporting "connected"
# and connect() then printing the same cluster table a second time, so the
# second call looks redundant - confirm before removing it.
h2o.init(nthreads = -1, max_mem_size = 8)
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 25 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.466 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Connecting to H2O server at http://localhost:54321 ... successful.


0,1
H2O_cluster_uptime:,1 hour 25 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.466 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


<H2OConnection to http://localhost:54321, no session>

In [50]:
# Convert the pandas frame into an H2OFrame (the output shows a parse step).
df = h2o.H2OFrame(kidney_h2o)


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [51]:
# Per-column type, summary statistics and missing-value counts as parsed by H2O.
df.describe()

Rows:400
Cols:26




Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
type,int,int,int,real,int,int,enum,enum,enum,enum,int,real,real,real,real,real,int,int,real,enum,enum,enum,enum,enum,enum,enum
mins,0.0,2.0,50.0,1.005,0.0,0.0,,,,,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1,,,,,,,
mean,199.5,51.4833759590793,76.46907216494847,1.0174079320113314,1.0169491525423722,0.45014245014244997,,,,,148.03651685393265,57.425721784776904,3.072454308093998,137.5287539936102,4.627243589743593,12.526436781609192,38.871951219512184,8413.698630136989,4.707434944237918,,,,,,,
maxs,399.0,90.0,180.0,1.025,5.0,5.0,,,,,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0,,,,,,,
sigma,115.61430130683084,17.16971408926224,13.68363749352527,0.005716616974376757,1.3526789127628445,1.099191251885407,,,,,79.28171423511773,50.50300584922251,5.741126066859788,10.408752051798777,3.1939041765566945,2.912586608826765,9.000955088843337,2951.7338404534125,1.0253232655721791,,,,,,,
zeros,1,0,0,0,199,290,,,,,0,0,0,0,0,0,0,0,0,,,,,,,
missing,0,9,12,47,46,49,0,0,0,0,44,19,17,87,88,52,72,108,131,0,0,0,0,0,0,0
0,0.0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1.0,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2.0,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd


In [52]:
# AutoML search limited to 10 models, ranked and early-stopped on AUC, with
# deep learning, stacked ensembles and GLM excluded.
h2o_estimator = H2OAutoML(
    max_models=10,
    seed=1,
    exclude_algos=["DeepLearning", "StackedEnsemble", "GLM"],
    sort_metric="AUC",
    stopping_metric="AUC",
)


In [53]:
# Hold out roughly 20% of the rows for the final evaluation of the leader.
train_data, test_data = df.split_frame(ratios=[0.8], seed=0)
# list.remove() mutates the list in place and returns None, so the previous
# `x=list(df.columns).remove(kidney_target)` always passed x=None. The
# predictor list is now built explicitly.
# NOTE(review): this list still contains the `id` column, which tops the
# variable-importance table printed below - consider excluding it.
h2o_estimator.train(
    x=[column for column in df.columns if column != kidney_target],
    y=kidney_target,
    training_frame=train_data,
)


AutoML progress: |████████████████████████████████████████████████████████| 100%


In [54]:
# Leaderboard of the models trained by the AutoML run.
h2o_estimator.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_2_AutoML_20200601_004034,1.0,0.159885,1.0,0.0,0.160162,0.025652
GBM_3_AutoML_20200601_004034,1.0,0.0321043,1.0,0.0,0.0541888,0.00293643
XRT_1_AutoML_20200601_004034,1.0,0.039467,1.0,0.0,0.0714893,0.00511072
DRF_1_AutoML_20200601_004034,1.0,0.0222127,1.0,0.0,0.0494621,0.00244649
GBM_2_AutoML_20200601_004034,1.0,0.0320183,1.0,0.0,0.0559252,0.00312762
XGBoost_3_AutoML_20200601_004034,1.0,0.0311685,1.0,0.0,0.048031,0.00230698
GBM_1_AutoML_20200601_004034,1.0,0.0276362,1.0,0.0,0.0444383,0.00197477
XGBoost_1_AutoML_20200601_004034,1.0,0.0674516,1.0,0.0,0.0842285,0.00709445
GBM_4_AutoML_20200601_004034,1.0,0.0320208,1.0,0.0,0.0547681,0.00299954
GBM_5_AutoML_20200601_004034,0.999243,0.0798936,0.998621,0.00487805,0.113159,0.012805




In [55]:
# Full report (metrics, scoring history, variable importances) of the leading model.
h2o_estimator.leader

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_2_AutoML_20200601_004034


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,30.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.019849778371375554
RMSE: 0.1408892415033013
LogLoss: 0.12108103549483021
Mean Per-Class Error: 0.0024390243902439046
AUC: 0.9997897392767031
AUCPR: 0.9996234908318737
Gini: 0.9995794785534062

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4359660744667053: 


Unnamed: 0,Unnamed: 1,ckd,notckd,Error,Rate
0,ckd,204.0,1.0,0.0049,(1.0/205.0)
1,notckd,0.0,116.0,0.0,(0.0/116.0)
2,Total,204.0,117.0,0.0031,(1.0/321.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.435966,0.995708,9.0
1,max f2,0.435966,0.998279,9.0
2,max f0point5,0.435966,0.993151,9.0
3,max accuracy,0.435966,0.996885,9.0
4,max precision,0.877924,1.0,0.0
5,max recall,0.435966,1.0,9.0
6,max specificity,0.877924,1.0,0.0
7,max absolute_mcc,0.435966,0.993286,9.0
8,max min_per_class_accuracy,0.435966,0.995122,9.0
9,max mean_per_class_accuracy,0.435966,0.997561,9.0



Gains/Lift Table: Avg response rate: 36.14 %, avg score: 35.89 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.264798,0.877924,2.767241,2.767241,1.0,0.877924,1.0,0.877924,0.732759,0.732759,176.724138,176.724138
1,,2,0.317757,0.837316,2.767241,2.767241,1.0,0.837316,1.0,0.871156,0.146552,0.87931,176.724138,176.724138
2,,3,0.401869,0.229745,1.434866,2.488372,0.518519,0.523769,0.899225,0.798447,0.12069,1.0,43.48659,148.837209
3,,4,0.563863,0.075845,0.0,1.773481,0.0,0.103615,0.640884,0.598827,0.0,1.0,-100.0,77.348066
4,,5,0.626168,0.070939,0.0,1.597015,0.0,0.070939,0.577114,0.546301,0.0,1.0,-100.0,59.701493
5,,6,1.0,0.042873,0.0,1.0,0.0,0.045129,0.361371,0.358947,0.0,1.0,-100.0,0.0




ModelMetricsBinomial: xgboost
** Reported on cross-validation data. **

MSE: 0.025652010054104803
RMSE: 0.16016244895138437
LogLoss: 0.1598849496575654
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6224709749221802: 


Unnamed: 0,Unnamed: 1,ckd,notckd,Error,Rate
0,ckd,205.0,0.0,0.0,(0.0/205.0)
1,notckd,0.0,116.0,0.0,(0.0/116.0)
2,Total,205.0,116.0,0.0,(0.0/321.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.622471,1.0,11.0
1,max f2,0.622471,1.0,11.0
2,max f0point5,0.622471,1.0,11.0
3,max accuracy,0.622471,1.0,11.0
4,max precision,0.821833,1.0,0.0
5,max recall,0.622471,1.0,11.0
6,max specificity,0.821833,1.0,0.0
7,max absolute_mcc,0.622471,1.0,11.0
8,max min_per_class_accuracy,0.622471,1.0,11.0
9,max mean_per_class_accuracy,0.622471,1.0,11.0



Gains/Lift Table: Avg response rate: 36.14 %, avg score: 35.97 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.065421,0.821833,2.767241,2.767241,1.0,0.821833,1.0,0.821833,0.181034,0.181034,176.724138,176.724138
1,,2,0.133956,0.820489,2.767241,2.767241,1.0,0.820489,1.0,0.821145,0.189655,0.37069,176.724138,176.724138
2,,3,0.202492,0.818336,2.767241,2.767241,1.0,0.818336,1.0,0.820195,0.189655,0.560345,176.724138,176.724138
3,,4,0.320872,0.812522,2.767241,2.767241,1.0,0.812729,1.0,0.817441,0.327586,0.887931,176.724138,176.724138
4,,5,0.401869,0.219631,1.383621,2.488372,0.5,0.449306,0.899225,0.743243,0.112069,1.0,38.362069,148.837209
5,,6,0.501558,0.130643,0.0,1.993789,0.0,0.160155,0.720497,0.62735,0.0,1.0,-100.0,99.378882
6,,7,0.613707,0.089713,0.0,1.629442,0.0,0.097598,0.588832,0.530542,0.0,1.0,-100.0,62.944162
7,,8,0.71028,0.089215,0.0,1.407895,0.0,0.089215,0.508772,0.470537,0.0,1.0,-100.0,40.789474
8,,9,0.800623,0.088842,0.0,1.249027,0.0,0.088842,0.451362,0.427467,0.0,1.0,-100.0,24.902724
9,,10,0.900312,0.088449,0.0,1.110727,0.0,0.088449,0.401384,0.389928,0.0,1.0,-100.0,11.072664




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,auc,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,aucpr,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,err,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,err_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,f0point5,1.0,0.0,1.0,1.0,1.0,1.0,1.0
6,f1,1.0,0.0,1.0,1.0,1.0,1.0,1.0
7,f2,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,lift_top_group,2.7677536,0.033216953,2.7083333,2.7826087,2.7826087,2.7826087,2.7826087
9,logloss,0.15986511,0.007599861,0.16623452,0.15515101,0.15249851,0.15573278,0.16970871



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-06-01 00:40:35,0.085 sec,0.0,0.5,0.693147,0.5,0.361371,1.0,0.638629
1,,2020-06-01 00:40:35,0.092 sec,5.0,0.197546,0.200264,0.999495,0.9991,2.767241,0.006231
2,,2020-06-01 00:40:35,0.097 sec,10.0,0.14445,0.130876,0.999832,0.9997,2.767241,0.003115
3,,2020-06-01 00:40:35,0.101 sec,15.0,0.145038,0.130651,0.999832,0.9997,2.767241,0.003115
4,,2020-06-01 00:40:35,0.106 sec,20.0,0.141055,0.121121,0.99979,0.999623,2.767241,0.003115
5,,2020-06-01 00:40:35,0.110 sec,25.0,0.140649,0.121048,0.99979,0.999623,2.767241,0.003115
6,,2020-06-01 00:40:35,0.114 sec,30.0,0.140889,0.121081,0.99979,0.999623,2.767241,0.003115



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,id,171.105621,1.0,0.427004
1,hemo,156.209717,0.912943,0.389831
2,sg,32.627972,0.190689,0.081425
3,sc,25.820887,0.150906,0.064438
4,rc,7.955438,0.046494,0.019853
5,pcv,6.992236,0.040865,0.01745




In [56]:
# Parameters of the leading model; as the dump further down shows, each entry
# carries a 'default' and an 'actual' value.
h2o_paras_keys = h2o_estimator.leader.params

In [59]:
# Score the leading model on the split that was held out from training.
result=h2o_estimator.leader.model_performance(test_data)

In [60]:
# Print the held-out metrics report.
print (result)


ModelMetricsBinomial: xgboost
** Reported on test data. **

MSE: 0.01380958558914457
RMSE: 0.11751419313914625
LogLoss: 0.10799106573304831
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7336121201515198: 


Unnamed: 0,Unnamed: 1,ckd,notckd,Error,Rate
0,ckd,45.0,0.0,0.0,(0.0/45.0)
1,notckd,0.0,34.0,0.0,(0.0/34.0)
2,Total,45.0,34.0,0.0,(0.0/79.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.733612,1.0,5.0
1,max f2,0.733612,1.0,5.0
2,max f0point5,0.733612,1.0,5.0
3,max accuracy,0.733612,1.0,5.0
4,max precision,0.877924,1.0,0.0
5,max recall,0.733612,1.0,5.0
6,max specificity,0.877924,1.0,0.0
7,max absolute_mcc,0.733612,1.0,5.0
8,max min_per_class_accuracy,0.733612,1.0,5.0
9,max mean_per_class_accuracy,0.733612,1.0,5.0



Gains/Lift Table: Avg response rate: 43.04 %, avg score: 40.71 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.291139,0.877924,2.323529,2.323529,1.0,0.877924,1.0,0.877924,0.676471,0.676471,132.352941,132.352941
1,,2,0.35443,0.837316,2.323529,2.323529,1.0,0.837316,1.0,0.870673,0.147059,0.823529,132.352941,132.352941
2,,3,0.417722,0.793728,2.323529,2.323529,1.0,0.802625,1.0,0.860362,0.147059,0.970588,132.352941,132.352941
3,,4,0.518987,0.075845,0.290441,1.926829,0.125,0.229253,0.829268,0.737219,0.029412,1.0,-70.955882,92.682927
4,,5,0.632911,0.070939,0.0,1.58,0.0,0.070939,0.68,0.617289,0.0,1.0,-100.0,58.0
5,,6,1.0,0.042873,0.0,1.0,0.0,0.04463,0.43038,0.407072,0.0,1.0,-100.0,0.0






In [61]:
# Inspect the leader's parameters; they are the basis of the hand-written
# parameter dict used for the repeated validation below.
h2o_paras_keys

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'XGBoost_2_AutoML_20200601_004034',
   'type': 'Key<Model>',
   'URL': '/3/Models/XGBoost_2_AutoML_20200601_004034'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_py_38_sid_a7b7',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_py_38_sid_a7b7'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'fold_assignment': {'default': 'AUTO', 'actual': 'Modulo'},
 'fold_colu

In [63]:
from h2o.estimators.xgboost import H2OXGBoostEstimator

In [64]:
def h2o_perform_validation(paras, df, x_columns, y_columns, n_validation, test_size=0.2):
    """Estimate the test AUC of an H2O XGBoost model over repeated random splits.

    Parameters
    ----------
    paras : dict of keyword arguments for ``H2OXGBoostEstimator``.
    df : H2OFrame holding both the predictors and the response.
    x_columns : predictor column names passed to ``train`` as ``x``.
    y_columns : response column name passed to ``train`` as ``y``.
    n_validation : number of splits; split ``i`` is seeded with ``i``.
    test_size : fraction of rows held out for scoring. It was previously
        ignored in favour of a hard-coded 0.8 training ratio; the default
        of 0.2 reproduces that ratio.

    Returns
    -------
    list
        One AUC value per split, in split order.
    """
    aucs = []
    for i in range(n_validation):
        # A fresh estimator per split, so no state carries over between runs.
        model = H2OXGBoostEstimator(**paras)
        train_data, test_data = df.split_frame(ratios=[1 - test_size], seed=i)
        model.train(x=x_columns, y=y_columns, training_frame=train_data)
        aucs.append(model.model_performance(test_data)['AUC'])
    return aucs

In [65]:
# list.remove() returns None, so the previous
# `list(df.columns).remove(kidney_target)` left x_columns set to None.
# Build the predictor list explicitly instead.
# NOTE(review): the list still contains the `id` column, which tops the
# leader's variable-importance table above - consider excluding it.
x_columns = [column for column in df.columns if column != kidney_target]
y_columns = kidney_target

# Settings for the H2O XGBoost estimator, written out by hand.
# NOTE(review): several settings appear under two names (min_rows /
# min_child_weight, sample_rate / subsample, col_sample_rate /
# colsample_bylevel, col_sample_rate_per_tree / colsample_bytree); the
# values in each pair agree - confirm whether both spellings are needed.
xgboost_paras = {'nfolds': 5,
                 'keep_cross_validation_models': False,
                 'keep_cross_validation_predictions': True,
                 'fold_assignment': 'Modulo',
                 'stopping_metric': 'AUC',
                 'stopping_tolerance': 0.05,
                 'seed': 2,
                 'distribution':'bernoulli',
                 'ntrees': 30,
                 'max_depth': 20,
                 'min_rows': 10.0,
                 'min_child_weight': 10.0,
                 'sample_rate': 0.6,
                 'subsample': 0.6,
                 'col_sample_rate': 0.8,
                 'colsample_bylevel': 0.8,
                 'col_sample_rate_per_tree': 0.8,
                 'colsample_bytree': 0.8,
                 'score_tree_interval': 5}

# 100 random splits, matching the scikit-learn comparisons above.
h2o_aucs = h2o_perform_validation(xgboost_paras, df, x_columns, y_columns, 100)

xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress

In [66]:
# Mean test AUC over the repeated splits, followed by its standard error.
print (mean(h2o_aucs))
print (sem(h2o_aucs))

0.9994816049807931
0.00016848457845687818


In [68]:
import matplotlib.pyplot as plt

def sort_models(average_roc_aucs, roc_auc_sem, prepro_index=None, filename=None, sort_plot = True):
    """Rank models by mean ROC AUC and optionally plot them with error bars.

    Parameters
    ----------
    average_roc_aucs : dict mapping model name to its mean ROC AUC.
    roc_auc_sem : dict mapping model name to the error-bar size; it must
        contain every key of ``average_roc_aucs``.
    prepro_index : value embedded in the saved figure's file name.
    filename : directory in which the figure is saved; nothing is saved
        when it is None.
    sort_plot : draw (and show) the bar chart when True.

    Returns
    -------
    tuple
        ``(ranking, names)``: ``ranking`` is a list of ``(name, mean_auc)``
        pairs in descending AUC order, ``names`` the model names in that order.
    """
    a = sorted(average_roc_aucs.items(), key=lambda item: item[1], reverse=True)
    sorted_model_name = [name for name, _ in a]
    sorted_roc = [score for _, score in a]
    print(sorted_model_name)
    # Looked up even when no plot is drawn, so a missing key fails early.
    sorted_stdev = [roc_auc_sem[name] for name in sorted_model_name]
    if sort_plot:
        x_pos = np.arange(len(sorted_model_name))
        fig, ax = plt.subplots()
        ax.bar(x_pos, sorted_roc, yerr=sorted_stdev, align='center', alpha=0.5, ecolor='black', capsize=10)
        ax.set_ylabel('mean ROC AUC value')
        print(x_pos)
        ax.set_xticks(x_pos)
        ax.set_xticklabels(sorted_model_name)
        ax.set_title('Model rank on test data')
        ax.yaxis.grid(True)
        plt.tight_layout()
        # Write each bar's value just above its top.
        for i, v in enumerate(sorted_roc):
            ax.text(x_pos[i]-0.25, v+0.01, str("%.2f"%v))
        # Identity test instead of `!= None`: the idiomatic None check.
        if filename is not None:
            save_name = filename + '/prepro_index_'+ str(prepro_index) + '_model_leaderboard_with_error_bars.png'
            plt.savefig(save_name, dpi=300)
        plt.show()
        # Close this specific figure rather than whichever one is current.
        plt.close(fig)
    return a, sorted_model_name


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution
