In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statistics import mean 
from scipy.stats import sem
import xgboost as xgb
import os
cwd = os.getcwd()
import sys
sys.path.append(cwd+"/src_autoML/")
print (cwd)

/Users/liug28/Downloads/AutoML/AutoML_package


# breast cancer dataset

In [2]:
breast_result_path=cwd+"/src_autoML/examples/breast_result"
breast_data_dir=cwd+"/src_autoML/examples/data/breast_cancer.csv"
breast_target = 'irradiat'


In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin

breast_data=pd.read_csv(breast_data_dir)
y_breast = breast_data[breast_target]
x_breast = breast_data.drop([breast_target], axis=1)

class DataFrameImputer(TransformerMixin):
    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.
        Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

x_breast = pd.DataFrame(x_breast)
x_breast = DataFrameImputer().fit_transform(x_breast)
x_breast = pd.get_dummies(x_breast)
# y_breast = y_breast.replace({'yes':1, 'no':0}, inplace=True)

In [4]:
def model_perform_validation(clf, prepro, x, y, n_validation, test_size=0.2):
    aucs=[]
    for i in range (n_validation):
        X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=np.random.RandomState(i), test_size=test_size, stratify=y)
#         X_train = prepro.fit_transform(X_train)
        model = clf.fit(X_train, y_train)
#         X_test = prepro.fit_transform(X_test)
        y_pred = model.predict_proba(X_test)[:,1]
        auc = roc_auc_score(y_test, y_pred)
        aucs.append(auc)
    return aucs

## 1 default setting logistic regression

In [5]:
from sklearn.linear_model import LogisticRegression

logistic_regression_model=LogisticRegression(solver='liblinear', random_state=0)
logistic_aucs=model_perform_validation(logistic_regression_model, StandardScaler(copy=True, with_mean=False, with_std=False), x_breast, y_breast, 100)
print (mean(logistic_aucs))
print (sem(logistic_aucs))

0.7278652597402597
0.007080401755275602


## 2 default setting xgboost

In [6]:
default_clf = xgb.XGBClassifier()


In [7]:
default_aucs=model_perform_validation(default_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_breast, y_breast, 100)


In [8]:
print (mean(default_aucs))
print (sem(default_aucs))


0.6692938311688311
0.007211133535383651


## 2 allow missing autoML

In [9]:
allowMissing_prepro = '1'
allowMissing_dir = breast_result_path+'/resultAllowMissing/'+allowMissing_prepro
am_x_train_dir = allowMissing_dir+'/X_train_important.csv'
am_x_test_dir = allowMissing_dir+'/X_test_important.csv'
am_y_train_dir = allowMissing_dir+'/y_train.csv'
am_y_test_dir = allowMissing_dir+'/y_test.csv'

am_x_train = pd.read_csv(am_x_train_dir, index_col = 0)
am_x_test = pd.read_csv(am_x_test_dir, index_col = 0)
am_y_train = pd.read_csv(am_y_train_dir, index_col = 0)
am_y_test = pd.read_csv(am_y_test_dir, index_col = 0)

x_breast_am = pd.concat([am_x_train, am_x_test])
y_breast_am = pd.concat([am_y_train, am_y_test])


In [11]:
breast_am_clf = xgb.XGBClassifier(base_score=0.5, booster=None,
              colsample_bylevel=0.6794110311225349, colsample_bynode=None,
              colsample_bytree=0.6578434062409568, gamma=0.08909604138154013,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.005124939094078793, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=np.nan, monotone_constraints=None,
              n_estimators=800, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None,
              reg_alpha=0.00029681281926420257, reg_lambda=1.419324135565436,
              scale_pos_weight=1, seed=0, subsample=0.6825666314422394,
              tree_method=None, validate_parameters=False, verbosity=None)

In [12]:
breast_am_aucs=model_perform_validation(breast_am_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_breast_am, y_breast_am['no'], 100)

In [13]:
print (mean(breast_am_aucs))
print (sem(breast_am_aucs))

0.7435551948051948
0.006076099912095068


## 4 not allow missing classifier choose from all classifiers

In [14]:
from sklearn.ensemble import RandomForestClassifier

breast_rf_clf = RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=4, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=101, n_jobs=1,
                       oob_score=False, random_state=0, verbose=False,
                       warm_start=False)


In [15]:
notAllowMissing_prepro = '0'
notAllowMissing_dir = breast_result_path+'/resultNoMissingAllow/'+notAllowMissing_prepro
nam_x_train_dir = notAllowMissing_dir+'/X_train_important.csv'
nam_x_test_dir = notAllowMissing_dir+'/X_test_important.csv'
nam_y_train_dir = notAllowMissing_dir+'/y_train.csv'
nam_y_test_dir = notAllowMissing_dir+'/y_test.csv'

nam_x_train = pd.read_csv(nam_x_train_dir, index_col = 0)
nam_x_test = pd.read_csv(nam_x_test_dir, index_col = 0)
nam_y_train = pd.read_csv(nam_y_train_dir, index_col = 0)
nam_y_test = pd.read_csv(nam_y_test_dir, index_col = 0)

x_breast_nam = pd.concat([nam_x_train, nam_x_test])
y_breast_nam = pd.concat([nam_y_train, nam_y_test])


In [16]:
breast_nam_aucs=model_perform_validation(breast_rf_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_breast_nam, y_breast_nam['no'], 100)

In [17]:
print (mean(breast_nam_aucs))
print (sem(breast_nam_aucs))

0.7597646103896104
0.0064523144046197255


## 5 H2O classifier

In [42]:
from h2o.automl import H2OAutoML
import h2o

In [43]:
breast_h2o = breast_data.copy()
# cervical_h2o[cervical_target].replace({1:'Yes', 0:'No'}, inplace=True)


In [44]:
breast_h2o


Unnamed: 0,Target,age_catg,menopause,tumor-size,inv-nodes,node-caps,degree malignant,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,5-Mar,no,3,left,left_low,no


In [45]:
h2o.init(nthreads = -1, max_mem_size = 8)
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmph_fyduh7
  JVM stdout: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmph_fyduh7/h2o_liug28_started_from_python.out
  JVM stderr: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmph_fyduh7/h2o_liug28_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.111 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Connecting to H2O server at http://localhost:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.111 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


<H2OConnection to http://localhost:54321, no session>

In [46]:
df = h2o.H2OFrame(breast_h2o)


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [47]:
df.describe()


Rows:286
Cols:10




Unnamed: 0,Target,age_catg,menopause,tumor-size,inv-nodes,node-caps,degree malignant,breast,breast-quad,irradiat
type,enum,enum,enum,enum,enum,enum,int,enum,enum,enum
mins,,,,,,,1.0,,,
mean,,,,,,,2.0489510489510487,,,
maxs,,,,,,,3.0,,,
sigma,,,,,,,0.7382166403717155,,,
zeros,,,,,,,0,,,
missing,0,0,0,0,0,0,0,0,0,0
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3.0,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2.0,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2.0,left,left_low,no


In [48]:
h2o_estimator=H2OAutoML(max_models = 10, seed = 1, exclude_algos=["DeepLearning", "StackedEnsemble", "GLM"],sort_metric = "AUC",stopping_metric="AUC")

In [50]:
train_data, test_data=df.split_frame(ratios=[0.8],seed=0)
h2o_estimator.train(x=list(df.columns).remove(breast_target), y=breast_target, training_frame=train_data)


AutoML progress: |██████████████
23:16:04.999: GBM_5_AutoML_20200531_231555 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200531_231555.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 186.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 186.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 186.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 187.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 187.0.


█████████████████████████████

In [51]:
h2o_estimator.leaderboard


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DRF_1_AutoML_20200531_231555,0.791475,0.457009,0.449524,0.258415,0.380595,0.144853
XGBoost_3_AutoML_20200531_231555,0.787814,0.466479,0.39852,0.240219,0.395048,0.156063
GBM_4_AutoML_20200531_231555,0.784481,0.442623,0.493278,0.236557,0.378807,0.143495
GBM_2_AutoML_20200531_231555,0.781967,0.4484,0.424223,0.241148,0.383673,0.147205
XGBoost_1_AutoML_20200531_231555,0.779126,0.442837,0.429268,0.207432,0.377948,0.142845
GBM_3_AutoML_20200531_231555,0.778579,0.448361,0.447941,0.244754,0.380478,0.144764
XRT_1_AutoML_20200531_231555,0.775792,0.452793,0.425233,0.239235,0.380637,0.144884
GBM_1_AutoML_20200531_231555,0.712459,0.572034,0.423304,0.304863,0.410265,0.168318
XGBoost_2_AutoML_20200531_231555,0.453607,0.528897,0.187592,0.5,0.41414,0.171512
XGBoost_grid__1_AutoML_20200531_231555_model_1,0.43082,0.523921,0.187061,0.5,0.412127,0.169848




In [52]:
h2o_estimator.leader


Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_1_AutoML_20200531_231555


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,35.0,35.0,18200.0,7.0,12.0,10.028571,28.0,50.0,36.685715




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.14824544298312192
RMSE: 0.3850265484133814
LogLoss: 0.7361072309005123
Mean Per-Class Error: 0.3002732240437158
AUC: 0.7578688524590164
AUCPR: 0.5355877450200047
Gini: 0.5157377049180327

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.17857141494750972: 


Unnamed: 0,Unnamed: 1,no,yes,Error,Rate
0,no,128.0,55.0,0.3005,(55.0/183.0)
1,yes,15.0,35.0,0.3,(15.0/50.0)
2,Total,143.0,90.0,0.3004,(70.0/233.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.178571,0.5,74.0
1,max f2,0.018518,0.656425,127.0
2,max f0point5,0.673611,0.597015,19.0
3,max accuracy,0.722222,0.832618,17.0
4,max precision,1.0,1.0,0.0
5,max recall,0.0,1.0,156.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.673611,0.419587,19.0
8,max min_per_class_accuracy,0.178571,0.699454,74.0
9,max mean_per_class_accuracy,0.178571,0.699727,74.0



Gains/Lift Table: Avg response rate: 21.46 %, avg score: 21.93 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.012876,0.940513,4.66,4.66,1.0,0.971795,1.0,0.971795,0.06,0.06,366.0,366.0
1,,2,0.021459,0.863919,2.33,3.728,0.5,0.894231,0.8,0.940769,0.02,0.08,133.0,272.8
2,,3,0.030043,0.834444,4.66,3.994286,1.0,0.862103,0.857143,0.918293,0.04,0.12,366.0,299.428571
3,,4,0.042918,0.816591,3.106667,3.728,0.666667,0.825015,0.8,0.89031,0.04,0.16,210.666667,272.8
4,,5,0.051502,0.784848,4.66,3.883333,1.0,0.803977,0.833333,0.875921,0.04,0.2,366.0,288.333333
5,,6,0.103004,0.642017,2.33,3.106667,0.5,0.711552,0.666667,0.793736,0.12,0.32,133.0,210.666667
6,,7,0.150215,0.547078,1.270909,2.529714,0.272727,0.596273,0.542857,0.731676,0.06,0.38,27.090909,152.971429
7,,8,0.201717,0.42381,1.553333,2.280426,0.333333,0.49033,0.489362,0.670056,0.08,0.46,55.333333,128.042553
8,,9,0.300429,0.275817,1.215652,1.930571,0.26087,0.357152,0.414286,0.567245,0.12,0.58,21.565217,93.057143
9,,10,0.407725,0.166667,1.3048,1.765895,0.28,0.208268,0.378947,0.472777,0.14,0.72,30.48,76.589474




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.14485284255435574
RMSE: 0.3805953790501873
LogLoss: 0.45700918455513145
Mean Per-Class Error: 0.258415300546448
AUC: 0.7914754098360656
AUCPR: 0.4495243476642712
Gini: 0.5829508196721311

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.22380952324186054: 


Unnamed: 0,Unnamed: 1,no,yes,Error,Rate
0,no,136.0,47.0,0.2568,(47.0/183.0)
1,yes,13.0,37.0,0.26,(13.0/50.0)
2,Total,149.0,84.0,0.2575,(60.0/233.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.22381,0.552239,69.0
1,max f2,0.071429,0.680473,112.0
2,max f0point5,0.6,0.551948,21.0
3,max accuracy,0.6,0.819742,21.0
4,max precision,0.6,0.653846,21.0
5,max recall,0.004026,1.0,156.0
6,max specificity,0.971429,0.994536,0.0
7,max absolute_mcc,0.22381,0.413121,69.0
8,max min_per_class_accuracy,0.22381,0.74,69.0
9,max mean_per_class_accuracy,0.22381,0.741585,69.0



Gains/Lift Table: Avg response rate: 21.46 %, avg score: 22.42 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.012876,0.866857,0.0,0.0,0.0,0.909524,0.0,0.909524,0.0,0.0,-100.0,-100.0
1,,2,0.021459,0.84861,2.33,0.932,0.5,0.857143,0.2,0.888571,0.02,0.02,133.0,-6.8
2,,3,0.030043,0.828857,2.33,1.331429,0.5,0.839762,0.285714,0.874626,0.02,0.04,133.0,33.142857
3,,4,0.042918,0.770286,4.66,2.33,1.0,0.811111,0.5,0.855571,0.06,0.1,366.0,133.0
4,,5,0.051502,0.738857,2.33,2.33,0.5,0.741429,0.5,0.836548,0.02,0.12,133.0,133.0
5,,6,0.103004,0.641905,3.495,2.9125,0.75,0.698175,0.625,0.767361,0.18,0.3,249.5,191.25
6,,7,0.154506,0.542857,1.553333,2.459444,0.333333,0.57226,0.527778,0.702327,0.08,0.38,55.333333,145.944444
7,,8,0.201717,0.446857,2.118182,2.379574,0.454545,0.484935,0.510638,0.651448,0.1,0.48,111.818182,137.957447
8,,9,0.300429,0.305229,1.418261,2.063714,0.304348,0.388288,0.442857,0.564981,0.14,0.62,41.826087,106.371429
9,,10,0.399142,0.161757,1.418261,1.904086,0.304348,0.23195,0.408602,0.482619,0.14,0.76,41.826087,90.408602




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.76336724,0.081405185,0.78723407,0.85106385,0.78723407,0.76086956,0.6304348
1,auc,0.8007539,0.08729411,0.8648649,0.88782054,0.6731602,0.82236844,0.75555557
2,aucpr,0.4706439,0.12168994,0.6627031,0.50338715,0.44834504,0.38109332,0.35769096
3,err,0.23663275,0.081405185,0.21276596,0.14893617,0.21276596,0.23913044,0.36956522
4,err_count,11.0,3.6742346,10.0,7.0,10.0,11.0,17.0
5,f0point5,0.53250885,0.08621566,0.5405405,0.5769231,0.6451613,0.47619048,0.42372882
6,f1,0.59909624,0.035555646,0.61538464,0.6315789,0.61538464,0.5925926,0.5405405
7,f2,0.70615554,0.07375801,0.71428573,0.6976744,0.5882353,0.78431374,0.74626863
8,lift_top_group,0.94,2.101904,4.7,0.0,0.0,0.0,0.0
9,logloss,0.45675585,0.15193866,0.36202896,0.3572869,0.7099765,0.36555263,0.48893437



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-05-31 23:15:59,0.282 sec,0.0,,,,,,
1,,2020-05-31 23:15:59,0.291 sec,5.0,0.454124,5.487992,0.662361,0.423713,2.761481,0.280193
2,,2020-05-31 23:15:59,0.300 sec,10.0,0.411899,3.146814,0.716384,0.442519,2.509231,0.295154
3,,2020-05-31 23:15:59,0.313 sec,15.0,0.393085,1.954303,0.721749,0.498994,3.883333,0.253219
4,,2020-05-31 23:15:59,0.322 sec,20.0,0.390544,1.14373,0.743115,0.520816,4.66,0.309013
5,,2020-05-31 23:15:59,0.331 sec,25.0,0.392,1.152926,0.734809,0.501609,4.66,0.27897
6,,2020-05-31 23:15:59,0.339 sec,30.0,0.385071,0.868532,0.759344,0.541889,4.66,0.248927
7,,2020-05-31 23:15:59,0.347 sec,35.0,0.385027,0.736107,0.757869,0.535588,4.66,0.300429



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,inv-nodes,225.782822,1.0,0.2671
1,tumor-size,164.51416,0.728639,0.19462
2,node-caps,120.478989,0.533606,0.142526
3,breast-quad,81.030487,0.358887,0.095859
4,age_catg,80.077629,0.354667,0.094731
5,degree malignant,63.271732,0.280233,0.07485
6,Target,45.39547,0.201058,0.053703
7,breast,35.276455,0.156241,0.041732
8,menopause,29.484015,0.130586,0.034879




In [53]:
h2o_paras_keys = h2o_estimator.leader.params

In [56]:
result=h2o_estimator.leader.model_performance(test_data)


In [57]:
print (result)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.2555959772738171
RMSE: 0.5055650079602197
LogLoss: 1.4602145132810103
Mean Per-Class Error: 0.35317460317460325
AUC: 0.6055555555555555
AUCPR: 0.429987852144441
Gini: 0.21111111111111103

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.09922302109854564: 


Unnamed: 0,Unnamed: 1,no,yes,Error,Rate
0,no,20.0,15.0,0.4286,(15.0/35.0)
1,yes,5.0,13.0,0.2778,(5.0/18.0)
2,Total,25.0,28.0,0.3774,(20.0/53.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.099223,0.565217,26.0
1,max f2,0.0,0.72,45.0
2,max f0point5,0.099223,0.5,26.0
3,max accuracy,0.6,0.698113,5.0
4,max precision,0.6,0.666667,5.0
5,max recall,0.0,1.0,45.0
6,max specificity,0.742857,0.971429,0.0
7,max absolute_mcc,0.099223,0.278582,26.0
8,max min_per_class_accuracy,0.114286,0.611111,22.0
9,max mean_per_class_accuracy,0.099223,0.646825,26.0



Gains/Lift Table: Avg response rate: 33.96 %, avg score: 21.03 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.018868,0.728743,0.0,0.0,0.0,0.742857,0.0,0.742857,0.0,0.0,-100.0,-100.0
1,,2,0.037736,0.715657,2.944444,1.472222,1.0,0.715714,0.5,0.729286,0.055556,0.055556,194.444444,47.222222
2,,3,0.037736,0.714914,0.0,1.472222,0.0,0.0,0.5,0.729286,0.0,0.055556,-100.0,47.222222
3,,4,0.056604,0.712245,0.0,0.981481,0.0,0.714286,0.333333,0.724286,0.0,0.055556,-100.0,-1.851852
4,,5,0.056604,0.69898,0.0,0.981481,0.0,0.0,0.333333,0.724286,0.0,0.055556,-100.0,-1.851852
5,,6,0.113208,0.592571,2.944444,1.962963,1.0,0.643311,0.666667,0.683798,0.166667,0.222222,194.444444,96.296296
6,,7,0.150943,0.474612,1.472222,1.840278,0.5,0.540952,0.625,0.648087,0.055556,0.277778,47.222222,84.027778
7,,8,0.207547,0.43551,0.0,1.338384,0.0,0.447517,0.454545,0.593386,0.0,0.277778,-100.0,33.838384
8,,9,0.301887,0.290857,1.177778,1.288194,0.4,0.398707,0.4375,0.532549,0.111111,0.388889,17.777778,28.819444
9,,10,0.396226,0.159953,1.177778,1.261905,0.4,0.190745,0.428571,0.451167,0.111111,0.5,17.777778,26.190476






In [58]:
h2o_paras_keys


{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'DRF_1_AutoML_20200531_231555',
   'type': 'Key<Model>',
   'URL': '/3/Models/DRF_1_AutoML_20200531_231555'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_py_5_sid_b7d4',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_py_5_sid_b7d4'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'score_tree_interval': {'default': 0, 'actual': 5},
 'fold_assignment': {'default

In [80]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

def h2o_perform_validation(paras, df, x_columns, y_columns, n_validation, test_size=0.2):
    aucs=[]
    for i in range (n_validation):
        model=H2ORandomForestEstimator(**paras)
        train_data, test_data=df.split_frame(ratios=[0.8],seed=i)
        model.train(x=x_columns, y=y_columns, training_frame=train_data)
        temp_auc = model.model_performance(test_data)['AUC']
        aucs.append(temp_auc)
    return aucs


In [81]:
x_columns = list(df.columns).remove(breast_target)
y_columns = breast_target

rf_paras = {'nfolds': 5, 
            'keep_cross_validation_models': False, 
            'keep_cross_validation_predictions': True, 
            'keep_cross_validation_fold_assignment': False, 
            'score_each_iteration': False, 
            'score_tree_interval': 5, 
            'fold_assignment': 'Modulo', 
            'fold_column': None, 
            'ntrees': 35, 
            'stopping_metric': 'AUC', 
            'stopping_tolerance': 0.05, 
            'seed': 4, 
            'distribution': 'multinomial', 
            }
h2o_aucs = h2o_perform_validation(rf_paras, df, x_columns, y_columns, 100)


drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |█

In [82]:
print (mean(h2o_aucs))
print (sem(h2o_aucs))

0.7158428302173341
0.0060227067979433755
