In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statistics import mean 
from scipy.stats import sem
import xgboost as xgb
import os
# Directory the notebook kernel was started in; every data/result path below is built from it.
cwd = os.getcwd()
import sys
# Add the src_autoML/ folder to the module search path.
sys.path.append(cwd+"/src_autoML/")
print (cwd)

/Users/liug28/Downloads/AutoML/AutoML_package


# heart dataset

In [4]:
# Root folder of the saved AutoML results for the heart dataset (read in sections 3 and 4).
heart_result_path=cwd+"/src_autoML/examples/heart_result"
# NOTE: despite the `_dir` suffix this is the path of the CSV file itself, not a directory.
heart_data_dir=cwd+"/src_autoML/examples/data/heart.csv"
# Name of the label column.
target = 'target'

In [5]:
# Read the raw table, then separate the predictors from the label column.
heart_data = pd.read_csv(heart_data_dir)
x_heart = heart_data.drop(columns=[target])
y_heart = heart_data[target]

In [6]:
# Columns treated as categorical; each one is expanded into indicator columns.
categorical_features = ['cp','restecg','slope','thal']
# Encode from the raw frame rather than from x_heart itself so the cell is
# idempotent: re-running it no longer fails because the categorical columns
# were already consumed by a previous run.
x_heart = pd.get_dummies(heart_data.drop(columns=[target]), columns=categorical_features)

In [5]:
# Preview the first rows of the encoded feature table.
x_heart.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_0,...,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,1,150,0,2.3,0,0,...,1,0,0,1,0,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,...,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,0,0,...,1,0,0,0,0,1,0,0,1,0
3,56,1,120,236,0,178,0,0.8,0,0,...,0,1,0,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,0,1,...,0,1,0,0,0,1,0,0,1,0


In [6]:
def model_perform_validation(clf, prepro, x, y, n_validation, test_size=0.2):
    """Estimate held-out ROC AUC over repeated stratified train/test splits.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``
        Re-fitted from scratch on the training part of every split.
    prepro : transformer exposing ``fit_transform`` and ``transform``
        Fitted on the training part only and then applied to the test part.
    x : feature table
    y : binary labels; also used to stratify the splits
    n_validation : int
        Number of splits. Split ``i`` is seeded with ``i``, so results are
        reproducible and comparable across models.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.

    Returns
    -------
    list of float
        One ROC AUC value per split, in split order.
    """
    aucs = []
    for i in range(n_validation):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, random_state=np.random.RandomState(i), test_size=test_size, stratify=y)
        X_train = prepro.fit_transform(X_train)
        model = clf.fit(X_train, y_train)
        # Only transform the held-out rows: re-fitting the preprocessor on them
        # would leak test-set statistics and scale them differently from the
        # data the model was trained on.
        X_test = prepro.transform(X_test)
        # Column 1 holds the predicted probability of the positive class.
        y_pred = model.predict_proba(X_test)[:, 1]
        aucs.append(roc_auc_score(y_test, y_pred))
    return aucs

## 1 default logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression


In [8]:
# Baseline 1: logistic regression, evaluated on 100 seeded splits.
# NOTE(review): with both with_mean and with_std disabled the scaler looks like a
# pass-through, i.e. the features are not standardised — confirm this is intended.
logistic_regression_model=LogisticRegression(solver='liblinear', random_state=0)
logistic_aucs=model_perform_validation(logistic_regression_model, StandardScaler(copy=True, with_mean=False, with_std=False), x_heart, y_heart, 100)

In [9]:
# Mean ROC AUC over the 100 splits and its standard error.
print (mean(logistic_aucs))
print (sem(logistic_aucs))

0.9027813852813853
0.0032670613797825417


## 2 default xgboost

In [10]:
# Baseline 2: XGBoost classifier constructed without any explicit hyperparameters.
default_clf = xgb.XGBClassifier()

In [11]:
# Same protocol as the logistic baseline: 100 seeded splits, same scaler settings.
default_aucs=model_perform_validation(default_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_heart, y_heart, 100)

In [12]:
# Mean ROC AUC over the 100 splits and its standard error.
print (mean(default_aucs))
print (sem(default_aucs))

0.8759415584415584
0.0031310612986943597


## 3 AutoML result: missing values allowed

In [13]:
# Select the saved run to evaluate: sub-folder '0' of resultAllowMissing/.
allowMissing_prepro = '0'
allowMissing_dir = heart_result_path+'/resultAllowMissing/'+allowMissing_prepro
am_x_train_dir = allowMissing_dir+'/X_train_important.csv'
am_x_test_dir = allowMissing_dir+'/X_test_important.csv'
am_y_train_dir = allowMissing_dir+'/y_train.csv'
am_y_test_dir = allowMissing_dir+'/y_test.csv'

# The first column of each CSV is read as the row index.
am_x_train = pd.read_csv(am_x_train_dir, index_col = 0)
am_x_test = pd.read_csv(am_x_test_dir, index_col = 0)
am_y_train = pd.read_csv(am_y_train_dir, index_col = 0)
am_y_test = pd.read_csv(am_y_test_dir, index_col = 0)

# Stack the saved train and test parts back into single tables so that
# model_perform_validation can draw its own splits from all rows.
x_heart_am = pd.concat([am_x_train, am_x_test])
y_heart_am = pd.concat([am_y_train, am_y_test])

# Hyperparameters are hard-coded here.
# NOTE(review): presumably copied from the model selected by the AutoML run — confirm the source.
heart_am_clf = xgb.XGBClassifier(base_score=0.5, booster=None,
              colsample_bylevel=0.7799435641986237, colsample_bynode=None,
              colsample_bytree=0.6160895891255447, gamma=0.07696682162529252,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.018691803145673834, max_delta_step=0, max_depth=8,
              min_child_weight=10, missing=np.nan, monotone_constraints=None,
              n_estimators=1600, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None,
              reg_alpha=0.023235532273767453, reg_lambda=2.8724456841580874,
              scale_pos_weight=1, seed=0, subsample=0.7249905765120273,
              tree_method=None, validate_parameters=False, verbosity=None)


In [14]:
# y_heart_am is a DataFrame, so the label column is selected explicitly before validation.
heart_am_aucs=model_perform_validation(heart_am_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_heart_am, y_heart_am[target], 100)

In [15]:
# Mean ROC AUC over the 100 splits and its standard error.
print (mean(heart_am_aucs))
print (sem(heart_am_aucs))

0.9080627705627706
0.0033876805818448654


## 4 AutoML result: missing values not allowed


In [16]:
# Select the saved run to evaluate: sub-folder '0' of resultNoMissingAllow/.
notAllowMissing_prepro = '0'
notAllowMissing_dir = heart_result_path+'/resultNoMissingAllow/'+notAllowMissing_prepro
nam_x_train_dir = notAllowMissing_dir+'/X_train_important.csv'
nam_x_test_dir = notAllowMissing_dir+'/X_test_important.csv'
nam_y_train_dir = notAllowMissing_dir+'/y_train.csv'
nam_y_test_dir = notAllowMissing_dir+'/y_test.csv'

# The first column of each CSV is read as the row index.
nam_x_train = pd.read_csv(nam_x_train_dir, index_col = 0)
nam_x_test = pd.read_csv(nam_x_test_dir, index_col = 0)
nam_y_train = pd.read_csv(nam_y_train_dir, index_col = 0)
nam_y_test = pd.read_csv(nam_y_test_dir, index_col = 0)

# Stack the saved train and test parts back into single tables so that
# model_perform_validation can draw its own splits from all rows.
x_heart_nam = pd.concat([nam_x_train, nam_x_test])
y_heart_nam = pd.concat([nam_y_train, nam_y_test])



In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyperparameters and a fixed seed (random_state=0).
# The `min_impurity_split=None` argument was dropped: None was already the
# default, the parameter is deprecated, and scikit-learn >= 1.0 rejects it with
# a TypeError, so omitting it keeps this cell runnable on old and new versions.
heart_nam_clf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None,
                       max_features=0.1215422474347232, max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_samples_leaf=6,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=493, n_jobs=1, oob_score=False,
                       random_state=0, verbose=False, warm_start=False)

In [18]:
# y_heart_nam is a DataFrame, so the label column is selected explicitly before validation.
heart_nam_aucs=model_perform_validation(heart_nam_clf, StandardScaler(copy=True, with_mean=False, with_std=False), x_heart_nam, y_heart_nam[target], 100)

In [19]:
# Mean ROC AUC over the 100 splits and its standard error.
print (mean(heart_nam_aucs))
print (sem(heart_nam_aucs))

0.9157034632034632
0.0030555323237170537


## 5 H2O classifier

In [1]:
from h2o.automl import H2OAutoML
import h2o

In [7]:
# Build the table handed to H2O: one-hot encode the categorical columns and
# turn the 0/1 label into 'No'/'Yes' strings.
heart_h2o = heart_data.copy()
heart_h2o = pd.get_dummies(heart_h2o,columns=categorical_features)
# Assign the result back instead of calling replace(inplace=True) on the column
# selection: the in-place form acts on an intermediate object and is not
# guaranteed to update heart_h2o (it is a no-op under pandas copy-on-write).
heart_h2o["target"] = heart_h2o["target"].replace({1:'Yes', 0:'No'})

In [8]:
# Display the prepared frame (the rendered output elides the middle rows and columns).
heart_h2o

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,...,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,1,150,0,2.3,0,Yes,...,1,0,0,1,0,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,Yes,...,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,172,0,1.4,0,Yes,...,1,0,0,0,0,1,0,0,1,0
3,56,1,120,236,0,178,0,0.8,0,Yes,...,0,1,0,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,0,Yes,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,0,No,...,0,1,0,0,1,0,0,0,0,1
299,45,1,110,264,0,132,0,1.2,0,No,...,0,1,0,0,1,0,0,0,0,1
300,68,1,144,193,1,141,0,3.4,2,No,...,0,1,0,0,1,0,0,0,0,1
301,57,1,130,131,0,115,1,1.2,1,No,...,0,1,0,0,1,0,0,0,0,1


In [9]:
# Start (or attach to) a local H2O server; max_mem_size is given as a bare integer.
h2o.init(nthreads = -1, max_mem_size = 8)
# NOTE(review): the log printed by h2o.init() already reports a successful
# connection, so this second connect() looks redundant — confirm before removing.
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmpkdycp7i_
  JVM stdout: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmpkdycp7i_/h2o_liug28_started_from_python.out
  JVM stderr: /var/folders/h5/4_9f6vkj6jl5zmx1ck62g9qr0000gp/T/tmpkdycp7i_/h2o_liug28_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 1 day
H2O_cluster_name:,H2O_from_python_liug28_o3kbo4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.111 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Connecting to H2O server at http://localhost:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 1 day
H2O_cluster_name:,H2O_from_python_liug28_o3kbo4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.111 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


<H2OConnection to http://localhost:54321, no session>

In [10]:
# df = h2o.import_file(heart_data_dir)
# Upload the prepared pandas frame (with the 'Yes'/'No' label) to the H2O cluster.
df = h2o.H2OFrame(heart_h2o)
# X_train, X_test, y_train, y_test = train_test_split(x_heart, y_heart, random_state=np.random.RandomState(0), test_size=0.2, stratify=y_heart)
# train_data = pd.concat([X_train, y_train], axis=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [11]:
# Summarise the uploaded frame: column types, ranges, zero/missing counts and a few rows.
df.describe()

Rows:303
Cols:24




Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_0,cp_1,cp_2,cp_3,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
type,int,int,int,int,int,int,int,real,int,enum,int,int,int,int,int,int,int,int,int,int,int,int,int,int
mins,29.0,0.0,94.0,126.0,0.0,71.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,54.36633663366333,0.6831683168316832,131.62376237623772,246.26402640264035,0.1485148514851485,149.6468646864687,0.32673267326732675,1.0396039603960392,0.7293729372937293,,0.47194719471947194,0.16501650165016502,0.2871287128712871,0.07590759075907591,0.48514851485148514,0.5016501650165016,0.013201320132013201,0.06930693069306931,0.46204620462046203,0.46864686468646866,0.006600660066006601,0.0594059405940594,0.5478547854785478,0.38613861386138615
maxs,77.0,1.0,200.0,564.0,1.0,202.0,1.0,6.2,4.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sigma,9.08210098983786,0.46601082333962385,17.538142813517098,51.83075098793005,0.35619787492797644,22.905161114914087,0.4697944645223165,1.1610750220686346,1.0226063649693276,,0.5000382473951351,0.37180949564433824,0.4531705889793461,0.2652883312869679,0.5006061501873533,0.5008244028681619,0.11432489313489193,0.2543954947188794,0.49938217243855904,0.49984151532804405,0.0811098241628091,0.23677393771377483,0.49852798419782596,0.4876684024916939
zeros,0,96,0,0,258,0,204,99,175,,160,253,216,280,156,151,299,282,163,161,301,285,137,186
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,Yes,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,Yes,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,Yes,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [12]:
# Configure the AutoML run: at most 10 models, deep learning and stacked
# ensembles excluded, AUC used both for early stopping and for ranking.
h2o_estimator=H2OAutoML(max_models = 10, seed = 1, exclude_algos=["DeepLearning", "StackedEnsemble"],sort_metric = "AUC",stopping_metric="AUC")

In [13]:
# Hold out roughly 20% of the rows for the final check of the leader model.
train_data, test_data = df.split_frame(ratios=[0.8], seed=0)
# list.remove() mutates its list and returns None, so the original expression
# passed x=None and only worked because H2O then falls back to "every column
# except y". Build the predictor list explicitly instead.
predictor_columns = [col for col in df.columns if col != target]
h2o_estimator.train(x=predictor_columns, y=target, training_frame=train_data)

AutoML progress: |███████████████████
11:27:40.196: GBM_5_AutoML_20200615_112727 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200615_112727.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 197.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 197.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 198.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 198.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 198.0.


████████████████████████

In [14]:
# Ranked table of the models trained by the AutoML run.
h2o_estimator.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GLM_1_AutoML_20200615_112727,0.903344,0.391371,0.922191,0.180522,0.352758,0.124438
GBM_2_AutoML_20200615_112727,0.898614,0.398091,0.919549,0.194411,0.357792,0.128015
GBM_4_AutoML_20200615_112727,0.896483,0.404759,0.917616,0.196976,0.363218,0.131928
GBM_3_AutoML_20200615_112727,0.896017,0.401452,0.915878,0.190281,0.359919,0.129542
XGBoost_1_AutoML_20200615_112727,0.893019,0.40326,0.906976,0.187217,0.356565,0.127139
XGBoost_2_AutoML_20200615_112727,0.891387,0.42127,0.905648,0.181521,0.364634,0.132958
XRT_1_AutoML_20200615_112727,0.889955,0.415053,0.911092,0.198008,0.363298,0.131986
XGBoost_3_AutoML_20200615_112727,0.888756,0.43021,0.912377,0.193878,0.371377,0.137921
DRF_1_AutoML_20200615_112727,0.884526,0.540998,0.910868,0.180989,0.369955,0.136867
GBM_1_AutoML_20200615_112727,0.882361,0.448664,0.885628,0.214995,0.379304,0.143872




In [15]:
# Full report (training and cross-validation metrics) of the top-ranked model.
h2o_estimator.leader

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_1_AutoML_20200615_112727


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,Ridge ( lambda = 0.04597 ),"nlambda = 30, lambda.max = 26.37, lambda.min = 0.04597, lambda.1se...",23,23,48,automl_training_py_2_sid_8d99




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.10267284878116732
RMSE: 0.3204260426075997
LogLoss: 0.33194767866365305
Null degrees of freedom: 246
Residual degrees of freedom: 223
Null deviance: 338.5137398250584
Residual deviance: 163.98215325984458
AIC: 211.98215325984458
AUC: 0.9373168132160937
AUCPR: 0.9523387554487233
Gini: 0.8746336264321874

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.49024695381341715: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,87.0,21.0,0.1944,(21.0/108.0)
1,Yes,11.0,128.0,0.0791,(11.0/139.0)
2,Total,98.0,149.0,0.1296,(32.0/247.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.490247,0.888889,147.0
1,max f2,0.224445,0.921409,180.0
2,max f0point5,0.65046,0.899844,123.0
3,max accuracy,0.603925,0.870445,133.0
4,max precision,0.992501,1.0,0.0
5,max recall,0.075114,1.0,210.0
6,max specificity,0.992501,1.0,0.0
7,max absolute_mcc,0.603925,0.738223,133.0
8,max min_per_class_accuracy,0.603925,0.87037,133.0
9,max mean_per_class_accuracy,0.603925,0.870437,133.0



Gains/Lift Table: Avg response rate: 56.28 %, avg score: 56.28 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.012146,0.984378,1.776978,1.776978,1.0,0.988414,1.0,0.988414,0.021583,0.021583,77.697842,77.697842
1,,2,0.020243,0.97993,1.776978,1.776978,1.0,0.982374,1.0,0.985998,0.014388,0.035971,77.697842,77.697842
2,,3,0.032389,0.97705,1.776978,1.776978,1.0,0.979039,1.0,0.983388,0.021583,0.057554,77.697842,77.697842
3,,4,0.040486,0.973087,1.776978,1.776978,1.0,0.975111,1.0,0.981733,0.014388,0.071942,77.697842,77.697842
4,,5,0.052632,0.970501,1.776978,1.776978,1.0,0.971768,1.0,0.979433,0.021583,0.093525,77.697842,77.697842
5,,6,0.101215,0.957304,1.776978,1.776978,1.0,0.96285,1.0,0.971474,0.086331,0.179856,77.697842,77.697842
6,,7,0.149798,0.944697,1.776978,1.776978,1.0,0.951033,1.0,0.964844,0.086331,0.266187,77.697842,77.697842
7,,8,0.202429,0.918828,1.776978,1.776978,1.0,0.929623,1.0,0.955687,0.093525,0.359712,77.697842,77.697842
8,,9,0.299595,0.843906,1.702938,1.752965,0.958333,0.887372,0.986486,0.93353,0.165468,0.52518,70.293765,75.29652
9,,10,0.40081,0.748666,1.492662,1.687232,0.84,0.806631,0.949495,0.901485,0.151079,0.676259,49.266187,68.723203




ModelMetricsBinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.12443785816159157
RMSE: 0.35275750617327983
LogLoss: 0.3913706472695088
Null degrees of freedom: 246
Residual degrees of freedom: 223
Null deviance: 338.5474235554453
Residual deviance: 193.33709975113734
AIC: 241.33709975113734
AUC: 0.9033439914734879
AUCPR: 0.9221911721257895
Gini: 0.8066879829469757

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.38363332142299555: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,76.0,32.0,0.2963,(32.0/108.0)
1,Yes,9.0,130.0,0.0647,(9.0/139.0)
2,Total,85.0,162.0,0.166,(41.0/247.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.383633,0.863787,161.0
1,max f2,0.368899,0.907202,165.0
2,max f0point5,0.685639,0.862355,115.0
3,max accuracy,0.542949,0.834008,143.0
4,max precision,0.989942,1.0,0.0
5,max recall,0.053995,1.0,224.0
6,max specificity,0.989942,1.0,0.0
7,max absolute_mcc,0.383633,0.667149,161.0
8,max min_per_class_accuracy,0.615081,0.827338,132.0
9,max mean_per_class_accuracy,0.615081,0.830336,132.0



Gains/Lift Table: Avg response rate: 56.28 %, avg score: 56.00 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.012146,0.984413,1.776978,1.776978,1.0,0.988516,1.0,0.988516,0.021583,0.021583,77.697842,77.697842
1,,2,0.020243,0.978297,1.776978,1.776978,1.0,0.980329,1.0,0.985241,0.014388,0.035971,77.697842,77.697842
2,,3,0.032389,0.970332,1.776978,1.776978,1.0,0.973086,1.0,0.980683,0.021583,0.057554,77.697842,77.697842
3,,4,0.040486,0.968568,1.776978,1.776978,1.0,0.969465,1.0,0.978439,0.014388,0.071942,77.697842,77.697842
4,,5,0.052632,0.967232,1.776978,1.776978,1.0,0.967962,1.0,0.976021,0.021583,0.093525,77.697842,77.697842
5,,6,0.101215,0.953333,1.776978,1.776978,1.0,0.959868,1.0,0.968268,0.086331,0.179856,77.697842,77.697842
6,,7,0.149798,0.938623,1.628897,1.728952,0.916667,0.945533,0.972973,0.960894,0.079137,0.258993,62.889688,72.895197
7,,8,0.202429,0.91868,1.640288,1.705899,0.923077,0.928999,0.96,0.952602,0.086331,0.345324,64.028777,70.589928
8,,9,0.299595,0.840981,1.628897,1.680926,0.916667,0.875484,0.945946,0.92759,0.158273,0.503597,62.889688,68.092553
9,,10,0.40081,0.751576,1.492662,1.633384,0.84,0.799544,0.919192,0.895255,0.151079,0.654676,49.266187,63.33842




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.8621225,0.06707453,0.92,0.86,0.9183673,0.75510204,0.85714287
1,auc,0.9075912,0.046810076,0.96266234,0.9025974,0.9285714,0.835034,0.90909094
2,aucpr,0.9297562,0.037944812,0.9730829,0.9293063,0.94406736,0.8692108,0.9331138
3,err,0.13787755,0.06707453,0.08,0.14,0.08163265,0.24489796,0.14285715
4,err_count,6.8,3.2710855,4.0,7.0,4.0,12.0,7.0
5,f0point5,0.8618259,0.07011503,0.9121622,0.8680556,0.9285714,0.75,0.8503401
6,f1,0.88643473,0.046351697,0.9310345,0.877193,0.9285714,0.8181818,0.877193
7,f2,0.9143195,0.025383012,0.9507042,0.8865248,0.9285714,0.9,0.9057971
8,lift_top_group,1.7772486,0.02756596,1.7857143,1.7857143,1.75,1.75,1.8148148
9,logloss,0.38373458,0.08632358,0.2814349,0.4374599,0.32462725,0.4974255,0.37772536



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se
0,,2020-06-15 11:27:31,0.000 sec,2,26.0,24,1.333973,,1.342164,0.001915
1,,2020-06-15 11:27:31,0.002 sec,4,19.0,24,1.321295,,1.332135,0.002164
2,,2020-06-15 11:27:31,0.003 sec,6,14.0,24,1.304711,,1.31894,0.002644
3,,2020-06-15 11:27:31,0.005 sec,8,10.0,24,1.283315,,1.30172,0.003425
4,,2020-06-15 11:27:31,0.007 sec,10,7.4,24,1.256269,,1.279667,0.004571
5,,2020-06-15 11:27:31,0.008 sec,12,5.4,24,1.222996,,1.252039,0.006148
6,,2020-06-15 11:27:31,0.010 sec,14,3.9,24,1.183362,,1.218449,0.008225
7,,2020-06-15 11:27:31,0.012 sec,16,2.9,24,1.137851,,1.179033,0.010864
8,,2020-06-15 11:27:31,0.013 sec,18,2.1,24,1.087849,,1.134563,0.014106
9,,2020-06-15 11:27:31,0.016 sec,20,1.5,24,1.035307,,1.086733,0.017949



See the whole table with table.as_data_frame()




In [16]:
# Parameter dictionary of the leader model; each entry carries a 'default' and an 'actual' value.
h2o_paras_keys = h2o_estimator.leader.params

In [17]:
# Score the leader model on the held-out split.
result=h2o_estimator.leader.model_performance(test_data)

In [18]:
# Print the held-out metrics report.
print (result)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.12292101358138617
RMSE: 0.3506009320885873
LogLoss: 0.3918800065878094
Null degrees of freedom: 55
Residual degrees of freedom: 32
Null deviance: 79.53097555208426
Residual deviance: 43.890560737834655
AIC: 91.89056073783465
AUC: 0.9115384615384615
AUCPR: 0.8648517399762101
Gini: 0.823076923076923

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.18489522700797967: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,21.0,9.0,0.3,(9.0/30.0)
1,Yes,0.0,26.0,0.0,(0.0/26.0)
2,Total,21.0,35.0,0.1607,(9.0/56.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.184895,0.852459,34.0
1,max f2,0.184895,0.935252,34.0
2,max f0point5,0.599638,0.846154,25.0
3,max accuracy,0.599638,0.857143,25.0
4,max precision,0.972128,1.0,0.0
5,max recall,0.184895,1.0,34.0
6,max specificity,0.972128,1.0,0.0
7,max absolute_mcc,0.184895,0.72111,34.0
8,max min_per_class_accuracy,0.599638,0.846154,25.0
9,max mean_per_class_accuracy,0.599638,0.85641,25.0



Gains/Lift Table: Avg response rate: 46.43 %, avg score: 49.75 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.017857,0.967649,2.153846,2.153846,1.0,0.972128,1.0,0.972128,0.038462,0.038462,115.384615,115.384615
1,,2,0.035714,0.963897,2.153846,2.153846,1.0,0.963984,1.0,0.968056,0.038462,0.076923,115.384615,115.384615
2,,3,0.035714,0.963419,0.0,2.153846,0.0,0.0,1.0,0.968056,0.0,0.076923,-100.0,115.384615
3,,4,0.053571,0.962452,2.153846,2.153846,1.0,0.963115,1.0,0.966409,0.038462,0.115385,115.384615,115.384615
4,,5,0.053571,0.96063,0.0,2.153846,0.0,0.0,1.0,0.966409,0.0,0.115385,-100.0,115.384615
5,,6,0.107143,0.946227,1.435897,1.794872,0.666667,0.952857,0.833333,0.959633,0.076923,0.192308,43.589744,79.487179
6,,7,0.160714,0.934201,2.153846,1.91453,1.0,0.941787,0.888889,0.953684,0.115385,0.307692,115.384615,91.452991
7,,8,0.214286,0.925227,2.153846,1.974359,1.0,0.926483,0.916667,0.946884,0.115385,0.423077,115.384615,97.435897
8,,9,0.303571,0.868744,1.723077,1.900452,0.8,0.888542,0.882353,0.929725,0.153846,0.576923,72.307692,90.045249
9,,10,0.410714,0.794598,1.435897,1.779264,0.666667,0.829275,0.826087,0.90352,0.153846,0.730769,43.589744,77.926421






In [19]:
# Inspect the leader's parameters; the GLM settings hard-coded in glm_paras below mirror the 'actual' values.
h2o_paras_keys

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'GLM_1_AutoML_20200615_112727',
   'type': 'Key<Model>',
   'URL': '/3/Models/GLM_1_AutoML_20200615_112727'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_py_2_sid_8d99',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_py_2_sid_8d99'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'seed': {'default': -1, 'actual': 4},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'fold_assignment': {'default': 'AUTO', 'actual': 'Modulo'},
 'fold_column': {'default': None, 'actual': 

In [20]:
# from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

def h2o_perform_validation(paras, df, x_columns, y_columns, n_validation, test_size=0.2):
    """Estimate held-out AUC of an H2O GLM over repeated random splits.

    Parameters
    ----------
    paras : dict
        Keyword arguments for ``H2OGeneralizedLinearEstimator``; a fresh model
        is built from them for every split.
    df : H2OFrame
        Full data set containing predictors and response.
    x_columns : list of str or None
        Predictor column names, forwarded as ``x`` to ``model.train``.
    y_columns : str
        Response column name, forwarded as ``y`` to ``model.train``.
    n_validation : int
        Number of splits; split ``i`` uses ``seed=i``.
    test_size : float, default 0.2
        Fraction of rows held out for scoring. ``split_frame`` receives
        ``1 - test_size`` as the training ratio.

    Returns
    -------
    list
        The 'AUC' entry of the test-set metrics, one value per split.
    """
    aucs = []
    for i in range(n_validation):
        model = H2OGeneralizedLinearEstimator(**paras)
        # Honour test_size instead of a hard-coded 0.8 training ratio; the
        # default (1 - 0.2) reproduces the previous behaviour exactly.
        train_data, test_data = df.split_frame(ratios=[1 - test_size], seed=i)
        model.train(x=x_columns, y=y_columns, training_frame=train_data)
        aucs.append(model.model_performance(test_data)['AUC'])
    return aucs

In [23]:
# GLM settings used for the repeated-split validation of the H2O leader model.
glm_paras = {'nfolds': 5,
             'seed': 4,
             'keep_cross_validation_models': False,
             'keep_cross_validation_predictions': True,
             'fold_assignment':'Modulo',
             'family': 'binomial',
             'solver': 'COORDINATE_DESCENT',
             'alpha': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
#              'lambda': [26.370076734964307, 19.194657141609913, 13.971702338486011,10.169937644371114, 7.402650670956407,
#                         5.388355255702259, 3.9221589201240774, 2.854921374834996, 2.0780840915624856,
#                            1.512627818639899, 1.101034807500474, 0.8014381544414825, 0.5833631334986591,
#                            0.4246273323018121, 0.3090842752718586, 0.22498101736048418, 0.16376264411393365,
#                            0.11920207278739621, 0.08676663859265771, 0.06315703575135227, 0.045971714815688694,
#                            0.03346259902721544, 0.02435727138188167, 0.01772954541540884],
             'lambda_search': True,
             'nlambdas': 30,
             'max_iterations': 300,
             'objective_epsilon': 0.0001,
             'gradient_epsilon': 1.0000000000000002e-06,
             'link': 'logit',
             'lambda_min_ratio': 0.0001,
             'max_active_predictors': 5000,
             'obj_reg': 0.004048582995951417,
            }
# xgboost_paras = {'nfolds': 5,
#                  'keep_cross_validation_models': False,
#                  'keep_cross_validation_predictions': True,
#                  'fold_assignment': 'Modulo',
#                  'stopping_metric': 'AUC',
#                  'stopping_tolerance': 0.05,
#                  'seed': 1,
#                  'distribution': 'bernoulli',
#                  'ntrees': 31,
#                  'max_depth': 10,
#                  'min_rows': 5.0,
#                  'min_child_weight': 5.0,
#                  'sample_rate': 0.6,
#                  'subsample': 0.6,
#                  'col_sample_rate': 0.8,
#                  'colsample_bylevel': 0.8,
#                  'col_sample_rate_per_tree': 0.8,
#                  'colsample_bytree': 0.8,
#                  'score_tree_interval': 5}

# list.remove() returns None, so the original `list(df.columns).remove(target)`
# silently set x_columns to None. Build the predictor list explicitly.
x_columns = [col for col in df.columns if col != target]
y_columns = target

h2o_aucs = h2o_perform_validation(glm_paras, df, x_columns, y_columns, 100)

glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |█

In [24]:
# Mean AUC over the 100 H2O splits and its standard error.
print (mean(h2o_aucs))
print (sem(h2o_aucs))

0.9044765332345763
0.0034414737684716557


In [43]:
import matplotlib.pyplot as plt

def sort_models(average_roc_aucs, roc_auc_sem, prepro_index=None, filename=None, sort_plot = True):
    """Rank models by mean ROC AUC and optionally plot them with error bars.

    Parameters
    ----------
    average_roc_aucs : dict
        Maps model name -> mean ROC AUC.
    roc_auc_sem : dict
        Maps model name -> error-bar size; must contain every key of
        ``average_roc_aucs`` (a missing key raises ``KeyError``).
    prepro_index : optional
        Only used to build the name of the saved figure.
    filename : str or None
        Directory to save the figure in; nothing is written when None.
    sort_plot : bool, default True
        When False only the ranking is computed and no figure is created.

    Returns
    -------
    tuple
        ``(ranked, names)`` where ``ranked`` is a list of ``(name, score)``
        pairs in descending score order and ``names`` the names in that order.
    """
    ranked = sorted(average_roc_aucs.items(), key=lambda item: item[1], reverse=True)
    sorted_model_name = [name for name, _ in ranked]
    sorted_roc = [score for _, score in ranked]
    print (sorted_model_name)
    # Error bars in the same order as the ranked models.
    sorted_sem = [roc_auc_sem[name] for name in sorted_model_name]
    if sort_plot:
        x_pos = np.arange(len(sorted_model_name))
        fig, ax = plt.subplots()
        ax.bar(x_pos, sorted_roc, yerr=sorted_sem, align='center', alpha=0.5, ecolor='black', capsize=10)
        ax.set_ylabel('mean ROC AUC value')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(sorted_model_name)
        ax.set_title('Model rank on test data')
        ax.yaxis.grid(True)
        # Write each bar's value just above it.
        for pos, value in zip(x_pos, sorted_roc):
            ax.text(pos - 0.25, value + 0.01, "%.2f" % value)
        # Lay out only after the annotations exist so they are taken into account.
        fig.tight_layout()
        if filename is not None:
            save_name = filename + '/prepro_index_'+ str(prepro_index) + '_model_leaderboard_with_error_bars.png'
            fig.savefig(save_name, dpi=300)
        plt.show()
        # Close this specific figure to release its memory.
        plt.close(fig)
    return ranked, sorted_model_name


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution
