In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statistics import mean 
from scipy.stats import sem
import xgboost as xgb
import os
# The kernel's current working directory is used as the root for every path built below.
cwd = os.getcwd()
import sys
# Add the src_autoML directory (under the working directory) to the module search path.
sys.path.append(cwd+"/src_autoML/")
print (cwd)

/Users/liug28/Downloads/AutoML/AutoML_package


# risk factors cervical cancer dataset

In [3]:
# Everything for this example lives under <cwd>/src_autoML/examples.
examples_dir = cwd + "/src_autoML/examples"

# Name of the binary response column in the CSV.
cervical_target = 'Biopsy'
# Path of the raw CSV file (a file path, despite the `_dir` suffix).
cervical_data_dir = examples_dir + "/data/risk_factors_cervical_cancer.csv"
# Folder that holds the saved AutoML results read further down.
cervical_result_path = examples_dir + "/cervical_result"

In [4]:
# Read the raw table, then separate the predictors from the response column.
cervical_data = pd.read_csv(cervical_data_dir)
x_cervical = cervical_data.drop(columns=[cervical_target])
y_cervical = cervical_data[cervical_target]

In [5]:
def model_perform_validation(clf, prepro, x, y, n_validation, test_size=0.2):
    """Estimate test ROC AUC over repeated stratified train/test splits.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` whose return value exposes
        ``predict_proba(X)``.
    prepro : transformer exposing ``fit_transform(X)`` and ``transform(X)``.
    x, y : features and labels accepted by ``train_test_split``; ``y`` is
        also used for stratification.
    n_validation : int
        Number of repetitions; repetition ``i`` seeds its split with ``i``.
    test_size : float, default 0.2
        Passed through to ``train_test_split``.

    Returns
    -------
    list
        One ROC AUC value per repetition, in repetition order.
    """
    aucs = []
    for i in range(n_validation):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y,
            random_state=np.random.RandomState(i),
            test_size=test_size,
            stratify=y,
        )
        # Fit the preprocessing on the training fold only ...
        X_train = prepro.fit_transform(X_train)
        model = clf.fit(X_train, y_train)
        # ... and merely apply it to the test fold. Re-fitting here (as the
        # previous fit_transform call did) would leak test-set statistics.
        X_test = prepro.transform(X_test)
        # Column 1 of predict_proba is used as the score for the AUC.
        y_pred = model.predict_proba(X_test)[:, 1]
        aucs.append(roc_auc_score(y_test, y_pred))
    return aucs

In [30]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in column.
    Columns of other types are imputed with mean of column.
    """

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X`` and store them in ``self.fill``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        fill_values = []
        for column in X:
            series = X[column]
            if series.dtype == np.dtype('O'):
                counts = series.value_counts()
                # value_counts() skips missing entries, so an all-missing
                # column yields an empty result; leave such a column unfilled
                # instead of raising IndexError on index[0].
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(series.mean())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the learned fill values."""
        return X.fillna(self.fill)

# Build the imputed feature table used by the logistic-regression baseline.
# NOTE(review): the imputer is fitted on the full dataset before any train/test
# split, so the column means include rows that later land in test folds.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_filled = pd.DataFrame(imp_mean.fit_transform(pd.DataFrame(x_cervical)))
# The frame is rebuilt without passing column names. The two steps below
# presumably have nothing left to fill or encode after the mean imputation;
# they are kept to preserve the original pipeline.
x_cervical_impute = pd.get_dummies(DataFrameImputer().fit_transform(mean_filled))

## 1 default logistic regression

In [27]:
from sklearn.linear_model import LogisticRegression

# Baseline: liblinear logistic regression on the imputed features, scored over
# 100 repeated splits. The scaler has centering and scaling both switched off.
logistic_regression_model = LogisticRegression(solver='liblinear', random_state=0)
logistic_aucs = model_perform_validation(
    logistic_regression_model,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_cervical_impute,
    y_cervical,
    100,
)
# Mean AUC first, then the standard error of that mean.
print(mean(logistic_aucs))
print(sem(logistic_aucs))

0.9335798983625071
0.0049644612479355105


## 2 default setting xgboost

In [31]:
# XGBoost classifier with library-default hyper-parameters (no arguments passed).
default_clf = xgb.XGBClassifier()

In [34]:
# Default XGBoost on the raw (non-imputed) features, 100 repeated splits.
default_aucs = model_perform_validation(
    default_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_cervical,
    y_cervical,
    100,
)

In [35]:
# Mean AUC first, then the standard error of that mean.
print(mean(default_aucs))
print(sem(default_aucs))

0.9525719932241672
0.0031818552269275288


## 3 AutoML allowing missing values

In [6]:
# Result folder of the "allow missing" AutoML run, preprocessing variant '0'.
allowMissing_prepro = '0'
allowMissing_dir = cervical_result_path + '/resultAllowMissing/' + allowMissing_prepro

# The four CSV files saved by that run.
am_x_train_dir, am_x_test_dir, am_y_train_dir, am_y_test_dir = (
    allowMissing_dir + file_name
    for file_name in (
        '/X_train_important.csv',
        '/X_test_important.csv',
        '/y_train.csv',
        '/y_test.csv',
    )
)

# The first CSV column is used as the row index in every file.
am_x_train, am_x_test, am_y_train, am_y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (am_x_train_dir, am_x_test_dir, am_y_train_dir, am_y_test_dir)
)

# Stack the saved train and test parts back together so they can be re-split
# by the repeated validation.
x_cervical_am = pd.concat([am_x_train, am_x_test])
y_cervical_am = pd.concat([am_y_train, am_y_test])

# Tuned XGBoost configuration; the values are kept exactly as recorded.
cervical_am_clf = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=0.9566990786363013,
    colsample_bynode=1,
    colsample_bytree=0.5365781665491416,
    gamma=0.0006502818396644504,
    learning_rate=0.0075220160397958934,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=3,
    missing=np.nan,
    n_estimators=5400,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0.00018418064439747265,
    reg_lambda=3.0558366994927324,
    scale_pos_weight=1,
    seed=0,
    silent=None,
    subsample=0.6381458957298243,
    verbosity=1,
)


In [None]:
# Tuned XGBoost on the "allow missing" feature set, 100 repeated splits.
# The label frame is reduced to the target column before it is passed on.
cervical_am_aucs = model_perform_validation(
    cervical_am_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_cervical_am,
    y_cervical_am[cervical_target],
    100,
)

In [22]:
# Mean AUC first, then the standard error of that mean.
print(mean(cervical_am_aucs))
print(sem(cervical_am_aucs))

0.9592828910220215
0.002610437188380406


## 4 AutoML with missing values not allowed (all classifiers)

In [9]:
# Result folder of the "impute missing" AutoML run, preprocessing variant '0'.
notAllowMissing_prepro = '0'
notAllowMissing_dir = cervical_result_path + '/resultImputeMissing/' + notAllowMissing_prepro

# The four CSV files saved by that run.
nam_x_train_dir, nam_x_test_dir, nam_y_train_dir, nam_y_test_dir = (
    notAllowMissing_dir + file_name
    for file_name in (
        '/X_train_important.csv',
        '/X_test_important.csv',
        '/y_train.csv',
        '/y_test.csv',
    )
)

# The first CSV column is used as the row index in every file.
nam_x_train, nam_x_test, nam_y_train, nam_y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (nam_x_train_dir, nam_x_test_dir, nam_y_train_dir, nam_y_test_dir)
)

# Stack the saved train and test parts back together so they can be re-split
# by the repeated validation.
x_cervical_nam = pd.concat([nam_x_train, nam_x_test])
y_cervical_nam = pd.concat([nam_y_train, nam_y_test])

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: despite the `rf` in the variable name, the estimator built here is a
# GradientBoostingClassifier. The keyword arguments look like a pasted
# estimator repr - presumably the configuration selected by the AutoML run;
# confirm against the saved results.
cervical_rf_clf = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.06211573904704122, loss='deviance',
                           max_depth=None, max_features=0.29340780474269756,
                           max_leaf_nodes=None, min_impurity_decrease=0.0,
                           min_impurity_split=None, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=518, n_iter_no_change=None,
                           presort='deprecated', random_state=0,
                           subsample=0.7914722557669576, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# Alternative random-forest configuration, kept commented out for reference
# (it would rebind the same variable if re-enabled):
# cervical_rf_clf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
#                        criterion='entropy', max_depth=None,
#                        max_features=0.39261535523790037, max_leaf_nodes=None,
#                        max_samples=None, min_impurity_decrease=0.0,
#                        min_impurity_split=None, min_samples_leaf=2,
#                        min_samples_split=2, min_weight_fraction_leaf=0.0,
#                        n_estimators=31, n_jobs=1, oob_score=False,
#                        random_state=0, verbose=False, warm_start=False)

In [11]:
# Selected classifier on the "impute missing" feature set, 100 repeated splits.
# The label frame is reduced to the target column before it is passed on.
cervical_nam_aucs = model_perform_validation(
    cervical_rf_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_cervical_nam,
    y_cervical_nam[cervical_target],
    100,
)


In [12]:
# Mean AUC first, then the standard error of that mean.
print(mean(cervical_nam_aucs))
print(sem(cervical_nam_aucs))

0.947512987012987
0.004078819938321796


In [14]:
# Same summary as the previous cell (mean AUC, then its standard error).
# NOTE(review): the recorded numbers differ from the cell above, so
# `cervical_nam_aucs` was presumably recomputed in between.
print(mean(cervical_nam_aucs))
print(sem(cervical_nam_aucs))

0.9495313382269904
0.004094592951469818


## 5 H2O classifier

In [14]:
from h2o.automl import H2OAutoML
import h2o

In [15]:
# Work on a copy so the original frame keeps its 0/1 target.
cervical_h2o = cervical_data.copy()
# Recode the target to 'Yes'/'No' labels. The result is assigned back to the
# column explicitly: calling replace(..., inplace=True) on the selected column
# is a chained in-place update, which does not modify the frame under pandas
# Copy-on-Write.
cervical_h2o[cervical_target] = cervical_h2o[cervical_target].replace({1: 'Yes', 0: 'No'})

In [16]:
# Display the recoded frame to check that the target now reads 'Yes'/'No'.
cervical_h2o

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,,,0,0,0,0,0,0,0,No
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,,,0,0,0,0,0,0,0,No
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.00,0.0,...,,,0,0,0,0,0,0,0,No
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.00,0.0,...,,,1,0,1,0,0,0,0,No
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.00,0.0,...,,,0,0,0,0,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,,,0,0,0,0,0,0,0,No
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.00,0.0,...,,,0,0,0,0,0,0,0,No
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,,,0,0,0,0,0,0,1,No
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,,,0,0,0,0,0,0,0,No


In [17]:
# Initialise H2O. nthreads = -1 presumably means "use all available cores" and
# max_mem_size = 8 presumably means 8 GB - confirm against the h2o.init docs.
h2o.init(nthreads = -1, max_mem_size = 8)
# NOTE(review): the recorded output shows the cluster banner twice, so this
# extra connect() looks redundant after init() - confirm before removing.
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,29 mins 46 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.634 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Connecting to H2O server at http://localhost:54321 ... successful.


0,1
H2O_cluster_uptime:,29 mins 46 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.634 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


<H2OConnection to http://localhost:54321, no session>

In [18]:
# Convert the recoded pandas frame into an H2OFrame for the H2O estimators.
df = h2o.H2OFrame(cervical_h2o)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [19]:
# Summarise the H2OFrame to check how each column was parsed.
df.describe()

Rows:858
Cols:36




Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
type,int,int,int,int,int,real,real,int,real,int,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,enum,int,int,int,int,int,int,int,enum
mins,13.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
mean,26.82051282051281,2.52764423076923,16.995299647473573,2.2755610972568565,0.1455621301775148,1.2197214125858005,0.4531439506556213,0.6413333333333333,2.2564192013893316,0.11201079622132254,0.5148043184885293,0.1049136786188579,0.1766268260292163,0.05843293492695883,0.0,0.005312084993359893,0.057104913678618856,0.02390438247011952,0.0013280212483399733,0.0013280212483399733,0.0013280212483399733,0.0,0.02390438247011952,0.0013280212483399733,0.0026560424966799467,0.08741258741258745,,,0.02097902097902098,0.01048951048951049,0.02097902097902098,0.027972027972027972,0.04079254079254079,0.08624708624708624,0.05128205128205128,
maxs,84.0,28.0,32.0,11.0,1.0,37.0,37.0,1.0,30.0,1.0,19.0,1.0,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,3.0,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
sigma,8.497948065482602,1.6677604771839047,2.8033553841969017,1.4474140691327373,0.3528755570364017,4.089016937562999,2.2266098026055663,0.4799291595848794,3.764253529146503,0.3155927582700105,1.9430885475317117,0.3066458457562428,0.56199283535662,0.23471619027599683,0.0,0.07273852600189899,0.2321972082083672,0.15285284048760794,0.03644202585395018,0.03644202585395018,0.03644202585395018,0.0,0.15285284048760794,0.03644202585395018,0.051502529309489134,0.3025447418507477,,,0.1433975861165658,0.1019391586774784,0.1433975861165658,0.1649888484215437,0.1979246496661347,0.2808923013686258,0.22070109393249174,
zeros,0,0,0,16,722,722,722,269,269,658,658,674,674,709,753,749,710,735,752,752,752,753,735,752,751,787,,,840,849,840,834,823,784,814,
missing,0,26,7,56,13,13,13,108,108,117,117,105,105,105,105,105,105,105,105,105,105,105,105,105,105,0,0,0,0,0,0,0,0,0,0,0
0,18.0,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No
1,15.0,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No
2,34.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No


In [20]:
# AutoML search limited to 10 models, with deep learning, stacked ensembles and
# GLM excluded; AUC is used both to rank the models and as the stopping metric.
h2o_estimator = H2OAutoML(
    max_models=10,
    seed=1,
    exclude_algos=["DeepLearning", "StackedEnsemble", "GLM"],
    sort_metric="AUC",
    stopping_metric="AUC",
)

In [21]:
# Hold out part of the frame (ratio 0.8 for training) with a fixed seed.
train_data, test_data = df.split_frame(ratios=[0.8], seed=0)
# Predictors are every column except the target. list.remove() mutates in
# place and returns None, so the previous `list(df.columns).remove(...)`
# expression passed x=None instead of the intended column list.
predictor_columns = [column for column in df.columns if column != cervical_target]
h2o_estimator.train(x=predictor_columns, y=cervical_target, training_frame=train_data)

AutoML progress: |█████████████████
23:44:33.35: XRT_1_AutoML_20200531_234422 [DRF XRT (Extremely Randomized Trees)] failed: java.lang.AssertionError

███████████████████████████████████████| 100%


In [22]:
# Show the AutoML leaderboard of the trained models.
h2o_estimator.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_2_AutoML_20200531_234422,0.965162,0.109635,0.609448,0.103062,0.179235,0.0321253
XGBoost_3_AutoML_20200531_234422,0.959108,0.113219,0.550362,0.0816273,0.183735,0.0337587
GBM_3_AutoML_20200531_234422,0.955591,0.120128,0.528694,0.104637,0.186713,0.0348619
GBM_4_AutoML_20200531_234422,0.954733,0.119202,0.545105,0.0927384,0.187675,0.035222
DRF_1_AutoML_20200531_234422,0.946439,0.16171,0.522291,0.104637,0.190274,0.0362043
XGBoost_1_AutoML_20200531_234422,0.941207,0.114601,0.499286,0.0927384,0.178781,0.0319626
GBM_1_AutoML_20200531_234422,0.931619,0.136575,0.562884,0.115748,0.195309,0.0381456
XGBoost_2_AutoML_20200531_234422,0.591111,0.242166,0.0809368,0.413036,0.248448,0.0617266
GBM_5_AutoML_20200531_234422,0.514558,0.247232,0.0646608,0.448819,0.249918,0.0624588
XGBoost_grid__1_AutoML_20200531_234422_model_1,0.44007,0.24515,0.0551064,0.5,0.248974,0.0619879




In [23]:
# Show the details of the leader model of the AutoML run.
h2o_estimator.leader

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_2_AutoML_20200531_234422


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,30.0,30.0,7315.0,6.0,7.0,6.966667,10.0,26.0,14.833333




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.014894724243000123
RMSE: 0.12204394390136744
LogLoss: 0.054632509581269084
Mean Per-Class Error: 0.014173228346456623
AUC: 0.9958355205599301
AUCPR: 0.9445114498050666
Gini: 0.9916710411198602

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4311818073631215: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,626.0,9.0,0.0142,(9.0/635.0)
1,Yes,4.0,41.0,0.0889,(4.0/45.0)
2,Total,630.0,50.0,0.0191,(13.0/680.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.431182,0.863158,49.0
1,max f2,0.287947,0.92827,56.0
2,max f0point5,0.669676,0.89172,27.0
3,max accuracy,0.54746,0.980882,41.0
4,max precision,0.901793,1.0,0.0
5,max recall,0.215916,1.0,62.0
6,max specificity,0.901793,1.0,0.0
7,max absolute_mcc,0.287947,0.858741,56.0
8,max min_per_class_accuracy,0.287947,0.977778,56.0
9,max mean_per_class_accuracy,0.215916,0.985827,62.0



Gains/Lift Table: Avg response rate:  6.62 %, avg score:  6.58 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010294,0.826006,15.111111,15.111111,1.0,0.868064,1.0,0.868064,0.155556,0.155556,1411.111111,1411.111111
1,,2,0.020588,0.773872,15.111111,15.111111,1.0,0.803895,1.0,0.835979,0.155556,0.311111,1411.111111,1411.111111
2,,3,0.030882,0.729041,15.111111,15.111111,1.0,0.756503,1.0,0.809487,0.155556,0.466667,1411.111111,1411.111111
3,,4,0.041176,0.66728,15.111111,15.111111,1.0,0.700751,1.0,0.782303,0.155556,0.622222,1411.111111,1411.111111
4,,5,0.05,0.630985,10.074074,14.222222,0.666667,0.64375,0.941176,0.757853,0.088889,0.711111,907.407407,1322.222222
5,,6,0.1,0.080448,5.777778,10.0,0.382353,0.391464,0.661765,0.574658,0.288889,1.0,477.777778,900.0
6,,7,0.15,0.035935,0.0,6.666667,0.0,0.04775,0.441176,0.399022,0.0,1.0,-100.0,566.666667
7,,8,0.2,0.01602,0.0,5.0,0.0,0.025114,0.330882,0.305545,0.0,1.0,-100.0,400.0
8,,9,0.3,0.008306,0.0,3.333333,0.0,0.011249,0.220588,0.207446,0.0,1.0,-100.0,233.333333
9,,10,0.4,0.006519,0.0,2.5,0.0,0.007217,0.165441,0.157389,0.0,1.0,-100.0,150.0




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.03212532579889326
RMSE: 0.17923539214924394
LogLoss: 0.10963527600679282
Mean Per-Class Error: 0.08626421697287845
AUC: 0.965161854768154
AUCPR: 0.609448216547962
Gini: 0.9303237095363079

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3195318224854168: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,617.0,18.0,0.0283,(18.0/635.0)
1,Yes,8.0,37.0,0.1778,(8.0/45.0)
2,Total,625.0,55.0,0.0382,(26.0/680.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.319532,0.74,54.0
1,max f2,0.173671,0.798319,57.0
2,max f0point5,0.319532,0.698113,54.0
3,max accuracy,0.319532,0.961765,54.0
4,max precision,0.853714,1.0,0.0
5,max recall,0.011146,1.0,185.0
6,max specificity,0.853714,1.0,0.0
7,max absolute_mcc,0.319532,0.72381,54.0
8,max min_per_class_accuracy,0.045238,0.888889,78.0
9,max mean_per_class_accuracy,0.045238,0.913736,78.0



Gains/Lift Table: Avg response rate:  6.62 %, avg score:  6.07 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010294,0.79196,8.634921,8.634921,0.571429,0.827017,0.571429,0.827017,0.088889,0.088889,763.492063,763.492063
1,,2,0.020588,0.720656,8.634921,8.634921,0.571429,0.761847,0.571429,0.794432,0.088889,0.177778,763.492063,763.492063
2,,3,0.030882,0.671067,10.793651,9.354497,0.714286,0.696912,0.619048,0.761925,0.111111,0.288889,979.365079,835.449735
3,,4,0.041176,0.599056,12.952381,10.253968,0.857143,0.632648,0.678571,0.729606,0.133333,0.422222,1195.238095,925.396825
4,,5,0.05,0.573608,10.074074,10.222222,0.666667,0.581085,0.676471,0.703396,0.088889,0.511111,907.407407,922.222222
5,,6,0.1,0.067362,7.111111,8.666667,0.470588,0.334495,0.573529,0.518946,0.355556,0.866667,611.111111,766.666667
6,,7,0.15,0.02905,0.444444,5.925926,0.029412,0.041846,0.392157,0.359912,0.022222,0.888889,-55.555556,492.592593
7,,8,0.2,0.020395,0.444444,4.555556,0.029412,0.024444,0.301471,0.276045,0.022222,0.911111,-55.555556,355.555556
8,,9,0.3,0.010788,0.888889,3.333333,0.058824,0.014614,0.220588,0.188901,0.088889,1.0,-11.111111,233.333333
9,,10,0.4,0.007941,0.0,2.5,0.0,0.009278,0.165441,0.143996,0.0,1.0,-100.0,150.0




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.9632353,0.015597944,0.97794116,0.9632353,0.9411765,0.97794116,0.9558824
1,auc,0.9663729,0.016413724,0.98535156,0.95238096,0.94628906,0.9753968,0.97244626
2,aucpr,0.6275625,0.18825856,0.82915425,0.43406782,0.42567405,0.67891043,0.77000606
3,err,0.036764707,0.015597944,0.022058824,0.036764707,0.05882353,0.022058824,0.04411765
4,err_count,5.0,2.1213202,3.0,5.0,8.0,3.0,6.0
5,f0point5,0.70807457,0.11624278,0.79545456,0.64102566,0.546875,0.8333333,0.7236842
6,f1,0.75388336,0.09739602,0.8235294,0.6666667,0.6363636,0.85714287,0.78571427
7,f2,0.8101401,0.07960165,0.85365856,0.6944444,0.76086956,0.88235295,0.859375
8,lift_top_group,10.669524,3.9085834,17.0,9.714286,8.5,6.8,11.333333
9,logloss,0.10963528,0.01983,0.07739778,0.12243146,0.12766944,0.10566468,0.11501304



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-05-31 23:44:28,0.235 sec,0.0,0.24859,0.243634,0.5,0.066176,1.0,0.933824
1,,2020-05-31 23:44:28,0.248 sec,5.0,0.191687,0.136233,0.989169,0.861834,15.111111,0.035294
2,,2020-05-31 23:44:28,0.260 sec,10.0,0.166436,0.104707,0.990954,0.876166,15.111111,0.023529
3,,2020-05-31 23:44:28,0.272 sec,15.0,0.149593,0.084109,0.991934,0.883205,15.111111,0.027941
4,,2020-05-31 23:44:28,0.284 sec,20.0,0.139421,0.071635,0.993141,0.904939,15.111111,0.026471
5,,2020-05-31 23:44:28,0.297 sec,25.0,0.129688,0.061746,0.994646,0.928648,15.111111,0.022059
6,,2020-05-31 23:44:28,0.309 sec,30.0,0.122044,0.054633,0.995836,0.944511,15.111111,0.019118



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Schiller,89.442451,1.0,0.650258
1,Hinselmann,14.466839,0.161745,0.105176
2,Age,9.684093,0.108272,0.070405
3,Hormonal Contraceptives (years),5.829471,0.065176,0.042381
4,First sexual intercourse,3.746708,0.04189,0.027239
5,Citology,3.372355,0.037704,0.024517
6,Num of pregnancies,2.750649,0.030753,0.019998
7,Dx,2.02326,0.022621,0.014709
8,STDs: Time since first diagnosis,1.612288,0.018026,0.011722
9,Number of sexual partners,1.254329,0.014024,0.009119



See the whole table with table.as_data_frame()




In [24]:
# Keep the leader model's parameter dictionary for inspection.
h2o_paras_keys = h2o_estimator.leader.params

In [25]:
# Score the held-out frame with the AutoML leader model.
result=h2o_estimator.leader.predict(test_data)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [26]:
# Print the prediction frame returned by the leader model.
print (result)

predict,No,Yes
No,0.973155,0.0268449
No,0.99275,0.00725003
No,0.993262,0.00673756
No,0.996065,0.00393544
No,0.995338,0.00466235
No,0.99371,0.00629046
No,0.989122,0.0108782
No,0.624499,0.375501
No,0.995373,0.0046265
No,0.9958,0.00419951





In [27]:
# Display the leader's parameters; the recorded output lists a 'default' and an 'actual' value per parameter.
h2o_paras_keys

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'GBM_2_AutoML_20200531_234422',
   'type': 'Key<Model>',
   'URL': '/3/Models/GBM_2_AutoML_20200531_234422'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_py_2_sid_8f35',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_py_2_sid_8f35'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'score_tree_interval': {'default': 0, 'actual': 5},
 'fold_assignment': {'default

In [35]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [44]:
def h2o_perform_validation(paras, df, x_columns, y_columns, n_validation, test_size=0.2):
    """Estimate test AUC of an H2O GBM over repeated train/test splits.

    Parameters
    ----------
    paras : mapping
        Keyword arguments unpacked into ``H2OGradientBoostingEstimator``.
    df : frame exposing ``split_frame(ratios=..., seed=...)``.
    x_columns : predictor column names passed to ``train`` as ``x``.
    y_columns : response column name passed to ``train`` as ``y``.
    n_validation : int
        Number of repetitions; repetition ``i`` seeds its split with ``i``.
    test_size : float, default 0.2
        Fraction held out for testing; the training ratio is ``1 - test_size``.

    Returns
    -------
    list
        The ``'AUC'`` entry of the test-frame performance, one per repetition.
    """
    # Honour test_size: the ratio used to be hard-coded to 0.8, which silently
    # ignored the parameter (the default of 0.2 still gives 0.8).
    train_ratio = 1 - test_size
    aucs = []
    for i in range(n_validation):
        # A fresh estimator is built for every repetition.
        model = H2OGradientBoostingEstimator(**paras)
        train_data, test_data = df.split_frame(ratios=[train_ratio], seed=i)
        model.train(x=x_columns, y=y_columns, training_frame=train_data)
        aucs.append(model.model_performance(test_data)['AUC'])
    return aucs

In [48]:
# Predictors are every column except the target. list.remove() mutates in
# place and returns None, so the previous `list(df.columns).remove(...)`
# expression left x_columns set to None.
x_columns = [column for column in df.columns if column != cervical_target]
y_columns = cervical_target

# GBM settings; presumably transcribed from the AutoML leader's parameters
# shown above - confirm against `h2o_paras_keys`.
gbm_paras = {'nfolds':5,
             'keep_cross_validation_models': False,
             'score_tree_interval':5,
             'fold_assignment': 'Modulo',
             'ntrees': 30,
             'max_depth': 7,
             'stopping_metric': 'AUC',
             'stopping_tolerance': 0.03834824944236852,
             'seed': 6,
             'distribution':'bernoulli',
             'sample_rate': 0.8,
             'col_sample_rate':0.8,
             'col_sample_rate_per_tree': 0.8
            }
h2o_aucs = h2o_perform_validation(gbm_paras, df, x_columns, y_columns, 100)

# A second, duplicated call that passed the H2OAutoML object itself as `paras`
# was removed: h2o_perform_validation unpacks `paras` with ** into
# H2OGradientBoostingEstimator, which requires a mapping of keyword arguments.




In [46]:
# Mean AUC first, then the standard error of that mean.
print(mean(h2o_aucs))
print(sem(h2o_aucs))

0.9474917417271431
0.003508362546772839
