In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statistics import mean 
from scipy.stats import sem
import xgboost as xgb
import os
import sys

# Anchor every path used below to the directory the kernel was started in,
# and put the bundled AutoML sources on the import path.
cwd = os.getcwd()
sys.path.append(cwd+"/src_autoML/")
print (cwd)

/Users/liug28/Downloads/AutoML/AutoML_package


# hepatitis dataset

In [2]:
# Hepatitis example configuration: label column, raw CSV location, and the
# directory holding the AutoML result artefacts read further down.
hepatitis_target = 'HISTOLOGY'
hepatitis_data_dir=cwd+"/src_autoML/examples/data/hepatitis.csv"
hepatitis_result_path=cwd+"/src_autoML/examples/hepatitis_result"

In [3]:
# Read the raw table, then separate the label column from the predictors.
hepatitis_data=pd.read_csv(hepatitis_data_dir)
x_hepatitis = hepatitis_data.drop(columns=[hepatitis_target])
y_hepatitis = hepatitis_data[hepatitis_target]


In [7]:
# categorical_features = ['SEX','STEROID','ANTIVIRALS','FATIGUE','MALAISE','ANOREXIA','LIVER BIG','LIVER FIRM','SPLEEN PALPABLE','SPIDERS','ASCITES','VARICES']
# x_hepatitis_categorical = x_hepatitis[categorical_features]
# x_hepatitis_other = x_hepatitis.drop(categorical_features, axis=1)

In [5]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the full feature table, i.e. before the
# train/test splits drawn inside model_perform_validation — confirm this is intended.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_mean.fit(x_hepatitis)
x_hepatitis_imp = imp_mean.transform(x_hepatitis)

In [11]:
# x_hepatitis_other_df = pd.DataFrame(x_hepatitis_other_imp, columns = x_hepatitis_other.columns)

In [12]:
# x_hepatitis = pd.concat([x_hepatitis_categorical_df, x_hepatitis_other_df], axis=1)

In [6]:
def model_perform_validation(clf, prepro, x, y, n_validation, test_size=0.2):
    """Estimate the test ROC AUC of ``clf`` over repeated stratified splits.

    For every ``i`` in ``range(n_validation)`` the data is split with
    ``train_test_split`` (seeded with ``i``, stratified on ``y``), ``prepro``
    is fitted on the training part only, ``clf`` is fitted on the transformed
    training part, and the ROC AUC is computed on the held-out part from the
    second column of ``predict_proba``.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    prepro : transformer exposing ``fit_transform`` and ``transform``.
    x : feature table accepted by ``train_test_split``.
    y : labels aligned with ``x``; also used for stratification.
    n_validation : int
        Number of train/test repetitions (seeds ``0 .. n_validation - 1``).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.

    Returns
    -------
    list
        One AUC value per repetition, in seed order.
    """
    aucs=[]
    for i in range (n_validation):
        X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=np.random.RandomState(i), test_size=test_size, stratify=y)
        X_train = prepro.fit_transform(X_train)
        model = clf.fit(X_train, y_train)
        # Only transform the held-out data: re-fitting the preprocessor here
        # would learn its statistics from the test set and apply a different
        # transformation than the one the model was trained with.
        X_test = prepro.transform(X_test)
        y_pred = model.predict_proba(X_test)[:,1]
        aucs.append(roc_auc_score(y_test, y_pred))
    return aucs

## 1 default logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: liblinear logistic regression on the mean-imputed features,
# evaluated over 100 splits. The scaler is created with centering and scaling
# both switched off.
logistic_regression_model=LogisticRegression(solver='liblinear')
logistic_aucs=model_perform_validation(
    logistic_regression_model,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_hepatitis_imp,
    y_hepatitis,
    100,
)
# Mean AUC, then its standard error, one per line.
print (mean(logistic_aucs), sem(logistic_aucs), sep='\n')

0.740546218487395
0.007354881090575848


## 2 default xgboost

In [8]:
# Baseline 2: XGBoost classifier constructed without arguments, i.e. with the
# library's default hyper-parameters.
default_clf = xgb.XGBClassifier()


In [9]:
# 100-split evaluation of the default XGBoost model on the un-imputed feature
# table (x_hepatitis, not x_hepatitis_imp).
default_aucs=model_perform_validation(
    default_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_hepatitis,
    y_hepatitis,
    100,
)


In [10]:
# Mean AUC, then its standard error, one per line.
print (mean(default_aucs), sem(default_aucs), sep='\n')


0.66390756302521
0.008360278107523093


## 3 allow missing autoML

In [38]:
# Locate the artefacts stored under resultAllowMissing/<preprocessing id>.
# NOTE(review): presumably written by the AutoML run that allows missing values — confirm.
allowMissing_prepro = '1'
allowMissing_dir = hepatitis_result_path+'/resultAllowMissing/'+allowMissing_prepro
am_x_train_dir = allowMissing_dir+'/X_train_important.csv'
am_x_test_dir = allowMissing_dir+'/X_test_important.csv'
am_y_train_dir = allowMissing_dir+'/y_train.csv'
am_y_test_dir = allowMissing_dir+'/y_test.csv'

# Read the four CSVs (first column is the index) in the same order as their paths.
am_x_train, am_x_test, am_y_train, am_y_test = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (am_x_train_dir, am_x_test_dir, am_y_train_dir, am_y_test_dir)
)

# Stack train on top of test so the data can be re-split by model_perform_validation.
x_hepatitis_am = pd.concat([am_x_train, am_x_test], axis=0)
y_hepatitis_am = pd.concat([am_y_train, am_y_test], axis=0)

In [39]:
# XGBoost classifier with explicit hyper-parameters.
# NOTE(review): the values look like a pasted estimator repr, presumably the best
# model found by the allow-missing AutoML run — confirm the provenance.
hepatitis_am_clf = xgb.XGBClassifier(
    base_score=0.5,
    booster=None,
    colsample_bylevel=0.551768889451627,
    colsample_bynode=None,
    colsample_bytree=0.8912179685315662,
    gamma=0.0014481669677848567,
    gpu_id=None,
    importance_type='gain',
    interaction_constraints=None,
    learning_rate=0.001811865598117431,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=2,
    missing=np.nan,
    monotone_constraints=None,
    n_estimators=5200,
    n_jobs=None,
    num_parallel_tree=None,
    objective='binary:logistic',
    random_state=None,
    reg_alpha=0.22662074246340666,
    reg_lambda=3.64691211739843,
    scale_pos_weight=1,
    seed=0,
    subsample=0.5462179618958122,
    tree_method=None,
    validate_parameters=False,
    verbosity=None,
)

In [40]:
# 100-split evaluation of the tuned XGBoost model; the label Series is pulled
# out of the single-purpose y frame by column name.
hepatitis_am_aucs=model_perform_validation(
    hepatitis_am_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_hepatitis_am,
    y_hepatitis_am[hepatitis_target],
    100,
)

In [41]:
# Mean AUC, then its standard error, one per line.
print (mean(hepatitis_am_aucs), sem(hepatitis_am_aucs), sep='\n')

0.7045378151260504
0.00829461940147008


## 4 autoML with missing values not allowed

In [70]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Locate the artefacts stored under resultNoMissingAllow/<preprocessing id>.
# NOTE(review): presumably written by the AutoML run that does not allow missing values — confirm.
notAllowMissing_prepro = '0'
notAllowMissing_dir = hepatitis_result_path+'/resultNoMissingAllow/'+notAllowMissing_prepro
nam_x_train_dir = notAllowMissing_dir+'/X_train_important.csv'
nam_x_test_dir = notAllowMissing_dir+'/X_test_important.csv'
nam_y_train_dir = notAllowMissing_dir+'/y_train.csv'
nam_y_test_dir = notAllowMissing_dir+'/y_test.csv'

# Read the four CSVs (first column is the index) in the same order as their paths.
nam_x_train, nam_x_test, nam_y_train, nam_y_test = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in (nam_x_train_dir, nam_x_test_dir, nam_y_train_dir, nam_y_test_dir)
)

# Stack train on top of test so the data can be re-split by model_perform_validation.
x_hepatitis_nam = pd.concat([nam_x_train, nam_x_test], axis=0)
y_hepatitis_nam = pd.concat([nam_y_train, nam_y_test], axis=0)

# Logistic regression with explicit hyper-parameters.
# NOTE(review): values look like a pasted estimator repr from the AutoML result — confirm.
hepatitis_nam_clf=LogisticRegression(
    C=0.0165165418572436,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=0.13078315987811,
    l1_ratio=None,
    max_iter=100,
    multi_class='auto',
    n_jobs=None,
    penalty='l2',
    random_state=0,
    solver='liblinear',
    tol=0.0003497038350159911,
    verbose=0,
    warm_start=False,
)


In [71]:
# 100-split evaluation of the tuned logistic regression; the label Series is
# pulled out of the y frame by column name.
hepatitis_nam_aucs=model_perform_validation(
    hepatitis_nam_clf,
    StandardScaler(copy=True, with_mean=False, with_std=False),
    x_hepatitis_nam,
    y_hepatitis_nam[hepatitis_target],
    100,
)

In [72]:
# Mean AUC, then its standard error, one per line.
print (mean(hepatitis_nam_aucs), sem(hepatitis_nam_aucs), sep='\n')

0.7428151260504202
0.008794945434922544


## 5 H2O classifier

In [16]:
from h2o.automl import H2OAutoML
import h2o

In [17]:
# Work on a copy so the frame used by the scikit-learn experiments keeps its
# numeric labels.
hepatitis_h2o = hepatitis_data.copy()
# Recode the label as strings so H2O parses the target as a categorical column.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained in-place form mutates a temporary Series and
# leaves the frame unchanged under pandas Copy-on-Write.
hepatitis_h2o[hepatitis_target] = hepatitis_h2o[hepatitis_target].replace({1:'Yes', 2:'No'})

In [18]:
# Display the recoded frame to check that the target column now holds 'Yes'/'No'.
hepatitis_h2o

Unnamed: 0,Target,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,Yes
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,Yes
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,Yes
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,Yes
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1,46,1,2.0,2,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,,242.0,3.3,50.0,No
151,2,44,1,2.0,2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,126.0,142.0,4.3,,No
152,2,61,1,1.0,2,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,75.0,20.0,4.1,,No
153,2,53,2,1.0,2,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.5,81.0,19.0,4.1,48.0,No


In [19]:
# Start a local H2O cluster, or attach to one that is already running.
# NOTE(review): per the H2O docs nthreads = -1 means "use all cores" and an
# integer max_mem_size is read as gigabytes — confirm for the installed version.
h2o.init(nthreads = -1, max_mem_size = 8)
# Explicit connect after init; the captured output shows the cluster status
# table printed a second time by this call.
h2o.connect()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 58 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.465 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Connecting to H2O server at http://localhost:54321 ... successful.


0,1
H2O_cluster_uptime:,1 hour 58 mins
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_liug28_t7h1w0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.465 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


<H2OConnection to http://localhost:54321, no session>

In [20]:
# df = h2o.import_file(heart_data_dir)
# Upload the recoded pandas frame to the H2O cluster as an H2OFrame.
df = h2o.H2OFrame(hepatitis_h2o)
# X_train, X_test, y_train, y_test = train_test_split(x_heart, y_heart, random_state=np.random.RandomState(0), test_size=0.2, stratify=y_heart)
# train_data = pd.concat([X_train, y_train], axis=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [21]:
# Per-column summary (type, min/mean/max, missing counts) of the uploaded frame;
# used here to confirm that the target column was parsed as an enum.
df.describe()

Rows:155
Cols:20




Unnamed: 0,Target,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
type,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,int,int,real,int,enum
mins,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.3,26.0,14.0,2.1,0.0,
mean,1.7935483870967739,41.2,1.1032258064516132,1.5064935064935068,1.8451612903225802,1.3506493506493507,1.6038961038961037,1.7922077922077924,1.8275862068965514,1.5833333333333328,1.8,1.66,1.866666666666666,1.8799999999999994,1.42751677852349,105.32539682539687,85.89403973509934,3.8172661870503592,61.85227272727276,
maxs,2.0,78.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,295.0,648.0,6.4,100.0,
sigma,0.4060704019560946,12.565878349773197,0.3052398152015035,0.5015890207223368,0.36292347381022383,0.4787301767423468,0.4906822137460068,0.40705100249439874,0.3790490217894517,0.4947274449181536,0.40134003725439077,0.4752957397962414,0.34107344692436553,0.3260501875972696,1.2121490330948559,51.50810877756305,89.65088973850219,0.6515230826162131,22.87524374481145,
zeros,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
missing,0,0,0,1,0,1,1,1,10,11,5,5,5,5,6,29,4,16,67,0
0,2.0,30.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,Yes
1,2.0,50.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,Yes
2,2.0,78.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,Yes


In [22]:
# AutoML search limited to 10 models, ranked and early-stopped on AUC, with
# deep learning, stacked ensembles and GLM excluded from the candidate algorithms.
h2o_estimator=H2OAutoML(
    max_models = 10,
    seed = 1,
    exclude_algos=["DeepLearning", "StackedEnsemble", "GLM"],
    sort_metric = "AUC",
    stopping_metric="AUC",
)

In [23]:
# Hold out part of the frame for the final evaluation of the AutoML leader.
train_data, test_data=df.split_frame(ratios=[0.8],seed=0)
# Build the predictor list explicitly. list.remove() mutates in place and
# returns None, so the previous `list(df.columns).remove(...)` passed x=None and
# only worked because H2O then falls back to "all columns except y".
h2o_estimator.train(
    x=[col for col in df.columns if col != hepatitis_target],
    y=hepatitis_target,
    training_frame=train_data,
)

AutoML progress: |██████████████
01:13:31.881: Skipping training of model GBM_5_AutoML_20200601_011323 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200601_011323.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 131.0.


██████████████████████████████████████████| 100%


In [24]:
# Ranking of the trained AutoML models (sorted by the AUC sort metric set above).
h2o_estimator.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_1_AutoML_20200601_011323,0.701701,0.636462,0.727809,0.349669,0.46908,0.220036
XRT_1_AutoML_20200601_011323,0.697803,0.639877,0.71403,0.341049,0.466449,0.217574
DRF_1_AutoML_20200601_011323,0.695087,0.649672,0.726582,0.4171,0.473747,0.224436
GBM_2_AutoML_20200601_011323,0.6906,0.637905,0.665181,0.322272,0.466413,0.217541
GBM_3_AutoML_20200601_011323,0.671469,0.675034,0.657587,0.358526,0.479359,0.229785
GBM_4_AutoML_20200601_011323,0.663911,0.665815,0.669451,0.380846,0.481274,0.231625
GBM_1_AutoML_20200601_011323,0.642182,0.745072,0.661783,0.379074,0.508275,0.258344
XGBoost_3_AutoML_20200601_011323,0.635333,0.740492,0.657065,0.415328,0.504636,0.254658
XGBoost_2_AutoML_20200601_011323,0.506967,0.686664,0.555709,0.5,0.496754,0.246764
XGBoost_grid__1_AutoML_20200601_011323_model_1,0.477917,0.686897,0.54254,0.5,0.49687,0.246879




In [25]:
# Full report for the top-ranked model: training and cross-validation metrics,
# scoring history and variable importances.
h2o_estimator.leader

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_1_AutoML_20200601_011323


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,30.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.16387562093512487
RMSE: 0.4048155393943331
LogLoss: 0.5010034452932738
Mean Per-Class Error: 0.21126594237128016
AUC: 0.8424657534246575
AUCPR: 0.8605769119137593
Gini: 0.6849315068493149

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5056674480438232: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,41.0,17.0,0.2931,(17.0/58.0)
1,Yes,11.0,62.0,0.1507,(11.0/73.0)
2,Total,52.0,79.0,0.2137,(28.0/131.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.505667,0.815789,76.0
1,max f2,0.371365,0.888325,97.0
2,max f0point5,0.641835,0.825243,57.0
3,max accuracy,0.587488,0.78626,65.0
4,max precision,0.94537,1.0,0.0
5,max recall,0.204182,1.0,116.0
6,max specificity,0.94537,1.0,0.0
7,max absolute_mcc,0.587488,0.57382,65.0
8,max min_per_class_accuracy,0.581083,0.780822,67.0
9,max mean_per_class_accuracy,0.587488,0.788734,65.0



Gains/Lift Table: Avg response rate: 55.73 %, avg score: 57.13 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015267,0.936884,1.794521,1.794521,1.0,0.941265,1.0,0.941265,0.027397,0.027397,79.452055,79.452055
1,,2,0.022901,0.91937,1.794521,1.794521,1.0,0.93624,1.0,0.93959,0.013699,0.041096,79.452055,79.452055
2,,3,0.030534,0.907265,1.794521,1.794521,1.0,0.908123,1.0,0.931723,0.013699,0.054795,79.452055,79.452055
3,,4,0.045802,0.902073,1.794521,1.794521,1.0,0.904717,1.0,0.922721,0.027397,0.082192,79.452055,79.452055
4,,5,0.053435,0.895572,1.794521,1.794521,1.0,0.901307,1.0,0.919662,0.013699,0.09589,79.452055,79.452055
5,,6,0.10687,0.847806,1.53816,1.666341,0.857143,0.865986,0.928571,0.892824,0.082192,0.178082,53.816047,66.634051
6,,7,0.152672,0.836751,1.495434,1.615068,0.833333,0.842699,0.9,0.877787,0.068493,0.246575,49.543379,61.506849
7,,8,0.206107,0.799915,1.53816,1.595129,0.857143,0.820102,0.888889,0.862831,0.082192,0.328767,53.816047,59.512938
8,,9,0.305344,0.741108,1.51844,1.570205,0.846154,0.770573,0.875,0.832847,0.150685,0.479452,51.844046,57.020548
9,,10,0.40458,0.662435,1.51844,1.557508,0.846154,0.695572,0.867925,0.799176,0.150685,0.630137,51.844046,55.75084




ModelMetricsBinomial: xgboost
** Reported on cross-validation data. **

MSE: 0.2200357136748244
RMSE: 0.46907964534269064
LogLoss: 0.6364619109930958
Mean Per-Class Error: 0.3350259801606046
AUC: 0.7017005196032121
AUCPR: 0.7278091044916308
Gini: 0.40340103920642423

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.38474637269973755: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,23.0,35.0,0.6034,(35.0/58.0)
1,Yes,7.0,66.0,0.0959,(7.0/73.0)
2,Total,30.0,101.0,0.3206,(42.0/131.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.384746,0.758621,99.0
1,max f2,0.128395,0.866983,127.0
2,max f0point5,0.562759,0.703812,65.0
3,max accuracy,0.447894,0.679389,89.0
4,max precision,0.947108,1.0,0.0
5,max recall,0.128395,1.0,127.0
6,max specificity,0.947108,1.0,0.0
7,max absolute_mcc,0.384746,0.355412,99.0
8,max min_per_class_accuracy,0.562759,0.657534,65.0
9,max mean_per_class_accuracy,0.562759,0.664974,65.0



Gains/Lift Table: Avg response rate: 55.73 %, avg score: 57.92 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.015267,0.94148,0.89726,0.89726,0.5,0.944723,0.5,0.944723,0.013699,0.013699,-10.273973,-10.273973
1,,2,0.022901,0.933067,1.794521,1.196347,1.0,0.939476,0.666667,0.942974,0.013699,0.027397,79.452055,19.634703
2,,3,0.030534,0.925871,1.794521,1.34589,1.0,0.928794,0.75,0.939429,0.013699,0.041096,79.452055,34.589041
3,,4,0.045802,0.916306,1.794521,1.495434,1.0,0.921253,0.833333,0.93337,0.027397,0.068493,79.452055,49.543379
4,,5,0.053435,0.911374,1.794521,1.53816,1.0,0.913697,0.857143,0.93056,0.013699,0.082192,79.452055,53.816047
5,,6,0.10687,0.878055,1.2818,1.40998,0.714286,0.89342,0.785714,0.91199,0.068493,0.150685,28.180039,40.998043
6,,7,0.152672,0.834256,1.495434,1.435616,0.833333,0.848299,0.8,0.892883,0.068493,0.219178,49.543379,43.561644
7,,8,0.206107,0.817322,1.2818,1.395738,0.714286,0.824207,0.777778,0.875078,0.068493,0.287671,28.180039,39.57382
8,,9,0.305344,0.733939,1.3804,1.390753,0.769231,0.781832,0.775,0.844773,0.136986,0.424658,38.040042,39.075342
9,,10,0.40458,0.687498,0.96628,1.286637,0.538462,0.706153,0.716981,0.810772,0.09589,0.520548,-3.37197,28.663737




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.71794873,0.077455424,0.6666667,0.61538464,0.7307692,0.8076923,0.7692308
1,auc,0.7056377,0.08100134,0.7032967,0.63095236,0.6242424,0.8121212,0.75757575
2,aucpr,0.7496592,0.10393783,0.7713862,0.62165415,0.6785056,0.89105856,0.78569144
3,err,0.2820513,0.077455424,0.33333334,0.3846154,0.26923078,0.1923077,0.23076923
4,err_count,7.4,2.0736442,9.0,10.0,7.0,5.0,6.0
5,f0point5,0.7475533,0.11010397,0.6632653,0.6372549,0.7281553,0.90909094,0.8
6,f1,0.775178,0.03984631,0.74285716,0.7222222,0.8108108,0.8,0.8
7,f2,0.8212818,0.07294553,0.84415585,0.8333333,0.91463417,0.71428573,0.8
8,lift_top_group,1.4257143,0.80146974,1.9285715,0.0,1.7333333,1.7333333,1.7333333
9,logloss,0.6362958,0.10129944,0.65805686,0.7927666,0.6312716,0.526627,0.572757



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-06-01 01:13:24,0.245 sec,0.0,0.5,0.693147,0.5,0.557252,1.0,0.442748
1,,2020-06-01 01:13:24,0.253 sec,5.0,0.44393,0.582748,0.78838,0.793278,1.196347,0.282443
2,,2020-06-01 01:13:24,0.259 sec,10.0,0.42192,0.536763,0.809754,0.827852,1.794521,0.244275
3,,2020-06-01 01:13:24,0.266 sec,15.0,0.419,0.529678,0.813533,0.839407,1.794521,0.221374
4,,2020-06-01 01:13:24,0.273 sec,20.0,0.411783,0.515593,0.827586,0.853823,1.794521,0.221374
5,,2020-06-01 01:13:24,0.280 sec,25.0,0.40896,0.508791,0.833727,0.85947,1.794521,0.21374
6,,2020-06-01 01:13:24,0.287 sec,30.0,0.404816,0.501003,0.842466,0.860577,1.794521,0.21374



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,SGOT,25.11747,1.0,0.216161
1,ALBUMIN,19.07663,0.759496,0.164174
2,SPIDERS,16.770082,0.667666,0.144324
3,AGE,15.984846,0.636404,0.137566
4,PROTIME,13.862223,0.551896,0.119299
5,BILIRUBIN,9.339496,0.371833,0.080376
6,ALKPHOSPHATE,9.144035,0.364051,0.078694
7,LIVERFIRM,5.322676,0.211911,0.045807
8,FATIGUE,1.580271,0.062915,0.0136




In [26]:
# Parameter dictionary of the leader model; each entry carries the 'default'
# and the 'actual' value of one parameter.
h2o_paras_keys = h2o_estimator.leader.params

In [27]:
# Score the leader on the held-out frame produced by the earlier split_frame call.
result=h2o_estimator.leader.model_performance(test_data)

In [28]:
# Print the test-set metrics report (AUC, confusion matrix, gains/lift table).
print (result)


ModelMetricsBinomial: xgboost
** Reported on test data. **

MSE: 0.21473610137989707
RMSE: 0.4633962681980694
LogLoss: 0.613313518718101
Mean Per-Class Error: 0.25
AUC: 0.7222222222222222
AUCPR: 0.7432667667078361
Gini: 0.4444444444444444

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.48888230323791504: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,7.0,5.0,0.4167,(5.0/12.0)
1,Yes,1.0,11.0,0.0833,(1.0/12.0)
2,Total,8.0,16.0,0.25,(6.0/24.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.488882,0.785714,15.0
1,max f2,0.345173,0.869565,20.0
2,max f0point5,0.488882,0.723684,15.0
3,max accuracy,0.488882,0.75,15.0
4,max precision,0.934542,1.0,0.0
5,max recall,0.345173,1.0,20.0
6,max specificity,0.934542,1.0,0.0
7,max absolute_mcc,0.488882,0.53033,15.0
8,max min_per_class_accuracy,0.600489,0.583333,10.0
9,max mean_per_class_accuracy,0.488882,0.75,15.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 56.39 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.041667,0.91215,2.0,2.0,1.0,0.934542,1.0,0.934542,0.083333,0.083333,100.0,100.0
1,,2,0.041667,0.889758,0.0,2.0,0.0,0.0,1.0,0.934542,0.0,0.083333,-100.0,100.0
2,,3,0.041667,0.867366,0.0,2.0,0.0,0.0,1.0,0.934542,0.0,0.083333,-100.0,100.0
3,,4,0.041667,0.844974,0.0,2.0,0.0,0.0,1.0,0.934542,0.0,0.083333,-100.0,100.0
4,,5,0.083333,0.828981,2.0,2.0,1.0,0.837185,1.0,0.885864,0.083333,0.166667,100.0,100.0
5,,6,0.125,0.779895,2.0,2.0,1.0,0.782489,1.0,0.851405,0.083333,0.25,100.0,100.0
6,,7,0.166667,0.764711,2.0,2.0,1.0,0.773844,1.0,0.832015,0.083333,0.333333,100.0,100.0
7,,8,0.208333,0.74738,0.0,1.6,0.0,0.75355,0.8,0.816322,0.0,0.333333,-100.0,60.0
8,,9,0.291667,0.649137,0.0,1.142857,0.0,0.723563,0.571429,0.789819,0.0,0.333333,-100.0,14.285714
9,,10,0.416667,0.614134,1.333333,1.2,0.666667,0.62662,0.6,0.740859,0.166667,0.5,33.333333,20.0






In [29]:
# Inspect the leader's parameters; the values used to re-create the model by
# hand in the validation cell below are read off this dictionary.
h2o_paras_keys

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'XGBoost_1_AutoML_20200601_011323',
   'type': 'Key<Model>',
   'URL': '/3/Models/XGBoost_1_AutoML_20200601_011323'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_py_2_sid_a5f7',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_py_2_sid_a5f7'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'fold_assignment': {'default': 'AUTO', 'actual': 'Modulo'},
 'fold_column

In [30]:
from h2o.estimators.xgboost import H2OXGBoostEstimator

def h2o_perform_validation(paras, df, x_columns, y_columns, n_validation, test_size=0.2):
    """Estimate the test AUC of an H2O XGBoost model over repeated random splits.

    For every ``i`` in ``range(n_validation)`` a fresh ``H2OXGBoostEstimator``
    is built from ``paras``, ``df`` is split with ``split_frame`` (seeded with
    ``i``), the model is trained on the first part and its ``'AUC'`` metric is
    read from ``model_performance`` on the second part.

    Parameters
    ----------
    paras : dict
        Keyword arguments forwarded to ``H2OXGBoostEstimator``.
    df : H2OFrame
        Frame containing both predictors and target.
    x_columns : list of str or None
        Predictor column names passed as ``x`` to ``train``.
    y_columns : str
        Target column name passed as ``y`` to ``train``.
    n_validation : int
        Number of repetitions (seeds ``0 .. n_validation - 1``).
    test_size : float, default 0.2
        Requested fraction of rows to hold out. The training ratio passed to
        ``split_frame`` is ``1 - test_size`` (0.8 for the default).

    Returns
    -------
    list
        One AUC value per repetition, in seed order.
    """
    # Previously the ratio was hard-coded to 0.8 and test_size was ignored.
    train_ratio = 1 - test_size
    aucs=[]
    for i in range (n_validation):
        model=H2OXGBoostEstimator(**paras)
        train_data, test_data=df.split_frame(ratios=[train_ratio],seed=i)
        model.train(x=x_columns, y=y_columns, training_frame=train_data)
        aucs.append(model.model_performance(test_data)['AUC'])
    return aucs

In [31]:
# not learning rate
# hepatitis_h2o_clf = xgb.XGBClassifier(booster='gbtree',
#                                  colsample_bylevel=0.8, 
#                                  colsample_bytree=0.8, 
#                                  gamma=0.0,
#                                  max_delta_step=0,
#                                  max_depth=10, 
#                                  min_child_weight=5.0, 
#                                  n_estimators=30,
#                                  nthread=-1, 
#                                  reg_alpha=0.0,
#                                  reg_lambda=1.0,
#                                  subsample=0.6)
# Predictors are every column except the target. list.remove() returns None,
# so the previous `list(df.columns).remove(...)` left x_columns set to None.
x_columns = [col for col in df.columns if col != hepatitis_target]
y_columns = hepatitis_target

# Parameters for re-creating the H2O XGBoost model by hand.
# NOTE(review): presumably copied from the AutoML leader's 'actual' parameters;
# several keys look like alias pairs (min_rows/min_child_weight,
# sample_rate/subsample, col_sample_rate/colsample_bylevel,
# col_sample_rate_per_tree/colsample_bytree) — confirm against the H2O docs.
xgboost_paras = {'nfolds': 5,
                 'keep_cross_validation_models': False,
                 'keep_cross_validation_predictions':True,
                 'fold_assignment':'Modulo',
                 'stopping_metric': 'AUC',
                 'stopping_tolerance': 0.05,
                 'seed': 1,
                 'distribution': 'bernoulli',
                 'ntrees': 30,
                 'max_depth': 10,
                 'min_rows': 5.0,
                 'min_child_weight': 5.0,
                 'sample_rate': 0.6,
                 'subsample': 0.6,
                 'col_sample_rate': 0.8,
                 'colsample_bylevel': 0.8,
                 'col_sample_rate_per_tree': 0.8,
                 'colsample_bytree': 0.8,
                 'score_tree_interval': 5}

# 100 repeated train/test splits, matching the protocol of the other baselines.
h2o_aucs = h2o_perform_validation(xgboost_paras, df, x_columns, y_columns, 100)

xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress: |███████████████████████████████████████████| 100%
xgboost Model Build progress

In [32]:
# Mean AUC, then its standard error, one per line.
print (mean(h2o_aucs), sem(h2o_aucs), sep='\n')

0.6795515925499361
0.009635190307924956
