In [1]:
# see also: 
#   http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
#   https://dzone.com/articles/h2o-automl-examples-in-python-and-scala

In [2]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,4 mins 52 secs
H2O cluster timezone:,Europe/Berlin
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,2 months and 1 day
H2O cluster name:,lars
H2O cluster total nodes:,1
H2O cluster free memory:,4.000 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [3]:
df = h2o.import_file("https://raw.githubusercontent.com/choas/h2o-titanic/master/data/train.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
nrows_prev = df.nrows
not_empty_embarked = ~df["Embarked"].isna()
df_e = df[not_empty_embarked]
not_empty_age = df_e["Age"] >= 0
df = df_e[not_empty_age]
print("%d rows removed" % (nrows_prev - df.nrows))

179 rows removed


In [5]:
# filter unknown Age and split 70% train and 30% test
train, test = df[df["Age"] >= 0].split_frame(ratios=[.7])

In [6]:
# Identify predictors and response
x = train.columns
y = "Survived"
x.remove(y)

# remove some columns
x.remove("PassengerId")
x.remove("Name")
x.remove("Ticket")
x.remove("Cabin")

In [7]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

train["Pclass"] = train["Pclass"].asfactor()
test["Pclass"] = test["Pclass"].asfactor()

In [8]:
train.summary()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
type,int,enum,enum,string,enum,real,int,int,int,real,enum,enum
mins,1.0,,,,,0.67,0.0,0.0,695.0,0.0,,
mean,444.157142857,,,,,29.4974489796,0.532653061224,0.428571428571,303082.283333,32.3219722449,,
maxs,891.0,,,,,74.0,5.0,6.0,3101298.0,512.3292,,
sigma,258.145069866,,,,,14.0656996182,0.967759995577,0.899897744702,581908.463656,52.0862930438,,
zeros,0,,,0,,0,323,365,0,5,,
missing,0,0,0,0,0,0,0,0,130,0,383,0
0,1.0,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,,7.25,,S
1,8.0,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,349909.0,21.075,,S
2,9.0,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0.0,2.0,347742.0,11.1333,,S


In [9]:
# Run AutoML for 60 seconds
aml = H2OAutoML(max_runtime_secs = 60)
aml.train(x = x, y = y, training_frame = train, validation_frame = test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


## View the AutoML Leaderboard

In [10]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_grid__1_AutoML_20200217_190131_model_7,0.864299,0.469127,0.795421,0.207729,0.383625,0.147168
XGBoost_grid__1_AutoML_20200217_190131_model_5,0.864219,0.481677,0.802079,0.210038,0.39009,0.15217
XGBoost_3_AutoML_20200217_190131,0.861369,0.428127,0.815677,0.189258,0.365466,0.133565
GBM_2_AutoML_20200217_190131,0.857657,0.435284,0.815037,0.186026,0.366208,0.134108
XGBoost_1_AutoML_20200217_190131,0.857515,0.434905,0.822706,0.210003,0.370668,0.137395
GBM_grid__1_AutoML_20200217_190131_model_10,0.856955,0.433823,0.819815,0.196895,0.367663,0.135176
GBM_grid__1_AutoML_20200217_190131_model_19,0.856511,0.436855,0.811137,0.195226,0.368773,0.135994
StackedEnsemble_BestOfFamily_AutoML_20200217_190131,0.854415,0.434793,0.813097,0.188193,0.36844,0.135748
GBM_grid__1_AutoML_20200217_190131_model_31,0.853581,0.539082,0.794133,0.188743,0.420271,0.176627
XGBoost_grid__1_AutoML_20200217_190131_model_2,0.852861,0.437903,0.812019,0.196878,0.36937,0.136434




In [11]:
# To generate predictions on a test set, use the leader model object directly:
preds = aml.leader.predict(test)

xgboost prediction progress: |████████████████████████████████████████████| 100%


In [12]:
preds

predict,p0,p1
1,0.235575,0.764425
1,0.443806,0.556194
1,0.235575,0.764425
0,0.82347,0.17653
0,0.708744,0.291256
1,0.576541,0.423459
1,0.447101,0.552899
1,0.244355,0.755645
1,0.587614,0.412386
1,0.491879,0.508121




### save the model

In [13]:
model_id = aml.leader.model_id
aml.download_mojo(genmodel_name=model_id, path="./"+model_id+".zip")

u'/Users/lars/h2o/XGBoost_grid__1_AutoML_20200217_190131_model_7.zip'

### show parameters

In [14]:
aml.leader.params

{u'backend': {u'actual': u'auto', u'default': u'auto'},
 u'booster': {u'actual': u'gbtree', u'default': u'gbtree'},
 u'calibrate_model': {u'actual': False, u'default': False},
 u'calibration_frame': {u'actual': None, u'default': None},
 u'categorical_encoding': {u'actual': u'AUTO', u'default': u'AUTO'},
 u'checkpoint': {u'actual': None, u'default': None},
 u'col_sample_rate': {u'actual': 1.0, u'default': 1.0},
 u'col_sample_rate_per_tree': {u'actual': 0.9, u'default': 1.0},
 u'colsample_bylevel': {u'actual': 1.0, u'default': 1.0},
 u'colsample_bytree': {u'actual': 1.0, u'default': 1.0},
 u'distribution': {u'actual': u'bernoulli', u'default': u'AUTO'},
 u'dmatrix_type': {u'actual': u'auto', u'default': u'auto'},
 u'eta': {u'actual': 0.3, u'default': 0.3},
 u'export_checkpoints_dir': {u'actual': None, u'default': None},
 u'fold_assignment': {u'actual': u'Modulo', u'default': u'AUTO'},
 u'fold_column': {u'actual': None, u'default': None},
 u'gamma': {u'actual': 0.0, u'default': 0.0},
 u'g