In [1]:
# see also: 
#   http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
#   https://dzone.com/articles/h2o-automl-examples-in-python-and-scala

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Connect to the H2O cluster at the default local address (the output below
# shows http://localhost:54321).
# NOTE(review): h2o.init() presumably starts a local cluster when none is
# already running -- confirm against the H2O documentation.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,2 days 18 hours 0 mins
H2O cluster timezone:,Europe/Berlin
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,2 months
H2O cluster name:,lars
H2O cluster total nodes:,1
H2O cluster free memory:,3.327 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [3]:
# Location of the Titanic training data (CSV), kept in one named place so the
# source of the data is easy to spot and change.
TITANIC_TRAIN_URL = "https://raw.githubusercontent.com/choas/h2o-titanic/master/data/train.csv"

# Parse the remote CSV into an H2OFrame on the cluster.
df = h2o.import_file(TITANIC_TRAIN_URL)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Keep only rows whose "Embarked" value is present.
not_empty_embarked = ~df["Embarked"].isna()
df_e = df[not_empty_embarked]
# Keep only rows with a known "Age": the `>= 0` comparison is used as the
# not-missing test.
# NOTE(review): this relies on H2O dropping rows where the comparison result
# is NA -- confirm; `~df_e["Age"].isna()` would state the intent explicitly.
not_empty_age = df_e["Age"] >= 0
# `df` is rebound to the filtered frame, so the raw import is no longer
# reachable under this name once the cell has run.
df = df_e[not_empty_age]

In [5]:
# Split 70% train / 30% test.
# Rows with an unknown Age were already removed from `df` in the previous
# cell, so the frame is split as-is instead of being filtered a second time.
# A fixed seed keeps the split (and therefore every downstream metric)
# the same across "Restart & Run All".
train, test = df.split_frame(ratios=[.7], seed=42)

In [6]:
# Response column for the classification task.
y = "Survived"

# Start from every column of the training frame, then strip the response and
# the columns that are not used as predictors. list.remove() raises
# ValueError if a name is absent, which surfaces a schema change early.
x = train.columns
for dropped in (y, "PassengerId", "Name", "Ticket", "Cabin"):
    x.remove(dropped)

In [7]:
# For binary classification the response must be a factor (categorical);
# "Pclass" is converted to a factor as well. The same conversion is applied
# to both splits so that train and test keep matching column types.
for col in (y, "Pclass"):
    for frame in (train, test):
        frame[col] = frame[col].asfactor()

In [8]:
# Show per-column types and summary statistics (min, mean, max, sigma, zeros,
# missing) followed by the first rows of the training frame.
train.summary()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
type,int,enum,enum,string,enum,real,int,int,int,real,enum,enum
mins,4.0,,,,,0.75,0.0,0.0,695.0,0.0,,
mean,452.536144578,,,,,29.5978915663,0.512048192771,0.423694779116,288144.136612,31.4853323293,,
maxs,891.0,,,,,71.0,5.0,6.0,3101298.0,512.3292,,
sigma,259.474417891,,,,,14.0779225596,0.934933164359,0.845868645717,559549.744281,42.9521791579,,
zeros,0,,,0,,0,329,366,0,3,,
missing,0,0,0,0,0,0,0,0,132,0,374,0
0,4.0,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803.0,53.1,C123,S
1,8.0,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,349909.0,21.075,,S
2,10.0,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1.0,0.0,237736.0,30.0708,,C


In [9]:
# Run AutoML for at most 60 seconds.
# A fixed seed makes the run repeatable as far as H2O allows; with a time
# budget (max_runtime_secs) the set of models that gets built can still vary
# between runs -- use max_models instead when strict reproducibility matters.
aml = H2OAutoML(max_runtime_secs = 60, seed = 42)

# `test` is deliberately NOT passed as validation_frame: with the default
# cross-validation (nfolds > 1) AutoML ignores that argument, and if nfolds
# were ever set to 0 the held-out rows would drive early stopping and leak
# into the very models that are evaluated on them afterwards.
aml.train(x = x, y = y, training_frame = train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


## View the AutoML Leaderboard

In [10]:
# Fetch the leaderboard of trained models and display all of its rows
# (head() is given the frame's full row count instead of its default).
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XRT_1_AutoML_20200217_083015,0.863107,0.424974,0.825488,0.187793,0.359296,0.129094
DRF_1_AutoML_20200217_083015,0.860005,0.437755,0.802732,0.205945,0.370342,0.137153
GBM_4_AutoML_20200217_083015,0.859552,0.429859,0.816084,0.196089,0.36767,0.135181
GBM_3_AutoML_20200217_083015,0.856406,0.431062,0.803918,0.187401,0.365781,0.133796
GBM_1_AutoML_20200217_083015,0.855508,0.441215,0.81531,0.211496,0.371834,0.138261
XGBoost_grid__1_AutoML_20200217_083015_model_4,0.853783,0.450008,0.810371,0.217003,0.376146,0.141486
XGBoost_grid__1_AutoML_20200217_083015_model_2,0.852389,0.433726,0.80963,0.204768,0.366508,0.134328
GBM_2_AutoML_20200217_083015,0.851273,0.438832,0.806479,0.199636,0.369894,0.136821
StackedEnsemble_BestOfFamily_AutoML_20200217_083015,0.851212,0.424679,0.817741,0.195313,0.362109,0.131123
GBM_grid__1_AutoML_20200217_083015_model_14,0.849705,0.485149,0.815824,0.194128,0.390635,0.152596




In [11]:
# To generate predictions on a test set, use the leader model object directly.
# The resulting frame holds the predicted class plus one probability column
# per class (shown as predict, p0, p1 when the frame is displayed).
preds = aml.leader.predict(test)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [12]:
# Display the prediction frame (columns: predict, p0, p1).
preds

predict,p0,p1
0,0.784366,0.215634
1,0.00252404,0.997476
1,0.48098,0.51902
0,0.947306,0.0526941
0,0.919564,0.0804362
1,0.326633,0.673367
0,0.967906,0.0320941
1,0.247187,0.752813
1,0.348063,0.651937
1,0.568073,0.431927




### Save the model

In [13]:
# Export the leader model as a MOJO zip archive named after its model id.
# `genmodel_name` is omitted on purpose: it only names the h2o-genmodel.jar
# that is downloaded when get_genmodel_jar=True, so passing the model id there
# had no effect. The archive itself is written to `path`, and the call
# returns the location of the saved file.
model_id = aml.leader.model_id
aml.download_mojo(path="~/h2o/titanic/" + model_id + ".zip")

u'/Users/lars/h2o/titanic/XRT_1_AutoML_20200217_083015.zip'

### Show parameters

In [14]:
# Show the leader model's parameters; each entry lists the value actually
# used ('actual') next to the algorithm's default ('default').
aml.leader.params

{u'balance_classes': {u'actual': False, u'default': False},
 u'binomial_double_trees': {u'actual': False, u'default': False},
 u'build_tree_one_node': {u'actual': False, u'default': False},
 u'calibrate_model': {u'actual': False, u'default': False},
 u'calibration_frame': {u'actual': None, u'default': None},
 u'categorical_encoding': {u'actual': u'AUTO', u'default': u'AUTO'},
 u'check_constant_response': {u'actual': True, u'default': True},
 u'checkpoint': {u'actual': None, u'default': None},
 u'class_sampling_factors': {u'actual': None, u'default': None},
 u'col_sample_rate_change_per_level': {u'actual': 1.0, u'default': 1.0},
 u'col_sample_rate_per_tree': {u'actual': 1.0, u'default': 1.0},
 u'custom_metric_func': {u'actual': None, u'default': None},
 u'distribution': {u'actual': u'multinomial', u'default': u'AUTO'},
 u'export_checkpoints_dir': {u'actual': None, u'default': None},
 u'fold_assignment': {u'actual': u'Modulo', u'default': u'AUTO'},
 u'fold_column': {u'actual': None, u'de