In [1]:
### https://dzone.com/articles/h2o-automl-examples-in-python-and-scala

### see also: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Connect to the H2O cluster; the cell output reports the instance address
# that was checked and a table of cluster details (version, memory, cores).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,2 days 7 hours 6 mins
H2O cluster timezone:,Europe/Berlin
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,2 months
H2O cluster name:,lars
H2O cluster total nodes:,1
H2O cluster free memory:,3.351 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [3]:
# Load the Titanic training data into an H2OFrame directly from GitHub.
# A local copy can be used instead, e.g. h2o.import_file("./dai/data/train.csv").
TITANIC_TRAIN_URL = "https://raw.githubusercontent.com/choas/h2o-titanic/master/data/train.csv"
df = h2o.import_file(TITANIC_TRAIN_URL)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Keep only the rows whose Embarked value is present.
not_empty_embarked = ~df["Embarked"].isna()
df_e = df[not_empty_embarked]
# Keep only the rows with a known Age. The comparison doubles as a
# missing-value filter (train.summary() later reports 0 missing Age values);
# presumably NA >= 0 does not evaluate to true in H2O -- TODO confirm.
not_empty_age = df_e["Age"] >= 0
# NOTE: `df` is rebound here to the filtered frame; the unfiltered import is
# no longer reachable under this name.
df = df_e[not_empty_age]

In [5]:
# Split roughly 70% train / 30% test (H2O splits probabilistically).
# Rows with an unknown Age were already removed from `df` in the previous
# cell, so the frame is split as-is. The fixed seed makes the random split
# repeatable across kernel restarts.
train, test = df.split_frame(ratios=[.7], seed=42)

In [6]:
# Response column and the initial list of candidate predictors.
y = "Survived"
x = train.columns
x.remove(y)

# Drop the columns that should not be used as predictors.
# SibSp, Parch and Fare are deliberately kept.
for dropped in ("PassengerId", "Name", "Ticket", "Cabin"):
    x.remove(dropped)

In [7]:
# Binary classification requires the response to be a factor; Pclass is
# likewise converted to a factor in both frames.
for column in (y, "Pclass"):
    train[column] = train[column].asfactor()
    test[column] = test[column].asfactor()

In [8]:
# Show per-column type, min/mean/max, sigma, zero and missing counts, plus
# the first rows of the training frame.
train.summary()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
type,int,enum,enum,string,enum,real,int,int,int,real,enum,enum
mins,1.0,,,,,0.42,0.0,0.0,693.0,0.0,,
mean,448.439121756,,,,,29.4833732535,0.463073852295,0.433133732535,283303.121212,35.0982197605,,
maxs,891.0,,,,,80.0,5.0,6.0,3101298.0,512.3292,,
sigma,257.790990507,,,,,14.3468589593,0.851547845124,0.854412055205,541379.835055,55.4385984067,,
zeros,0,,,0,,0,337,362,0,6,,
missing,0,0,0,0,0,0,0,0,138,0,361,0
0,1.0,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,,7.25,,S
1,2.0,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1.0,0.0,,71.2833,C85,C
2,3.0,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,,7.925,,S


In [9]:
# Run AutoML for at most 60 seconds.
# To restrict the search to algorithms that PrintMojo can handle, pass e.g.
# include_algos=['GBM', 'DRF'] (or rule out the others via exclude_algos).
# The fixed seed makes runs more repeatable; exact reproducibility is not
# guaranteed while the budget is time-based (max_runtime_secs).
aml = H2OAutoML(max_runtime_secs=60, seed=42)

# NOTE(review): per the H2O AutoML docs, validation_frame is ignored unless
# nfolds == 0 (cross-validation is used by default) -- confirm whether
# leaderboard_frame=test was intended instead.
aml.train(x=x, y=y, training_frame=train, validation_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


## View the AutoML Leaderboard

In [10]:
# Fetch the leaderboard of the models trained by AutoML.
lb = aml.leaderboard
# Ask head() for as many rows as the leaderboard contains.
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_grid__1_AutoML_20200216_213624_model_18,0.849969,0.463469,0.838926,0.2232,0.384125,0.147552
StackedEnsemble_BestOfFamily_AutoML_20200216_213624,0.847776,0.457051,0.823026,0.212466,0.381686,0.145684
GBM_grid__1_AutoML_20200216_213624_model_2,0.847752,0.483548,0.816236,0.204788,0.388079,0.150606
StackedEnsemble_AllModels_AutoML_20200216_213624,0.847418,0.454522,0.822598,0.20469,0.379324,0.143886
XGBoost_grid__1_AutoML_20200216_213624_model_4,0.845388,0.462992,0.829228,0.204201,0.385027,0.148246
XGBoost_3_AutoML_20200216_213624,0.84481,0.457056,0.82857,0.198675,0.381886,0.145837
XRT_1_AutoML_20200216_213624,0.844475,0.47616,0.820592,0.21743,0.388548,0.150969
XGBoost_grid__1_AutoML_20200216_213624_model_6,0.844394,0.458512,0.832743,0.199286,0.383105,0.14677
XGBoost_grid__1_AutoML_20200216_213624_model_1,0.844166,0.459965,0.836996,0.212979,0.384147,0.147569
GBM_1_AutoML_20200216_213624,0.844052,0.474076,0.838709,0.20579,0.389804,0.151947




In [11]:
# Generate predictions on the test set with the leader model.
# Calling aml.predict(test) on the H2OAutoML object is an equivalent
# alternative, so the prediction only needs to be computed once.
preds = aml.leader.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [12]:
# Display the predictions frame: the predicted class plus the p0 / p1
# probability columns shown in the output.
preds

predict,p0,p1
0,0.827919,0.172081
1,0.100823,0.899177
0,0.81723,0.18277
1,0.424181,0.575819
1,0.18573,0.81427
1,0.69026,0.30974
0,0.694293,0.305707
0,0.835487,0.164513
1,0.669161,0.330839
1,0.152046,0.847954




In [13]:
# Save the leader model as a MOJO zip named after its model id; the call
# returns the path of the written file, which the cell displays.
model_id = aml.leader.model_id
mojo_path = "~/h2o/titanic/" + model_id + ".zip"
aml.download_mojo(path=mojo_path, genmodel_name=model_id)

u'/Users/lars/h2o/titanic/GBM_grid__1_AutoML_20200216_213624_model_18.zip'

### Show the leader model's parameters

In [14]:
# Inspect the leader model's parameters; each entry pairs the 'actual' value
# used with the parameter's 'default'.
aml.leader.params

{u'balance_classes': {u'actual': False, u'default': False},
 u'build_tree_one_node': {u'actual': False, u'default': False},
 u'calibrate_model': {u'actual': False, u'default': False},
 u'calibration_frame': {u'actual': None, u'default': None},
 u'categorical_encoding': {u'actual': u'AUTO', u'default': u'AUTO'},
 u'check_constant_response': {u'actual': True, u'default': True},
 u'checkpoint': {u'actual': None, u'default': None},
 u'class_sampling_factors': {u'actual': None, u'default': None},
 u'col_sample_rate': {u'actual': 1.0, u'default': 1.0},
 u'col_sample_rate_change_per_level': {u'actual': 1.0, u'default': 1.0},
 u'col_sample_rate_per_tree': {u'actual': 0.7, u'default': 1.0},
 u'custom_distribution_func': {u'actual': None, u'default': None},
 u'custom_metric_func': {u'actual': None, u'default': None},
 u'distribution': {u'actual': u'bernoulli', u'default': u'AUTO'},
 u'export_checkpoints_dir': {u'actual': None, u'default': None},
 u'fold_assignment': {u'actual': u'Modulo', u'defa