# AutoML with H2O.ai

In [1]:
import h2o
h2o.init(ip="localhost", port=54323)

Checking whether there is an H2O instance running at http://localhost:54323. connected.


0,1
H2O cluster uptime:,7 days 21 hours 20 mins
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.8
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_hbi16859_w1o7wl
H2O cluster total nodes:,1
H2O cluster free memory:,3.301 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


## Build a gbm model

In [2]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [3]:
airlines=h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
predictors=["Origin","Dest","Year","UniqueCarrier","DayOfWeek","Month","Distance","FlightNum"]
response="IsDepDelayed"
train,valid=airlines.split_frame(ratios=[.8],seed=1234)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
airlines_gbm=H2OGradientBoostingEstimator(nbins_cats=1024,seed=1234)
airlines_gbm.train(x=predictors,y=response,training_frame=train,validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [5]:
print("training score",airlines_gbm.auc(train=True))
print("validation score",airlines_gbm.auc(valid=True))

training score 0.7782538391984423
validation score 0.7343035992322139


## try automl instead

In [6]:
from h2o.automl import H2OAutoML

In [9]:
aml = H2OAutoML(max_runtime_secs = 30)
aml.train(x = predictors, y = response,training_frame = train)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


### Check the leaderboard

In [10]:
lb=aml.leaderboard
lb

model_id,auc,logloss
StackedEnsemble_BestOfFamily_0_AutoML_20180517_141447,0.742927,0.597373
StackedEnsemble_AllModels_0_AutoML_20180517_141447,0.742927,0.597373
XRT_0_AutoML_20180517_141447,0.736592,0.606165
GBM_grid_0_AutoML_20180517_141447_model_0,0.732426,0.607855
DRF_0_AutoML_20180517_141447,0.726343,0.634633
GLM_grid_0_AutoML_20180517_141447_model_0,0.689056,0.63719




### the leader model is here

In [12]:
#aml.leader #print out the model deails

### Tips

If you need to generate predictions on a test set, you can make predictions directly on the `"H2OAutoML"` object, or on the leader
model object directly

preds = aml.predict(test)

or:
preds = aml.leader.predict(test)

In [14]:
aml.predict(valid) #it looks you can parse the whole data.frame as the input
#the model will only use the pre-defined columns
#I think this is really flexible to use

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,NO,YES
YES,0.409527,0.590473
YES,0.409527,0.590473
YES,0.234762,0.765238
YES,0.215186,0.784814
YES,0.248518,0.751482
YES,0.259698,0.740302
YES,0.214665,0.785335
YES,0.234762,0.765238
YES,0.259698,0.740302
YES,0.220319,0.779681




In [15]:
aml.leader.predict(valid)  

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,NO,YES
YES,0.409527,0.590473
YES,0.409527,0.590473
YES,0.234762,0.765238
YES,0.215186,0.784814
YES,0.248518,0.751482
YES,0.259698,0.740302
YES,0.214665,0.785335
YES,0.234762,0.765238
YES,0.259698,0.740302
YES,0.220319,0.779681




In [17]:
valid["IsDepDelayed"]

IsDepDelayed
NO
NO
YES
NO
YES
YES
YES
YES
YES
YES




### Saving and loading the models in H2O

In [19]:
# save the model
model_path = h2o.save_model(model=aml.leader, path="/tmp/h2o-hbi16859/automl_flights_stackensembl_20180517.model", force=True)

In [20]:
model_path

'/private/tmp/h2o-hbi16859/automl_flights_stackensembl_20180517.model/StackedEnsemble_BestOfFamily_0_AutoML_20180517_141447'

In [21]:
# load the model
saved_model = h2o.load_model(model_path)

In [28]:
saved_model.predict(valid[1,:])

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,NO,YES
YES,0.409527,0.590473




## shutdown the server

In [29]:
#shutdown the server
h2o.shutdown(prompt=True)

    >>> h2o.shutdown(prompt=True)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_aaa7 closed.


In [31]:
h2o.init(ip="localhost", port=54323)

Checking whether there is an H2O instance running at http://localhost:54323..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_121"; Java(TM) SE Runtime Environment (build 1.8.0_121-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)
  Starting server from /Users/hbi16859/anaconda/envs/python36/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/2h/78_180g57l19twphrlwd2tjr00z10v/T/tmpb42767uq
  JVM stdout: /var/folders/2h/78_180g57l19twphrlwd2tjr00z10v/T/tmpb42767uq/h2o_hbi16859_started_from_python.out
  JVM stderr: /var/folders/2h/78_180g57l19twphrlwd2tjr00z10v/T/tmpb42767uq/h2o_hbi16859_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.8
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_hbi16859_f6ghdw
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [32]:
h2o.shutdown(prompt=True)

    >>> h2o.shutdown(prompt=True)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_aade closed.
