# Start H2O Cluster

_**Note**: The `os.system` command below is used solely for the H2O Aquarium training platform._

In [None]:
# Launch the training platform's startup script, then pause before continuing.
import os
# Runs the script through the shell; the returned exit status is not checked.
os.system('/home/h2o/bin/startup')
# IPython shell escape: sleep for 10 seconds (presumably to let the cluster finish starting).
!sleep 10

Start by importing `h2o` and creating a connection to the server. The parameters used in `h2o.init` will depend on your specific environment. 

In [None]:
import h2o
# Connect to the H2O server at this URL; change the URL to match your own environment.
h2o.init(url='http://localhost:54321/h2o')

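Check the status of the cluster. Calling `show_status()` prints a summary of the cluster; passing `True` also prints detailed information about each node.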

In [None]:
# Print a summary of the connected cluster's status.
h2o.cluster().show_status()

In [None]:
# Same call with its first argument set to True, which additionally prints per-node details.
h2o.cluster().show_status(True)

In [None]:
# Load the prostate CSV from the training platform's data directory into the frame `df`.
df = h2o.import_file("/home/h2o/data/prostate/prostate.csv")

In [None]:
# Per-column summary of the frame.
df.summary()

In [None]:
# Description of the frame (overlaps with the summary shown above).
df.describe()

In [None]:
# First rows of the frame.
df.head()

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# Standard deviation of each column.
df.sd()

In [None]:
%matplotlib inline

import pylab as pl
df.as_data_frame().hist()
pl.show()

In [None]:
# Show the column-name -> type mapping. `types` is the property; `type` is a
# per-column method, so the bare `df.type` only echoed the bound method.
df.types

In [None]:
# Dimensions of the frame.
df.shape

In [None]:
# Names of the frame's columns.
df.col_names

In [None]:
# Distinct values found in the response column.
print(df['CAPSULE'].unique())
# `levels` is a method and has to be called; without the parentheses only the
# bound method object was printed. (Presumably empty unless DPROS is categorical.)
print(df['DPROS'].levels())

## Generalized Linear Model : Linear Regression

In [None]:
# Response column; every other column of the frame is used as a predictor.
# NOTE(review): identifier-like columns (if the frame has any) stay in `x` — confirm that is intended.
y = 'CAPSULE'
x = [name for name in df.col_names if name != y]

In [None]:
# Echo the chosen response name and the list of predictor names.
print("Response = " + y)
print("Predictors = " + str(x))

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# GLM estimator with default settings (no family is specified here).
glm = H2OGeneralizedLinearEstimator()

In [None]:
# Fit on the whole frame; no validation frame is supplied.
glm.train(x = x, y = y, training_frame = df)

In [None]:
# Print the fitted model's report.
print(glm)

In [None]:
# Performance metrics of the model; no test data is passed, so these are the training metrics.
glm.model_performance()

In [None]:
# Print RMSE, MSE and R^2 of the fitted model, in that order.
for metric in (glm.rmse, glm.mse, glm.r2):
    print(metric())

In [None]:
# Coefficients of the fitted GLM.
glm.coef()

In [None]:
# Two-way split: `ratios=[.9]` requests about 90% of the rows for `train`, the remainder for `valid`.
train, valid = df.split_frame(ratios=[.9])

In [None]:
# Compare the dimensions of the full frame with those of the two parts.
print(df.shape)
print(train.shape)
print(valid.shape)

In [None]:
# Three-way split: about 80% / 10% / remainder. No seed is passed, so the split can differ between runs.
train, valid, test = df.split_frame(ratios=[.8, .1])

In [None]:
# Dimensions of the full frame and of the three parts.
print(df.shape)
print(train.shape)
print(valid.shape)
print(test.shape)

In [None]:
# Refit the same estimator on the training split, this time with a validation frame.
glm.train(x= x, y = y, training_frame=train, validation_frame=valid)

In [None]:
# Display the refitted model.
glm

In [None]:
# RMSE of the model scored on the test, validation and training frames, in that order.
for frame in (test, valid, train):
    print(glm.model_performance(test_data=frame).rmse())

In [None]:
# R^2 of the model scored on the test, validation and training frames, in that order.
for frame in (test, valid, train):
    print(glm.model_performance(test_data=frame).r2())

In [None]:
# Coefficients after refitting on the training split.
glm.coef()

In [None]:
# Score the held-out test frame with the model.
glm.predict(test_data=test)

## Generalized Linear Model : Logistic Regression

In [None]:
# Summary of the response column before the conversion below.
df['CAPSULE'].summary()

In [None]:
# Replace the response with its categorical (factor) version.
df['CAPSULE'] = df['CAPSULE'].asfactor()

In [None]:
# Summary of the response column after the conversion.
df['CAPSULE'].summary()

In [None]:
# Levels of the converted column.
df['CAPSULE'].levels()

In [None]:
# Split again: the earlier train/valid/test frames were created before the conversion.
train, valid, test = df.split_frame(ratios=[.8,.1])

In [None]:
# GLM with the binomial family, i.e. logistic regression.
glm_logistic = H2OGeneralizedLinearEstimator(family = "binomial")

In [None]:
# Fit on the training split with a validation frame, giving the model an explicit id.
glm_logistic.train(x=x, y= y, training_frame=train, validation_frame=valid, 
                   model_id="glm_logistic"
                  )


In [None]:
# Display the fitted logistic model.
glm_logistic

In [None]:
# Variable importances of the logistic model.
glm_logistic.varimp()

In [None]:
# Confusion matrix of the logistic model.
glm_logistic.confusion_matrix()

In [None]:
# Score the held-out test frame.
glm_logistic.predict(test_data=test)

## Gradient Boosting Model

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [None]:
# Gradient boosting estimator with default settings.
gbm = H2OGradientBoostingEstimator()

In [None]:
# Fit on the training split (response already converted to a factor) with a validation frame.
gbm.train(x=x, y =y, training_frame=train, validation_frame=valid)

In [None]:
# Display the fitted model.
gbm

In [None]:
# Confusion matrix of the model.
gbm.confusion_matrix()

In [None]:
# Variable importances as a table.
gbm.varimp()

In [None]:
# Variable importances as a plot.
gbm.varimp_plot()

## Machine learning with cross validation

In [None]:
# Gradient boosting estimator with 3-fold cross-validation.
gbm_cv3 = H2OGradientBoostingEstimator(nfolds=3)

In [None]:
# Fit on the training split only; no separate validation frame is passed.
gbm_cv3.train(x=x, y=y, training_frame=train)

In [None]:
# Display the fitted model.
gbm_cv3

In [None]:
# NOTE(review): this estimator was created without keep_cross_validation_predictions,
# so this call presumably returns nothing — confirm.
gbm_cv3.cross_validation_predictions()

In [None]:
gbm_cv4 = H2OGradientBoostingEstimator(keep_cross_validation_predictions = True, nfolds=4)

In [None]:
gbm_cv4.train(x=x, y=y, training_frame=train)

In [None]:
gbm_cv4.cross_validation_fold_assignment

## Random Forest

In [None]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [None]:
# Random forest with 5-fold cross-validation, keeping the cross-validation predictions.
drf = H2ORandomForestEstimator(nfolds=5, keep_cross_validation_predictions=True)

In [None]:
# Fit on the training split.
drf.train(x=x, y = y, training_frame=train)

In [None]:
# Display the fitted model.
drf

In [None]:
# Confusion matrix of the model.
drf.confusion_matrix()

In [None]:
# Gains/lift table of the model.
drf.gains_lift()

## Deep Learning

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [None]:
# Deep learning estimator: 5-fold cross-validation with the "Modulo" fold assignment,
# two hidden layers of 20 units each, trained for 10 epochs.
dl_model = H2ODeepLearningEstimator(nfolds=5,fold_assignment="Modulo",hidden=[20,20],epochs=10)

In [None]:
# Fit on the training split.
dl_model.train(x=x, y=y, training_frame=train)

In [None]:
# Display the fitted model.
dl_model

In [None]:
# Draw histograms of the training split. The bare `train.hist` never called
# anything; this uses the same pandas-based pattern as the histogram of `df`.
train.as_data_frame().hist()

## AutoML

In [None]:
from h2o.automl import H2OAutoML

In [None]:
# AutoML run limited to 30 seconds of runtime.
aml = H2OAutoML(max_runtime_secs=30)

In [None]:
# Train candidate models on the training split.
aml.train(x = x, y = y, training_frame = train)

In [None]:
# Leaderboard of the models AutoML produced.
aml.leaderboard

In [None]:
# The leading model of the run.
aml.leader

In [None]:
# AUC of the leading model.
aml.leader.auc()

In [None]:
# Confusion matrix of the leading model.
aml.leader.confusion_matrix()

In [None]:
# Score the held-out test frame with the leading model.
aml.leader.predict(test_data=test)

In [None]:
# Performance of the leading model on the held-out test frame.
aml.leader.model_performance(test_data=test)

In [None]:
# Shut the H2O cluster down; nothing in this notebook uses it afterwards.
h2o.cluster().shutdown()