## Task 1: Initial Setup

In [None]:
#Import H2O and the other libraries that will be used in this tutorial
import h2o
from h2o.estimators import *
from h2o.grid import *

In [None]:
import os

# Paths of the Aquarium helper scripts; only the startup script is used in this cell.
startup  = '/home/h2o/bin/aquarium_startup'
shutdown = '/home/h2o/bin/aquarium_stop'

# The presence of the startup script decides both whether it is run and
# which URL is later handed to h2o.init().
aquarium = os.path.exists(startup)
if aquarium:
    os.system(startup)

local_url = 'http://localhost:54321/h2o' if aquarium else 'http://localhost:54321'

In [None]:
# Initialize the h2o client against the URL selected in the previous cell
h2o.init(url = local_url)

In [None]:
# Import the loan-level dataset from the given URL into the frame `loan_level`
loan_level = h2o.import_file("https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv")

## Task 2: Machine Learning Concepts - See Tutorial

## Task 3: Start Experiment

In [None]:
# Preview the first rows of the imported frame
loan_level.head()

In [None]:
# Summary of the frame
loan_level.describe()

In [None]:
# Tabulate the values of the DELINQUENT column (used as the response below)
loan_level["DELINQUENT"].table()

In [None]:
# Split into three frames with ratios 0.7 and 0.15; the third frame presumably
# receives the remaining rows - check against the counts printed in the next cell
train, valid, test = loan_level.split_frame([0.7, 0.15], seed = 42)

In [None]:
# Row count of each split
print("train:%d valid:%d test:%d" % (train.nrows, valid.nrows, test.nrows))

In [None]:
y = "DELINQUENT"

ignore = ["DELINQUENT", "PREPAID", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "PRODUCT_TYPE"] 

x = list(set(train.names) - set(ignore))

In [None]:
print(x)

## Task 4: Build a GLM

In [None]:
glm = H2OGeneralizedLinearEstimator(family = "binomial", seed = 42, model_id = 'default_glm')
%time glm.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the fitted GLM (last expression of the cell)
glm

In [None]:
# Plot for the fitted GLM, using the method's default arguments
glm.plot()

In [None]:
# Variable-importance plot for the GLM
glm.varimp_plot()

In [None]:
glm.accuracy() #You can print individual metrics as well

In [None]:
# Score the validation frame and show the first 10 rows of the predictions
glm.predict(valid).head(10)

In [None]:
default_glm_perf = glm.model_performance(valid) #validation score

In [None]:
# Validation AUC of the default GLM; default_glm_perf is reused later for the comparison with the tuned GLM
print(default_glm_perf.auc())

## Task 5: Build a Random Forest

In [None]:
rf = H2ORandomForestEstimator (seed = 42, model_id = 'default_rf')
%time rf.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the fitted random forest (last expression of the cell)
rf

In [None]:
# Plot for the fitted forest with the metric set to AUC
rf.plot(metric = 'auc')

In [None]:
# Variable-importance plot; the argument 20 presumably caps the number of
# variables shown - confirm against the h2o documentation
rf.varimp_plot(20)

In [None]:
#This is a bonus feature for this tutorial!
rf.partial_plot(data = train, cols = ['CREDIT_SCORE'], 
                server = True, plot = True) #Partial Dependence plots can also be generated

In [None]:
rf.accuracy() #Training accuracy

In [None]:
rf.F1() #Training F1

In [None]:
# Predictions of the default forest on the validation frame
rf.predict(valid)

In [None]:
# Validation performance of the default forest; reused later for the comparison with the tuned forest
default_rf_per = rf.model_performance(valid)

## Task 6: Build a GBM

In [None]:
gbm = H2OGradientBoostingEstimator(seed = 42, model_id = 'default_gbm')
%time gbm.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the fitted GBM (last expression of the cell)
gbm

In [None]:
# Predictions of the default GBM on the validation frame
gbm.predict(valid)

In [None]:
# Validation performance of the default GBM; reused later for the comparison with the tuned GBM
default_gbm_per = gbm.model_performance(valid)

In [None]:
default_gbm_per.auc() #validation AUC

In [None]:
default_gbm_per.F1() #Validation F1

## Task 7: Tune the GLM with H2O GridSearch 

In [None]:
glm_grid = h2o.grid.H2OGridSearch (
    
    H2OGeneralizedLinearEstimator(family = "binomial",
                                  lambda_search = True),
    
    hyper_params = {"alpha": [x*0.01 for x in range(0, 100)],
                    "missing_values_handling" : ["Skip", "MeanImputation"]
                    },
    
    grid_id = "glm_random_grid",
    
    search_criteria = {
        "strategy":"RandomDiscrete",
        "max_models":300,
        "max_runtime_secs":300,
        "seed":42
        }
)

%time glm_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Retrieve the grid sorted by AUC in decreasing order and show its metric table
sorted_glm_grid = glm_grid.get_grid(sort_by = 'auc', decreasing = True)
sorted_glm_grid.sorted_metric_table()

In [None]:
# First model of the AUC-sorted grid is taken as the tuned GLM
tuned_glm = sorted_glm_grid.models[0]
tuned_glm.summary()

In [None]:
# Validation performance of the tuned GLM
tuned_glm_perf = tuned_glm.model_performance(valid)

In [None]:
# Validation AUC, default vs. tuned
print("Default GLM AUC: %.4f \nTuned GLM AUC:%.4f" % (default_glm_perf.auc(), tuned_glm_perf.auc()))

In [None]:
#Not shown in Tutorial
print ("Default GLM Accuracy:", default_glm_perf.accuracy())
print ("Tuned GLM Accuracy", tuned_glm_perf.accuracy())

In [None]:
# Validation F1, default vs. tuned
print ("Default GLM F1 Score:", default_glm_perf.F1())
print ("Tuned GLM F1 Score", tuned_glm_perf.F1())

In [None]:
# Validation confusion matrices, default vs. tuned
print ("Default GLM: ", default_glm_perf.confusion_matrix())
print ("Tuned GLM: ",  tuned_glm_perf.confusion_matrix())

## Task 8: Tune the RF model with H2O GridSearch 

In [None]:
#Grid Search Parameters
hyper_parameters = {'max_depth':[1, 3, 5, 6, 7, 8, 9, 10, 12, 13, 15, 20, 25, 35]}

rf = H2ORandomForestEstimator(seed = 42,
                              stopping_rounds = 5, 
                              stopping_tolerance = 1e-4, 
                              stopping_metric = "auc",
                              model_id = 'rf')

grid_id = 'depth_grid'

search_criteria = {'strategy': "Cartesian"}

#Grid Search
rf_grid = H2OGridSearch(model = rf, 
                        hyper_params = hyper_parameters, 
                        grid_id = grid_id, 
                        search_criteria = search_criteria)

%time rf_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Depth grid sorted by AUC in decreasing order, followed by its metric table
sorted_rf_depth = rf_grid.get_grid(sort_by = 'auc',decreasing = True)
sorted_rf_depth.sorted_metric_table()

In [None]:
hyper_parameters = {"max_depth":[8, 9, 10, 11, 12],
                    'sample_rate': [x/100. for x in range(20,101)]
                   }

rf = H2ORandomForestEstimator(ntrees = 500,
                              seed = 42,
                              stopping_rounds = 5, 
                              stopping_tolerance = 1e-3, 
                              stopping_metric = "auc",
                              model_id = 'rf_grid')

grid_id = 'rf_random_grid'

search_criteria = {"strategy":"RandomDiscrete",
                   "max_models":100,
                   "max_runtime_secs":900,
                   "seed":42
                  }

rf_grid = H2OGridSearch(model = rf, 
                        hyper_params = hyper_parameters, 
                        grid_id = grid_id, 
                        search_criteria = search_criteria)

%time rf_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Random grid sorted by AUC in decreasing order, followed by its metric table
sorted_rf = rf_grid.get_grid(sort_by = 'auc',decreasing = True)
sorted_rf.sorted_metric_table()

In [None]:
# First model of the AUC-sorted grid is taken as the tuned random forest
tuned_rf = sorted_rf.models[0]

In [None]:
# Validation performance and AUC of the tuned forest
tuned_rf_per = tuned_rf.model_performance(valid)
tuned_rf_per.auc()

In [None]:
# Validation F1 of the tuned forest
tuned_rf_per.F1()

In [None]:
# Validation AUC, default vs. tuned
print("Default RF AUC: %.4f \nTuned RF AUC:%.4f" % (default_rf_per.auc(), tuned_rf_per.auc()))

In [None]:
# Validation F1, default vs. tuned
print("Default RF F1 Score:", default_rf_per.F1())
print("Tuned RF F1 Score:", tuned_rf_per.F1())

In [None]:
# Validation confusion matrices, default vs. tuned
print ("Default RF: ", default_rf_per.confusion_matrix())
print ("Tuned RF: ",  tuned_rf_per.confusion_matrix())

## Task 9: Tune the GBM model with H2O GridSearch

In [None]:
hyper_params = {'max_depth' : [3,4,5,6,7,8,9,10,12,13,15],
               }

gbm = H2OGradientBoostingEstimator(model_id = 'grid_gbm', 
                                   ntrees = 50,
                                   seed = 42)

gbm_grid = H2OGridSearch(gbm, hyper_params,
                         grid_id = 'depth_gbm_grid',
                         search_criteria = {"strategy":"Cartesian"})


%time gbm_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Depth grid sorted by AUC in decreasing order, followed by its metric table
sorted_gbm_depth = gbm_grid.get_grid(sort_by = 'auc', decreasing = True)
sorted_gbm_depth.sorted_metric_table()

In [None]:
gbm = H2OGradientBoostingEstimator(ntrees = 500,
                                   learn_rate = 0.05,
                                   seed = 42,
                                   model_id = 'grid_gbm')

hyper_params_tune = {'max_depth' : [4, 5, 6, 7, 8],
                     'sample_rate': [x/100. for x in range(20,101)],
                     'col_sample_rate' : [x/100. for x in range(20,101)],
                     'col_sample_rate_per_tree': [x/100. for x in range(20,101)],
                     'col_sample_rate_change_per_level': [x/100. for x in range(90,111)],
                    }

search_criteria_tune = {'strategy': "RandomDiscrete",
                        'max_runtime_secs': 900,  
                        'max_models': 100,  ## build no more than 100 models
                        'seed' : 42 
                       }

random_grid = H2OGridSearch(model = gbm, hyper_params = hyper_params_tune,
                            grid_id = 'random_grid',
                            search_criteria = search_criteria_tune)

%time random_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Random grid sorted by AUC in decreasing order, followed by its metric table
sorted_random_search = random_grid.get_grid(sort_by = 'auc',decreasing = True)
sorted_random_search.sorted_metric_table()

In [None]:
# First model of the AUC-sorted grid is taken as the tuned GBM
tuned_gbm = sorted_random_search.models[0]

In [None]:
# Validation AUC and F1 of the tuned GBM
tuned_gbm_per = tuned_gbm.model_performance(valid)
print(tuned_gbm_per.auc())
print(tuned_gbm_per.F1())

In [None]:
# Validation confusion matrix of the tuned GBM
tuned_gbm_per.confusion_matrix()

In [None]:
# Validation AUC, default vs. tuned
print("Default GBM AUC: %.4f \nTuned GBM AUC:%.4f" % (default_gbm_per.auc(), tuned_gbm_per.auc()))

## Task 10: Test Set Performance

In [None]:
# Score the three tuned models on the held-out test frame
glm_test_per = tuned_glm.model_performance(test)
rf_test_per = tuned_rf.model_performance(test)
gbm_test_per = tuned_gbm.model_performance(test)

In [None]:
# Test AUC of the three tuned models
print("GLM Test AUC: %.4f \nRF Test AUC: %.4f \nGBM Test AUC: %.4f " % 
      (glm_test_per.auc(), rf_test_per.auc(), gbm_test_per.auc()))

In [None]:
# Test F1 of the three tuned models
print ("GLM Test F1 Score: ", glm_test_per.F1())
print ("RF Test F1 Score: ",  rf_test_per.F1())
print ("GBM Test F1 Score: ",  gbm_test_per.F1())

In [None]:
#Not shown in the tutorial file. Just for reference - accuracy at the threshold that maximizes the F1.
#The threshold is looked up on each performance object instead of being pasted
#in from an earlier run: the random grid searches are time-boxed, so the tuned
#models (and therefore their max-F1 thresholds) can differ from run to run.
for label, perf in (("GLM Test Accuracy: ", glm_test_per),
                    ("RF Test Accuracy: ",  rf_test_per),
                    ("GBM Test Accuracy: ",  gbm_test_per)):
    f1_threshold = perf.find_threshold_by_max_metric("f1")
    # thresholds is passed as a list, the form the metric accessors document
    print (label, perf.accuracy(thresholds = [f1_threshold]))

In [None]:
# Test-set confusion matrices of the three tuned models
print ("GLM Confusion Matrix: ", glm_test_per.confusion_matrix())
print ("RF Confusion Matrix: ",  rf_test_per.confusion_matrix())
print ("GBM Confusion Matrix ",  gbm_test_per.confusion_matrix())

In [None]:
#Not shown in the tutorial file - test-set logloss of the three tuned models
print("GLM Test logloss: %.5f \nRF Test logloss: %.5f \nGBM Test logloss: %.5f " % 
      (glm_test_per.logloss(), rf_test_per.logloss(), gbm_test_per.logloss()))

## Task 11: Challenge & Shutting down your Cluster

In [None]:
from h2o.estimators import H2ONaiveBayesEstimator

#### Build a Naive Bayes classifier model

In [None]:
bayes = H2ONaiveBayesEstimator(seed = 42)
%time bayes.train(x, y, train, validation_frame = valid)

#### Check the validation AUC

In [None]:
# Validation performance of the Naive Bayes model, then its AUC as the cell output
bayes_val_per = bayes.model_performance(valid)
bayes_val_per.auc()

#### Quick grid search for parameters laplace, min_prob and eps_prob

In [None]:
hyper_params = {'laplace':[0, 0.1, 0.5, 1, 1.25, 1.75, 2, 2.25, 2.5, 3],
                'min_prob':[0.0001, 0.001, 0.002, 0.005, 0.009, 0.01, 0.05, 0.1], 
                'eps_prob':[0, 0.001, 0.005, 0.01, 0.05, 0.1]
               }

bayes = H2ONaiveBayesEstimator(seed = 42)

grid_id = 'bayes_grid'

search_criteria = {"strategy" : 'RandomDiscrete',
                    'max_models': 100
                  }

bayes_grid = H2OGridSearch(model = bayes,
                           hyper_params = hyper_params,
                           grid_id = grid_id,
                           search_criteria = search_criteria
                           )

%time bayes_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

sorted_grid = bayes_grid.get_grid(sort_by = 'auc', decreasing = True)
sorted_grid.sorted_metric_table()

#### Lastly, check the test AUC

In [None]:
# Take the first model of the AUC-sorted grid, as the GLM, RF and GBM sections do.
# `bayes_grid.models` is the unsorted grid, so its first entry is not
# guaranteed to be the model with the highest AUC.
best_bayes_model = sorted_grid.models[0]
bayes_test_per = best_bayes_model.model_performance(test)
bayes_test_per.auc()

### Shutdown Cluster

In [None]:
# Shut down the H2O cluster this session is connected to
h2o.cluster().shutdown()