## Task 1: Initial Setup

In [None]:
#Import H2O and other libraries that will be used in this tutorial
import h2o
from h2o.estimators import *
from h2o.grid import *

In [None]:
import os

# Paths of the startup/stop helper scripts; the startup script's presence is
# what distinguishes the two environments handled below.
startup  = '/home/h2o/bin/aquarium_startup'
shutdown = '/home/h2o/bin/aquarium_stop'

# True when the startup script exists on this machine.
aquarium = os.path.exists(startup)

if aquarium:
    # Launch the startup script before connecting.
    os.system(startup)

# With the startup script present the server is addressed under the /h2o
# prefix; otherwise the plain local endpoint is used.
local_url = 'http://localhost:54321/h2o' if aquarium else 'http://localhost:54321'

In [None]:
# Initialize the H2O client against the URL selected in the setup cell (local_url).
h2o.init(url = local_url)

In [None]:
# Import the loan-level dataset (a CSV hosted on S3) and keep it as `loan_level`
loan_level = h2o.import_file("https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv")

## Task 2: Regression Concepts
Please refer to the tutorial document for the regression concepts covered in this task.


## Task 3: Start Experiment

In [None]:
# Preview the first rows of the imported frame.
loan_level.head()

In [None]:
# Summary of the response column used throughout the tutorial.
loan_level["ORIGINAL_INTEREST_RATE"].describe()

In [None]:
# Histogram of the response column.
loan_level["ORIGINAL_INTEREST_RATE"].hist()

In [None]:
# Split with ratios 0.70 / 0.15 and a fixed seed; the call yields three frames,
# unpacked as train, validation and test.
splits = loan_level.split_frame(ratios = [0.70, 0.15], seed = 42)
train, valid, test = splits

# Report the row count of each split.
print("train:%d valid:%d test:%d" % tuple(frame.nrows for frame in splits))

In [None]:
# Response column for the regression task.
y = "ORIGINAL_INTEREST_RATE"

# Columns excluded from the predictor set; the response itself is listed here
# too, so it never ends up in `x`.
ignore = ["ORIGINAL_INTEREST_RATE", 
          "FIRST_PAYMENT_DATE", 
          "MATURITY_DATE", 
          "MORTGAGE_INSURANCE_PERCENTAGE", 
          "PREPAYMENT_PENALTY_MORTGAGE_FLAG", 
          "LOAN_SEQUENCE_NUMBER", 
          "PREPAID", 
          "DELINQUENT", 
          "PRODUCT_TYPE"] 

# Filter the frame's column names in their original order. A plain set
# difference would contain the same columns but in an arbitrary order that can
# differ from one Python session to the next (string hashing is randomized),
# which makes `x` non-reproducible when inspected or logged.
ignored_columns = set(ignore)
x = [name for name in train.names if name not in ignored_columns]

## Task 4: Build an XGBoost Model

In [None]:
xgb = H2OXGBoostEstimator(seed = 42, model_id = 'XGBoost', 
                          nfolds = 0, keep_cross_validation_predictions = False)

%time xgb.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the trained baseline XGBoost model.
xgb

In [None]:
# Plot the model's scoring history.
xgb.plot()

In [None]:
# Plot the variable importances of the baseline XGBoost model.
xgb.varimp_plot()

In [None]:
# Predict on the validation frame, then show the predictions side by side with
# the actual interest rate column.
xgb_def_pred = xgb.predict(valid)
xgb_def_pred.cbind(valid['ORIGINAL_INTEREST_RATE'])

In [None]:
# Validation-set performance of the baseline XGBoost model, kept for the
# default-vs-tuned comparison later in the notebook.
default_xgb_per = xgb.model_performance(valid)

## Task 5: Build a Deep Learning Model

In [None]:
dl = H2ODeepLearningEstimator(seed = 42, model_id = 'DL',
                              nfolds = 0,
                              keep_cross_validation_predictions = False)

%time dl.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the trained baseline Deep Learning model.
dl

In [None]:
# Show the 'epochs' entry recorded in the model's parameters.
print("epochs = ", dl.params['epochs'])

In [None]:
# Plot the model's scoring history.
dl.plot()

In [None]:
# Plot the variable importances of the baseline Deep Learning model.
dl.varimp_plot()

In [None]:
# Validation-set performance of the baseline Deep Learning model, kept for the
# default-vs-tuned comparison later in the notebook.
default_dl_per = dl.model_performance(valid)

## Task 6: Tune the XGBoost Model with H2O GridSearch

In [None]:
xgb = H2OXGBoostEstimator(model_id = 'xgb', ntrees = 300,
                          stopping_rounds = 3, #default
                          stopping_tolerance = 1e-3, #default
                          stopping_metric = "rmse", #default
                          seed = 42
    )

hyper_params = {'max_depth' : [5,7,9,10,12,13,15,20]
               }

grid_id = 'max_depth_grid'

search_criteria = { "strategy":"Cartesian"}

xgb_grid = H2OGridSearch(model = xgb, 
                         hyper_params = hyper_params,
                         grid_id = grid_id,
                         search_criteria = search_criteria
                         )

%time xgb_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Retrieve the grid ordered by ascending RMSE (lowest first) and show its metric table.
sorted_xgb = xgb_grid.get_grid(sort_by = 'rmse',decreasing = False)
sorted_xgb.sorted_metric_table()

In [None]:
xgb = H2OXGBoostEstimator(model_id = 'xgb_grid', ntrees = 500, 
                          learn_rate = 0.25,
                          stopping_rounds = 3, #default
                          stopping_tolerance = 1e-3, #default
                          stopping_metric = "rmse", #default
                          seed = 42)

hyper_params = {'max_depth' : [5,6,7,9],
                'sample_rate': [x/100. for x in range(20,101)],
                'col_sample_rate' : [x/100. for x in range(20,101)],
                'col_sample_rate_per_tree': [x/100. for x in range(20,101)]
               }

search_criteria_tune = {'strategy': "RandomDiscrete",
                        'max_runtime_secs': 900, #15 min  
                        'max_models': 100,  ## build no more than 100 models
                        'seed' : 42 
                       }

xgb_grid = H2OGridSearch(xgb, hyper_params,
                         grid_id = 'random_grid',
                         search_criteria = search_criteria_tune)

%time xgb_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Retrieve the random-search grid ordered by ascending RMSE (lowest first) and
# show its metric table.
sorted_xgb = xgb_grid.get_grid(sort_by = 'rmse',decreasing = False)
sorted_xgb.sorted_metric_table()

In [None]:
# `sorted_xgb` was explicitly ordered by ascending RMSE, so its first model is
# the best one by that metric. The model order of the raw `xgb_grid` object is
# not tied to that sort, so it is not used for the selection.
best_xgb_model = sorted_xgb.models[0]
best_xgb_model

In [None]:
# Validation-set performance of the selected grid model.
tuned_xgb_per = best_xgb_model.model_performance(valid)

In [None]:
# Compare validation RMSE of the baseline and tuned XGBoost models.
print("Default XGB RMSE: %.4f \nTuned XGB RMSE:%.4f" % (default_xgb_per.rmse(), tuned_xgb_per.rmse()))

In [None]:
# Compare validation MAE of the baseline and tuned XGBoost models.
print("Default XGB MAE: %.4f \nTuned XGB MAE:%.4f" % (default_xgb_per.mae(), tuned_xgb_per.mae()))

## Task 7: Tune the Deep Learning model with H2O GridSearch

In [None]:
dl = H2ODeepLearningEstimator(seed = 42, model_id = 'DL',
                              nfolds = 0,
                              keep_cross_validation_predictions = False,
                              epochs = 10,
                              activation = 'rectifier_with_dropout',
                              stopping_rounds = 5, #default
                              stopping_tolerance = 1e-3, #default
                              stopping_metric = "rmse", #default
                              
                              )

hyper_params = {'hidden' : [[100, 100], [165, 165], [200,200], [330,330], 
                            [165, 200]],
                'hidden_dropout_ratios' : [[0,0], [0.01,0.01], [0.15,0.15], 
                                           [0.30, 0.30],[0.5,0.5]]
               }

search_criteria_tune = {'strategy': "RandomDiscrete",
                        'max_runtime_secs': 900, #15 min  
                        'max_models': 100,  ## build no more than 100 models
                        'seed' : 42 }

dl_grid = H2OGridSearch(model = dl, 
                        hyper_params = hyper_params,
                        grid_id = 'random_dl_grid',
                        search_criteria = search_criteria_tune)

%time dl_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Retrieve the hidden/dropout grid ordered by ascending RMSE (lowest first) and
# show its metric table.
hidden_per = dl_grid.get_grid(sort_by = 'rmse', decreasing = False)
hidden_per.sorted_metric_table()

In [None]:
dl = H2ODeepLearningEstimator(epochs = 10,
                              hidden = [100,100],
                              hidden_dropout_ratios = [0.01,0.01],
                              seed = 42,
                              model_id = 'DL',
                              activation = 'rectifier_with_dropout',
                              stopping_rounds = 3, 
                              stopping_tolerance = 1e-3, #default
                              stopping_metric = "rmse", #default
                              adaptive_rate = True)

hyper_params = {'max_w2'  : [1e38, 1e35, 1e36, 1e37, 1e34, 5e35],
                'l2'      : [1e-7, 1e-6, 1e-5, 1e-4, 5e-4, 1e-3, 0],
               }

search_criteria_tune = {'strategy': "RandomDiscrete",
                        'max_runtime_secs': 900, #15 min  
                        'max_models': 100,  ## build no more than 100 models
                        'seed' : 42 
                       }

dl_grid = H2OGridSearch(model = dl, 
                         hyper_params = hyper_params,
                         grid_id = 'random_dl_search',
                         search_criteria = search_criteria_tune)

%time dl_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Retrieve the l2/max_w2 grid ordered by ascending RMSE (lowest first) and show
# its metric table.
dl_perf = dl_grid.get_grid(sort_by = 'rmse', decreasing = False)
dl_perf.sorted_metric_table()

In [None]:
# `dl_perf` was explicitly ordered by ascending RMSE, so its first model is the
# best one by that metric. The model order of the raw `dl_grid` object is not
# tied to that sort, so it is not used for the selection.
best_dl_model = dl_perf.models[0]

In [None]:
# Build a new estimator that starts from the best grid model (passed via
# `checkpoint`) and raises the epoch budget from 10 to 200.
# NOTE(review): hidden, hidden_dropout_ratios, l2 and max_w2 are hard-coded
# here; presumably they are meant to mirror the parameters of `best_dl_model`.
# Confirm they still match after re-running the random search, since the
# winning model may differ between runs.
dl_checkpoint = H2ODeepLearningEstimator(checkpoint = best_dl_model.model_id,
                                         epochs = 200,
                                         hidden = [100,100],
                                         hidden_dropout_ratios = [0.01,0.01],
                                         adaptive_rate = True,
                                         l2 = 1.0e-7,
                                         max_w2 = 1e35,
                                         reproducible = True,                                     
                                         model_id = 'DL_checkpoint',
                                         activation = 'rectifier_with_dropout',
                                         distribution = 'auto',
                                         seed = 42,
                                         stopping_metric = 'RMSE',
                                         stopping_tolerance = 1e-3,
                                         stopping_rounds = 5)

# Train with the same predictors, response and frames used for the grid models.
%time dl_checkpoint.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [None]:
# Display the checkpoint-continued Deep Learning model.
dl_checkpoint

In [None]:
# Validation-set performance of the checkpoint model, then an RMSE comparison
# against the baseline Deep Learning model.
tuned_dl_per = dl_checkpoint.model_performance(valid)
print("Default DL Model RMSE: %.4f \nTuned DL Model RMSE:%.4f" % (default_dl_per.rmse(), tuned_dl_per.rmse()))

In [None]:
# Compare validation MAE of the baseline and tuned Deep Learning models.
print("Default DL Model MAE: %.4f \nTuned DL Model MAE:%.4f" % (default_dl_per.mae(), tuned_dl_per.mae()))

## Task 8: Test Set Performance

In [None]:
# Score both tuned models on the held-out test frame.
dl_test_per = dl_checkpoint.model_performance(test)
xgb_test_per = best_xgb_model.model_performance(test)

In [None]:
# Test-set RMSE of the two tuned models.
print("XGBoost Test RMSE: %.4f  \nDeep Learning Model Test RMSE: %.4f " % 
      (xgb_test_per.rmse(), dl_test_per.rmse()))

In [None]:
# Test-set MAE of the two tuned models.
print("XGBoost Test MAE: %.4f  \nDeep Learning model Test MAE: %.4f " % 
      (xgb_test_per.mae(), dl_test_per.mae()))

In [None]:
# Build a side-by-side view: actual interest rate, XGBoost prediction, then
# Deep Learning prediction (columns are bound in that order).
xgb_tuned_pred = best_xgb_model.predict(test)  # get predictions from XGBoost
test_rate_pred = test['ORIGINAL_INTEREST_RATE'].cbind(xgb_tuned_pred)  # combine XGBoost predictions with the actual interest rate
dl_tuned_pred = dl_checkpoint.predict(test)  # get predictions from the Deep Learning model
test_rate_pred.cbind(dl_tuned_pred)

## Task 9: Challenge

Exercise 1: Tune a Deep Learning model with at least two hidden layers. Use the RMSE and MAE from this tutorial as benchmarks, and try to get a lower RMSE and a lower MAE than the tuned model in this tutorial. The grid searches shown below are just one option; you could instead use `adaptive_rate = True` and tune other parameters.

#### We start by using 2 hidden layers with 165 neurons each, and try to find a good dropout ratio

In [None]:
dl = H2ODeepLearningEstimator(epochs = 10,
                              hidden = [165,165],
                              seed = 42,
                              model_id = 'DL',
                              activation = 'rectifier_with_dropout')

hyper_params = {'hidden_dropout_ratios' : [[0,0], [0.1, 0.1], [0.15, 0.15], [0.25,0.25], [0.3,0.3], 
                                           [0.2,0.2], [0.35,0.35],[0.3,0.0]]
               }

search_criteria_tune = {'strategy': "Cartesian"
                       }

dl_grid = H2OGridSearch(model = dl, 
                        hyper_params = hyper_params,
                        grid_id = 'challenge_drop_grid',
                        search_criteria = search_criteria_tune)

%time dl_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

dropout_per = dl_grid.get_grid(sort_by = 'rmse', decreasing = False)
dropout_per.sorted_metric_table()

#### Find a learning rate

In [None]:
dl = H2ODeepLearningEstimator(epochs = 10,
                              hidden = [165,165],
                              seed = 42,
                              model_id = 'DL',
                              activation = 'rectifier_with_dropout',
                              hidden_dropout_ratios = [0.2,0.2],
                              distribution = 'auto',
                              adaptive_rate = False)

hyper_params = { 'rate' : [0.0001, 0.0005, 0.0008, 0.001, 0.0015, 0.0020, 0.003, 0.004, 0.007, 0.009]
               }

search_criteria_tune = {'strategy': "Cartesian",
                       }

dl_grid = H2OGridSearch(model = dl, hyper_params = hyper_params,
                         grid_id = 'challenge_rate_grid_2',
                         search_criteria = search_criteria_tune)

%time dl_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

learn_per = dl_grid.get_grid(sort_by = 'rmse', decreasing = False)
learn_per.sorted_metric_table()

#### Tune the parameters related to the learning rate

In [None]:
# Base network for the final challenge search: fixed architecture, dropout,
# regularisation and learning rate, with the adaptive rate switched off. Only
# the annealing/decay/momentum settings listed in hyper_params vary.
dl = H2ODeepLearningEstimator(epochs = 10,
                              hidden = [165,165],
                              seed = 42,
                              model_id = 'DL',
                              activation = 'rectifier_with_dropout',
                              hidden_dropout_ratios = [0.25, 0.25],
                              distribution = 'auto',
                              adaptive_rate = False,
                              l1 = 0,
                              l2 = 1e-5,
                              max_w2 = 3.4028235e38,
                              rate = 0.002)

# Candidate values for the learning-rate schedule and momentum parameters.
hyper_params = {'rate_annealing' : [1e-6, 1e-7, 1e-8, 1e-5],
                'rate_decay': [1, 0.8, 0.9, 1.1, 1.2],
                'momentum_ramp' : [10000, 15000, 5000, 20000, 50000, 100000],
                'momentum_stable' : [0.9, 0.95, 0.99, 0.999],
                'momentum_start' : [0.9, 0.4, 0.5, 0.7, 0.8]
               }

# Random search capped by runtime and model count.
search_criteria_tune = {'strategy': "RandomDiscrete",
                        'max_runtime_secs': 1200, # 20 min (1200 seconds)
                        'max_models': 100,  ## build no more than 100 models
                        'seed' : 42 
                       }

dl_grid = H2OGridSearch(model = dl, 
                        hyper_params = hyper_params,
                        grid_id = 'challenge_rate_params_grid',
                        search_criteria = search_criteria_tune)

%time dl_grid.train(x = x, y = y, training_frame = train, validation_frame = valid)

# Rank the grid models by ascending RMSE and show the metric table.
learn_per = dl_grid.get_grid(sort_by = 'rmse', decreasing = False)
learn_per.sorted_metric_table()

### Shutdown Cluster

In [None]:
# Shut down the H2O cluster this notebook is connected to.
# NOTE(review): the `shutdown` script path and the `aquarium` flag defined in the
# setup cell are not used anywhere in this notebook; confirm whether the stop
# script is supposed to be run here when `aquarium` is True.
h2o.cluster().shutdown()