# Training Experiments

## Step 1. Set up connection to Driverless AI

In [1]:
import h2oai_client
import numpy as np
import pandas as pd
# import h2o
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

In [2]:
address = 'http://54.81.113.228:12345'

In [3]:
username = 'training'
password = 'training'

h2oai = Client(address = address
               , username = username
               , password = password)
# make sure to use the same user name and password when signing in through the GUI

#### Notes

* Diabetes
  * Target = Outcome
  * Scorer = AUC
  * Dropped = Pregnancies
  * Settings = 8/2/8
  * (Time on big: 5:39)

* Titanic
  * Target = survived
  * Scorer = AUC
  * First experiment
    * Dropped = (name2, cabin, embarked, boat, body, home.dest)
    * Settings = 8/2/8
    * (Time on big: 8:46)
    * Suggestion: remove `name` too
  * Second experiment
    * Dropped = (name, name2, cabin, embarked, boat, body, home.dest)
    * Settings = 8/2/8
    * (Time on big: 4:23)
  * Third experiment (naive)
    * Dropped = None
    * Settings = 8/2/8
    * (Time on big: 5:00)
    * Cancel quickly

* Boston Housing
  * Target = VALUE
  * Scorer = RMSE
  * Dropped = None
  * Settings = 8/2/8
  * (Time on big: 16:20)  

* Amazon Reviews
  * Target = PositiveReview
  * Scorer = AUC
  * First experiment (NLP Only)
    * Dropped = everything except Description (UserId, ProductId, Id, Summary, Score, HelpfulneesDenominator, ProfileName, HelpfulnessNumerator, Time)
    * Settings = 7/3/7
    * (Time on big: 31:22)
  * Second experiment (NLP +)
    * Dropped = Score
    * Settings = 7/3/7
    * (Time on big: ?)
  * Third experiment
    * Dropped = everything except Description (UserId, ProductId, Id, Summary, Score, HelpfulneesDenominator, ProfileName, HelpfulnessNumerator, Time)
    * Settings = 7/3/7
    * (Time on big: ?)
    * Enable tensorflow expert setting options

* WA Cannabis
  * Target = Log1pDemand
  * Scorer = RMSE
  * Time = SalesDate
  * First experiment (120-0)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 120
    * Gap = 0
    * (Time on big: 7:55:00)
  * Second experiment (28-0)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 28
    * Gap = 0
    * (Time on big: 1:05:00)
  * Third experiment (28-1)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 28
    * Gap = 1
    * (Time on big: 1:06:00)

* Credit Cards
  * Target = Default
  * Scorer = AUC
  * First experiment
    * Dropped = None
    * Settings = 6/4/6
    * (Time on big: 3:21)
  * Second experiment (Big)
    * Dropped = None
    * Settings = 8/6/6
    * (Time on big: 27:33)
  * Third experiment (Compliant)
    * Dropped = None
    * Settings = 8/6/6
    * (Time on big: 5:46)
    * Enable compliant pipeline in expert settings

# Experiments

## 1. Titanic Data

#### Titanic

* Titanic
  * Target = survived
  * Scorer = AUC
  * First experiment
    * Dropped = (name2, cabin, embarked, boat, body, home.dest)
    * Settings = 8/2/8
    * (Time on big: 8:46)
    * Suggestion: remove `name` too
  * Second experiment
    * Dropped = (name, name2, cabin, embarked, boat, body, home.dest)
    * Settings = 8/2/8
    * (Time on big: 4:23)
  * Third experiment (naive)
    * Dropped = None
    * Settings = 8/2/8
    * (Time on big: 5:00)
    * Cancel quickly

In [4]:
dataPath = '/data/Training/Titanic.csv'
basename = 'Titanic'
target = 'survived'
ratio = 0.8

In [5]:
data = h2oai.create_dataset_sync(dataPath)

In [6]:
# Split the data
split_data = h2oai.make_dataset_split(
    dataset_key = data.key
    , output_name1 = basename + "_train"
    , output_name2 = basename + "_test"
    , target = target
    , fold_col = ""
    , time_col = ""
    , ratio = ratio
)

In [7]:
train_key = h2oai.get_dataset_split_job(split_data).entity[0]
test_key  = h2oai.get_dataset_split_job(split_data).entity[1]

In [8]:
# let Driverless suggest parameters for experiment
params = h2oai.get_experiment_tuning_suggestion(
    dataset_key = train_key
    , target_col = target
    , is_classification = True
    , is_time_series = False
    , config_overrides = None)

In [17]:
dropped = ['name2', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
knobs = [8, 2, 8]

In [19]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = True
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'AUC'
    , enable_gpus = True
    , cols_to_drop = dropped
)

In [20]:
titanic1 = experiment

In [26]:
print("Final Model Score on Validation Data: " + str(round(titanic1.valid_score, 3)))
print("Final Model Score on Test Data: " + str(round(titanic1.test_score, 3)))

Final Model Score on Validation Data: 0.878
Final Model Score on Test Data: 0.804


In [21]:
dropped = ['name', 'name2', 'cabin', 'embarked', 'boat', 'body', 
           'home.dest']
knobs = [8, 2, 8]

In [22]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = True
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'AUC'
    , enable_gpus = True
    , cols_to_drop = dropped
)

ConnectionError: HTTPConnectionPool(host='54.81.113.228', port=12345): Max retries exceeded with url: /rpc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11e53a0f0>: Failed to establish a new connection: [Errno 50] Network is down',))

In [23]:
titanic2 = experiment

In [27]:
print("Final Model Score on Validation Data: " + 
      str(round(titanic2.valid_score, 3)))
print("Final Model Score on Test Data: " + 
      str(round(titanic2.test_score, 3)))

Final Model Score on Validation Data: 0.878
Final Model Score on Test Data: 0.804


In [24]:
dropped = ['name', 'name2', 'cabin', 'embarked', 'boat', 'body', 
           'home.dest']
knobs = [6, 2, 8]

In [25]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = True
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'AUC'
    , enable_gpus = True
    , cols_to_drop = dropped
)

In [29]:
titanic3 = experiment

In [30]:
print("Final Model Score on Validation Data: " + 
      str(round(titanic3.valid_score, 3)))
print("Final Model Score on Test Data: " + 
      str(round(titanic3.test_score, 3)))

Final Model Score on Validation Data: 0.877
Final Model Score on Test Data: 0.809


## 2. Boston Housing

* Boston Housing
  * Target = VALUE
  * Scorer = RMSE
  * Dropped = None
  * Settings = 8/2/8
  * (Time on big: 16:20)  

In [31]:
dataPath = '/data/Training/BostonHousing.csv'
basename = 'Housing'
target = 'VALUE'
ratio = 0.8

In [32]:
data = h2oai.create_dataset_sync(dataPath)

In [33]:
# Split the data
split_data = h2oai.make_dataset_split(
    dataset_key = data.key
    , output_name1 = basename + "_train"
    , output_name2 = basename + "_test"
    , target = target
    , fold_col = ""
    , time_col = ""
    , ratio = ratio
)

In [34]:
train_key = h2oai.get_dataset_split_job(split_data).entity[0]
test_key  = h2oai.get_dataset_split_job(split_data).entity[1]

In [35]:
dropped = []
knobs = [8, 2, 8]

In [37]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = False
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'RMSE'
    , enable_gpus = True
    , cols_to_drop = dropped
)

In [38]:
housing1 = experiment

In [39]:
knobs = [6, 2, 8]

In [40]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = False
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'RMSE'
    , enable_gpus = True
    , cols_to_drop = dropped
)

In [41]:
housing2 = experiment

## 3. WA Cannabis

* WA Cannabis
  * Target = Log1pDemand
  * Scorer = RMSE
  * Time = SalesDate
  * First experiment (120-0)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 120
    * Gap = 0
    * (Time on big: 7:55:00)
  * Second experiment (28-0)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 28
    * Gap = 0
    * (Time on big: 1:05:00)
  * Third experiment (28-1)
    * Dropped = None
    * Settings = 10/6/6
    * Days = 28
    * Gap = 1
    * (Time on big: 1:06:00)

In [49]:
dataPath = '/data/Training/WA_CannabisDailySales.csv'
basename = 'Cannabis'
target = 'Log1pDemandInThou'
time = 'SalesDate'
ratio = 0.75

In [43]:
data = h2oai.create_dataset_sync(dataPath)

In [44]:
# Split the data
split_data = h2oai.make_dataset_split(
    dataset_key = data.key
    , output_name1 = basename + "_train"
    , output_name2 = basename + "_test"
    , target = ""
    , fold_col = ""
    , time_col = time
    , ratio = ratio
)

In [45]:
train_key = h2oai.get_dataset_split_job(split_data).entity[0]
test_key  = h2oai.get_dataset_split_job(split_data).entity[1]

In [47]:
dropped = []
knobs = [6, 2, 6]
forecast = 28
gap = 1

In [50]:
experiment = h2oai.start_experiment_sync(
    dataset_key = train_key
    , testset_key = test_key
    , target_col = target
    , is_classification = False
    , accuracy = knobs[0]
    , time = knobs[1]
    , interpretability = knobs[2]
    , scorer = 'RMSE'
    , enable_gpus = True
    , cols_to_drop = dropped
    , time_col = time
    , num_prediction_periods = forecast
    , num_gap_periods = gap
)

ConnectionError: HTTPConnectionPool(host='54.81.113.228', port=12345): Max retries exceeded with url: /rpc (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11f6389b0>: Failed to establish a new connection: [Errno 50] Network is down',))

## 3. Diabetes Data

In [19]:
dataPath = '/data/Training/DiabetesNA.csv'
basename = 'Diabetes'
target = 'Outcome'
ratio = 0.8

In [4]:
data = h2oai.create_dataset_sync(dataPath)

In [17]:
# Split the data
split_data = h2oai.make_dataset_split(
    dataset_key = data.key
    , output_name1 = basename + "_train"
    , output_name2 = basename + "_test"
    , target = target
    , fold_col = ""
    , time_col = ""
    , ratio = ratio
)

In [35]:
train_key = h2oai.get_dataset_split_job(split_data).entity[0]
test_key  = h2oai.get_dataset_split_job(split_data).entity[1]

In [36]:
# let Driverless suggest parameters for experiment
params = h2oai.get_experiment_tuning_suggestion(
    dataset_key = train_key
    , target_col = target
    , is_classification = True
    , is_time_series = False
    , config_overrides = None)

RemoteError: {'code': 0, 'message': "KeyError: 'dataset.lufudama'"}

In [None]:
exp_preview = h2oai.get_experiment_preview_sync(
    dataset_key= train.key
    , validset_key=''
    , classification=True
    , dropped_cols = []
    , target_col=target
    , is_time_series = False
    , enable_gpus = True
    ,
                                                accuracy = 5, time = 5, interpretability = 5,
                                                config_overrides = None)
exp_preview

## (2) Titanic Data
## (3) Boston Housing Data
## (4) Washington Cannabis Data
## (5) Amazon Reviews Data
## (6) Credit Card Default Data

In [5]:
creditCardPath = '/data/Training/CreditCard.csv'
ccdata = h2oai.create_dataset_sync(creditCardPath)

In [6]:
dir(h2oai_client)

['ActVsPred',
 'Any',
 'AppVersion',
 'AutoDLInit',
 'AutoDLProgress',
 'AutoDLResult',
 'AutoReport',
 'AutoReportJob',
 'AutoVizBarcharts',
 'AutoVizBiplot',
 'AutoVizBoxplot',
 'AutoVizHistogram',
 'AutoVizJob',
 'AutoVizScatterplot',
 'AutoVizSummary',
 'AwsCredentials',
 'AwsLambdaParameters',
 'BarchartJob',
 'BoxplotJob',
 'Client',
 'ConfigItem',
 'ConfusionMatrix',
 'CreateDeploymentJob',
 'Dataset',
 'DatasetColumn',
 'DatasetColumnStats',
 'DatasetJob',
 'DatasetNonNumericColumnStats',
 'DatasetNumericColumnStats',
 'DatasetSplitJob',
 'DatasetSummary',
 'Deployment',
 'DestroyDeploymentJob',
 'DiskStats',
 'DotplotJob',
 'EchoStatus',
 'ExemplarRowsResponse',
 'ExperimentPreviewJob',
 'ExperimentPreviewResponse',
 'ExperimentScore',
 'ExperimentsStats',
 'ExportEntityJob',
 'FileSearchResult',
 'FileSearchResults',
 'GPUStats',
 'GainLift',
 'H2OAutoViz',
 'H2OBarchart',
 'H2OBoxplot',
 'H2OBoxplotEnvelope',
 'H2ODotplot',
 'H2OHeatMap',
 'H2ONetwork',
 'H2OOutliers',
 'H2O

In [29]:
## Get input parameters for a function
# h2oai.FUNCTION_OF_INTEREST.__code__.co_varnames
h2oai.get_experiment_tuning_suggestion.__code__.co_varnames

('self',
 'dataset_key',
 'target_col',
 'is_classification',
 'is_time_series',
 'config_overrides',
 'req_',
 'res_')

In [27]:
## See all information about an object
# h2oai.OBJECT.dump()
h2oai.train_key.dump()

AttributeError: 'Client' object has no attribute 'train_key'

In [21]:
all_data_sets = h2oai.list_datasets(0,100)

print(list(map(lambda x: x.name, all_data_sets)))
print(list(map(lambda x: x.key, all_data_sets)))

['Cannabis_test', 'Cannabis_train', 'WA_CannabisDailySales.csv', 'Housing_test', 'Housing_train', 'BostonHousing.csv', 'Titanic_test', 'Titanic_train', 'Titanic.csv']
['gumilumu', 'tevemome', 'vugiwife', 'vurogesa', 'navelemu', 'lohibigi', 'dibatena', 'hewaduho', 'wodipume']
