In [34]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn import metrics

In [35]:
%matplotlib inline
sns.set()
warnings.filterwarnings(action="ignore")

In [36]:
# Connect to an H2O cluster (the recorded output shows an already-running
# local instance at localhost:54321 being reused).
h2o.init()
# Wipe every object held by that cluster so the notebook starts from a clean
# slate. NOTE(review): this also deletes frames/models created by any other
# session attached to the same cluster.
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,23 hours 51 mins
H2O cluster timezone:,Europe/Oslo
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.3
H2O cluster version age:,2 months and 4 days
H2O cluster name:,H2O_from_python_gunnarwindsand_4qa5qc
H2O cluster total nodes:,1
H2O cluster free memory:,3.851 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [37]:
# Load the training set, the test set and the sample submission as H2O frames.
# The paths are relative to the notebook's working directory.
train_h2o, test_h2o, sub_h20 = (
    h2o.import_file(csv_path)
    for csv_path in ("train.csv", "test.csv", "sampleSubmission.csv")
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [38]:
# Pandas copies of both frames, used by the scikit-learn feature pipeline.
train, test = train_h2o.as_data_frame(), test_h2o.as_data_frame()

In [39]:
# Cast the response column to a factor (categorical) column, in place.
# NOTE(review): presumably so H2O fits classifiers rather than regressors;
# the later cells report AUC.
train_h2o["target"] = train_h2o["target"].asfactor()

In [40]:
# Response column name and list of predictor columns.
#
# The predictors are obtained by filtering instead of in-place `remove` calls
# so that the cell is idempotent: re-running it after the fold column has been
# added to `train_h2o` must not let "fold_id" leak in as a predictor.
# On a fresh run the result is the same as before: every column except
# "target" and "id", in frame order.
y = "target"
non_predictors = {y, "id", "fold_id"}
x = [col for col in train_h2o.columns if col not in non_predictors]

In [41]:
# Deterministic cross-validation assignment: each row's fold is its `id`
# modulo 10, stored as a new "fold_id" column on the training frame.
# The same column is passed as `fold_column` to the models trained below.
fold_column_name = "fold_id"
train_h2o[fold_column_name] = train_h2o["id"] % 10

In [42]:
# Names of the categorical (factor) predictor columns; the response is excluded.
# Column names and factor flags are read straight from the H2O frame instead of
# converting the whole frame to pandas just to index its column labels.
cat_vars = [
    col
    for col, is_factor in zip(train_h2o.columns, train_h2o.isfactor())
    if is_factor and col != "target"
]

In [43]:
# Names of the non-factor predictor columns; the row id and the CV fold column
# are bookkeeping, not features, so they are excluded.
# Column names and factor flags are read straight from the H2O frame instead of
# converting the whole frame to pandas just to index its column labels.
num_vars = [
    col
    for col, is_factor in zip(train_h2o.columns, train_h2o.isfactor())
    if not is_factor and col not in ("id", "fold_id")
]

In [44]:
# One-hot encode the categorical predictors of train and test *together* so
# both matrices end up with exactly the same dummy columns in the same order.
#
# The raw test categoricals are stacked under the raw train categoricals and
# encoded once. (Encoding the test part first and concatenating it with raw
# train columns, as was done previously, puts train and test indicators in
# different columns and leaves the other half filled with zeros.)
cat_combined = pd.concat([train[cat_vars], test[cat_vars]])
cat_dummies = pd.get_dummies(cat_combined).fillna(0.)

# Split back by position; the original row indices of each part are preserved,
# which the later column-wise concat with the numeric features relies on.
n_train_rows = train.shape[0]
X_sklearn_train = cat_dummies.iloc[:n_train_rows]
X_sklearn_test = cat_dummies.iloc[n_train_rows:]

In [45]:
# Append the numeric predictors to the one-hot encoded training matrix.
# Any numeric columns already present are dropped first so that re-running
# this cell does not duplicate them (the cell reassigns its own input).
X_sklearn_train = pd.concat(
    [X_sklearn_train.drop(columns=num_vars, errors="ignore"), train[num_vars]],
    axis=1,
)

In [46]:
# Append the numeric predictors to the one-hot encoded test matrix.
# Any numeric columns already present are dropped first so that re-running
# this cell does not duplicate them (the cell reassigns its own input).
X_sklearn_test = pd.concat(
    [X_sklearn_test.drop(columns=num_vars, errors="ignore"), test[num_vars]],
    axis=1,
)

In [47]:
# Learn the per-feature min/max on the training matrix only, then rescale it.
minmax_scaler = MinMaxScaler().fit(X_sklearn_train)
X_sklearn_train_scaled = minmax_scaler.transform(X_sklearn_train)

In [48]:
# Rescale the test matrix with the scaler already fitted on the training
# matrix (no refit here), so both matrices share the same feature mapping.
X_sklearn_test_scaled = minmax_scaler.transform(X_sklearn_test)

In [49]:
# Search criteria shared by the H2O grid searches below: random sampling of
# the hyper-parameter grid, capped at 10 models or 28800 s (8 h) of runtime,
# with a fixed seed for the sampling.
criteria = dict(
    strategy="RandomDiscrete",
    max_models=10,
    max_runtime_secs=28800,
    seed=1,
)

# Random forest

In [21]:
# Hyper-parameter space for the random forest; `criteria` controls how many
# combinations are sampled from it.
rf_params = {'max_depth': [3, 5, 9, 11, 13, 15],
             'sample_rate': [0.8, 0.9, 1.0],
             'col_sample_rate_per_tree': [0.2, 0.5, 0.8, 1.0]}

rf_grid = H2OGridSearch(model=H2ORandomForestEstimator,
                        grid_id='rf_grid',
                        hyper_params=rf_params,
                        search_criteria=criteria)
# Every candidate is cross-validated on the folds defined by `fold_column_name`.
rf_grid.train(x=x, y=y,
              training_frame=train_h2o,
              fold_column=fold_column_name,
              ntrees=250,
              seed=1)

# Rank the grid explicitly by AUC so the reported model really is the best one
# by the metric being printed, rather than relying on the grid's default order.
rf_best = rf_grid.get_grid(sort_by="auc", decreasing=True).models[0]

# The CV summary keeps the metric names in its first column; index by that
# name instead of by a hard-coded row position, which depends on the H2O version.
rf_cv_summary = rf_best.cross_validation_metrics_summary().as_data_frame()
rf_cv_summary = rf_cv_summary.set_index(rf_cv_summary.columns[0])

print("Random Forest AUC", rf_cv_summary.loc["auc", "mean"])

drf Grid Build progress: |████████████████████████████ (cancelled)


H2OJobCancelled: Job<$03017f00000132d4ffffffff$_a3a48637dd688d7d7d23016528674f9e> was cancelled by the user.

# GBM

In [None]:
# Hyper-parameter space for the GBM; `criteria` controls how many
# combinations are sampled from it.
gbm_params = {'learn_rate': [0.01, 0.05, 0.1, 0.5],
              'max_depth': [3, 5, 9, 11, 13, 15],
              'sample_rate': [0.8, 0.9, 1.0],
              'col_sample_rate': [0.2, 0.5, 0.8, 1.0]}

gbm_grid = H2OGridSearch(model=H2OGradientBoostingEstimator,
                         grid_id='gbm_grid',
                         hyper_params=gbm_params,
                         search_criteria=criteria)
# Every candidate is cross-validated on the folds defined by `fold_column_name`.
gbm_grid.train(x=x, y=y,
               training_frame=train_h2o,
               fold_column=fold_column_name,
               ntrees=200,
               seed=1)

# Rank the grid explicitly by AUC so the reported model really is the best one
# by the metric being printed, rather than relying on the grid's default order.
gbm_best = gbm_grid.get_grid(sort_by="auc", decreasing=True).models[0]

# The CV summary keeps the metric names in its first column; index by that
# name instead of by a hard-coded row position, which depends on the H2O version.
gbm_cv_summary = gbm_best.cross_validation_metrics_summary().as_data_frame()
gbm_cv_summary = gbm_cv_summary.set_index(gbm_cv_summary.columns[0])

print("GBM AUC:", gbm_cv_summary.loc["auc", "mean"])

In [52]:
# Echo the response column name (interactive sanity check).
y

'target'

# AutoML

In [53]:
# Let H2O AutoML search over model families within a fixed budget
# (at most 1000 models or 100 seconds, whichever is hit first).
# NOTE(review): the recorded run produced an empty leaderboard; the 100 s
# budget may be too small for any model to finish. Confirm and raise
# `max_runtime_secs` if so.
aml = H2OAutoML(max_models=1000, seed=1, max_runtime_secs=100)

# Use the same fold assignment as the grid searches and the deep-learning
# model so the cross-validated scores on the leaderboard are comparable.
aml.train(x=x, y=y, training_frame=train_h2o, fold_column=fold_column_name)

# View the AutoML leaderboard: all rows instead of the default 10.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

AutoML progress: |████████████████████████████████████████████████████████| 100%
This H2OFrame is empty.




In [56]:
# Hand-tuned feed-forward network, cross-validated on the same folds as the
# tree-based models.
DL_tuned = H2ODeepLearningEstimator(
    model_id="DL",
    seed=1,
    # Architecture and training length: three hidden layers of 128 units.
    hidden=[128, 128, 128],
    epochs=100,
    overwrite_with_best_model=False,
    # Scoring budget: score on at most 10000 validation samples and spend no
    # more than 2.5% of wall time on scoring.
    score_validation_samples=10000,
    score_duty_cycle=0.025,
    # Manual learning-rate schedule instead of the adaptive rate.
    adaptive_rate=False,
    rate=0.01,
    rate_annealing=0.000002,
    # Manual momentum schedule.
    momentum_start=0.2,
    momentum_stable=0.4,
    momentum_ramp=10000000,
    # Light L1/L2 regularisation plus a cap on the squared incoming weights.
    l1=0.00001,
    l2=0.00001,
    max_w2=10.,
)
DL_tuned.train(
    x=x,
    y=y,
    training_frame=train_h2o,
    fold_column=fold_column_name,
)


deeplearning Model Build progress: |██████████████████████████████████████| 100%


AttributeError: type object 'H2ODeepLearningEstimator' has no attribute 'models'

In [None]:
# `DL_tuned` is a single estimator, not a grid, so it has no `.models` list;
# the cross-validation summary is read from the estimator itself.
# The summary keeps the metric names in its first column, so the AUC row is
# looked up by name rather than by position.
dl_cv_summary = DL_tuned.cross_validation_metrics_summary().as_data_frame()
dl_cv_summary = dl_cv_summary.set_index(dl_cv_summary.columns[0])
print("DL AUC", dl_cv_summary.loc["auc", "mean"])

In [59]:
# Mean cross-validated AUC of the deep-learning model.
# The summary keeps the metric names in its first column; look the AUC row up
# by name instead of by a hard-coded row position, which depends on the H2O
# version's metric ordering.
dl_cv_summary = DL_tuned.cross_validation_metrics_summary().as_data_frame()
dl_cv_summary = dl_cv_summary.set_index(dl_cv_summary.columns[0])
print("DL AUC", dl_cv_summary.loc["auc", "mean"])

DL AUC 0.80159724


In [None]:
# Build the submission: take the sample-submission layout, overwrite its
# `target` column with column 5 of the stacked test predictions, and write it
# to CSV without the pandas index.
# NOTE(review): `X_stack_test` is not defined in any of the visible cells;
# confirm it is created before this cell runs on a fresh kernel.
(
    sub_h20.as_data_frame()
    .assign(target=X_stack_test[:, 5])
    .to_csv("mlp_sub_20.csv", index=False)
)