## H2O Model Playground

In [14]:
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

file_path = "../data/mini_all_features.csv"

In [15]:
from pyspark.sql import SparkSession
from pysparkling import *
import h2o



# Create Spark session
spark = SparkSession.builder.appName("YourAppName").getOrCreate()

h2o.init()
# Initialize H2OContext
hc = H2OContext.getOrCreate()

# Get H2OConf from H2OContext
h2o_conf = hc.getConf()

# Set H2OConf properties
h2o_conf.set("spark.ext.h2o.client.language", "python")


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,4 hours 15 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 1 day
H2O_cluster_name:,sparkling-water-jacobfletcher_local-1705869136820
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,821 Mb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Sparkling Water configuration:
  backend cluster mode : internal
  workers              : None
  cloudName            : Not set yet, it will be set automatically before starting H2OContext.
  base port            : 54321
  cloudTimeout         : 60000
  log level            : INFO
  nthreads             : -1
  drddMulFactor        : 10




In [16]:
h2o_df = h2o.import_file(file_path)
# h2o_df = spark.read.option("header", True).csv(file_path)
h2o_df = h2o_df.drop("userId")

try:
    h2o_df.drop('C1')
    h2o_df.drop('userId')
except Exception as e:
    pass

col_features = h2o_df.columns
col_features.remove("label")
# Split the data into training and validation sets
# Split the data into training and validation sets
train, valid = h2o_df.split_frame(ratios=[0.85])

# Define predictor and response columns
predictor_cols = col_features
response_col = "label"




Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [17]:


# Define hyperparameter grid
hyperparams = {
    'alpha': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],  # Regularization parameter
    'lambda': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0],
}

# Initialize the linear regression model
linear_reg = H2OGeneralizedLinearEstimator(family='binomial', link="logit")

# Perform grid search
grid_search = H2OGridSearch(linear_reg, hyperparams)
grid_search.train(x=predictor_cols, y=response_col, training_frame=train)

# Get the best model from the grid search
best_model = grid_search.get_grid()[0]

# Make predictions on the validation set
preds = best_model.predict(valid)

# Get model performance on the validation set
validation_performance = best_model.model_performance(valid)


glm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Adding alpha array to hyperparameter runs slower with gridsearch. This is due to the fact that the algo has to run initialization for every alpha value. Setting the alpha array as a model parameter will skip the initialization and run faster overall.


glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [18]:

print(validation_performance.F1())
print(validation_performance.F2())

[[0.24181351431681494, 0.8333333333333334]]
[[0.09730631198576863, 0.8593749999999999]]


In [19]:
validation_performance

Unnamed: 0,0,1,Error,Rate
0,19.0,2.0,0.0952,(2.0/21.0)
1,2.0,10.0,0.1667,(2.0/12.0)
Total,21.0,12.0,0.1212,(4.0/33.0)

metric,threshold,value,idx
max f1,0.2418135,0.8333333,11.0
max f2,0.0973063,0.859375,15.0
max f0point5,0.2418135,0.8333333,11.0
max accuracy,0.2418135,0.8787879,11.0
max precision,0.9999581,1.0,0.0
max recall,0.000704,1.0,28.0
max specificity,0.9999581,1.0,0.0
max absolute_mcc,0.2418135,0.7380952,11.0
max min_per_class_accuracy,0.2418135,0.8333333,11.0
max mean_per_class_accuracy,0.2418135,0.8690476,11.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.030303,0.9998636,2.75,2.75,1.0,0.9999581,1.0,0.9999581,0.0833333,0.0833333,175.0,175.0,0.0833333
2,0.030303,0.999769,0.0,2.75,0.0,0.0,1.0,0.9999581,0.0,0.0833333,-100.0,175.0,0.0833333
3,0.030303,0.9996744,0.0,2.75,0.0,0.0,1.0,0.9999581,0.0,0.0833333,-100.0,175.0,0.0833333
4,0.0606061,0.9295166,0.0,1.375,0.0,0.9996626,0.5,0.9998104,0.0,0.0833333,-100.0,37.5,0.0357143
5,0.0606061,0.8493497,0.0,1.375,0.0,0.0,0.5,0.9998104,0.0,0.0833333,-100.0,37.5,0.0357143
6,0.1212121,0.6880586,1.375,1.375,0.5,0.7187054,0.5,0.8592579,0.0833333,0.1666667,37.5,37.5,0.0714286
7,0.1515152,0.6618601,2.75,1.65,1.0,0.687214,0.6,0.8248491,0.0833333,0.25,175.0,65.0,0.1547619
8,0.2121212,0.6123201,2.75,1.9642857,1.0,0.6541451,0.7142857,0.7760765,0.1666667,0.4166667,175.0,96.4285714,0.3214286
9,0.3030303,0.3847035,2.75,2.2,1.0,0.525794,0.8,0.7009918,0.25,0.6666667,175.0,120.0,0.5714286
10,0.3939394,0.1166487,1.8333333,2.1153846,0.6666667,0.232082,0.7692308,0.5927818,0.1666667,0.8333333,83.3333333,111.5384615,0.6904762


## H2O Gradient Boosting

In [20]:



h2o_df = h2o.import_file(file_path)
# h2o_df = spark.read.option("header", True).csv(file_path)
h2o_df = h2o_df.drop("userId")

try:
    h2o_df.drop('C1')
    h2o_df.drop('userId')
except Exception as e:
    pass

col_features = h2o_df.columns
col_features.remove("label")
# Split the data into training and validation sets
# Split the data into training and validation sets
train, valid = h2o_df.split_frame(ratios=[0.85])

# Define predictor and response columns
predictor_cols = col_features
response_col = "label"


# Define hyperparameter grid
hyperparams = {
    'learn_rate': [0.01, 0.1, 0.2],
    'ntrees': [50, 100, 200],
    'max_depth': [3, 4, 5],
}

gradient_boost = H2OGradientBoostingEstimator(seed=42)


# Perform grid search
grid_search = H2OGridSearch(gradient_boost, hyperparams)
grid_search.train(x=predictor_cols, y=response_col, training_frame=train)

# Get the best model from the grid search
best_model = grid_search.get_grid()[0]

# Make predictions on the validation set
preds = best_model.predict(valid)

# Get model performance on the validation set
validation_performance = best_model.model_performance(valid)



Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
gbm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [None]:

validation_performance

## Distributed Random Forrest

In [27]:
from h2o.estimators import H2ORandomForestEstimator
h2o_df = h2o.import_file(file_path)
# h2o_df = spark.read.option("header", True).csv(file_path)
h2o_df = h2o_df.drop("userId")

try:
    h2o_df.drop('C1')
    h2o_df.drop('userId')
except Exception as e:
    pass

col_features = h2o_df.columns
col_features.remove("label")
# Split the data into training and validation sets
# Split the data into training and validation sets
train, valid = h2o_df.split_frame(ratios=[0.85])

# Define predictor and response columns
predictor_cols = col_features
response_col = "label"


# Define hyperparameter grid
hyperparams = {
    'ntrees': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_rows': [1, 5, 10],
    'sample_rate': [0.7, 0.8, 0.9],
    'col_sample_rate_per_tree': [0.7, 0.8, 0.9],
    'min_split_improvement': [1e-4, 1e-3, 1e-2]
}

drf_estimator = H2ORandomForestEstimator(
    calibrate_model=False,
    calibration_frame=valid,
    binomial_double_trees=True,
    nfolds=5,  # Specify the number of folds for cross-validation
    fold_assignment='Stratified',  # Use stratified cross-validation for binomial classification
    keep_cross_validation_predictions=True  # Keep predictions on the validation folds
)



grid_search = H2OGridSearch(drf_estimator, hyperparams)
grid_search.train(x=predictor_cols, y=response_col, training_frame=train)

best_model = grid_search.get_grid()[0]

preds = best_model.predict(valid)

validation_performance = best_model.model_performance(valid)



# predictors = [col for col in train_data.columns if col.endswith("_scaled")]
# response = "label"

# model_drf = estimator.train(x=predictors,
#                y=response,
#                training_frame=train_scaled_data,
#                validation_frame=test_scaled_data)



# # Eval performance:
# perf = model_drf.model_performance()

# # Generate predictions on a validation set (if necessary):
# pred = model_drf.predict(test_scaled_data)

Parse progress: |

████████████████████████████████████████████████████████████████| (done) 100%
drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%

Hyper-parameter: col_sample_rate_per_tree, 0.8
Hyper-parameter: max_depth, 15
Hyper-parameter: min_rows, 1.0
Hyper-parameter: min_split_improvement, 0.001
Hyper-parameter: ntrees, 150
Hyper-parameter: sample_rate, 0.7
failure_details: Illegal argument(s) for DRF model: Grid_DRF_py_35_sid_8c18_model_python_1705869686358_60_model_197_cv_1.  Details: ERRR on field: _ntrees: The tree model will not fit in the driver node's memory ( 323  B per tree x 150 > Zero  ) - try decreasing ntrees and/or max_depth or increasing min_rows!

failure_stack_traces: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for DRF model: Grid_DRF_py_35_sid_8c18_model_python_1705869686358_60_model_197_cv_1.  Details: ERRR on field: _ntrees: The tree model will not fit in the driver node's memory ( 323  B per tree x 1

In [33]:

validation_performance

In [34]:
h2o.cluster().shutdown()

H2O session _sid_8c18 closed.
