In [99]:
# import libraries
import pandas as pd
import numpy as np
import pathlib
from sklearn import datasets
import plotly.express as px
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

pd.set_option('display.max_columns', 50)

In [100]:
# import iris dataset
iris = datasets.load_iris()

iris_df = pd.DataFrame(
    data=np.concatenate([iris["data"], iris["target"].reshape(150,1)], axis=1),
    columns= ["sepal_length_cm", "sepal_width_cm", "petal_length_cm", "petal_width_cm", "target"]
)

iris_df = iris_df.merge(right=pd.DataFrame({"target":[0.0, 1.0, 2.0], "species":["setosa", "versicolor","virginica"]}), on="target", how="left")

In [101]:
# data exploration
iris_df.groupby("species").describe()

Unnamed: 0_level_0,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_length_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,sepal_width_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_length_cm,petal_width_cm,petal_width_cm,petal_width_cm,petal_width_cm,petal_width_cm,petal_width_cm,petal_width_cm,petal_width_cm,target,target,target,target,target,target,target,target
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.428,0.379064,2.3,3.2,3.4,3.675,4.4,50.0,1.462,0.173664,1.0,1.4,1.5,1.575,1.9,50.0,0.246,0.105386,0.1,0.2,0.2,0.3,0.6,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,0.313798,2.0,2.525,2.8,3.0,3.4,50.0,4.26,0.469911,3.0,4.0,4.35,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8,50.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,0.322497,2.2,2.8,3.0,3.175,3.8,50.0,5.552,0.551895,4.5,5.1,5.55,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5,50.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


In [102]:
# data visualisation - Petal vs species
fig = px.scatter(iris_df, x="petal_width_cm", y="petal_length_cm", color="species") 
fig.show()

In [103]:
# data visualisation - correlation matrix
fig = px.imshow(iris_df.iloc[:, 0:4].corr(), color_continuous_scale="blues") 
fig.show()

In [104]:
# initiate h2o instance
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,13 hours 25 mins
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.4
H2O_cluster_version_age:,9 months and 23 days !!!
H2O_cluster_name:,H2O_from_python_Huang_z42nio
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.952 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [107]:
# load dataset into h2o
iris_hdf = h2o.H2OFrame(iris_df.iloc[:, iris_df.columns!="target"])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [108]:
# train model - do not split into test and train since the data size is tiny
iris_gbm = H2OGradientBoostingEstimator(
    distribution="multinomial",
    ntrees=500,
    learn_rate=0.05,
    stopping_rounds=20,
    stopping_metric="AUC",
    auc_type="WEIGHTED_OVR",
    seed=715)
iris_gbm.train(x = list(iris_df.columns[0:4]), y = "species", training_frame = iris_hdf)
# iris_xgb.model_performance(valid).auc()


early stopping is enabled but neither score_tree_interval or score_each_iteration are defined. Early stopping will not be reproducible!



gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1674387121940_5


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,43.0,129.0,23481.0,1.0,5.0,4.906977,2.0,12.0,9.868217




ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 0.016995018106015954
RMSE: 0.13036494201285848
LogLoss: 0.09624239211447781
Mean Per-Class Error: 0.02
AUC: 0.9997333333333334
AUCPR: 0.999474389957814

Multinomial AUC values: 


Unnamed: 0,type,first_class_domain,second_class_domain,auc
0,setosa vs Rest,setosa,,1.0
1,versicolor vs Rest,versicolor,,0.9994
2,virginica vs Rest,virginica,,0.9998
3,Macro OVR,,,0.999733
4,Weighted OVR,,,0.999733
5,Class setosa vs. versicolor,setosa,versicolor,1.0
6,Class setosa vs. virginica,setosa,virginica,1.0
7,Class versicolor vs. virginica,versicolor,virginica,0.9992
8,Macro OVO,,,0.999733
9,Weighted OVO,,,0.999733



Multinomial auc_pr values: 


Unnamed: 0,type,first_class_domain,second_class_domain,auc_pr
0,setosa vs Rest,setosa,,1.0
1,versicolor vs Rest,versicolor,,0.998819
2,virginica vs Rest,virginica,,0.999604
3,Macro OVR,,,0.999474
4,Weighted OVR,,,0.999474
5,Class setosa vs. versicolor,setosa,versicolor,1.0
6,Class setosa vs. virginica,setosa,virginica,1.0
7,Class versicolor vs. virginica,versicolor,virginica,0.999212
8,Macro OVO,,,0.999737
9,Weighted OVO,,,0.999737



Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,setosa,versicolor,virginica,Error,Rate
0,50.0,0.0,0.0,0.0,0 / 50
1,0.0,47.0,3.0,0.06,3 / 50
2,0.0,0.0,50.0,0.0,0 / 50
3,50.0,47.0,53.0,0.02,3 / 150



Top-3 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.98
1,2,1.0
2,3,1.0



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc
0,,2023-01-23 12:02:17,0.005 sec,0.0,0.666667,1.098612,0.64,0.5,0.333333
1,,2023-01-23 12:02:17,0.091 sec,1.0,0.635829,1.010264,0.04,0.994,0.988494
2,,2023-01-23 12:02:17,0.105 sec,2.0,0.606043,0.931892,0.04,0.995333,0.990962
3,,2023-01-23 12:02:17,0.129 sec,3.0,0.577461,0.862121,0.04,0.995467,0.991218
4,,2023-01-23 12:02:17,0.139 sec,4.0,0.549999,0.799366,0.046667,0.996067,0.992812
5,,2023-01-23 12:02:17,0.155 sec,5.0,0.524012,0.743388,0.04,0.996267,0.993174
6,,2023-01-23 12:02:17,0.169 sec,6.0,0.499152,0.692221,0.046667,0.998067,0.996238
7,,2023-01-23 12:02:17,0.172 sec,7.0,0.475336,0.645615,0.046667,0.998267,0.996673
8,,2023-01-23 12:02:17,0.191 sec,8.0,0.452873,0.603229,0.046667,0.997867,0.995898
9,,2023-01-23 12:02:17,0.205 sec,9.0,0.431495,0.564478,0.046667,0.998067,0.99626



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,petal_width_cm,520.333557,1.0,0.594747
1,petal_length_cm,348.451599,0.66967,0.398284
2,sepal_width_cm,4.258981,0.008185,0.004868
3,sepal_length_cm,1.838512,0.003533,0.002101




In [109]:
# see performance by tree iterations
iris_gbm.scoring_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc
0,,2023-01-23 12:02:17,0.005 sec,0.0,0.666667,1.098612,0.64,0.5,0.333333
1,,2023-01-23 12:02:17,0.091 sec,1.0,0.635829,1.010264,0.04,0.994,0.988494
2,,2023-01-23 12:02:17,0.105 sec,2.0,0.606043,0.931892,0.04,0.995333,0.990962
3,,2023-01-23 12:02:17,0.129 sec,3.0,0.577461,0.862121,0.04,0.995467,0.991218
4,,2023-01-23 12:02:17,0.139 sec,4.0,0.549999,0.799366,0.046667,0.996067,0.992812
5,,2023-01-23 12:02:17,0.155 sec,5.0,0.524012,0.743388,0.04,0.996267,0.993174
6,,2023-01-23 12:02:17,0.169 sec,6.0,0.499152,0.692221,0.046667,0.998067,0.996238
7,,2023-01-23 12:02:17,0.172 sec,7.0,0.475336,0.645615,0.046667,0.998267,0.996673
8,,2023-01-23 12:02:17,0.191 sec,8.0,0.452873,0.603229,0.046667,0.997867,0.995898
9,,2023-01-23 12:02:17,0.205 sec,9.0,0.431495,0.564478,0.046667,0.998067,0.99626


In [110]:
# save model
iris_gbm.download_mojo("modelfiles/irisgbm_20230123.zip")

'c:\\Users\\Huang\\PythonProjects\\p_h2o_gcp_deploy\\modelfiles\\irisgbm_20230123.zip'

In [111]:
prediction = iris_gbm.predict(iris_hdf[1,:])

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
