In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.model.PermutationFeatureImportance import permutation_featue_importance

In [2]:
h2o.init(strict_version_check=False)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,59 secs
H2O_cluster_timezone:,Europe/Prague
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.31.0.99999
H2O_cluster_version_age:,2 months and 9 days
H2O_cluster_name:,ard
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.708 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
df.describe()

Rows:380
Cols:9




Unnamed: 0,ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
type,int,int,int,int,int,int,real,real,int
mins,1.0,0.0,43.0,0.0,1.0,1.0,0.3,0.0,0.0
mean,190.5,0.4026315789473684,66.03947368421049,1.0868421052631572,2.2710526315789488,1.1078947368421048,15.408631578947375,15.812921052631573,6.3842105263157904
maxs,380.0,1.0,79.0,2.0,4.0,2.0,139.7,97.6,9.0
sigma,109.84079387914127,0.4910743389630552,6.527071269173311,0.3087732580252793,1.0001076181502861,0.3106564493514939,19.99757266856046,18.347619967271175,1.0919533744261092
zeros,0,227,0,3,0,0,0,167,2
missing,0,0,0,0,0,0,0,0,0
0,1.0,0.0,65.0,1.0,2.0,1.0,1.4,0.0,6.0
1,2.0,0.0,72.0,1.0,3.0,2.0,6.7,0.0,7.0
2,3.0,0.0,70.0,1.0,1.0,2.0,4.9,0.0,6.0


In [5]:
# Remove ID from training frame
train = df.drop("ID")

In [6]:
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None

In [7]:
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()

In [8]:
# See that the data is ready
train.describe()

Rows:380
Cols:8




Unnamed: 0,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON
type,enum,int,int,int,int,real,real,int
mins,,43.0,0.0,1.0,1.0,0.3,0.0,0.0
mean,,66.03947368421049,1.0868421052631572,2.2710526315789488,1.1078947368421048,15.408631578947375,15.812921052631573,6.3842105263157904
maxs,,79.0,2.0,4.0,2.0,139.7,97.6,9.0
sigma,,6.527071269173311,0.3087732580252793,1.0001076181502861,0.3106564493514939,19.99757266856046,18.347619967271175,1.0919533744261092
zeros,,0,3,0,0,0,167,2
missing,0,0,0,0,0,0,0,0
0,0,65.0,1.0,2.0,1.0,1.4,0.0,6.0
1,0,72.0,1.0,3.0,2.0,6.7,0.0,7.0
2,0,70.0,1.0,1.0,2.0,4.9,0.0,6.0


In [9]:
# Run GBM
my_gbm = H2OGradientBoostingEstimator(distribution = "bernoulli", ntrees=50, learn_rate=0.1)

my_gbm.train(x=list(range(1,train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [10]:
pfi_fr =permutation_featue_importance(train, my_gbm)

# pfi_fr

AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,ID
0.998047,0.995425,0.942769,0.930913,0.920991,0.889548,0.815841,Mean of the absolute value tsst
0.00110354,0.00244447,0.00605766,0.00324044,0.00929057,0.0106801,0.0150485,standard deviation


DCAPS,RACE,AGE,VOL,DPROS,PSA,GLEASON,ID
0.99718,0.996005,0.942409,0.930821,0.919717,0.908848,0.8194,Relative Importance
1.0,0.998822,0.945075,0.933453,0.922318,0.911418,0.821717,Scaled Importance
0.153074,0.152893,0.144666,0.142887,0.141183,0.139514,0.125783,Percentage


AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,ID
0.998047,0.995425,0.942769,0.930913,0.920991,0.889548,0.815841,Mean of the absolute value tsst
0.00110354,0.00244447,0.00605766,0.00324044,0.00929057,0.0106801,0.0150485,standard deviation


