-
Notifications
You must be signed in to change notification settings - Fork 2k
/
pyunit_pubdev_4697_early_stop_gbm.py
74 lines (57 loc) · 2.64 KB
/
pyunit_pubdev_4697_early_stop_gbm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
import random
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import numpy as np
import pandas as pd
from sklearn import datasets
# Number of rows generated for the synthetic dataset.
NUM_SAMPLES = 1000
# Upper bound on GBM trees (passed as ntrees); the test asserts that training stops before reaching it.
NUM_TREES = 5000
# A user noticed that our early stopping failed on certain special datasets; it turned out to be a bug.
# For decreasing metrics, when lastInK drops to zero, no further improvement is possible, so
# the early-stopping check should return true instead of false. Good catch and thank you, Craig Milhiser!
def generate_baseline_data(include_cat):
    """Build a synthetic binary-classification dataset as a pandas DataFrame.

    Features come from ``sklearn.datasets.make_friedman1`` (``NUM_SAMPLES``
    rows, 5 features, fixed ``random_state``). The continuous Friedman target
    is passed through a logistic function and the result is used as the
    success probability of one Bernoulli draw per row, giving a 0/1
    response ``y``.

    The Bernoulli draw uses NumPy's global random state, so callers that need
    reproducible data must seed ``np.random`` before calling.

    :param include_cat: when truthy, add a string column ``c`` that is ``'A'``
        where ``y == 1`` and ``'B'`` otherwise, i.e. a column that encodes the
        response exactly.
    :returns: DataFrame with columns ``y, x0, x1, x2, x3, x4`` plus ``c``
        when requested.
    """
    X, y = datasets.make_friedman1(n_samples=NUM_SAMPLES, n_features=5,
                                   noise=100, random_state=1)
    # Convert the regression target to a binomial response.
    prob = 1 / (1 + np.exp(-y))
    y = np.random.binomial(1, prob)
    # Event rate is the fraction of positive rows, i.e. the mean of the 0/1 response.
    print('Event rate = {0:4.4f}'.format(np.mean(y)))
    data = np.hstack((y.reshape(-1, 1), X))
    data = pd.DataFrame(data, columns=['y', 'x0', 'x1', 'x2', 'x3', 'x4'])
    if include_cat:
        # Vectorized labelling instead of a Python-level lambda per row.
        data['c'] = np.where(data['y'] == 1, 'A', 'B')
    return data
def test_early_stop_gbm():
    """Generate the synthetic data, load it into H2O, split it and train.

    Both the stdlib and NumPy random generators are seeded before the data is
    generated. The frame is split 80/20 into training and validation frames,
    which are handed to ``train_it`` together with the predictor names.
    """
    random.seed(1)
    np.random.seed(1)

    pandas_df = generate_baseline_data(include_cat=True)

    # Response 'y' and the categorical 'c' are enums; the five x-features are real.
    col_types = ['enum'] + ['real'] * 5 + ['enum']
    h2o_frame = h2o.H2OFrame(pandas_df,
                             destination_frame='data_cat',
                             column_types=col_types)

    splits = h2o_frame.split_frame(ratios=[0.8],
                                   destination_frames=['train_cat', 'validate_cat'],
                                   seed=1)
    predictors = ['x0', 'x1', 'x2', 'x3', 'x4', 'c']
    train_it(splits, predictors)
def train_it(frames, x):
    """Train a bernoulli GBM with early stopping and check that it stops early.

    :param frames: indexable pair of H2O frames; ``frames[0]`` is used for
        training and ``frames[1]`` for validation.
    :param x: list of predictor column names; the response column is ``'y'``.
    :raises AssertionError: if the number of trees reported in the model
        summary is not below ``NUM_TREES``.
    """
    estimator = H2OGradientBoostingEstimator(distribution='bernoulli',
                                             ntrees=NUM_TREES,
                                             learn_rate=0.1,
                                             nfolds=0,
                                             score_tree_interval=20,
                                             stopping_rounds=3,
                                             stopping_tolerance=0.001,
                                             seed=1)
    estimator.train(x=x,
                    y='y',
                    training_frame=frames[0],
                    validation_frame=frames[1])

    # Read the number of trees actually built from the model summary table.
    num_trees_trained = int(estimator.summary()
                            .as_data_frame()['number_of_trees']
                            .to_numpy()[0])
    print('num trees trained = {0}'.format(num_trees_trained))
    # Hitting the ntrees cap means the early-stopping criterion never fired.
    assert num_trees_trained < NUM_TREES, \
        "Early stopping is not working: trained {0} of {1} trees.".format(num_trees_trained, NUM_TREES)
# Entry point: run the test directly when this module is not the main script,
# otherwise hand it to pyunit_utils.standalone_test.
if __name__ != "__main__":
    test_early_stop_gbm()
else:
    pyunit_utils.standalone_test(test_early_stop_gbm)