-
Notifications
You must be signed in to change notification settings - Fork 2k
/
pyunit_H2OXGBoost_native_XGBoost_enum_multinomial_compare_sparse_singleNode.py
85 lines (77 loc) · 3.62 KB
/
pyunit_H2OXGBoost_native_XGBoost_enum_multinomial_compare_sparse_singleNode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys
sys.path.insert(1,"../../../")
import time
from h2o.estimators.xgboost import *
from tests import pyunit_utils
'''
The goal of this test is to compare the results of H2OXGBoost and natibve XGBoost for binomial classification.
The dataset contains only enum columns.
'''
def comparison_test():
if sys.version.startswith("2"):
print("native XGBoost tests only supported on python3")
return
import xgboost as xgb
assert H2OXGBoostEstimator.available() is True
ret = h2o.cluster()
if len(ret.nodes) == 1:
runSeed = 1
dataSeed = 17
ntrees = 17
responseL = 11
# CPU Backend is forced for the results to be comparable
h2oParamsS = {"ntrees":ntrees, "max_depth":4, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
"min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}
nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
'tree_method': 'exact',
'seed': h2oParamsS["seed"],
'booster': 'gbtree',
'objective': 'multi:softprob',
'eta': h2oParamsS["learn_rate"],
'grow_policy': 'depthwise',
'alpha': 0.0,
'subsample': 1.0,
'colsample_bylevel': 1.0,
'max_delta_step': 0.0,
'min_child_weight': h2oParamsS["min_rows"],
'gamma': 0.0,
'max_depth': h2oParamsS["max_depth"],
'num_class':responseL}
nrows = 10000
ncols = 10
factorL = 11
numCols = 0
enumCols = ncols-numCols
trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
responseLevel=responseL, randseed=dataSeed) # load in dataset and add response column
print(trainFile)
myX = trainFile.names
y='response'
myX.remove(y)
nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)
h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
# gather, print and save performance numbers for h2o model
h2oModelS.train(x=myX, y=y, training_frame=trainFile)
h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
time1 = time.time()
h2oPredictS = h2oModelS.predict(trainFile)
h2oPredictTimeS = time.time()-time1
# train the native XGBoost
nrounds = ntrees
nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
modelInfo = nativeModel.get_dump()
print(modelInfo)
print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
nativeTrainTime = time.time()-time1
time1=time.time()
nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
nativeScoreTime = time.time()-time1
print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred, h2oTrainTimeS, nativeTrainTime, h2oPredictTimeS,
nativeScoreTime, tolerance=1e-6)
else:
print("******** Test skipped. This test cannot be performed in multinode environment.")
if __name__ == "__main__":
pyunit_utils.standalone_test(comparison_test)
else:
comparison_test()