-
Notifications
You must be signed in to change notification settings - Fork 2k
/
pyunit_smallcatRF.py
50 lines (38 loc) · 1.83 KB
/
pyunit_smallcatRF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
sys.path.insert(1, "../../../")
import h2o
import numpy as np
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
def smallcatRF(ip, port):
    """Check that an H2O random forest is no worse than scikit-learn's on a small categorical set.

    Both libraries fit a single tree of depth one on the one-predictor
    alphabet data set and are scored by AUC on that same training data.
    Per the notes that accompany the data: the predictor has 26 categories,
    A through Z, where A, C, E, G, ... perfectly predict y = 1 and
    B, D, F, H, ... perfectly predict y = 0.

    Args:
        ip: Address passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: If the H2O AUC is lower than the scikit-learn AUC.
    """
    dataset = "smalldata/gbm_test/alphabet_cattest.csv"

    def _letter_to_code(field):
        # The first column is a quoted letter; keep what sits between the
        # first pair of double quotes and map it to its character code.
        return ord(field.split("\"")[1])

    # Connect to H2O and load the frame, turning the response into a factor.
    h2o.init(ip, port)
    alphabet = h2o.import_frame(path=h2o.locate(dataset))
    alphabet["y"] = alphabet["y"].asfactor()

    # Load the same file for scikit-learn, skipping the header row.
    table = np.loadtxt(
        h2o.locate(dataset),
        delimiter=',',
        skiprows=1,
        converters={0: _letter_to_code},
    )
    labels = table[:, 1]
    # scikit-learn expects a 2-D feature matrix, so make the single
    # predictor an (n, 1) column.
    letter_codes = table[:, 0].reshape(-1, 1)

    # H2O random forest: one tree, depth one, 100 bins.
    rf_h2o = h2o.random_forest(
        x=alphabet[['X']],
        y=alphabet["y"],
        ntrees=1,
        max_depth=1,
        nbins=100,
    )

    # scikit-learn random forest with the matching tree count and depth.
    rf_sci = ensemble.RandomForestClassifier(
        n_estimators=1,
        criterion='entropy',
        max_depth=1,
    )
    rf_sci.fit(letter_codes, labels)

    # AUC of each model on the training data.
    auc_h2o = rf_h2o.model_performance(alphabet).auc()
    positive_scores = rf_sci.predict_proba(letter_codes)[:, 1]
    auc_sci = roc_auc_score(labels, positive_scores)

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
# Script entry point: pass the command-line arguments and the test function
# to the H2O test runner.
if __name__ == "__main__":
    h2o.run_test(sys.argv, smallcatRF)