forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench_hist_gradient_boosting_higgsboson.py
117 lines (97 loc) · 3.81 KB
/
bench_hist_gradient_boosting_higgsboson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from urllib.request import urlretrieve
import os
from gzip import GzipFile
from time import time
import argparse
import numpy as np
import pandas as pd
from joblib import Memory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator
# Command-line options of the benchmark, listed in the order they are
# registered on the parser: (flag, keyword arguments for add_argument).
_CLI_OPTIONS = (
    ("--n-leaf-nodes", {"type": int, "default": 31}),
    ("--n-trees", {"type": int, "default": 10}),
    ("--lightgbm", {"action": "store_true", "default": False}),
    ("--xgboost", {"action": "store_true", "default": False}),
    ("--catboost", {"action": "store_true", "default": False}),
    ("--learning-rate", {"type": float, "default": 1.0}),
    ("--subsample", {"type": int, "default": None}),
    ("--max-bins", {"type": int, "default": 255}),
    ("--no-predict", {"action": "store_true", "default": False}),
    ("--cache-loc", {"type": str, "default": "/tmp"}),
)

parser = argparse.ArgumentParser()
for _flag, _kwargs in _CLI_OPTIONS:
    parser.add_argument(_flag, **_kwargs)
args = parser.parse_args()

# Location of the dataset: the archive is stored next to this script.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
HERE = os.path.dirname(__file__)

# joblib cache used to memoize the parsed dataset between runs.
m = Memory(mmap_mode="r", location=args.cache_loc)

# Short aliases for the parsed options used throughout the script.
n_leaf_nodes, n_trees = args.n_leaf_nodes, args.n_trees
subsample = args.subsample
lr = args.learning_rate
max_bins = args.max_bins
@m.cache
def load_data():
    """Download the HIGGS archive if it is missing, then parse it.

    The gzip-compressed CSV is stored next to this script (``HERE``) under
    the file name taken from ``URL``. The result is memoized through the
    joblib ``Memory`` object ``m``.

    Returns
    -------
    df : pandas.DataFrame
        The CSV content read without a header row, with every column parsed
        as ``float32``.
    """
    filename = os.path.join(HERE, URL.rsplit("/", 1)[-1])
    if not os.path.exists(filename):
        print(f"Downloading {URL} to {filename} (2.6 GB)...")
        # Download to a temporary sibling path and only rename it to the final
        # name once the transfer succeeded. Otherwise an interrupted download
        # would leave a truncated archive that later runs would try to parse,
        # because they only check that the file exists.
        partial = filename + ".part"
        try:
            urlretrieve(URL, partial)
            os.replace(partial, filename)
        finally:
            # Still present only if the download or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)
        print("done.")

    print(f"Parsing {filename}...")
    tic = time()
    with GzipFile(filename) as f:
        df = pd.read_csv(f, header=None, dtype=np.float32)
    toc = time()
    print(f"Loaded {df.values.nbytes / 1e9:0.3f} GB in {toc - tic:0.3f}s")
    return df
def fit(est, data_train, target_train, libname):
    """Train ``est`` on the training data and print how long it took.

    ``libname`` is only used to label the progress message.
    """
    print(f"Fitting a {libname} model...")
    started = time()
    est.fit(data_train, target_train)
    elapsed = time() - started
    print(f"fitted in {elapsed:.3f}s")
def predict(est, data_test, target_test):
    """Evaluate ``est`` on the test data and print timing, ROC AUC and accuracy.

    Nothing is done when the ``--no-predict`` command-line flag is set.
    The reported time covers both the ``predict`` and ``predict_proba`` calls.
    """
    if not args.no_predict:
        started = time()
        labels = est.predict(data_test)
        probabilities = est.predict_proba(data_test)
        elapsed = time() - started
        # Column 1 of the predicted probabilities is used as the ROC AUC score.
        roc_auc = roc_auc_score(target_test, probabilities[:, 1])
        acc = accuracy_score(target_test, labels)
        print(f"predicted in {elapsed:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")
# Load the dataset; column 0 is used as the target and the remaining columns
# as the features.
df = load_data()
target = df.values[:, 0]
data = np.ascontiguousarray(df.values[:, 1:])
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.2, random_state=0
)
n_classes = len(np.unique(target))

# Optionally restrict training to the first ``subsample`` training records.
if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

# Reference scikit-learn model, configured from the command-line options.
est = HistGradientBoostingClassifier(
    loss="log_loss",
    learning_rate=lr,
    max_iter=n_trees,
    max_bins=max_bins,
    max_leaf_nodes=n_leaf_nodes,
    early_stopping=False,
    random_state=0,
    verbose=1,
)
fit(est, data_train, target_train, "sklearn")
predict(est, data_test, target_test)

# Benchmark the requested third-party libraries. Every equivalent model is
# derived from the scikit-learn estimator ``est``, which is never rebound:
# rebinding it would hand a previously converted third-party model to the next
# conversion whenever several library flags are combined.
for libname, enabled in (
    ("lightgbm", args.lightgbm),
    ("xgboost", args.xgboost),
    ("catboost", args.catboost),
):
    if not enabled:
        continue
    equivalent_est = get_equivalent_estimator(
        est, lib=libname, n_classes=n_classes
    )
    fit(equivalent_est, data_train, target_train, libname)
    predict(equivalent_est, data_test, target_test)