-
Notifications
You must be signed in to change notification settings - Fork 42
/
GPBoost_algorithm_blog_post_example.py
178 lines (163 loc) · 8.42 KB
/
GPBoost_algorithm_blog_post_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
"""
Demo code used in this blog post:
https://towardsdatascience.com/tree-boosted-mixed-effects-models-4df610b624cb
@author: Fabio Sigrist
Note: results in comments are obtained with gpboost version 0.7.6 and merf
version 0.3.0
"""
import gpboost as gpb
import numpy as np
import sklearn.datasets as datasets
import time
import pandas as pd
print("It is recommended that the examples are run in interactive mode")
# --------------------Simulate data----------------
ntrain = 5000 # number of samples for training
n = 2 * ntrain # combined number of training and test data
m = 500 # number of categories / levels for grouping variable
sigma2_1 = 1 # random effect variance
sigma2 = 1 ** 2 # error variance
# Simulate non-linear mean function
np.random.seed(1)
X, F = datasets.make_friedman3(n_samples=n)
X = pd.DataFrame(X,columns=['variable_1','variable_2','variable_3','variable_4'])
F = F * 10**0.5 # with this choice, the fixed-effects regression function has the same variance as the random effects
# Simulate random effects
group_train = np.arange(ntrain) # grouping variable
for i in range(m):
group_train[int(i * ntrain / m):int((i + 1) * ntrain / m)] = i
group_test = np.arange(ntrain) # grouping variable for test data. 50% existing and 50% new groups
m_test = 2 * m
for i in range(m_test):
group_test[int(i * ntrain / m_test):int((i + 1) * ntrain / m_test)] = i
group = np.concatenate((group_train,group_test))
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test) # simulate random effects
Zb = b[group]
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n) # simulate error term
y = F + Zb + xi # observed data
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain,]
X_test = X.iloc[ntrain:n,]
# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = { 'objective': 'regression_l2', 'learning_rate': 0.1,
'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 }
# train model
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=31)
gp_model.summary() # estimated covariance parameters
#Covariance parameters:
# Error_term Group_1
#Param. 0.92534 1.016069
# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['response_mean']
np.sqrt(np.mean((y_test - y_pred) ** 2)) # root mean square error (RMSE) on test data. Approx. = 1.26
# Choosing number of boosting iterations using cross-validation
gp_model = gpb.GPModel(group_data=group_train)
cvbst = gpb.cv(params=params, train_set=data_train,
gp_model=gp_model, use_gp_model_for_validation=False,
num_boost_round=100, early_stopping_rounds=5,
nfold=4, verbose_eval=True, show_stdv=False, seed=1,
metric="l2") # 'metric' was called 'metrics' in earlier versions of gpboost
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 31
# --------------------Model interpretation----------------
# SHAP values and dependence plots
# Note: you need shap version>=0.36.0
import shap
shap_values = shap.TreeExplainer(bst).shap_values(X_test)
shap.summary_plot(shap_values, X_test)
shap.dependence_plot("variable_2", shap_values, X_test)
# Split-based feature importances
gpb.plot_importance(bst)
# Classical partial dependence plots
from pdpbox import pdp
# Single variable plots (takes a few seconds to compute)
pdp_dist = pdp.PDPIsolate(model=bst, df=X_train.copy(), model_features=X_train.columns, # need to copy() since PDPIsolate modifies the df
feature='variable_2', feature_name='variable_2',
n_classes=0, num_grid_points=50,
predict_kwds={"ignore_gp_model": True})
fig, axes = pdp_dist.plot(engine='matplotlib', plot_lines=True)
# Two variable interaction plot
interact = pdp.PDPInteract(model=bst, df=X_train.copy(), model_features=X_train.columns,
features=['variable_1','variable_2'],
feature_names=['variable_1','variable_2'],
n_classes=0, predict_kwds={"ignore_gp_model": True})
fig, axes = interact.plot(engine='matplotlib', plot_type='contour')
"""
# Note: the above code is for pdpbox version 0.3.0 or latter, for earlier versions use:
pdp_dist = pdp.pdp_isolate(model=bst, dataset=X_train, model_features=X_train.columns,
feature='variable_2', num_grid_points=50,
predict_kwds={"ignore_gp_model": True})
pdp.pdp_plot(pdp_dist, 'variable_2', plot_lines=True)
inter_rf = pdp.pdp_interact(model=bst, dataset=X_train, model_features=X_train.columns,
features=['variable_1','variable_2'],
predict_kwds={"ignore_gp_model": True})
pdp.pdp_interact_plot(inter_rf, ['variable_1','variable_2'], x_quantile=True,
plot_type='contour', plot_pdp=True)# ignore any error message
"""
# --------------------Comparison to alternative approaches----------------
results = pd.DataFrame(columns = ["RMSE","Time"],
index = ["GPBoost", "Linear_ME","Boosting_Ign","Boosting_Cat","MERF"])
# 1. GPBoost
gp_model = gpb.GPModel(group_data=group_train)
start_time = time.time() # measure time
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=best_iter)
results.loc["GPBoost","Time"] = time.time() - start_time
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['response_mean']
results.loc["GPBoost","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))
# 2. Linear mixed effects model ('Linear_ME')
gp_model = gpb.GPModel(group_data=group_train)
X_train_linear = np.column_stack((np.ones(ntrain),X_train))
X_test_linear = np.column_stack((np.ones(ntrain),X_test))
start_time = time.time() # measure time
gp_model.fit(y=y_train, X=X_train_linear) # add a column of 1's for intercept
results.loc["Linear_ME","Time"] = time.time() - start_time
y_pred = gp_model.predict(group_data_pred=group_test, X_pred=X_test_linear)
results.loc["Linear_ME","RMSE"] = np.sqrt(np.mean((y_test - y_pred['mu']) ** 2))
# 3. Gradient tree-boosting ignoring the grouping variable ('Boosting_Ign')
cvbst = gpb.cv(params=params, train_set=data_train,
num_boost_round=100, early_stopping_rounds=5,
nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
start_time = time.time() # measure time
bst = gpb.train(params=params, train_set=data_train, num_boost_round=best_iter)
results.loc["Boosting_Ign","Time"] = time.time() - start_time
y_pred = bst.predict(data=X_test)
results.loc["Boosting_Ign","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))
# 4. Gradient tree-boosting including the grouping variable as a categorical variable ('Boosting_Cat')
X_train_cat = np.column_stack((group_train,X_train))
X_test_cat = np.column_stack((group_test,X_test))
data_train_cat = gpb.Dataset(X_train_cat, y_train, categorical_feature=[0])
cvbst = gpb.cv(params=params, train_set=data_train_cat,
num_boost_round=1000, early_stopping_rounds=5,
nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
start_time = time.time() # measure time
bst = gpb.train(params=params, train_set=data_train_cat, num_boost_round=best_iter)
results.loc["Boosting_Cat","Time"] = time.time() - start_time
y_pred = bst.predict(data=X_test_cat)
results.loc["Boosting_Cat","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))
# 5. Mixed-effects random forest ('MERF')
from merf import MERF
rf_params={'max_depth': 6, 'n_estimators': 300}
merf_model = MERF(max_iterations=100, rf_params=rf_params)
print("Warning: the following takes a lot of time")
start_time = time.time() # measure time
merf_model.fit(pd.DataFrame(X_train), np.ones(shape=(ntrain,1)), pd.Series(group_train), y_train)
results.loc["MERF","Time"] = time.time() - start_time
y_pred = merf_model.predict(pd.DataFrame(X_test), np.ones(shape=(ntrain,1)), pd.Series(group_test))
results.loc["MERF","RMSE"] = np.sqrt(np.mean((y_test - y_pred) ** 2))
print(results.apply(pd.to_numeric).round(3))