---
output: pdf_document
params:
set_title: "Code demo for CART"
---
```{r cart-preamble, child = "../style/preamble_code_demos.Rmd", include = FALSE, purl = FALSE}
```
```{r cart-setup, child = "../style/setup.Rmd", include = FALSE, purl = FALSE}
```
# Code demo for CART
Classification and regression trees are very powerful, but they do have one major drawback: they are highly unstable. We demonstrate this with the following example on the __servo__ data set, which poses a regression task and looks like this:
```{r cart-data}
library(rpart)
library(dplyr)
library(rattle)
library(mlbench)
library(BBmisc)
library(rpart.plot)
library(ggplot2)
library(mlr3)
library(mlr3learners)
data("Servo")
# transform ordered factors to numeric
servo <- Servo %>%
  mutate_at(c("Pgain", "Vgain"), as.character) %>%
  mutate_at(c("Pgain", "Vgain"), as.numeric)
rm(Servo)
str(servo)
head(servo)
```
We'll fit two CARTs on our data, which we split into train and test sets with two different seeds,
resulting in slightly different train and test data sets.
Note the differences in tree structure induced by these differing seeds:
```{r cart-train_test}
# split into train and test with two different seeds to show data dependency
train_size <- 3 / 4

set.seed(1333)
train_indices <- sample(
  x = seq_len(nrow(servo)),
  size = ceiling(train_size * nrow(servo)), replace = FALSE
)
train_1 <- servo[train_indices, ]
test_1 <- servo[-train_indices, ]

set.seed(1)
train_indices <- sample(
  x = seq_len(nrow(servo)),
  size = ceiling(train_size * nrow(servo)), replace = FALSE
)
train_2 <- servo[train_indices, ]
test_2 <- servo[-train_indices, ]

model_1 <- rpart(Class ~ ., data = train_1)
rattle::fancyRpartPlot(model_1,
  palettes = "Oranges", main = "CART from split 1\n", sub = ""
)

model_2 <- rpart(Class ~ ., data = train_2)
rattle::fancyRpartPlot(model_2,
  palettes = "Blues", main = "CART from split 2\n", sub = ""
)
```
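Before discussing this, a minimal sketch (reusing `model_1` and `model_2` from above; this extra chunk is not part of the original demo) makes the disagreement measurable by comparing both trees' predictions on the full data set:
```{r cart-instability}
# predict the full servo data with both trees and inspect the differences;
# substantial differences indicate the high variance of the tree learner
preds_1 <- predict(model_1, newdata = servo)
preds_2 <- predict(model_2, newdata = servo)
summary(preds_1 - preds_2)
```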
* Why is that a problem? Don't we want an algorithm that is sensitive to changes in the data?
* What can we do to address this issue?
## Understanding the Hyperparameters of CARTs
### Minimum number of observations for a split
The `minsplit` parameter sets the minimum number of observations that must exist in a node for a split to be attempted; larger values stop tree growth earlier:
```{r cart-min_split}
task <- TaskRegr$new(id = "train_1", backend = train_1, target = "Class")
lrn1 <- lrn("regr.rpart")

# train the learner once for each value of `param` in `values` and plot the
# resulting tree, pausing between plots
plot_param_sequence <- function(learner, task, param, values, plotfun, ...) {
  for (v in values) {
    xs <- list(...)
    xs[[param]] <- v
    lrn2 <- learner$clone()
    lrn2$param_set$values <- xs
    mod <- lrn2$train(task)$model
    plotfun(mod)
    title(sprintf("%s = %g", param, v))
    pause()
  }
}

plot_param_sequence_rpart <- function(...) {
  plot_param_sequence(
    learner = lrn1,
    plotfun = function(x) rpart.plot(x, roundint = FALSE),
    ...
  )
}

min_splits <- c(3, 10, 20, 40)
plot_param_sequence_rpart(task = task, param = "minsplit", values = min_splits)
```
### Maximum depth of the tree
The `maxdepth` parameter limits the depth of any node in the final tree, with the root node counted as depth 0:
```{r cart-max_depth}
max_depth <- c(1, 4, 10)
plot_param_sequence_rpart(task = task, param = "maxdepth", values = max_depth)
```
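If you prefer a non-interactive look at a single setting, the same hyperparameters can also be passed directly to `rpart` via `rpart.control` (a minimal sketch; the chunk and the values chosen here are ours, not part of the original demo):
```{r cart-rpart_control}
# equivalent direct call: restrict tree size via rpart.control
model_shallow <- rpart(Class ~ .,
  data = train_1,
  control = rpart.control(maxdepth = 2, minsplit = 10)
)
rpart.plot(model_shallow, roundint = FALSE)
```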
## Invariance to monotonic feature transformations
A nice property of CARTs is that they are invariant to scaling and other monotonic feature transformations.
We grow two trees on two data sets which differ only in that the numeric features of the second data set are log-transformed.
Let's see how (or whether) the structure of the two CARTs differs:
```{r cart-inv_mon}
train_log <- train_1 %>% mutate_at(c("Pgain", "Vgain"), log)
head(train_log, 3)
head(train_1, 3)

model_1 <- rpart(Class ~ ., data = train_1, minsplit = 30)
rattle::fancyRpartPlot(model_1,
  palettes = "Oranges", main = "CART with normal data\n", sub = ""
)

model_log <- rpart(Class ~ ., data = train_log, minsplit = 30)
rattle::fancyRpartPlot(model_log,
  palettes = "Blues", main = "CART with log transformed data\n", sub = ""
)
```
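As a sanity check (a minimal sketch of ours, assuming the two trees really induce the same partition), the predictions of both models should coincide once the test data is transformed consistently:
```{r cart-inv_check}
# the log tree applied to log-transformed test data should predict exactly
# what the original tree predicts on the raw test data
test_log <- test_1 %>% mutate_at(c("Pgain", "Vgain"), log)
all.equal(
  predict(model_1, newdata = test_1),
  predict(model_log, newdata = test_log)
)
```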
## Writing a simple regression tree implementation
Now we implement a simple regression tree learner (`rt`) for numerical features:
```{r rt-impl}
# Estimate a *r*egression *t*ree, which consists of nodes.
# Every node is either a splitting node with 2 subnodes
# or a leaf / terminal node without subnodes (end of a tree branch).
# The two subnodes for each split are:
# - true_node: the split (i.e., feature < threshold) was TRUE
# - false_node: the split was FALSE
# The tree is built recursively by calling rt on the subset of data in each
# tree branch again.
# The final tree is then represented as a nested list of splitting nodes
# containing splitting nodes containing further splitting nodes and leaf nodes.
# Every node itself is also simply represented as a list, see the very end of
# this function for the definition.
rt <- function(df, target, cur_depth = 0, max_depth = 30, min_split = 20) {
  feature_results <- list()
  # compute optimal split with minimal loss for every feature:
  for (feature in colnames(df)) {
    if (feature != target) {
      feature_results[[feature]] <- optimal_split(df, feature, target)
    }
  }
  df_feature_results <- do.call(rbind, feature_results)
  # choose the feature whose optimal split minimizes the loss:
  min_loss_split <- df_feature_results[which.min(df_feature_results[, "loss"]), ]
  # use this feature's split:
  split_result <- split_node(df, min_loss_split)

  if (cur_depth + 1 == max_depth) {
    # max_depth of the tree is reached
    true_node <- list(is_leaf = TRUE)
    false_node <- list(is_leaf = TRUE)
  } else {
    if (nrow(split_result$true_split) < min_split) {
      # not enough observations to split node again
      true_node <- list(is_leaf = TRUE)
    } else {
      # call the rt function recursively again, using only the subset of data
      # for which the current split was TRUE, and with incremented tree depth,
      # s.t. the function will terminate if the maximal depth is reached
      true_node <- rt(
        df = split_result$true_split,
        target = target,
        cur_depth = cur_depth + 1,
        max_depth = max_depth,
        min_split = min_split
      )
    }
    # ... now do the same on the other node:
    if (nrow(split_result$false_split) < min_split) {
      # not enough observations to split node again
      false_node <- list(is_leaf = TRUE)
    } else {
      # call the rt function recursively again, using only the subset of data
      # for which the current split was FALSE, and with incremented tree depth,
      # s.t. the function will terminate if the maximal depth is reached
      false_node <- rt(
        df = split_result$false_split,
        target = target,
        cur_depth = cur_depth + 1,
        max_depth = max_depth,
        min_split = min_split
      )
    }
  }

  list(
    is_leaf = FALSE,                      # this is a splitting node, not a leaf
    true_node = true_node,                # the branch for which the split was TRUE
    false_node = false_node,              # the branch for which the split was FALSE
    loss = min_loss_split$loss,           # the achieved loss in this split
    feature = min_loss_split$feature,     # the feature used for this split
    threshold = min_loss_split$threshold, # the feature threshold used for the split
    num_true = min_loss_split$num_true,   # the predicted value if the split is TRUE
    num_false = min_loss_split$num_false  # the predicted value if the split is FALSE
  )
}
# function which splits the current data frame according to the split rule
# (if one wants to implement classification trees and/or support for categorical
# features this would have to be extended -- think about how?)
split_node <- function(df, split_rule) {
  list(
    true_split = df[df[, split_rule$feature] < split_rule$threshold, ],
    false_split = df[df[, split_rule$feature] >= split_rule$threshold, ]
  )
}
# function which finds the optimal split for a given feature
# (if one wants to implement classification and/or support for categorical
# features this would have to be extended -- think about how?)
optimal_split <- function(df, feature, target) {
  cur_losses <- list()
  # sort data according to the feature, only keep feature and target
  df <- df[order(df[, feature]), c(feature, target)]
  split_points <- sort(unique(df[, feature]))
  # compute losses for all candidate split points (midpoints between
  # neighboring unique feature values)
  for (i in 2:length(split_points)) {
    is_split_true <- df[, feature] < split_points[i]
    split_true <- df[is_split_true, target]
    split_false <- df[!is_split_true, target]
    cur_losses[[as.character(i)]] <- data.frame(
      loss = l2_loss(split_true) + l2_loss(split_false),
      threshold = mean(split_points[(i - 1):i]),
      num_true = mean(split_true),
      num_false = mean(split_false)
    )
  }
  df_curr_losses <- do.call(rbind, cur_losses)
  # find the split which yields the minimal loss
  min_loss <- df_curr_losses[which.min(df_curr_losses[, "loss"]), ]
  data.frame(
    loss = min_loss[1, "loss"],
    threshold = min_loss[1, "threshold"],
    num_true = min_loss[1, "num_true"],
    num_false = min_loss[1, "num_false"],
    feature = feature
  )
}
# l2 loss if the target is estimated with the l2-optimal constant (the mean)
l2_loss <- function(target_values) {
  sum((target_values - mean(target_values))^2)
}
# function which predicts data with the regression tree
# (if one wants to implement classification and/or support for categorical
# features this would need to be extended)
predict_tree <- function(tree, df) {
  pred_values <- numeric(nrow(df))
  # loop over all observations in df
  for (i in seq_len(nrow(df))) {
    cur_node <- tree
    # follow the splits until we reach a leaf; the prediction is the
    # num_true / num_false value of the last splitting node on the path
    while (!cur_node$is_leaf) {
      if (df[i, cur_node$feature] < cur_node$threshold) {
        pred_values[i] <- cur_node$num_true
        cur_node <- cur_node$true_node
      } else {
        pred_values[i] <- cur_node$num_false
        cur_node <- cur_node$false_node
      }
    }
  }
  pred_values
}
```
Compare with the CART implementation from `rpart`:
```{r rt-example}
# create artificial data to check the algorithm
set.seed(123)
n <- 300
x1 <- runif(n, -2, 3)
x2 <- runif(n, -2, 3)
y <- ifelse(x1 > 1.5, -1, 1) +
  ifelse(x2 > 1.5, .5, -.5) +
  ifelse(x1 < .5 & x2 < .5, -1, 1) +
  rnorm(n, 0, 0.1)
train_data <- data.frame(x1, x2, y)

# compute regression tree
tree <- rt(train_data, "y", max_depth = 4)

# compare with the rpart CART implementation:
rpart_tree <- rpart::rpart(y ~ x1 + x2, data = train_data, maxdepth = 4)
rattle::fancyRpartPlot(rpart_tree, sub = "")

# compare the first split:
tree$feature
tree$threshold
# if the subnodes of the first node were leaves, this would be the
# output prediction for the splits:
tree$num_true # if feature < threshold
tree$num_false # if feature >= threshold
# --> first split identical

# first splitting node on the "FALSE" branch:
tree$false_node$feature
tree$false_node$threshold
# --> same as for rpart as well
```
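Beyond inspecting individual splits, a rough numeric comparison (a sketch of ours, not part of the original demo) is to compare both trees' fitted values. They should be similar, though not necessarily identical, since `rpart` additionally prunes via the complexity parameter `cp`:
```{r rt-vs-rpart}
# compare fitted values of our rt tree and the rpart tree on the training data
preds_rt <- predict_tree(tree, train_data)
preds_rpart <- predict(rpart_tree, newdata = train_data)
mean((preds_rt - preds_rpart)^2) # should be close to 0
cor(preds_rt, preds_rpart)       # should be close to 1
```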
```{r rt-viz}
# function to visualize the predictions of a regression tree with two features;
# X1 and X2 are the names of the two features to use.
plot_2D_cart <- function(train_data,
                         tree,
                         target = "y",
                         X1 = "x1",
                         X2 = "x2",
                         lengthX1 = 40,
                         lengthX2 = 40) {
  gridX1 <- seq(
    min(train_data[, X1]),
    max(train_data[, X1]),
    length.out = lengthX1
  )
  gridX2 <- seq(
    min(train_data[, X2]),
    max(train_data[, X2]),
    length.out = lengthX2
  )
  # compute grid coordinates with the Cartesian product
  grid_data <- expand.grid(gridX1, gridX2)
  colnames(grid_data) <- c(X1, X2)
  # predict grid cells
  grid_data <- cbind(grid_data, target = predict_tree(tree, grid_data))
  colnames(grid_data) <- c(X1, X2, target)

  min_z <- min(c(min(grid_data[, target]), min(train_data[, target])))
  max_z <- max(c(max(grid_data[, target]), max(train_data[, target])))

  ggplot() +
    geom_raster(
      data = grid_data,
      aes_string(x = X1, y = X2, fill = target),
      alpha = .8
    ) +
    geom_point(
      data = train_data, aes_string(x = X1, y = X2), color = "white",
      size = 2, alpha = .5
    ) +
    geom_point(data = train_data, aes_string(x = X1, y = X2, color = target)) +
    scale_colour_gradient(limits = c(min_z, max_z)) +
    scale_fill_gradient(limits = c(min_z, max_z))
}

plot_2D_cart(train_data, tree)
```
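The same kind of picture can be produced for the `rpart` tree. A minimal variant (our sketch, using `rpart`'s own `predict` instead of `predict_tree`) reuses the grid idea:
```{r rt-viz-rpart}
# build a prediction grid for the rpart model and plot it with geom_raster
grid_data <- expand.grid(
  x1 = seq(min(train_data$x1), max(train_data$x1), length.out = 40),
  x2 = seq(min(train_data$x2), max(train_data$x2), length.out = 40)
)
grid_data$y <- predict(rpart_tree, newdata = grid_data)
ggplot(grid_data, aes(x = x1, y = x2, fill = y)) +
  geom_raster(alpha = .8)
```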