-
Notifications
You must be signed in to change notification settings - Fork 10
/
buildMod.R
377 lines (344 loc) · 11.2 KB
/
buildMod.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#' Function to apply meteorological normalisation
#'
#' This is the main function to apply a gbm model to a data set.
#'
#' @param input_data Data frame to analyse. Must contain a POSIXct field called
#' `date`.
#' @param vars Explanatory variables to use. These variables will be used to
#' build the gbm model. Note that the model must include a trend component.
#' Several variables can be automatically calculated (see [prepData()] for
#' details).
#' @param pollutant The name of the variable to apply meteorological
#' normalisation to.
#' @param sam.size The number of random samples to extract from the data for
#' model building. While it is possible to use the full data set, for data
#' sets spanning years the model building can take a very long time to run.
#' Additionally, there will be diminishing returns in terms of model accuracy.
#' If `sam.size` is greater than the number of number of rows of data, the
#' number of rows of data is used instead.
#' @param n.trees Number of trees to fit.
#' @param simulate Should the original time series be randomly sampled with
#' replacement? The default is `FALSE`. Setting `simulate = TRUE` can be
#' useful for estimating model uncertainties. In which case models should be
#' run multiple times with `B = 1` and a different value of `seed` e.g. `seed
#' = runif(1)`.
#' @param B Number of bootstrap simulations for partial dependence plots.
#' @param n.core Number of cores to use for parallel processing.
#' @param shrinkage a shrinkage parameter applied to each tree in the expansion.
#' Also known as the learning rate or step-size reduction; 0.001 to 0.1
#' usually work, but a smaller learning rate typically requires more trees.
#' Default is 0.1.
#' @param interaction.depth Integer specifying the maximum depth of each tree
#' (i.e., the highest level of variable interactions allowed). A value of 1
#' implies an additive model, a value of 2 implies a model with up to 2-way
#' interactions, etc. Default is 5.
#' @param bag.fraction he fraction of the training set observations randomly
#' selected to propose the next tree in the expansion. This introduces
#' randomnesses into the model fit. If bag.fraction < 1 then running the same
#' model twice will result in similar but different.
#' @param n.minobsinnode Integer specifying the minimum number of observations
#' in the terminal nodes of the trees. Note that this is the actual number of
#' observations, not the total weight.
#' @param cv.folds Number of cross-validation folds to perform. If cv.folds>1
#' then gbm, in addition to the usual fit, will perform a cross-validation,
#' calculate an estimate of generalization error returned in \code{cv.error}.
#' @param seed Random number seed for reproducibility in returned model.
#'
#' @export
#' @seealso [testMod()] for testing models before they are built.
#' @seealso [metSim()] for using a built model with meteorological simulations.
#' @seealso [plot2Way()], [plotInfluence()] and [plotPD()] for visualising built
#' models.
#' @return Returns a list including the model, influence data frame and partial
#' dependence data frame.
#' @author David Carslaw
buildMod <- function(input_data,
vars = c(
"trend", "ws", "wd", "hour",
"weekday", "air_temp"
),
pollutant = "nox",
sam.size = nrow(input_data),
n.trees = 200,
shrinkage = 0.1,
interaction.depth = 5,
bag.fraction = 0.5,
n.minobsinnode = 10,
cv.folds = 0,
simulate = FALSE,
B = 100,
n.core = 4,
seed = 123) {
## add other variables, select only those required for modelling
input_data <- prepData(input_data)
input_data <-
dplyr::select(input_data, c("date", vars, pollutant))
input_data <-
stats::na.omit(input_data) # only build model where all data are available - can always predict in gaps
variables <- paste(vars, collapse = "+")
eq <- stats::formula(paste(pollutant, "~", variables))
# randomly sample data according to sam.size
if (sam.size > nrow(input_data)) {
sam.size <- nrow(input_data)
}
if (simulate) {
id <- sample(nrow(input_data), size = sam.size, replace = TRUE)
input_data <- input_data[id, ]
} else {
id <- sample(nrow(input_data), size = sam.size)
input_data <- input_data[id, ]
}
## if more than one simulation only return model ONCE
if (B != 1L) {
mod <- runGbm(
input_data,
eq,
vars,
return.mod = TRUE,
simulate = simulate,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds,
seed
)
}
# if model needs to be run multiple times
res <- partialDep(input_data, eq, vars, B, n.core,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds, seed = seed
)
if (B != 1) {
Mod <- mod$model
} else {
Mod <- res[[3]]
}
# return a list of model, data, partial deps
result <-
list(
model = Mod,
influence = res[[2]],
data = input_data,
pd = res[[1]]
)
class(result) <- "deweather"
return(result)
}
#' Extract PD
#' @noRd
extractPD <- function(vars, mod) {
n <- 100 ## resolution of output
if ("trend" %in% vars) {
n <- 100
}
if (vars %in% c("hour", "hour.local")) {
n <- 24
}
## extract partial dependence values
res <-
gbm::plot.gbm(mod,
vars,
continuous.resolution = n,
return.grid = TRUE
)
res <- data.frame(
y = res$y,
var = vars,
x = res[[vars]],
var_type = ifelse(is.numeric(res[[vars]]), "numeric", "character")
)
return(res)
}
#' Run Gbm
#' @noRd
runGbm <-
function(dat,
eq,
vars,
return.mod,
simulate,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds,
seed = seed) {
## sub-sample the data for bootstrapping
if (simulate) {
dat <- dat[sample(nrow(dat), nrow(dat), replace = TRUE), ]
}
# these models for AQ data are not very sensitive to tree sizes > 1000
# make reproducible
if (!simulate) {
set.seed(seed)
} else {
set.seed(stats::runif(1))
}
mod <- gbm::gbm(
eq,
data = dat,
distribution = "gaussian",
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
train.fraction = 1,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds,
keep.data = TRUE,
verbose = FALSE
)
## extract partial dependence components
pd <- purrr::map(vars, extractPD, mod = mod) %>%
purrr::map(~ dplyr::nest_by(.x, var, var_type) %>%
tidyr::pivot_wider(
names_from = "var_type",
values_from = "data"
)) %>%
dplyr::bind_rows()
## relative influence
ri <- summary(mod, plotit = FALSE)
ri$var <- stats::reorder(ri$var, ri$rel.inf)
if (return.mod) {
result <- list("pd" = pd, "ri" = ri, "model" = mod)
return(result)
} else {
return(list("pd" = pd, "ri" = ri))
}
}
#' @noRd
#' @importFrom rlang .data
partialDep <-
function(dat,
eq,
vars,
B = 100,
n.core = 4,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds,
seed) {
if (B == 1) {
return.mod <- TRUE
} else {
return.mod <- FALSE
}
if (B == 1) {
pred <- runGbm(
dat,
eq,
vars,
return.mod = TRUE,
simulate = FALSE,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds,
seed
)
} else {
cl <- parallel::makeCluster(n.core)
doParallel::registerDoParallel(cl)
pred <- foreach::foreach(
i = 1:B,
.inorder = FALSE,
.packages = "gbm",
.export = "runGbm"
) %dopar%
runGbm(
dat,
eq,
vars,
return.mod = FALSE,
simulate = TRUE,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
bag.fraction = bag.fraction,
n.minobsinnode = n.minobsinnode,
cv.folds = cv.folds
)
parallel::stopCluster(cl)
}
# partial dependence plots
if (B == 1) {
pd <- pred$pd
ri <- pred$ri
mod <- pred$model
} else {
pd <- purrr::map(pred, "pd") %>%
dplyr::bind_rows()
## relative influence
ri <- purrr::map(pred, "ri") %>%
dplyr::bind_rows()
mod <- pred[[1]]$model
}
# if either character/numeric not in the output df, add dummy col
if (!"character" %in% names(pd)) {
pd$character <- rep(list(data.frame()), nrow(pd))
}
if (!"numeric" %in% names(pd)) {
pd$numeric <- rep(list(data.frame()), nrow(pd))
}
# Calculate 95% CI for different vars
resCI <-
dplyr::group_by(pd, .data$var) %>%
dplyr::reframe(
numeric = list(dplyr::bind_rows(numeric)),
character = list(dplyr::bind_rows(character))
) %>%
dplyr::mutate(
numeric = purrr::map(
numeric,
purrr::possibly(
~ dplyr::mutate(.x, x_bin = cut(.data$x, 100, include.lowest = TRUE)) %>%
dplyr::group_by(x_bin) %>%
dplyr::summarise(
x = mean(x),
mean = mean(y),
lower = quantile(y, probs = 0.025),
upper = quantile(y, probs = 0.975)
) %>%
dplyr::ungroup()
)
),
character = purrr::map(
character,
purrr::possibly(
~ dplyr::group_by(.x, x) %>%
dplyr::summarise(
mean = mean(y),
lower = quantile(y, probs = 0.025),
upper = quantile(y, probs = 0.975)
) %>%
dplyr::ungroup()
)
)
)
resRI <- dplyr::group_by(ri, .data$var) %>%
dplyr::summarise(
mean = mean(.data$rel.inf),
lower = stats::quantile(.data$rel.inf, probs = c(0.025)),
upper = stats::quantile(.data$rel.inf, probs = c(0.975))
) %>%
dplyr::ungroup() %>%
dplyr::mutate(var = stats::reorder(var, mean)) %>%
dplyr::arrange(dplyr::desc(var))
if (return.mod) {
return(list(resCI, resRI, mod))
} else {
return(list(resCI, resRI))
}
}