/
impact_model.R
253 lines (225 loc) · 9.77 KB
/
impact_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Copyright 2014-2020 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Bayesian dynamic diffusion-regression state-space model for computing
# counterfactual predictions in a time series. Uses an MCMC algorithm
# implemented in the \code{bsts} package to compute samples from the posterior
# (smoothing) densities over states and parameters.
#
# Author: kbrodersen@google.com (Kay Brodersen)
# Some model priors are fixed, others can be adjusted through model.args.
# For full flexibility, construct your own bsts model and feed it into
# CausalImpactForBsts().
kLocalLevelPriorSampleSize <- 32
kStaticRegressionExpectedModelSize <- 3
kStaticRegressionExpectedR2 <- 0.8
kStaticRegressionPriorDf <- 50
kDynamicRegressionPriorSampleSize <- 32
ObservationsAreIllConditioned <- function(y) {
# Checks whether the response variable (i.e., the series of observations for
# the dependent variable y) are ill-conditioned. For example, the series might
# contain too few non-NA values. In such cases, inference will be aborted.
#
# Args:
# y: observed series (numeric vector or single zoo series)
#
# Returns:
# TRUE if something is wrong with the observations; FALSE otherwise.
assert_that(!is.null(y), length(y) >= 1)
ill.conditioned <- FALSE
# All NA?
if (all(is.na(y))) {
warning("Aborting inference due to input series being all NA.")
ill.conditioned <- TRUE
# Fewer than 3 non-NA values?
} else if (sum(!is.na(y)) < 3) {
warning("Aborting inference due to fewer than 3 non-NA values in input")
ill.conditioned <- TRUE
# Constant series?
} else if (sd(y, na.rm = TRUE) == 0) {
warning(paste0("Aborting inference due to input series being constant: ",
y[!is.na(y)][1]))
ill.conditioned <- TRUE
}
return(ill.conditioned)
}
FormatInputForConstructModel <- function(data, model.args) {
# Checks the input arguments supplied to ConstructModel(). Missing arguments
# in \code{model.args} will be filled using \code{.defaults} (see top of file
# impact_analysis.R).
#
# Args:
# data: time series of response variable and covariates
# model.args: list of additional arguments
#
# Returns:
# list of checked and correctly formatted arguments
# Check <data>
assert_that(!is.null(data))
data <- as.zoo(data)
if (is.null(ncol(data))) {
dim(data) <- c(length(data), 1)
}
assert_that(is.numeric(data))
assert_that(nrow(data) > 0)
# If <data> has no names, assign: y, x1, x2, ...
if (is.null(names(data))) {
if (ncol(data) == 1) {
names(data)[1] <- "y"
} else {
names(data) <- c("y", paste0("x", 2:ncol(data) - 1))
}
}
# Check covariates
if (ncol(data) >= 2) {
assert_that(all(!is.na(data[, -1])), msg = "covariates must not be NA")
}
# (Re-)parse <model.args>, fill gaps using <.defaults>
# (defined in impact_analysis.R)
model.args <- ParseArguments(model.args, .defaults)
# Check those parts of <model.args> that are used in this file
# Check <niter>
assert_that(is.scalar(model.args$niter))
assert_that(is.numeric(model.args$niter))
assert_that(!is.na(model.args$niter))
assert_that(is.wholenumber(model.args$niter))
model.args$niter <- round(model.args$niter)
assert_that(model.args$niter >= 10,
msg = paste0("must draw, at the very least, 10 MCMC samples; ",
"recommending 1000"))
if (model.args$niter < 1000) {
warning("Results potentially inaccurate. Consider using more MCMC samples.")
}
# Check <prior.level.sd>
assert_that(is.scalar(model.args$prior.level.sd))
assert_that(is.numeric(model.args$prior.level.sd))
assert_that(!is.na(model.args$prior.level.sd))
assert_that(model.args$prior.level.sd > 0)
# Check <nseasons>
assert_that(is.scalar(model.args$nseasons))
assert_that(is.numeric(model.args$nseasons))
assert_that(!is.na(model.args$nseasons))
assert_that(is.wholenumber(model.args$nseasons))
assert_that(model.args$nseasons >= 1,
msg = paste0("nseasons cannot be 0; use 1 in order not to have ",
"seaonsal components"))
# Check <season.duration>
assert_that(is.scalar(model.args$season.duration))
assert_that(is.numeric(model.args$season.duration))
assert_that(!is.na(model.args$season.duration))
assert_that(is.wholenumber(model.args$season.duration))
assert_that(model.args$season.duration >= 1)
# Check <dynamic.regression>
assert_that(is.scalar(model.args$dynamic.regression))
assert_that(is.logical(model.args$dynamic.regression))
assert_that(!is.na(model.args$dynamic.regression))
# Check <max.flips>
assert_that(is.scalar(model.args$max.flips))
assert_that(is.numeric(model.args$max.flips))
assert_that(!is.na(model.args$max.flips))
assert_that(is.wholenumber(model.args$max.flips))
assert_that(model.args$max.flips > 0 || model.args$max.flips == -1)
# Return updated args
return(list(data = data, model.args = model.args))
}
# Tell 'R CMD check' to treat `BstsOptions()` as global variable to avoid
# false positives as long as 'bsts' version 0.7.x is not published.
# TODO(alhauser): remove this when 'bsts' version 0.7.x is published.
if(getRversion() >= "2.15.1") {
utils::globalVariables("BstsOptions")
}
ConstructModel <- function(data, model.args = NULL) {
# Specifies the model and performs inference. Inference means using the data
# to pass from a prior distribution over parameters and states to a posterior
# distribution. In a Bayesian framework, estimating a model means to obtain
# p(parameters | data) from p(data | parameters) and p(parameters). This
# involves multiplying the prior with the likelihood and normalising the
# resulting distribution using the marginal likelihood or model evidence,
# p(data). Computing the evidence poses a virtually intractable
# high-dimensional integration problem which can be turned into an easier
# optimization problem using, for instance, an approximate stochastic
# inference strategy. Here, we use a Markov chain Monte Carlo algorithm, as
# implemented in the \code{bsts} package.
#
# Args:
# data: time series of response variable and optional covariates
# model.args: optional list of additional model arguments
#
# Returns:
# \code{bsts.model}, as returned by \code{bsts()}
# Check and format input
checked <- FormatInputForConstructModel(data, model.args)
data <- checked$data
model.args <- checked$model.args
y <- data[, 1]
# If the series is ill-conditioned, abort inference and return NULL
if (ObservationsAreIllConditioned(y)) {
return(NULL)
}
# Local level
# sigma.guess: standard deviation of the random walk of the level
sdy <- sd(y, na.rm = TRUE)
ss <- list()
sd.prior <- SdPrior(sigma.guess = model.args$prior.level.sd * sdy,
upper.limit = sdy,
sample.size = kLocalLevelPriorSampleSize)
ss <- AddLocalLevel(ss, y, sigma.prior = sd.prior)
# Add seasonal component?
if (model.args$nseasons > 1) {
ss <- AddSeasonal(ss, y,
nseasons = model.args$nseasons,
season.duration = model.args$season.duration)
}
# No regression?
if (ncol(data) == 1) {
bsts.model <- bsts(y, state.specification = ss, niter = model.args$niter,
seed = 1, ping = 0,
model.options =
BstsOptions(save.prediction.errors = TRUE),
max.flips = model.args$max.flips)
} else {
formula <- paste0(names(data)[1], " ~ .")
# Static regression?
if (!model.args$dynamic.regression) {
bsts.model <- bsts(formula, data = data, state.specification = ss,
expected.model.size =
kStaticRegressionExpectedModelSize,
expected.r2 = kStaticRegressionExpectedR2,
prior.df = kStaticRegressionPriorDf,
niter = model.args$niter, seed = 1, ping = 0,
model.options =
BstsOptions(save.prediction.errors = TRUE),
max.flips = model.args$max.flips)
time(bsts.model$original.series) <- time(data)
# Dynamic regression?
} else {
# Since we have predictor variables in the model, we need to explicitly
# make their coefficients time-varying using AddDynamicRegression(). In
# bsts(), we are therefore not giving a formula but just the response
# variable. We are then using SdPrior to only specify the prior on the
# residual standard deviation.
# prior.mean: precision of random walk of coefficients
sdx <- apply(data[, -1, drop = FALSE], 2, function(x) sd(x, na.rm = TRUE))
model.options <- DynamicRegressionRandomWalkOptions(sdx = sdx, sdy = sdy)
ss <- AddDynamicRegression(ss, formula, data = data,
model.options = model.options)
sd.prior <- SdPrior(sigma.guess = model.args$prior.level.sd * sdy,
upper.limit = 0.1 * sdy,
sample.size = kDynamicRegressionPriorSampleSize)
bsts.model <- bsts(y, state.specification = ss, niter = model.args$niter,
expected.model.size = 3, ping = 0, seed = 1,
prior = sd.prior, max.flips = model.args$max.flips)
}
}
return(bsts.model)
}