---
title: "Chapter 18. Metric Predicted Variable with Multiple Metric Predictors"
author: "A Solomon Kurz"
date: "`r format(Sys.Date())`"
output:
  github_document
---
# Metric Predicted Variable with Multiple Metric Predictors
> We will consider models in which the predicted variable is an additive combination of predictors, all of which have proportional influence on the prediction. This kind of model is called *multiple linear regression*. We will also consider nonadditive combinations of predictors, which are called *interactions*. (p. 509, *emphasis* in the original)
## Multiple linear regression
If $y \sim \text{Normal} (\mu, \sigma)$ and $\mu = \beta_0 + \beta_1 x_1 + \beta_2 x_2$, then it's also the case that we can rewrite the formula for $y$ as:
$$y \sim \text{Normal} (\beta_0 + \beta_1 x_1 + \beta_2 x_2, \sigma)$$
As Kruschke pointed out, the basic model "assumes homogeneity of variance, which means that at all values of $x_1$ and $x_2$, the variance $\sigma^2$ of $y$ is the same" (p. 510).
If we presume the data for the two $x$ variables are uniformly distributed within 0 and 10, we can make the data for Figure 18.1 like this.
```{r, message = F, warning = F}
library(tidyverse)
n <- 300
set.seed(18)
d <-
  tibble(x_1 = runif(n = n, 0, 10),
         x_2 = runif(n = n, 0, 10)) %>%
  # the generating model in the text has sigma = 2, so we set `sd = 2` explicitly
  mutate(y = rnorm(n = n, mean = 10 + x_1 + 2 * x_2, sd = 2))
head(d)
```
Here are three of the scatter plots in Figure 18.1.
```{r, fig.width = 3.5, fig.height = 3.25}
theme_set(theme_grey() +
theme(panel.grid = element_blank()))
d %>%
ggplot(aes(x = x_1, y = y)) +
geom_point(alpha = 2/3) +
coord_cartesian(ylim = 0:50)
d %>%
ggplot(aes(x = x_2, y = y)) +
geom_point(alpha = 2/3) +
coord_cartesian(ylim = 0:50)
d %>%
ggplot(aes(x = x_1, y = x_2)) +
geom_vline(xintercept = seq(from = 0, to = 10, by = .5), color = "grey98") +
geom_hline(yintercept = seq(from = 0, to = 10, by = .5), color = "grey98") +
geom_point(alpha = 2/3)
```
As in previous chapters, I'm not aware that ggplot2 allows for three-dimensional wireframe plots of the kind in the upper left panel. If you'd like to make one in base R, have at it. But with respect to the plots in the off-diagonal, I have no idea what Kruschke did with the compressed and diagonal grids. If you have that figured out, please [share your code](https://github.com/ASKurz/Doing-Bayesian-Data-Analysis-in-brms-and-the-tidyverse/issues).
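If you would like a rough stand-in for that wireframe panel without leaving R, base R's `persp()` can draw the regression plane. Here's a minimal sketch based on the generating model $y = 10 + x_1 + 2 x_2$; the grid resolution and viewing angles are just my own choices.
```{r, fig.width = 3.5, fig.height = 3.5}
# a grid of predictor values
x_1_grid <- x_2_grid <- seq(from = 0, to = 10, length.out = 30)

# the height of the plane at each grid point, using the true coefficients from the simulation
z <- outer(x_1_grid, x_2_grid, function(x_1, x_2) 10 + x_1 + 2 * x_2)

persp(x = x_1_grid, y = x_2_grid, z = z,
      theta = -35, phi = 25,
      xlab = "x1", ylab = "x2", zlab = "y")
```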
For Figure 18.2, the $x$ variables look to be multivariate normal with a correlation of about -.95. We can simulate such data with help from the MASS package.
Sven Hohenstein's [answer to this stats.stackexchange.com question](https://stats.stackexchange.com/questions/164471/generating-a-simulated-dataset-from-a-correlation-matrix-with-means-and-standard) provides the steps for simulating the data. First, we'll specify the desired means and standard deviations for our variables. Then we'll make a correlation matrix with 1s on the diagonal and the desired correlation coefficient, $\rho$, on the off-diagonal. Since the correlation matrix is symmetric, both off-diagonal positions are the same. Finally, we'll convert the correlation matrix to a covariance matrix.
```{r}
mus <- c(5, 5)
sds <- c(2, 2)
cors <- matrix(c(1, -.95,
                 -.95, 1),
               ncol = 2)
cors
covs <- sds %*% t(sds) * cors
covs
```
Now we've defined our means, standard deviations, and covariance matrix, we're ready to simulate the data with `MASS::mvrnorm()`.
```{r}
# how many data points would you like to simulate?
n <- 300
set.seed(18.2)
d <-
  MASS::mvrnorm(n = n,
                mu = mus,
                Sigma = covs,
                empirical = T) %>%
  as_tibble() %>%
  set_names("x_1", "x_2") %>%
  # again, sigma = 2 in the generating model
  mutate(y = rnorm(n = n, mean = 10 + x_1 + 2 * x_2, sd = 2))
```
Now we have our simulated data in hand, we're ready for three of the four panels of Figure 18.2.
```{r, fig.width = 3.5, fig.height = 3.25}
d %>%
ggplot(aes(x = x_1, y = y)) +
geom_point(alpha = 2/3) +
coord_cartesian(xlim = 0:10,
ylim = 0:50)
d %>%
ggplot(aes(x = x_2, y = y)) +
geom_point(alpha = 2/3) +
coord_cartesian(xlim = 0:10,
ylim = 0:50)
d %>%
ggplot(aes(x = x_1, y = x_2)) +
geom_vline(xintercept = seq(from = 0, to = 10, by = .5), color = "grey98") +
geom_hline(yintercept = seq(from = 0, to = 10, by = .5), color = "grey98") +
geom_point(alpha = 2/3) +
coord_cartesian(xlim = 0:10,
ylim = 0:10)
```
Pretty close.
### The perils of correlated predictors.
> Figures 18.1 and 18.2 show data generated from the same model. In both figures, $\sigma = 2, \beta_0 = 10, \beta_1 = 1, \text{ and } \beta_2 = 2$. All that differs between the two figures is the distribution of the $\langle x_1, x_2 \rangle$ values, which is not specified by the model. In Figure 18.1, the $\langle x_1, x_2 \rangle$ values are distributed independently. In Figure 18.2, the $\langle x_1, x_2 \rangle$ values are negatively correlated: When $x_1$ is small, $x_2$ tends to be large, and when $x_1$ is large, $x_2$ tends to be small. (p. 510)
If you look closely at our simulation code from above, you'll see we have done so, too.
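We can even double check the correlation between our two simulated predictors. Because we set `empirical = T` in `mvrnorm()`, it should come out at exactly -.95.
```{r}
cor(d$x_1, d$x_2)
```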
> Real data often have correlated predictors. For example, consider trying to predict a state’s average high-school SAT score on the basis of the amount of money the state spends per pupil. If you plot only mean SAT against money spent, there is actually a *decreasing* trend... (p. 513, *emphasis* in the original)
Before we remake Figure 18.3 to examine that decreasing trend, we'll need to load the data.
```{r, message = F}
my_data <- read_csv("data.R/Guber1999data.csv")
glimpse(my_data)
```
Now we have the Guber (1999) data, here are three of the four plots.
```{r, fig.width = 3.5, fig.height = 3.25}
my_data %>%
ggplot(aes(x = Spend, y = SATT)) +
geom_point(alpha = 2/3) +
coord_cartesian(ylim = 800:1120)
my_data %>%
ggplot(aes(x = PrcntTake, y = SATT)) +
geom_point(alpha = 2/3) +
xlab("% Take") +
coord_cartesian(ylim = 800:1120)
my_data %>%
ggplot(aes(x = PrcntTake, y = Spend)) +
geom_vline(xintercept = seq(from = 5, to = 80, by = 5), color = "grey98") +
geom_hline(yintercept = seq(from = 3.5, to = 10, by = .5), color = "grey98") +
geom_point(alpha = 2/3) +
xlab("% Take")
```
### The model and implementation.
We'll make a custom function to standardize the criterion and predictor values.
```{r}
standardize <- function(x) {
  (x - mean(x)) / sd(x)
}

my_data <-
  my_data %>%
  mutate(PrcntTake_z = standardize(PrcntTake),
         Spend_z     = standardize(Spend),
         SATT_z      = standardize(SATT))
```
Let's open brms.
```{r, message = F, warning = F}
library(brms)
```
Now we're ready to fit the model. As Kruschke pointed out, the priors on the standardized predictors are set with
> an arbitrary standard deviation of 2.0. This value was chosen because standardized regression coefficients are algebraically constrained to fall between −1 and +1 in least-squares regression, and therefore, the regression coefficients will not exceed those limits by much. A normal distribution with standard deviation of 2.0 is reasonably flat over the range from −1 to +1. (p. 516)
With data like this, even a `prior(normal(0, 1), class = b)` would be only mildly regularizing.
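If you'd like to see which parameter classes brms will want priors for before committing to the fit, `get_prior()` lists them along with the defaults. This step isn't part of Kruschke's workflow; it's just a handy check.
```{r}
get_prior(data = my_data,
          family = student,
          SATT_z ~ 1 + Spend_z + PrcntTake_z)
```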
```{r fit1, cache = T, message = F, warning = F}
fit1 <-
  brm(data = my_data,
      family = student,
      SATT_z ~ 1 + Spend_z + PrcntTake_z,
      prior = c(prior(normal(0, 2), class = Intercept),
                prior(normal(0, 2), class = b),
                prior(normal(0, 1), class = sigma),
                prior(exponential(one_over_twentynine), class = nu)),
      chains = 4, cores = 4,
      stanvars = stanvar(1/29, name = "one_over_twentynine"),
      seed = 18)
```
The model summary is as follows:
```{r}
print(fit1)
```
So when we use a multivariable model, increases in spending now appear associated with *increases* in SAT scores.
### The posterior distribution.
Based on equation 18.1, we can convert the standardized coefficients from our multivariable model back to their original metric as follows:
$$\beta_0 = SD_y \zeta_0 + M_y - SD_y \sum_j \frac{\zeta_j M_{x_j}}{SD_{x_j}}$$
and
$$\beta_j = \frac{SD_y \zeta_j}{SD_{x_j}}$$
To use them, we'll first extract the posterior samples.
```{r}
post <- posterior_samples(fit1)
head(post)
```
Like we did in Chapter 17, let's wrap the consequences of equation 18.1 into two functions.
```{r}
make_beta_0 <- function(zeta_0, zeta_1, zeta_2, sd_x_1, sd_x_2, sd_y, m_x_1, m_x_2, m_y) {
  sd_y * zeta_0 + m_y - sd_y * ((zeta_1 * m_x_1 / sd_x_1) + (zeta_2 * m_x_2 / sd_x_2))
}

make_beta_j <- function(zeta_j, sd_j, sd_y) {
  sd_y * zeta_j / sd_j
}
```
After saving a few values, we're ready to use our custom functions.
```{r}
sd_x_1 <- sd(my_data$Spend)
sd_x_2 <- sd(my_data$PrcntTake)
sd_y <- sd(my_data$SATT)
m_x_1 <- mean(my_data$Spend)
m_x_2 <- mean(my_data$PrcntTake)
m_y <- mean(my_data$SATT)
post <-
post %>%
mutate(b_0 = make_beta_0(zeta_0 = b_Intercept,
zeta_1 = b_Spend_z,
zeta_2 = b_PrcntTake_z,
sd_x_1 = sd_x_1,
sd_x_2 = sd_x_2,
sd_y = sd_y,
m_x_1 = m_x_1,
m_x_2 = m_x_2,
m_y = m_y),
b_1 = make_beta_j(zeta_j = b_Spend_z,
sd_j = sd_x_1,
sd_y = sd_y),
b_2 = make_beta_j(zeta_j = b_PrcntTake_z,
sd_j = sd_x_2,
sd_y = sd_y))
glimpse(post)
```
Here's the top panel of Figure 18.5.
```{r, fig.width = 6, fig.height = 4, warning = F, message = F}
library(tidybayes)
# here are the primary data
post %>%
transmute(Intercept = b_0,
Spend = b_1,
`Percent Take` = b_2,
Scale = sigma * sd_y,
Normality = nu %>% log10()) %>%
gather() %>%
# the plot
ggplot(aes(x = value)) +
geom_histogram(color = "grey92", fill = "grey67",
size = .2, bins = 40) +
stat_pointintervalh(aes(y = 0),
point_interval = mode_hdi, .width = c(.95, .5)) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(NULL) +
facet_wrap(~key, scales = "free", ncol = 3)
```
> The slope on spending has a mode of about 13, which suggests that SAT scores rise by about 13 points for every extra $1000 spent per pupil. The slope on percentage taking the exam (PrcntTake) is also credibly non-zero, with a mode around −2.8, which suggests that SAT scores fall by about 2.8 points for every additional 1% of students who take the test. (p. 517)
If you want those exact modes and, say, 50% intervals around them, you can just use `tidybayes::mode_hdi()`.
```{r}
post %>%
transmute(Spend = b_1,
`Percent Take` = b_2) %>%
gather() %>%
group_by(key) %>%
mode_hdi(value, .width = .5)
```
The `brms::bayes_R2()` function makes it easy to compute a Bayesian $R^2$. Simply feed a `brm()` fit object into `bayes_R2()` and you'll get back the posterior mean, $SD$, and 95% intervals.
```{r}
bayes_R2(fit1)
```
I'm not going to go into the technical details here, but you should be aware that the Bayesian $R^2$ returned from the `bayes_R2()` function is not calculated the same way as the familiar $R^2$ from OLS. If you want to dive in, check out the paper by [Gelman, Goodrich, Gabry, and Ali](https://github.com/jgabry/bayes_R2/blob/master/bayes_R2.pdf). Anyway, if you'd like to view the Bayesian $R^2$ distribution rather than just get the summaries, specify `summary = F`, convert the output to a tibble, and plot as usual.
```{r, fig.width = 2, fig.height = 2}
bayes_R2(fit1, summary = F) %>%
as_tibble() %>%
ggplot(aes(x = R2)) +
geom_histogram(color = "grey92", fill = "grey67",
size = .2, bins = 25) +
stat_pointintervalh(aes(y = 0),
point_interval = mode_hdi, .width = .95) +
scale_y_continuous(NULL, breaks = NULL) +
labs(subtitle = expression(paste("Bayesian ", italic(R)^2)),
x = NULL) +
coord_cartesian(xlim = c(.6, 1))
```
Since the `brms::bayes_R2()` function is not identical with Kruschke’s method in the text, the results might differ a bit.
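If you're curious how that works, here's a rough by-hand sketch of the Gelman et al. computation using draws of the fitted values. It should land quite close to the `bayes_R2()` summary above; the intermediate names (`f`, `e`, and friends) are just my own.
```{r}
f <- fitted(fit1, summary = F)         # draws of the conditional mean, one column per state
e <- -1 * sweep(f, 2, my_data$SATT_z)  # residual draws: observed y minus the fitted draws

var_f <- apply(f, 1, var)              # variance of the fitted values, within each draw
var_e <- apply(e, 1, var)              # variance of the residuals, within each draw

quantile(var_f / (var_f + var_e), probs = c(.025, .5, .975))
```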
We can get a sense of the scatter plots with `bayesplot::mcmc_pairs()`.
```{r, fig.width = 6, fig.height = 5, warning = F, message = F}
library(bayesplot)
color_scheme_set("gray")
post %>%
transmute(Intercept = b_0,
Spend = b_1,
`Percent Take` = b_2,
Scale = sigma * sd_y,
Normality = nu %>% log10()) %>%
mcmc_pairs(off_diag_args = list(size = 1/8, alpha = 1/8))
```
To get the Pearson's correlations among the parameters, we'll use `psych::lowerCor()`.
```{r}
post %>%
transmute(Intercept = b_0,
Spend = b_1,
`Percent Take` = b_2,
Scale = sigma * sd_y,
Normality = nu %>% log10()) %>%
psych::lowerCor(digits = 3)
```
Kruschke finished the subsection with the observation "Sometimes we are interested in using the linear model to predict $y$ values for $x$ values of interest. It is straightforward to generate a large sample of credible $y$ values for specified $x$ values" (p. 519).
Like we practiced with in the last chapter, the simplest way to do so in brms is with the `fitted()` function. For a quick example, say we wanted to know what the model would predict if we were to have a standard-score increase in spending and a simultaneous standard-score decrease in the percent taking the exam. We'd just specify those values in a tibble and feed that tibble into `fitted()` along with the model.
```{r}
nd <-
tibble(Spend_z = 1,
PrcntTake_z = -1)
fitted(fit1,
newdata = nd)
```
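Do note that `fitted()` returns the conditional mean. If what you're after are posterior-predictive draws of actual $y$ values, which also fold in $\sigma$ and $\nu$, you can swap in `predict()` with the same arguments.
```{r}
predict(fit1,
        newdata = nd)
```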
### Redundant predictors.
> As a simplified example of correlated predictors, think of just two data points: Suppose $y = 1 \text{ for } \langle x_1, x_2 \rangle = \langle 1, 1 \rangle \text{ and } y = 2 \text{ for } \langle x_1, x_2 \rangle = \langle 2, 2 \rangle$. The linear model, $y = \beta_1 x_1 + \beta_2 x_2$ is supposed to satisfy both data points, and in this case both are satisfied by $1 = \beta_1 + \beta_2$. Therefore, many different combinations of $\beta_1$ and $\beta_2$ satisfy the data. For example, it could be that $\beta_1 = 2$ and $\beta_2 = -1$, or $\beta_1 = 0.5$ and $\beta_2 = 0.5$, or $\beta_1 = 0$ and $\beta_2 = 1$. In other words, the credible values of $\beta_1$ and $\beta_2$ are anticorrelated and trade-off to fit the data. (p. 519)
Here are what those data look like. You would not want to fit a regression model with these data.
```{r}
tibble(x_1 = 1:2,
x_2 = 1:2,
y = 1:2)
```
We can take a percentage and re-express its complement as a proportion.
```{r}
percent_take <- 37
(100 - percent_take) / 100
```
Let's make a redundant predictor and then `standardize()` it.
```{r}
my_data <-
my_data %>%
mutate(PropNotTake = (100 - PrcntTake) / 100) %>%
mutate(PropNotTake_z = standardize(PropNotTake))
glimpse(my_data)
```
We're ready to fit the redundant-predictor model.
```{r fit2, cache = T, message = F, warning = F}
fit2 <-
  brm(data = my_data,
      family = student,
      SATT_z ~ 0 + Intercept + Spend_z + PrcntTake_z + PropNotTake_z,
      prior = c(prior(normal(0, 2), class = b, coef = "Intercept"),
                prior(normal(0, 2), class = b, coef = "Spend_z"),
                prior(normal(0, 2), class = b, coef = "PrcntTake_z"),
                prior(normal(0, 2), class = b, coef = "PropNotTake_z"),
                prior(normal(0, 1), class = sigma),
                prior(exponential(one_over_twentynine), class = nu)),
      chains = 4, cores = 4,
      stanvars = stanvar(1/29, name = "one_over_twentynine"),
      seed = 18,
      # this will let us use `prior_samples()` later on
      sample_prior = "yes")
```
You might notice a few things about the `brm()` code. First, we have used the `~ 0 + Intercept + ...` syntax instead of the default syntax for intercepts. In normal situations, we would have been in good shape using the typical `~ 1 + ...` syntax for the intercept, especially given our use of standardized data. However, since brms version 2.5.0, using the `sample_prior` argument to draw samples from the prior distribution will no longer allow us to return samples from the typical brms intercept. Bürkner addressed the issue on the [Stan forums](https://discourse.mc-stan.org/t/prior-intercept-samples-no-longer-saved-in-brms-2-5-0/6107). As he pointed out, if you want to get prior samples from an intercept, you’ll have to use the alternative syntax. The other thing to point out is that even though we used the same prior on all the predictors, including the intercept, we still explicitly spelled each out with the `coef` argument. If we hadn’t been explicit like this, we would only get a single `b` vector from the `prior_samples()` function. But since we want separate vectors for each of our predictors, we used the verbose code. If you’re having a difficult time understanding these two points, experiment. Fit the model in a few different ways with either the typical or the alternative intercept syntax and with either the verbose prior code or the simplified `prior(normal(0, 2), class = b)` code. And after each, execute `prior_samples(fit2)`. You’ll see.
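To save you the suspense, here's a quick peek at those prior draws. Notice we get a separate column for each predictor because of the verbose `coef` code.
```{r}
prior_samples(fit2) %>%
  head()
```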
Let's move on. Kruschke mentioned high autocorrelations in the prose. Here are the autocorrelation plots for our $\beta$s.
```{r, fig.width = 6, fig.height = 4}
post <- posterior_samples(fit2, add_chain = T)
mcmc_acf(post,
pars = c("b_Intercept", "b_Spend_z", "b_PrcntTake_z", "b_PropNotTake_z"),
lags = 10)
```
Looks like HMC made a big difference. The $N_{eff}/N$ ratios weren't terrible, either.
```{r, fig.width = 6, fig.height = 1.75}
neff_ratio(fit2)[1:6] %>%
mcmc_neff() +
yaxis_text(hjust = 0)
```
The `brms::vcov()` function returns a variance/covariance matrix--or a correlation matrix when you set `correlation = T`--of the population-level parameters (i.e., the fixed effects). It returns the values to an unnecessary level of precision, so we'll simplify the output with `round()`.
```{r}
vcov(fit2, correlation = T) %>%
round(digits = 3)
```
Notice how much lower our `Spend_z` correlations are than those Kruschke displayed on page 520 of the text. However, it turns out the correlations among the redundant predictors were still very high.
> If any of the nondiagonal correlations are high (i.e., close to +1 or close to −1), be careful when interpreting the posterior distribution. Here, we can see that the correlation of PrcntTake and PropNotTake is −1.0, which is an immediate sign of redundant predictors. (p. 520)
You can really get a sense of the silliness of the parameters if you plot them. We'll use `geom_halfeyeh()` to get a sense of densities and summaries of the $\beta$s.
```{r, fig.width = 8, fig.height = 1.75}
post %>%
select(b_Intercept:b_PropNotTake_z) %>%
gather() %>%
# this line isn't necessary, but it does allow us to arrange the parameters on the y-axis
mutate(key = factor(key, levels = c("b_PropNotTake_z", "b_PrcntTake_z", "b_Spend_z", "b_Intercept"))) %>%
ggplot(aes(x = value, y = key)) +
geom_vline(xintercept = 0, color = "white") +
geom_halfeyeh(point_interval = mode_hdi,
.width = .95,
# the next two lines are purely aesthetic
scale = "width",
relative_scale = .9) +
labs(x = NULL,
y = NULL) +
coord_cartesian(xlim = -5:5) +
theme(axis.text.y = element_text(hjust = 0),
axis.ticks.y = element_blank())
```
Yeah, on the standardized scale those are some ridiculous estimates. Let's update our `make_beta_0()` function.
```{r}
make_beta_0 <- function(zeta_0, zeta_1, zeta_2, zeta_3, sd_x_1, sd_x_2, sd_x_3, sd_y, m_x_1, m_x_2, m_x_3, m_y) {
  sd_y * zeta_0 + m_y - sd_y * ((zeta_1 * m_x_1 / sd_x_1) + (zeta_2 * m_x_2 / sd_x_2) + (zeta_3 * m_x_3 / sd_x_3))
}
```
```{r}
sd_x_1 <- sd(my_data$Spend)
sd_x_2 <- sd(my_data$PrcntTake)
sd_x_3 <- sd(my_data$PropNotTake)
sd_y <- sd(my_data$SATT)
m_x_1 <- mean(my_data$Spend)
m_x_2 <- mean(my_data$PrcntTake)
m_x_3 <- mean(my_data$PropNotTake)
m_y <- mean(my_data$SATT)
post <-
post %>%
transmute(Intercept = make_beta_0(zeta_0 = b_Intercept,
zeta_1 = b_Spend_z,
zeta_2 = b_PrcntTake_z,
zeta_3 = b_PropNotTake_z,
sd_x_1 = sd_x_1,
sd_x_2 = sd_x_2,
sd_x_3 = sd_x_3,
sd_y = sd_y,
m_x_1 = m_x_1,
m_x_2 = m_x_2,
m_x_3 = m_x_3,
m_y = m_y),
Spend = make_beta_j(zeta_j = b_Spend_z,
sd_j = sd_x_1,
sd_y = sd_y),
`Percent Take` = make_beta_j(zeta_j = b_PrcntTake_z,
sd_j = sd_x_2,
sd_y = sd_y),
`Proportion not Take` = make_beta_j(zeta_j = b_PropNotTake_z,
sd_j = sd_x_3,
sd_y = sd_y),
Scale = sigma * sd_y,
Normality = nu %>% log10())
glimpse(post)
```
Now we've done the conversions, here are the histograms of Figure 18.6.
```{r, fig.width = 6, fig.height = 4}
post %>%
gather() %>%
ggplot() +
geom_histogram(aes(x = value),
color = "grey92", fill = "grey67",
size = .2, bins = 40) +
stat_pointintervalh(aes(x = value, y = 0),
point_interval = mode_hdi, .width = c(.95, .5)) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(NULL) +
facet_wrap(~key, scales = "free", ncol = 3)
```
Their scatter plots are as follows:
```{r, fig.width = 7, fig.height = 6, warning = F}
post %>%
mcmc_pairs(off_diag_args = list(size = 1/8, alpha = 1/8))
```
Here are the Pearson's correlations.
```{r}
post %>%
psych::lowerCor(digits = 3)
```
Figure 18.7 is all about the prior predictive distribution. Here we'll extract the priors with `prior_samples()` and wrangle all in one step.
```{r}
prior <-
prior_samples(fit2) %>%
transmute(Intercept = make_beta_0(zeta_0 = b_Intercept,
zeta_1 = b_Spend_z,
zeta_2 = b_PrcntTake_z,
zeta_3 = b_PropNotTake_z,
sd_x_1 = sd_x_1,
sd_x_2 = sd_x_2,
sd_x_3 = sd_x_3,
sd_y = sd_y,
m_x_1 = m_x_1,
m_x_2 = m_x_2,
m_x_3 = m_x_3,
m_y = m_y),
Spend = make_beta_j(zeta_j = b_Spend_z,
sd_j = sd_x_1,
sd_y = sd_y),
`Percent Take` = make_beta_j(zeta_j = b_PrcntTake_z,
sd_j = sd_x_2,
sd_y = sd_y),
`Proportion not Take` = make_beta_j(zeta_j = b_PropNotTake_z,
sd_j = sd_x_3,
sd_y = sd_y),
Scale = sigma * sd_y,
Normality = nu %>% log10())
glimpse(prior)
```
Now we've wrangled the priors, we're ready to make the histograms at the top of Figure 18.7.
```{r, fig.width = 6, fig.height = 4}
prior %>%
gather() %>%
ggplot(aes(x = value)) +
geom_histogram(color = "grey92", fill = "grey67",
size = .2, bins = 40, boundary = 0) +
stat_pointintervalh(aes(y = 0),
point_interval = mode_hdi, .width = c(.95, .5)) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(NULL) +
facet_wrap(~key, scales = "free", ncol = 3)
```
Since we used the half-Gaussian prior for our $\sigma$, our `Scale` histogram looks different from Kruschke's. Otherwise, everything's on the up and up. Here are the scatter plots at the bottom of Figure 18.7.
```{r, fig.width = 7, fig.height = 6, warning = F}
prior %>%
mcmc_pairs(off_diag_args = list(size = 1/8, alpha = 1/8))
```
And finally, here are those Pearson's correlation coefficients for the priors.
```{r}
prior %>%
psych::lowerCor(digits = 3)
```
At the top of page 523, Kruschke asked us to "notice that the posterior distribution in Figure 18.6 has ranges for the redundant parameters that are only a little smaller than their priors." With a little wrangling, we can compare the prior/posterior distributions for our redundant parameters more directly.
```{r, fig.width = 6, fig.height = 3}
post %>%
gather() %>%
rename(parameter = key,
posterior = value) %>%
bind_cols(
prior %>%
gather() %>%
transmute(prior = value)
) %>%
gather(key, value, -parameter) %>%
filter(parameter %in% c("Percent Take", "Proportion not Take")) %>%
ggplot(aes(x = value, fill = key)) +
geom_histogram(color = "grey92",
size = .2, bins = 40, boundary = 0) +
stat_pointintervalh(aes(y = 0),
point_interval = mode_hdi, .width = c(.95, .5)) +
scale_fill_viridis_d(option = "D", begin = .35, end = .65) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(NULL) +
theme(legend.position = "none") +
facet_grid(key~parameter, scales = "free")
```
He was right. The posterior distributions are only slightly narrower than the priors for those two. With our combination of data and model, we learned virtually nothing beyond the knowledge we encoded in those priors.
Kruschke mentioned SEM as a possible solution to multicollinearity. brms [isn’t fully capable of SEM, at the moment](https://github.com/paul-buerkner/brms/issues/304), but its [multivariate syntax](https://cran.r-project.org/web/packages/brms/vignettes/brms_multivariate.html) does allow for [path analysis](http://www.imachordata.com/bayesian-sem-with-brms/) and [IRT models](https://github.com/paul-buerkner/brms/issues/203). However, you can currently fit a variety of Bayesian SEMs with the [blavaan package](https://cran.r-project.org/web/packages/blavaan/index.html). I'm not aware of any textbooks highlighting blavaan. If you've seen any, [please share](https://github.com/ASKurz/Doing-Bayesian-Data-Analysis-in-brms-and-the-tidyverse/issues).
### Informative priors, sparse data, and correlated predictors.
It's worth reproducing some of Kruschke's prose from this subsection.
> The examples in this book tend to use mildly informed priors (e.g., using information about the rough magnitude and range of the data). But a benefit of Bayesian analysis is the potential for cumulative scientific progress by using priors that have been informed from previous research.
>
> Informed priors can be especially useful when the amount of data is small compared to the parameter space. A strongly informed prior essentially reduces the scope of the credible parameter space, so that a small amount of new data implies a narrow zone of credible parameter values. (p. 523)
## Multiplicative interaction of metric predictors
> Formally, interactions can have many different specific functional forms. We will consider multiplicative interaction. This means that the nonadditive interaction is expressed by multiplying the predictors. The predicted value is a weighted combination of the individual predictors and, additionally, the multiplicative product of the predictors. For two metric predictors, regression with multiplicative interaction has these algebraically equivalent expressions:
$$
\begin{eqnarray}
\mu & = & \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_{1 \times 2} x_1 x_2 \\
& = & \beta_0 + \underbrace{(\beta_1 + \beta_{1 \times 2} x_2)}_{\text{slope of } x_1} x_1 + \beta_2 x_2 \\
& = & \beta_0 + \beta_1 x_1 + \underbrace{(\beta_2 + \beta_{1 \times 2} x_1)}_{\text{slope of } x_2} x_2
\end{eqnarray}
$$
Figure 18.8 is out of our ggplot2 repertoire.
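If a flat 2-D substitute would scratch the itch, we can at least get a sense of how a multiplicative interaction warps the $\mu$ surface with a filled-tile plot over the $\langle x_1, x_2 \rangle$ grid. The coefficient values below ($\beta_0 = 0$, $\beta_1 = 1$, $\beta_2 = 1$, $\beta_{1 \times 2} = 1$) are just for illustration and aren't meant to match any particular panel of Figure 18.8.
```{r, fig.width = 4, fig.height = 3.25}
crossing(x_1 = seq(from = 0, to = 10, length.out = 50),
         x_2 = seq(from = 0, to = 10, length.out = 50)) %>%
  # mu = beta_0 + beta_1 * x_1 + beta_2 * x_2 + beta_12 * x_1 * x_2
  mutate(mu = 0 + 1 * x_1 + 1 * x_2 + 1 * x_1 * x_2) %>%
  ggplot(aes(x = x_1, y = x_2, fill = mu)) +
  geom_raster() +
  scale_fill_viridis_c(option = "D") +
  labs(fill = expression(mu))
```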
### An example.
In brms, you can specify an interaction with either the `x_i*x_j` syntax or the `x_i:x_j` syntax. I typically use `x_i:x_j`. It’s often the case that you can just make the interaction term right in the model formula. But since we’re fitting the model with standardized predictors and then using Kruschke’s equations to convert the parameters back to the unstandardized metric, it seems easier to make the interaction term in the data, first.
```{r}
my_data <-
my_data %>%
mutate(interaction = Spend * PrcntTake) %>%
mutate(interaction_z = standardize(interaction))
```
Now we'll fit the model.
```{r fit3, cache = T, message = F, warning = F}
fit3 <-
  brm(data = my_data,
      family = student,
      SATT_z ~ 1 + Spend_z + PrcntTake_z + interaction_z,
      prior = c(prior(normal(0, 2), class = Intercept),
                prior(normal(0, 2), class = b),
                prior(normal(0, 1), class = sigma),
                prior(exponential(one_over_twentynine), class = nu)),
      chains = 4, cores = 4,
      stanvars = stanvar(1/29, name = "one_over_twentynine"),
      seed = 18)
```
Inspect the `summary()`.
```{r}
summary(fit3)
```
The correlations among our parameters are about as severe as those in the text.
```{r}
vcov(fit3, correlation = T) %>%
round(digits = 3)
```
> We can see that the interaction variable is strongly correlated with both predictors. Therefore, we know that there will be strong trade-offs among the regression coefficients, and the marginal distributions of single regression coefficients might be much wider than when there was no interaction included. (p. 528)
Let's convert to the unstandardized metric.
```{r}
sd_x_3 <- sd(my_data$interaction)
m_x_3 <- mean(my_data$interaction)
post <-
posterior_samples(fit3) %>%
transmute(Intercept = make_beta_0(zeta_0 = b_Intercept,
zeta_1 = b_Spend_z,
zeta_2 = b_PrcntTake_z,
zeta_3 = b_interaction_z,
sd_x_1 = sd_x_1,
sd_x_2 = sd_x_2,
sd_x_3 = sd_x_3,
sd_y = sd_y,
m_x_1 = m_x_1,
m_x_2 = m_x_2,
m_x_3 = m_x_3,
m_y = m_y),
Spend = make_beta_j(zeta_j = b_Spend_z,
sd_j = sd_x_1,
sd_y = sd_y),
`Percent Take` = make_beta_j(zeta_j = b_PrcntTake_z,
sd_j = sd_x_2,
sd_y = sd_y),
`Spend : Percent Take` = make_beta_j(zeta_j = b_interaction_z,
sd_j = sd_x_3,
sd_y = sd_y),
Scale = sigma * sd_y,
Normality = nu %>% log10())
glimpse(post)
```
Now we've done the conversions, here are the histograms of Figure 18.9.
```{r, fig.width = 6, fig.height = 4}
post %>%
gather() %>%
ggplot(aes(x = value)) +
geom_histogram(color = "grey92", fill = "grey67",
size = .2, bins = 40) +
stat_pointintervalh(aes(y = 0),
point_interval = mode_hdi, .width = c(.95, .5)) +
scale_y_continuous(NULL, breaks = NULL) +
xlab(NULL) +
facet_wrap(~key, scales = "free", ncol = 3)
```
"To properly understand the credible slopes on the two predictors, we must consider the credible slopes on each predictor as a function of the value of the other predictor" (p. 528). This is our motivation for the middle panel of Figure 18.9. To make it, we'll need to `expand()` our `post`, wrangle a bit, and plot with `geom_pointrange()`.
```{r, fig.width = 6, fig.height = 2.5}
# this will come in handy in `expand()`
bounds <- range(my_data$PrcntTake)
# wrangle
post %>%
expand(nesting(Spend, `Spend : Percent Take`),
PrcntTake = seq(from = bounds[1], to = bounds[2], length.out = 20)) %>%
mutate(slope = Spend + `Spend : Percent Take` * PrcntTake) %>%
group_by(PrcntTake) %>%
median_hdi(slope) %>%
# plot
ggplot(aes(x = PrcntTake, y = slope,
ymin = .lower, ymax = .upper)) +
geom_hline(yintercept = 0, color = "white") +
geom_pointrange(color = "grey50") +
labs(x = "Value of PrcntTake",
y = "Slope on Spend",
title = expression(paste("Slope on Spend is ", beta[1] + beta[3] %.% "PrcntTake")))
```
That worked like a charm. We'll follow the same basic order of operations for the final panel.
```{r, fig.width = 6, fig.height = 2.5}
# this will come in handy in `expand()`
bounds <- range(my_data$Spend)
# wrangle
post %>%
expand(nesting(`Percent Take`, `Spend : Percent Take`),
Spend = seq(from = bounds[1], to = bounds[2], length.out = 20)) %>%
mutate(slope = `Percent Take` + `Spend : Percent Take` * Spend) %>%
group_by(Spend) %>%
median_hdi(slope) %>%
# plot
ggplot(aes(x = Spend, y = slope,
ymin = .lower, ymax = .upper)) +
geom_pointrange(color = "grey50") +
labs(x = "Value of Spend",
y = "Slope on PrcntTake",
title = expression(paste("Slope on PrcntTake is ", beta[2] + beta[3] %.% "Spend")))
```
Kruschke outlined all this in the opening paragraphs of page 530. His parting words of this subsection warrant repeating: "if you include an interaction term, you cannot ignore it even if its marginal posterior distribution includes zero" (p. 530).
## Shrinkage of regression coefficients
> In some research, there are many candidate predictors which we suspect could possibly be informative about the predicted variable. For example, when predicting college GPA, we might include high-school GPA, high-school SAT score, income of student, income of parents, years of education of the parents, spending per pupil at the student’s high school, student IQ, student height, weight, shoe size, hours of sleep per night, distance from home to school, amount of caffeine consumed, hours spent studying, hours spent earning a wage, blood pressure, etc. We can include all the candidate predictors in the model, with a regression coefficient for every predictor. And this is not even considering interactions, which we will ignore for now.
>
> With so many candidate predictors of noisy data, there may be some regression coefficients that are spuriously estimated to be non-zero. We would like some protection against accidentally nonzero regression coefficients. (p. 530)
That's what this section is all about. We'll make our random noise predictors with `rnorm()`.
```{r}
set.seed(18)
my_data <-
my_data %>%
mutate(x_rand_1 = rnorm(n = n(), 0, 1),
x_rand_2 = rnorm(n = n(), 0, 1),
x_rand_3 = rnorm(n = n(), 0, 1),
x_rand_4 = rnorm(n = n(), 0, 1),
x_rand_5 = rnorm(n = n(), 0, 1),
x_rand_6 = rnorm(n = n(), 0, 1),
x_rand_7 = rnorm(n = n(), 0, 1),
x_rand_8 = rnorm(n = n(), 0, 1),
x_rand_9 = rnorm(n = n(), 0, 1),
x_rand_10 = rnorm(n = n(), 0, 1),
x_rand_11 = rnorm(n = n(), 0, 1),
x_rand_12 = rnorm(n = n(), 0, 1))
glimpse(my_data)
```
Here's the naïve model.
```{r fit4, cache = T, message = F, warning = F, results = 'hide'}
fit4 <-
  update(fit1,
         newdata = my_data,
         formula = SATT_z ~ 1 + PrcntTake_z + Spend_z + x_rand_1 + x_rand_2 + x_rand_3 + x_rand_4 + x_rand_5 + x_rand_6 + x_rand_7 + x_rand_8 + x_rand_9 + x_rand_10 + x_rand_11 + x_rand_12,
         seed = 18)
```
Its parameter summary is as follows:
```{r}
posterior_summary(fit4) %>%
round(digits = 2)
```
Before we can make Figure 18.11, we'll need to update our `make_beta_0()` function to accommodate this model.
```{r}
make_beta_0 <-
function(zeta_0, zeta_1, zeta_2, zeta_3, zeta_4, zeta_5, zeta_6, zeta_7, zeta_8, zeta_9, zeta_10, zeta_11, zeta_12, zeta_13, zeta_14,
sd_x_1, sd_x_2, sd_x_3, sd_x_4, sd_x_5, sd_x_6, sd_x_7, sd_x_8, sd_x_9, sd_x_10, sd_x_11, sd_x_12, sd_x_13, sd_x_14, sd_y,
m_x_1, m_x_2, m_x_3, m_x_4, m_x_5, m_x_6, m_x_7, m_x_8, m_x_9, m_x_10, m_x_11, m_x_12, m_x_13, m_x_14, m_y) {
sd_y * zeta_0 + m_y - sd_y * ((zeta_1 * m_x_1 / sd_x_1) +
(zeta_2 * m_x_2 / sd_x_2) +
(zeta_3 * m_x_3 / sd_x_3) +
(zeta_4 * m_x_4 / sd_x_4) +
(zeta_5 * m_x_5 / sd_x_5) +
(zeta_6 * m_x_6 / sd_x_6) +
(zeta_7 * m_x_7 / sd_x_7) +
(zeta_8 * m_x_8 / sd_x_8) +
(zeta_9 * m_x_9 / sd_x_9) +
(zeta_10 * m_x_10 / sd_x_10) +
(zeta_11 * m_x_11 / sd_x_11) +
(zeta_12 * m_x_12 / sd_x_12) +
(zeta_13 * m_x_13 / sd_x_13) +
(zeta_14 * m_x_14 / sd_x_14))
}
```
Sigh, our poor `make_beta_0()` and `make_beta_j()` code is getting obscene. I don't have the energy to think of how to wrap this into a simpler function. But someone probably should. If that ends up being you, [do share your code](https://github.com/ASKurz/Doing-Bayesian-Data-Analysis-in-brms-and-the-tidyverse/issues).
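For the morbidly curious, here's one direction such a simplification might take: treat the standardized slopes as a draws-by-predictors matrix and let a little matrix algebra do the summing. This is only a sketch I haven't woven into the rest of the chapter, and the `_general` names are my own.
```{r}
# zeta_0: vector of standardized intercept draws
# zeta  : draws-by-predictors matrix of standardized slope draws
# sd_x, m_x: vectors of the predictor SDs and means, in the same column order as `zeta`
make_beta_0_general <- function(zeta_0, zeta, sd_x, sd_y, m_x, m_y) {
  sd_y * zeta_0 + m_y - sd_y * as.vector(zeta %*% (m_x / sd_x))
}

make_beta_j_general <- function(zeta, sd_x, sd_y) {
  # rescale each column of slope draws by sd_y / sd_x for that predictor
  sweep(zeta, 2, sd_y / sd_x, `*`)
}
```
You could then build `zeta` with something like `posterior_samples(fit4) %>% select(b_Spend_z:b_x_rand_12) %>% as.matrix()`. For now, we'll slog through the verbose way.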
```{r}
sd_x_1 <- sd(my_data$Spend)
sd_x_2 <- sd(my_data$PrcntTake)
sd_x_3 <- sd(my_data$x_rand_1)
sd_x_4 <- sd(my_data$x_rand_2)
sd_x_5 <- sd(my_data$x_rand_3)
sd_x_6 <- sd(my_data$x_rand_4)
sd_x_7 <- sd(my_data$x_rand_5)
sd_x_8 <- sd(my_data$x_rand_6)
sd_x_9 <- sd(my_data$x_rand_7)
sd_x_10 <- sd(my_data$x_rand_8)
sd_x_11 <- sd(my_data$x_rand_9)
sd_x_12 <- sd(my_data$x_rand_10)
sd_x_13 <- sd(my_data$x_rand_11)
sd_x_14 <- sd(my_data$x_rand_12)
sd_y <- sd(my_data$SATT)
m_x_1 <- mean(my_data$Spend)
m_x_2 <- mean(my_data$PrcntTake)
m_x_3 <- mean(my_data$x_rand_1)
m_x_4 <- mean(my_data$x_rand_2)
m_x_5 <- mean(my_data$x_rand_3)
m_x_6 <- mean(my_data$x_rand_4)
m_x_7 <- mean(my_data$x_rand_5)
m_x_8 <- mean(my_data$x_rand_6)
m_x_9 <- mean(my_data$x_rand_7)
m_x_10 <- mean(my_data$x_rand_8)
m_x_11 <- mean(my_data$x_rand_9)
m_x_12 <- mean(my_data$x_rand_10)
m_x_13 <- mean(my_data$x_rand_11)
m_x_14 <- mean(my_data$x_rand_12)
m_y <- mean(my_data$SATT)
post <-
posterior_samples(fit4) %>%
transmute(Intercept = make_beta_0(zeta_0 = b_Intercept,
zeta_1 = b_Spend_z,
zeta_2 = b_PrcntTake_z,
zeta_3 = b_x_rand_1,
zeta_4 = b_x_rand_2,
zeta_5 = b_x_rand_3,
zeta_6 = b_x_rand_4,
zeta_7 = b_x_rand_5,
zeta_8 = b_x_rand_6,
zeta_9 = b_x_rand_7,
zeta_10 = b_x_rand_8,
zeta_11 = b_x_rand_9,
zeta_12 = b_x_rand_10,
zeta_13 = b_x_rand_11,
zeta_14 = b_x_rand_12,
sd_x_1 = sd_x_1,
sd_x_2 = sd_x_2,
sd_x_3 = sd_x_3,
sd_x_4 = sd_x_4,
sd_x_5 = sd_x_5,
sd_x_6 = sd_x_6,
sd_x_7 = sd_x_7,
sd_x_8 = sd_x_8,
sd_x_9 = sd_x_9,
sd_x_10 = sd_x_10,
sd_x_11 = sd_x_11,
sd_x_12 = sd_x_12,
sd_x_13 = sd_x_13,
sd_x_14 = sd_x_14,
sd_y = sd_y,
m_x_1 = m_x_1,
m_x_2 = m_x_2,
m_x_3 = m_x_3,
m_x_4 = m_x_4,
m_x_5 = m_x_5,
m_x_6 = m_x_6,
m_x_7 = m_x_7,
m_x_8 = m_x_8,
m_x_9 = m_x_9,
m_x_10 = m_x_10,
m_x_11 = m_x_11,
m_x_12 = m_x_12,
m_x_13 = m_x_13,
m_x_14 = m_x_14,
m_y = m_y),
Spend = make_beta_j(zeta_j = b_Spend_z,
sd_j = sd_x_1,
sd_y = sd_y),
`Percent Take` = make_beta_j(zeta_j = b_PrcntTake_z,
sd_j = sd_x_2,
sd_y = sd_y),
x_rand_1 = make_beta_j(zeta_j = b_x_rand_1,
sd_j = sd_x_3,
sd_y = sd_y),
x_rand_2 = make_beta_j(zeta_j = b_x_rand_2,
sd_j = sd_x_4,
sd_y = sd_y),
x_rand_3 = make_beta_j(zeta_j = b_x_rand_3,
sd_j = sd_x_5,
sd_y = sd_y),
x_rand_4 = make_beta_j(zeta_j = b_x_rand_4,
sd_j = sd_x_6,
sd_y = sd_y),
x_rand_5 = make_beta_j(zeta_j = b_x_rand_5,
sd_j = sd_x_7,
sd_y = sd_y),
x_rand_6 = make_beta_j(zeta_j = b_x_rand_6,
sd_j = sd_x_8,
sd_y = sd_y),
x_rand_7 = make_beta_j(zeta_j = b_x_rand_7,
sd_j = sd_x_9,
sd_y = sd_y),