/
survextrap.R
940 lines (890 loc) · 40.5 KB
/
survextrap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
#' Flexible Bayesian parametric survival models
#'
#' Flexible Bayesian parametric survival models. Individual data are
#' represented using M-splines and a proportional hazards or flexible
#' non-proportional hazards model. External aggregate data can be
#' included, for example, to enable extrapolation outside the
#' individual data. A fixed background hazard can also be included in
#' an additive hazards (relative survival) model. Mixture cure
#' versions of these models can also be used.
#'
#' @param formula A survival formula in standard R formula syntax, with a call to `Surv()`
#' on the left hand side.
#'
#' Covariates included on the right hand side of the formula with be
#' modelled with proportional hazards, or if \code{nonprop} is
#' \code{TRUE} then a non-proportional hazards is used.
#'
#' If \code{data} is omitted, so that the model is being fitted to
#' external aggregate data alone, without individual data, then the
#' formula should not include a \code{Surv()} call. The left-hand
#' side of the formula will then be empty, and the right hand side
#' specifies the covariates as usual. For example, \code{formula =
#' ~1} if there are no covariates.
#'
#' @param data Data frame containing variables in `formula`.
#' Variables should be in a data frame, and not in the working
#' environment.
#'
#' This may be omitted, in which case \code{external} must be
#' supplied. This allows a model to be fitted to external aggregate
#' data alone, without any individual-level data.
#'
#' @param external External data as a data frame of aggregate survival counts with columns named:
#'
#' `start`: Start time
#'
#' `stop`: Follow-up time
#'
#' `n`: Number of people alive at `start`
#'
#' `r`: Number of those people who are still alive at `stop`
#'
#' If there are covariates in \code{formula}, then the values they
#' take in the external data must be supplied as additional columns in
#' \code{external}. Therefore if there are external data, the
#' covariates in \code{formula} and \code{data} should not be named
#' \code{start},\code{stop},\code{n} or \code{r}.
#'
#' @param cure If `TRUE`, a mixture cure model is used, where the
#' "uncured" survival is defined by the M-spline model, and the cure
#' probability is estimated.
#'
#' @param nonprop Non-proportional hazards model specification.
#' This is achieved by modelling the spline basis coefficients in terms of the covariates. See
#' the [methods vignette](https://chjackson.github.io/survextrap/articles/methods.html) for more details.
#'
#' If \code{TRUE}, then all covariates are modelled with
#' non-proportional hazards, using the same model formula as
#' \code{formula}.
#'
#' If this is a formula, then this is assumed to define a model for
#' the dependence of the basis coefficients on the covariates.
#'
#' IF this is \code{NULL} or \code{FALSE} (the default) then any
#' covariates are modelled with proportional hazards.
#'
#' @param prior_hscale Prior for the baseline log hazard scale
#' parameter (`alpha` or `log(eta)`). This should be a call to a
#' prior constructor function, such as `p_normal(0,1)` or
#' `p_t(0,2,2)`. Supported prior distribution families are normal
#' (parameters mean and SD) and t distributions (parameters
#' location, scale and degrees of freedom). The default is a normal
#' distribution with mean 0 and standard deviation 20.
#'
#' Note that `eta` is not in itself a hazard, but it is proportional
#' to the hazard (see the vignette for the full model specification).
#'
#' "Baseline" is defined by the continuous covariates taking a value
#' of zero and factor covariates taking their reference level. To
#' use a different baseline, the data should be transformed
#' appropriately beforehand, so that a value of zero has a different
#' meaning. For continuous covariates, it helps for both
#' computation and interpretation to define the value of zero to
#' denote a typical value in the data, e.g. the mean.
#'
#' @param prior_loghr Priors for log hazard ratios. This should be a
#' call to `p_normal()` or `p_t()`. A list of calls can also be
#' provided, to give different priors to different coefficients,
#' where the name of each list component matches the name of the
#' coefficient, e.g. ```list("age45-59" = p_normal(0,1), "age60+" =
#' p_t(0,2,3))```
#'
#' The default is `p_normal(0,2.5)` for all coefficients.
#'
#' @param prior_hsd Gamma prior for the standard deviation that
#' controls the variability over time (or smoothness) of the hazard
#' function. This should be a call to `p_gamma()`. The default is
#' `p_gamma(2,1)`. See \code{\link{prior_haz_sd}} for a way to
#' calibrate this to represent a meaningful belief.
#'
#' @param prior_cure Prior for the baseline cure probability. This should be a
#' call to `p_beta()`. The default is a uniform prior, `p_beta(1,1)`.
#' Baseline is defined by the mean of continuous covariates and the reference
#' level of factor covariates.
#'
#' @param prior_logor_cure Priors for log odds ratios on cure probabilities.
#' This should be a call to `p_normal()` or `p_t()`. The default is
#' `p_normal(0,2.5)`.
#'
#' @param prior_hrsd Prior for the standard deviation parameters that
#' smooth the non-proportionality effects over time in
#' non-proportional hazards models. This should be a call to
#' `p_gamma()` or a list of calls to `p_gamma()` with one component
#' per covariate, as in `prior_loghr`. See
#' \code{\link{prior_hr_sd}} for a way to calibrate this to
#' represent a meaningful belief.
#'
#' @param backhaz Background hazard, that is, for causes of death
#' other than the cause of interest. This defines a
#' "relative survival" model where the overall hazard is the sum of
#' a cause-specific hazard and a background hazard. The background
#' hazard is assumed to be known, and the cause-specific hazard is
#' modelled with the flexible parametric model.
#'
#' The background hazard can be supplied in two forms. The meaning of predictions
#' from the model depends on which of these is used.
#'
#' (a) A data frame with columns \code{"hazard"} and \code{"time"},
#' specifying the background hazard at all times as a
#' piecewise-constant (step) function. Each row gives the background
#' hazard between the specified time and the next time. The first
#' element of \code{"time"} should be 0, and the final row specifies
#' the hazard at all times greater than the last element of
#' \code{"time"}. Predictions from the model fitted by `survextrap`
#' will then include this background hazard, because it is known at
#' all times.
#'
#' (b) The (quoted) name of a variable in the data giving the
#' background hazard. For censored cases, the exact value does not
#' matter. The predictions from `survextrap` will then describe the
#' excess hazard or survival on top of this background. The overall
#' hazard cannot be predicted in general, because the background
#' hazard is only specified over a limited range of time.
#'
#' If there is external data, and `backhaz` is supplied in form (b),
#' then the user should also supply the background survival at the
#' start and stop points in columns of the external data named
#' `"backsurv_start"` and `"backsurv_stop"`. This should describe the
#' same reference population as `backhaz`, though the package does not
#' check for consistency between these.
#'
#' If there are stratifying variables specified in
#' \code{backhaz_strata}, then there should be multiple rows giving
#' the background hazard value for each time period and stratifying
#' variable.
#'
#' If `backhaz` is \code{NULL} (the default) then no background hazard
#' component is included in the model.
#'
#' @param backhaz_strata A character vector of names of variables that
#' appear in \code{backhaz} that indicate strata, e.g.
#' \code{backhaz_strata = c("agegroup","sex")}. This allows
#' different background hazard values to be used for different
#' subgroups. These variables must also
#' appear in the datasets being modelled, that is, in \code{data},
#' \code{external} or both. Each row of those datasets should then
#' have a corresponding row in \code{backhaz} which has the same
#' values of the stratifying variables.
#'
#' This is \code{NULL} by default, indicating no stratification of the
#' background hazard.
#'
#' If stratification is done, then \code{backhaz} must be supplied in
#' form (a), as a data frame rather than a variable in the data.
#'
#' @param mspline A list of control parameters defining the spline model.
#'
#' `knots`: Spline knots. If this is not supplied, then the number
#' of knots is taken from `df`, and their location is taken from
#' equally-spaced quantiles of the observed event times in the
#' individual-level data.
#'
#' `add_knots`: This is intended
#' to be used when there are \code{external} data included in the
#' model. External data are typically outside the time period
#' covered by the individual data. `add_knots` would then be chosen
#' to span the time period covered by the external data, so that the
#' hazard trajectory can vary over that time.
#'
#' If there are external data, and both `knots` and `add_knots` are
#' omitted, then a default set of knots is chosen to span both the
#' individual and external data, by taking the quantiles of a vector
#' defined by concatenating the individual-level event times with
#' the \code{start} and \code{stop} times in the external data.
#'
#' `df`: Degrees of freedom, i.e. the number of parameters (or basis
#' terms) intended to result from choosing knots based on quantiles
#' of the data. The total number of parameters will then be `df`
#' plus the number of additional knots specified in
#' `add_knots`. `df` defaults to 10. This does not necessarily
#' overfit, because the function is smoothed through the prior.
#'
#' `degree`: Polynomial degree used for the basis function. The
#' default is 3, giving a cubic. This can only be changed from 3
#' if `bsmooth` is \code{FALSE}.
#'
#' `bsmooth`: If \code{TRUE} (on by default) the spline is smoother
#' at the highest knot, by defining the derivative and second derivative
#' at this point to be zero.
#'
#' @param add_knots Any extra knots beyond those chosen from the
#' individual-level data (or supplied in `knots`). All other spline
#' specifications are set to their defaults, as described in
#' \code{mspline}. For example, `add_knots = 10` is a shorthand
#' for `mspline = list(add_knots = 10)`.
#'
#' @param smooth_model The default \code{"exchangeable"} uses
#' independent logistic priors on the multinomial-logit spline
#' coefficients, conditionally on a common smoothing variance
#' parameter.
#'
#' The alternative, \code{"random_walk"}, specifies a second-order
#' random walk for the multinomial-logit spline coefficients, based
#' on logistic distributions with a common smoothing variance. See
#' the [methods vignette](https://chjackson.github.io/survextrap/articles/methods.html)
#' for full details.
#'
#' In non-proportional hazards models, setting \code{smooth_model} also
#' determines whether an exchangeable or random walk model is used for the
#' non-proportionality parameters (\eqn{\delta}).
#'
#' @param hsd Smoothing variance parameter estimation.
#'
#' `"bayes"`: the smoothing parameter is estimated by full Bayes (the default).
#'
#' `"eb"`: empirical Bayes is used.
#'
#' Alternatively, if a number is supplied here, then the smoothing parameter is fixed to this number.
#'
#' @param coefs_mean Spline basis coefficients that define the prior
#' mean for the hazard function. By default, these are set to values
#' that define a constant hazard function (see
#' \code{\link{mspline_constant_coefs}}). They are normalised to
#' sum to 1 internally (if they do not already).
#'
#' @param fit_method Method from \pkg{rstan} used to fit the model.
#'
##' If \code{fit_method="mcmc"} then a sample from the posterior is
##' drawn using Markov Chain Monte Carlo sampling, via \pkg{rstan}'s
##' \code{\link[rstan:stanmodel-method-sampling]{rstan::sampling()}}
##' function. This is the default. It is the most accurate but the
##' slowest method.
##'
##' If \code{fit_method="opt"}, then instead of an MCMC sample from
##' the posterior, `survextrap` returns the posterior mode
##' calculated using optimisation, via \pkg{rstan}'s
##' \code{\link[rstan:stanmodel-method-optimizing]{rstan::optimizing()}}
##' function. A sample from a normal approximation to the
##' (real-line-transformed) posterior distribution is drawn in order
##' to obtain credible intervals.
##'
##' If \code{fit_method="vb"}, then variational Bayes methods are
##' used, via \pkg{rstan}'s
##' \code{\link[rstan:stanmodel-method-vb]{rstan::vb()}} function.
##' This is labelled as "experimental" by \pkg{rstan}. It might
##' give a better approximation to the posterior than
##' \code{fit_method="opt"}, and is faster than MCMC, in particular
##' for large datasets, but has not been investigated in depth for
##' these models.
##'
##' @param loo Compute leave-one-out cross-validation statistics.
##' This is done by default. Set to \code{FALSE} to not compute
##' them. If these statistics are computed, then they are returned
##' in the \code{loo} and \code{loo_external} components of the
##' object returned by \code{survextrap}. \code{loo} describes the
##' fit of the model to the individual-level data, and
##' \code{loo_external} describes fit to the external data.
##'
##' See the \code{"examples"} vignette for more explanation of these.
##'
##' @param ... Additional arguments to supply to control the Stan fit,
##' passed to the appropriate \pkg{rstan} function, depending on
##' which is chosen through the `fit_method` argument.
#'
#' @return A list of objects defining the fitted model. These are not
#' intended to be extracted directly by users. Instead see
#' \code{\link{summary.survextrap}} for summarising the parameter
#' estimates, and the functions \code{\link{hazard}},
#' \code{\link{survival}}, \code{\link{rmst}} and others for
#' computing interesting summaries of the fitted survival
#' distribution.
#'
#' The object returned by `rstan` containing samples from the fitted
#' model is returned in the `stanfit` component. See the
#' [rstan](https://mc-stan.org/rstan/index.html) documentation. The
#' function \code{\link{get_draws}} is provided to convert this to a
#' simple matrix of posterior samples with all chains collapsed.
#'
#' @references Jackson, C. (2023) \code{survextrap}: a package for flexible and transparent
#' survival extrapolation. BMC Medical Research Methodology 23:282.
#' \doi{10.1186/s12874-023-02094-1}
#'
#' @export
survextrap <- function(formula,
data=NULL,
external=NULL,
cure = FALSE,
nonprop = FALSE,
prior_hscale = p_normal(0, 20),
prior_loghr = NULL,
prior_hsd = p_gamma(2,1),
prior_cure = p_beta(1,1),
prior_logor_cure = NULL,
prior_hrsd = p_gamma(2,1),
backhaz = NULL,
backhaz_strata = NULL,
mspline = NULL,
add_knots = NULL,
smooth_model = "exchangeable",
hsd = "bayes",
coefs_mean = NULL,
fit_method = "mcmc",
loo = (fit_method=="mcmc"),
...)
{
validate_formula(formula, needs_response=FALSE)
td <- make_td(formula, data)
data_x <- if (td$indiv) data else external
x <- make_x(formula, data_x, td)
xcure <- make_xcure(cure, data, td)
xnph <- make_nonprop(nonprop, data, td, formula)
backhaz <- make_backhaz(backhaz, data, external, td, backhaz_strata)
external <- make_external(external, formula, x, xcure, xnph, backhaz, backhaz_strata)
mspline <- make_mspline(mspline, td, external, add_knots)
basis_event <- make_basis(td$t_event, mspline)
ibasis_event <- make_basis(td$t_event, mspline, integrate = TRUE)
ibasis_rcens <- make_basis(td$t_rcens, mspline, integrate = TRUE)
nvars <- mspline$nvars
if (is.null(coefs_mean)){
coefs_mean <- mspline_constant_coefs(mspline)
} else {
coefs_mean <- validate_coefs_mean(coefs_mean)
}
b_mean <- aa(log(coefs_mean[-1] / coefs_mean[1]))
smooth_models <- c("exchangeable","random_walk")
smooth_model <- match.arg(smooth_model, smooth_models)
smooth_model_id <- match(smooth_model, smooth_models)
est_hsd <- (hsd == "bayes")
if (est_hsd) hsd_init <- 1
ibasis_ext_stop <- if (external$nextern>0) make_basis(external$stop, mspline, integrate = TRUE) else matrix(nrow=0, ncol=nvars)
ibasis_ext_start <- if (external$nextern>0) make_basis(external$start, mspline, integrate = TRUE) else matrix(nrow=0, ncol=nvars)
if (xnph$ncovs > 0) {
if (x$ncovs==0) {
warning("Ignoring non-proportional hazards model specification, since no covariates in model. ")
}
}
priors <- get_priors(prior_hscale, prior_loghr, prior_hsd, prior_cure, prior_logor_cure,
x, xcure, xnph, est_hsd, prior_hrsd)
standata <- nlist(nevent=td$nevent, nrcens=td$nrcens,
nvars, nextern=external$nextern, ncovs = x$ncovs,
basis_event, ibasis_event, ibasis_rcens,
ibasis_ext_stop, ibasis_ext_start,
x_event=x$event, x_rcens=x$rcens,
r_ext = external$r,
n_ext = external$n,
x_ext = external$X,
ncurecovs = xcure$ncovs,
nnphcovs = xnph$ncovs,
xcure_event = xcure$event, xcure_rcens = xcure$rcens,
xcure_ext = external$Xcure,
xnph_event = xnph$event, xnph_rcens = xnph$rcens,
xnph_ext = external$Xnph,
b_mean,
est_hsd,
smooth_model = smooth_model_id,
cure = xcure$cure,
relative=backhaz$relative,
backhaz_event = backhaz$event,
backsurv_ext_stop = external$backsurv_stop,
backsurv_ext_start = external$backsurv_start,
prior_hscale_dist = priors$hscale$distid,
prior_hscale = as.numeric(unlist(priors$hscale[c("location","scale","df")])),
prior_loghr_dist = aa(priors$loghr$distid),
prior_loghr_location = aa(priors$loghr$location),
prior_loghr_scale = aa(priors$loghr$scale),
prior_loghr_df = aa(priors$loghr$df),
prior_logor_cure_dist = aa(priors$logor_cure$distid),
prior_logor_cure_location = aa(priors$logor_cure$location),
prior_logor_cure_scale = aa(priors$logor_cure$scale),
prior_logor_cure_df = aa(priors$logor_cure$df),
prior_hsd = as.numeric(unlist(priors$hsd[c("shape","rate")])),
prior_cure = as.numeric(unlist(priors$cure[c("shape1","shape2")])),
modelid = 1,
prior_hrsd = priors$hrsd
)
pcure_init <- if (xcure$cure) 0.5 else numeric()
staninit <- list(gamma = aa(0),
loghr = aa(rep(0, standata$ncovs)),
beta_err = aa(rep(0, standata$nvars-1)),
hsd = aa(if(standata$est_hsd) hsd_init else numeric()),
pcure = aa(pcure_init))
if (identical(hsd, "eb")){
hsd <- eb_smoothness(standata, staninit, prior_hsd)
}
standata$hsd_fixed <- if (est_hsd) aa(numeric()) else aa(hsd)
rstan_optimizing_ops <- function(...){
ops <- list(...)
ops <- ops[names(ops) %in% .rstan_optimizing_args]
if (is.null(ops$hessian)) ops$hessian <- TRUE
if (is.null(ops$draws)) ops$draws <- 2000
ops
}
rstan_sampling_ops <- function(...){
ops <- list(...)
ops <- ops[names(ops) %in% .rstan_sampling_args]
ops
}
rstan_vb_ops <- function(...){
ops <- list(...)
ops <- ops[names(ops) %in% .rstan_vb_args]
ops
}
stanmod <- "survextrap"
if (fit_method=="opt")
fits <- do.call(rstan::optimizing,
c(list(object=stanmodels[[stanmod]], data=standata, init=staninit),
rstan_optimizing_ops(...)))
else if (fit_method=="mcmc")
fits <- do.call(rstan::sampling,
c(list(object=stanmodels[[stanmod]], data=standata,
pars = "beta", include=FALSE), rstan_sampling_ops(...)))
else if (fit_method=="vb")
fits <- do.call(rstan::vb,
c(list(object=stanmodels[[stanmod]], data=standata,
pars = "beta", include=FALSE), rstan_vb_ops(...)))
else stop(sprintf("Unknown fit_method: %s",fit_method))
km <- if (td$indiv) surv_summary(survfit(formula, data=data), data=data) else NULL
misc_keep <- nlist(formula, indiv=td$indiv, stanfit=fits,
fit_method,
cure_formula = xcure$cure_formula,
nph_formula = xnph$nph_formula,
backhaz=backhaz$df,
backhaz_strata)
standata_keep <- standata[c("nvars","ncovs","ncurecovs","nnphcovs","nevent","nrcens","nextern")]
model_keep <- nlist(cure=xcure$cure, est_hsd)
spline_keep <- nlist(mspline)
covinfo_names <- c("xnames","xlevs","xinds","mfbase")
x <- list(x = x[covinfo_names])
xcure <- list(xcure = xcure[covinfo_names])
xnph <- list(xnph = xnph[covinfo_names])
prior_keep <- list(priors=priors)
prioretc_keep <- nlist(coefs_mean, hsd)
res <- c(misc_keep, standata_keep, model_keep, spline_keep, x, xcure, xnph,
prior_keep, prioretc_keep, nlist(km))
prior_sample <- get_prior_sample(mspline=mspline,
coefs_mean=coefs_mean,
prior_hsd=prior_hsd,
prior_hscale=prior_hscale,
prior_loghr=prior_loghr,
formula = if (res$ncovs==0) NULL else as.formula(delete.response(terms(formula))),
cure_formula = if (standata$ncurecovs==0) NULL else xcure$cure_formula,
nph_formula = xnph$nph_formula,
prior_hrsd=prior_hrsd,
default_newdata = default_newdata(res),
xlevs=x$x$xlevs)
res <- c(res, nlist(prior_sample))
class(res) <- "survextrap"
if (loo && (fit_method=="mcmc")) {
res$loo <- loo_survextrap(res, standata, loglik_ipd)
if (external$nextern > 0)
res$loo_external <- loo_survextrap(res, standata, loglik_external)
}
else {
res$loo <- sprintf("Cross-validation statistics not available with `fit_method=\"%s\"`", fit_method)
class(res$loo) <- "message"
}
res
}
##' Make default M-spline knot specification given a survival dataset.
##'
##' Choose default M-spline knot locations given a dataset and desired
##' number of spline parameters. Assumes a cubic spline, and knots
##' based on quantiles of event times observed in the individual data.
##'
##' If there are also external data, then these are based on quantiles
##' of a vector defined by concatenating the event times in the
##' individual data with the unique start and stop times in the
##' external data.
##'
##' This is designed to have the same arguments as
##' \code{\link{survextrap}}. It is intended for use when we want to
##' fit a set of \code{\link{survextrap}} models with the same spline
##' specification.
##'
##' See also \code{\link{mspline_list_init}} and \code{\link{mspline_init}},
##' which have lower-level interfaces, and are designed for use without
##' data, e.g. when illustrating a theoretical M-spline model.
##'
##' @inheritParams survextrap
##'
##' @inheritParams mspline_init
##'
##' @param add_knots Additional knots, other than those determined from the quantiles of the individual data.
##' Typically used to add a maximum knot at the time that we want to extrapolate to.
##'
##' @return A list with components
##'
##' \code{knots} Knot locations. The number of
##' knots will be equal to \code{df} + \code{degree} + 2.
##' \code{degree} Spline polynomial degree (i.e. 3)
##' \code{nvars} Number of basis variables (an alias for \code{df})
##'
##' @export
mspline_spec <- function(formula, data, cure=FALSE, nonprop=NULL,
backhaz=NULL, backhaz_strata=NULL, external=NULL,
df=10, add_knots=NULL, degree=3, bsmooth=TRUE){
td <- make_td(formula, data)
x <- make_x(formula, data, td)
xcure <- make_xcure(cure, data, td)
xnph <- make_nonprop(nonprop, data, td)
backhaz <- make_backhaz(backhaz, data, external, td, backhaz_strata)
external <- make_external(external, formula, x, xcure, xnph, backhaz, backhaz_strata)
mspline <- list(df=df,degree=degree,bsmooth=bsmooth,add_knots=add_knots)
mspline <- make_mspline(mspline, td, external)
mspline
}
apply_factor_levels <- function(newdata, xlevs){
for (i in names(xlevs))
newdata[[i]] <- factor(newdata[[i]], levels=xlevs[[i]])
newdata
}
## Construct functions to draw from the prior predictive distributions
## for interesting quantities in a model specified with survextrap()
get_prior_sample <- function(mspline,
coefs_mean, prior_hsd, prior_hscale, prior_loghr,
formula, cure_formula, nph_formula,
prior_hrsd, default_newdata, xlevs){
## Samples of basic parameters defining the spline
sample <- function(newdata=default_newdata, nsim=100){
prior_sample(mspline=mspline,
coefs_mean=coefs_mean, prior_hsd=prior_hsd,
prior_hscale=prior_hscale, prior_loghr=prior_loghr,
newdata=apply_factor_levels(newdata,xlevs),
prior_hrsd=prior_hrsd,
formula=formula, cure=cure_formula, nonprop=nph_formula,
nsim=nsim)
}
## Summary of the hazard if coefs_mean was set to a constant hazard
haz_const <- function(nsim=10000, quantiles=c(0.025, 0.5, 0.975)){
prior_haz_const(mspline=mspline,
prior_hscale = prior_hscale,
nsim=nsim, quantiles=quantiles)
}
## Samples of hazard curves over time
haz <- function(newdata=default_newdata, tmin=0, tmax=max(mspline$knots), nsim=10){
prior_sample_hazard(knots=mspline$knots, degree=mspline$degree,
coefs_mean=coefs_mean, prior_hsd=prior_hsd,
prior_hscale=prior_hscale, prior_loghr=prior_loghr,
newdata=apply_factor_levels(newdata,xlevs),
prior_hrsd=prior_hrsd,
formula=formula, cure=cure_formula, nonprop=nph_formula,
tmin=tmin, tmax=tmax,
nsim=nsim)
}
## SD describing variation in hazard function with time
haz_sd <- function(newdata=default_newdata, tmin=0, tmax=max(mspline$knots), nsim=100,
hq = c(0.1, 0.9),
quantiles = c(0.025, 0.5, 0.975)){
prior_haz_sd(mspline=mspline,
coefs_mean=coefs_mean, prior_hsd=prior_hsd,
prior_hscale=prior_hscale, prior_loghr=prior_loghr,
newdata=apply_factor_levels(newdata,xlevs),
prior_hrsd=prior_hrsd,
formula=formula, cure=cure_formula, nonprop=nph_formula,
tmin=tmin, tmax=tmax, nsim=nsim, hq=hq, quantiles=quantiles)
}
## SD describing variation in hazard ratio function with time
hr_sd <- function(newdata, newdata0, tmin=0, tmax=max(mspline$knots), nsim=100,
quantiles = c(0.025, 0.5, 0.975)){
prior_hr_sd(mspline=mspline,
coefs_mean=coefs_mean, prior_hsd=prior_hsd,
prior_hscale=prior_hscale, prior_loghr=prior_loghr,
newdata=apply_factor_levels(newdata, xlevs),
newdata0=apply_factor_levels(newdata0, xlevs),
prior_hrsd=prior_hrsd,
formula=formula, cure=cure_formula, nonprop=nph_formula,
tmin=tmin, tmax=tmax, nsim=nsim, quantiles=quantiles)
}
nlist(sample, haz, haz_const, haz_sd, hr_sd)
}
## No centering is done here. It assumes that any covariates are pre-centered
## around a meaningful value. This corresponds to the intercept that the prior is placed on
make_x <- function(formula, data, td, xlevs=NULL){
form <- delete.response(terms(formula))
mforig <- get_all_vars(form, data)
if (ncol(mforig) > 0){
factors <- names(mforig)[sapply(mforig, is.factor)]
numerics <- names(mforig)[sapply(mforig, is.numeric)]
tofactors <- names(mforig)[sapply(mforig, function(x){is.character(x)|is.logical(x)})]
for (i in tofactors)
mforig[[i]] <- factor(mforig[[i]])
factors <- c(factors, tofactors)
xlevs <- lapply(mforig[factors], levels)
if ((length(factors)==1) && (length(numerics)==0)){
ref_levs <- lapply(xlevs, function(x)x[1])
base_levs <- xlevs
}
else if (length(factors) > 0){
ref_levs <- lapply(xlevs, function(x)x[1])
base_levs <- ref_levs
} else base_levs <- ref_levs <- NULL
numeric_zeros <- lapply(mforig[numerics], function(x)0)
mfbase <- as.data.frame(c(numeric_zeros, base_levs))
} else factors <- numerics <- xlevs <- mfbase <- NULL
## model frame containing factor(x), log(x) etc not x
mf <- stats::model.frame(form, data)
xlevs <- .getXlevels(terms(mf), mf)
X <- drop_intercept(model.matrix(form, mf, xlev = xlevs))
ncovs <- NCOL(X)
xnames <- colnames(X)
event <- X[td$ind_event, , drop = FALSE]
rcens <- X[td$ind_rcens, , drop = FALSE]
nlist(X, N = NROW(X),
ncovs = NCOL(X), xnames, factors, numerics, xlevs,
## mfbase is used as the default "newdata" in output functions
## i.e. zero values of continuous covariates, first levels of factors
mfbase,
event, rcens)
}
eb_smoothness <- function(standata, staninit, prior_hsd){
standata$est_hsd <- 1
standata$hsd_fixed <- aa(numeric()) # dummy
prior <- get_prior_hsd(prior_hsd, est_hsd=TRUE)
standata$prior_hsd <- as.numeric(unlist(prior[c("shape","rate")]))
staninit$hsd <- aa(1)
fits <- rstan::optimizing(stanmodels$survextrap, data=standata, init=staninit, hessian=FALSE, verbose=TRUE)
if (fits$return_code==0){
hsd <- fits$par["hsd[1]"]
} else {
warning("Empirical Bayes estimation of smoothness parameter failed, continuing with default")
hsd <- 1
}
hsd
}
validate_formula <- function(formula, needs_response = TRUE) {
if (!inherits(formula, "formula")) {
stop("'formula' must be a formula.")
}
if (needs_response) {
len <- length(formula)
if (len < 3) {
stop("'formula' must contain a response.")
}
}
as.formula(formula)
}
make_td <- function(formula, data){
if (is.null(data)){
## model is for external data alone
if (attr(terms(formula),"response"))
warning("`formula` has a left-hand side, but `data` not supplied")
return(nlist(indiv=FALSE,
t_event=numeric(), t_rcens=numeric(),
nevent=0, nrcens=0, ind_event=numeric(), ind_rcens=numeric()))
}
mf <- stats::model.frame(terms(formula), data)
resp <- if (survival::is.Surv(mf)) mf else model.response(mf)
surv <- attr(resp, "type")
err <- paste0("Cannot handle '", surv, "' Surv objects.")
t_end <- switch(surv, "right" = as.vector(resp[, "time"]), stop(err))
if (any(t_end<0)) stop("Some survival times are negative")
status <- make_d(mf)
t_event <- aa(t_end[status == 1]) # exact event time
t_rcens <- aa(t_end[status == 0]) # right censoring time
nevent <- sum(status == 1)
nrcens <- sum(status == 0)
ind_event <- which(status==1)
ind_rcens <- which(status==0)
nlist(indiv=TRUE, # is there any individual-level data
t_event, t_rcens, nevent, nrcens,
ind_event, ind_rcens)
}
make_d <- function(model_frame) {
resp <- if (survival::is.Surv(model_frame))
model_frame else model.response(model_frame)
surv <- attr(resp, "type")
err <- paste0("Bug found: cannot handle '", surv, "' Surv objects.")
switch(surv,
"right" = as.vector(resp[, "status"]),
stop(err))
}
make_external <- function(external, formula, x, xcure, xnph, backhaz,
backhaz_strata = NULL){
if (is.null(external))
extl <- list(nextern=0, stop=numeric(), start=numeric(),
r=integer(), n=integer(),
backsurv_stop=numeric(), backsurv_start=numeric(),
tmax=0)
else {
validate_external(external, x, xcure, backhaz)
extl <- c(as.list(external), nextern=nrow(external),
tmax=max(external$stop))
if (backhaz$relative) {
if (is.data.frame(backhaz$df)) {
st <- make_backhaz_strata(backhaz_strata, backhaz$df, external)
backhaz$df$stratum <- factor(st$back)
extl$backsurv_stop <- aa(exp(-get_cum_backhaz(external$stop, backhaz$df,
strata=st$dat)))
extl$backsurv_start <- aa(exp(-get_cum_backhaz(external$start, backhaz$df,
strata=st$dat)))
} else {
extl$backsurv_stop <- aa(external$backsurv_stop)
extl$backsurv_start <- aa(external$backsurv_start)
}
} else {
extl$backsurv_stop <- extl$backsurv_start <- aa(rep(1, extl$nextern))
}
if (x$ncovs>0){
form <- delete.response(terms(formula))
mf <- model.frame(form, external, xlev = x$xlevs)
X <- drop_intercept(model.matrix(form, mf))
}
if (xcure$ncovs>0){
form <- delete.response(terms(xcure$cure_formula))
mf <- model.frame(form, external, xlev = xcure$xlevs)
Xcure <- drop_intercept(model.matrix(form, mf))
}
if (xnph$ncovs>0){
form <- delete.response(terms(xnph$nph_formula))
mf <- model.frame(form, external, xlev = xnph$xlevs)
Xnph <- drop_intercept(model.matrix(form, mf))
}
}
if ((extl$nextern==0) || (x$ncovs==0))
X <- array(dim=c(extl$nextern, x$ncovs))
if ((extl$nextern==0) || (xcure$ncovs==0))
Xcure <- array(dim=c(extl$nextern, xcure$ncovs))
if ((extl$nextern==0) || (xnph$ncovs==0))
Xnph <- array(dim=c(extl$nextern, xnph$ncovs))
extl$stop <- aa(extl$stop)
extl$start <- aa(extl$start)
extl$r <- aa(extl$r)
extl$n <- aa(extl$n)
extl <- c(extl, nlist(X), nlist(Xcure), nlist(Xnph))
extl
}
validate_external <- function(external, x, xcure, backhaz){
reqd_cols <- c("start","stop","r","n")
missing_cols <- setdiff(reqd_cols, names(external))
if (length(missing_cols) > 0) {
mstr <- paste(paste0("\"",missing_cols,"\"", collapse=","))
plural <- if (length(missing_cols) > 1) "s" else ""
stop(sprintf("Column%s %s not found in `external`",plural,mstr))
}
covnames <- union(names(x$mfbase), names(xcure$mfbase))
missing_cols <- setdiff(covnames, names(external))
if (length(missing_cols) > 0) {
mstr <- paste(paste0("\"",missing_cols,"\"", collapse=","))
plural <- if (length(missing_cols) > 1) "s" else ""
stop(sprintf("Covariate%s %s not found in `external`",plural,mstr))
}
if (backhaz$relative && !is.data.frame(backhaz$df)) {
reqd_cols <- c("backsurv_start","backsurv_stop")
missing_cols <- setdiff(reqd_cols, names(external))
if (length(missing_cols) > 0) {
mstr <- paste(paste0("\"",missing_cols,"\"", collapse=","))
plural <- if (length(missing_cols) > 1) "s" else ""
stop(sprintf("Column%s %s not found in `external`. These are required for relative survival models.",
plural,mstr))
}
}
}
make_basis <- function(times, mspline, integrate = FALSE) {
N <- length(times)
K <- mspline$nvars
if (!N) { # times is NULL or empty vector
return(matrix(0, 0, K))
}
mspline_basis(times, knots = mspline$knots, degree = mspline$degree,
integrate = integrate, bsmooth=mspline$bsmooth)
}
aa <- function(x, ...) as.array(x,...)
validate_df <- function(df, degree, bsmooth){
minddg <- if (bsmooth) 0 else 2
if (df < degree + minddg){
## unsmooth: order k = d+1, degree d = k-1, df = nik+k = nik+d+1 = nk+d
## smooth: df - degree >= nik - 1, unsmooth: df - degree >= nik + 1
## where nik is number of internal knots, which has to be at least 1
stop(sprintf("df = %s and degree = %s, but df - degree should be >= %s", df, degree, minddg))
}
}
make_mspline <- function(mspline, td, external, add_knots=NULL){
## observed times to choose default knots to span
tmax <- max(c(td$t_event, td$t_rcens, external$tmax), na.rm = TRUE)
tt <- td$t_event # uncensored event times
if (is.null(mspline$knots) && !length(tt)) {
tt <- td$t_rcens
if (length(tt)>0)
warning("No observed events found in the data. Censoring times will ",
"be used to evaluate default knot locations for splines.")
}
obstimes <- unique(tt)
if (is.numeric(mspline$add_knots)) add_knots <- mspline$add_knots
if (!is.numeric(add_knots))
obstimes <- sort(c(obstimes, unique(c(external$start, external$stop))))
mspline <- mspline_list_init(mspline, obstimes)
if (is.numeric(add_knots)) {
add_knots <- setdiff(unique(add_knots), mspline$knots)
mspline$knots <- sort(c(mspline$knots, add_knots))
mspline$df <- mspline$df + length(add_knots)
}
mspline$nvars <- mspline$df
mspline[c("nvars","knots","degree","bsmooth","df")]
}
validate_coefs_mean <- function(coefs){
if (!is.numeric(coefs)) stop("`coefs_mean` must be numeric")
if (!all(coefs > 0)) stop("`coefs_mean` must all be > 0")
coefs / sum(coefs)
}
make_cure_formula <- function(cure){
if (isTRUE(cure)) {
cure_formula <- ~1
} else if (inherits(cure,"formula")) {
cure_formula <- cure
} else if (isFALSE(cure)) {
cure_formula <- NULL
} else stop("`cure` must either be TRUE, FALSE or a model formula")
cure_formula
}
make_xcure <- function(cure, data, td){
cure_formula <- make_cure_formula(cure)
if (!is.null(cure_formula)){
xcure <- make_x(cure_formula, data, td)
cure <- TRUE
} else {
xcure <- list(ncovs = 0,
X = matrix(nrow=NROW(data), ncol=0),
event = matrix(nrow=td$nevent,ncol=0),
rcens = matrix(nrow=td$nrcens,ncol=0))
}
res <- c(nlist(cure, cure_formula), xcure)
res
}
make_nonprop <- function(nonprop, data, td, formula){
if (isTRUE(nonprop)) {
nph_formula <- formula
} else if (inherits(nonprop,"formula")) {
nph_formula <- nonprop
} else if (isFALSE(nonprop) || is.null(nonprop)) {
nph_formula <- NULL
} else stop("`nonprop` must either be TRUE, FALSE, NULL or a model formula")
if (!is.null(nph_formula)){
xnph <- make_x(nph_formula, data, td)
} else {
xnph <- list(ncovs = 0,
X = matrix(nrow=NROW(data), ncol=0),
event = matrix(nrow=td$nevent,ncol=0),
rcens = matrix(nrow=td$nrcens,ncol=0))
}
res <- c(nlist(nph_formula), xnph)
res
}
# Drop intercept from a model matrix
drop_intercept <- function(x) {
sel <- which("(Intercept)" %in% colnames(x))
if (length(sel) && is.matrix(x)) {
x <- x[, -sel, drop = FALSE]
}
x
}
## Named arguments to Stan computation functions
## These are specified explicitly, so that the user can pass e.g.
## chains=NULL to fit_method="opt" and it will be ignored rather than
## giving an error.
## Ideally we would be able to read these from rstan by using
## formals(), but they are hidden deeply inside methods. Instead they
## are taken from help(rstan::optimizing) - should ensure it is up to
## date
.rstan_sampling_args <- c("object","data","pars","chains","iter","warmup","thin",
"seed","init","check_data","sample_file","diagnostic_file","verbose",
"algorithm","control","cores","open_progress","show_messages",
"chain_id","init_r","test_grad","append_samples","refresh","enable_random_init")
.rstan_optimizing_args <- c("object","data","seed","init","check_data","sample_file","algorithm",
"verbose","hessian","as_vector","draws","constrained","importance_resampling",
"iter","save_iterations","refresh","init_alpha","tol_obj","tol_rel_obj",
"tol_grad","tol_rel_grad","tol_param","history_size")
.rstan_vb_args <- c("object","data","pars","include", "seed","init","sample_file","algorithm",
"importance_resampling","keep_every",
"iter","grad_samples","elbo_samples","eta","adapt_engaged",
"tol_rel_obj","eval_elbo","output_samples","adapt_iter")